;
; jidctint.asm - accurate integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slow-but-accurate integer implementation of the
; inverse DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jidctint.c; see the jidctint.c for
; more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Fixed-point scaling.  Multiplier constants carry CONST_BITS fractional
; bits; pass 1 keeps PASS1_BITS extra fractional bits in its output.

%define CONST_BITS  13
%define PASS1_BITS  2

; Right-shift counts applied at the end of pass 1 and pass 2 respectively.
%define DESCALE_P1  (CONST_BITS - PASS1_BITS)
%define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)

%if CONST_BITS == 13
F_0_298 equ  2446                       ; FIX(0.298631336)
F_0_390 equ  3196                       ; FIX(0.390180644)
F_0_541 equ  4433                       ; FIX(0.541196100)
F_0_765 equ  6270                       ; FIX(0.765366865)
F_0_899 equ  7373                       ; FIX(0.899976223)
F_1_175 equ  9633                       ; FIX(1.175875602)
F_1_501 equ 12299                       ; FIX(1.501321110)
F_1_847 equ 15137                       ; FIX(1.847759065)
F_1_961 equ 16069                       ; FIX(1.961570560)
F_2_053 equ 16819                       ; FIX(2.053119869)
F_2_562 equ 20995                       ; FIX(2.562915447)
F_3_072 equ 25172                       ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same constants pre-scaled by 2^30; DESCALE
; rounds them down to CONST_BITS fractional bits.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_islow_sse2)

EXTN(jconst_idct_islow_sse2):

; Each PW_* entry is a pair of 16-bit multipliers repeated four times, laid
; out for pmaddwd against interleaved (a, b) word pairs: every 32-bit result
; lane is a * first + b * second.
PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541
PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)
PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175
PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)
PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899
PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)
PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562
PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)
; Rounding terms (half of the divisor) added before each descaling shift.
PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)
PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)
; Bias added to every output byte after the final signed pack.
PB_CENTERJSAMP times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments (32-bit code, all read from the caller's stack):
;   dct_table  - dequantization multipliers, multiplied element-wise with
;                the coefficients (pmullw)
;   coef_block - 8x8 block of input coefficients
;   output_buf - array of row pointers; rows 0..7 are written
;   output_col - offset added to each row pointer before the 8-sample store
;
; coef_block and dct_table are accessed with movdqa / aligned SSE memory
; operands, so both must be aligned to SIZEOF_XMMWORD.
;
; Registers: esi, edi and ebp are saved and restored; ebx is handled by the
; pushpic/poppic macros (defined outside this file); eax, edx and xmm0-xmm7
; are overwritten.
; Stack: esp is realigned to SIZEOF_XMMWORD and WK_NUM xmmwords of scratch
; space (wk[]) are reserved below the aligned frame pointer.
;

; Argument offsets relative to b = esp as captured right after "push ebp"
; (so b+0 = saved ebp, b+4 = return address).
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        12

    align       32
    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)

EXTN(jsimd_idct_islow_sse2):
    push        ebp
    mov         eax, esp                ; eax = unaligned frame base
                                        ; ([eax] = caller's ebp)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax              ; remember the unaligned frame base
    mov         ebp, esp                ; ebp = aligned ebp
    lea         esp, [wk(0)]            ; reserve wk[0..WK_NUM-1]
    pushpic     ebx
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input.

    ; eax still holds the frame base set up in the prologue, which is why
    ; the reload below is commented out.
    ; NOTE(review): this relies on pushpic/get_GOT leaving eax untouched --
    ; confirm against jsimdext.inc.
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr

%ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
    ; Cheap early-out: if the first dword of row 1 or row 2 is nonzero, skip
    ; the full scan and go straight to the general column transform.
    mov         eax, dword [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; OR rows 1..7 together; any nonzero AC coefficient leaves a nonzero
    ; word, which survives the two signed-saturating packs below.
    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         xmm1, xmm0
    packsswb    xmm1, xmm1
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero

    ; Only row 0 contributes: every output of a column equals the
    ; dequantized row-0 value scaled up by PASS1_BITS, so each word is
    ; simply broadcast across its (transposed) column register.

    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    psllw       xmm5, PASS1_BITS

    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)

    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)

    ; Same hand-off layout as the general path: even columns stay in
    ; registers, odd columns go to wk(8)..wk(11).
    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
    jmp         near .column_end
    alignx      16, 7
%endif
.columnDCT:

    ; -- Even part

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    ; (Original)
    ; z1 = (z2 + z3) * 0.541196100;
    ; tmp2 = z1 + z3 * -1.847759065;
    ; tmp3 = z1 + z2 * 0.765366865;
    ;
    ; (This implementation)
    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;

    movdqa      xmm4, xmm1              ; xmm1=in2=z2
    movdqa      xmm5, xmm1
    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
    punpckhwd   xmm5, xmm3
    movdqa      xmm1, xmm4
    movdqa      xmm3, xmm5
    pmaddwd     xmm4, [GOTOFF(ebx,PW_F130_F054)]   ; xmm4=tmp3L
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
    pmaddwd     xmm3, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm3=tmp2H

    movdqa      xmm6, xmm0
    paddw       xmm0, xmm2              ; xmm0=in0+in4
    psubw       xmm6, xmm2              ; xmm6=in0-in4

    ; Unpacking into a zeroed register puts each word in the upper half of
    ; a dword (value << 16); the arithmetic right shift by (16-CONST_BITS)
    ; then yields the sign-extended value << CONST_BITS.
    pxor        xmm7, xmm7
    pxor        xmm2, xmm2
    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS

    movdqa      xmm0, xmm7
    paddd       xmm7, xmm4              ; xmm7=tmp10L
    psubd       xmm0, xmm4              ; xmm0=tmp13L
    movdqa      xmm4, xmm2
    paddd       xmm2, xmm5              ; xmm2=tmp10H
    psubd       xmm4, xmm5              ; xmm4=tmp13H

    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H

    pxor        xmm5, xmm5
    pxor        xmm7, xmm7
    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS

    movdqa      xmm2, xmm5
    paddd       xmm5, xmm1              ; xmm5=tmp11L
    psubd       xmm2, xmm1              ; xmm2=tmp12L
    movdqa      xmm0, xmm7
    paddd       xmm7, xmm3              ; xmm7=tmp11H
    psubd       xmm0, xmm3              ; xmm0=tmp12H

    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H

    ; -- Odd part

    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)]

    movdqa      xmm5, xmm6
    movdqa      xmm7, xmm4
    paddw       xmm5, xmm3              ; xmm5=z3
    paddw       xmm7, xmm1              ; xmm7=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movdqa      xmm2, xmm5
    movdqa      xmm0, xmm5
    punpcklwd   xmm2, xmm7
    punpckhwd   xmm0, xmm7
    movdqa      xmm5, xmm2
    movdqa      xmm7, xmm0
    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm2=z3L
    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3H
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
    pmaddwd     xmm7, [GOTOFF(ebx,PW_F117_F078)]   ; xmm7=z4H

    ; wk(10)/wk(11) hold z3 only temporarily; they are overwritten with
    ; tmp1 once z3 has been consumed.
    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H

    ; (Original)
    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
    ;
    ; (This implementation)
    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    ; tmp0 += z3;  tmp1 += z4;
    ; tmp2 += z3;  tmp3 += z4;

    movdqa      xmm2, xmm3
    movdqa      xmm0, xmm3
    punpcklwd   xmm2, xmm4
    punpckhwd   xmm0, xmm4
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm0
    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm2=tmp0L
    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0H
    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3L
    pmaddwd     xmm4, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm4=tmp3H

    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
    paddd       xmm3, xmm5              ; xmm3=tmp3L
    paddd       xmm4, xmm7              ; xmm4=tmp3H

    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H

    movdqa      xmm2, xmm1
    movdqa      xmm0, xmm1
    punpcklwd   xmm2, xmm6
    punpckhwd   xmm0, xmm6
    movdqa      xmm1, xmm2
    movdqa      xmm6, xmm0
    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm2=tmp1L
    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1H
    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm1=tmp2L
    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H

    paddd       xmm2, xmm5              ; xmm2=tmp1L
    paddd       xmm0, xmm7              ; xmm0=tmp1H
    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H

    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H

    ; -- Final output stage

    ; Each pair of rows k / 7-k is (even + odd) and (even - odd), rounded
    ; with PD_DESCALE_P1, shifted by DESCALE_P1 and packed back to words.

    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H

    movdqa      xmm2, xmm5
    movdqa      xmm0, xmm7
    paddd       xmm5, xmm3              ; xmm5=data0L
    paddd       xmm7, xmm4              ; xmm7=data0H
    psubd       xmm2, xmm3              ; xmm2=data7L
    psubd       xmm0, xmm4              ; xmm0=data7H

    movdqa      xmm3, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm3=[PD_DESCALE_P1]

    paddd       xmm5, xmm3
    paddd       xmm7, xmm3
    psrad       xmm5, DESCALE_P1
    psrad       xmm7, DESCALE_P1
    paddd       xmm2, xmm3
    paddd       xmm0, xmm3
    psrad       xmm2, DESCALE_P1
    psrad       xmm0, DESCALE_P1

    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)

    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H

    movdqa      xmm7, xmm4
    movdqa      xmm0, xmm3
    paddd       xmm4, xmm1              ; xmm4=data1L
    paddd       xmm3, xmm6              ; xmm3=data1H
    psubd       xmm7, xmm1              ; xmm7=data6L
    psubd       xmm0, xmm6              ; xmm0=data6H

    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm1=[PD_DESCALE_P1]

    paddd       xmm4, xmm1
    paddd       xmm3, xmm1
    psrad       xmm4, DESCALE_P1
    psrad       xmm3, DESCALE_P1
    paddd       xmm7, xmm1
    paddd       xmm0, xmm1
    psrad       xmm7, DESCALE_P1
    psrad       xmm0, DESCALE_P1

    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)

    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)

    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H

    ; wk(0), wk(1), wk(4), wk(5) are free now (tmp10/tmp11 consumed), so
    ; they are reused to park the partially transposed rows.
    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)

    movdqa      xmm5, xmm3
    movdqa      xmm6, xmm0
    paddd       xmm3, xmm4              ; xmm3=data2L
    paddd       xmm0, xmm2              ; xmm0=data2H
    psubd       xmm5, xmm4              ; xmm5=data5L
    psubd       xmm6, xmm2              ; xmm6=data5H

    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm7=[PD_DESCALE_P1]

    paddd       xmm3, xmm7
    paddd       xmm0, xmm7
    psrad       xmm3, DESCALE_P1
    psrad       xmm0, DESCALE_P1
    paddd       xmm5, xmm7
    paddd       xmm6, xmm7
    psrad       xmm5, DESCALE_P1
    psrad       xmm6, DESCALE_P1

    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)

    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H

    movdqa      xmm0, xmm1
    movdqa      xmm6, xmm4
    paddd       xmm1, xmm2              ; xmm1=data3L
    paddd       xmm4, xmm7              ; xmm4=data3H
    psubd       xmm0, xmm2              ; xmm0=data4L
    psubd       xmm6, xmm7              ; xmm6=data4H

    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P1)]  ; xmm2=[PD_DESCALE_P1]

    paddd       xmm1, xmm2
    paddd       xmm4, xmm2
    psrad       xmm1, DESCALE_P1
    psrad       xmm4, DESCALE_P1
    paddd       xmm0, xmm2
    paddd       xmm6, xmm2
    psrad       xmm0, DESCALE_P1
    psrad       xmm6, DESCALE_P1

    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)

    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)

    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)

    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)

    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)

    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)

    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)

    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)

    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3

    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)

    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
.column_end:

    ; -- Prefetch the next coefficient block

    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    ; eax was reused as scratch by the zero-column test, so the frame base
    ; is reloaded from the slot the prologue stored it in.
    mov         eax, [original_ebp]
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]

    ; -- Even part

    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6

    ; (Original)
    ; z1 = (z2 + z3) * 0.541196100;
    ; tmp2 = z1 + z3 * -1.847759065;
    ; tmp3 = z1 + z2 * 0.765366865;
    ;
    ; (This implementation)
    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;

    movdqa      xmm6, xmm1              ; xmm1=in2=z2
    movdqa      xmm5, xmm1
    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
    punpckhwd   xmm5, xmm2
    movdqa      xmm1, xmm6
    movdqa      xmm2, xmm5
    pmaddwd     xmm6, [GOTOFF(ebx,PW_F130_F054)]   ; xmm6=tmp3L
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F130_F054)]   ; xmm5=tmp3H
    pmaddwd     xmm1, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm1=tmp2L
    pmaddwd     xmm2, [GOTOFF(ebx,PW_F054_MF130)]  ; xmm2=tmp2H

    movdqa      xmm3, xmm7
    paddw       xmm7, xmm0              ; xmm7=in0+in4
    psubw       xmm3, xmm0              ; xmm3=in0-in4

    ; Same widen-and-scale idiom as in pass 1: word -> high half of a
    ; dword, then arithmetic shift right by (16-CONST_BITS).
    pxor        xmm4, xmm4
    pxor        xmm0, xmm0
    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS

    movdqa      xmm7, xmm4
    paddd       xmm4, xmm6              ; xmm4=tmp10L
    psubd       xmm7, xmm6              ; xmm7=tmp13L
    movdqa      xmm6, xmm0
    paddd       xmm0, xmm5              ; xmm0=tmp10H
    psubd       xmm6, xmm5              ; xmm6=tmp13H

    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H

    pxor        xmm5, xmm5
    pxor        xmm4, xmm4
    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS

    movdqa      xmm0, xmm5
    paddd       xmm5, xmm1              ; xmm5=tmp11L
    psubd       xmm0, xmm1              ; xmm0=tmp12L
    movdqa      xmm7, xmm4
    paddd       xmm4, xmm2              ; xmm4=tmp11H
    psubd       xmm7, xmm2              ; xmm7=tmp12H

    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H

    ; -- Odd part

    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5

    movdqa      xmm5, xmm6
    movdqa      xmm4, xmm3
    paddw       xmm5, xmm1              ; xmm5=z3
    paddw       xmm4, xmm2              ; xmm4=z4

    ; (Original)
    ; z5 = (z3 + z4) * 1.175875602;
    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    ; z3 += z5;  z4 += z5;
    ;
    ; (This implementation)
    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

    movdqa      xmm0, xmm5
    movdqa      xmm7, xmm5
    punpcklwd   xmm0, xmm4
    punpckhwd   xmm7, xmm4
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm7
    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm0=z3L
    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF078_F117)]  ; xmm7=z3H
    pmaddwd     xmm5, [GOTOFF(ebx,PW_F117_F078)]   ; xmm5=z4L
    pmaddwd     xmm4, [GOTOFF(ebx,PW_F117_F078)]   ; xmm4=z4H

    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H

    ; (Original)
    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
    ;
    ; (This implementation)
    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    ; tmp0 += z3;  tmp1 += z4;
    ; tmp2 += z3;  tmp3 += z4;

    movdqa      xmm0, xmm1
    movdqa      xmm7, xmm1
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm7, xmm3
    movdqa      xmm1, xmm0
    movdqa      xmm3, xmm7
    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm0=tmp0L
    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF060_MF089)]  ; xmm7=tmp0H
    pmaddwd     xmm1, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm1=tmp3L
    pmaddwd     xmm3, [GOTOFF(ebx,PW_MF089_F060)]   ; xmm3=tmp3H

    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
    paddd       xmm1, xmm5              ; xmm1=tmp3L
    paddd       xmm3, xmm4              ; xmm3=tmp3H

    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H

    movdqa      xmm0, xmm2
    movdqa      xmm7, xmm2
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm7, xmm6
    movdqa      xmm2, xmm0
    movdqa      xmm6, xmm7
    pmaddwd     xmm0, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm0=tmp1L
    pmaddwd     xmm7, [GOTOFF(ebx,PW_MF050_MF256)]  ; xmm7=tmp1H
    pmaddwd     xmm2, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm2=tmp2L
    pmaddwd     xmm6, [GOTOFF(ebx,PW_MF256_F050)]   ; xmm6=tmp2H

    paddd       xmm0, xmm5              ; xmm0=tmp1L
    paddd       xmm7, xmm4              ; xmm7=tmp1H
    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H

    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H

    ; -- Final output stage

    ; Same butterfly as pass 1, but descaled by DESCALE_P2 and packed all
    ; the way down to bytes.

    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H

    movdqa      xmm0, xmm5
    movdqa      xmm7, xmm4
    paddd       xmm5, xmm1              ; xmm5=data0L
    paddd       xmm4, xmm3              ; xmm4=data0H
    psubd       xmm0, xmm1              ; xmm0=data7L
    psubd       xmm7, xmm3              ; xmm7=data7H

    movdqa      xmm1, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm1=[PD_DESCALE_P2]

    paddd       xmm5, xmm1
    paddd       xmm4, xmm1
    psrad       xmm5, DESCALE_P2
    psrad       xmm4, DESCALE_P2
    paddd       xmm0, xmm1
    paddd       xmm7, xmm1
    psrad       xmm0, DESCALE_P2
    psrad       xmm7, DESCALE_P2

    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)

    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H

    movdqa      xmm4, xmm3
    movdqa      xmm7, xmm1
    paddd       xmm3, xmm2              ; xmm3=data1L
    paddd       xmm1, xmm6              ; xmm1=data1H
    psubd       xmm4, xmm2              ; xmm4=data6L
    psubd       xmm7, xmm6              ; xmm7=data6H

    movdqa      xmm2, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm2=[PD_DESCALE_P2]

    paddd       xmm3, xmm2
    paddd       xmm1, xmm2
    psrad       xmm3, DESCALE_P2
    psrad       xmm1, DESCALE_P2
    paddd       xmm4, xmm2
    paddd       xmm7, xmm2
    psrad       xmm4, DESCALE_P2
    psrad       xmm7, DESCALE_P2

    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)

    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H

    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm4, xmm6
    movdqa      xmm0, xmm2
    paddd       xmm6, xmm1              ; xmm6=data2L
    paddd       xmm2, xmm7              ; xmm2=data2H
    psubd       xmm4, xmm1              ; xmm4=data5L
    psubd       xmm0, xmm7              ; xmm0=data5H

    movdqa      xmm5, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm5=[PD_DESCALE_P2]

    paddd       xmm6, xmm5
    paddd       xmm2, xmm5
    psrad       xmm6, DESCALE_P2
    psrad       xmm2, DESCALE_P2
    paddd       xmm4, xmm5
    paddd       xmm0, xmm5
    psrad       xmm4, DESCALE_P2
    psrad       xmm0, DESCALE_P2

    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)

    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H

    movdqa      xmm2, xmm3
    movdqa      xmm0, xmm1
    paddd       xmm3, xmm7              ; xmm3=data3L
    paddd       xmm1, xmm5              ; xmm1=data3H
    psubd       xmm2, xmm7              ; xmm2=data4L
    psubd       xmm0, xmm5              ; xmm0=data4H

    movdqa      xmm7, [GOTOFF(ebx,PD_DESCALE_P2)]  ; xmm7=[PD_DESCALE_P2]

    paddd       xmm3, xmm7
    paddd       xmm1, xmm7
    psrad       xmm3, DESCALE_P2
    psrad       xmm1, DESCALE_P2
    paddd       xmm2, xmm7
    paddd       xmm0, xmm7
    psrad       xmm2, DESCALE_P2
    psrad       xmm0, DESCALE_P2

    movdqa      xmm5, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm5=[PB_CENTERJSAMP]

    packssdw    xmm3, xmm1              ; xmm3=data3=(03 13 23 33 43 53 63 73)
    packssdw    xmm2, xmm0              ; xmm2=data4=(04 14 24 34 44 54 64 74)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    packsswb    xmm6, xmm2              ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
    packsswb    xmm3, xmm4              ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

    ; The samples were saturated to signed bytes by packsswb; adding
    ; CENTERJSAMPLE to every byte (wrapping) re-centers them.
    paddb       xmm7, xmm5
    paddb       xmm1, xmm5
    paddb       xmm6, xmm5
    paddb       xmm3, xmm5

    movdqa      xmm0, xmm7              ; transpose coefficients(phase 1)
    punpcklbw   xmm7, xmm1              ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
    punpckhbw   xmm0, xmm1              ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
    movdqa      xmm2, xmm6              ; transpose coefficients(phase 1)
    punpcklbw   xmm6, xmm3              ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
    punpckhbw   xmm2, xmm3              ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

    movdqa      xmm4, xmm7              ; transpose coefficients(phase 2)
    punpcklwd   xmm7, xmm6              ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm6              ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
    punpcklwd   xmm2, xmm0              ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    punpckhwd   xmm5, xmm0              ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

    movdqa      xmm1, xmm7              ; transpose coefficients(phase 3)
    punpckldq   xmm7, xmm2              ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm1, xmm2              ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    movdqa      xmm3, xmm4              ; transpose coefficients(phase 3)
    punpckldq   xmm4, xmm5              ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    punpckhdq   xmm3, xmm5              ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

    ; Each register now holds two output rows.  0x4E swaps the two 64-bit
    ; halves so the odd row lands in the low half, where movq can store it.
    pshufd      xmm6, xmm7, 0x4E        ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm0, xmm1, 0x4E        ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    pshufd      xmm2, xmm4, 0x4E        ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
    pshufd      xmm5, xmm3, 0x4E        ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

    ; Store the even rows (0, 2, 4, 6) ...
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1
    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3

    ; ... then the odd rows (1, 3, 5, 7).
    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
    poppic      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- unaligned frame base saved
                                        ; by the prologue
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32