;
; jidctfst.asm - fast integer IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; Fixed-point parameters.
;   CONST_BITS - number of fraction bits in the FIX() multiplier constants.
;   PASS1_BITS - extra scaling carried through pass 1; it must match the
;                scaling that is already folded into the dequantization
;                table (IFAST_SCALE_BITS), which the check below enforces.

%define CONST_BITS  8                   ; 14 is also OK.
%define PASS1_BITS  2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

; FIX(x) is x scaled by 2^CONST_BITS and rounded to an integer.

%if CONST_BITS == 8
F_1_082 equ 277                         ; FIX(1.082392200)
F_1_414 equ 362                         ; FIX(1.414213562)
F_1_847 equ 473                         ; FIX(1.847759065)
F_2_613 equ 669                         ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256)             ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same factors scaled by 2^30; DESCALE rounds
; them down to CONST_BITS fraction bits.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS))  ; FIX(2.613125930) - FIX(1)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps only the upper 16 bits of each 16x16-bit signed product.
; The code shifts the data operand left by PRE_MULTIPLY_SCALE_BITS (psllw)
; and the table below stores each constant shifted left by CONST_SHIFT, so
; the 16-bit right shift implied by pmulhw removes exactly those two
; shifts plus the CONST_BITS fraction bits of the constant.

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

    alignz      32
    GLOBAL_DATA(jconst_idct_ifast_mmx)

; Constant table: each multiplier is replicated into all four 16-bit lanes
; of a 64-bit MMX operand.  PW_MF1613 holds the *negated* value of F_1_613.
; PB_CENTERJSAMP replicates CENTERJSAMPLE into all eight byte lanes.

EXTN(jconst_idct_ifast_mmx):

PW_F1414       times 4 dw  F_1_414 << CONST_SHIFT
PW_F1847       times 4 dw  F_1_847 << CONST_SHIFT
PW_MF1613      times 4 dw -F_1_613 << CONST_SHIFT
PW_F1082       times 4 dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 8 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_mmx(void *dct_table, JCOEFPTR coef_block,
;                      JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack through the (b) + offset macros below,
; where b points at the slot holding the caller's saved ebp.
;
; Register usage:
;   ebx      - GOT address (get_GOT); saved/restored around the output stores
;   pass 1   - edx = quantptr, esi = inptr, edi = wsptr, ecx = ctr
;   pass 2   - esi = wsptr, edi = output_buf row pointers, eax = output_col,
;              ecx = ctr
;   mm0-mm7  - scratch; emms is executed before returning
; ebx, esi, edi and ebp are preserved.  eax, ecx and edx are not.
;
; Stack frame (ebp is aligned down to SIZEOF_MMWORD):
;   [ebp + 0]   pointer to the saved-ebp slot of the unaligned frame
;   wk(0..1)    WK_NUM mmword temporaries directly below ebp
;   workspace   DCTSIZE2 JCOEFs below wk(0); pass 1 output / pass 2 input
;
; Each pass runs DCTSIZE/4 iterations and handles four 16-bit lanes per
; iteration.  Pass 1 dequantizes, transforms and stores its results
; transposed into the workspace, so pass 2 can use the same column-wise
; register layout.  Pass 2 descales by PASS1_BITS+3, saturates to signed
; bytes, adds CENTERJSAMPLE and writes four 8-sample output rows per
; iteration.
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM        2
%define workspace     wk(0) - DCTSIZE2 * SIZEOF_JCOEF
                                        ; JCOEF workspace[DCTSIZE2]

    align       32
    GLOBAL_FUNCTION(jsimd_idct_ifast_mmx)

EXTN(jsimd_idct_ifast_mmx):
    push        ebp
    mov         eax, esp                    ; eax -> saved ebp slot (args at eax+8..)
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; [original_ebp] = unaligned frame ptr
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.

    ; eax still holds the frame pointer loaded in the prologue, so the
    ; reload below is not needed (this relies on get_GOT leaving eax
    ; untouched -- its definition is outside this file).
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; JCOEF *wsptr
    mov         ecx, DCTSIZE/4                   ; ctr
    alignx      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
    ; Quick reject: if the first dword of row 1 or row 2 is nonzero, the
    ; AC terms cannot all be zero, so go straight to the full transform.
    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         short .columnDCT

    ; OR together rows 1..7 of the four columns handled this iteration.
    movq        mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    por         mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    por         mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    por         mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         mm1, mm0
    packsswb    mm1, mm1                ; 4 words -> 4 bytes (nonzero stays nonzero)
    movd        eax, mm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero

    ; Only the dequantized DC term contributes: replicate each column's
    ; DC value across the corresponding (transposed) workspace row.

    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movq        mm2, mm0                ; mm0=in0=(00 01 02 03)
    punpcklwd   mm0, mm0                ; mm0=(00 00 01 01)
    punpckhwd   mm2, mm2                ; mm2=(02 02 03 03)

    movq        mm1, mm0
    punpckldq   mm0, mm0                ; mm0=(00 00 00 00)
    punpckhdq   mm1, mm1                ; mm1=(01 01 01 01)
    movq        mm3, mm2
    punpckldq   mm2, mm2                ; mm2=(02 02 02 02)
    punpckhdq   mm3, mm3                ; mm3=(03 03 03 03)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
    jmp         near .nextcolumn
    alignx      16, 7
%endif
.columnDCT:

    ; -- Even part

    ; Load rows 0/2/4/6 and dequantize (multiply by the quantptr entries).
    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    pmullw      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    pmullw      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movq        mm4, mm0
    movq        mm5, mm1
    psubw       mm0, mm2                ; mm0=tmp11
    psubw       mm1, mm3
    paddw       mm4, mm2                ; mm4=tmp10
    paddw       mm5, mm3                ; mm5=tmp13

    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
    psubw       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    psubw       mm4, mm5                ; mm4=tmp3
    psubw       mm0, mm1                ; mm0=tmp2
    paddw       mm6, mm5                ; mm6=tmp0
    paddw       mm7, mm1                ; mm7=tmp1

    ; Spill tmp2/tmp3: all eight MMX registers are needed for the odd part.
    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2

    ; -- Odd part

    ; Load rows 1/3/5/7 and dequantize.
    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    pmullw      mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    pmullw      mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movq        mm4, mm2
    movq        mm0, mm5
    psubw       mm2, mm1                ; mm2=z12
    psubw       mm5, mm3                ; mm5=z10
    paddw       mm4, mm1                ; mm4=z11
    paddw       mm0, mm3                ; mm0=z13

    movq        mm1, mm5                ; mm1=z10(unscaled)
    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
    psllw       mm5, PRE_MULTIPLY_SCALE_BITS

    movq        mm3, mm4
    psubw       mm4, mm0
    paddw       mm3, mm0                ; mm3=tmp7

    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movq        mm0, mm5
    paddw       mm5, mm2
    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
    psubw       mm0, mm1
    psubw       mm2, mm5                ; mm2=tmp10
    paddw       mm0, mm5                ; mm0=tmp12

    ; -- Final output stage

    psubw       mm0, mm3                ; mm0=tmp6
    movq        mm1, mm6
    movq        mm5, mm7
    paddw       mm6, mm3                ; mm6=data0=(00 01 02 03)
    paddw       mm7, mm0                ; mm7=data1=(10 11 12 13)
    psubw       mm1, mm3                ; mm1=data7=(70 71 72 73)
    psubw       mm5, mm0                ; mm5=data6=(60 61 62 63)
    psubw       mm4, mm0                ; mm4=tmp5

    movq        mm3, mm6                ; transpose coefficients(phase 1)
    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
    punpckhwd   mm3, mm7                ; mm3=(02 12 03 13)
    movq        mm0, mm5                ; transpose coefficients(phase 1)
    punpcklwd   mm5, mm1                ; mm5=(60 70 61 71)
    punpckhwd   mm0, mm1                ; mm0=(62 72 63 73)

    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
    movq        mm1, MMWORD [wk(1)]     ; mm1=tmp3

    movq        MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
    movq        MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)

    paddw       mm2, mm4                ; mm2=tmp4
    movq        mm5, mm7
    movq        mm0, mm1
    paddw       mm7, mm4                ; mm7=data2=(20 21 22 23)
    paddw       mm1, mm2                ; mm1=data4=(40 41 42 43)
    psubw       mm5, mm4                ; mm5=data5=(50 51 52 53)
    psubw       mm0, mm2                ; mm0=data3=(30 31 32 33)

    movq        mm4, mm7                ; transpose coefficients(phase 1)
    punpcklwd   mm7, mm0                ; mm7=(20 30 21 31)
    punpckhwd   mm4, mm0                ; mm4=(22 32 23 33)
    movq        mm2, mm1                ; transpose coefficients(phase 1)
    punpcklwd   mm1, mm5                ; mm1=(40 50 41 51)
    punpckhwd   mm2, mm5                ; mm2=(42 52 43 53)

    movq        mm0, mm6                ; transpose coefficients(phase 2)
    punpckldq   mm6, mm7                ; mm6=(00 10 20 30)
    punpckhdq   mm0, mm7                ; mm0=(01 11 21 31)
    movq        mm5, mm3                ; transpose coefficients(phase 2)
    punpckldq   mm3, mm4                ; mm3=(02 12 22 32)
    punpckhdq   mm5, mm4                ; mm5=(03 13 23 33)

    movq        mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
    movq        mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5

    movq        mm6, mm1                ; transpose coefficients(phase 2)
    punpckldq   mm1, mm7                ; mm1=(40 50 60 70)
    punpckhdq   mm6, mm7                ; mm6=(41 51 61 71)
    movq        mm0, mm2                ; transpose coefficients(phase 2)
    punpckldq   mm2, mm4                ; mm2=(42 52 62 72)
    punpckhdq   mm0, mm4                ; mm0=(43 53 63 73)

    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0

.nextcolumn:
    ; Advance four columns in the input and quantization table, and four
    ; (transposed) rows in the workspace.
    add         esi, byte 4*SIZEOF_JCOEF            ; coef_block
    add         edx, byte 4*SIZEOF_IFAST_MULT_TYPE  ; quantptr
    add         edi, byte 4*DCTSIZE*SIZEOF_JCOEF    ; wsptr
    dec         ecx                                 ; ctr
    jnz         near .columnloop

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         eax, [original_ebp]     ; eax -> saved ebp slot (for args)
    lea         esi, [workspace]                   ; JCOEF *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]
    mov         ecx, DCTSIZE/4                     ; ctr
    alignx      16, 7
.rowloop:

    ; -- Even part

    ; Workspace values are already dequantized, so no pmullw here.
    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

    movq        mm4, mm0
    movq        mm5, mm1
    psubw       mm0, mm2                ; mm0=tmp11
    psubw       mm1, mm3
    paddw       mm4, mm2                ; mm4=tmp10
    paddw       mm5, mm3                ; mm5=tmp13

    psllw       mm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm1, [GOTOFF(ebx,PW_F1414)]
    psubw       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    psubw       mm4, mm5                ; mm4=tmp3
    psubw       mm0, mm1                ; mm0=tmp2
    paddw       mm6, mm5                ; mm6=tmp0
    paddw       mm7, mm1                ; mm7=tmp1

    movq        MMWORD [wk(1)], mm4     ; wk(1)=tmp3
    movq        MMWORD [wk(0)], mm0     ; wk(0)=tmp2

    ; -- Odd part

    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

    movq        mm4, mm2
    movq        mm0, mm5
    psubw       mm2, mm1                ; mm2=z12
    psubw       mm5, mm3                ; mm5=z10
    paddw       mm4, mm1                ; mm4=z11
    paddw       mm0, mm3                ; mm0=z13

    movq        mm1, mm5                ; mm1=z10(unscaled)
    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
    psllw       mm5, PRE_MULTIPLY_SCALE_BITS

    movq        mm3, mm4
    psubw       mm4, mm0
    paddw       mm3, mm0                ; mm3=tmp7

    psllw       mm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      mm4, [GOTOFF(ebx,PW_F1414)]  ; mm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movq        mm0, mm5
    paddw       mm5, mm2
    pmulhw      mm5, [GOTOFF(ebx,PW_F1847)]   ; mm5=z5
    pmulhw      mm0, [GOTOFF(ebx,PW_MF1613)]
    pmulhw      mm2, [GOTOFF(ebx,PW_F1082)]
    psubw       mm0, mm1
    psubw       mm2, mm5                ; mm2=tmp10
    paddw       mm0, mm5                ; mm0=tmp12

    ; -- Final output stage

    psubw       mm0, mm3                ; mm0=tmp6
    movq        mm1, mm6
    movq        mm5, mm7
    paddw       mm6, mm3                ; mm6=data0=(00 10 20 30)
    paddw       mm7, mm0                ; mm7=data1=(01 11 21 31)
    psraw       mm6, (PASS1_BITS+3)     ; descale
    psraw       mm7, (PASS1_BITS+3)     ; descale
    psubw       mm1, mm3                ; mm1=data7=(07 17 27 37)
    psubw       mm5, mm0                ; mm5=data6=(06 16 26 36)
    psraw       mm1, (PASS1_BITS+3)     ; descale
    psraw       mm5, (PASS1_BITS+3)     ; descale
    psubw       mm4, mm0                ; mm4=tmp5

    ; packsswb saturates each word to a signed byte.
    packsswb    mm6, mm5                ; mm6=(00 10 20 30 06 16 26 36)
    packsswb    mm7, mm1                ; mm7=(01 11 21 31 07 17 27 37)

    movq        mm3, MMWORD [wk(0)]     ; mm3=tmp2
    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp3

    paddw       mm2, mm4                ; mm2=tmp4
    movq        mm5, mm3
    movq        mm1, mm0
    paddw       mm3, mm4                ; mm3=data2=(02 12 22 32)
    paddw       mm0, mm2                ; mm0=data4=(04 14 24 34)
    psraw       mm3, (PASS1_BITS+3)     ; descale
    psraw       mm0, (PASS1_BITS+3)     ; descale
    psubw       mm5, mm4                ; mm5=data5=(05 15 25 35)
    psubw       mm1, mm2                ; mm1=data3=(03 13 23 33)
    psraw       mm5, (PASS1_BITS+3)     ; descale
    psraw       mm1, (PASS1_BITS+3)     ; descale

    movq        mm4, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm4=[PB_CENTERJSAMP]

    packsswb    mm3, mm0                ; mm3=(02 12 22 32 04 14 24 34)
    packsswb    mm1, mm5                ; mm1=(03 13 23 33 05 15 25 35)

    ; Re-center: add CENTERJSAMPLE to every (signed, saturated) byte.
    paddb       mm6, mm4
    paddb       mm7, mm4
    paddb       mm3, mm4
    paddb       mm1, mm4

    movq        mm2, mm6                ; transpose coefficients(phase 1)
    punpcklbw   mm6, mm7                ; mm6=(00 01 10 11 20 21 30 31)
    punpckhbw   mm2, mm7                ; mm2=(06 07 16 17 26 27 36 37)
    movq        mm0, mm3                ; transpose coefficients(phase 1)
    punpcklbw   mm3, mm1                ; mm3=(02 03 12 13 22 23 32 33)
    punpckhbw   mm0, mm1                ; mm0=(04 05 14 15 24 25 34 35)

    movq        mm5, mm6                ; transpose coefficients(phase 2)
    punpcklwd   mm6, mm3                ; mm6=(00 01 02 03 10 11 12 13)
    punpckhwd   mm5, mm3                ; mm5=(20 21 22 23 30 31 32 33)
    movq        mm4, mm0                ; transpose coefficients(phase 2)
    punpcklwd   mm0, mm2                ; mm0=(04 05 06 07 14 15 16 17)
    punpckhwd   mm4, mm2                ; mm4=(24 25 26 27 34 35 36 37)

    movq        mm7, mm6                ; transpose coefficients(phase 3)
    punpckldq   mm6, mm0                ; mm6=(00 01 02 03 04 05 06 07)
    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13 14 15 16 17)
    movq        mm1, mm5                ; transpose coefficients(phase 3)
    punpckldq   mm5, mm4                ; mm5=(20 21 22 23 24 25 26 27)
    punpckhdq   mm1, mm4                ; mm1=(30 31 32 33 34 35 36 37)

    ; ebx is borrowed as a second row pointer for the stores below.
    pushpic     ebx                     ; save GOT address

    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
    mov         edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1

    poppic      ebx                     ; restore GOT address

    add         esi, byte 4*SIZEOF_JCOEF     ; wsptr
    add         edi, byte 4*SIZEOF_JSAMPROW
    dec         ecx                          ; ctr
    jnz         near .rowloop

    emms                                ; empty MMX state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32