;
; jidctfst.asm - fast integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

; CONST_BITS is the number of fraction bits in the F_x_xxx fixed-point
; multipliers below; PASS1_BITS is the extra scaling that is removed again by
; the final "psraw ..., (PASS1_BITS+3)" descale in pass 2.

%define CONST_BITS  8                   ; 14 is also OK.
%define PASS1_BITS  2

; The dequantization table is expected to be pre-scaled by IFAST_SCALE_BITS
; (defined outside this file), so the two values have to agree.

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

; Fixed-point multipliers, i.e. round(value * 2^CONST_BITS).

%if CONST_BITS == 8
F_1_082 equ 277                         ; FIX(1.082392200)
F_1_414 equ 362                         ; FIX(1.414213562)
F_1_847 equ 473                         ; FIX(1.847759065)
F_2_613 equ 669                         ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256)             ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same multipliers scaled by 2^30; DESCALE() shifts
; them right by n bits with rounding to nearest.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS))  ; FIX(2.613125930) - FIX(1)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps the high 16 bits of each signed 16x16-bit product.  The code
; shifts the data operand left by PRE_MULTIPLY_SCALE_BITS and the table below
; stores each multiplier shifted left by CONST_SHIFT, so the high word of the
; product equals (data * F_x_xxx) >> CONST_BITS.
;
; With CONST_BITS == 8, CONST_SHIFT is 6 and F_2_613 << 6 (= 42816) would not
; fit in a signed 16-bit word; that is why the table holds -F_1_613 instead
; and the code subtracts the remaining 1.0 * z10 separately.

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

    alignz      32
    GLOBAL_DATA(jconst_idct_ifast_sse2)

; Constant table: four vectors of eight identical 16-bit multipliers followed
; by sixteen bytes of CENTERJSAMPLE (added to the packed output samples).

EXTN(jconst_idct_ifast_sse2):

PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Dequantizes one block of DCT coefficients (coef_block * dct_table), runs
; the two-pass fast integer IDCT on it and stores eight rows of eight samples
; at output_buf[row] + output_col.
;
; Arguments are read from the stack relative to the frame pointer captured
; right after "push ebp" (see the *(b) offset macros below); the routine
; returns with a plain "ret".
;
; Register usage:
;   eax        frame pointer on entry, scratch for the zero-AC test in
;              pass 1, output_col in pass 2
;   ebx        base register for GOTOFF() constant addressing (get_GOT)
;   edx        quantptr (dct_table) in pass 1, output row pointer in pass 2
;   esi        inptr (coef_block) in pass 1, output row pointer in pass 2
;   edi        output_buf in pass 2
;   xmm0-xmm7  working data, together with the two-entry wk[] stack array
;
; ebx (through pushpic/poppic), esi, edi and ebp are saved and restored;
; eax, edx and xmm0-xmm7 are clobbered; ecx is not used.
;
; NOTE(review): coef_block and dct_table are read with movdqa / SSE2 memory
; operands, so they presumably must be SIZEOF_XMMWORD-aligned (assuming
; XMMBLOCK() yields XMMWORD-multiple offsets) - confirm against jsimdext.inc.
; The output rows are written with 8-byte movq stores.
;

%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Stack frame after the prologue (ebp is aligned to SIZEOF_XMMWORD):
;   [ebp + 0]  unaligned frame pointer (address of the caller's saved ebp)
;   wk(0..1)   xmmword scratch slots directly below ebp

%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)

EXTN(jsimd_idct_ifast_sse2):
    push        ebp
    mov         eax, esp                     ; eax = original ebp
    sub         esp, byte 4
    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [esp], eax                   ; keep it reachable as [original_ebp]
    mov         ebp, esp                     ; ebp = aligned ebp
    lea         esp, [wk(0)]                 ; reserve wk[WK_NUM] below ebp
    pushpic     ebx
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input.

    ; eax still holds the unaligned frame pointer from the prologue, which is
    ; why the reload below is commented out.
    ; NOTE(review): this relies on pushpic/get_GOT not touching eax - confirm
    ; against jsimdext.inc.

;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
    ; Cheap early-out: if the first dword of row 1 or row 2 is non-zero there
    ; is no point in testing the whole block.
    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    ; OR rows 1..7 together; the two saturating packs squeeze the eight words
    ; into the low dword so that eax != 0 iff any AC coefficient is non-zero.
    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    por         xmm1, xmm0
    packsswb    xmm1, xmm1
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        eax, eax
    jnz         short .columnDCT

    ; -- AC terms all zero
    ;
    ; Only row 0 contributes, so every column of the pass-1 result is the
    ; dequantized row-0 value of that column repeated eight times.  The values
    ; are left in the same registers / wk slots that the full column DCT
    ; produces at .column_end.

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)

    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
    jmp         near .column_end
    alignx      16, 7
%endif
.columnDCT:

    ; -- Even part

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    psubw       xmm0, xmm2              ; xmm0=tmp11
    psubw       xmm1, xmm3
    paddw       xmm4, xmm2              ; xmm4=tmp10
    paddw       xmm5, xmm3              ; xmm5=tmp13

    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm1, [GOTOFF(ebx,PW_F1414)]
    psubw       xmm1, xmm5              ; xmm1=tmp12

    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm0
    psubw       xmm4, xmm5              ; xmm4=tmp3
    psubw       xmm0, xmm1              ; xmm0=tmp2
    paddw       xmm6, xmm5              ; xmm6=tmp0
    paddw       xmm7, xmm1              ; xmm7=tmp1

    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2

    ; -- Odd part

    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm2
    movdqa      xmm0, xmm5
    psubw       xmm2, xmm1              ; xmm2=z12
    psubw       xmm5, xmm3              ; xmm5=z10
    paddw       xmm4, xmm1              ; xmm4=z11
    paddw       xmm0, xmm3              ; xmm0=z13

    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm3, xmm4
    psubw       xmm4, xmm0
    paddw       xmm3, xmm0              ; xmm3=tmp7

    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm4, [GOTOFF(ebx,PW_F1414)]  ; xmm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movdqa      xmm0, xmm5
    paddw       xmm5, xmm2
    pmulhw      xmm5, [GOTOFF(ebx,PW_F1847)]   ; xmm5=z5
    pmulhw      xmm0, [GOTOFF(ebx,PW_MF1613)]
    pmulhw      xmm2, [GOTOFF(ebx,PW_F1082)]
    psubw       xmm0, xmm1
    psubw       xmm2, xmm5              ; xmm2=tmp10
    paddw       xmm0, xmm5              ; xmm0=tmp12

    ; -- Final output stage

    psubw       xmm0, xmm3              ; xmm0=tmp6
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm7
    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
    psubw       xmm4, xmm0              ; xmm4=tmp5

    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

    paddw       xmm2, xmm4              ; xmm2=tmp4
    movdqa      xmm5, xmm7
    movdqa      xmm0, xmm1
    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)

    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)

    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)

    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)

    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)

    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3

    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

    ; Both pass-1 paths arrive here with:
    ;   xmm6=col0 wk(0)=col1 xmm5=col2 wk(1)=col3
    ;   xmm1=col4 xmm4=col5  xmm3=col6 xmm7=col7

    ; -- Prefetch the next coefficient block

    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         eax, [original_ebp]
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]

    ; -- Even part

    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

    movdqa      xmm2, xmm6
    movdqa      xmm0, xmm5
    psubw       xmm6, xmm1              ; xmm6=tmp11
    psubw       xmm5, xmm3
    paddw       xmm2, xmm1              ; xmm2=tmp10
    paddw       xmm0, xmm3              ; xmm0=tmp13

    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm5, [GOTOFF(ebx,PW_F1414)]
    psubw       xmm5, xmm0              ; xmm5=tmp12

    movdqa      xmm1, xmm2
    movdqa      xmm3, xmm6
    psubw       xmm2, xmm0              ; xmm2=tmp3
    psubw       xmm6, xmm5              ; xmm6=tmp2
    paddw       xmm1, xmm0              ; xmm1=tmp0
    paddw       xmm3, xmm5              ; xmm3=tmp1

    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

    ; -- Odd part

    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

    movdqa      xmm2, xmm0
    movdqa      xmm6, xmm4
    psubw       xmm0, xmm7              ; xmm0=z12
    psubw       xmm4, xmm5              ; xmm4=z10
    paddw       xmm2, xmm7              ; xmm2=z11
    paddw       xmm6, xmm5              ; xmm6=z13

    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm5, xmm2
    psubw       xmm2, xmm6
    paddw       xmm5, xmm6              ; xmm5=tmp7

    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm2, [GOTOFF(ebx,PW_F1414)]  ; xmm2=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movdqa      xmm6, xmm4
    paddw       xmm4, xmm0
    pmulhw      xmm4, [GOTOFF(ebx,PW_F1847)]   ; xmm4=z5
    pmulhw      xmm6, [GOTOFF(ebx,PW_MF1613)]
    pmulhw      xmm0, [GOTOFF(ebx,PW_F1082)]
    psubw       xmm6, xmm7
    psubw       xmm0, xmm4              ; xmm0=tmp10
    paddw       xmm6, xmm4              ; xmm6=tmp12

    ; -- Final output stage
    ;
    ; Each result is descaled with an arithmetic shift, packed to signed
    ; bytes with saturation and then biased by CENTERJSAMPLE (paddb).

    psubw       xmm6, xmm5              ; xmm6=tmp6
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm3
    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
    psraw       xmm1, (PASS1_BITS+3)    ; descale
    psraw       xmm3, (PASS1_BITS+3)    ; descale
    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
    psraw       xmm7, (PASS1_BITS+3)    ; descale
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psubw       xmm2, xmm6              ; xmm2=tmp5

    packsswb    xmm1, xmm4        ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    packsswb    xmm3, xmm7        ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

    paddw       xmm0, xmm2              ; xmm0=tmp4
    movdqa      xmm4, xmm5
    movdqa      xmm7, xmm6
    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
    psraw       xmm5, (PASS1_BITS+3)    ; descale
    psraw       xmm6, (PASS1_BITS+3)    ; descale
    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psraw       xmm7, (PASS1_BITS+3)    ; descale

    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]

    packsswb    xmm5, xmm6        ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
    packsswb    xmm7, xmm4        ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

    paddb       xmm1, xmm2
    paddb       xmm3, xmm2
    paddb       xmm5, xmm2
    paddb       xmm7, xmm2

    movdqa      xmm0, xmm1        ; transpose coefficients(phase 1)
    punpcklbw   xmm1, xmm3        ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
    punpckhbw   xmm0, xmm3        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
    movdqa      xmm6, xmm5        ; transpose coefficients(phase 1)
    punpcklbw   xmm5, xmm7        ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
    punpckhbw   xmm6, xmm7        ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

    movdqa      xmm4, xmm1        ; transpose coefficients(phase 2)
    punpcklwd   xmm1, xmm5        ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm5        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
    movdqa      xmm2, xmm6        ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm0        ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    punpckhwd   xmm2, xmm0        ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

    movdqa      xmm3, xmm1        ; transpose coefficients(phase 3)
    punpckldq   xmm1, xmm6        ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm3, xmm6        ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    movdqa      xmm7, xmm4        ; transpose coefficients(phase 3)
    punpckldq   xmm4, xmm2        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    punpckhdq   xmm7, xmm2        ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

    ; Swap the 64-bit halves so that the odd rows end up in the low quadword,
    ; where movq can store them.

    pshufd      xmm5, xmm1, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm0, xmm3, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
    pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

    ; Store even rows 0/2/4/6, then odd rows 1/3/5/7, eight samples each.

    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
    mov         edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7

    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
    mov         edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
    mov         esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
    poppic      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- original ebp
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32