;
; jidctfst.asm - fast integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and can *not*
; be assembled with Microsoft's MASM or any compatible assembler (including
; Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see jidctfst.c for
; more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

%define CONST_BITS  8  ; 14 is also OK.
%define PASS1_BITS  2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

%if CONST_BITS == 8
F_1_082 equ 277              ; FIX(1.082392200)
F_1_414 equ 362              ; FIX(1.414213562)
F_1_847 equ 473              ; FIX(1.847759065)
F_2_613 equ 669              ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS))         ; FIX(2.613125930) - FIX(1)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

    alignz      32
    GLOBAL_DATA(jconst_idct_ifast_sse2)

EXTN(jconst_idct_ifast_sse2):

PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Perform dequantization and inverse DCT on one block of coefficients.
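;
; A note on the fixed-point scheme (an editorial sketch, not part of the
; original source): every multiplication below uses pmulhw, which keeps only
; the high 16 bits of a signed 16x16-bit product, i.e. (a * b) >> 16.
; Operands are pre-shifted left by PRE_MULTIPLY_SCALE_BITS and the constants
; are stored pre-shifted left by CONST_SHIFT, so each pmulhw behaves like
; this hypothetical scalar helper in C (fixmul is not a real function):
;
;   /* ((x << 2) * (c << 6)) >> 16 == (x * c) >> 8 == x * c / 2^CONST_BITS */
;   static int16_t fixmul(int16_t x, int16_t c)
;   {
;     return (int16_t)((((int32_t)x << PRE_MULTIPLY_SCALE_BITS) *
;                       (c << CONST_SHIFT)) >> 16);
;   }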
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;

; r10 = void *dct_table
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col

%define original_rbp  rbp + 0
%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                                        ; xmmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)

EXTN(jsimd_idct_ifast_sse2):
    push        rbp
    mov         rax, rsp                ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 4

    ; ---- Pass 1: process columns from input.

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm0
    packsswb    xmm1, xmm1
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)

    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
    jmp         near .column_end
%endif
.columnDCT:

    ; -- Even part

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    psubw       xmm0, xmm2              ; xmm0=tmp11
    psubw       xmm1, xmm3
    paddw       xmm4, xmm2              ; xmm4=tmp10
    paddw       xmm5, xmm3              ; xmm5=tmp13

    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm1, [rel PW_F1414]
    psubw       xmm1, xmm5              ; xmm1=tmp12

    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm0
    psubw       xmm4, xmm5              ; xmm4=tmp3
    psubw       xmm0, xmm1              ; xmm0=tmp2
    paddw       xmm6, xmm5              ; xmm6=tmp0
    paddw       xmm7, xmm1              ; xmm7=tmp1

    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
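
    ; For reference: the even part above and the odd part below implement
    ; roughly the following scalar dataflow (an editorial sketch following
    ; jidctfst.c and the register comments; in0..in7 are the dequantized
    ; rows, MULTIPLY is the pmulhw fixed-point multiply described in the
    ; header comment):
    ;
    ;   /* even part */
    ;   tmp10 = in0 + in4;  tmp11 = in0 - in4;
    ;   tmp13 = in2 + in6;
    ;   tmp12 = MULTIPLY(in2 - in6, FIX_1_414213562) - tmp13;
    ;   tmp0 = tmp10 + tmp13;  tmp3 = tmp10 - tmp13;
    ;   tmp1 = tmp11 + tmp12;  tmp2 = tmp11 - tmp12;
    ;
    ;   /* odd part */
    ;   z13 = in5 + in3;  z10 = in5 - in3;
    ;   z11 = in1 + in7;  z12 = in1 - in7;
    ;   tmp7 = z11 + z13;
    ;   tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
    ;   z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
    ;   tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
    ;   tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5;
    ;   tmp6 = tmp12 - tmp7;
    ;   tmp5 = tmp11 - tmp6;
    ;   tmp4 = tmp10 + tmp5;
    ;
    ;   /* final output stage */
    ;   data0 = tmp0 + tmp7;  data7 = tmp0 - tmp7;
    ;   data1 = tmp1 + tmp6;  data6 = tmp1 - tmp6;
    ;   data2 = tmp2 + tmp5;  data5 = tmp2 - tmp5;
    ;   data4 = tmp3 + tmp4;  data3 = tmp3 - tmp4;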

    ; -- Odd part

    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm2
    movdqa      xmm0, xmm5
    psubw       xmm2, xmm1              ; xmm2=z12
    psubw       xmm5, xmm3              ; xmm5=z10
    paddw       xmm4, xmm1              ; xmm4=z11
    paddw       xmm0, xmm3              ; xmm0=z13

    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm3, xmm4
    psubw       xmm4, xmm0
    paddw       xmm3, xmm0              ; xmm3=tmp7

    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm4, [rel PW_F1414]    ; xmm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movdqa      xmm0, xmm5
    paddw       xmm5, xmm2
    pmulhw      xmm5, [rel PW_F1847]    ; xmm5=z5
    pmulhw      xmm0, [rel PW_MF1613]
    pmulhw      xmm2, [rel PW_F1082]
    psubw       xmm0, xmm1
    psubw       xmm2, xmm5              ; xmm2=tmp10
    paddw       xmm0, xmm5              ; xmm0=tmp12

    ; -- Final output stage

    psubw       xmm0, xmm3              ; xmm0=tmp6
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm7
    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
    psubw       xmm4, xmm0              ; xmm4=tmp5
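
    ; The eight data rows are now transposed back into columns with three
    ; rounds of interleaving (an editorial sketch of the pattern for two
    ; rows a and b):
    ;
    ;   phase 1: punpcklwd a, b   ; (a0 b0 a1 b1 a2 b2 a3 b3) - 16-bit pairs
    ;   phase 2: punpckldq ...    ; gathers 32-bit pairs of pairs
    ;   phase 3: punpcklqdq ...   ; gathers 64-bit halves into full columns
    ;
    ; wk(0)/wk(1) serve as spill slots, since all eight xmm registers are
    ; live during the transpose.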

    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

    paddw       xmm2, xmm4              ; xmm2=tmp4
    movdqa      xmm5, xmm7
    movdqa      xmm0, xmm1
    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)

    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)

    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)

    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)

    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)

    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3

    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

    ; -- Prefetch the next coefficient block

    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         rax, [original_rbp]
    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d
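
    ; Pass 2 repeats the same butterfly on the transposed columns; only its
    ; ending differs.  Each output word is descaled with an arithmetic right
    ; shift by PASS1_BITS+3, saturated to [-128, 127] with packsswb, and
    ; recentered with paddb.  Per sample, roughly (an editorial sketch;
    ; jidctfst.c uses a range_limit table for the equivalent clamp):
    ;
    ;   sample = clamp(val >> (PASS1_BITS+3), -128, 127) + CENTERJSAMPLE;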

    ; -- Even part

    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

    movdqa      xmm2, xmm6
    movdqa      xmm0, xmm5
    psubw       xmm6, xmm1              ; xmm6=tmp11
    psubw       xmm5, xmm3
    paddw       xmm2, xmm1              ; xmm2=tmp10
    paddw       xmm0, xmm3              ; xmm0=tmp13

    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm5, [rel PW_F1414]
    psubw       xmm5, xmm0              ; xmm5=tmp12

    movdqa      xmm1, xmm2
    movdqa      xmm3, xmm6
    psubw       xmm2, xmm0              ; xmm2=tmp3
    psubw       xmm6, xmm5              ; xmm6=tmp2
    paddw       xmm1, xmm0              ; xmm1=tmp0
    paddw       xmm3, xmm5              ; xmm3=tmp1

    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

    ; -- Odd part

    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

    movdqa      xmm2, xmm0
    movdqa      xmm6, xmm4
    psubw       xmm0, xmm7              ; xmm0=z12
    psubw       xmm4, xmm5              ; xmm4=z10
    paddw       xmm2, xmm7              ; xmm2=z11
    paddw       xmm6, xmm5              ; xmm6=z13

    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm5, xmm2
    psubw       xmm2, xmm6
    paddw       xmm5, xmm6              ; xmm5=tmp7

    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm2, [rel PW_F1414]    ; xmm2=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movdqa      xmm6, xmm4
    paddw       xmm4, xmm0
    pmulhw      xmm4, [rel PW_F1847]    ; xmm4=z5
    pmulhw      xmm6, [rel PW_MF1613]
    pmulhw      xmm0, [rel PW_F1082]
    psubw       xmm6, xmm7
    psubw       xmm0, xmm4              ; xmm0=tmp10
    paddw       xmm6, xmm4              ; xmm6=tmp12

    ; -- Final output stage

    psubw       xmm6, xmm5              ; xmm6=tmp6
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm3
    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
    psraw       xmm1, (PASS1_BITS+3)    ; descale
    psraw       xmm3, (PASS1_BITS+3)    ; descale
    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
    psraw       xmm7, (PASS1_BITS+3)    ; descale
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psubw       xmm2, xmm6              ; xmm2=tmp5

    packsswb    xmm1, xmm4              ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    packsswb    xmm3, xmm7              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

    paddw       xmm0, xmm2              ; xmm0=tmp4
    movdqa      xmm4, xmm5
    movdqa      xmm7, xmm6
    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
    psraw       xmm5, (PASS1_BITS+3)    ; descale
    psraw       xmm6, (PASS1_BITS+3)    ; descale
    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psraw       xmm7, (PASS1_BITS+3)    ; descale

    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

    packsswb    xmm5, xmm6              ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
    packsswb    xmm7, xmm4              ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

    paddb       xmm1, xmm2
    paddb       xmm3, xmm2
    paddb       xmm5, xmm2
    paddb       xmm7, xmm2

    movdqa      xmm0, xmm1              ; transpose coefficients(phase 1)
    punpcklbw   xmm1, xmm3              ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
    punpckhbw   xmm0, xmm3              ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
    punpcklbw   xmm5, xmm7              ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
    punpckhbw   xmm6, xmm7              ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

    movdqa      xmm4, xmm1              ; transpose coefficients(phase 2)
    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
    movdqa      xmm2, xmm6              ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm0              ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    punpckhwd   xmm2, xmm0              ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

    movdqa      xmm3, xmm1              ; transpose coefficients(phase 3)
    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm3, xmm6              ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    movdqa      xmm7, xmm4              ; transpose coefficients(phase 3)
    punpckldq   xmm4, xmm2              ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    punpckhdq   xmm7, xmm2              ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

    pshufd      xmm5, xmm1, 0x4E        ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm0, xmm3, 0x4E        ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    pshufd      xmm6, xmm4, 0x4E        ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
    pshufd      xmm2, xmm7, 0x4E        ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
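
    ; pshufd with 0x4E swaps the two 64-bit halves of a register: the odd
    ; rows (1, 3, 5, 7), which the byte transpose left in the upper qwords,
    ; move to the lower qwords so that every row can be written with the
    ; same 8-byte movq store below.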

    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7

    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2

    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32