;
; jidctfst.asm - fast integer IDCT (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform).  The following code is
; based directly on the IJG's original jidctfst.c; see jidctfst.c
; for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------

%define CONST_BITS  8  ; 14 is also OK.
%define PASS1_BITS  2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

%if CONST_BITS == 8
F_1_082 equ 277              ; FIX(1.082392200)
F_1_414 equ 362              ; FIX(1.414213562)
F_1_847 equ 473              ; FIX(1.847759065)
F_2_613 equ 669              ; FIX(2.613125930)
F_1_613 equ (F_2_613 - 256)  ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
F_1_082 equ DESCALE(1162209775, 30 - CONST_BITS)  ; FIX(1.082392200)
F_1_414 equ DESCALE(1518500249, 30 - CONST_BITS)  ; FIX(1.414213562)
F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
F_2_613 equ DESCALE(2805822602, 30 - CONST_BITS)  ; FIX(2.613125930)
F_1_613 equ (F_2_613 - (1 << CONST_BITS))         ; FIX(2.613125930) - FIX(1)
%endif

; --------------------------------------------------------------------------
    SECTION     SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)

%define PRE_MULTIPLY_SCALE_BITS  2
%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

    alignz      32
    GLOBAL_DATA(jconst_idct_ifast_sse2)

EXTN(jconst_idct_ifast_sse2):

PW_F1414       times 8  dw  F_1_414 << CONST_SHIFT
PW_F1847       times 8  dw  F_1_847 << CONST_SHIFT
PW_MF1613      times 8  dw -F_1_613 << CONST_SHIFT
PW_F1082       times 8  dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP times 16 db  CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
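; --------------------------------------------------------------------------
; A note on the fixed-point scheme above (an illustrative sketch, not part
; of the original IJG commentary):  FIX(x) denotes round(x * 2^CONST_BITS);
; e.g. with CONST_BITS == 8, FIX(1.414213562) = round(362.04) = 362.
; pmulhw returns the high 16 bits of the signed 32-bit product, i.e.
; (a * b) >> 16.  Because each input is pre-shifted left by
; PRE_MULTIPLY_SCALE_BITS (2) and each constant is stored shifted left by
; CONST_SHIFT (6), a multiply works out to
;
;   ((x << 2) * (F << 6)) >> 16  =  (x * F) >> 8  =  (x * F) >> CONST_BITS
;
; i.e. the fixed-point product at the input's original scale; hence the
; requirement CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16.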
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2(void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;

; r10 = jpeg_component_info *compptr
; r11 = JCOEFPTR coef_block
; r12 = JSAMPARRAY output_buf
; r13d = JDIMENSION output_col

%define original_rbp  rbp + 0
%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
                      ; xmmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_idct_ifast_sse2)

EXTN(jsimd_idct_ifast_sse2):
    push        rbp
    mov         rax, rsp                     ; rax = original rbp
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]
    collect_args 4

    ; ---- Pass 1: process columns from input.

    mov         rdx, r10                ; quantptr
    mov         rsi, r11                ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    jnz         near .columnDCT

    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    por         xmm1, xmm0
    packsswb    xmm1, xmm1
    packsswb    xmm1, xmm1
    movd        eax, xmm1
    test        rax, rax
    jnz         short .columnDCT

    ; -- AC terms all zero

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm7, xmm0              ; xmm0=in0=(00 01 02 03 04 05 06 07)
    punpcklwd   xmm0, xmm0              ; xmm0=(00 00 01 01 02 02 03 03)
    punpckhwd   xmm7, xmm7              ; xmm7=(04 04 05 05 06 06 07 07)

    pshufd      xmm6, xmm0, 0x00        ; xmm6=col0=(00 00 00 00 00 00 00 00)
    pshufd      xmm2, xmm0, 0x55        ; xmm2=col1=(01 01 01 01 01 01 01 01)
    pshufd      xmm5, xmm0, 0xAA        ; xmm5=col2=(02 02 02 02 02 02 02 02)
    pshufd      xmm0, xmm0, 0xFF        ; xmm0=col3=(03 03 03 03 03 03 03 03)
    pshufd      xmm1, xmm7, 0x00        ; xmm1=col4=(04 04 04 04 04 04 04 04)
    pshufd      xmm4, xmm7, 0x55        ; xmm4=col5=(05 05 05 05 05 05 05 05)
    pshufd      xmm3, xmm7, 0xAA        ; xmm3=col6=(06 06 06 06 06 06 06 06)
    pshufd      xmm7, xmm7, 0xFF        ; xmm7=col7=(07 07 07 07 07 07 07 07)

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=col3
    jmp         near .column_end
%endif
.columnDCT:

    ; -- Even part

    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    psubw       xmm0, xmm2              ; xmm0=tmp11
    psubw       xmm1, xmm3
    paddw       xmm4, xmm2              ; xmm4=tmp10
    paddw       xmm5, xmm3              ; xmm5=tmp13

    psllw       xmm1, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm1, [rel PW_F1414]
    psubw       xmm1, xmm5              ; xmm1=tmp12

    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm0
    psubw       xmm4, xmm5              ; xmm4=tmp3
    psubw       xmm0, xmm1              ; xmm0=tmp2
    paddw       xmm6, xmm5              ; xmm6=tmp0
    paddw       xmm7, xmm1              ; xmm7=tmp1

    movdqa      XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2
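    ; The even part above mirrors this scalar code from IJG's jidctfst.c
    ; (a paraphrased sketch; in0/in2/in4/in6 stand for the dequantized
    ; coefficients of rows 0, 2, 4, and 6):
    ;
    ;   tmp10 = in0 + in4;
    ;   tmp11 = in0 - in4;
    ;   tmp13 = in2 + in6;
    ;   tmp12 = MULTIPLY(in2 - in6, FIX_1_414213562) - tmp13;
    ;   tmp0 = tmp10 + tmp13;  tmp3 = tmp10 - tmp13;
    ;   tmp1 = tmp11 + tmp12;  tmp2 = tmp11 - tmp12;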
    ; -- Odd part

    movdqa      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    pmullw      xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_IFAST_MULT_TYPE)]
    pmullw      xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_IFAST_MULT_TYPE)]

    movdqa      xmm4, xmm2
    movdqa      xmm0, xmm5
    psubw       xmm2, xmm1              ; xmm2=z12
    psubw       xmm5, xmm3              ; xmm5=z10
    paddw       xmm4, xmm1              ; xmm4=z11
    paddw       xmm0, xmm3              ; xmm0=z13

    movdqa      xmm1, xmm5              ; xmm1=z10(unscaled)
    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm3, xmm4
    psubw       xmm4, xmm0
    paddw       xmm3, xmm0              ; xmm3=tmp7

    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm4, [rel PW_F1414]    ; xmm4=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;

    movdqa      xmm0, xmm5
    paddw       xmm5, xmm2
    pmulhw      xmm5, [rel PW_F1847]    ; xmm5=z5
    pmulhw      xmm0, [rel PW_MF1613]
    pmulhw      xmm2, [rel PW_F1082]
    psubw       xmm0, xmm1
    psubw       xmm2, xmm5              ; xmm2=tmp10
    paddw       xmm0, xmm5              ; xmm0=tmp12

    ; -- Final output stage

    psubw       xmm0, xmm3              ; xmm0=tmp6
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm7
    paddw       xmm6, xmm3              ; xmm6=data0=(00 01 02 03 04 05 06 07)
    paddw       xmm7, xmm0              ; xmm7=data1=(10 11 12 13 14 15 16 17)
    psubw       xmm1, xmm3              ; xmm1=data7=(70 71 72 73 74 75 76 77)
    psubw       xmm5, xmm0              ; xmm5=data6=(60 61 62 63 64 65 66 67)
    psubw       xmm4, xmm0              ; xmm4=tmp5
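    ; The odd part and the output butterfly correspond to this scalar code,
    ; paraphrased from IJG's jidctfst.c (in the asm the butterfly is
    ; interleaved with the transpose below, where data2..data5 are formed):
    ;
    ;   z13 = in5 + in3;  z10 = in5 - in3;
    ;   z11 = in1 + in7;  z12 = in1 - in7;
    ;   tmp7 = z11 + z13;
    ;   tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
    ;   z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
    ;   tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
    ;   tmp12 = MULTIPLY(z10, -FIX_2_613125930) + z5;
    ;   tmp6 = tmp12 - tmp7;  tmp5 = tmp11 - tmp6;  tmp4 = tmp10 + tmp5;
    ;   data0 = tmp0 + tmp7;  data7 = tmp0 - tmp7;
    ;   data1 = tmp1 + tmp6;  data6 = tmp1 - tmp6;
    ;   data2 = tmp2 + tmp5;  data5 = tmp2 - tmp5;
    ;   data4 = tmp3 + tmp4;  data3 = tmp3 - tmp4;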
    movdqa      xmm3, xmm6              ; transpose coefficients(phase 1)
    punpcklwd   xmm6, xmm7              ; xmm6=(00 10 01 11 02 12 03 13)
    punpckhwd   xmm3, xmm7              ; xmm3=(04 14 05 15 06 16 07 17)
    movdqa      xmm0, xmm5              ; transpose coefficients(phase 1)
    punpcklwd   xmm5, xmm1              ; xmm5=(60 70 61 71 62 72 63 73)
    punpckhwd   xmm0, xmm1              ; xmm0=(64 74 65 75 66 76 67 77)

    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

    paddw       xmm2, xmm4              ; xmm2=tmp4
    movdqa      xmm5, xmm7
    movdqa      xmm0, xmm1
    paddw       xmm7, xmm4              ; xmm7=data2=(20 21 22 23 24 25 26 27)
    paddw       xmm1, xmm2              ; xmm1=data4=(40 41 42 43 44 45 46 47)
    psubw       xmm5, xmm4              ; xmm5=data5=(50 51 52 53 54 55 56 57)
    psubw       xmm0, xmm2              ; xmm0=data3=(30 31 32 33 34 35 36 37)

    movdqa      xmm4, xmm7              ; transpose coefficients(phase 1)
    punpcklwd   xmm7, xmm0              ; xmm7=(20 30 21 31 22 32 23 33)
    punpckhwd   xmm4, xmm0              ; xmm4=(24 34 25 35 26 36 27 37)
    movdqa      xmm2, xmm1              ; transpose coefficients(phase 1)
    punpcklwd   xmm1, xmm5              ; xmm1=(40 50 41 51 42 52 43 53)
    punpckhwd   xmm2, xmm5              ; xmm2=(44 54 45 55 46 56 47 57)

    movdqa      xmm0, xmm3              ; transpose coefficients(phase 2)
    punpckldq   xmm3, xmm4              ; xmm3=(04 14 24 34 05 15 25 35)
    punpckhdq   xmm0, xmm4              ; xmm0=(06 16 26 36 07 17 27 37)
    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
    punpckldq   xmm6, xmm7              ; xmm6=(00 10 20 30 01 11 21 31)
    punpckhdq   xmm5, xmm7              ; xmm5=(02 12 22 32 03 13 23 33)

    movdqa      xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

    movdqa      XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

    movdqa      xmm3, xmm1              ; transpose coefficients(phase 2)
    punpckldq   xmm1, xmm4              ; xmm1=(40 50 60 70 41 51 61 71)
    punpckhdq   xmm3, xmm4              ; xmm3=(42 52 62 72 43 53 63 73)
    movdqa      xmm0, xmm2              ; transpose coefficients(phase 2)
    punpckldq   xmm2, xmm7              ; xmm2=(44 54 64 74 45 55 65 75)
    punpckhdq   xmm0, xmm7              ; xmm0=(46 56 66 76 47 57 67 77)

    movdqa      xmm4, xmm6              ; transpose coefficients(phase 3)
    punpcklqdq  xmm6, xmm1              ; xmm6=col0=(00 10 20 30 40 50 60 70)
    punpckhqdq  xmm4, xmm1              ; xmm4=col1=(01 11 21 31 41 51 61 71)
    movdqa      xmm7, xmm5              ; transpose coefficients(phase 3)
    punpcklqdq  xmm5, xmm3              ; xmm5=col2=(02 12 22 32 42 52 62 72)
    punpckhqdq  xmm7, xmm3              ; xmm7=col3=(03 13 23 33 43 53 63 73)

    movdqa      xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
    movdqa      xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=col1
    movdqa      XMMWORD [wk(1)], xmm7   ; wk(1)=col3

    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
    punpcklqdq  xmm1, xmm2              ; xmm1=col4=(04 14 24 34 44 54 64 74)
    punpckhqdq  xmm4, xmm2              ; xmm4=col5=(05 15 25 35 45 55 65 75)
    movdqa      xmm7, xmm3              ; transpose coefficients(phase 3)
    punpcklqdq  xmm3, xmm0              ; xmm3=col6=(06 16 26 36 46 56 66 76)
    punpckhqdq  xmm7, xmm0              ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

    ; -- Prefetch the next coefficient block

    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    mov         rax, [original_rbp]
    mov         rdi, r12                ; (JSAMPROW *)
    mov         eax, r13d

    ; -- Even part

    ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

    movdqa      xmm2, xmm6
    movdqa      xmm0, xmm5
    psubw       xmm6, xmm1              ; xmm6=tmp11
    psubw       xmm5, xmm3
    paddw       xmm2, xmm1              ; xmm2=tmp10
    paddw       xmm0, xmm3              ; xmm0=tmp13

    psllw       xmm5, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm5, [rel PW_F1414]
    psubw       xmm5, xmm0              ; xmm5=tmp12

    movdqa      xmm1, xmm2
    movdqa      xmm3, xmm6
    psubw       xmm2, xmm0              ; xmm2=tmp3
    psubw       xmm6, xmm5              ; xmm6=tmp2
    paddw       xmm1, xmm0              ; xmm1=tmp0
    paddw       xmm3, xmm5              ; xmm3=tmp1

    movdqa      xmm0, XMMWORD [wk(0)]   ; xmm0=col1
    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=col3

    movdqa      XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

    ; -- Odd part

    ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

    movdqa      xmm2, xmm0
    movdqa      xmm6, xmm4
    psubw       xmm0, xmm7              ; xmm0=z12
    psubw       xmm4, xmm5              ; xmm4=z10
    paddw       xmm2, xmm7              ; xmm2=z11
    paddw       xmm6, xmm5              ; xmm6=z13

    movdqa      xmm7, xmm4              ; xmm7=z10(unscaled)
    psllw       xmm0, PRE_MULTIPLY_SCALE_BITS
    psllw       xmm4, PRE_MULTIPLY_SCALE_BITS

    movdqa      xmm5, xmm2
    psubw       xmm2, xmm6
    paddw       xmm5, xmm6              ; xmm5=tmp7

    psllw       xmm2, PRE_MULTIPLY_SCALE_BITS
    pmulhw      xmm2, [rel PW_F1414]    ; xmm2=tmp11

    ; To avoid overflow...
    ;
    ; (Original)
    ; tmp12 = -2.613125930 * z10 + z5;
    ;
    ; (This implementation)
    ; tmp12 = (-1.613125930 - 1) * z10 + z5;
    ;       = -1.613125930 * z10 - z10 + z5;
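    ; (Illustrative arithmetic, not from the original source: the direct
    ; constant would be F_2_613 << CONST_SHIFT = 669 << 6 = 42816, which
    ; does not fit in pmulhw's signed 16-bit operands (max 32767), whereas
    ; F_1_613 << CONST_SHIFT = 413 << 6 = 26432 does.  The missing 1 * z10
    ; term is restored by subtracting the unscaled z10 kept in xmm7.)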
    movdqa      xmm6, xmm4
    paddw       xmm4, xmm0
    pmulhw      xmm4, [rel PW_F1847]    ; xmm4=z5
    pmulhw      xmm6, [rel PW_MF1613]
    pmulhw      xmm0, [rel PW_F1082]
    psubw       xmm6, xmm7
    psubw       xmm0, xmm4              ; xmm0=tmp10
    paddw       xmm6, xmm4              ; xmm6=tmp12

    ; -- Final output stage

    psubw       xmm6, xmm5              ; xmm6=tmp6
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm3
    paddw       xmm1, xmm5              ; xmm1=data0=(00 10 20 30 40 50 60 70)
    paddw       xmm3, xmm6              ; xmm3=data1=(01 11 21 31 41 51 61 71)
    psraw       xmm1, (PASS1_BITS+3)    ; descale
    psraw       xmm3, (PASS1_BITS+3)    ; descale
    psubw       xmm7, xmm5              ; xmm7=data7=(07 17 27 37 47 57 67 77)
    psubw       xmm4, xmm6              ; xmm4=data6=(06 16 26 36 46 56 66 76)
    psraw       xmm7, (PASS1_BITS+3)    ; descale
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psubw       xmm2, xmm6              ; xmm2=tmp5

    packsswb    xmm1, xmm4              ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    packsswb    xmm3, xmm7              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

    movdqa      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
    movdqa      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

    paddw       xmm0, xmm2              ; xmm0=tmp4
    movdqa      xmm4, xmm5
    movdqa      xmm7, xmm6
    paddw       xmm5, xmm2              ; xmm5=data2=(02 12 22 32 42 52 62 72)
    paddw       xmm6, xmm0              ; xmm6=data4=(04 14 24 34 44 54 64 74)
    psraw       xmm5, (PASS1_BITS+3)    ; descale
    psraw       xmm6, (PASS1_BITS+3)    ; descale
    psubw       xmm4, xmm2              ; xmm4=data5=(05 15 25 35 45 55 65 75)
    psubw       xmm7, xmm0              ; xmm7=data3=(03 13 23 33 43 53 63 73)
    psraw       xmm4, (PASS1_BITS+3)    ; descale
    psraw       xmm7, (PASS1_BITS+3)    ; descale

    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]

    packsswb    xmm5, xmm6              ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
    packsswb    xmm7, xmm4              ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

    paddb       xmm1, xmm2
    paddb       xmm3, xmm2
    paddb       xmm5, xmm2
    paddb       xmm7, xmm2

    movdqa      xmm0, xmm1              ; transpose coefficients(phase 1)
    punpcklbw   xmm1, xmm3              ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
    punpckhbw   xmm0, xmm3              ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
    punpcklbw   xmm5, xmm7              ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
    punpckhbw   xmm6, xmm7              ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

    movdqa      xmm4, xmm1              ; transpose coefficients(phase 2)
    punpcklwd   xmm1, xmm5              ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    punpckhwd   xmm4, xmm5              ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
    movdqa      xmm2, xmm6              ; transpose coefficients(phase 2)
    punpcklwd   xmm6, xmm0              ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    punpckhwd   xmm2, xmm0              ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

    movdqa      xmm3, xmm1              ; transpose coefficients(phase 3)
    punpckldq   xmm1, xmm6              ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    punpckhdq   xmm3, xmm6              ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    movdqa      xmm7, xmm4              ; transpose coefficients(phase 3)
    punpckldq   xmm4, xmm2              ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    punpckhdq   xmm7, xmm2              ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
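    ; Two implementation notes (illustrative, not from the original source):
    ; packsswb above saturates each descaled row to [-128, 127], and paddb
    ; then adds CENTERJSAMPLE, mapping the samples to [0, 255] without the
    ; range-limit table that the C implementation uses.  The pshufd
    ; instructions below use immediate 0x4E (dword order 2,3,0,1), which
    ; swaps the two 64-bit halves of each register so that the odd rows
    ; (1, 3, 5, 7) land in the low qwords for the movq stores that follow.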
    pshufd      xmm5, xmm1, 0x4E        ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    pshufd      xmm0, xmm3, 0x4E        ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    pshufd      xmm6, xmm4, 0x4E        ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
    pshufd      xmm2, xmm7, 0x4E        ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7

    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2

    uncollect_args 4
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- original rbp
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
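; For reference, a sketch of how this routine is reached from C.  The call
; site below simply follows the prototype comment above; the actual dispatch
; code in libjpeg-turbo's jsimd glue layer may differ in detail:
;
;   jsimd_idct_ifast_sse2(compptr->dct_table, coef_block,
;                         output_buf, output_col);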