;
; jfdctfst.asm - fast integer FDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point multipliers.  Each F_x_xxx is the constant in its trailing
; comment scaled by 2^CONST_BITS and rounded.

%define CONST_BITS      8       ; 14 is also OK.

%if CONST_BITS == 8
F_0_382         equ     98      ; FIX(0.382683433)
F_0_541         equ     139     ; FIX(0.541196100)
F_0_707         equ     181     ; FIX(0.707106781)
F_1_306         equ     334     ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the constants scaled by 2^30; DESCALE rounds them
; down to CONST_BITS fractional bits.
%define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
F_0_382         equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
F_0_541         equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
F_0_707         equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
F_1_306         equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
%endif

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps only the upper 16 bits of each 16x16 product, so the
; multiplicand is pre-shifted left by PRE_MULTIPLY_SCALE_BITS and the
; constant by CONST_SHIFT; together with CONST_BITS that makes up the
; 16-bit right shift performed by the instruction.

%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT             (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

        alignz  16
        global  EXTN(jconst_fdct_ifast_sse2)

EXTN(jconst_fdct_ifast_sse2):

; Each multiplier is replicated into all eight 16-bit lanes.
PW_F0707        times 8 dw      F_0_707 << CONST_SHIFT
PW_F0382        times 8 dw      F_0_382 << CONST_SHIFT
PW_F0541        times 8 dw      F_0_541 << CONST_SHIFT
PW_F1306        times 8 dw      F_1_306 << CONST_SHIFT

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_ifast_sse2 (DCTELEM *data)
;
; In:     [esp+4] = data.  The block is read and written with movdqa, so
;         it must be 16-byte aligned.
; Out:    the 8x8 block at data is transformed in place.
; Uses:   eax, edx, xmm0-xmm7; ebx is saved/restored through
;         pushpic/poppic; WK_NUM xmmwords of aligned stack scratch.
;
; Both passes have the same shape: transpose the 8x8 block of words in
; three interleave phases (word, dword, qword), then run the 1-D transform
; on all eight lanes at once.
;

%define WK_NUM          2
%define data(b)         (b)+8           ; DCTELEM *data

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]

        align   16
        global  EXTN(jsimd_fdct_ifast_sse2)

EXTN(jsimd_fdct_ifast_sse2):
        ; Build a 16-byte aligned frame.  The unaligned frame address is
        ; parked at [ebp] so the epilogue can recover it with "pop esp".
        push    ebp
        mov     eax, esp                        ; eax -> saved ebp slot; data is at [eax+8]
        sub     esp, byte 4                     ; room for the frame address
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp], eax                      ; remember the unaligned frame address
        mov     ebp, esp                        ; ebp = aligned frame base
        lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below ebp
        pushpic ebx
        ; ecx, esi, edi are unused and edx need not be preserved, so none
        ; of them is pushed.

        get_GOT ebx                             ; get GOT address

        ; ---- Pass 1: process rows.

        mov     edx, POINTER [data(eax)]        ; edx = data (DCTELEM *)

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
        movdqa  xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)]

        ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27)
        ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37)

        ; Transpose phase 1 (interleave words), rows 0-3.
        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm1          ; xmm0=(00 10 01 11 02 12 03 13)
        punpckhwd   xmm4, xmm1          ; xmm4=(04 14 05 15 06 16 07 17)
        movdqa      xmm5, xmm2
        punpcklwd   xmm2, xmm3          ; xmm2=(20 30 21 31 22 32 23 33)
        punpckhwd   xmm5, xmm3          ; xmm5=(24 34 25 35 26 36 27 37)

        movdqa  xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

        ; xmm6=(40 41 42 43 44 45 46 47), xmm1=(60 61 62 63 64 65 66 67)
        ; xmm7=(50 51 52 53 54 55 56 57), xmm3=(70 71 72 73 74 75 76 77)

        ; All eight registers are live: spill two partial results.
        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(20 30 21 31 22 32 23 33)
        movdqa  XMMWORD [wk(1)], xmm5   ; wk(1)=(24 34 25 35 26 36 27 37)

        ; Transpose phase 1 (interleave words), rows 4-7.
        movdqa      xmm2, xmm6
        punpcklwd   xmm6, xmm7          ; xmm6=(40 50 41 51 42 52 43 53)
        punpckhwd   xmm2, xmm7          ; xmm2=(44 54 45 55 46 56 47 57)
        movdqa      xmm5, xmm1
        punpcklwd   xmm1, xmm3          ; xmm1=(60 70 61 71 62 72 63 73)
        punpckhwd   xmm5, xmm3          ; xmm5=(64 74 65 75 66 76 67 77)

        ; Transpose phase 2 (interleave dwords), rows 4-7.
        movdqa      xmm7, xmm6
        punpckldq   xmm6, xmm1          ; xmm6=(40 50 60 70 41 51 61 71)
        punpckhdq   xmm7, xmm1          ; xmm7=(42 52 62 72 43 53 63 73)
        movdqa      xmm3, xmm2
        punpckldq   xmm2, xmm5          ; xmm2=(44 54 64 74 45 55 65 75)
        punpckhdq   xmm3, xmm5          ; xmm3=(46 56 66 76 47 57 67 77)

        ; Swap the spilled rows 0-3 halves back in, spill two rows 4-7 halves.
        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(20 30 21 31 22 32 23 33)
        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(24 34 25 35 26 36 27 37)
        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(42 52 62 72 43 53 63 73)
        movdqa  XMMWORD [wk(1)], xmm2   ; wk(1)=(44 54 64 74 45 55 65 75)

        ; Transpose phase 2 (interleave dwords), rows 0-3.
        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm1          ; xmm0=(00 10 20 30 01 11 21 31)
        punpckhdq   xmm7, xmm1          ; xmm7=(02 12 22 32 03 13 23 33)
        movdqa      xmm2, xmm4
        punpckldq   xmm4, xmm5          ; xmm4=(04 14 24 34 05 15 25 35)
        punpckhdq   xmm2, xmm5          ; xmm2=(06 16 26 36 07 17 27 37)

        ; Transpose phase 3 (interleave qwords): data0, data1, data6, data7.
        movdqa      xmm1, xmm0
        punpcklqdq  xmm0, xmm6          ; xmm0=(00 10 20 30 40 50 60 70)=data0
        punpckhqdq  xmm1, xmm6          ; xmm1=(01 11 21 31 41 51 61 71)=data1
        movdqa      xmm5, xmm2
        punpcklqdq  xmm2, xmm3          ; xmm2=(06 16 26 36 46 56 66 76)=data6
        punpckhqdq  xmm5, xmm3          ; xmm5=(07 17 27 37 47 57 67 77)=data7

        ; First butterfly stage on the outer pairs.
        movdqa  xmm6, xmm1
        movdqa  xmm3, xmm0
        psubw   xmm1, xmm2              ; xmm1=data1-data6=tmp6
        psubw   xmm0, xmm5              ; xmm0=data0-data7=tmp7
        paddw   xmm6, xmm2              ; xmm6=data1+data6=tmp1
        paddw   xmm3, xmm5              ; xmm3=data0+data7=tmp0

        ; Park tmp6/tmp7 and fetch the remaining transpose inputs.
        movdqa  xmm2, XMMWORD [wk(0)]   ; xmm2=(42 52 62 72 43 53 63 73)
        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=(44 54 64 74 45 55 65 75)
        movdqa  XMMWORD [wk(0)], xmm1   ; wk(0)=tmp6
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=tmp7

        ; Transpose phase 3 (interleave qwords): data2, data3, data4, data5.
        movdqa      xmm1, xmm7
        punpcklqdq  xmm7, xmm2          ; xmm7=(02 12 22 32 42 52 62 72)=data2
        punpckhqdq  xmm1, xmm2          ; xmm1=(03 13 23 33 43 53 63 73)=data3
        movdqa      xmm0, xmm4
        punpcklqdq  xmm4, xmm5          ; xmm4=(04 14 24 34 44 54 64 74)=data4
        punpckhqdq  xmm0, xmm5          ; xmm0=(05 15 25 35 45 55 65 75)=data5

        ; First butterfly stage on the inner pairs.
        movdqa  xmm2, xmm1
        movdqa  xmm5, xmm7
        paddw   xmm1, xmm4              ; xmm1=data3+data4=tmp3
        paddw   xmm7, xmm0              ; xmm7=data2+data5=tmp2
        psubw   xmm2, xmm4              ; xmm2=data3-data4=tmp4
        psubw   xmm5, xmm0              ; xmm5=data2-data5=tmp5

        ; -- Even part

        movdqa  xmm4, xmm3
        movdqa  xmm0, xmm6
        psubw   xmm3, xmm1              ; xmm3=tmp0-tmp3=tmp13
        psubw   xmm6, xmm7              ; xmm6=tmp1-tmp2=tmp12
        paddw   xmm4, xmm1              ; xmm4=tmp0+tmp3=tmp10
        paddw   xmm0, xmm7              ; xmm0=tmp1+tmp2=tmp11

        paddw   xmm6, xmm3              ; xmm6=tmp12+tmp13
        psllw   xmm6, PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm6, [GOTOFF(ebx,PW_F0707)]    ; xmm6=z1

        movdqa  xmm1, xmm4
        movdqa  xmm7, xmm3
        psubw   xmm4, xmm0              ; xmm4=tmp10-tmp11=data4
        psubw   xmm3, xmm6              ; xmm3=tmp13-z1=data6
        paddw   xmm1, xmm0              ; xmm1=tmp10+tmp11=data0
        paddw   xmm7, xmm6              ; xmm7=tmp13+z1=data2

        ; Exchange: bring tmp6/tmp7 back, park data4/data6 for pass 2.
        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=tmp6
        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=tmp7
        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=data4
        movdqa  XMMWORD [wk(1)], xmm3   ; wk(1)=data6

        ; -- Odd part

        paddw   xmm2, xmm5              ; xmm2=tmp4+tmp5=tmp10
        paddw   xmm5, xmm0              ; xmm5=tmp5+tmp6=tmp11
        paddw   xmm0, xmm6              ; xmm0=tmp6+tmp7=tmp12, xmm6=tmp7

        psllw   xmm2, PRE_MULTIPLY_SCALE_BITS
        psllw   xmm0, PRE_MULTIPLY_SCALE_BITS

        psllw   xmm5, PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm5, [GOTOFF(ebx,PW_F0707)]    ; xmm5=z3

        movdqa  xmm4, xmm2              ; xmm4=tmp10
        psubw   xmm2, xmm0              ; xmm2=tmp10-tmp12
        pmulhw  xmm2, [GOTOFF(ebx,PW_F0382)]    ; xmm2=z5
        pmulhw  xmm4, [GOTOFF(ebx,PW_F0541)]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
        pmulhw  xmm0, [GOTOFF(ebx,PW_F1306)]    ; xmm0=MULTIPLY(tmp12,FIX_1_306562)
        paddw   xmm4, xmm2              ; xmm4=z2
        paddw   xmm0, xmm2              ; xmm0=z4

        movdqa  xmm3, xmm6
        psubw   xmm6, xmm5              ; xmm6=tmp7-z3=z13
        paddw   xmm3, xmm5              ; xmm3=tmp7+z3=z11

        movdqa  xmm2, xmm6
        movdqa  xmm5, xmm3
        psubw   xmm6, xmm4              ; xmm6=z13-z2=data3
        psubw   xmm3, xmm0              ; xmm3=z11-z4=data7
        paddw   xmm2, xmm4              ; xmm2=z13+z2=data5
        paddw   xmm5, xmm0              ; xmm5=z11+z4=data1

        ; ---- Pass 2: process columns.

        ; edx still holds data, so it is not reloaded here.

        ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72)
        ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73)

        ; Transpose phase 1 (interleave words), columns 0-3.
        movdqa      xmm4, xmm1
        punpcklwd   xmm1, xmm5          ; xmm1=(00 01 10 11 20 21 30 31)
        punpckhwd   xmm4, xmm5          ; xmm4=(40 41 50 51 60 61 70 71)
        movdqa      xmm0, xmm7
        punpcklwd   xmm7, xmm6          ; xmm7=(02 03 12 13 22 23 32 33)
        punpckhwd   xmm0, xmm6          ; xmm0=(42 43 52 53 62 63 72 73)

        movdqa  xmm5, XMMWORD [wk(0)]   ; xmm5=col4
        movdqa  xmm6, XMMWORD [wk(1)]   ; xmm6=col6

        ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76)
        ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77)

        movdqa  XMMWORD [wk(0)], xmm7   ; wk(0)=(02 03 12 13 22 23 32 33)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(42 43 52 53 62 63 72 73)

        ; Transpose phase 1 (interleave words), columns 4-7.
        movdqa      xmm7, xmm5
        punpcklwd   xmm5, xmm2          ; xmm5=(04 05 14 15 24 25 34 35)
        punpckhwd   xmm7, xmm2          ; xmm7=(44 45 54 55 64 65 74 75)
        movdqa      xmm0, xmm6
        punpcklwd   xmm6, xmm3          ; xmm6=(06 07 16 17 26 27 36 37)
        punpckhwd   xmm0, xmm3          ; xmm0=(46 47 56 57 66 67 76 77)

        ; Transpose phase 2 (interleave dwords), columns 4-7.
        movdqa      xmm2, xmm5
        punpckldq   xmm5, xmm6          ; xmm5=(04 05 06 07 14 15 16 17)
        punpckhdq   xmm2, xmm6          ; xmm2=(24 25 26 27 34 35 36 37)
        movdqa      xmm3, xmm7
        punpckldq   xmm7, xmm0          ; xmm7=(44 45 46 47 54 55 56 57)
        punpckhdq   xmm3, xmm0          ; xmm3=(64 65 66 67 74 75 76 77)

        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=(02 03 12 13 22 23 32 33)
        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(42 43 52 53 62 63 72 73)
        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=(24 25 26 27 34 35 36 37)
        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(44 45 46 47 54 55 56 57)

        ; Transpose phase 2 (interleave dwords), columns 0-3.
        movdqa      xmm2, xmm1
        punpckldq   xmm1, xmm6          ; xmm1=(00 01 02 03 10 11 12 13)
        punpckhdq   xmm2, xmm6          ; xmm2=(20 21 22 23 30 31 32 33)
        movdqa      xmm7, xmm4
        punpckldq   xmm4, xmm0          ; xmm4=(40 41 42 43 50 51 52 53)
        punpckhdq   xmm7, xmm0          ; xmm7=(60 61 62 63 70 71 72 73)

        ; Transpose phase 3 (interleave qwords): data0, data1, data6, data7.
        movdqa      xmm6, xmm1
        punpcklqdq  xmm1, xmm5          ; xmm1=(00 01 02 03 04 05 06 07)=data0
        punpckhqdq  xmm6, xmm5          ; xmm6=(10 11 12 13 14 15 16 17)=data1
        movdqa      xmm0, xmm7
        punpcklqdq  xmm7, xmm3          ; xmm7=(60 61 62 63 64 65 66 67)=data6
        punpckhqdq  xmm0, xmm3          ; xmm0=(70 71 72 73 74 75 76 77)=data7

        ; First butterfly stage on the outer pairs.
        movdqa  xmm5, xmm6
        movdqa  xmm3, xmm1
        psubw   xmm6, xmm7              ; xmm6=data1-data6=tmp6
        psubw   xmm1, xmm0              ; xmm1=data0-data7=tmp7
        paddw   xmm5, xmm7              ; xmm5=data1+data6=tmp1
        paddw   xmm3, xmm0              ; xmm3=data0+data7=tmp0

        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=(24 25 26 27 34 35 36 37)
        movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(44 45 46 47 54 55 56 57)
        movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=tmp6
        movdqa  XMMWORD [wk(1)], xmm1   ; wk(1)=tmp7

        ; Transpose phase 3 (interleave qwords): data2, data3, data4, data5.
        movdqa      xmm6, xmm2
        punpcklqdq  xmm2, xmm7          ; xmm2=(20 21 22 23 24 25 26 27)=data2
        punpckhqdq  xmm6, xmm7          ; xmm6=(30 31 32 33 34 35 36 37)=data3
        movdqa      xmm1, xmm4
        punpcklqdq  xmm4, xmm0          ; xmm4=(40 41 42 43 44 45 46 47)=data4
        punpckhqdq  xmm1, xmm0          ; xmm1=(50 51 52 53 54 55 56 57)=data5

        ; First butterfly stage on the inner pairs.
        movdqa  xmm7, xmm6
        movdqa  xmm0, xmm2
        paddw   xmm6, xmm4              ; xmm6=data3+data4=tmp3
        paddw   xmm2, xmm1              ; xmm2=data2+data5=tmp2
        psubw   xmm7, xmm4              ; xmm7=data3-data4=tmp4
        psubw   xmm0, xmm1              ; xmm0=data2-data5=tmp5

        ; -- Even part

        movdqa  xmm4, xmm3
        movdqa  xmm1, xmm5
        psubw   xmm3, xmm6              ; xmm3=tmp0-tmp3=tmp13
        psubw   xmm5, xmm2              ; xmm5=tmp1-tmp2=tmp12
        paddw   xmm4, xmm6              ; xmm4=tmp0+tmp3=tmp10
        paddw   xmm1, xmm2              ; xmm1=tmp1+tmp2=tmp11

        paddw   xmm5, xmm3              ; xmm5=tmp12+tmp13
        psllw   xmm5, PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm5, [GOTOFF(ebx,PW_F0707)]    ; xmm5=z1

        movdqa  xmm6, xmm4
        movdqa  xmm2, xmm3
        psubw   xmm4, xmm1              ; xmm4=tmp10-tmp11=data4
        psubw   xmm3, xmm5              ; xmm3=tmp13-z1=data6
        paddw   xmm6, xmm1              ; xmm6=tmp10+tmp11=data0
        paddw   xmm2, xmm5              ; xmm2=tmp13+z1=data2

        ; Write the even-numbered output rows back to the block.
        movdqa  XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4
        movdqa  XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3
        movdqa  XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6
        movdqa  XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2

        ; -- Odd part

        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp6
        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp7

        paddw   xmm7, xmm0              ; xmm7=tmp4+tmp5=tmp10
        paddw   xmm0, xmm1              ; xmm0=tmp5+tmp6=tmp11
        paddw   xmm1, xmm5              ; xmm1=tmp6+tmp7=tmp12, xmm5=tmp7

        psllw   xmm7, PRE_MULTIPLY_SCALE_BITS
        psllw   xmm1, PRE_MULTIPLY_SCALE_BITS

        psllw   xmm0, PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm0, [GOTOFF(ebx,PW_F0707)]    ; xmm0=z3

        movdqa  xmm4, xmm7              ; xmm4=tmp10
        psubw   xmm7, xmm1              ; xmm7=tmp10-tmp12
        pmulhw  xmm7, [GOTOFF(ebx,PW_F0382)]    ; xmm7=z5
        pmulhw  xmm4, [GOTOFF(ebx,PW_F0541)]    ; xmm4=MULTIPLY(tmp10,FIX_0_541196)
        pmulhw  xmm1, [GOTOFF(ebx,PW_F1306)]    ; xmm1=MULTIPLY(tmp12,FIX_1_306562)
        paddw   xmm4, xmm7              ; xmm4=z2
        paddw   xmm1, xmm7              ; xmm1=z4

        movdqa  xmm3, xmm5
        psubw   xmm5, xmm0              ; xmm5=tmp7-z3=z13
        paddw   xmm3, xmm0              ; xmm3=tmp7+z3=z11

        movdqa  xmm6, xmm5
        movdqa  xmm2, xmm3
        psubw   xmm5, xmm4              ; xmm5=z13-z2=data3
        psubw   xmm3, xmm1              ; xmm3=z11-z4=data7
        paddw   xmm6, xmm4              ; xmm6=z13+z2=data5
        paddw   xmm2, xmm1              ; xmm2=z11+z4=data1

        ; Write the odd-numbered output rows back to the block.
        movdqa  XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5
        movdqa  XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3
        movdqa  XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6
        movdqa  XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2

        ; Nothing but ebx was pushed after the frame was built (see the
        ; note in the prologue), so nothing else is popped here.
        poppic  ebx
        mov     esp, ebp                ; esp <- aligned ebp
        pop     esp                     ; esp <- frame address stored at [ebp]
        pop     ebp                     ; restore the caller's ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16