;
; jfdctflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Multiplier table for the 3DNow! forward DCT.
;
; Every entry is emitted twice ("times 2 dd"), so a single 64-bit memory
; operand supplies the same single-precision factor to both 32-bit lanes
; of an MMX register.
;
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fdct_float_3dnow)

EXTN(jconst_fdct_float_3dnow):

PD_0_382        times 2 dd  0.382683432365089771728460  ; cos(3*pi/8)
PD_0_707        times 2 dd  0.707106781186547524400844  ; cos(pi/4)
PD_0_541        times 2 dd  0.541196100146196984399723  ; cos(pi/8) - cos(3*pi/8)
PD_1_306        times 2 dd  1.306562964876376527856643  ; cos(pi/8) + cos(3*pi/8)

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
;
; In-place forward DCT of one block of FAST_FLOAT values, computed with
; 3DNow! arithmetic (pfadd/pfsub/pfmul) on MMX registers, two values per
; register.  The names tmp0..tmp7, tmp10..tmp13, z1..z5, z11 and z13 in
; the comments follow jfdctflt.c.
;
; In:     data is read from the stack at data(eax), i.e. 8 bytes above the
;         slot where this function pushes ebp.
; Out:    the block that data points to is overwritten with the result.
; Uses:   eax, ecx, edx, mm0-mm7 and the flags are modified; ebx is handed
;         to pushpic/poppic around the body and ebp is restored on exit.
; Stack:  WK_NUM mmwords of scratch (wk[]) placed just below a 64-bit
;         aligned ebp.
;
; Pass 1 handles two rows per iteration, pass 2 two columns per iteration;
; both loops run DCTSIZE/2 times.  Pass 1 writes its results as transposed
; 2x2 tiles (mmword n of the first row receives output 2n of both rows,
; mmword n of the second row receives output 2n+1), and pass 2 undoes that
; transposition while loading.
;

%define data(b)         (b)+8                           ; FAST_FLOAT * data

%define WK_NUM          2
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define original_ebp    ebp+0

        align   16
        global  EXTN(jsimd_fdct_float_3dnow)

EXTN(jsimd_fdct_float_3dnow):
        push    ebp
        mov     eax, esp                        ; eax -> saved ebp; data(eax) is the argument
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; round esp down to a 64-bit boundary
        mov     [esp], eax                      ; keep the unaligned frame address for the epilogue
        mov     ebp, esp                        ; ebp = aligned frame base
        lea     esp, [wk(0)]                    ; make room for wk[0..WK_NUM-1]
        pushpic ebx
        ; ecx and edx need not be preserved, esi and edi are unused:
        ; none of them is pushed here.

        get_GOT ebx                             ; ebx = GOT address, used by GOTOFF()

        ; ---- Pass 1: process rows.

        mov     edx, POINTER [data(eax)]        ; edx = (FAST_FLOAT *) data
        mov     ecx, DCTSIZE/2                  ; two rows are handled per iteration
        alignx  16,7
.row_pass:

        movq      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
        movq      mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

        ; now: mm0=(00 01)  mm1=(10 11)  mm2=(06 07)  mm3=(16 17)

        movq      mm4, mm0                      ; 2x2 transpose of the low pair
        punpckldq mm0, mm1                      ; mm0 = (00 10) = data0
        punpckhdq mm4, mm1                      ; mm4 = (01 11) = data1
        movq      mm5, mm2                      ; 2x2 transpose of the high pair
        punpckldq mm2, mm3                      ; mm2 = (06 16) = data6
        punpckhdq mm5, mm3                      ; mm5 = (07 17) = data7

        movq      mm6, mm4
        movq      mm7, mm0
        pfsub     mm4, mm2                      ; mm4 = tmp6 = data1 - data6
        pfsub     mm0, mm5                      ; mm0 = tmp7 = data0 - data7
        pfadd     mm6, mm2                      ; mm6 = tmp1 = data1 + data6
        pfadd     mm7, mm5                      ; mm7 = tmp0 = data0 + data7

        movq      mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movq      mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        movq      mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
        movq      mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]

        ; now: mm1=(02 03)  mm3=(12 13)  mm2=(04 05)  mm5=(14 15)

        movq      MMWORD [wk(0)], mm4           ; spill tmp6 -> wk(0)
        movq      MMWORD [wk(1)], mm0           ; spill tmp7 -> wk(1)

        movq      mm4, mm1                      ; 2x2 transpose
        punpckldq mm1, mm3                      ; mm1 = (02 12) = data2
        punpckhdq mm4, mm3                      ; mm4 = (03 13) = data3
        movq      mm0, mm2                      ; 2x2 transpose
        punpckldq mm2, mm5                      ; mm2 = (04 14) = data4
        punpckhdq mm0, mm5                      ; mm0 = (05 15) = data5

        movq      mm3, mm4
        movq      mm5, mm1
        pfadd     mm4, mm2                      ; mm4 = tmp3 = data3 + data4
        pfadd     mm1, mm0                      ; mm1 = tmp2 = data2 + data5
        pfsub     mm3, mm2                      ; mm3 = tmp4 = data3 - data4
        pfsub     mm5, mm0                      ; mm5 = tmp5 = data2 - data5

        ; -- Even part

        movq      mm2, mm7
        movq      mm0, mm6
        pfsub     mm7, mm4                      ; mm7 = tmp13
        pfsub     mm6, mm1                      ; mm6 = tmp12
        pfadd     mm2, mm4                      ; mm2 = tmp10
        pfadd     mm0, mm1                      ; mm0 = tmp11

        pfadd     mm6, mm7
        pfmul     mm6, [GOTOFF(ebx,PD_0_707)]   ; mm6 = z1

        movq      mm4, mm2
        movq      mm1, mm7
        pfsub     mm2, mm0                      ; mm2 = data4
        pfsub     mm7, mm6                      ; mm7 = data6
        pfadd     mm4, mm0                      ; mm4 = data0
        pfadd     mm1, mm6                      ; mm1 = data2

        movq      MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
        movq      MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq      MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq      MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq      mm0, MMWORD [wk(0)]           ; reload tmp6 into mm0
        movq      mm6, MMWORD [wk(1)]           ; reload tmp7 into mm6

        pfadd     mm3, mm5                      ; mm3 = tmp10
        pfadd     mm5, mm0                      ; mm5 = tmp11
        pfadd     mm0, mm6                      ; mm0 = tmp12 (mm6 still holds tmp7)

        pfmul     mm5, [GOTOFF(ebx,PD_0_707)]   ; mm5 = z3

        movq      mm2, mm3                      ; mm2 = tmp10 (copy)
        pfsub     mm3, mm0
        pfmul     mm3, [GOTOFF(ebx,PD_0_382)]   ; mm3 = z5
        pfmul     mm2, [GOTOFF(ebx,PD_0_541)]   ; mm2 = MULTIPLY(tmp10,FIX_0_54119610)
        pfmul     mm0, [GOTOFF(ebx,PD_1_306)]   ; mm0 = MULTIPLY(tmp12,FIX_1_30656296)
        pfadd     mm2, mm3                      ; mm2 = z2
        pfadd     mm0, mm3                      ; mm0 = z4

        movq      mm7, mm6
        pfsub     mm6, mm5                      ; mm6 = z13
        pfadd     mm7, mm5                      ; mm7 = z11

        movq      mm4, mm6
        movq      mm1, mm7
        pfsub     mm6, mm2                      ; mm6 = data3
        pfsub     mm7, mm0                      ; mm7 = data7
        pfadd     mm4, mm2                      ; mm4 = data5
        pfadd     mm1, mm0                      ; mm1 = data1

        movq      MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
        movq      MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq      MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
        movq      MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; step to the next two rows
        dec     ecx
        jnz     near .row_pass

        ; ---- Pass 2: process columns.

        mov     edx, POINTER [data(eax)]        ; edx = (FAST_FLOAT *) data
        mov     ecx, DCTSIZE/2                  ; two columns are handled per iteration
        alignx  16,7
.col_pass:

        movq      mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]

        ; now: mm0=(00 10)  mm1=(01 11)  mm2=(60 70)  mm3=(61 71)

        movq      mm4, mm0                      ; 2x2 transpose
        punpckldq mm0, mm1                      ; mm0 = (00 01) = data0
        punpckhdq mm4, mm1                      ; mm4 = (10 11) = data1
        movq      mm5, mm2                      ; 2x2 transpose
        punpckldq mm2, mm3                      ; mm2 = (60 61) = data6
        punpckhdq mm5, mm3                      ; mm5 = (70 71) = data7

        movq      mm6, mm4
        movq      mm7, mm0
        pfsub     mm4, mm2                      ; mm4 = tmp6 = data1 - data6
        pfsub     mm0, mm5                      ; mm0 = tmp7 = data0 - data7
        pfadd     mm6, mm2                      ; mm6 = tmp1 = data1 + data6
        pfadd     mm7, mm5                      ; mm7 = tmp0 = data0 + data7

        movq      mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
        movq      mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]

        ; now: mm1=(20 30)  mm3=(21 31)  mm2=(40 50)  mm5=(41 51)

        movq      MMWORD [wk(0)], mm4           ; spill tmp6 -> wk(0)
        movq      MMWORD [wk(1)], mm0           ; spill tmp7 -> wk(1)

        movq      mm4, mm1                      ; 2x2 transpose
        punpckldq mm1, mm3                      ; mm1 = (20 21) = data2
        punpckhdq mm4, mm3                      ; mm4 = (30 31) = data3
        movq      mm0, mm2                      ; 2x2 transpose
        punpckldq mm2, mm5                      ; mm2 = (40 41) = data4
        punpckhdq mm0, mm5                      ; mm0 = (50 51) = data5

        movq      mm3, mm4
        movq      mm5, mm1
        pfadd     mm4, mm2                      ; mm4 = tmp3 = data3 + data4
        pfadd     mm1, mm0                      ; mm1 = tmp2 = data2 + data5
        pfsub     mm3, mm2                      ; mm3 = tmp4 = data3 - data4
        pfsub     mm5, mm0                      ; mm5 = tmp5 = data2 - data5

        ; -- Even part

        movq      mm2, mm7
        movq      mm0, mm6
        pfsub     mm7, mm4                      ; mm7 = tmp13
        pfsub     mm6, mm1                      ; mm6 = tmp12
        pfadd     mm2, mm4                      ; mm2 = tmp10
        pfadd     mm0, mm1                      ; mm0 = tmp11

        pfadd     mm6, mm7
        pfmul     mm6, [GOTOFF(ebx,PD_0_707)]   ; mm6 = z1

        movq      mm4, mm2
        movq      mm1, mm7
        pfsub     mm2, mm0                      ; mm2 = data4
        pfsub     mm7, mm6                      ; mm7 = data6
        pfadd     mm4, mm0                      ; mm4 = data0
        pfadd     mm1, mm6                      ; mm1 = data2

        movq      MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
        movq      MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq      MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq      MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq      mm0, MMWORD [wk(0)]           ; reload tmp6 into mm0
        movq      mm6, MMWORD [wk(1)]           ; reload tmp7 into mm6

        pfadd     mm3, mm5                      ; mm3 = tmp10
        pfadd     mm5, mm0                      ; mm5 = tmp11
        pfadd     mm0, mm6                      ; mm0 = tmp12 (mm6 still holds tmp7)

        pfmul     mm5, [GOTOFF(ebx,PD_0_707)]   ; mm5 = z3

        movq      mm2, mm3                      ; mm2 = tmp10 (copy)
        pfsub     mm3, mm0
        pfmul     mm3, [GOTOFF(ebx,PD_0_382)]   ; mm3 = z5
        pfmul     mm2, [GOTOFF(ebx,PD_0_541)]   ; mm2 = MULTIPLY(tmp10,FIX_0_54119610)
        pfmul     mm0, [GOTOFF(ebx,PD_1_306)]   ; mm0 = MULTIPLY(tmp12,FIX_1_30656296)
        pfadd     mm2, mm3                      ; mm2 = z2
        pfadd     mm0, mm3                      ; mm0 = z4

        movq      mm7, mm6
        pfsub     mm6, mm5                      ; mm6 = z13
        pfadd     mm7, mm5                      ; mm7 = z11

        movq      mm4, mm6
        movq      mm1, mm7
        pfsub     mm6, mm2                      ; mm6 = data3
        pfsub     mm7, mm0                      ; mm7 = data7
        pfadd     mm4, mm2                      ; mm4 = data5
        pfadd     mm1, mm0                      ; mm1 = data1

        movq      MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
        movq      MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq      MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq      MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*SIZEOF_FAST_FLOAT   ; step to the next two columns
        dec     ecx
        jnz     near .col_pass

        femms                                   ; empty MMX/3DNow! state

        ; Mirror of the prologue: ecx/edx/esi/edi were never pushed.
        poppic  ebx
        mov     esp, ebp                        ; esp <- aligned ebp
        pop     esp                             ; esp <- address of the saved ebp
        pop     ebp                             ; restore the caller's ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16