;
; jfdctflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Read-only multiplier table for the FDCT below.  Every entry is one
; 64-bit operand holding the same single-precision value in both halves,
; so a single pfmul scales both floats of an MMX register by it.
;
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_fdct_float_3dnow)

EXTN(jconst_fdct_float_3dnow):

; ~ cos(3*pi/8)
PD_0_382:       dd      0.382683432365089771728460, 0.382683432365089771728460
; ~ cos(pi/4) = 1/sqrt(2)
PD_0_707:       dd      0.707106781186547524400844, 0.707106781186547524400844
; ~ cos(pi/8) - cos(3*pi/8)
PD_0_541:       dd      0.541196100146196984399723, 0.541196100146196984399723
; ~ cos(pi/8) + cos(3*pi/8)
PD_1_306:       dd      1.306562964876376527856643, 1.306562964876376527856643

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow(FAST_FLOAT *data)
;
; Syntax: NASM, 32-bit code (BITS 32).
; In:     data - pointer to the sample block; it is read from the stack at
;         data(eax) = [eax + 8], where eax is esp right after "push ebp".
; Out:    nothing is returned; the block at *data is overwritten in place,
;         first by the row pass and then by the column pass.
; Uses:   eax, ecx, edx, mm0-mm7, flags, and WK_NUM (2) mmwords of stack
;         scratch addressed through wk(i).  ebx is bracketed by the
;         pushpic/poppic macros (defined outside this file - presumably
;         they save and restore it; confirm in jsimdext.inc).
; Note:   each pass loops DCTSIZE/2 times and handles two rows (or two
;         columns) per iteration, one in each 32-bit half of every MMX
;         register.  The MMBLOCK offsets used here span eight floats per
;         row and eight rows, i.e. the code assumes DCTSIZE == 8
;         (DCTSIZE itself is defined elsewhere - TODO confirm).
;

%define data(b)       (b) + 8           ; FAST_FLOAT *data

; Frame layout relative to the realigned ebp.  original_ebp is not
; referenced by the code visible in this file.
%define original_ebp  ebp + 0
%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM        2

    align       32
    GLOBAL_FUNCTION(jsimd_fdct_float_3dnow)

EXTN(jsimd_fdct_float_3dnow):
    ; ---- Prologue: build an mmword-aligned frame.
    ; The unaligned stack pointer is kept both in eax (used to reach the
    ; argument) and in memory at [ebp] (used by the epilogue's "pop esp").
    push        ebp
    mov         eax, esp                    ; eax = esp, pointing at the saved ebp
    sub         esp, byte 4                 ; room for the saved stack pointer
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; remember the unaligned esp
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [wk(0)]                ; reserve wk[0..WK_NUM-1] below ebp
    pushpic     ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
;   push        esi                     ; unused
;   push        edi                     ; unused

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process rows.
    ;
    ; Two adjacent rows are handled per iteration.  Each pair of loads is
    ; interleaved with punpckldq/punpckhdq so that one register holds the
    ; same column of both rows.  The results are stored without undoing
    ; that interleave: even outputs go to the MMBLOCK(0,*) row and odd
    ; outputs to the MMBLOCK(1,*) row, which is the layout the load
    ; comments in pass 2 describe.

    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2            ; loop counter: two rows per iteration
    alignx      16, 7
.rowloop:

    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

    ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)

    movq        mm4, mm0                ; transpose coefficients
    punpckldq   mm0, mm1                ; mm0=(00 10)=data0
    punpckhdq   mm4, mm1                ; mm4=(01 11)=data1
    movq        mm5, mm2                ; transpose coefficients
    punpckldq   mm2, mm3                ; mm2=(06 16)=data6
    punpckhdq   mm5, mm3                ; mm5=(07 17)=data7

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0

    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
    movq        mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]

    ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)

    ; tmp6/tmp7 are not needed until the odd part; spill them so that
    ; mm4 and mm0 can be reused for the remaining inputs.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7

    movq        mm4, mm1                ; transpose coefficients
    punpckldq   mm1, mm3                ; mm1=(02 12)=data2
    punpckhdq   mm4, mm3                ; mm4=(03 13)=data3
    movq        mm0, mm2                ; transpose coefficients
    punpckldq   mm2, mm5                ; mm2=(04 14)=data4
    punpckhdq   mm0, mm5                ; mm0=(05 15)=data5

    movq        mm3, mm4
    movq        mm5, mm1
    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5

    ; -- Even part
    ; Live on entry: mm7=tmp0, mm6=tmp1, mm1=tmp2, mm4=tmp3
    ; (mm3=tmp4 and mm5=tmp5 are left untouched for the odd part).

    movq        mm2, mm7
    movq        mm0, mm6
    pfsub       mm7, mm4                ; mm7=tmp13 (tmp0-tmp3)
    pfsub       mm6, mm1                ; mm6=tmp12 (tmp1-tmp2)
    pfadd       mm2, mm4                ; mm2=tmp10 (tmp0+tmp3)
    pfadd       mm0, mm1                ; mm0=tmp11 (tmp1+tmp2)

    pfadd       mm6, mm7                ; mm6=tmp12+tmp13
    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1

    movq        mm4, mm2
    movq        mm1, mm7
    pfsub       mm2, mm0                ; mm2=data4 (tmp10-tmp11)
    pfsub       mm7, mm6                ; mm7=data6 (tmp13-z1)
    pfadd       mm4, mm0                ; mm4=data0 (tmp10+tmp11)
    pfadd       mm1, mm6                ; mm1=data2 (tmp13+z1)

    ; Even outputs fill the first row of the pair: mmword k holds data(2k).
    movq        MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
    movq        MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1

    ; -- Odd part
    ; Live on entry: mm3=tmp4, mm5=tmp5; tmp6/tmp7 are reloaded from wk[].

    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7

    pfadd       mm3, mm5                ; mm3=tmp10 (tmp4+tmp5)
    pfadd       mm5, mm0                ; mm5=tmp11 (tmp5+tmp6)
    pfadd       mm0, mm6                ; mm0=tmp12 (tmp6+tmp7), mm6=tmp7

    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3

    movq        mm2, mm3                     ; mm2=tmp10
    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
    pfadd       mm2, mm3                     ; mm2=z2
    pfadd       mm0, mm3                     ; mm0=z4

    movq        mm7, mm6
    pfsub       mm6, mm5                ; mm6=z13 (tmp7-z3)
    pfadd       mm7, mm5                ; mm7=z11 (tmp7+z3)

    movq        mm4, mm6
    movq        mm1, mm7
    pfsub       mm6, mm2                ; mm6=data3 (z13-z2)
    pfsub       mm7, mm0                ; mm7=data7 (z11-z4)
    pfadd       mm4, mm2                ; mm4=data5 (z13+z2)
    pfadd       mm1, mm0                ; mm1=data1 (z11+z4)

    ; Odd outputs fill the second row of the pair: mmword k holds data(2k+1).
    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

    add         edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; advance two rows
    dec         ecx
    jnz         near .rowloop

    ; ---- Pass 2: process columns.
    ;
    ; Same arithmetic as pass 1, applied down the columns, two adjacent
    ; columns per iteration.  The loads pick up the interleaved layout left
    ; by pass 1 and the punpck pairs undo it, so each register ends up with
    ; one row of the column pair and the stores land at MMBLOCK(n,0) for
    ; output row n.

    mov         edx, POINTER [data(eax)]  ; (FAST_FLOAT *) restart at the block base
    mov         ecx, DCTSIZE/2            ; loop counter: two columns per iteration
    alignx      16, 7
.columnloop:

    movq        mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]

    ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)

    movq        mm4, mm0                ; transpose coefficients
    punpckldq   mm0, mm1                ; mm0=(00 01)=data0
    punpckhdq   mm4, mm1                ; mm4=(10 11)=data1
    movq        mm5, mm2                ; transpose coefficients
    punpckldq   mm2, mm3                ; mm2=(60 61)=data6
    punpckhdq   mm5, mm3                ; mm5=(70 71)=data7

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm2                ; mm4=data1-data6=tmp6
    pfsub       mm0, mm5                ; mm0=data0-data7=tmp7
    pfadd       mm6, mm2                ; mm6=data1+data6=tmp1
    pfadd       mm7, mm5                ; mm7=data0+data7=tmp0

    movq        mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
    movq        mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]

    ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)

    ; Spill tmp6/tmp7 until the odd part, freeing mm4 and mm0.
    movq        MMWORD [wk(0)], mm4     ; wk(0)=tmp6
    movq        MMWORD [wk(1)], mm0     ; wk(1)=tmp7

    movq        mm4, mm1                ; transpose coefficients
    punpckldq   mm1, mm3                ; mm1=(20 21)=data2
    punpckhdq   mm4, mm3                ; mm4=(30 31)=data3
    movq        mm0, mm2                ; transpose coefficients
    punpckldq   mm2, mm5                ; mm2=(40 41)=data4
    punpckhdq   mm0, mm5                ; mm0=(50 51)=data5

    movq        mm3, mm4
    movq        mm5, mm1
    pfadd       mm4, mm2                ; mm4=data3+data4=tmp3
    pfadd       mm1, mm0                ; mm1=data2+data5=tmp2
    pfsub       mm3, mm2                ; mm3=data3-data4=tmp4
    pfsub       mm5, mm0                ; mm5=data2-data5=tmp5

    ; -- Even part
    ; Live on entry: mm7=tmp0, mm6=tmp1, mm1=tmp2, mm4=tmp3
    ; (mm3=tmp4 and mm5=tmp5 are left untouched for the odd part).

    movq        mm2, mm7
    movq        mm0, mm6
    pfsub       mm7, mm4                ; mm7=tmp13 (tmp0-tmp3)
    pfsub       mm6, mm1                ; mm6=tmp12 (tmp1-tmp2)
    pfadd       mm2, mm4                ; mm2=tmp10 (tmp0+tmp3)
    pfadd       mm0, mm1                ; mm0=tmp11 (tmp1+tmp2)

    pfadd       mm6, mm7                ; mm6=tmp12+tmp13
    pfmul       mm6, [GOTOFF(ebx,PD_0_707)]  ; mm6=z1

    movq        mm4, mm2
    movq        mm1, mm7
    pfsub       mm2, mm0                ; mm2=data4 (tmp10-tmp11)
    pfsub       mm7, mm6                ; mm7=data6 (tmp13-z1)
    pfadd       mm4, mm0                ; mm4=data0 (tmp10+tmp11)
    pfadd       mm1, mm6                ; mm1=data2 (tmp13+z1)

    ; Even output rows 4, 6, 0, 2 of the current column pair.
    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1

    ; -- Odd part
    ; Live on entry: mm3=tmp4, mm5=tmp5; tmp6/tmp7 are reloaded from wk[].

    movq        mm0, MMWORD [wk(0)]     ; mm0=tmp6
    movq        mm6, MMWORD [wk(1)]     ; mm6=tmp7

    pfadd       mm3, mm5                ; mm3=tmp10 (tmp4+tmp5)
    pfadd       mm5, mm0                ; mm5=tmp11 (tmp5+tmp6)
    pfadd       mm0, mm6                ; mm0=tmp12 (tmp6+tmp7), mm6=tmp7

    pfmul       mm5, [GOTOFF(ebx,PD_0_707)]  ; mm5=z3

    movq        mm2, mm3                     ; mm2=tmp10
    pfsub       mm3, mm0                     ; mm3=tmp10-tmp12
    pfmul       mm3, [GOTOFF(ebx,PD_0_382)]  ; mm3=z5
    pfmul       mm2, [GOTOFF(ebx,PD_0_541)]  ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
    pfmul       mm0, [GOTOFF(ebx,PD_1_306)]  ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
    pfadd       mm2, mm3                     ; mm2=z2
    pfadd       mm0, mm3                     ; mm0=z4

    movq        mm7, mm6
    pfsub       mm6, mm5                ; mm6=z13 (tmp7-z3)
    pfadd       mm7, mm5                ; mm7=z11 (tmp7+z3)

    movq        mm4, mm6
    movq        mm1, mm7
    pfsub       mm6, mm2                ; mm6=data3 (z13-z2)
    pfsub       mm7, mm0                ; mm7=data7 (z11-z4)
    pfadd       mm4, mm2                ; mm4=data5 (z13+z2)
    pfadd       mm1, mm0                ; mm1=data1 (z11+z4)

    ; Odd output rows 3, 7, 5, 1 of the current column pair.
    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

    add         edx, byte 2*SIZEOF_FAST_FLOAT  ; advance two columns
    dec         ecx
    jnz         near .columnloop

    femms                               ; empty MMX/3DNow! state

    ; ---- Epilogue: undo the prologue in reverse order.
;   pop         edi                     ; unused
;   pop         esi                     ; unused
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    poppic      ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- unaligned esp saved at [ebp] by the prologue
    pop         ebp                     ; restore the caller's ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32