1; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA 2 3; GOGO-no-coda 4; Copyright (C) 1999 shigeo 5; special thanks to Keiichi SAKAI 6 7%include "nasm.h" 8 9 globaldef fht_SSE 10 11 segment_data 12 align 16 13Q_MMPP dd 0x0,0x0,0x80000000,0x80000000 14Q_MPMP dd 0x0,0x80000000,0x0,0x80000000 15D_1100 dd 0.0, 0.0, 1.0, 1.0 16costab_fft: 17 dd 9.238795325112867e-01 18 dd 3.826834323650898e-01 19 dd 9.951847266721969e-01 20 dd 9.801714032956060e-02 21 dd 9.996988186962042e-01 22 dd 2.454122852291229e-02 23 dd 9.999811752836011e-01 24 dd 6.135884649154475e-03 25S_SQRT2 dd 1.414213562 26 27 segment_code 28 29PIC_OFFSETTABLE 30 31;------------------------------------------------------------------------ 32; by K. SAKAI 33; 99/08/18 PIII 23k[clk] 34; 99/08/19 ̿�������촹�� PIII 22k[clk] 35; 99/08/20 bit reversal ����夫��ܿ����� PIII 17k[clk] 36; 99/08/23 ���� unroll PIII 14k[clk] 37; 99/11/12 clean up 38; 39;void fht_SSE(float *fz, int n); 40 align 16 41fht_SSE: 42 push ebx 43 push esi 44 push edi 45 push ebp 46 47%assign _P 4*5 48 49 ;2���ܤΥ롼�� 50 mov eax,[esp+_P+0] ;eax=fz 51 mov ebp,[esp+_P+4] ;=n 52 shl ebp,3 53 add ebp,eax ; fn = fz + n, ���δؿ���λ�ޤ����� 54 push ebp 55 56 call get_pc.bp 57 add ebp, PIC_BASE() 58 59 lea ecx,[PIC_EBP_REL(costab_fft)] 60 xor eax,eax 61 mov al,8 ; =k1=1*(sizeof float) // 4, 16, 64, 256,... 62.lp2: ; do{ 63 mov esi,[esp+_P+4] ; esi=fi=fz 64 lea edx,[eax+eax*2] 65 mov ebx, esi 66 67; ��������2�������ԤǤ��ʤ���ʬ��FPU�Τۤ���®���� 68 loopalign 16 69.lp20: ; do{ 70; f0 = fi[0 ] + fi[k1]; 71; f2 = fi[k2] + fi[k3]; 72; f1 = fi[0 ] - fi[k1]; 73; f3 = fi[k2] - fi[k3]; 74; fi[0 ] = f0 + f2; 75; fi[k1] = f1 + f3; 76; fi[k2] = f0 - f2; 77; fi[k3] = f1 - f3; 78 lea edi,[ebx+eax] ; edi=gi=fi+ki/2 79 fld dword [ebx] 80 fadd dword [ebx+eax*2] 81 fld dword [ebx+eax*4] 82 fadd dword [ebx+edx*2] 83 84 fld dword [ebx] 85 fsub dword [ebx+eax*2] 86 fld dword [ebx+eax*4] 87 fsub dword [ebx+edx*2] 88 89 fld st1 90 fadd st0,st1 91 fstp dword [ebx+eax*2] 92 fsubp st1,st0 93 fstp dword [ebx+edx*2] 94 95 fld st1 96 fadd st0,st1 97 fstp dword [ebx] 98 fsubp st1,st0 99 fstp dword [ebx+eax*4] 100 101 lea ebx,[ebx + eax*8] ; = fi += (k1 * 4); 102; g0 = gi[0 ] + gi[k1]; 103; g2 = SQRT2 * gi[k2]; 104; g1 = gi[0 ] - gi[k1]; 105; g3 = SQRT2 * gi[k3]; 106; gi[0 ] = g0 + g2; 107; gi[k2] = g0 - g2; 108; gi[k1] = g1 + g3; 109; gi[k3] = g1 - g3; 110 fld dword [edi] 111 fadd dword [edi+eax*2] 112 fld dword [PIC_EBP_REL(S_SQRT2)] 113 fmul dword [edi+eax*4] 114 115 fld dword [edi] 116 fsub dword [edi+eax*2] 117 fld dword [PIC_EBP_REL(S_SQRT2)] 118 fmul dword [edi+edx*2] 119 120 fld st1 121 fadd st0,st1 122 fstp dword [edi+eax*2] 123 fsubp st1,st0 124 fstp dword [edi+edx*2] 125 126 fld st1 127 fadd st0,st1 128 fstp dword [edi] 129 fsubp st1,st0 130 fstp dword [edi+eax*4] 131 132 cmp ebx,[esp] 133 jl near .lp20 ; while (fi<fn); 134 135 136; i = 1; //for (i=1;i<kx;i++){ 137; c1 = 1.0*t_c - 0.0*t_s; 138; s1 = 0.0*t_c + 1.0*t_s; 139 movlps xmm6,[ecx] ; = { --, --, s1, c1} 140 movaps xmm7,xmm6 141 142 shufps xmm6,xmm6,R4(0,1,1,0) ; = {+c1, +s1, +s1, +c1} -> ɬ�� 143; c2 = c1*c1 - s1*s1 = 1 - (2*s1)*s1; 144; s2 = c1*s1 + s1*c1 = 2*s1*c1; 145 shufps xmm7,xmm7,R4(1,0,0,1) 146 movss xmm5,xmm7 ; = { --, --, --, s1} 147 xorps xmm7,[PIC_EBP_REL(Q_MMPP)] ; = {-s1, -c1, +c1, +s1} -> ɬ�� 148 149 addss xmm5,xmm5 ; = (--, --, --, 2*s1) 150 add esi,4 ; esi = fi = fz + i 151 shufps xmm5,xmm5,R4(0,0,0,0) ; = (2*s1, 2*s1, 2*s1, 2*s1) 152 mulps xmm5,xmm6 ; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1) 153 subps xmm5,[PIC_EBP_REL(D_1100)] ; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2} 154 movaps xmm4,xmm5 155 shufps xmm5,xmm5,R4(2,0,2,0) ; = {-c2, s2, -c2, s2} -> ɬ�� 156 157 xorps xmm4,[PIC_EBP_REL(Q_MMPP)] ; = {--, c2, --, s2} 158 shufps xmm4,xmm4,R4(0,2,0,2) ; = {s2, c2, s2, c2} -> ɬ�� 159 160 loopalign 16 161.lp21: ; do{ 162; a = c2*fi[k1] + s2*gi[k1]; 163; b = s2*fi[k1] - c2*gi[k1]; 164; c = c2*fi[k3] + s2*gi[k3]; 165; d = s2*fi[k3] - c2*gi[k3]; 166; f0 = fi[0 ] + a; 167; g0 = gi[0 ] + b; 168; f2 = fi[k1 * 2] + c; 169; g2 = gi[k1 * 2] + d; 170; f1 = fi[0 ] - a; 171; g1 = gi[0 ] - b; 172; f3 = fi[k1 * 2] - c; 173; g3 = gi[k1 * 2] - d; 174 lea edi,[esi + eax*2 - 8] ; edi = gi = fz +k1-i 175 176 movss xmm0,[esi + eax*2] ; = fi[k1] 177 movss xmm2,[esi + edx*2] ; = fi[k3] 178 shufps xmm0,xmm2,0x00 ; = {fi[k3], fi[k3], fi[k1], fi[k1]} 179 movss xmm1,[edi + eax*2] ; = fi[k1] 180 movss xmm3,[edi + edx*2] ; = fi[k3] 181 shufps xmm1,xmm3,0x00 ; = {gi[k3], gi[k3], gi[k1], gi[k1]} 182 movss xmm2,[esi] ; = fi[0] 183 mulps xmm0,xmm4 ; *= {+s2, +c2, +s2, +c2} 184 movss xmm3,[esi + eax*4] ; = fi[k2] 185 unpcklps xmm2,xmm3 ; = {--, --, fi[k2], fi[0]} 186 mulps xmm1,xmm5 ; *= {-c2, +s2, -c2, +s2} 187 movss xmm3,[edi + eax*4] ; = gi[k2] 188 addps xmm0,xmm1 ; = {d, c, b, a} 189 movss xmm1,[edi] ; = gi[0] 190 unpcklps xmm1,xmm3 ; = {--, --, gi[k2], gi[0]} 191 unpcklps xmm2,xmm1 ; = {gi[k2], fi[k2], gi[0], fi[0]} 192 movaps xmm1,xmm2 193 addps xmm1,xmm0 ; = {g2, f2, g0, f0} 194 subps xmm2,xmm0 ; = {g3, f3, g1, f1} 195 196; a = c1*f2 + s1*g3; 197; c = s1*g2 + c1*f3; 198; b = s1*f2 - c1*g3; 199; d = c1*g2 - s1*f3; 200; fi[0 ] = f0 + a; 201; gi[0 ] = g0 + c; 202; gi[k1] = g1 + b; 203; fi[k1] = f1 + d; 204; fi[k1 * 2] = f0 - a; 205; gi[k1 * 2] = g0 - c; 206; gi[k3] = g1 - b; 207; fi[k3] = f1 - d; 208 movaps xmm3,xmm1 209 movhlps xmm1,xmm1 ; = {g2, f2, g2, f2} 210 shufps xmm3,xmm2,0x14 ; = {f1, g1, g0, f0} 211 mulps xmm1,xmm6 ; *= {+c1, +s1, +s1, +c1} 212 shufps xmm2,xmm2,0xBB ; = {f3, g3, f3, g3} 213 mulps xmm2,xmm7 ; *= {-s1, -c1, +c1, +s1} 214 addps xmm1,xmm2 ; = {d, b, c, a} 215 movaps xmm2,xmm3 216 addps xmm3,xmm1 ; = {fi[k1], gi[k1], gi[0], fi[0]} 217 subps xmm2,xmm1 ; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]} 218 movhlps xmm0,xmm3 219 movss [esi],xmm3 220 shufps xmm3,xmm3,0x55 221 movss [edi+eax*2],xmm0 222 shufps xmm0,xmm0,0x55 223 movss [edi],xmm3 224 movss [esi+eax*2],xmm0 225 movhlps xmm0,xmm2 226 movss [esi+eax*4],xmm2 227 shufps xmm2,xmm2,0x55 228 movss [edi+edx*2],xmm0 229 shufps xmm0,xmm0,0x55 230 movss [edi+eax*4],xmm2 231 movss [esi+edx*2],xmm0 232 lea esi,[esi + eax*8] ; fi += (k1 * 4); 233 cmp esi,[esp] 234 jl near .lp21 ; while (fi<fn); 235 236 237; unroll����do loop��43+4̿�� 238 239; ������ǤϤʤ�for�롼�פ�i=2�������unrolling���� 240; kx= 2, 8, 32, 128 241; k4= 16, 64, 256, 1024 242; 0, 6/2,30/2,126/2 243 244 xor ebx,ebx 245 mov bl, 4*2 ; = i = 4 246 cmp ebx,eax ; i < k1 247 jnl near .F22 248; for (i=2;i<kx;i+=2){ 249 loopalign 16 250.lp22: 251; at here, xmm6 is {c3, s3, s3, c3} 252; c1 = c3*t_c - s3*t_s; 253; s1 = c3*t_s + s3*t_c; 254 movlps xmm0,[ecx] 255 shufps xmm0,xmm0,R4(1,1,0,0) ; = {t_s, t_s, t_c, t_c} 256 mulps xmm6,xmm0 ; = {c3*ts, s3*ts, s3*tc, c3*tc} 257 movhlps xmm4,xmm6 ; = {--, --, c3*ts, s3*ts} 258 xorps xmm4,[PIC_EBP_REL(Q_MPMP)] ; = {--, --, -c3*ts, s3*ts} 259 subps xmm6,xmm4 ; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1} 260 261; c3 = c1*t_c - s1*t_s; 262; s3 = s1*t_c + c1*t_s; 263 shufps xmm6,xmm6,0x14 ; = {c1, s1, s1, c1} 264 mulps xmm0,xmm6 ; = {ts*c1 ts*s1 tc*s1 tc*c1} 265 movhlps xmm3,xmm0 266 xorps xmm3,[PIC_EBP_REL(Q_MPMP)] 267 subps xmm0,xmm3 ; = {--, --, s3, c3} 268 269; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1} 270 unpcklps xmm6,xmm0 ; xmm6 = {s3, s1, c3, c1} 271 movaps xmm7, xmm6 272 shufps xmm6,xmm6,R4(2,3,1,0) ; xmm6 = {s1, s3, c3, c1} 273 addps xmm7, xmm7 ; {s3*2, s1*2, --, --} 274 mov edi,[esp+_P+4] ; = fz 275 shufps xmm7, xmm7, R4(2,3,3,2) ; {s1*2, s3*2, s3*2, s1*2} 276 sub edi,ebx ; edi = fz - i/2 277 mulps xmm7, xmm6 ; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2} 278 lea esi,[edi + ebx*2] ; esi = fi = fz +i/2 279 subps xmm7, [PIC_EBP_REL(D_1100)] ; {-c2, -c4, s4, s2} 280 lea edi,[edi + eax*2-4] ; edi = gi = fz +k1-i/2 281 282; fi = fz +i; 283; gi = fz +k1-i; 284; do{ 285.lp220: 286; unroll���do loop��51+4̿�� 287; a = c2*fi[k1 ] + s2*gi[k1 ]; 288; e = c4*fi[k1+1] + s4*gi[k1-1]; 289; f = s4*fi[k1+1] - c4*gi[k1-1]; 290; b = s2*fi[k1 ] - c2*gi[k1 ]; 291; c = c2*fi[k3 ] + s2*gi[k3 ]; 292; g = c4*fi[k3+1] + s4*gi[k3-1]; 293; h = s4*fi[k3+1] - c4*gi[k3-1]; 294; d = s2*fi[k3 ] - c2*gi[k3 ]; 295 296 movaps xmm4,xmm7 ; = {-c2 -c4 s4 s2} 297 xorps xmm4,[PIC_EBP_REL(Q_MMPP)] ; = { c2 c4 s4 s2} 298 shufps xmm4,xmm4,0x1B ; = { s2 s4 c4 c2} 299 movlps xmm0,[esi+eax*2] 300 movlps xmm1,[edi+eax*2] 301 movlps xmm2,[esi+edx*2] 302 movlps xmm3,[edi+edx*2] 303 shufps xmm0,xmm0,0x14 304 shufps xmm1,xmm1,0x41 305 shufps xmm2,xmm2,0x14 306 shufps xmm3,xmm3,0x41 307 mulps xmm0,xmm4 308 mulps xmm1,xmm7 309 mulps xmm2,xmm4 310 mulps xmm3,xmm7 311 addps xmm0,xmm1 ; xmm0 = {b, f, e, a} 312 addps xmm2,xmm3 ; xmm2 = {d, h, g, c} 313;17 314 315; f0 = fi[0 ] + a; 316; f4 = fi[0 +1] + e; 317; g4 = gi[0 -1] + f; 318; g0 = gi[0 ] + b; 319; f1 = fi[0 ] - a; 320; f5 = fi[0 +1] - e; 321; g5 = gi[0 -1] - f; 322; g1 = gi[0 ] - b; 323; f2 = fi[k2 ] + c; 324; f6 = fi[k2+1] + g; 325; g6 = gi[k2-1] + h; 326; g2 = gi[k2 ] + d; 327; f3 = fi[k2 ] - c; 328; f7 = fi[k2+1] - g; 329; g7 = gi[k2-1] - h; 330; g3 = gi[k2 ] - d; 331 movlps xmm1,[esi ] 332 movhps xmm1,[edi ] 333 movaps xmm4,xmm1 334 subps xmm1,xmm0 ; xmm1 = {g1, g5, f5, f1} 335 movlps xmm3,[esi+eax*4] 336 movhps xmm3,[edi+eax*4] 337 movaps xmm5,xmm3 338 subps xmm3,xmm2 ; xmm3 = {g3, g7, f7, f3} 339 addps xmm0,xmm4 ; xmm0 = {g0, g4, f4, f0} 340 addps xmm2,xmm5 ; xmm2 = {g2, g6, f6, f2} 341;10 342 343; a = c1*f2 + s1*g3; ��*�� + ��*�� 344; e = c3*f6 + s3*g7; 345; g = s3*g6 + c3*f7; 346; c = s1*g2 + c1*f3; 347; d = c1*g2 - s1*f3; ��*�� - ��*�� 348; h = c3*g6 - s3*f7; 349; f = s3*f6 - c3*g7; 350; b = s1*f2 - c1*g3; 351 352 movaps xmm5,xmm6 ; xmm6 = {s1, s3, c3, c1} 353 shufps xmm5,xmm5,0x1B ; = {c1, c3, s3, s1} 354 movaps xmm4,xmm2 355 mulps xmm4,xmm6 356 shufps xmm2,xmm2,0x1B ; xmm2 = {f2, f6, g6, g2} 357 mulps xmm2,xmm6 358 mulps xmm5,xmm3 359 mulps xmm3,xmm6 360 shufps xmm3,xmm3,0x1B 361 addps xmm4,xmm3 ; = {c, g, e, a} 362 subps xmm2,xmm5 ; = {b, f, h, d} 363;10 364 365; fi[0 ] = f0 + a; 366; fi[0 +1] = f4 + e; 367; gi[0 -1] = g4 + g; 368; gi[0 ] = g0 + c; 369; fi[k2 ] = f0 - a; 370; fi[k2+1] = f4 - e; 371; gi[k2-1] = g4 - g; 372; gi[k2 ] = g0 - c; 373; fi[k1 ] = f1 + d; 374; fi[k1+1] = f5 + h; 375; gi[k1-1] = g5 + f; 376; gi[k1 ] = g1 + b; 377; fi[k3 ] = f1 - d; 378; fi[k3+1] = f5 - h; 379; gi[k3-1] = g5 - f; 380; gi[k3 ] = g1 - b; 381 movaps xmm3,xmm0 382 subps xmm0,xmm4 383 movlps [esi+eax*4],xmm0 384 movhps [edi+eax*4],xmm0 385 addps xmm4,xmm3 386 movlps [esi ],xmm4 387 movhps [edi ],xmm4 388 389 movaps xmm5,xmm1 390 subps xmm1,xmm2 391 movlps [esi+edx*2],xmm1 392 movhps [edi+edx*2],xmm1 393 addps xmm2,xmm5 394 movlps [esi+eax*2],xmm2 395 movhps [edi+eax*2],xmm2 396; 14 397; gi += k4; 398; fi += k4; 399 lea edi,[edi + eax*8] ; gi += (k1 * 4); 400 lea esi,[esi + eax*8] ; fi += (k1 * 4); 401 cmp esi,[esp] 402 jl near .lp220 ; while (fi<fn); 403; } while (fi<fn); 404 405 add ebx,byte 2*4 ; i+= 4 406 cmp ebx,eax ; i < k1 407 shufps xmm6,xmm6,R4(1,2,2,1) ; (--,s3,c3,--) => {c3, s3, s3, c3} 408 jl near .lp22 409; } 410.F22: 411 shl eax,2 412 add ecx, byte 8 413 cmp eax,[esp+_P+8] ; while ((k1 * 4)<n); 414 jle near .lp2 415 pop ebp 416 pop ebp 417 pop edi 418 pop esi 419 pop ebx 420 ret 421 422 end 423