• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
2
3; GOGO-no-coda
4;	Copyright (C) 1999 shigeo
5;	special thanks to Keiichi SAKAI
6
7%include "nasm.h"
8
9	globaldef fht_SSE
10
11	segment_data
12	align 16
13Q_MMPP	dd	0x0,0x0,0x80000000,0x80000000
14Q_MPMP	dd	0x0,0x80000000,0x0,0x80000000
15D_1100	dd 0.0, 0.0, 1.0, 1.0
16costab_fft:
17	dd 9.238795325112867e-01
18	dd 3.826834323650898e-01
19	dd 9.951847266721969e-01
20	dd 9.801714032956060e-02
21	dd 9.996988186962042e-01
22	dd 2.454122852291229e-02
23	dd 9.999811752836011e-01
24	dd 6.135884649154475e-03
25S_SQRT2	dd	1.414213562
26
27	segment_code
28
29PIC_OFFSETTABLE
30
31;------------------------------------------------------------------------
32;	by K. SAKAI
33;	99/08/18	PIII 23k[clk]
34;	99/08/19	̿�������촹�� PIII 22k[clk]
35;	99/08/20	bit reversal ����夫��ܿ����� PIII 17k[clk]
36;	99/08/23	���� unroll PIII 14k[clk]
37;	99/11/12	clean up
38;
39;void fht_SSE(float *fz, int n);
40	align 16
41fht_SSE:
42	push	ebx
43	push	esi
44	push	edi
45	push	ebp
46
47%assign _P 4*5
48
49	;2���ܤΥ롼��
50	mov	eax,[esp+_P+0]	;eax=fz
51	mov	ebp,[esp+_P+4]	;=n
52	shl	ebp,3
53	add	ebp,eax		; fn  = fz + n, ���δؿ���λ�ޤ�����
54	push	ebp
55
56	call	get_pc.bp
57	add	ebp, PIC_BASE()
58
59	lea	ecx,[PIC_EBP_REL(costab_fft)]
60	xor	eax,eax
61	mov	al,8		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...
62.lp2:				; do{
63	mov	esi,[esp+_P+4]	; esi=fi=fz
64	lea	edx,[eax+eax*2]
65	mov	ebx, esi
66
67; ��������2���������ԤǤ��ʤ���ʬ��FPU�Τۤ���®����
68	loopalign	16
69.lp20:				; do{
70;                       f0     = fi[0 ] + fi[k1];
71;                       f2     = fi[k2] + fi[k3];
72;                       f1     = fi[0 ] - fi[k1];
73;                       f3     = fi[k2] - fi[k3];
74;                       fi[0 ] = f0     + f2;
75;                       fi[k1] = f1     + f3;
76;                       fi[k2] = f0     - f2;
77;                       fi[k3] = f1     - f3;
78	lea	edi,[ebx+eax]	; edi=gi=fi+ki/2
79	fld	dword [ebx]
80	fadd	dword [ebx+eax*2]
81	fld	dword [ebx+eax*4]
82	fadd	dword [ebx+edx*2]
83
84	fld	dword [ebx]
85	fsub	dword [ebx+eax*2]
86	fld	dword [ebx+eax*4]
87	fsub	dword [ebx+edx*2]
88
89	fld	st1
90	fadd	st0,st1
91	fstp	dword [ebx+eax*2]
92	fsubp	st1,st0
93	fstp	dword [ebx+edx*2]
94
95	fld	st1
96	fadd	st0,st1
97	fstp	dword [ebx]
98	fsubp	st1,st0
99	fstp	dword [ebx+eax*4]
100
101	lea	ebx,[ebx + eax*8]	; = fi += (k1 * 4);
102;                       g0     = gi[0 ] + gi[k1];
103;                       g2     = SQRT2  * gi[k2];
104;                       g1     = gi[0 ] - gi[k1];
105;                       g3     = SQRT2  * gi[k3];
106;                       gi[0 ] = g0     + g2;
107;                       gi[k2] = g0     - g2;
108;                       gi[k1] = g1     + g3;
109;                       gi[k3] = g1     - g3;
110	fld	dword [edi]
111	fadd	dword [edi+eax*2]
112	fld	dword [PIC_EBP_REL(S_SQRT2)]
113	fmul	dword [edi+eax*4]
114
115	fld	dword [edi]
116	fsub	dword [edi+eax*2]
117	fld	dword [PIC_EBP_REL(S_SQRT2)]
118	fmul	dword [edi+edx*2]
119
120	fld	st1
121	fadd	st0,st1
122	fstp	dword [edi+eax*2]
123	fsubp	st1,st0
124	fstp	dword [edi+edx*2]
125
126	fld	st1
127	fadd	st0,st1
128	fstp	dword [edi]
129	fsubp	st1,st0
130	fstp	dword [edi+eax*4]
131
132	cmp	ebx,[esp]
133	jl	near .lp20		; while (fi<fn);
134
135
136;               i = 1; //for (i=1;i<kx;i++){
137;                       c1 = 1.0*t_c - 0.0*t_s;
138;                       s1 = 0.0*t_c + 1.0*t_s;
139	movlps	xmm6,[ecx] ; = { --,  --,  s1, c1}
140	movaps	xmm7,xmm6
141
142	shufps	xmm6,xmm6,R4(0,1,1,0)	; = {+c1, +s1, +s1, +c1} -> ɬ��
143;                       c2 = c1*c1 - s1*s1 = 1 - (2*s1)*s1;
144;                       s2 = c1*s1 + s1*c1 = 2*s1*c1;
145	shufps	xmm7,xmm7,R4(1,0,0,1)
146	movss	xmm5,xmm7		; = { --,  --,  --, s1}
147	xorps	xmm7,[PIC_EBP_REL(Q_MMPP)]	; = {-s1, -c1, +c1, +s1} -> ɬ��
148
149	addss	xmm5,xmm5		; = (--, --,  --, 2*s1)
150	add	esi,4		; esi = fi = fz + i
151	shufps	xmm5,xmm5,R4(0,0,0,0)	; = (2*s1, 2*s1, 2*s1, 2*s1)
152	mulps	xmm5,xmm6		; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
153	subps	xmm5,[PIC_EBP_REL(D_1100)]		; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
154	movaps	xmm4,xmm5
155	shufps	xmm5,xmm5,R4(2,0,2,0)	; = {-c2, s2, -c2, s2} -> ɬ��
156
157	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]		; = {--, c2, --, s2}
158	shufps	xmm4,xmm4,R4(0,2,0,2)	; = {s2, c2, s2, c2} -> ɬ��
159
160	loopalign	16
161.lp21:				; do{
162;                               a       = c2*fi[k1] + s2*gi[k1];
163;                               b       = s2*fi[k1] - c2*gi[k1];
164;                               c       = c2*fi[k3] + s2*gi[k3];
165;                               d       = s2*fi[k3] - c2*gi[k3];
166;                               f0      = fi[0 ]        + a;
167;                               g0      = gi[0 ]        + b;
168;                               f2      = fi[k1 * 2]    + c;
169;                               g2      = gi[k1 * 2]    + d;
170;                               f1      = fi[0 ]        - a;
171;                               g1      = gi[0 ]        - b;
172;                               f3      = fi[k1 * 2]    - c;
173;                               g3      = gi[k1 * 2]    - d;
174	lea	edi,[esi + eax*2 - 8]	; edi = gi = fz +k1-i
175
176	movss	xmm0,[esi + eax*2]	; = fi[k1]
177	movss	xmm2,[esi + edx*2]	; = fi[k3]
178	shufps	xmm0,xmm2,0x00	; = {fi[k3], fi[k3], fi[k1], fi[k1]}
179	movss	xmm1,[edi + eax*2]	; = fi[k1]
180	movss	xmm3,[edi + edx*2]	; = fi[k3]
181	shufps	xmm1,xmm3,0x00	; = {gi[k3], gi[k3], gi[k1], gi[k1]}
182	movss	xmm2,[esi]		; = fi[0]
183	mulps	xmm0,xmm4		; *= {+s2, +c2, +s2, +c2}
184	movss	xmm3,[esi + eax*4]	; = fi[k2]
185	unpcklps	xmm2,xmm3	; = {--, --, fi[k2], fi[0]}
186	mulps	xmm1,xmm5		; *= {-c2, +s2, -c2, +s2}
187	movss	xmm3,[edi + eax*4]	; = gi[k2]
188	addps	xmm0,xmm1		; = {d, c, b, a}
189	movss	xmm1,[edi]		; = gi[0]
190	unpcklps	xmm1,xmm3	; = {--,  --, gi[k2], gi[0]}
191	unpcklps	xmm2,xmm1	; = {gi[k2], fi[k2], gi[0], fi[0]}
192	movaps	xmm1,xmm2
193	addps	xmm1,xmm0	; = {g2, f2, g0, f0}
194	subps	xmm2,xmm0	; = {g3, f3, g1, f1}
195
196;                               a       = c1*f2     + s1*g3;
197;                               c       = s1*g2     + c1*f3;
198;                               b       = s1*f2     - c1*g3;
199;                               d       = c1*g2     - s1*f3;
200;                               fi[0 ]  = f0        + a;
201;                               gi[0 ]  = g0        + c;
202;                               gi[k1]  = g1        + b;
203;                               fi[k1]  = f1        + d;
204;                               fi[k1 * 2]  = f0    - a;
205;                               gi[k1 * 2]  = g0    - c;
206;                               gi[k3]      = g1    - b;
207;                               fi[k3]      = f1    - d;
208	movaps	xmm3,xmm1
209	movhlps	xmm1,xmm1	; = {g2, f2, g2, f2}
210	shufps	xmm3,xmm2,0x14	; = {f1, g1, g0, f0}
211	mulps	xmm1,xmm6	; *= {+c1, +s1, +s1, +c1}
212	shufps	xmm2,xmm2,0xBB	; = {f3, g3, f3, g3}
213	mulps	xmm2,xmm7	; *= {-s1, -c1, +c1, +s1}
214	addps	xmm1,xmm2	; = {d, b, c, a}
215	movaps	xmm2,xmm3
216	addps	xmm3,xmm1	; = {fi[k1], gi[k1], gi[0], fi[0]}
217	subps	xmm2,xmm1	; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}
218	movhlps	xmm0,xmm3
219	movss	[esi],xmm3
220	shufps	xmm3,xmm3,0x55
221	movss	[edi+eax*2],xmm0
222	shufps	xmm0,xmm0,0x55
223	movss	[edi],xmm3
224	movss	[esi+eax*2],xmm0
225	movhlps	xmm0,xmm2
226	movss	[esi+eax*4],xmm2
227	shufps	xmm2,xmm2,0x55
228	movss	[edi+edx*2],xmm0
229	shufps	xmm0,xmm0,0x55
230	movss	[edi+eax*4],xmm2
231	movss	[esi+edx*2],xmm0
232	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
233	cmp	esi,[esp]
234	jl	near .lp21		; while (fi<fn);
235
236
237; unroll����do loop��43+4̿��
238
239; ������ǤϤʤ�for�롼�פ�i=2�������unrolling����
240; kx=   2,   8,  32,  128
241; k4=  16,  64, 256, 1024
242;       0, 6/2,30/2,126/2
243
244	xor	ebx,ebx
245	mov	bl, 4*2		; = i = 4
246	cmp	ebx,eax		; i < k1
247	jnl	near .F22
248;               for (i=2;i<kx;i+=2){
249	loopalign	16
250.lp22:
251; at here, xmm6 is {c3, s3, s3, c3}
252;                       c1 = c3*t_c - s3*t_s;
253;                       s1 = c3*t_s + s3*t_c;
254	movlps	xmm0,[ecx]
255	shufps	xmm0,xmm0,R4(1,1,0,0)	; = {t_s, t_s, t_c, t_c}
256	mulps	xmm6,xmm0	; = {c3*ts, s3*ts, s3*tc, c3*tc}
257	movhlps	xmm4,xmm6	; = {--,    --,    c3*ts, s3*ts}
258	xorps	xmm4,[PIC_EBP_REL(Q_MPMP)]	; = {--,    --,   -c3*ts, s3*ts}
259	subps	xmm6,xmm4	; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}
260
261;                       c3 = c1*t_c - s1*t_s;
262;                       s3 = s1*t_c + c1*t_s;
263	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
264	mulps	xmm0,xmm6	; = {ts*c1 ts*s1 tc*s1 tc*c1}
265	movhlps	xmm3,xmm0
266	xorps	xmm3,[PIC_EBP_REL(Q_MPMP)]
267	subps	xmm0,xmm3	; = {--, --, s3, c3}
268
269; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
270	unpcklps	xmm6,xmm0	; xmm6 = {s3, s1, c3, c1}
271	movaps	xmm7, xmm6
272	shufps	xmm6,xmm6,R4(2,3,1,0)	; xmm6 = {s1, s3, c3, c1}
273	addps	xmm7, xmm7		; {s3*2, s1*2,   --,   --}
274	mov	edi,[esp+_P+4]		; = fz
275	shufps	xmm7, xmm7, R4(2,3,3,2)	; {s1*2, s3*2, s3*2, s1*2}
276	sub	edi,ebx			; edi = fz - i/2
277	mulps	xmm7, xmm6		; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
278	lea	esi,[edi + ebx*2]	; esi = fi = fz +i/2
279	subps	xmm7, [PIC_EBP_REL(D_1100)]		; {-c2, -c4, s4, s2}
280	lea	edi,[edi + eax*2-4]	; edi = gi = fz +k1-i/2
281
282;                       fi = fz +i;
283;                       gi = fz +k1-i;
284;                       do{
285.lp220:
286; unroll���do loop��51+4̿��
287;                               a       = c2*fi[k1  ] + s2*gi[k1  ];
288;                               e       = c4*fi[k1+1] + s4*gi[k1-1];
289;                               f       = s4*fi[k1+1] - c4*gi[k1-1];
290;                               b       = s2*fi[k1  ] - c2*gi[k1  ];
291;                               c       = c2*fi[k3  ] + s2*gi[k3  ];
292;                               g       = c4*fi[k3+1] + s4*gi[k3-1];
293;                               h       = s4*fi[k3+1] - c4*gi[k3-1];
294;                               d       = s2*fi[k3  ] - c2*gi[k3  ];
295
296	movaps	xmm4,xmm7	; = {-c2 -c4  s4  s2}
297	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]	; = { c2  c4  s4  s2}
298	shufps	xmm4,xmm4,0x1B	; = { s2  s4  c4  c2}
299	movlps	xmm0,[esi+eax*2]
300	movlps	xmm1,[edi+eax*2]
301	movlps	xmm2,[esi+edx*2]
302	movlps	xmm3,[edi+edx*2]
303	shufps	xmm0,xmm0,0x14
304	shufps	xmm1,xmm1,0x41
305	shufps	xmm2,xmm2,0x14
306	shufps	xmm3,xmm3,0x41
307	mulps	xmm0,xmm4
308	mulps	xmm1,xmm7
309	mulps	xmm2,xmm4
310	mulps	xmm3,xmm7
311	addps	xmm0,xmm1	; xmm0 = {b, f, e, a}
312	addps	xmm2,xmm3	; xmm2 = {d, h, g, c}
313;17
314
315;                               f0      = fi[0   ]    + a;
316;                               f4      = fi[0 +1]    + e;
317;                               g4      = gi[0 -1]    + f;
318;                               g0      = gi[0   ]    + b;
319;                               f1      = fi[0   ]    - a;
320;                               f5      = fi[0 +1]    - e;
321;                               g5      = gi[0 -1]    - f;
322;                               g1      = gi[0   ]    - b;
323;                               f2      = fi[k2  ]    + c;
324;                               f6      = fi[k2+1]    + g;
325;                               g6      = gi[k2-1]    + h;
326;                               g2      = gi[k2  ]    + d;
327;                               f3      = fi[k2  ]    - c;
328;                               f7      = fi[k2+1]    - g;
329;                               g7      = gi[k2-1]    - h;
330;                               g3      = gi[k2  ]    - d;
331	movlps	xmm1,[esi      ]
332	movhps	xmm1,[edi      ]
333	movaps	xmm4,xmm1
334	subps	xmm1,xmm0	; xmm1 = {g1, g5, f5, f1}
335	movlps	xmm3,[esi+eax*4]
336	movhps	xmm3,[edi+eax*4]
337	movaps	xmm5,xmm3
338	subps	xmm3,xmm2	; xmm3 = {g3, g7, f7, f3}
339	addps	xmm0,xmm4	; xmm0 = {g0, g4, f4, f0}
340	addps	xmm2,xmm5	; xmm2 = {g2, g6, f6, f2}
341;10
342
343;                               a       = c1*f2     + s1*g3;	��*�� + ��*��
344;                               e       = c3*f6     + s3*g7;
345;                               g       = s3*g6     + c3*f7;
346;                               c       = s1*g2     + c1*f3;
347;                               d       = c1*g2     - s1*f3;	��*�� - ��*��
348;                               h       = c3*g6     - s3*f7;
349;                               f       = s3*f6     - c3*g7;
350;                               b       = s1*f2     - c1*g3;
351
352	movaps	xmm5,xmm6	; xmm6 = {s1, s3, c3, c1}
353	shufps	xmm5,xmm5,0x1B	; = {c1, c3, s3, s1}
354	movaps	xmm4,xmm2
355	mulps	xmm4,xmm6
356	shufps	xmm2,xmm2,0x1B	; xmm2 = {f2, f6, g6, g2}
357	mulps	xmm2,xmm6
358	mulps	xmm5,xmm3
359	mulps	xmm3,xmm6
360	shufps	xmm3,xmm3,0x1B
361	addps	xmm4,xmm3	; = {c, g, e, a}
362	subps	xmm2,xmm5	; = {b, f, h, d}
363;10
364
365;                               fi[0   ]  = f0        + a;
366;                               fi[0 +1]  = f4        + e;
367;                               gi[0 -1]  = g4        + g;
368;                               gi[0   ]  = g0        + c;
369;                               fi[k2  ]  = f0        - a;
370;                               fi[k2+1]  = f4        - e;
371;                               gi[k2-1]  = g4        - g;
372;                               gi[k2  ]  = g0        - c;
373;                               fi[k1  ]  = f1        + d;
374;                               fi[k1+1]  = f5        + h;
375;                               gi[k1-1]  = g5        + f;
376;                               gi[k1  ]  = g1        + b;
377;                               fi[k3  ]  = f1        - d;
378;                               fi[k3+1]  = f5        - h;
379;                               gi[k3-1]  = g5        - f;
380;                               gi[k3  ]  = g1        - b;
381	movaps	xmm3,xmm0
382	subps	xmm0,xmm4
383	movlps	[esi+eax*4],xmm0
384	movhps	[edi+eax*4],xmm0
385	addps	xmm4,xmm3
386	movlps	[esi      ],xmm4
387	movhps	[edi      ],xmm4
388
389	movaps	xmm5,xmm1
390	subps	xmm1,xmm2
391	movlps	[esi+edx*2],xmm1
392	movhps	[edi+edx*2],xmm1
393	addps	xmm2,xmm5
394	movlps	[esi+eax*2],xmm2
395	movhps	[edi+eax*2],xmm2
396; 14
397;                               gi     += k4;
398;                               fi     += k4;
399	lea	edi,[edi + eax*8] ; gi += (k1 * 4);
400	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
401	cmp	esi,[esp]
402	jl	near .lp220		; while (fi<fn);
403;                       } while (fi<fn);
404
405	add	ebx,byte 2*4	; i+= 4
406	cmp	ebx,eax		; i < k1
407	shufps	xmm6,xmm6,R4(1,2,2,1)	; (--,s3,c3,--) => {c3, s3, s3, c3}
408	jl	near .lp22
409;               }
410.F22:
411	shl	eax,2
412	add	ecx, byte 8
413	cmp	eax,[esp+_P+8]	; while ((k1 * 4)<n);
414	jle	near .lp2
415	pop	ebp
416	pop	ebp
417	pop	edi
418	pop	esi
419	pop	ebx
420	ret
421
422	end
423