• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; from a new GOGO-no-coda (1999/09)
2;	Copyright (C) 1999 shigeo
3;	special thanks to Keiichi SAKAI, URURI
4; hacked and back-ported to LAME
5;	 by Takehiro TOMINAGA Nov 2000
6
7%include "nasm.h"
8
9	globaldef fht_3DN
10
11	segment_data
12	align	16
; Constant tables for the transforms below. Every entry is a pair of
; single-precision dwords that the code reads as one 64-bit MMX operand;
; "hi | lo" in the comments means "second dword | first dword".
;
; costab+0 : 0 | 0x80000000 -- sign-bit mask for the low lane only. It is
;            loaded into mm7 and XORed into registers to negate the low float.
13costab	dd	0x80000000, 0
; costab+8 : sqrt(2) in both lanes (loaded into mm6 at the top of each pass).
14	dd	1.414213562,1.414213562
; costab+16 onwards: one 16-byte entry per outer pass (the code advances its
; table pointer by 16 bytes per pass). Each entry is a first value stored
; twice followed by a second value stored twice. The values are numerically
; cos/sin of pi/8, pi/32, pi/128 and pi/512 respectively.
15	dd	9.238795283293805e-01, 9.238795283293805e-01
16	dd	3.826834424611044e-01, 3.826834424611044e-01
17	dd	9.951847264044178e-01, 9.951847264044178e-01
18	dd	9.801714304836734e-02, 9.801714304836734e-02
19	dd	9.996988186794428e-01, 9.996988186794428e-01
20	dd	2.454122920569705e-02, 2.454122920569705e-02
21	dd	9.999811752815535e-01, 9.999811752815535e-01
22	dd	6.135884819898878e-03, 6.135884819898878e-03
; 1.0 | 0.0 -- subtracted from a product pair so that only the high lane
; has 1.0 removed (used to form the double-angle terms in the .for loops).
23D_1_0_0_0	dd	0.0		, 1.0
24
; NOTE(review): segment_code and PIC_OFFSETTABLE are macros from nasm.h, which
; is not visible here. Presumably they switch to the code section and emit the
; position-independent-code helper (get_pc.bp) that the procedures below call
; -- confirm in nasm.h.
25	segment_code
26
27PIC_OFFSETTABLE
28
29
30;void fht_3DN(float *fz, int nn);
;
; In-place butterfly transform over the float buffer fz, written with 3DNow!
; packed-float instructions on the MMX registers (presumably a fast Hartley
; transform, going by the name -- confirm against the C reference).
;
; Stack arguments (as read after the prologue below):
;   [esp+40] = fz   pointer to the data, modified in place
;   [esp+44] = nn   size parameter; the end pointer is fz + nn*8 bytes
; Stack locals (20 bytes reserved):
;   [esp+0]  qword  copy of mm0 (s2 | c2), reloaded on every .do3 iteration
;   [esp+8]  qword  copy of mm1 (-c2 | s2), reloaded on every .do3 iteration
;   [esp+16] dword  fn, the end pointer
; Register roles:
;   r0 = fi, r1 = gi, r2 = 3*r4, r3 = tri (pointer into costab),
;   r4 = kx in bytes (8 on the first pass, multiplied by 4 per pass),
;   r5 = i in bytes
; Byte offsets applied to fi/gi: r4*2 = k1, r4*4 = k2, r2*2 = k3,
; r4*8 = distance between consecutive groups.
;
; r0..r5, pmov, pmovd, pupldq, puphdq, pushd/popd, loopalign and proc/endproc
; come from nasm.h, which is not visible here. The lane comments assume that
; pmov/pmovd/pupldq/puphdq are movq/movd/punpckldq/punpckhdq -- TODO confirm.
; Lane notation in the comments is "high dword | low dword".
31
32proc	fht_3DN
33
34	pushd	ebp, ebx, esi, edi
35
36	sub	esp, 20
37
; Set up ebp as the base for PIC_EBP_REL() data references.
38	call	get_pc.bp
39	add	ebp, PIC_BASE()
40
41	mov	r0, [esp+40]		;fi = fz
42	mov	r1, [esp+44]		;r1 = nn
43	lea	r3, [PIC_EBP_REL(costab)]		;tri = costab
44	lea	r4, [r0+r1*8]		;r4 = fn = fz + nn*8 bytes (end pointer)
45	mov	[esp+16], r4
46	mov	r4, 8			;kx = k1/2 (in bytes)
47
48	pmov	mm7, [r3]		;mm7 = 0 | 0x80000000 (negates the low lane when XORed)
49
; ---- outer pass: one iteration per value of kx ----
50	loopalign 16
51.do1
52	lea	r3, [r3+16]	;tri += 2; (next 16-byte table entry)
53	pmov	mm6, [PIC_EBP_REL(costab+8)]	;mm6 = SQRT2 | SQRT2
54	lea	r2, [r4+r4*2]		;k3*fsize/2
55	mov	r5, 4		;i = 1*fsize
56
; ---- .do2: butterflies on fi and gi = fi + kx that need no table values,
; ---- one group of r4*8 bytes per iteration until r0 reaches fn
57	loopalign 16
58.do2:
59	lea	r1, [r0+r4]		;gi = fi + kx
60	;f
61	pmov	mm0, [r0]	;fi0
62	pmov	mm1, [r0+r4*2]	;fi1
63	pmov	mm2, [r0+r2*2]	;fi3
64	pmov	mm3, [r0+r4*4]	;fi2
65
66	pupldq	mm0, mm0	;fi0 | fi0
67	pupldq	mm1, mm1	;fi1 | fi1
68	pupldq	mm2, mm2	;fi3 | fi3
69	pupldq	mm3, mm3	;fi2 | fi2
70
71	pxor	mm1, mm7	;fi1 | -fi1
72	pxor	mm3, mm7	;fi2 | -fi2
73
74	pfsub	mm0, mm1	;fi0-fi1 | fi0+fi1 = f1 | f0
75	pfsub	mm2, mm3	;fi3-fi2 | fi3+fi2 = -f3 | f2  (f3 := fi2-fi3)
76
77	pmov	mm4, mm0
78	pfadd	mm0, mm2	;f1-f3|f0+f2 = new fi3 | new fi0
79	pfsub	mm4, mm2	;f1+f3|f0-f2 = new fi1 | new fi2
80
81	pmovd	[r0], mm0	;fi[0]
82	puphdq	mm0, mm0
83	pmovd	[r0+r4*4], mm4	;fi[k2]
84	puphdq	mm4, mm4
85
86	pmovd	[r0+r4*2], mm4	;fi[k1]
87	pmovd	[r0+r2*2], mm0	;fi[k3]
88	lea	r0, [r0+r4*8]	;fi advances to the next group (r1 still addresses this group's gi)
89
90	;g
91	pmov	mm0, [r1]	;gi0
92	pmov	mm1, [r1+r4*2]	;gi1
93	pmov	mm2, [r1+r4*4]	;gi2
94	pmov	mm3, [r1+r2*2]	;gi3
95
96	pupldq	mm1, mm1	;gi1 | gi1
97	pupldq	mm0, mm0	;gi0 | gi0
98	pupldq	mm2, mm3	;gi3 | gi2
99
100	pxor	mm1, mm7	;gi1 | -gi1
101
102	pfsub	mm0, mm1	;gi0-gi1|gi0+gi1 = g1 | g0
103	pfmul	mm2, mm6	;gi3*SQRT2|gi2*SQRT2 = g3 | g2
104
105	pmov	mm4, mm0
106	pfadd	mm0, mm2	;g1+g3|g0+g2 = gi1 | gi0
107	pfsub	mm4, mm2	;g1-g3|g0-g2 = gi3 | gi2
108
109	pmovd	[r1], mm0	;gi[0]
110	puphdq	mm0, mm0
111	pmovd	[r1+r4*4], mm4	;gi[k2]
112	puphdq	mm4, mm4
113
114	cmp	r0, [esp + 16]	;fi < fn ? (the stores below do not touch the flags)
115	pmovd	[r1+r4*2], mm0	;gi[k1]
116	pmovd	[r1+r2*2], mm4	;gi[k3]
117
118	jb near .do2
119
; r5 is still 4 here, so this loads the dwords at [r3+4] (low lane) and
; [r3+8] (high lane): one copy of each of the two values of the current
; table entry, i.e. mm6 = s1 | c1.
120	pmov	mm6, [r3+r5]	; this is not aligned address!!
121
; ---- .for: one iteration per i (r5 = 4, 8, ... while r5 < r4) ----
122	loopalign 16
123.for:
124;
125; mm6 = s1 | c1
126; mm7 = 0 | 0x80000000
127;
128	pmov	mm1, mm6
129	mov	r0, [esp+40]	; fz
130	puphdq	mm1, mm1	; s1 | s1
131	lea	r1, [r0+r4*2]
132	pfadd	mm1, mm1	; s1+s1 | s1+s1
133	pfmul	mm1, mm6	; 2*s1*s1 | 2*s1*c1
134	pfsub	mm1, [PIC_EBP_REL(D_1_0_0_0)] ; 2*s1*s1-1.0 | 2*s1*c1 = -c2 | s2
135
136	pmov	mm0, mm1
137	pxor	mm7, mm6	; s1 | -c1
138
; The four unpacks below swap the two lanes of mm0 and of mm6, using mm2 and
; mm3 as scratch (their previous low lanes are don't-care, shown as **).
139	pupldq	mm2, mm0	; s2 | **
140	pupldq	mm3, mm6	; c1 | **
141	puphdq	mm0, mm2	; s2 | -c2
142	puphdq	mm6, mm3	; c1 | s1
143
144	pxor	mm0, [PIC_EBP_REL(costab)]	; s2 | c2 (low lane negated)
145
146; mm0 =  s2| c2
147; mm1 = -c2| s2
148; mm6 =  c1| s1
149; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
150
; mm0/mm1 are reused as temporaries inside .do3, so keep copies on the stack.
151	pmov	[esp], mm0
152	pmov	[esp+8], mm1
153
154	sub	r1, r5		;r1 = gi = fz + k1 - i
155	add	r0, r5		;r0 = fi = fz + i
156
; ---- .do3: butterflies for this i, one group of r4*8 bytes per iteration ----
; NOTE(review): only the low dword of each pmov load below is used. If pmov is
; a 64-bit load, then with r5 = 4 the load at [r1+r2*2] spans 4 bytes past the
; end of the current r4*8-byte group -- confirm that this is safe for the
; last group of the buffer.
157	loopalign 16
158.do3:
159	pmov	mm2, [r0+r4*2] ; fi[k1]
160	pmov	mm4, [r1+r4*2] ; gi[k1]
161	pmov	mm3, [r0+r2*2] ; fi[k3]
162	pmov	mm5, [r1+r2*2] ; gi[k3]
163
164	pupldq	mm2, mm2	; fi1 | fi1
165	pupldq	mm4, mm4	; gi1 | gi1
166	pupldq	mm3, mm3	; fi3 | fi3
167	pupldq	mm5, mm5	; gi3 | gi3
168
169	pfmul	mm2, mm0	; s2 * fi1 | c2 * fi1
170	pfmul	mm4, mm1	;-c2 * gi1 | s2 * gi1
171	pfmul	mm3, mm0	; s2 * fi3 | c2 * fi3
172	pfmul	mm5, mm1	;-c2 * gi3 | s2 * gi3
173
174	pfadd	mm2, mm4		;b | a
175	pfadd	mm3, mm5		;d | c
176
177	pmov	mm0, [r0]
178	pmov	mm4, [r1]
179	pmov	mm1, [r0+r4*4]
180	pmov	mm5, [r1+r4*4]
181
182	pupldq	mm0, mm4		;gi0 | fi0
183	pupldq	mm1, mm5		;gi2 | fi2
184
185	pmov	mm4, mm2
186	pmov	mm5, mm3
187
188	pfadd	mm2, mm0		;g0 | f0
189	pfadd	mm3, mm1		;g2 | f2
190
191	pfsub	mm0, mm4		;g1 | f1
192	pfsub	mm1, mm5		;g3 | f3
193
194	pmov	mm4, mm3
195	pmov	mm5, mm1
196
197	pupldq	mm4, mm4		;f2 | f2
198	puphdq	mm5, mm5		;g3 | g3
199	puphdq	mm3, mm3		;g2 | g2
200	pupldq	mm1, mm1		;f3 | f3
201
202	pfmul	mm4, mm6		;f2 * c1 | f2 * s1
203	pfmul	mm5, mm7		;g3 * s1 | g3 *-c1
204	pfmul	mm3, mm6		;g2 * c1 | g2 * s1
205	pfmul	mm1, mm7		;f3 * s1 | f3 *-c1
206
207	pfadd	mm4, mm5		;a | b
208	pfsub	mm3, mm1		;d | c
209
210	pmov	mm5, mm2
211	pmov	mm1, mm0
212
213	pupldq	mm2, mm2		;f0 | f0
214	pupldq	mm0, mm0		;f1 | f1
215
216	puphdq	mm1, mm2		;f0 | g1
217	puphdq	mm5, mm0		;f1 | g0
218
219	pmov	mm2, mm4
220	pmov	mm0, mm3
221
222	pfadd	mm4, mm1		;fi0 | gi1
223	pfadd	mm3, mm5		;fi1 | gi0
224	pfsub	mm1, mm2		;fi2 | gi3
225	pfsub	mm5, mm0		;fi3 | gi2
226
; Low lanes go to gi[...]; the high lanes are then moved down and go to fi[...].
227	pmovd	[r1+r4*2], mm4	;gi[k1]
228	puphdq	mm4, mm4
229	pmovd	[r1], mm3		;gi[0]
230	puphdq	mm3, mm3
231	pmovd	[r1+r2*2], mm1	;gi[k3]
232	puphdq	mm1, mm1
233	pmovd	[r1+r4*4], mm5	;gi[k2]
234	puphdq	mm5, mm5
235
236	pmovd	[r0], mm4	;fi[0]
237	pmovd	[r0+r4*2], mm3	;fi[k1]
238	pmovd	[r0+r4*4], mm1	;fi[k2]
239	pmovd	[r0+r2*2], mm5	;fi[k3]
240
241	lea	r0, [r0+r4*8]	;fi -> next group
242	lea	r1, [r1+r4*8]	;gi -> next group
243	cmp	r0, [esp + 16]	;fi < fn ?
244	pmov	mm0, [esp]	;restore s2 | c2
245	pmov	mm1, [esp+8]	;restore -c2 | s2
246
247	jb near	.do3
248
; Advance i and rotate (c1, s1) by the current table entry:
; a = the value stored at [r3], b = the value stored at [r3+8].
249	add	r5, 4
250; mm6 =  c1| s1
251; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
252	pfmul	mm6, [r3]	; c1*a | s1*a
253	pfmul	mm7, [r3+8]	; s1*b |-c1*b
254	cmp	r5, r4		; i < kx ? (the MMX ops below leave the flags alone)
255
256	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
257	pupldq	mm7,mm6		; s1*a+c1*b | **
258	puphdq	mm6,mm7		; s1*a+c1*b | c1*a-s1*b = new s1 | new c1
259	pmov	mm7, [PIC_EBP_REL(costab)]	; mm7 = 0 | 0x80000000 again
260	jb near	.for
261
; Next pass: the compare uses the old kx (in bytes) against the nn argument;
; lea does not modify the flags, so jb still tests that compare.
262	mov	r0, [esp+40]	;fi
263	cmp	r4, [esp+40+4]
264	lea	r4, [r4*4]	;kx *= 4
265
266	jb near	.do1
; (.exitttt is not referenced anywhere inside this procedure.)
267.exitttt
268	femms			; leave MMX/3DNow! state before returning
269	add	esp,20
; NOTE(review): popd lists the registers in the same order as pushd above;
; presumably the macro pops them in reverse order -- confirm in nasm.h.
270	popd	ebp, ebx, esi, edi
271endproc
272
273
274;void fht_E3DN(float *fz, int nn);
;
; Same loop structure, stack layout and register roles as fht_3DN, but the
; butterflies are written with pfpnacc and pswapd (extended 3DNow!
; instructions) instead of the xor/unpack sequences.
;
; Stack arguments (as read after the prologue below):
;   [esp+40] = fz   pointer to the data, modified in place
;   [esp+44] = nn   size parameter; the end pointer is fz + nn*8 bytes
; Stack locals (20 bytes reserved):
;   [esp+0]  qword  copy of mm4 (s2 | c2), reloaded on every .do3 iteration
;   [esp+8]  qword  copy of mm5 (-c2 | s2), reloaded on every .do3 iteration
;   [esp+16] dword  fn, the end pointer
; Register roles:
;   r0 = fi, r1 = gi, r2 = 3*r4, r3 = tri (pointer into costab),
;   r4 = kx in bytes (8 on the first pass, multiplied by 4 per pass),
;   r5 = i in bytes
; Byte offsets applied to fi/gi: r4*2 = k1, r4*4 = k2, r2*2 = k3,
; r4*8 = distance between consecutive groups.
;
; r0..r5, pmov, pmovd, pupldq, puphdq, pushd/popd, loopalign and proc/endproc
; come from nasm.h, which is not visible here. The lane comments assume that
; pmov/pmovd/pupldq/puphdq are movq/movd/punpckldq/punpckhdq -- TODO confirm.
; Lane notation in the comments is "high dword | low dword".
; pfpnacc x, x yields (x.lo + x.hi) | (x.lo - x.hi).
;
; NOTE(review): the file has a globaldef for fht_3DN only; whether this
; symbol is exported depends on the proc macro -- confirm in nasm.h.
275
276proc	fht_E3DN
277
278	pushd	ebp, ebx, esi, edi
279
280	sub	esp, 20
281
; Set up ebp as the base for PIC_EBP_REL() data references.
282	call	get_pc.bp
283	add	ebp, PIC_BASE()
284
285	mov	r0, [esp+40]		;fi = fz
286	mov	r1, [esp+44]		;r1 = nn
287	lea	r3, [PIC_EBP_REL(costab)]		;tri = costab
288	lea	r4, [r0+r1*8]		;r4 = fn = fz + nn*8 bytes (end pointer)
289	mov	[esp+16], r4
290	mov	r4, 8			;kx = k1/2 (in bytes)
291
292	pmov	mm7, [r3]		;mm7 = 0 | 0x80000000 (negates the low lane when XORed)
293
; ---- outer pass: one iteration per value of kx ----
294	loopalign 16
295.do1
296	lea	r3, [r3+16]	;tri += 2; (next 16-byte table entry)
297	pmov	mm6, [PIC_EBP_REL(costab+8)]	;mm6 = SQRT2 | SQRT2
298	lea	r2, [r4+r4*2]		;k3*fsize/2
299	mov	r5, 4		;i = 1*fsize
300
; ---- .do2: butterflies on fi and gi = fi + kx that need no table values,
; ---- one group of r4*8 bytes per iteration until r0 reaches fn
301	loopalign 16
302.do2:
303	lea	r1, [r0+r4]		;gi = fi + kx
304;f
305	pmov	mm0, [r0]	; X  | fi0
306	pmov	mm1, [r0+r4*4]	; X  | fi2
307	pupldq	mm0, [r0+r4*2]	;fi1 | fi0
308	pupldq	mm1, [r0+r2*2]	;fi3 | fi2
309	pfpnacc	mm0, mm0	;fi0+fi1 | fi0-fi1 = f0|f1
310	pfpnacc	mm1, mm1	;fi2+fi3 | fi2-fi3 = f2|f3
311
312	pmov	mm2, mm0
313	pfadd	mm0, mm1	;f0+f2|f1+f3 = fi0 | fi1
314	pfsub	mm2, mm1	;f0-f2|f1-f3 = fi2 | fi3
315
316	pmovd	[r0+r4*2], mm0	;fi[k1]
317	pmovd	[r0+r2*2], mm2	;fi[k3]
318
319	puphdq	mm0, mm0
320	puphdq	mm2, mm2
321	pmovd	[r0], mm0	;fi[0]
322	pmovd	[r0+r4*4], mm2	;fi[k2]
323
324	lea	r0, [r0+r4*8]	;fi advances to the next group (r1 still addresses this group's gi)
325;g
326	pmov	mm3, [r1]	;    gi0
327	pmov	mm4, [r1+r2*2]	;    gi3
328	pupldq	mm3, [r1+r4*2]	;gi1|gi0
329	pupldq	mm4, [r1+r4*4]	;gi2|gi3
330
331	pfpnacc	mm3, mm3	;gi0+gi1  |gi0-gi1   = f0|f1
332	pfmul	mm4, mm6	;gi2*SQRT2|gi3*SQRT2 = f2|f3
333
334	pmov	mm5, mm3
335	pfadd	mm3, mm4	;f0+f2|f1+f3
336	pfsub	mm5, mm4	;f0-f2|f1-f3
337
338	cmp	r0, [esp + 16]	;fi < fn ? (the stores below do not touch the flags)
339	pmovd	[r1+r4*2], mm3	;gi[k1]
340	pmovd	[r1+r2*2], mm5	;gi[k3]
341	puphdq	mm3, mm3
342	puphdq	mm5, mm5
343	pmovd	[r1], mm3	;gi[0]
344	pmovd	[r1+r4*4], mm5	;gi[k2]
345
346	jb near .do2
347
; r5 is still 4 here, so this loads the dwords at [r3+4] (low lane) and
; [r3+8] (high lane): one copy of each of the two values of the current
; table entry, i.e. mm6 = s1 | c1.
348	pmov	mm6, [r3+r5]	; this is not aligned address!!
349
; ---- .for: one iteration per i (r5 = 4, 8, ... while r5 < r4) ----
350	loopalign 16
351.for:
352;
353; mm6 = s1 | c1
354; mm7 = 0 | 0x80000000
355;
356	pmov	mm5, mm6
357	mov	r0, [esp+40]	; fz
358	puphdq	mm5, mm5	; s1 | s1
359	lea	r1, [r0+r4*2]
360	pfadd	mm5, mm5	; s1+s1 | s1+s1
361	pfmul	mm5, mm6	; 2*s1*s1 | 2*s1*c1
362	pfsub	mm5, [PIC_EBP_REL(D_1_0_0_0)] ; 2*s1*s1-1.0 | 2*s1*c1 = -c2 | s2
363
364	pswapd	mm4, mm5	; s2 |-c2
365	pxor	mm4, mm7	; s2 | c2
366	pxor	mm7, mm6	; s1 |-c1
367	pswapd	mm6, mm6	; c1 | s1
368
369; mm4 =  s2| c2
370; mm5 = -c2| s2
371; mm6 =  c1| s1
372; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
373
; mm4/mm5 are reused as temporaries inside .do3, so keep copies on the stack.
374	pmov	[esp], mm4
375	pmov	[esp+8], mm5
376
377	sub	r1, r5		;r1 = gi = fz + k1 - i
378	add	r0, r5		;r0 = fi = fz + i
379
; ---- .do3: butterflies for this i, one group of r4*8 bytes per iteration ----
; NOTE(review): only the low dword of each pmov load below is used. If pmov is
; a 64-bit load, then with r5 = 4 the load at [r1+r2*2] spans 4 bytes past the
; end of the current r4*8-byte group -- confirm that this is safe for the
; last group of the buffer.
380	loopalign 16
381.do3:
382	pmov	mm0, [r0+r2*2] ; fi[k3]
383	pmov	mm2, [r1+r2*2] ; gi[k3]
384	pmov	mm1, [r0+r4*2] ; fi[k1]
385	pmov	mm3, [r1+r4*2] ; gi[k1]
386
387	pupldq	mm0, mm0		; fi3 | fi3
388	pupldq	mm2, mm2		; gi3 | gi3
389	pupldq	mm1, mm1		; fi1 | fi1
390	pupldq	mm3, mm3		; gi1 | gi1
391
392	pfmul	mm0, mm4		; s2 * fi3 | c2 * fi3
393	pfmul	mm2, mm5		;-c2 * gi3 | s2 * gi3
394	pfmul	mm1, mm4		; s2 * fi1 | c2 * fi1
395	pfmul	mm3, mm5		;-c2 * gi1 | s2 * gi1
396
397	pfadd	mm0, mm2		;d | c
398	pfadd	mm1, mm3		;b | a
399
400	pmov	mm2, [r0+r4*4]		;fi2
401	pupldq	mm3, [r1+r4*4]		;gi2 | -
402	pmov	mm4, [r0]		;fi0
403	pupldq	mm5, [r1]		;gi0 | -
404
405	pupldq	mm2, mm0		;c | fi2
406	puphdq	mm3, mm0		;d | gi2
407	pupldq	mm4, mm1		;a | fi0
408	puphdq	mm5, mm1		;b | gi0
409
410	pfpnacc	mm2, mm2		;f2 | f3
411	pfpnacc	mm3, mm3		;g2 | g3
412	pfpnacc	mm4, mm4		;f0 | f1
413	pfpnacc	mm5, mm5		;g0 | g1
414
415	pmov	mm0, mm2
416	pmov	mm1, mm3
417	pupldq	mm2, mm2		;f3 | f3
418	pupldq	mm3, mm3		;g3 | g3
419	puphdq	mm0, mm0		;f2 | f2
420	puphdq	mm1, mm1		;g2 | g2
421
422	pswapd	mm4, mm4		;f1 | f0
423	pswapd	mm5, mm5		;g1 | g0
424
425	pfmul	mm0, mm7		;f2 * s1 | f2 *-c1
426	pfmul	mm3, mm6		;g3 * c1 | g3 * s1
427	pfmul	mm1, mm6		;g2 * c1 | g2 * s1
428	pfmul	mm2, mm7		;f3 * s1 | f3 *-c1
429
430	pfsub	mm0, mm3		; b |-a
431	pfsub	mm1, mm2		; d | c
432
433	pmov	mm2, mm5
434	pmov	mm3, mm4
435	pupldq	mm4, mm0		;-a | f0
436	pupldq	mm5, mm1		; c | g0
437	puphdq	mm2, mm0		; b | g1
438	puphdq	mm3, mm1		; d | f1
439
440	pfpnacc	mm4, mm4		;fi2 | fi0
441	pfpnacc	mm5, mm5		;gi0 | gi2
442	pfpnacc	mm2, mm2		;gi1 | gi3
443	pfpnacc	mm3, mm3		;fi1 | fi3
444
; Low lanes are stored first; the high lanes are then moved down and stored.
445	pmovd	[r0], mm4		;fi[0]
446	pmovd	[r1+r4*4], mm5		;gi[k2]
447	pmovd	[r1+r2*2], mm2		;gi[k3]
448	pmovd	[r0+r2*2], mm3		;fi[k3]
449
450	puphdq	mm4, mm4
451	puphdq	mm5, mm5
452	puphdq	mm2, mm2
453	puphdq	mm3, mm3
454	pmovd	[r0+r4*4], mm4		;fi[k2]
455	pmovd	[r1], mm5		;gi[0]
456	pmovd	[r1+r4*2], mm2		;gi[k1]
457	pmovd	[r0+r4*2], mm3		;fi[k1]
458
459	lea	r0, [r0+r4*8]	;fi -> next group
460	lea	r1, [r1+r4*8]	;gi -> next group
461	cmp	r0, [esp + 16]	;fi < fn ?
462	pmov	mm4, [esp]	;restore s2 | c2
463	pmov	mm5, [esp+8]	;restore -c2 | s2
464
465	jb near	.do3
466
; Advance i and rotate (c1, s1) by the current table entry:
; a = the value stored at [r3], b = the value stored at [r3+8].
467	add	r5, 4
468; mm6 =  c1| s1
469; mm7 =  s1|-c1 (we use the opposite sign. from GOGO here)
470	pfmul	mm6, [r3]	; c1*a | s1*a
471	pfmul	mm7, [r3+8]	; s1*b |-c1*b
472	cmp	r5, r4		; i < kx ? (the MMX ops below leave the flags alone)
473
474	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
475	pswapd	mm6, mm6 ; ???	; s1*a+c1*b | c1*a-s1*b = new s1 | new c1
476	pmov	mm7, [PIC_EBP_REL(costab)]	; mm7 = 0 | 0x80000000 again
477	jb near	.for
478
; Next pass: the compare uses the old kx (in bytes) against the nn argument;
; lea does not modify the flags, so jb still tests that compare.
479	mov	r0, [esp+40]	;fi
480	cmp	r4, [esp+40+4]
481	lea	r4, [r4*4]	;kx *= 4
482
483	jb near	.do1
; (.exitttt is not referenced anywhere inside this procedure.)
484.exitttt
485	femms			; leave MMX/3DNow! state before returning
486	add	esp,20
; NOTE(review): popd lists the registers in the same order as pushd above;
; presumably the macro pops them in reverse order -- confirm in nasm.h.
487	popd	ebp, ebx, esi, edi
488endproc
489