• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1
2/*
3 * Mesa 3-D graphics library
4 *
5 * Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included
15 * in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
21 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
22 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26/** TODO:
27  * - insert PREFETCH instructions to avoid cache-misses !
28  * - some more optimizations are possible...
29  * - for 40-50% more performance in the SSE functions, the
30  *   data (transformation matrix, src_vert, dst_vert) needs to be 16-byte aligned!
31  */
32
33#ifdef USE_SSE_ASM
34#include "assyntax.h"
35#define MATH_ASM_PTR_SIZE 4
36#include "math/m_vector_asm.h"
37#include "xform_args.h"
38
39   SEG_TEXT
40
/*
 * Element accessors used by the routines below: operand for the i-th
 * 4-byte element relative to the current source vertex pointer (ESI),
 * the current destination vertex pointer (EDI) and the matrix pointer
 * (EDX) respectively.
 */
41#define S(i) 	REGOFF(i * 4, ESI)
42#define D(i) 	REGOFF(i * 4, EDI)
43#define M(i) 	REGOFF(i * 4, EDX)
44
45
/*
 * _mesa_sse_transform_points3_general
 *
 * Transforms every 3-component source vertex (ox, oy, oz) by a full
 * 16-element matrix m and stores a 4-component result:
 *
 *     dst[j] = ox*m[j] + oy*m[4+j] + oz*m[8+j] + m[12+j]     (j = 0..3)
 *
 * Arguments are read from the stack through OFFSET_SOURCE, OFFSET_DEST and
 * ARG_MATRIX; the vector fields are reached through the V4F_* offsets (all
 * of these are defined outside this file).
 *
 * If the source count is zero the routine returns without touching the
 * destination.  Otherwise VEC_SIZE_4 is OR-ed into the destination flags,
 * the source count is copied to the destination and its size is set to 4.
 *
 * Register roles inside the loop:
 *     ESI  current source vertex     EAX  source stride in bytes
 *     EDI  current dest vertex       ECX  end of dest data (start + count*16)
 *     EDX  matrix                    XMM0-XMM3  m[0..3], m[4..7], m[8..11], m[12..15]
 *
 * Destination vertices are written 16 bytes apart.  The matrix loads and the
 * result store use MOVAPS, so the matrix and the destination data must be
 * 16-byte aligned.
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM6 and the flags
 * are overwritten.
 */
46ALIGNTEXT4
47GLOBL GLNAME(_mesa_sse_transform_points3_general)
48HIDDEN(_mesa_sse_transform_points3_general)
49GLNAME( _mesa_sse_transform_points3_general ):
50    _CET_ENDBR
51#define FRAME_OFFSET 8
52    PUSH_L    ( ESI )			/* two 4-byte pushes: stack arguments are now 8 bytes */
53    PUSH_L    ( EDI )			/* further from ESP, hence the +8 / FRAME_OFFSET below */
54
55    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
56    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
57
58    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
59    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
60
61    TEST_L( ECX, ECX)				/* same zero test as the sibling routines */
62    JZ( LLBL(K_GTPGR_finish) ) 			/* count was zero; nothing to do */
63
64    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* source stride (bytes between vertices) */
65    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
66
67    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
68    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
69
70    SHL_L( CONST(4), ECX ) 			/* count *= 16 (bytes per dest vertex) */
71    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
72
73    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
74    ADD_L( EDI, ECX ) 				/* ECX = end of dest data (loop sentinel) */
75
76
77ALIGNTEXT32
78    MOVAPS    ( REGOFF(0, EDX), XMM0 )	/* m3  | m2  | m1  | m0  */
79    MOVAPS    ( REGOFF(16, EDX), XMM1 )	/* m7  | m6  | m5  | m4  */
80    MOVAPS    ( REGOFF(32, EDX), XMM2 )	/* m11 | m10 | m9  | m8  */
81    MOVAPS    ( REGOFF(48, EDX), XMM3 )	/* m15 | m14 | m13 | m12 */
82
83
84ALIGNTEXT32
85LLBL(K_GTPGR_top):
86    MOVSS     ( REGOFF(0, ESI), XMM4 )		/*    |    |    | ox */
87    SHUFPS    ( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox | ox */
88    MOVSS     ( REGOFF(4, ESI), XMM5 )		/*    |    |    | oy */
89    SHUFPS    ( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy | oy */
90    MOVSS     ( REGOFF(8, ESI), XMM6 )		/*    |    |    | oz */
91    SHUFPS    ( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz | oz */
92
93    MULPS     ( XMM0, XMM4 )		/* m3*ox  | m2*ox  | m1*ox | m0*ox */
94    MULPS     ( XMM1, XMM5 )		/* m7*oy  | m6*oy  | m5*oy | m4*oy */
95    MULPS     ( XMM2, XMM6 )		/* m11*oz | m10*oz | m9*oz | m8*oz */
96
97    ADDPS     ( XMM5, XMM4 )		/* ox terms + oy terms */
98    ADDPS     ( XMM6, XMM4 )		/* + oz terms */
99    ADDPS     ( XMM3, XMM4 )		/* + m15 | m14 | m13 | m12 */
100
101    MOVAPS    ( XMM4, REGOFF(0, EDI) )	/* store all four result components */
102
103LLBL(K_GTPGR_skip):
104    ADD_L     ( CONST(16), EDI )		/* next dest vertex */
105    ADD_L     ( EAX, ESI )			/* next source vertex (source stride) */
106    CMP_L     ( ECX, EDI )			/* reached end of dest data ? */
107    JNE       ( LLBL(K_GTPGR_top) )		/* no -> transform next vertex */
108
109LLBL(K_GTPGR_finish):
110    POP_L     ( EDI )
111    POP_L     ( ESI )
112    RET
113#undef FRAME_OFFSET
114
115
/*
 * _mesa_sse_transform_points3_identity
 *
 * Identity transform: copies the first three 4-byte components of every
 * source vertex to the destination unchanged.  No matrix is read.
 *
 * Arguments are read from the stack through OFFSET_SOURCE and OFFSET_DEST;
 * the vector fields are reached through the V4F_* offsets (defined outside
 * this file).
 *
 * If the source count is zero the routine returns without touching the
 * destination.  Otherwise VEC_SIZE_3 is OR-ed into the destination flags,
 * the source count is copied to the destination and its size is set to 3.
 * When the source and destination data pointers are equal the copy loop is
 * skipped (the destination fields above have already been updated).
 *
 * Loop registers: ESI = current source vertex, EDI = current dest vertex,
 * EAX = source stride in bytes, ECX = end of dest data (start + count*16).
 * Destination vertices are written 16 bytes apart.
 *
 * ESI and EDI are saved and restored; EAX, ECX, XMM0 and the flags are
 * overwritten.
 */
116ALIGNTEXT4
117GLOBL GLNAME(_mesa_sse_transform_points3_identity)
118HIDDEN(_mesa_sse_transform_points3_identity)
119GLNAME( _mesa_sse_transform_points3_identity ):
120    _CET_ENDBR
121#define FRAME_OFFSET 8
122    PUSH_L    ( ESI )			/* two 4-byte pushes: stack arguments are now 8 bytes */
123    PUSH_L    ( EDI )			/* further from ESP, hence the +8 below */
124
125    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
126    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
127
128    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
129
130    TEST_L( ECX, ECX)
131    JZ( LLBL(K_GTPIR_finish) ) 			/* count was zero; go to finish */
132
133    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* source stride (bytes between vertices) */
134    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
135
136    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
137    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
138
139    SHL_L( CONST(4), ECX ) 			/* count *= 16 (bytes per dest vertex) */
140    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
141
142    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
143    ADD_L( EDI, ECX ) 				/* ECX = end of dest data (loop sentinel) */
144
145    CMP_L( ESI, EDI )				/* source data == dest data ? */
146    JE( LLBL(K_GTPIR_finish) )			/* yes -> nothing to copy */
147
148
149ALIGNTEXT32
150LLBL(K_GTPIR_top):
151    MOVLPS    ( S(0), XMM0 )			/* load  s1 | s0 (8 bytes) */
152    MOVLPS    ( XMM0, D(0) )			/* store -> D(1) | D(0) */
153    MOVSS     ( S(2), XMM0 )			/* load  s2 */
154    MOVSS     ( XMM0, D(2) )			/* store -> D(2) */
155
156LLBL(K_GTPIR_skip):
157    ADD_L     ( CONST(16), EDI )		/* next dest vertex */
158    ADD_L     ( EAX, ESI )			/* next source vertex (source stride) */
159    CMP_L     ( ECX, EDI )			/* reached end of dest data ? */
160    JNE       ( LLBL(K_GTPIR_top) )		/* no -> copy next vertex */
161
162LLBL(K_GTPIR_finish):
163    POP_L     ( EDI )
164    POP_L     ( ESI )
165    RET
166#undef FRAME_OFFSET
167
168
169
170
/*
 * _mesa_sse_transform_points3_3d_no_rot
 *
 * Per-axis scale plus translation.  For every source vertex (s0, s1, s2):
 *
 *     D(0) = s0*m0  + m12
 *     D(1) = s1*m5  + m13
 *     D(2) = s2*m10 + m14
 *
 * Only matrix elements 0, 5, 10, 12, 13 and 14 are read.
 *
 * Arguments are read from the stack through OFFSET_SOURCE, OFFSET_DEST and
 * ARG_MATRIX; the vector fields are reached through the V4F_* offsets (all
 * defined outside this file).
 *
 * If the source count is zero the routine returns without touching the
 * destination.  Otherwise VEC_SIZE_3 is OR-ed into the destination flags,
 * the source count is copied to the destination and its size is set to 3.
 *
 * Loop registers: ESI = current source vertex, EDI = current dest vertex,
 * EAX = source stride in bytes, ECX = end of dest data (start + count*16),
 * XMM1 = m5|m0, XMM2 = m13|m12, XMM3 = m10, XMM4 = m14.
 * Destination vertices are written 16 bytes apart.
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM4 and the
 * flags are overwritten.
 */
171ALIGNTEXT4
172GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
173HIDDEN(_mesa_sse_transform_points3_3d_no_rot)
174GLNAME(_mesa_sse_transform_points3_3d_no_rot):
175    _CET_ENDBR
176#define FRAME_OFFSET 8
177    PUSH_L( ESI )				/* two 4-byte pushes: stack arguments are now 8 bytes */
178    PUSH_L( EDI )				/* further from ESP, hence the +8 below */
179
180    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
181    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
182
183
184    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
185    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
186
187    TEST_L( ECX, ECX)
188    JZ( LLBL(K_GTP3DNRR_finish) ) 		/* count was zero; go to finish */
189
190    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* source stride (bytes between vertices) */
191    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
192
193    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
194    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
195
196    SHL_L( CONST(4), ECX ) 			/* count *= 16 (bytes per dest vertex) */
197    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
198
199    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
200    ADD_L( EDI, ECX ) 				/* ECX = end of dest data (loop sentinel) */
201
202    XORPS( XMM0, XMM0 )                         /* zero XMM0: MOVLPS below only replaces its low half */
203
204ALIGNTEXT32
205    MOVSS    ( M(0), XMM1 )			/* - | - |  -  | m0  */
206    MOVSS    ( M(5), XMM2 )			/* - | - |  -  | m5  */
207    UNPCKLPS ( XMM2, XMM1 )			/* - | - | m5  | m0  */
208    MOVLPS   ( M(12), XMM2 )			/* - | - | m13 | m12 */
209    MOVSS    ( M(10), XMM3 )			/* - | - |  -  | m10 */
210    MOVSS    ( M(14), XMM4 )			/* - | - |  -  | m14 */
211
212ALIGNTEXT32
213LLBL(K_GTP3DNRR_top):
214
215    MOVLPS   ( S(0), XMM0 )			/* - | - |  s1   | s0 */
216    MULPS    ( XMM1, XMM0 )			/* - | - | s1*m5 | s0*m0 */
217    ADDPS    ( XMM2, XMM0 )			/* - | - | +m13  | +m12 */
218    MOVLPS   ( XMM0, D(0) )			/* -> D(1) | -> D(0) */
219
220    MOVSS    ( S(2), XMM0 )			/* sz */
221    MULSS    ( XMM3, XMM0 )			/* sz*m10 */
222    ADDSS    ( XMM4, XMM0 )			/* +m14 */
223    MOVSS    ( XMM0, D(2) )			/* -> D(2) */
224
225LLBL(K_GTP3DNRR_skip):
226    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
227    ADD_L    ( EAX, ESI )			/* next source vertex (source stride) */
228    CMP_L    ( ECX, EDI )			/* reached end of dest data ? */
229    JNE      ( LLBL(K_GTP3DNRR_top) )		/* no -> transform next vertex */
230
231LLBL(K_GTP3DNRR_finish):
232    POP_L    ( EDI )
233    POP_L    ( ESI )
234    RET
235#undef FRAME_OFFSET
236
237
238
/*
 * _mesa_sse_transform_points3_perspective
 *
 * Perspective-style transform.  For every source vertex (ox, oy, oz):
 *
 *     D(0) = ox*m0  + oz*m8
 *     D(1) = oy*m5  + oz*m9
 *     D(2) = oz*m10 + m14
 *     D(3) = -oz
 *
 * Only matrix elements 0, 5, 8, 9, 10 and 14 are read; D(3) is produced as
 * 0 - oz without consulting the matrix.
 *
 * Arguments are read from the stack through OFFSET_SOURCE, OFFSET_DEST and
 * ARG_MATRIX; the vector fields are reached through the V4F_* offsets (all
 * defined outside this file).
 *
 * If the source count is zero the routine returns without touching the
 * destination.  Otherwise VEC_SIZE_4 is OR-ed into the destination flags,
 * the source count is copied to the destination and its size is set to 4.
 *
 * Loop registers: ESI = current source vertex, EDI = current dest vertex,
 * EAX = source stride in bytes, ECX = end of dest data (start + count*16),
 * XMM1 = m5|m0, XMM2 = m9|m8, XMM3 = m10, XMM4 = m14, XMM6 = 0.
 * Destination vertices are written 16 bytes apart.
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM6 and the
 * flags are overwritten.
 */
239ALIGNTEXT4
240GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
241HIDDEN(_mesa_sse_transform_points3_perspective)
242GLNAME(_mesa_sse_transform_points3_perspective):
243    _CET_ENDBR
244#define FRAME_OFFSET 8
245    PUSH_L   ( ESI )			/* two 4-byte pushes: stack arguments are now 8 bytes */
246    PUSH_L   ( EDI )			/* further from ESP, hence the +8 below */
247
248    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
249    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
250
251    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
252    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
253
254    TEST_L( ECX, ECX)
255    JZ( LLBL(K_GTP3PR_finish) )			/* count was zero; go to finish */
256
257    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* source stride (bytes between vertices) */
258    OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
259
260    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
261    MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
262
263    SHL_L( CONST(4), ECX ) 			/* count *= 16 (bytes per dest vertex) */
264    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
265
266    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
267    ADD_L( EDI, ECX ) 				/* ECX = end of dest data (loop sentinel) */
268
269ALIGNTEXT32
270    MOVSS    ( M(0), XMM1 )			/* -  | -  |  -  | m0  */
271    MOVSS    ( M(5), XMM2 )			/* -  | -  |  -  | m5  */
272    UNPCKLPS ( XMM2, XMM1 )			/* -  | -  | m5  | m0  */
273    MOVLPS   ( M(8), XMM2 )			/* -  | -  | m9  | m8  */
274    MOVSS    ( M(10), XMM3 )			/* m10 */
275    MOVSS    ( M(14), XMM4 )			/* m14 */
276    XORPS    ( XMM6, XMM6 )			/* 0 (kept for the negation of oz below) */
277
278ALIGNTEXT32
279LLBL(K_GTP3PR_top):
280    MOVLPS   ( S(0), XMM0 )			/* oy | ox */
281    MULPS    ( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
282    MOVSS    ( S(2), XMM5 )			/* oz */
283    SHUFPS   ( CONST(0x0), XMM5, XMM5 )		/* oz | oz */
284    MULPS    ( XMM2, XMM5 )			/* oz*m9 | oz*m8 */
285    ADDPS    ( XMM5, XMM0 )			/* +oy*m5 | +ox*m0 */
286    MOVLPS   ( XMM0, D(0) )			/* ->D(1) | ->D(0) */
287
288    MOVSS    ( S(2), XMM0 )			/* oz */
289    MULSS    ( XMM3, XMM0 )			/* oz*m10 */
290    ADDSS    ( XMM4, XMM0 )			/* +m14 */
291    MOVSS    ( XMM0, D(2) )			/* ->D(2) */
292
293    MOVSS    ( S(2), XMM0 )			/* oz */
294    MOVSS    ( XMM6, XMM5 )			/* lane 0 of XMM5 = 0 */
295    SUBPS    ( XMM0, XMM5 )			/* lane 0 = 0 - oz = -oz */
296    MOVSS    ( XMM5, D(3) )			/* ->D(3) */
	/* NOTE(review): SUBPS subtracts all four lanes although only lane 0 is
	 * stored; a scalar subtract looks sufficient here - confirm before
	 * changing. */
297
298LLBL(K_GTP3PR_skip):
299    ADD_L( CONST(16), EDI )			/* next dest vertex */
300    ADD_L( EAX, ESI )				/* next source vertex (source stride) */
301    CMP_L( ECX, EDI )				/* reached end of dest data ? */
302    JNE( LLBL(K_GTP3PR_top) )			/* no -> transform next vertex */
303
304LLBL(K_GTP3PR_finish):
305    POP_L    ( EDI )
306    POP_L    ( ESI )
307    RET
308#undef FRAME_OFFSET
309
310
311
/*
 * _mesa_sse_transform_points3_2d
 *
 * 2D transform of x and y; z is copied through.  For every source vertex
 * (ox, oy, oz):
 *
 *     D(0) = ox*m0 + oy*m4 + m12
 *     D(1) = ox*m1 + oy*m5 + m13
 *     D(2) = oz
 *
 * Only matrix elements 0, 1, 4, 5, 12 and 13 are read.
 *
 * Arguments are read from the stack through OFFSET_SOURCE, OFFSET_DEST and
 * ARG_MATRIX; the vector fields are reached through the V4F_* offsets (all
 * defined outside this file).
 *
 * If the source count is zero the routine returns without touching the
 * destination.  Otherwise VEC_SIZE_3 is OR-ed into the destination flags,
 * the source count is copied to the destination and its size is set to 3.
 *
 * Loop registers: ESI = current source vertex, EDI = current dest vertex,
 * EAX = source stride in bytes, ECX = end of dest data (start + count*16),
 * XMM0 = m1|m0, XMM1 = m5|m4, XMM2 = m13|m12 (low halves only; the upper
 * halves of these registers are never stored).
 * Destination vertices are written 16 bytes apart.
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM4 and the
 * flags are overwritten.
 */
312ALIGNTEXT4
313GLOBL GLNAME(_mesa_sse_transform_points3_2d)
314HIDDEN(_mesa_sse_transform_points3_2d)
315GLNAME(_mesa_sse_transform_points3_2d):
316    _CET_ENDBR
317#define FRAME_OFFSET 8
318    PUSH_L( ESI )				/* two 4-byte pushes: stack arguments are now 8 bytes */
319    PUSH_L( EDI )				/* further from ESP, hence the +8 below */
320
321    MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
322    MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
323
324    MOV_L( ARG_MATRIX, EDX ) 			/* ptr to matrix */
325    MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
326
327    TEST_L( ECX, ECX)
328    JZ( LLBL(K_GTP3P2DR_finish) ) 		/* count was zero; go to finish */
329
330    MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* source stride (bytes between vertices) */
331    OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
332
333    MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
334    MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
335
336    SHL_L( CONST(4), ECX ) 			/* count *= 16 (bytes per dest vertex) */
337    MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
338
339    MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
340    ADD_L( EDI, ECX ) 				/* ECX = end of dest data (loop sentinel) */
341
342ALIGNTEXT32
343    MOVLPS( M(0), XMM0 )			/* m1  | m0 */
344    MOVLPS( M(4), XMM1 )			/* m5  | m4 */
345    MOVLPS( M(12), XMM2 )			/* m13 | m12 */
346
347ALIGNTEXT32
348LLBL(K_GTP3P2DR_top):
349    MOVSS    ( S(0), XMM3 )			/* ox */
350    SHUFPS   ( CONST(0x0), XMM3, XMM3 )		/* ox | ox */
351    MULPS    ( XMM0, XMM3 )			/* ox*m1 | ox*m0 */
352    MOVSS    ( S(1), XMM4 )			/* oy */
353    SHUFPS   ( CONST(0x0), XMM4, XMM4 )		/* oy | oy */
354    MULPS    ( XMM1, XMM4 )			/* oy*m5 | oy*m4 */
355
356    ADDPS    ( XMM4, XMM3 )			/* ox*m1+oy*m5 | ox*m0+oy*m4 */
357    ADDPS    ( XMM2, XMM3 )			/* +m13 | +m12 */
358    MOVLPS   ( XMM3, D(0) )			/* ->D(1) | ->D(0) */
359
360    MOVSS    ( S(2), XMM3 )			/* oz, copied unchanged */
361    MOVSS    ( XMM3, D(2) )			/* ->D(2) */
362
363LLBL(K_GTP3P2DR_skip):
364    ADD_L    ( CONST(16), EDI )			/* next dest vertex */
365    ADD_L    ( EAX, ESI )			/* next source vertex (source stride) */
366    CMP_L    ( ECX, EDI )			/* reached end of dest data ? */
367    JNE      ( LLBL(K_GTP3P2DR_top) )		/* no -> transform next vertex */
368
369LLBL(K_GTP3P2DR_finish):
370    POP_L    ( EDI )
371    POP_L    ( ESI )
372    RET
373#undef FRAME_OFFSET
374
375
376
/*
 * _mesa_sse_transform_points3_2d_no_rot
 *
 * 2D scale plus translation of x and y; z is copied through.  For every
 * source vertex (ox, oy, oz):
 *
 *     D(0) = ox*m0 + m12
 *     D(1) = oy*m5 + m13
 *     D(2) = oz
 *
 * Only matrix elements 0, 5, 12 and 13 are read.
 *
 * Arguments are read from the stack through OFFSET_SOURCE, OFFSET_DEST and
 * ARG_MATRIX; the vector fields are reached through the V4F_* offsets (all
 * defined outside this file).
 *
 * If the source count is zero the routine returns without touching the
 * destination.  Otherwise VEC_SIZE_3 is OR-ed into the destination flags,
 * the source count is copied to the destination and its size is set to 3.
 *
 * Loop registers: ESI = current source vertex, EDI = current dest vertex,
 * EAX = source stride in bytes, ECX = end of dest data (start + count*16),
 * XMM1 = m5|m0, XMM2 = m13|m12.
 * Destination vertices are written 16 bytes apart.
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM2 and the
 * flags are overwritten.
 */
377ALIGNTEXT4
378GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
379HIDDEN(_mesa_sse_transform_points3_2d_no_rot)
380GLNAME(_mesa_sse_transform_points3_2d_no_rot):
381	_CET_ENDBR
382#define FRAME_OFFSET 8
383	PUSH_L( ESI )				/* two 4-byte pushes: stack arguments are now 8 bytes */
384	PUSH_L( EDI )				/* further from ESP, hence the +8 below */
385
386	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
387	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
388
389	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
390	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
391
392	TEST_L( ECX, ECX)
393	JZ( LLBL(K_GTP3P2DNRR_finish) ) 	/* count was zero; go to finish */
394
395	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* source stride (bytes between vertices) */
396	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
397
398	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
399	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
400
401	SHL_L( CONST(4), ECX ) 			/* count *= 16 (bytes per dest vertex) */
402	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
403
404	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
405	ADD_L( EDI, ECX ) 			/* ECX = end of dest data (loop sentinel) */
406
407ALIGNTEXT32
408	MOVSS    ( M(0), XMM1 )			/* m0 */
409	MOVSS    ( M(5), XMM2 )			/* m5 */
410	UNPCKLPS ( XMM2, XMM1 )			/* m5 | m0 */
411	MOVLPS   ( M(12), XMM2 )		/* m13 | m12 */
412
413ALIGNTEXT32
414LLBL(K_GTP3P2DNRR_top):
415	MOVLPS( S(0), XMM0 )			/* oy | ox */
416	MULPS( XMM1, XMM0 )			/* oy*m5 | ox*m0 */
417	ADDPS( XMM2, XMM0 )			/* +m13 | +m12 */
418	MOVLPS( XMM0, D(0) )			/* ->D(1) | ->D(0) */
419
420	MOVSS( S(2), XMM0 )			/* oz, copied unchanged */
421	MOVSS( XMM0, D(2) )			/* ->D(2) */
422
423LLBL(K_GTP3P2DNRR_skip):
424	ADD_L( CONST(16), EDI )			/* next dest vertex */
425	ADD_L( EAX, ESI )			/* next source vertex (source stride) */
426	CMP_L( ECX, EDI )			/* reached end of dest data ? */
427	JNE( LLBL(K_GTP3P2DNRR_top) )		/* no -> transform next vertex */
428
429LLBL(K_GTP3P2DNRR_finish):
430	POP_L( EDI )
431	POP_L( ESI )
432	RET
433#undef FRAME_OFFSET
434
435
436
437
/*
 * _mesa_sse_transform_points3_3d
 *
 * 3D transform producing three components.  For every source vertex
 * (ox, oy, oz):
 *
 *     D(j) = ox*m[j] + oy*m[4+j] + oz*m[8+j] + m[12+j]     (j = 0..2)
 *
 * The fourth lane is computed alongside the others but never stored.
 *
 * Arguments are read from the stack through OFFSET_SOURCE, OFFSET_DEST and
 * ARG_MATRIX; the vector fields are reached through the V4F_* offsets (all
 * defined outside this file).
 *
 * If the source count is zero the routine returns without touching the
 * destination.  Otherwise VEC_SIZE_3 is OR-ed into the destination flags,
 * the source count is copied to the destination and its size is set to 3.
 *
 * Loop registers: ESI = current source vertex, EDI = current dest vertex,
 * EAX = source stride in bytes, ECX = end of dest data (start + count*16),
 * XMM0-XMM3 = m[0..3], m[4..7], m[8..11], m[12..15].
 * Destination vertices are written 16 bytes apart.  The matrix is loaded
 * with MOVAPS, so it must be 16-byte aligned.
 *
 * ESI and EDI are saved and restored; EAX, ECX, EDX, XMM0-XMM6 and the
 * flags are overwritten.
 */
438ALIGNTEXT4
439GLOBL GLNAME(_mesa_sse_transform_points3_3d)
440HIDDEN(_mesa_sse_transform_points3_3d)
441GLNAME(_mesa_sse_transform_points3_3d):
442	_CET_ENDBR
443#define FRAME_OFFSET 8
444	PUSH_L( ESI )				/* two 4-byte pushes: stack arguments are now 8 bytes */
445	PUSH_L( EDI )				/* further from ESP, hence the +8 below */
446
447	MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) 	/* ptr to source GLvector4f */
448	MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) 	/* ptr to dest GLvector4f */
449
450
451	MOV_L( ARG_MATRIX, EDX ) 		/* ptr to matrix */
452	MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) 	/* source count */
453
454	TEST_L( ECX, ECX)
455	JZ( LLBL(K_GTP3P3DR_finish) ) 	/* count was zero; go to finish */
456
457	MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) 	/* source stride (bytes between vertices) */
458	OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) 	/* set dest flags */
459
460	MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) 	/* set dest count */
461	MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) 	/* set dest size */
462
463	SHL_L( CONST(4), ECX ) 			/* count *= 16 (bytes per dest vertex) */
464	MOV_L( REGOFF(V4F_START, ESI), ESI ) 	/* ptr to first source vertex */
465
466	MOV_L( REGOFF(V4F_START, EDI), EDI ) 	/* ptr to first dest vertex */
467	ADD_L( EDI, ECX ) 			/* ECX = end of dest data (loop sentinel) */
468
469
470ALIGNTEXT32
471	MOVAPS( M(0), XMM0 )			/* m3  | m2  | m1  | m0  (m3 lane unused) */
472	MOVAPS( M(4), XMM1 )			/* m7  | m6  | m5  | m4  (m7 lane unused) */
473	MOVAPS( M(8), XMM2 )			/* m11 | m10 | m9  | m8  (m11 lane unused) */
474	MOVAPS( M(12), XMM3 )			/* m15 | m14 | m13 | m12 (m15 lane unused) */
475
476ALIGNTEXT32
477LLBL(K_GTP3P3DR_top):
478	MOVSS( S(0), XMM4 )			/* ox */
479	SHUFPS( CONST(0x0), XMM4, XMM4 )	/* ox | ox | ox */
480	MULPS( XMM0, XMM4 )			/* ox*m2 | ox*m1 | ox*m0 */
481
482	MOVSS( S(1), XMM5 )			/* oy */
483	SHUFPS( CONST(0x0), XMM5, XMM5 )	/* oy | oy | oy */
484	MULPS( XMM1, XMM5 )			/* oy*m6 | oy*m5 | oy*m4 */
485
486	MOVSS( S(2), XMM6 )			/* oz */
487	SHUFPS( CONST(0x0), XMM6, XMM6 )	/* oz | oz | oz */
488	MULPS( XMM2, XMM6 )			/* oz*m10 | oz*m9 | oz*m8 */
489
490	ADDPS( XMM5, XMM4 )			/* ox terms + oy terms */
491	ADDPS( XMM6, XMM4 )			/* + oz terms */
492	ADDPS( XMM3, XMM4 )			/* + m14 | m13 | m12 */
493
494	MOVLPS( XMM4, D(0) )			/* => D(1) | => D(0) */
495	UNPCKHPS( XMM4, XMM4 )			/* bring result lane 2 down to lane 0 */
496	MOVSS( XMM4, D(2) )			/* => D(2) */
497
498LLBL(K_GTP3P3DR_skip):
499	ADD_L( CONST(16), EDI )			/* next dest vertex */
500	ADD_L( EAX, ESI )			/* next source vertex (source stride) */
501	CMP_L( ECX, EDI )			/* reached end of dest data ? */
502	JNE( LLBL(K_GTP3P3DR_top) )		/* no -> transform next vertex */
503
504LLBL(K_GTP3P3DR_finish):
505	POP_L( EDI )
506	POP_L( ESI )
507	RET
508#undef FRAME_OFFSET
509#endif
510
/*
 * On Linux ELF targets, emit an empty .note.GNU-stack section.  This is the
 * GNU toolchain convention by which an object file declares that it does
 * not need an executable stack.
 */
511#if defined (__ELF__) && defined (__linux__)
512	.section .note.GNU-stack,"",%progbits
513#endif
514