• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/*
 * _CET_ENDBR is the marker placed at the entry of every function in this
 * file.  It is taken from <cet.h> when that header is available; otherwise
 * it is defined empty so the entry points assemble without it.
 */
#ifndef HAVE_CET_H
#define _CET_ENDBR
#else
#include <cet.h>
#endif

/* Everything down to the matching #endif is assembled only when
 * USE_X86_64_ASM is defined. */
#if defined(USE_X86_64_ASM)

/* NOTE(review): presumably tells m_vector_asm.h to lay out the V4F_* offsets
 * for 8-byte pointers -- confirm against that header. */
#define MATH_ASM_PTR_SIZE 8
#include "math/m_vector_asm.h"

.text
36
/*
 * _mesa_x86_64_cpuid
 *
 * Runs CPUID on a caller-supplied block of four consecutive 32-bit words.
 *
 *	rdi = pointer to the block
 *	        0(%rdi)  eax   read before CPUID, overwritten with the result
 *	        4(%rdi)  ebx   written only
 *	        8(%rdi)  ecx   read before CPUID, overwritten with the result
 *	       12(%rdi)  edx   written only
 *
 * CPUID overwrites ebx, so rbx is pushed on entry and popped before return.
 */
.p2align 4
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	_CET_ENDBR
	pushq	%rbx			/* keep the caller's rbx */

	movl	8(%rdi), %ecx		/* ecx input word */
	movl	0(%rdi), %eax		/* eax input word */

	cpuid

	movl	%eax, 0(%rdi)		/* store all four result registers */
	movl	%ebx, 4(%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)

	popq	%rbx			/* restore the caller's rbx */
	ret
54
/*
 * _mesa_x86_64_transform_points4_general
 *
 * Multiplies every 4-float source vertex by a full 16-float matrix (SSE).
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, 16 floats m0..m15; loaded with movaps, so it must be
 *	      16-byte aligned
 *	rdx = source vector descriptor
 *
 * Per vertex (x, y, z, w), for k = 0..3:
 *	dest[k] = x*m[k] + y*m[4+k] + z*m[8+k] + w*m[12+k]
 *
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4 flag
 * bits.  Source vertices are read unaligned and advanced by the source
 * stride; dest vertices are written with movaps, 16 bytes apart.
 * The V4F_* offsets and VEC_SIZE_4 come from math/m_vector_asm.h.
 *
 * Register use inside the loop:
 *	ecx = vertices left      rax = source stride in bytes
 *	rdx = current source     rdi = current dest
 *	xmm4..xmm7 = the four 4-float matrix chunks
 */
.p2align 4
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of the stride field */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 */
	.byte	0x66, 0x66, 0x66, 0x90		/* hand-placed 4-byte NOP (padding) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	prefetchnta 64(%rsi)			/* does not alter the flags from testl */
	jz	.Lp4_general_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4			/* xmm4 = { m0,  m1,  m2,  m3  } */
	movaps	16(%rsi), %xmm5			/* xmm5 = { m4,  m5,  m6,  m7  } */
	.byte	0x66, 0x66, 0x90		/* hand-placed 3-byte NOP (padding) */
	movaps	32(%rsi), %xmm6			/* xmm6 = { m8,  m9,  m10, m11 } */
	movaps	48(%rsi), %xmm7			/* xmm7 = { m12, m13, m14, m15 } */

.Lp4_general_loop:

	movups	(%rdx), %xmm8			/* xmm8 = { x, y, z, w } */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* xmm0 = { x, x, x, x } */
	addq	%rax, %rdx			/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* xmm1 = { y, y, y, y } */
	mulps	%xmm4, %xmm0			/* xmm0 = x * { m0..m3 } */
	pshufd	$0xAA, %xmm8, %xmm2		/* xmm2 = { z, z, z, z } */
	mulps	%xmm5, %xmm1			/* xmm1 = y * { m4..m7 } */
	pshufd	$0xFF, %xmm8, %xmm3		/* xmm3 = { w, w, w, w } */
	mulps	%xmm6, %xmm2			/* xmm2 = z * { m8..m11 } */
	addps	%xmm1, %xmm0			/* xmm0 = x-term + y-term */
	mulps	%xmm7, %xmm3			/* xmm3 = w * { m12..m15 } */
	addps	%xmm2, %xmm0			/* ... + z-term */
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0			/* ... + w-term = transformed vertex */

	movaps	%xmm0, (%rdi)			/* store the four result floats */
	addq	$16, %rdi			/* dest vertices are 16 bytes apart */

	decl	%ecx
	jnz	.Lp4_general_loop

.Lp4_general_done:
	.byte	0xf3				/* 0xf3 prefix + ret = the "rep ret" encoding */
	ret
116
.section .rodata

/*
 * p4_constants: two 16-byte constants, read with movaps by
 * _mesa_x86_64_transform_points4_3d.
 *
 *	+0   AND mask: dwords 0..2 all ones, dword 3 zero
 *	+16  dwords 0..2 zero, dword 3 = 1.0f
 */
.p2align 4
p4_constants:
	.long	0xffffffff, 0xffffffff, 0xffffffff, 0x00000000

	.long	0x00000000, 0x00000000, 0x00000000
	.float	1.0
130
.text

/*
 * _mesa_x86_64_transform_points4_3d
 *
 * Same arithmetic as _mesa_x86_64_transform_points4_general, except that the
 * matrix is patched after loading: the floats at indices 3, 7 and 11 are
 * forced to 0.0 and the float at index 15 is forced to 1.0 (using the two
 * constants at p4_constants).  Whether that is the last "row" or "column"
 * depends on the matrix storage convention, which is not visible here.
 * The extra setup makes this routine slower than the general one.
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix, 16 floats; loaded with movaps, so 16-byte aligned
 *	rdx = source vector descriptor
 *
 * Register use inside the loop:
 *	ecx = vertices left      rax = source stride in bytes
 *	rdx = current source     rdi = current dest
 *	xmm4..xmm7 = patched matrix chunks
 */
.p2align 4
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
_mesa_x86_64_transform_points4_3d:
	_CET_ENDBR
	leaq	p4_constants(%rip), %rax	/* rax is reloaded with the stride below */

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9			/* xmm9  = mask keeping lanes 0..2 */
	movaps	16(%rax), %xmm10		/* xmm10 = { 0, 0, 0, 1.0 } */

	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of the stride field */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	jz	.Lp4_3d_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4			/* xmm4 = { m0,  m1,  m2,  m3  } */
	movaps	16(%rsi), %xmm5			/* xmm5 = { m4,  m5,  m6,  m7  } */
	andps	%xmm9, %xmm4			/* xmm4 = { m0,  m1,  m2,  0   } */
	movaps	32(%rsi), %xmm6			/* xmm6 = { m8,  m9,  m10, m11 } */
	andps	%xmm9, %xmm5			/* xmm5 = { m4,  m5,  m6,  0   } */
	movaps	48(%rsi), %xmm7			/* xmm7 = { m12, m13, m14, m15 } */
	andps	%xmm9, %xmm6			/* xmm6 = { m8,  m9,  m10, 0   } */
	andps	%xmm9, %xmm7			/* xmm7 = { m12, m13, m14, 0   } */
	.byte	0x66, 0x66, 0x90		/* hand-placed 3-byte NOP (padding) */
	orps	%xmm10, %xmm7			/* xmm7 = { m12, m13, m14, 1.0 } */

.Lp4_3d_loop:

	movups	(%rdx), %xmm8			/* xmm8 = { x, y, z, w } */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0		/* xmm0 = { x, x, x, x } */
	addq	%rax, %rdx			/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1		/* xmm1 = { y, y, y, y } */
	mulps	%xmm4, %xmm0			/* xmm0 = x * xmm4 */
	pshufd	$0xAA, %xmm8, %xmm2		/* xmm2 = { z, z, z, z } */
	mulps	%xmm5, %xmm1			/* xmm1 = y * xmm5 */
	pshufd	$0xFF, %xmm8, %xmm3		/* xmm3 = { w, w, w, w } */
	mulps	%xmm6, %xmm2			/* xmm2 = z * xmm6 */
	addps	%xmm1, %xmm0			/* xmm0 = x-term + y-term */
	mulps	%xmm7, %xmm3			/* xmm3 = w * xmm7 */
	addps	%xmm2, %xmm0			/* ... + z-term */
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0			/* ... + w-term = transformed vertex */

	movaps	%xmm0, (%rdi)			/* store the four result floats */
	addq	$16, %rdi			/* dest vertices are 16 bytes apart */

	decl	%ecx
	jnz	.Lp4_3d_loop

.Lp4_3d_done:
	.byte	0xf3				/* 0xf3 prefix + ret = the "rep ret" encoding */
	ret
202
203
/*
 * _mesa_x86_64_transform_points4_identity
 *
 * Identity transform: copies every 4-float source vertex to the dest vector
 * unchanged.
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix (not read; rsi is reused as the source pointer)
 *	rdx = source vector descriptor
 *
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4 flag
 * bits.  Dest vertices are written 16 bytes apart.
 *
 * The source stride is honoured, as in the other transform routines of this
 * file: when it is exactly 16 bytes the vertices are contiguous and are
 * moved with one "rep movsq"; for any other stride each vertex is copied
 * individually and the source pointer advances by the stride.  (Earlier this
 * routine loaded the stride but always did the contiguous copy, which reads
 * the wrong bytes for sources that are not tightly packed.)
 *
 * Clobbers: rax, rcx, rsi, rdi, xmm0, flags.
 */
.p2align 4
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of the stride field */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	jz	.Lp4_identity_done

	movq	V4F_START(%rdx), %rsi		/* rsi = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	cmpl	$16, %eax			/* tightly packed source? */
	jne	.Lp4_identity_strided

	/* Contiguous source: one block copy of count * 16 bytes. */
	prefetcht1 64(%rsi)
	prefetcht1 64(%rdi)

	addl	%ecx, %ecx			/* two quadwords per vertex; also zero-extends into rcx */

	rep movsq				/* relies on DF = 0, as the ABI guarantees on entry */
	ret

.Lp4_identity_strided:
	/* Strided source: move one 16-byte vertex per iteration. */
	movups	(%rsi), %xmm0			/* unaligned load: stride gives no alignment guarantee */
	addq	%rax, %rsi			/* step to the next source vertex */
	movups	%xmm0, (%rdi)
	addq	$16, %rdi			/* dest vertices are 16 bytes apart */

	decl	%ecx
	jnz	.Lp4_identity_strided

.Lp4_identity_done:
	.byte	0xf3				/* 0xf3 prefix + ret = the "rep ret" encoding */
	ret
231
232
/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * 3DNow! transform that uses only the matrix floats at byte offsets
 * 0 (m00), 20 (m11), 40 (m22), 48/52 (m30, m31) and 56 (m32).
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix
 *	rdx = source vector descriptor
 *
 * Per source vertex (x0, x1, x2, x3):
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 *
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4 flag
 * bits.  MMX state is released with femms on every return path.
 * Lane notation in the comments below: { low dword, high dword }.
 *
 * Register use inside the loop:
 *	ecx = vertices left      rax = source stride in bytes
 *	rdx = current source     rdi = current dest
 *	mm0 = { m00, m11 }   mm1 = { m30, m31 }   mm2 = { m22, m32 }
 */
.p2align 4
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of the stride field */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 */
	.byte	0x66, 0x66, 0x90		/* hand-placed 3-byte NOP (padding) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x66, 0x66, 0x90		/* 3-byte NOP; leaves the flags untouched */
	jz	.Lp4_3d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	prefetcht1 (%rdx)

	movd	(%rsi), %mm0			/* mm0 = { m00, 0   } */
	.byte	0x66, 0x66, 0x90		/* hand-placed 3-byte NOP (padding) */
	punpckldq 20(%rsi), %mm0		/* mm0 = { m00, m11 } */

	movd	40(%rsi), %mm2			/* mm2 = { m22, 0   } */
	movq	48(%rsi), %mm1			/* mm1 = { m30, m31 } */

	punpckldq 56(%rsi), %mm2		/* mm2 = { m22, m32 } */

.Lp4_3d_no_rot_loop:

	prefetcht1 32(%rdi)

	movq	(%rdx), %mm4			/* mm4 = { x0, x1 } */
	movq	8(%rdx), %mm5			/* mm5 = { x2, x3 } */
	movd	12(%rdx), %mm7			/* mm7 = { x3, 0  } */

	movq	%mm5, %mm6			/* mm6 = { x2, x3 } */
	pfmul	%mm0, %mm4			/* mm4 = { x0*m00, x1*m11 } */

	punpckhdq %mm6, %mm6			/* mm6 = { x3, x3 } */
	pfmul	%mm2, %mm5			/* mm5 = { x2*m22, x3*m32 } */

	pfmul	%mm1, %mm6			/* mm6 = { x3*m30, x3*m31 } */
	pfacc	%mm7, %mm5			/* mm5 = { x2*m22 + x3*m32, x3 } */

	pfadd	%mm6, %mm4			/* mm4 = { x0*m00 + x3*m30, x1*m11 + x3*m31 } */

	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm4, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi			/* dest vertices are 16 bytes apart */

	decl	%ecx
	prefetcht1 32(%rdx)			/* prefetch does not alter the flags from decl */
	jnz	.Lp4_3d_no_rot_loop

.Lp4_3d_no_rot_done:
	femms					/* leave MMX/3DNow! state */
	ret
296
297
/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * 3DNow! transform that uses only the matrix floats at byte offsets
 * 0 (m00), 20 (m11), 32/36 (m20, m21), 40 (m22) and 56 (m32).
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix
 *	rdx = source vector descriptor
 *
 * Per source vertex (x0, x1, x2, x3):
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 *
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4 flag
 * bits.  MMX state is released with femms on every return path.
 * Lane notation in the comments below: { low dword, high dword }.
 *
 * Register use inside the loop:
 *	ecx = vertices left      rax = source stride in bytes
 *	rdx = current source     rdi = current dest
 *	mm0 = { m00, m11 }   mm1 = { m22, m32 }   mm2 = { m20, m21 }   mm7 = 0
 */
.p2align 4
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of the stride field */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x66, 0x66, 0x90		/* 3-byte NOP; leaves the flags untouched */
	jz	.Lp4_perspective_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	movd	(%rsi), %mm0			/* mm0 = { m00, 0   } */
	pxor	%mm7, %mm7			/* mm7 = { 0, 0 } */
	punpckldq 20(%rsi), %mm0		/* mm0 = { m00, m11 } */

	movq	32(%rsi), %mm2			/* mm2 = { m20, m21 } */
	prefetcht1 (%rdx)

	movd	40(%rsi), %mm1			/* mm1 = { m22, 0   } */

	.byte	0x66, 0x66, 0x90		/* hand-placed 3-byte NOP (padding) */
	punpckldq 56(%rsi), %mm1		/* mm1 = { m22, m32 } */


.Lp4_perspective_loop:

	prefetcht1 32(%rdi)			/* 32 bytes (two dest vertices) ahead */

	movq	(%rdx), %mm4			/* mm4 = { x0, x1 } */
	movq	8(%rdx), %mm5			/* mm5 = { x2, x3 } */
	movd	8(%rdx), %mm3			/* mm3 = { x2, 0  } */

	movq	%mm5, %mm6			/* mm6 = { x2, x3 } */
	pfmul	%mm0, %mm4			/* mm4 = { x0*m00, x1*m11 } */

	punpckldq %mm5, %mm5			/* mm5 = { x2, x2 } */

	pfmul	%mm2, %mm5			/* mm5 = { x2*m20, x2*m21 } */
	pfsubr	%mm7, %mm3			/* mm3 = mm7 - mm3 = { -x2, 0 } */

	pfmul	%mm1, %mm6			/* mm6 = { x2*m22, x3*m32 } */
	pfadd	%mm4, %mm5			/* mm5 = { x0*m00 + x2*m20, x1*m11 + x2*m21 } */

	pfacc	%mm3, %mm6			/* mm6 = { x2*m22 + x3*m32, -x2 } */

	movq	%mm5, (%rdi)			/* store r0, r1 */
	addq	%rax, %rdx			/* step to the next source vertex */
	movq	%mm6, 8(%rdi)			/* store r2, r3 */

	addq	$16, %rdi			/* dest vertices are 16 bytes apart */

	decl	%ecx
	prefetcht1 32(%rdx)			/* prefetch does not alter the flags from decl */
	jnz	.Lp4_perspective_loop

.Lp4_perspective_done:
	femms					/* leave MMX/3DNow! state */
	ret
364
/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * 3DNow! transform that uses only the matrix floats at byte offsets
 * 0 (m00), 20 (m11) and 48/52 (m30, m31).
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix
 *	rdx = source vector descriptor
 *
 * Per source vertex (x0, x1, x2, x3):
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 *
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4 flag
 * bits.  MMX state is released with femms on every return path.
 * Lane notation in the comments below: { low dword, high dword }.
 *
 * Register use inside the loop:
 *	ecx = vertices left      rax = source stride in bytes
 *	rdx = current source     rdi = current dest
 *	mm0 = { m00, m11 }   mm1 = { m30, m31 }
 */
.p2align 4
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of the stride field */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x90				/* 1-byte NOP; leaves the flags untouched */
	jz	.Lp4_2d_no_rot_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	movd	(%rsi), %mm0			/* mm0 = { m00, 0   } */
	prefetcht1 (%rdx)
	punpckldq 20(%rsi), %mm0		/* mm0 = { m00, m11 } */

	movq	48(%rsi), %mm1			/* mm1 = { m30, m31 } */

.Lp4_2d_no_rot_loop:

	prefetcht1 32(%rdi)			/* 32 bytes (two dest vertices) ahead */

	movq	(%rdx), %mm4			/* mm4 = { x0, x1 } */
	movq	8(%rdx), %mm5			/* mm5 = { x2, x3 } */

	pfmul	%mm0, %mm4			/* mm4 = { x0*m00, x1*m11 } */
	movq	%mm5, %mm6			/* mm6 = { x2, x3 } */

	punpckhdq %mm6, %mm6			/* mm6 = { x3, x3 } */

	addq	%rax, %rdx			/* step to the next source vertex */
	pfmul	%mm1, %mm6			/* mm6 = { x3*m30, x3*m31 } */

	prefetcht1 32(%rdx)
	pfadd	%mm4, %mm6			/* mm6 = { x0*m00 + x3*m30, x1*m11 + x3*m31 } */

	movq	%mm6, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 (x2, x3 unchanged) */

	addq	$16, %rdi			/* dest vertices are 16 bytes apart */

	decl	%ecx
	jnz	.Lp4_2d_no_rot_loop

.Lp4_2d_no_rot_done:
	femms					/* leave MMX/3DNow! state */
	ret
419
420
/*
 * _mesa_3dnow_transform_points4_2d
 *
 * 3DNow! transform that uses only the matrix floats at byte offsets
 * 0 (m00), 4 (m01), 16 (m10), 20 (m11) and 48/52 (m30, m31).
 *
 *	rdi = dest vector descriptor
 *	rsi = matrix
 *	rdx = source vector descriptor
 *
 * Per source vertex (x0, x1, x2, x3):
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2
 *	r3 = x3
 *
 * The dest descriptor gets the source count, size 4 and the VEC_SIZE_4 flag
 * bits.  MMX state is released with femms on every return path.
 * Lane notation in the comments below: { low dword, high dword }.
 *
 * Register use inside the loop:
 *	ecx = vertices left      rax = source stride in bytes
 *	rdx = current source     rdi = current dest
 *	mm0 = { m00, m10 }   mm1 = { m01, m11 }   mm2 = { m30, m31 }
 */
.p2align 4
.globl _mesa_3dnow_transform_points4_2d
.hidden _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx		/* ecx = vertex count */
	movzbl	V4F_STRIDE(%rdx), %eax		/* eax = low byte of the stride field */

	movl	%ecx, V4F_COUNT(%rdi)		/* dest->count = count */
	movl	$4, V4F_SIZE(%rdi)		/* dest->size = 4 */
	.byte	0x66, 0x66, 0x90		/* hand-placed 3-byte NOP (padding) */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* dest->flags |= VEC_SIZE_4 */

	testl	%ecx, %ecx			/* nothing to do for an empty vector */
	.byte	0x66, 0x66, 0x90		/* 3-byte NOP; leaves the flags untouched */
	jz	.Lp4_2d_done

	movq	V4F_START(%rdx), %rdx		/* rdx = first source vertex */
	movq	V4F_START(%rdi), %rdi		/* rdi = first dest vertex */

	movd	(%rsi), %mm0			/* mm0 = { m00, 0   } */
	movd	4(%rsi), %mm1			/* mm1 = { m01, 0   } */

	prefetcht1 (%rdx)

	punpckldq 16(%rsi), %mm0		/* mm0 = { m00, m10 } */
	.byte	0x66, 0x66, 0x90		/* hand-placed 3-byte NOP (padding) */
	punpckldq 20(%rsi), %mm1		/* mm1 = { m01, m11 } */

	movq	48(%rsi), %mm2			/* mm2 = { m30, m31 } */

.Lp4_2d_loop:

	prefetcht1 32(%rdi)			/* 32 bytes (two dest vertices) ahead */

	movq	(%rdx), %mm3			/* mm3 = { x0, x1 } */
	movq	8(%rdx), %mm5			/* mm5 = { x2, x3 } */

	movq	%mm3, %mm4			/* mm4 = { x0, x1 } */
	movq	%mm5, %mm6			/* mm6 = { x2, x3 } */

	pfmul	%mm1, %mm4			/* mm4 = { x0*m01, x1*m11 } */
	punpckhdq %mm6, %mm6			/* mm6 = { x3, x3 } */

	pfmul	%mm0, %mm3			/* mm3 = { x0*m00, x1*m10 } */

	addq	%rax, %rdx			/* step to the next source vertex */
	pfacc	%mm4, %mm3			/* mm3 = { x0*m00 + x1*m10, x0*m01 + x1*m11 } */

	pfmul	%mm2, %mm6			/* mm6 = { x3*m30, x3*m31 } */
	prefetcht1 32(%rdx)

	pfadd	%mm6, %mm3			/* mm3 = { r0, r1 } */

	movq	%mm3, (%rdi)			/* store r0, r1 */
	movq	%mm5, 8(%rdi)			/* store r2, r3 (x2, x3 unchanged) */

	addq	$16, %rdi			/* dest vertices are 16 bytes apart */

	decl	%ecx
	jnz	.Lp4_2d_loop

.Lp4_2d_done:
	femms					/* leave MMX/3DNow! state */
	ret
486
#endif /* USE_X86_64_ASM */

/* On Linux ELF targets, emit an empty .note.GNU-stack section so the linker
 * does not mark the stack executable because of this object. */
#if defined(__linux__) && defined(__ELF__)
	.section .note.GNU-stack,"",%progbits
#endif
492