• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Mesa 3-D graphics library
3  * Version:  7.1
4  *
5  * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included
15  * in all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23  */
24 
25 #ifdef USE_X86_64_ASM
26 
27 #include "matypes.h"
28 
29 .text
30 
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
/*
 * Run CPUID on a block of four 32-bit values.
 *
 *	rdi = pointer to four dwords, in the order eax, ebx, ecx, edx
 *
 * In:	dword 0 is loaded into eax and dword 2 into ecx before CPUID
 *	(the ebx and edx slots are not read).
 * Out:	all four dwords are overwritten with the eax, ebx, ecx, edx
 *	values produced by CPUID.
 *
 * CPUID writes ebx, and rbx is callee-saved, so it is pushed and popped
 * around the instruction.
 */
_mesa_x86_64_cpuid:
	pushq	%rbx			/* preserve callee-saved rbx */
	movl	8(%rdi), %ecx		/* ecx input */
	movl	0(%rdi), %eax		/* eax input */

	cpuid

	movl	%eax, 0(%rdi)		/* write back all four results */
	movl	%ebx, 4(%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret
47 
48 .align 16
49 .globl _mesa_x86_64_transform_points4_general
50 .hidden _mesa_x86_64_transform_points4_general
51 _mesa_x86_64_transform_points4_general:
52 /*
53  *	rdi = dest
54  *	rsi = matrix
55  *	rdx = source
56  */
/*
 * Transform every 4-float source vertex by the full 16-float matrix:
 *
 *	D(i) = ox*m(i) + oy*m(4+i) + oz*m(8+i) + ow*m(12+i),   i = 0..3
 *
 * where m(k) is the float at byte offset 4*k from rsi and ox,oy,oz,ow are
 * the four floats of the source vertex in memory order.
 *
 * Register comments below list lanes from highest to lowest.
 * The source pointer advances by the source stride; the destination is
 * written 16 bytes per vertex with aligned stores (movaps), and the matrix
 * is read with aligned loads.
 * Writes rax, rcx, rdx, rdi, xmm0-xmm8 and flags.
 */
57 	movl V4F_COUNT(%rdx), %ecx	/* count */
58 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
59 
60 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
61 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
62 	.byte 0x66, 0x66, 0x66, 0x90		/* manual align += 4 (4-byte nop) */
63 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
64 
65 	testl %ecx, %ecx		/* verify non-zero count */
66 	prefetchnta 64(%rsi)		/* does not alter the flags tested by jz */
67 	jz p4_general_done
68 
69 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
70 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
71 
72 	prefetch 16(%rdx)
73 
74 	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
75 	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
76 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
77 	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
78         movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
79 
80 p4_general_loop:
81 
82 	movups (%rdx), %xmm8		/* ow | oz | oy | ox (unaligned load) */
83 	prefetchw 16(%rdi)
84 
85 	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
86 	addq %rax, %rdx			/* advance src by its stride */
87 	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
88 	mulps %xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
89 	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
90 	mulps %xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
91 	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
92 	mulps %xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
93 	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
94 	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
95 	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
96 	prefetch 16(%rdx)
97 	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
98 
99 	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
100 	addq $16, %rdi			/* dest is packed: 16 bytes per vertex */
101 
102 	decl %ecx
103 	jnz p4_general_loop
104 
105 p4_general_done:
106 	.byte 0xf3			/* 0xf3 prefix + ret encodes "rep ret" */
107 	ret
108 
.section .rodata

.align 16
/*
 * Two 16-byte constants used by _mesa_x86_64_transform_points4_3d:
 *   +0   AND mask that keeps lanes 0-2 and clears lane 3 of a 4-float group
 *   +16  OR constant 0.0, 0.0, 0.0, 1.0 that sets lane 3 to 1.0
 */
p4_constants:
.long  0xffffffff, 0xffffffff, 0xffffffff, 0x00000000

.float 0.0, 0.0, 0.0, 1.0
122 
123 .text
124 .align 16
125 .globl _mesa_x86_64_transform_points4_3d
126 .hidden _mesa_x86_64_transform_points4_3d
127 /*
128  * Same per-vertex loop as _mesa_x86_64_transform_points4_general, with extra
129  * setup that forces matrix elements m3, m7, m11, m15 to 0, 0, 0, 1.
130  */
/*
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * m(k) is the float at byte offset 4*k from rsi.  The copy of the matrix
 * held in xmm4-xmm7 is masked; the matrix in memory is not modified.
 * Register comments list lanes from highest to lowest.
 * The destination is written 16 bytes per vertex with aligned stores.
 * Writes rax, rcx, rdx, rdi, xmm0-xmm10 and flags.
 */
131 _mesa_x86_64_transform_points4_3d:
132 
133 	leaq p4_constants(%rip), %rax	/* rax is reused for the stride below */
134 
135 	prefetchnta 64(%rsi)
136 
137 	movaps (%rax), %xmm9		/* mask: clears lane 3, keeps lanes 0-2 */
138 	movaps 16(%rax), %xmm10		/* 1.0 | 0.0 | 0.0 | 0.0 */
139 
140 	movl V4F_COUNT(%rdx), %ecx	/* count */
141 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
142 
143 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
144 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
145 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
146 
147 	testl %ecx, %ecx		/* verify non-zero count */
148 	jz p4_3d_done
149 
150 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
151 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
152 
153 	prefetch 16(%rdx)
154 
155 	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
156 	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
157 	andps  %xmm9, %xmm4             /* 0.0 | m2  | m1  | m0  */
158 	movaps 32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
159 	andps  %xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
160         movaps 48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
161 	andps  %xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
162 	andps  %xmm9, %xmm7		/* 0.0 | m14 | m13 | m12  */
163 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
164 	orps   %xmm10, %xmm7		/* 1.0 | m14 | m13 | m12  */
165 
166 p4_3d_loop:
167 
168 	movups (%rdx), %xmm8		/* ow | oz | oy | ox (unaligned load) */
169 	prefetchw 16(%rdi)
170 
171 	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
172 	addq %rax, %rdx			/* advance src by its stride */
173 	pshufd $0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
174 	mulps %xmm4, %xmm0		/* ox*0.0 | ox*m2 | ox*m1 | ox*m0 */
175 	pshufd $0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
176 	mulps %xmm5, %xmm1		/* oy*0.0 | oy*m6 | oy*m5 | oy*m4 */
177 	pshufd $0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
178 	mulps %xmm6, %xmm2		/* oz*0.0 | oz*m10 | oz*m9 | oz*m8 */
179 	addps %xmm1, %xmm0		/* sum of the ox and oy terms */
180 	mulps %xmm7, %xmm3		/* ow*1.0 | ow*m14 | ow*m13 | ow*m12 */
181 	addps %xmm2, %xmm0		/* ... plus the oz terms */
182 	prefetch 16(%rdx)
183 	addps %xmm3, %xmm0		/* ... plus the ow terms */
184 
185 	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
186 	addq $16, %rdi			/* dest is packed: 16 bytes per vertex */
187 
188 	dec %ecx
189 	jnz p4_3d_loop
190 
191 p4_3d_done:
192 	.byte 0xf3			/* 0xf3 prefix + ret encodes "rep ret" */
193 	ret
194 
195 
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
/*
 * Identity transform: copy each 16-byte source vertex to the destination
 * unchanged.
 *
 *	rdi = dest
 *	rsi = matrix (not read)
 *	rdx = source
 *
 * The source is walked using its own byte stride and the destination is
 * written packed, 16 bytes per vertex, matching the other transform
 * routines in this file.  The previous implementation used a single
 * "rep movsq" of count*2 quadwords, which ignored the stride it had just
 * loaded and therefore copied the wrong bytes whenever stride != 16.
 *
 * The copy goes through general-purpose registers, so neither pointer
 * needs any particular alignment and the data bits are moved untouched.
 * Writes rax, rcx, rdx, rdi, r8, r9 and flags.
 *
 * NOTE(review): movzbl reads only the low byte of the stride field, as in
 * the sibling routines -- confirm that strides never exceed 255.
 */
_mesa_x86_64_transform_points4_identity:

	movl V4F_COUNT(%rdx), %ecx	/* count */
	movzbl V4F_STRIDE(%rdx), %eax	/* stride */

	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl $4, V4F_SIZE(%rdi)		/* set dest size */
	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */

	testl %ecx, %ecx		/* verify non-zero count */
	jz p4_identity_done

	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetch 64(%rdx)
	prefetchw 64(%rdi)

p4_identity_loop:

	movq (%rdx), %r8		/* x1 | x0 */
	movq 8(%rdx), %r9		/* x3 | x2 */
	addq %rax, %rdx			/* advance src by its stride */

	movq %r8, (%rdi)		/* write r0, r1 */
	movq %r9, 8(%rdi)		/* write r2, r3 */
	addq $16, %rdi			/* dest is packed: 16 bytes per vertex */

	decl %ecx
	jnz p4_identity_loop

p4_identity_done:
	.byte 0xf3			/* 0xf3 prefix + ret encodes "rep ret" */
	ret
223 
224 
225 .align 16
226 .globl _mesa_3dnow_transform_points4_3d_no_rot
227 .hidden _mesa_3dnow_transform_points4_3d_no_rot
228 _mesa_3dnow_transform_points4_3d_no_rot:
/*
 * 3DNow! transform using only the matrix elements read below.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * mXY names the float at byte offset 4*(4*X+Y) from rsi.  Per vertex:
 *
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2*m22 + x3*m32
 *	r3 = x3
 *
 * Register comments list the high dword first, then the low dword.
 * Uses the MMX registers mm0-mm2 and mm4-mm7; femms is executed on every
 * exit path, including the zero-count one.
 */
229 
230 	movl V4F_COUNT(%rdx), %ecx	/* count */
231 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
232 
233 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
234 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
235 	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
236 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
237 
238 	test %ecx, %ecx
239 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
240 	jz p4_3d_no_rot_done
241 
242 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
243 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
244 
245 	prefetch (%rdx)
246 
247 	movd (%rsi), %mm0		/*                 | m00             */
248 	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
249 	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
250 
251 	movd 40(%rsi), %mm2		/*                 | m22             */
252 	movq 48(%rsi), %mm1		/* m31             | m30             */
253 
254 	punpckldq 56(%rsi), %mm2	/* m32             | m22             */
255 
256 p4_3d_no_rot_loop:
257 
258 	prefetchw 32(%rdi)
259 
260 	movq  (%rdx), %mm4		/* x1              | x0              */
261 	movq  8(%rdx), %mm5		/* x3              | x2              */
262 	movd  12(%rdx), %mm7		/* 0               | x3              */
263 
264 	movq  %mm5, %mm6		/* x3              | x2              */
265 	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
266 
267 	punpckhdq %mm6, %mm6		/* x3              | x3              */
268 	pfmul %mm2, %mm5		/* x3*m32          | x2*m22          */
269 
270 	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
271 	pfacc %mm7, %mm5		/* x3 (= x3+0)     | x2*m22+x3*m32   */
272 
273         pfadd %mm6, %mm4		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
274 
275 	addq %rax, %rdx			/* advance src by its stride         */
276 	movq %mm4, (%rdi)		/* write r0, r1                      */
277 	movq %mm5, 8(%rdi)		/* write r2, r3                      */
278 
279 	addq $16, %rdi			/* dest is packed: 16 bytes/vertex   */
280 
281 	decl %ecx
282 	prefetch 32(%rdx)		/* leaves the flags from decl intact */
283 	jnz p4_3d_no_rot_loop
284 
285 p4_3d_no_rot_done:
286 	femms				/* leave MMX/3DNow! state            */
287 	ret
288 
289 
290 .align 16
291 .globl _mesa_3dnow_transform_points4_perspective
292 .hidden _mesa_3dnow_transform_points4_perspective
293 _mesa_3dnow_transform_points4_perspective:
/*
 * 3DNow! transform using only the matrix elements read below.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * mXY names the float at byte offset 4*(4*X+Y) from rsi.  Per vertex:
 *
 *	r0 = x0*m00 + x2*m20
 *	r1 = x1*m11 + x2*m21
 *	r2 = x2*m22 + x3*m32
 *	r3 = -x2
 *
 * Register comments list the high dword first, then the low dword.
 * Uses the MMX registers mm0-mm7; femms is executed on every exit path,
 * including the zero-count one.
 */
294 
295 	movl V4F_COUNT(%rdx), %ecx	/* count */
296 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
297 
298 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
299 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
300 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
301 
302 	test %ecx, %ecx
303 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
304 	jz p4_perspective_done
305 
306 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
307 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
308 
309 	movd (%rsi), %mm0		/*                 | m00             */
310         pxor %mm7, %mm7			/* 0               | 0               */
311 	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
312 
313 	movq 32(%rsi), %mm2		/* m21             | m20             */
314 	prefetch (%rdx)
315 
316 	movd 40(%rsi), %mm1		/*                 | m22             */
317 
318 	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
319 	punpckldq 56(%rsi), %mm1	/* m32             | m22             */
320 
321 
322 p4_perspective_loop:
323 
324 	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
325 
326 	movq (%rdx), %mm4		/* x1              | x0              */
327 	movq 8(%rdx), %mm5		/* x3              | x2              */
328 	movd 8(%rdx), %mm3		/* 0               | x2              */
329 
330 	movq %mm5, %mm6			/* x3              | x2              */
331 	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
332 
333 	punpckldq %mm5, %mm5		/* x2              | x2              */
334 
335 	pfmul %mm2, %mm5		/* x2*m21          | x2*m20          */
336 	pfsubr %mm7, %mm3		/* 0               | 0 - x2 = -x2    */
337 
338 	pfmul %mm1, %mm6		/* x3*m32          | x2*m22          */
339 	pfadd %mm4, %mm5		/* x1*m11+x2*m21   | x0*m00+x2*m20   */
340 
341 	pfacc %mm3, %mm6		/* -x2 (= -x2+0)   | x2*m22+x3*m32   */
342 
343 	movq %mm5, (%rdi)		/* write r0, r1                      */
344 	addq %rax, %rdx			/* advance src by its stride         */
345 	movq %mm6, 8(%rdi)		/* write r2, r3                      */
346 
347 	addq $16, %rdi			/* dest is packed: 16 bytes/vertex   */
348 
349 	decl %ecx
350 	prefetch 32(%rdx)		/* hopefully stride is zero          */
351 	jnz p4_perspective_loop
352 
353 p4_perspective_done:
354 	femms				/* leave MMX/3DNow! state            */
355 	ret
356 
357 .align 16
358 .globl _mesa_3dnow_transform_points4_2d_no_rot
359 .hidden _mesa_3dnow_transform_points4_2d_no_rot
360 _mesa_3dnow_transform_points4_2d_no_rot:
/*
 * 3DNow! transform using only the matrix elements read below.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * mXY names the float at byte offset 4*(4*X+Y) from rsi.  Per vertex:
 *
 *	r0 = x0*m00 + x3*m30
 *	r1 = x1*m11 + x3*m31
 *	r2 = x2          (copied unchanged)
 *	r3 = x3          (copied unchanged)
 *
 * Register comments list the high dword first, then the low dword.
 * Uses the MMX registers mm0, mm1, mm4, mm5, mm6; femms is executed on
 * every exit path, including the zero-count one.
 */
361 
362 	movl V4F_COUNT(%rdx), %ecx	/* count */
363 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
364 
365 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
366 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
367 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
368 
369 	test %ecx, %ecx
370 	.byte 0x90			/* manual align += 1 */
371 	jz p4_2d_no_rot_done
372 
373 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
374 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
375 
376 	movd (%rsi), %mm0		/*                 | m00             */
377 	prefetch (%rdx)
378 	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
379 
380 	movq 48(%rsi), %mm1		/* m31             | m30             */
381 
382 p4_2d_no_rot_loop:
383 
384 	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
385 
386 	movq (%rdx), %mm4		/* x1              | x0              */
387 	movq 8(%rdx), %mm5		/* x3              | x2              */
388 
389 	pfmul %mm0, %mm4		/* x1*m11          | x0*m00          */
390 	movq %mm5, %mm6			/* x3              | x2              */
391 
392 	punpckhdq %mm6, %mm6		/* x3              | x3              */
393 
394 	addq %rax, %rdx			/* advance src by its stride         */
395 	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
396 
397 	prefetch 32(%rdx)		/* hopefully stride is zero          */
398 	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
399 
400 	movq %mm6, (%rdi)		/* write r0, r1                      */
401 	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 unchanged)   */
402 
403 	addq $16, %rdi			/* dest is packed: 16 bytes/vertex   */
404 
405 	decl %ecx
406 	jnz p4_2d_no_rot_loop
407 
408 p4_2d_no_rot_done:
409 	femms				/* leave MMX/3DNow! state            */
410 	ret
411 
412 
413 .align 16
414 .globl _mesa_3dnow_transform_points4_2d
415 .hidden _mesa_3dnow_transform_points4_2d
416 _mesa_3dnow_transform_points4_2d:
/*
 * 3DNow! transform using only the matrix elements read below.
 *
 *	rdi = dest
 *	rsi = matrix
 *	rdx = source
 *
 * mXY names the float at byte offset 4*(4*X+Y) from rsi.  Per vertex:
 *
 *	r0 = x0*m00 + x1*m10 + x3*m30
 *	r1 = x0*m01 + x1*m11 + x3*m31
 *	r2 = x2          (copied unchanged)
 *	r3 = x3          (copied unchanged)
 *
 * Register comments list the high dword first, then the low dword.
 * Uses the MMX registers mm0-mm6; femms is executed on every exit path,
 * including the zero-count one.
 */
417 
418 	movl V4F_COUNT(%rdx), %ecx	/* count */
419 	movzbl V4F_STRIDE(%rdx), %eax	/* stride */
420 
421 	movl %ecx, V4F_COUNT(%rdi)	/* set dest count */
422 	movl $4, V4F_SIZE(%rdi)		/* set dest size */
423 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
424 	orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
425 
426 	test %ecx, %ecx
427 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
428 	jz p4_2d_done
429 
430 	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
431 	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
432 
433 	movd (%rsi), %mm0		/*                 | m00             */
434 	movd 4(%rsi), %mm1		/*                 | m01             */
435 
436 	prefetch (%rdx)
437 
438 	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
439 	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
440 	punpckldq 20(%rsi), %mm1	/* m11             | m01             */
441 
442 	movq 48(%rsi), %mm2		/* m31             | m30             */
443 
444 p4_2d_loop:
445 
446 	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
447 
448 	movq (%rdx), %mm3		/* x1              | x0              */
449 	movq 8(%rdx), %mm5		/* x3              | x2              */
450 
451 	movq %mm3, %mm4			/* x1              | x0              */
452 	movq %mm5, %mm6			/* x3              | x2              */
453 
454 	pfmul %mm1, %mm4		/* x1*m11          | x0*m01          */
455 	punpckhdq %mm6, %mm6		/* x3              | x3              */
456 
457 	pfmul %mm0, %mm3		/* x1*m10          | x0*m00          */
458 
459 	addq %rax, %rdx			/* advance src by its stride         */
460 	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
461 
462 	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
463 	prefetch 32(%rdx)		/* hopefully stride is zero          */
464 
465 	pfadd %mm6, %mm3		/* r1              | r0              */
466 
467 	movq %mm3, (%rdi)		/* write r0, r1                      */
468 	movq %mm5, 8(%rdi)		/* write r2, r3 (x2, x3 unchanged)   */
469 
470 	addq $16, %rdi			/* dest is packed: 16 bytes/vertex   */
471 
472 	decl %ecx
473 	jnz p4_2d_loop
474 
475 p4_2d_done:
476 	femms				/* leave MMX/3DNow! state            */
477 	ret
478 
479 #endif
480 
481 #if defined (__ELF__) && defined (__linux__)
/*
 * Emit an empty .note.GNU-stack section (no flags) so that this object
 * does not request an executable stack.  Deliberately placed outside the
 * USE_X86_64_ASM guard above.
 */
482 	.section .note.GNU-stack,"",%progbits
483 #endif
484