/*
 * Mesa 3-D graphics library
 *
 * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * x86-64 vertex transform routines, GNU assembler / AT&T syntax, run
 * through the C preprocessor.
 *
 * Conventions shared by every *_transform_points4_* entry point:
 *
 *   rdi = destination vector descriptor
 *   rsi = matrix, 16 consecutive floats
 *   rdx = source vector descriptor
 *
 * (the first three integer argument registers of the System V AMD64 ABI).
 * Descriptor fields are reached through the V4F_* offsets supplied by
 * math/m_vector_asm.h.  Each routine copies the source count into the
 * destination, sets the destination size to 4, ORs VEC_SIZE_4 into the
 * destination flags, and returns immediately when the count is zero.
 * Inside the loops:
 *
 *   ecx = vertices still to process
 *   rax = source stride in bytes (the destination always advances by 16)
 *
 * Notation used in the comments:
 *   - register contents are listed from the highest lane to the lowest;
 *   - mN  is the Nth float of the matrix (byte offset 4*N);
 *   - mRC is the float at byte offset 4*(4*R + C), e.g. m11 = 20(%rsi);
 *   - x0..x3 (or ox, oy, oz, ow) are the four floats of a source vertex.
 *
 * NOTE(review): the stride is fetched with movzbl, so only the lowest
 * byte of the field at V4F_STRIDE is read.  If that field is wider than
 * one byte, strides above 255 are silently truncated -- confirm against
 * the descriptor layout.
 *
 * ".byte 0xf3 ; ret" encodes "rep ret".  The ".byte 0x66, ..., 0x90"
 * sequences are prefixed NOPs used as hand-placed padding; they change
 * no register and no flag.
 */

#ifdef HAVE_CET_H
#include <cet.h>
#else
#define _CET_ENDBR
#endif

#ifdef USE_X86_64_ASM

#define MATH_ASM_PTR_SIZE 8
#include "math/m_vector_asm.h"

.text

/*
 * _mesa_x86_64_cpuid
 *
 * rdi = pointer to four consecutive 32-bit words
 *   in:   word 0 -> EAX (leaf), word 2 -> ECX (sub-leaf)
 *   out:  word 0 = EAX, word 1 = EBX, word 2 = ECX, word 3 = EDX
 *
 * CPUID overwrites rbx, which is callee-saved, so it is preserved on the
 * stack around the instruction.
 */
.align 16
.globl _mesa_x86_64_cpuid
.hidden _mesa_x86_64_cpuid
_mesa_x86_64_cpuid:
	_CET_ENDBR
	pushq	%rbx
	movl	(%rdi), %eax		/* requested leaf */
	movl	8(%rdi), %ecx		/* requested sub-leaf */

	cpuid

	movl	%ebx, 4(%rdi)
	movl	%eax, (%rdi)
	movl	%ecx, 8(%rdi)
	movl	%edx, 12(%rdi)
	popq	%rbx
	ret

/*
 * _mesa_x86_64_transform_points4_general
 *
 * rdi = dest
 * rsi = matrix
 * rdx = source
 *
 * Full 4x4 transform.  For every source vertex (ox, oy, oz, ow):
 *
 *   dest[i] = ox*m(i) + oy*m(4+i) + oz*m(8+i) + ow*m(12+i),   i = 0..3
 *
 * The matrix is loaded and the destination is stored with movaps, so
 * both must be 16-byte aligned; the source is read with movups.
 */
.align 16
.globl _mesa_x86_64_transform_points4_general
.hidden _mesa_x86_64_transform_points4_general
_mesa_x86_64_transform_points4_general:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	.byte 0x66, 0x66, 0x66, 0x90	/* manual align += 4 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	prefetchnta 64(%rsi)		/* does not touch the flags */
	jz	p4_general_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	movaps	32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	movaps	48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */

p4_general_loop:

	movups	(%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq	%rax, %rdx		/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0		/* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1		/* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2		/* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
	mulps	%xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */

	movaps	%xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi

	decl	%ecx
	jnz	p4_general_loop

p4_general_done:
	.byte 0xf3
	ret

.section .rodata

/*
 * Constants for _mesa_x86_64_transform_points4_3d:
 *   first 16 bytes  - AND mask that keeps lanes 0..2 and clears lane 3
 *   second 16 bytes - 1.0 in lane 3, zero elsewhere
 */
.align 16
p4_constants:
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff
.byte  0x00, 0x00, 0x00, 0x00

.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.byte  0x00, 0x00, 0x00, 0x00
.float 1.0

.text

/*
 * _mesa_x86_64_transform_points4_3d
 *
 * rdi = dest
 * rsi = matrix
 * rdx = source
 *
 * Same arithmetic as _mesa_x86_64_transform_points4_general, except
 * that the register copies of matrix elements m3, m7, m11 and m15 are
 * first forced to 0, 0, 0 and 1.0 (the matrix in memory is not
 * modified).  The fourth output component therefore equals ow.  The
 * extra masking makes the set-up longer than in the general routine.
 */
.align 16
.globl _mesa_x86_64_transform_points4_3d
.hidden _mesa_x86_64_transform_points4_3d
_mesa_x86_64_transform_points4_3d:
	_CET_ENDBR
	leaq	p4_constants(%rip), %rax

	prefetchnta 64(%rsi)

	movaps	(%rax), %xmm9		/* 0 | ~0 | ~0 | ~0  (lane mask) */
	movaps	16(%rax), %xmm10	/* 1.0 | 0 | 0 | 0 */

	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride (rax is free again) */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	jz	p4_3d_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 16(%rdx)

	movaps	0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
	movaps	16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
	andps	%xmm9, %xmm4		/* 0.0 | m2  | m1  | m0  */
	movaps	32(%rsi), %xmm6		/* m11 | m10 | m9  | m8  */
	andps	%xmm9, %xmm5		/* 0.0 | m6  | m5  | m4  */
	movaps	48(%rsi), %xmm7		/* m15 | m14 | m13 | m12 */
	andps	%xmm9, %xmm6		/* 0.0 | m10 | m9  | m8  */
	andps	%xmm9, %xmm7		/* 0.0 | m14 | m13 | m12 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orps	%xmm10, %xmm7		/* 1.0 | m14 | m13 | m12 */

p4_3d_loop:

	movups	(%rdx), %xmm8		/* ow | oz | oy | ox */
	prefetcht1 16(%rdi)

	pshufd	$0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
	addq	%rax, %rdx		/* step to the next source vertex */
	pshufd	$0x55, %xmm8, %xmm1	/* oy | oy | oy | oy */
	mulps	%xmm4, %xmm0		/* ox*0 | ox*m2 | ox*m1 | ox*m0 */
	pshufd	$0xAA, %xmm8, %xmm2	/* oz | oz | oz | oz */
	mulps	%xmm5, %xmm1		/* oy*0 | oy*m6 | oy*m5 | oy*m4 */
	pshufd	$0xFF, %xmm8, %xmm3	/* ow | ow | ow | ow */
	mulps	%xmm6, %xmm2		/* oz*0 | oz*m10 | oz*m9 | oz*m8 */
	addps	%xmm1, %xmm0		/* partial sums */
	mulps	%xmm7, %xmm3		/* ow*1 | ow*m14 | ow*m13 | ow*m12 */
	addps	%xmm2, %xmm0
	prefetcht1 16(%rdx)
	addps	%xmm3, %xmm0		/* complete sums */

	movaps	%xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
	addq	$16, %rdi

	decl	%ecx
	jnz	p4_3d_loop

p4_3d_done:
	.byte 0xf3
	ret


/*
 * _mesa_x86_64_transform_points4_identity
 *
 * rdi = dest
 * rsi = matrix (never read; rsi is reused as the source pointer)
 * rdx = source
 *
 * Copies every 4-float source vertex to the destination unchanged.
 * A source whose stride is exactly 16 bytes is copied in one go with
 * rep movsq (two quadwords per vertex; relies on the direction flag
 * being clear, as the ABI requires at function entry).  Any other
 * stride is copied vertex by vertex so that the source stride is
 * honoured exactly as in the other transform routines.
 *
 * Clobbers rax, rcx, rsi, rdi, xmm0 and the flags.
 */
.align 16
.globl _mesa_x86_64_transform_points4_identity
.hidden _mesa_x86_64_transform_points4_identity
_mesa_x86_64_transform_points4_identity:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	jz	p4_identity_done

	movq	V4F_START(%rdx), %rsi	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
	prefetcht1 64(%rsi)
	prefetcht1 64(%rdi)

	cmpl	$16, %eax		/* packed source? */
	jne	p4_identity_strided

	addl	%ecx, %ecx		/* quadwords = 2 * vertices */

	rep movsq
	.byte 0xf3
	ret

p4_identity_strided:

	movups	(%rsi), %xmm0		/* x3 | x2 | x1 | x0 */
	addq	%rax, %rsi		/* step to the next source vertex */
	movups	%xmm0, (%rdi)		/* unaligned store, like rep movsq */
	addq	$16, %rdi

	decl	%ecx
	jnz	p4_identity_strided

p4_identity_done:
	.byte 0xf3
	ret


/*
 * _mesa_3dnow_transform_points4_3d_no_rot
 *
 * rdi = dest
 * rsi = matrix
 * rdx = source
 *
 * Uses 3DNow! instructions (pfmul, pfadd, pfacc, femms) and the MMX
 * registers, so it can only run on a CPU that implements 3DNow!.
 *
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2*m22 + x3*m32
 *   r3 = x3
 */
.align 16
.globl _mesa_3dnow_transform_points4_3d_no_rot
.hidden _mesa_3dnow_transform_points4_3d_no_rot
_mesa_3dnow_transform_points4_3d_no_rot:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz	p4_3d_no_rot_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	prefetcht1 (%rdx)

	movd	(%rsi), %mm0		/*     | m00 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movd	40(%rsi), %mm2		/*     | m22 */
	movq	48(%rsi), %mm1		/* m31 | m30 */

	punpckldq 56(%rsi), %mm2	/* m32 | m22 */

p4_3d_no_rot_loop:

	prefetcht1 32(%rdi)

	movq	(%rdx), %mm4		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */
	movd	12(%rdx), %mm7		/* 0  | x3 */

	movq	%mm5, %mm6		/* x3 | x2 */
	pfmul	%mm0, %mm4		/* x1*m11 | x0*m00 */

	punpckhdq %mm6, %mm6		/* x3 | x3 */
	pfmul	%mm2, %mm5		/* x3*m32 | x2*m22 */

	pfmul	%mm1, %mm6		/* x3*m31 | x3*m30 */
	pfacc	%mm7, %mm5		/* x3 | x2*m22+x3*m32 */

	pfadd	%mm6, %mm4		/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	addq	%rax, %rdx		/* step to the next source vertex */
	movq	%mm4, (%rdi)		/* write r0, r1 */
	movq	%mm5, 8(%rdi)		/* write r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetcht1 32(%rdx)		/* does not touch the flags */
	jnz	p4_3d_no_rot_loop

p4_3d_no_rot_done:
	femms				/* leave MMX/3DNow! state */
	ret


/*
 * _mesa_3dnow_transform_points4_perspective
 *
 * rdi = dest
 * rsi = matrix
 * rdx = source
 *
 * Uses 3DNow! instructions (pfmul, pfadd, pfacc, pfsubr, femms) and the
 * MMX registers, so it can only run on a CPU that implements 3DNow!.
 *
 *   r0 = x0*m00 + x2*m20
 *   r1 = x1*m11 + x2*m21
 *   r2 = x2*m22 + x3*m32
 *   r3 = -x2
 */
.align 16
.globl _mesa_3dnow_transform_points4_perspective
.hidden _mesa_3dnow_transform_points4_perspective
_mesa_3dnow_transform_points4_perspective:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz	p4_perspective_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd	(%rsi), %mm0		/*     | m00 */
	pxor	%mm7, %mm7		/* 0   | 0   */
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movq	32(%rsi), %mm2		/* m21 | m20 */
	prefetcht1 (%rdx)

	movd	40(%rsi), %mm1		/*     | m22 */

	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 56(%rsi), %mm1	/* m32 | m22 */


p4_perspective_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead */

	movq	(%rdx), %mm4		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */
	movd	8(%rdx), %mm3		/* 0  | x2 */

	movq	%mm5, %mm6		/* x3 | x2 */
	pfmul	%mm0, %mm4		/* x1*m11 | x0*m00 */

	punpckldq %mm5, %mm5		/* x2 | x2 */

	pfmul	%mm2, %mm5		/* x2*m21 | x2*m20 */
	pfsubr	%mm7, %mm3		/* 0 | -x2   (0 - mm3) */

	pfmul	%mm1, %mm6		/* x3*m32 | x2*m22 */
	pfadd	%mm4, %mm5		/* x1*m11+x2*m21 | x0*m00+x2*m20 */

	pfacc	%mm3, %mm6		/* -x2 | x2*m22+x3*m32 */

	movq	%mm5, (%rdi)		/* write r0, r1 */
	addq	%rax, %rdx		/* step to the next source vertex */
	movq	%mm6, 8(%rdi)		/* write r2, r3 */

	addq	$16, %rdi

	decl	%ecx
	prefetcht1 32(%rdx)		/* prefetch further along the source */
	jnz	p4_perspective_loop

p4_perspective_done:
	femms				/* leave MMX/3DNow! state */
	ret

/*
 * _mesa_3dnow_transform_points4_2d_no_rot
 *
 * rdi = dest
 * rsi = matrix
 * rdx = source
 *
 * Uses 3DNow! instructions (pfmul, pfadd, femms) and the MMX registers,
 * so it can only run on a CPU that implements 3DNow!.
 *
 *   r0 = x0*m00 + x3*m30
 *   r1 = x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d_no_rot
.hidden _mesa_3dnow_transform_points4_2d_no_rot
_mesa_3dnow_transform_points4_2d_no_rot:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	.byte 0x90			/* manual align += 1 */
	jz	p4_2d_no_rot_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd	(%rsi), %mm0		/*     | m00 */
	prefetcht1 (%rdx)
	punpckldq 20(%rsi), %mm0	/* m11 | m00 */

	movq	48(%rsi), %mm1		/* m31 | m30 */

p4_2d_no_rot_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead */

	movq	(%rdx), %mm4		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */

	pfmul	%mm0, %mm4		/* x1*m11 | x0*m00 */
	movq	%mm5, %mm6		/* x3 | x2 */

	punpckhdq %mm6, %mm6		/* x3 | x3 */

	addq	%rax, %rdx		/* step to the next source vertex */
	pfmul	%mm1, %mm6		/* x3*m31 | x3*m30 */

	prefetcht1 32(%rdx)		/* prefetch further along the source */
	pfadd	%mm4, %mm6		/* x1*m11+x3*m31 | x0*m00+x3*m30 */

	movq	%mm6, (%rdi)		/* write r0, r1 */
	movq	%mm5, 8(%rdi)		/* write r2, r3 (copied unchanged) */

	addq	$16, %rdi

	decl	%ecx
	jnz	p4_2d_no_rot_loop

p4_2d_no_rot_done:
	femms				/* leave MMX/3DNow! state */
	ret


/*
 * _mesa_3dnow_transform_points4_2d
 *
 * rdi = dest
 * rsi = matrix
 * rdx = source
 *
 * Uses 3DNow! instructions (pfmul, pfadd, pfacc, femms) and the MMX
 * registers, so it can only run on a CPU that implements 3DNow!.
 *
 *   r0 = x0*m00 + x1*m10 + x3*m30
 *   r1 = x0*m01 + x1*m11 + x3*m31
 *   r2 = x2
 *   r3 = x3
 */
.align 16
.globl _mesa_3dnow_transform_points4_2d
.hidden _mesa_3dnow_transform_points4_2d
_mesa_3dnow_transform_points4_2d:
	_CET_ENDBR
	movl	V4F_COUNT(%rdx), %ecx	/* count */
	movzbl	V4F_STRIDE(%rdx), %eax	/* stride */

	movl	%ecx, V4F_COUNT(%rdi)	/* set dest count */
	movl	$4, V4F_SIZE(%rdi)	/* set dest size */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	orl	$VEC_SIZE_4, V4F_FLAGS(%rdi)	/* set dest flags */

	testl	%ecx, %ecx		/* verify non-zero count */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	jz	p4_2d_done

	movq	V4F_START(%rdx), %rdx	/* ptr to first src vertex */
	movq	V4F_START(%rdi), %rdi	/* ptr to first dest vertex */

	movd	(%rsi), %mm0		/*     | m00 */
	movd	4(%rsi), %mm1		/*     | m01 */

	prefetcht1 (%rdx)

	punpckldq 16(%rsi), %mm0	/* m10 | m00 */
	.byte 0x66, 0x66, 0x90		/* manual align += 3 */
	punpckldq 20(%rsi), %mm1	/* m11 | m01 */

	movq	48(%rsi), %mm2		/* m31 | m30 */

p4_2d_loop:

	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead */

	movq	(%rdx), %mm3		/* x1 | x0 */
	movq	8(%rdx), %mm5		/* x3 | x2 */

	movq	%mm3, %mm4		/* x1 | x0 */
	movq	%mm5, %mm6		/* x3 | x2 */

	pfmul	%mm1, %mm4		/* x1*m11 | x0*m01 */
	punpckhdq %mm6, %mm6		/* x3 | x3 */

	pfmul	%mm0, %mm3		/* x1*m10 | x0*m00 */

	addq	%rax, %rdx		/* step to the next source vertex */
	pfacc	%mm4, %mm3		/* x0*m01+x1*m11 | x0*m00+x1*m10 */

	pfmul	%mm2, %mm6		/* x3*m31 | x3*m30 */
	prefetcht1 32(%rdx)		/* prefetch further along the source */

	pfadd	%mm6, %mm3		/* r1 | r0 */

	movq	%mm3, (%rdi)		/* write r0, r1 */
	movq	%mm5, 8(%rdi)		/* write r2, r3 (copied unchanged) */

	addq	$16, %rdi

	decl	%ecx
	jnz	p4_2d_loop

p4_2d_done:
	femms				/* leave MMX/3DNow! state */
	ret

#endif

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif