1 /*
2 * Copyright 2003 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 * Keith Whitwell <keithw@vmware.com>
26 */
27
28 #include <stdio.h>
29
30 #include "main/glheader.h"
31 #include "main/context.h"
32 #include "main/enums.h"
33 #include "swrast/s_chan.h"
34 #include "t_context.h"
35 #include "t_vertex.h"
36
37 #if defined(USE_SSE_ASM)
38
39 #include "x86/rtasm/x86sse.h"
40 #include "x86/common_x86_asm.h"
41
42
/**
 * Size in bytes of the code buffer requested for one generated SSE
 * emit function.
 */
#define MAX_SSE_CODE_SIZE 1024


/* Component indices, used as selectors when building SHUF() masks. */
#define X    0
#define Y    1
#define Z    2
#define W    3
53
54
55 struct x86_program {
56 struct x86_function func;
57
58 struct gl_context *ctx;
59 GLboolean inputs_safe;
60 GLboolean outputs_safe;
61 GLboolean have_sse2;
62
63 struct x86_reg identity;
64 struct x86_reg chan0;
65 };
66
67
get_identity(struct x86_program * p)68 static struct x86_reg get_identity( struct x86_program *p )
69 {
70 return p->identity;
71 }
72
emit_load4f_4(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)73 static void emit_load4f_4( struct x86_program *p,
74 struct x86_reg dest,
75 struct x86_reg arg0 )
76 {
77 sse_movups(&p->func, dest, arg0);
78 }
79
emit_load4f_3(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)80 static void emit_load4f_3( struct x86_program *p,
81 struct x86_reg dest,
82 struct x86_reg arg0 )
83 {
84 /* Have to jump through some hoops:
85 *
86 * c 0 0 0
87 * c 0 0 1
88 * 0 0 c 1
89 * a b c 1
90 */
91 sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
92 sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
93 sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
94 sse_movlps(&p->func, dest, arg0);
95 }
96
emit_load4f_2(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)97 static void emit_load4f_2( struct x86_program *p,
98 struct x86_reg dest,
99 struct x86_reg arg0 )
100 {
101 /* Initialize from identity, then pull in low two words:
102 */
103 sse_movups(&p->func, dest, get_identity(p));
104 sse_movlps(&p->func, dest, arg0);
105 }
106
emit_load4f_1(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)107 static void emit_load4f_1( struct x86_program *p,
108 struct x86_reg dest,
109 struct x86_reg arg0 )
110 {
111 /* Pull in low word, then swizzle in identity */
112 sse_movss(&p->func, dest, arg0);
113 sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
114 }
115
116
117
emit_load3f_3(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)118 static void emit_load3f_3( struct x86_program *p,
119 struct x86_reg dest,
120 struct x86_reg arg0 )
121 {
122 /* Over-reads by 1 dword - potential SEGV if input is a vertex
123 * array.
124 */
125 if (p->inputs_safe) {
126 sse_movups(&p->func, dest, arg0);
127 }
128 else {
129 /* c 0 0 0
130 * c c c c
131 * a b c c
132 */
133 sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
134 sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
135 sse_movlps(&p->func, dest, arg0);
136 }
137 }
138
/**
 * Emit a load of a 3-component attribute from a 2-component source.
 * Handled exactly like the 4-component case; the extra component that
 * produces is simply not stored by a 3-wide store.
 */
static void emit_load3f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   emit_load4f_2( p, dest, arg0 );
}
145
emit_load3f_1(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)146 static void emit_load3f_1( struct x86_program *p,
147 struct x86_reg dest,
148 struct x86_reg arg0 )
149 {
150 /* Loading from memory erases the upper bits. */
151 sse_movss(&p->func, dest, arg0);
152 }
153
emit_load2f_2(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)154 static void emit_load2f_2( struct x86_program *p,
155 struct x86_reg dest,
156 struct x86_reg arg0 )
157 {
158 sse_movlps(&p->func, dest, arg0);
159 }
160
emit_load2f_1(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)161 static void emit_load2f_1( struct x86_program *p,
162 struct x86_reg dest,
163 struct x86_reg arg0 )
164 {
165 /* Loading from memory erases the upper bits. */
166 sse_movss(&p->func, dest, arg0);
167 }
168
emit_load1f_1(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)169 static void emit_load1f_1( struct x86_program *p,
170 struct x86_reg dest,
171 struct x86_reg arg0 )
172 {
173 sse_movss(&p->func, dest, arg0);
174 }
175
176 static void (*load[4][4])( struct x86_program *p,
177 struct x86_reg dest,
178 struct x86_reg arg0 ) = {
179 { emit_load1f_1,
180 emit_load1f_1,
181 emit_load1f_1,
182 emit_load1f_1 },
183
184 { emit_load2f_1,
185 emit_load2f_2,
186 emit_load2f_2,
187 emit_load2f_2 },
188
189 { emit_load3f_1,
190 emit_load3f_2,
191 emit_load3f_3,
192 emit_load3f_3 },
193
194 { emit_load4f_1,
195 emit_load4f_2,
196 emit_load4f_3,
197 emit_load4f_4 }
198 };
199
/**
 * Emit code loading an attribute into an XMM register.
 *
 * \param p       program being generated
 * \param dest    register receiving the value
 * \param sz      number of components wanted in dest, 1..4
 * \param src     memory operand to read from
 * \param src_sz  number of components available at src, 1..4
 */
static void emit_load( struct x86_program *p,
                       struct x86_reg dest,
                       GLuint sz,
                       struct x86_reg src,
                       GLuint src_sz)
{
   /* Both sizes index the 4x4 load[] table after subtracting one; a
    * value of 0 or above 4 would call through a pointer outside the
    * table.
    */
   assert(sz >= 1 && sz <= 4);
   assert(src_sz >= 1 && src_sz <= 4);

   load[sz - 1][src_sz - 1](p, dest, src);
}
208
emit_store4f(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)209 static void emit_store4f( struct x86_program *p,
210 struct x86_reg dest,
211 struct x86_reg arg0 )
212 {
213 sse_movups(&p->func, dest, arg0);
214 }
215
emit_store3f(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)216 static void emit_store3f( struct x86_program *p,
217 struct x86_reg dest,
218 struct x86_reg arg0 )
219 {
220 if (p->outputs_safe) {
221 /* Emit the extra dword anyway. This may hurt writecombining,
222 * may cause other problems.
223 */
224 sse_movups(&p->func, dest, arg0);
225 }
226 else {
227 /* Alternate strategy - emit two, shuffle, emit one.
228 */
229 sse_movlps(&p->func, dest, arg0);
230 sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
231 sse_movss(&p->func, x86_make_disp(dest,8), arg0);
232 }
233 }
234
emit_store2f(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)235 static void emit_store2f( struct x86_program *p,
236 struct x86_reg dest,
237 struct x86_reg arg0 )
238 {
239 sse_movlps(&p->func, dest, arg0);
240 }
241
emit_store1f(struct x86_program * p,struct x86_reg dest,struct x86_reg arg0)242 static void emit_store1f( struct x86_program *p,
243 struct x86_reg dest,
244 struct x86_reg arg0 )
245 {
246 sse_movss(&p->func, dest, arg0);
247 }
248
249
250 static void (*store[4])( struct x86_program *p,
251 struct x86_reg dest,
252 struct x86_reg arg0 ) =
253 {
254 emit_store1f,
255 emit_store2f,
256 emit_store3f,
257 emit_store4f
258 };
259
/**
 * Emit code storing the low \p sz floats of an XMM register.
 *
 * \param p     program being generated
 * \param dest  memory operand to write to
 * \param sz    number of floats to store, 1..4
 * \param temp  register holding the value; the 3-float store emitter
 *              may emit code that overwrites it
 */
static void emit_store( struct x86_program *p,
                        struct x86_reg dest,
                        GLuint sz,
                        struct x86_reg temp )
{
   /* sz - 1 indexes the 4-entry store[] table. */
   assert(sz >= 1 && sz <= 4);

   store[sz - 1](p, dest, temp);
}
268
emit_pack_store_4ub(struct x86_program * p,struct x86_reg dest,struct x86_reg temp)269 static void emit_pack_store_4ub( struct x86_program *p,
270 struct x86_reg dest,
271 struct x86_reg temp )
272 {
273 /* Scale by 255.0
274 */
275 sse_mulps(&p->func, temp, p->chan0);
276
277 if (p->have_sse2) {
278 sse2_cvtps2dq(&p->func, temp, temp);
279 sse2_packssdw(&p->func, temp, temp);
280 sse2_packuswb(&p->func, temp, temp);
281 sse_movss(&p->func, dest, temp);
282 }
283 else {
284 struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
285 struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
286 sse_cvtps2pi(&p->func, mmx0, temp);
287 sse_movhlps(&p->func, temp, temp);
288 sse_cvtps2pi(&p->func, mmx1, temp);
289 mmx_packssdw(&p->func, mmx0, mmx1);
290 mmx_packuswb(&p->func, mmx0, mmx0);
291 mmx_movd(&p->func, dest, mmx0);
292 }
293 }
294
get_offset(const void * a,const void * b)295 static GLint get_offset( const void *a, const void *b )
296 {
297 return (const char *)b - (const char *)a;
298 }
299
300 /* Not much happens here. Eventually use this function to try and
301 * avoid saving/reloading the source pointers each vertex (if some of
302 * them can fit in registers).
303 */
get_src_ptr(struct x86_program * p,struct x86_reg srcREG,struct x86_reg vtxREG,struct tnl_clipspace_attr * a)304 static void get_src_ptr( struct x86_program *p,
305 struct x86_reg srcREG,
306 struct x86_reg vtxREG,
307 struct tnl_clipspace_attr *a )
308 {
309 struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
310 struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
311
312 /* Load current a[j].inputptr
313 */
314 x86_mov(&p->func, srcREG, ptr_to_src);
315 }
316
update_src_ptr(struct x86_program * p,struct x86_reg srcREG,struct x86_reg vtxREG,struct tnl_clipspace_attr * a)317 static void update_src_ptr( struct x86_program *p,
318 struct x86_reg srcREG,
319 struct x86_reg vtxREG,
320 struct tnl_clipspace_attr *a )
321 {
322 if (a->inputstride) {
323 struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
324 struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));
325
326 /* add a[j].inputstride (hardcoded value - could just as easily
327 * pull the stride value from memory each time).
328 */
329 x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
330
331 /* save new value of a[j].inputptr
332 */
333 x86_mov(&p->func, ptr_to_src, srcREG);
334 }
335 }
336
337
338 /* Lots of hardcoding
339 *
340 * EAX -- pointer to current output vertex
341 * ECX -- pointer to current attribute
342 *
343 */
build_vertex_emit(struct x86_program * p)344 static GLboolean build_vertex_emit( struct x86_program *p )
345 {
346 struct gl_context *ctx = p->ctx;
347 TNLcontext *tnl = TNL_CONTEXT(ctx);
348 struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
349 GLuint j = 0;
350
351 struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
352 struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
353 struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
354 struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
355 struct x86_reg temp = x86_make_reg(file_XMM, 0);
356 struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
357 struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
358 struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
359 GLubyte *fixup, *label;
360
361 /* Push a few regs?
362 */
363 x86_push(&p->func, countEBP);
364 x86_push(&p->func, vtxESI);
365
366
367 /* Get vertex count, compare to zero
368 */
369 x86_xor(&p->func, srcECX, srcECX);
370 x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
371 x86_cmp(&p->func, countEBP, srcECX);
372 fixup = x86_jcc_forward(&p->func, cc_E);
373
374 /* Initialize destination register.
375 */
376 x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
377
378 /* Dereference ctx to get tnl, then vtx:
379 */
380 x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
381 x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
382 vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));
383
384
385 /* Possibly load vp0, vp1 for viewport calcs:
386 */
387 if (vtx->need_viewport) {
388 sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
389 sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
390 }
391
392 /* always load, needed or not:
393 */
394 sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
395 sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));
396
397 /* Note address for loop jump */
398 label = x86_get_label(&p->func);
399
400 /* Emit code for each of the attributes. Currently routes
401 * everything through SSE registers, even when it might be more
402 * efficient to stick with regular old x86. No optimization or
403 * other tricks - enough new ground to cover here just getting
404 * things working.
405 */
406 while (j < vtx->attr_count) {
407 struct tnl_clipspace_attr *a = &vtx->attr[j];
408 struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
409
410 /* Now, load an XMM reg from src, perhaps transform, then save.
411 * Could be shortcircuited in specific cases:
412 */
413 switch (a->format) {
414 case EMIT_1F:
415 get_src_ptr(p, srcECX, vtxESI, a);
416 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
417 emit_store(p, dest, 1, temp);
418 update_src_ptr(p, srcECX, vtxESI, a);
419 break;
420 case EMIT_2F:
421 get_src_ptr(p, srcECX, vtxESI, a);
422 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
423 emit_store(p, dest, 2, temp);
424 update_src_ptr(p, srcECX, vtxESI, a);
425 break;
426 case EMIT_3F:
427 /* Potentially the worst case - hardcode 2+1 copying:
428 */
429 if (0) {
430 get_src_ptr(p, srcECX, vtxESI, a);
431 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
432 emit_store(p, dest, 3, temp);
433 update_src_ptr(p, srcECX, vtxESI, a);
434 }
435 else {
436 get_src_ptr(p, srcECX, vtxESI, a);
437 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
438 emit_store(p, dest, 2, temp);
439 if (a->inputsize > 2) {
440 emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
441 emit_store(p, x86_make_disp(dest,8), 1, temp);
442 }
443 else {
444 sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
445 }
446 update_src_ptr(p, srcECX, vtxESI, a);
447 }
448 break;
449 case EMIT_4F:
450 get_src_ptr(p, srcECX, vtxESI, a);
451 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
452 emit_store(p, dest, 4, temp);
453 update_src_ptr(p, srcECX, vtxESI, a);
454 break;
455 case EMIT_2F_VIEWPORT:
456 get_src_ptr(p, srcECX, vtxESI, a);
457 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
458 sse_mulps(&p->func, temp, vp0);
459 sse_addps(&p->func, temp, vp1);
460 emit_store(p, dest, 2, temp);
461 update_src_ptr(p, srcECX, vtxESI, a);
462 break;
463 case EMIT_3F_VIEWPORT:
464 get_src_ptr(p, srcECX, vtxESI, a);
465 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
466 sse_mulps(&p->func, temp, vp0);
467 sse_addps(&p->func, temp, vp1);
468 emit_store(p, dest, 3, temp);
469 update_src_ptr(p, srcECX, vtxESI, a);
470 break;
471 case EMIT_4F_VIEWPORT:
472 get_src_ptr(p, srcECX, vtxESI, a);
473 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
474 sse_mulps(&p->func, temp, vp0);
475 sse_addps(&p->func, temp, vp1);
476 emit_store(p, dest, 4, temp);
477 update_src_ptr(p, srcECX, vtxESI, a);
478 break;
479 case EMIT_3F_XYW:
480 get_src_ptr(p, srcECX, vtxESI, a);
481 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
482 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
483 emit_store(p, dest, 3, temp);
484 update_src_ptr(p, srcECX, vtxESI, a);
485 break;
486
487 case EMIT_1UB_1F:
488 /* Test for PAD3 + 1UB:
489 */
490 if (j > 0 &&
491 a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
492 {
493 get_src_ptr(p, srcECX, vtxESI, a);
494 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
495 sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
496 emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
497 update_src_ptr(p, srcECX, vtxESI, a);
498 }
499 else {
500 printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
501 return GL_FALSE;
502 }
503 break;
504 case EMIT_3UB_3F_RGB:
505 case EMIT_3UB_3F_BGR:
506 /* Test for 3UB + PAD1:
507 */
508 if (j == vtx->attr_count - 1 ||
509 a[1].vertoffset >= a->vertoffset + 4) {
510 get_src_ptr(p, srcECX, vtxESI, a);
511 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
512 if (a->format == EMIT_3UB_3F_BGR)
513 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
514 emit_pack_store_4ub(p, dest, temp);
515 update_src_ptr(p, srcECX, vtxESI, a);
516 }
517 /* Test for 3UB + 1UB:
518 */
519 else if (j < vtx->attr_count - 1 &&
520 a[1].format == EMIT_1UB_1F &&
521 a[1].vertoffset == a->vertoffset + 3) {
522 get_src_ptr(p, srcECX, vtxESI, a);
523 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
524 update_src_ptr(p, srcECX, vtxESI, a);
525
526 /* Make room for incoming value:
527 */
528 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
529
530 get_src_ptr(p, srcECX, vtxESI, &a[1]);
531 emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
532 sse_movss(&p->func, temp, temp2);
533 update_src_ptr(p, srcECX, vtxESI, &a[1]);
534
535 /* Rearrange and possibly do BGR conversion:
536 */
537 if (a->format == EMIT_3UB_3F_BGR)
538 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
539 else
540 sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
541
542 emit_pack_store_4ub(p, dest, temp);
543 j++; /* NOTE: two attrs consumed */
544 }
545 else {
546 printf("Can't emit 3ub\n");
547 return GL_FALSE; /* add this later */
548 }
549 break;
550
551 case EMIT_4UB_4F_RGBA:
552 get_src_ptr(p, srcECX, vtxESI, a);
553 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
554 emit_pack_store_4ub(p, dest, temp);
555 update_src_ptr(p, srcECX, vtxESI, a);
556 break;
557 case EMIT_4UB_4F_BGRA:
558 get_src_ptr(p, srcECX, vtxESI, a);
559 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
560 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
561 emit_pack_store_4ub(p, dest, temp);
562 update_src_ptr(p, srcECX, vtxESI, a);
563 break;
564 case EMIT_4UB_4F_ARGB:
565 get_src_ptr(p, srcECX, vtxESI, a);
566 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
567 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
568 emit_pack_store_4ub(p, dest, temp);
569 update_src_ptr(p, srcECX, vtxESI, a);
570 break;
571 case EMIT_4UB_4F_ABGR:
572 get_src_ptr(p, srcECX, vtxESI, a);
573 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
574 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
575 emit_pack_store_4ub(p, dest, temp);
576 update_src_ptr(p, srcECX, vtxESI, a);
577 break;
578 case EMIT_4CHAN_4F_RGBA:
579 switch (CHAN_TYPE) {
580 case GL_UNSIGNED_BYTE:
581 get_src_ptr(p, srcECX, vtxESI, a);
582 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
583 emit_pack_store_4ub(p, dest, temp);
584 update_src_ptr(p, srcECX, vtxESI, a);
585 break;
586 case GL_FLOAT:
587 get_src_ptr(p, srcECX, vtxESI, a);
588 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
589 emit_store(p, dest, 4, temp);
590 update_src_ptr(p, srcECX, vtxESI, a);
591 break;
592 case GL_UNSIGNED_SHORT:
593 default:
594 printf("unknown CHAN_TYPE %s\n", _mesa_enum_to_string(CHAN_TYPE));
595 return GL_FALSE;
596 }
597 break;
598 default:
599 printf("unknown a[%d].format %d\n", j, a->format);
600 return GL_FALSE; /* catch any new opcodes */
601 }
602
603 /* Increment j by at least 1 - may have been incremented above also:
604 */
605 j++;
606 }
607
608 /* Next vertex:
609 */
610 x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));
611
612 /* decr count, loop if not zero
613 */
614 x86_dec(&p->func, countEBP);
615 x86_test(&p->func, countEBP, countEBP);
616 x86_jcc(&p->func, cc_NZ, label);
617
618 /* Exit mmx state?
619 */
620 if (p->func.need_emms)
621 mmx_emms(&p->func);
622
623 /* Land forward jump here:
624 */
625 x86_fixup_fwd_jump(&p->func, fixup);
626
627 /* Pop regs and return
628 */
629 x86_pop(&p->func, x86_get_base_reg(vtxESI));
630 x86_pop(&p->func, countEBP);
631 x86_ret(&p->func);
632
633 assert(!vtx->emit);
634 vtx->emit = (tnl_emit_func)x86_get_func(&p->func);
635
636 assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
637 return GL_TRUE;
638 }
639
640
641
_tnl_generate_sse_emit(struct gl_context * ctx)642 void _tnl_generate_sse_emit( struct gl_context *ctx )
643 {
644 struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
645 struct x86_program p;
646
647 if (!cpu_has_xmm) {
648 vtx->codegen_emit = NULL;
649 return;
650 }
651
652 memset(&p, 0, sizeof(p));
653
654 p.ctx = ctx;
655 p.inputs_safe = 0; /* for now */
656 p.outputs_safe = 0; /* for now */
657 p.have_sse2 = cpu_has_xmm2;
658 p.identity = x86_make_reg(file_XMM, 6);
659 p.chan0 = x86_make_reg(file_XMM, 7);
660
661 if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
662 vtx->emit = NULL;
663 return;
664 }
665
666 if (build_vertex_emit(&p)) {
667 _tnl_register_fastpath( vtx, GL_TRUE );
668 }
669 else {
670 /* Note the failure so that we don't keep trying to codegen an
671 * impossible state:
672 */
673 _tnl_register_fastpath( vtx, GL_FALSE );
674 x86_release_func(&p.func);
675 }
676 }
677
678 #else
679
/**
 * Dummy version for when USE_SSE_ASM is not defined: generates nothing.
 */
void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   (void) ctx;
}
684
685 #endif
686