• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /* FF is big and ugly so feel free to write lines as long as you like.
3  * Aieeeeeeeee !
4  *
5  * Let me make that clearer:
6  * Aieeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee ! !! !!!
7  */
8 
9 #include "device9.h"
10 #include "basetexture9.h"
11 #include "vertexdeclaration9.h"
12 #include "vertexshader9.h"
13 #include "pixelshader9.h"
14 #include "nine_ff.h"
15 #include "nine_defines.h"
16 #include "nine_helpers.h"
17 #include "nine_pipe.h"
18 #include "nine_dump.h"
19 
20 #include "pipe/p_context.h"
21 #include "tgsi/tgsi_ureg.h"
22 #include "tgsi/tgsi_dump.h"
23 #include "util/bitscan.h"
24 #include "util/u_box.h"
25 #include "util/u_hash_table.h"
26 #include "util/u_upload_mgr.h"
27 
/* Debug channel selected for this file (presumably consumed by the nine
 * debug macros -- confirm against the debug header). */
#define DBG_CHANNEL DBG_FF

/* Size of the fixed-function VS constant array; the constants layout comment
 * in this file places the last slot (UCP7) at index 203. */
#define NINE_FF_NUM_VS_CONST 204
/* Size of the fixed-function PS constant array (layout not shown here). */
#define NINE_FF_NUM_PS_CONST 24
32 
/* Plain four-component float vector. */
struct fvec4
{
    float x;
    float y;
    float z;
    float w;
};
37 
/*
 * Key describing one fixed-function vertex shader variant.
 *
 * The named bitfields overlay six 32-bit words (value32) / three 64-bit
 * words (value64). The hash and compare helpers in this file operate on the
 * raw words, so padding bits take part in hashing and comparison as well.
 */
struct nine_ff_vs_key
{
    union {
        struct {
            /* word 0: feature switches */
            uint32_t position_t          : 1;
            uint32_t lighting            : 1;
            uint32_t darkness            : 1; /* lighting enabled but no active lights */
            uint32_t localviewer         : 1;
            uint32_t vertexpointsize     : 1;
            uint32_t pointscale          : 1;
            uint32_t vertexblend         : 3;
            uint32_t vertexblend_indexed : 1;
            uint32_t vertextween         : 1;
            uint32_t mtl_diffuse         : 2; /* 0 = material, 1 = color1, 2 = color2 */
            uint32_t mtl_ambient         : 2;
            uint32_t mtl_specular        : 2;
            uint32_t mtl_emissive        : 2;
            uint32_t fog_mode            : 2;
            uint32_t fog_range           : 1;
            uint32_t color0in_one        : 1;
            uint32_t color1in_zero       : 1;
            uint32_t has_normal          : 1;
            uint32_t fog                 : 1;
            uint32_t normalizenormals    : 1;
            uint32_t ucp                 : 1;
            uint32_t pad1                : 4;

            /* word 1: texcoord input dimension, stored minus one */
            uint32_t tc_dim_input        : 16; /* 8 * 2 bits */
            uint32_t pad2                : 16;

            /* word 2: texcoord output dimension, 0 = no transform */
            uint32_t tc_dim_output       : 24; /* 8 * 3 bits */
            uint32_t pad3                : 8;

            /* word 3: texcoord generation mode per stage */
            uint32_t tc_gen              : 24; /* 8 * 3 bits */
            uint32_t pad4                : 8;

            /* word 4: texcoord index per stage, plus clip plane emulation */
            uint32_t tc_idx              : 24;
            uint32_t clipplane_emulate   : 8;

            /* word 5: bitmask tested with (1 << NINE_DECLUSAGE_*) */
            uint32_t passthrough;
        };
        uint64_t value64[3]; /* don't forget to resize VertexShader9.ff_key */
        uint32_t value32[6];
    };
};
78 
/* Key describing one fixed-function pixel shader variant.
 *
 * Per texture stage (ts[i]), packed into exactly one 32-bit word:
 *
 * colorop       D3DTOP 5 bit
 * alphaop       D3DTOP 5 bit
 * colorarg0..2  D3DTA  3 bit each
 * alphaarg0..2  D3DTA  3 bit each
 * resultarg     D3DTA  1 bit (CURRENT:0 or TEMP:1)
 * textarget            2 bit (1D/2D/3D/CUBE)
 * pad                  1 bit
 * ===========================
 *                     32 bit per stage
 *
 * The eight stage words are followed by one word of global flags and
 * twelve bytes of byte arrays, for 12 32-bit words in total; value32 and
 * value64 alias the same storage so the key can be hashed and compared as
 * raw words.
 */
struct nine_ff_ps_key
{
    union {
        struct {
            /* words 0..7: one packed word per texture stage */
            struct {
                uint32_t colorop   : 5;
                uint32_t alphaop   : 5;
                uint32_t colorarg0 : 3;
                uint32_t colorarg1 : 3;
                uint32_t colorarg2 : 3;
                uint32_t alphaarg0 : 3;
                uint32_t alphaarg1 : 3;
                uint32_t alphaarg2 : 3;
                uint32_t resultarg : 1; /* CURRENT:0 or TEMP:1 */
                uint32_t textarget : 2; /* 1D/2D/3D/CUBE */
                uint32_t pad       : 1;
                /* that's 32 bit exactly */
            } ts[8];

            /* word 8: global state */
            uint32_t projected            : 16;
            uint32_t fog                  : 1; /* for vFog coming from VS */
            uint32_t fog_mode             : 2;
            uint32_t fog_source           : 1; /* 0: Z, 1: W */
            uint32_t specular             : 1;
            uint32_t alpha_test_emulation : 3;
            uint32_t flatshade            : 1;
            uint32_t pad1                 : 7; /* 9 32-bit words with this */

            /* words 9..11: presumably extra D3DTA bits (bit 4 / bit 5) of
             * arg0..2, one bit per stage -- confirm where the key is filled */
            uint8_t colorarg_b4[3];
            uint8_t colorarg_b5[3];
            uint8_t alphaarg_b4[3]; /* 11 32-bit words plus a byte */
            uint8_t pad2[3];        /* rounds up to 12 32-bit words */
        };
        uint64_t value64[6]; /* don't forget to resize PixelShader9.ff_key */
        uint32_t value32[12];
    };
};
129 
nine_ff_vs_key_hash(const void * key)130 static uint32_t nine_ff_vs_key_hash(const void *key)
131 {
132     const struct nine_ff_vs_key *vs = key;
133     unsigned i;
134     uint32_t hash = vs->value32[0];
135     for (i = 1; i < ARRAY_SIZE(vs->value32); ++i)
136         hash ^= vs->value32[i];
137     return hash;
138 }
nine_ff_vs_key_comp(const void * key1,const void * key2)139 static bool nine_ff_vs_key_comp(const void *key1, const void *key2)
140 {
141     struct nine_ff_vs_key *a = (struct nine_ff_vs_key *)key1;
142     struct nine_ff_vs_key *b = (struct nine_ff_vs_key *)key2;
143 
144     return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
145 }
nine_ff_ps_key_hash(const void * key)146 static uint32_t nine_ff_ps_key_hash(const void *key)
147 {
148     const struct nine_ff_ps_key *ps = key;
149     unsigned i;
150     uint32_t hash = ps->value32[0];
151     for (i = 1; i < ARRAY_SIZE(ps->value32); ++i)
152         hash ^= ps->value32[i];
153     return hash;
154 }
nine_ff_ps_key_comp(const void * key1,const void * key2)155 static bool nine_ff_ps_key_comp(const void *key1, const void *key2)
156 {
157     struct nine_ff_ps_key *a = (struct nine_ff_ps_key *)key1;
158     struct nine_ff_ps_key *b = (struct nine_ff_ps_key *)key2;
159 
160     return memcmp(a->value64, b->value64, sizeof(a->value64)) == 0;
161 }
nine_ff_fvf_key_hash(const void * key)162 static uint32_t nine_ff_fvf_key_hash(const void *key)
163 {
164     return *(DWORD *)key;
165 }
nine_ff_fvf_key_comp(const void * key1,const void * key2)166 static bool nine_ff_fvf_key_comp(const void *key1, const void *key2)
167 {
168     return *(DWORD *)key1 == *(DWORD *)key2;
169 }
170 
/* Forward declarations of file-local helpers defined elsewhere in this file. */
static void nine_ff_prune_vs(struct NineDevice9 *device);
static void nine_ff_prune_ps(struct NineDevice9 *device);
173 
/* Dump the TGSI tokens of `ureg` when the NINE_FF_DUMP debug option is set
 * or when the caller forces it through `override`. */
static void nine_ureg_tgsi_dump(struct ureg_program *ureg, bool override)
{
    const struct tgsi_token *tokens;

    /* The debug option is queried first, exactly as before. */
    if (!debug_get_bool_option("NINE_FF_DUMP", false) && !override)
        return;

    tokens = ureg_get_tokens(ureg, NULL);
    tgsi_dump(tokens, 0);
    ureg_free_tokens(tokens);
}
182 
/* Replicate a single component of a ureg_src across all channels. */
#define _XXXX(r) ureg_scalar(r, TGSI_SWIZZLE_X)
#define _YYYY(r) ureg_scalar(r, TGSI_SWIZZLE_Y)
#define _ZZZZ(r) ureg_scalar(r, TGSI_SWIZZLE_Z)
#define _WWWW(r) ureg_scalar(r, TGSI_SWIZZLE_W)

/* Identity: use the source as it is. */
#define _XYZW(r) (r)

/* Same as the replicating forms, but the argument is first converted
 * with ureg_src(). */
#define _X(r) _XXXX(ureg_src(r))
#define _Y(r) _YYYY(ureg_src(r))
#define _Z(r) _ZZZZ(ureg_src(r))
#define _W(r) _WWWW(ureg_src(r))

/* Constant slot n; relies on a variable named `ureg` at the point of use. */
#define _CONST(n) ureg_DECL_constant(ureg, n)

/* Material constants start at constant slot 19. */
#define MATERIAL_CONST(i) \
    _CONST(19 + (i))

/* AL should contain base address of lights table. */
#define LIGHT_CONST(i)                                                \
    ureg_src_indirect(_CONST(i), _X(AL))
203 
/* VS FF constants layout:
 *
 * CONST[ 0.. 3] D3DTS_WORLD * D3DTS_VIEW * D3DTS_PROJECTION
 * CONST[ 4.. 7] D3DTS_WORLD * D3DTS_VIEW
 * CONST[ 8..11] D3DTS_PROJECTION
 * CONST[12..15] D3DTS_VIEW^(-1)
 * CONST[16..18] Normal matrix
 *
 * CONST[19].xyz  MATERIAL.Emissive + MATERIAL.Ambient * RS.Ambient
 * CONST[20]      MATERIAL.Diffuse
 * CONST[21]      MATERIAL.Ambient
 * CONST[22]      MATERIAL.Specular
 * CONST[23].x___ MATERIAL.Power
 * CONST[24]      MATERIAL.Emissive
 * CONST[25]      RS.Ambient
 *
 * CONST[26].x___ RS.PointSizeMin
 * CONST[26]._y__ RS.PointSizeMax
 * CONST[26].__z_ RS.PointSize
 * CONST[26].___w RS.PointScaleA
 * CONST[27].x___ RS.PointScaleB
 * CONST[27]._y__ RS.PointScaleC
 *
 * CONST[28].x___ RS.FogEnd
 * CONST[28]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
 * CONST[28].__z_ RS.FogDensity
 *
 * CONST[30].x___ TWEENFACTOR
 *
 * CONST[32].x___ LIGHT[0].Type
 * CONST[32]._yzw LIGHT[0].Attenuation0,1,2
 * CONST[33]      LIGHT[0].Diffuse
 * CONST[34]      LIGHT[0].Specular
 * CONST[35]      LIGHT[0].Ambient
 * CONST[36].xyz_ LIGHT[0].Position
 * CONST[36].___w LIGHT[0].Range
 * CONST[37].xyz_ LIGHT[0].Direction
 * CONST[37].___w LIGHT[0].Falloff
 * CONST[38].x___ cos(LIGHT[0].Theta / 2)
 * CONST[38]._y__ cos(LIGHT[0].Phi / 2)
 * CONST[38].__z_ 1.0f / (cos(LIGHT[0].Theta / 2) - cos(LIGHT[0].Phi / 2))
 * CONST[39].xyz_ LIGHT[0].HalfVector (for directional lights)
 * CONST[39].___w 1 if this is the last active light, 0 if not
 * CONST[40]      LIGHT[1]
 * CONST[48]      LIGHT[2]
 * CONST[56]      LIGHT[3]
 * CONST[64]      LIGHT[4]
 * CONST[72]      LIGHT[5]
 * CONST[80]      LIGHT[6]
 * CONST[88]      LIGHT[7]
 * NOTE: no lighting code is generated if there are no active lights
 *
 * CONST[100].x___ Viewport 2/width
 * CONST[100]._y__ Viewport 2/height
 * CONST[100].__z_ Viewport 1/(zmax - zmin)
 * CONST[100].___w Viewport width
 * CONST[101].x___ Viewport x0
 * CONST[101]._y__ Viewport y0
 * CONST[101].__z_ Viewport z0
 *
 * CONST[128..131] D3DTS_TEXTURE0
 * CONST[132..135] D3DTS_TEXTURE1
 * CONST[136..139] D3DTS_TEXTURE2
 * CONST[140..143] D3DTS_TEXTURE3
 * CONST[144..147] D3DTS_TEXTURE4
 * CONST[148..151] D3DTS_TEXTURE5
 * CONST[152..155] D3DTS_TEXTURE6
 * CONST[156..159] D3DTS_TEXTURE7
 *
 * CONST[160] D3DTS_WORLDMATRIX[0] * D3DTS_VIEW
 * CONST[164] D3DTS_WORLDMATRIX[1] * D3DTS_VIEW
 * ...
 * CONST[192] D3DTS_WORLDMATRIX[8] * D3DTS_VIEW
 * CONST[196] UCP0
 * ...
 * CONST[203] UCP7
 */
281 struct vs_build_ctx
282 {
283     struct ureg_program *ureg;
284     const struct nine_ff_vs_key *key;
285 
286     uint16_t input[PIPE_MAX_ATTRIBS];
287     unsigned num_inputs;
288 
289     struct ureg_src aVtx;
290     struct ureg_src aNrm;
291     struct ureg_src aCol[2];
292     struct ureg_src aTex[8];
293     struct ureg_src aPsz;
294     struct ureg_src aInd;
295     struct ureg_src aWgt;
296 
297     struct ureg_src aVtx1; /* tweening */
298     struct ureg_src aNrm1;
299 
300     struct ureg_src mtlA;
301     struct ureg_src mtlD;
302     struct ureg_src mtlS;
303     struct ureg_src mtlE;
304 };
305 
306 static inline unsigned
get_texcoord_sn(struct pipe_screen * screen)307 get_texcoord_sn(struct pipe_screen *screen)
308 {
309     if (screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD))
310         return TGSI_SEMANTIC_TEXCOORD;
311     return TGSI_SEMANTIC_GENERIC;
312 }
313 
314 static inline struct ureg_src
build_vs_add_input(struct vs_build_ctx * vs,uint16_t ndecl)315 build_vs_add_input(struct vs_build_ctx *vs, uint16_t ndecl)
316 {
317     const unsigned i = vs->num_inputs++;
318     assert(i < PIPE_MAX_ATTRIBS);
319     vs->input[i] = ndecl;
320     return ureg_DECL_vs_input(vs->ureg, i);
321 }
322 
323 /* NOTE: dst may alias src */
324 static inline void
ureg_normalize3(struct ureg_program * ureg,struct ureg_dst dst,struct ureg_src src)325 ureg_normalize3(struct ureg_program *ureg,
326                 struct ureg_dst dst, struct ureg_src src)
327 {
328     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
329     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
330 
331     ureg_DP3(ureg, tmp_x, src, src);
332     ureg_RSQ(ureg, tmp_x, _X(tmp));
333     ureg_MUL(ureg, dst, src, _X(tmp));
334     ureg_release_temporary(ureg, tmp);
335 }
336 
337 static void *
nine_ff_build_vs(struct NineDevice9 * device,struct vs_build_ctx * vs)338 nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
339 {
340     const struct nine_ff_vs_key *key = vs->key;
341     struct ureg_program *ureg = ureg_create(PIPE_SHADER_VERTEX);
342     struct ureg_dst oPos, oCol[2], oPsz, oFog;
343     struct ureg_dst AR;
344     unsigned i, c;
345     unsigned label[32], l = 0;
346     bool need_aNrm = key->lighting || key->passthrough & (1 << NINE_DECLUSAGE_NORMAL);
347     bool has_aNrm;
348     bool need_aVtx = key->lighting || key->fog_mode || key->pointscale || key->ucp;
349     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
350 
351     vs->ureg = ureg;
352 
353     /* Check which inputs we should transform. */
354     for (i = 0; i < 8 * 3; i += 3) {
355         switch ((key->tc_gen >> i) & 0x7) {
356         case NINED3DTSS_TCI_CAMERASPACENORMAL:
357             need_aNrm = true;
358             break;
359         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
360             need_aVtx = true;
361             break;
362         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
363             need_aVtx = need_aNrm = true;
364             break;
365         case NINED3DTSS_TCI_SPHEREMAP:
366             need_aVtx = need_aNrm = true;
367             break;
368         default:
369             break;
370         }
371     }
372 
373     has_aNrm = need_aNrm && key->has_normal;
374 
375     /* Declare and record used inputs (needed for linkage with vertex format):
376      * (texture coordinates handled later)
377      */
378     vs->aVtx = build_vs_add_input(vs,
379         key->position_t ? NINE_DECLUSAGE_POSITIONT : NINE_DECLUSAGE_POSITION);
380 
381     vs->aNrm = ureg_imm1f(ureg, 0.0f);
382     if (has_aNrm)
383         vs->aNrm = build_vs_add_input(vs, NINE_DECLUSAGE_NORMAL);
384 
385     vs->aCol[0] = ureg_imm1f(ureg, 1.0f);
386     vs->aCol[1] = ureg_imm1f(ureg, 0.0f);
387 
388     if (key->lighting || key->darkness) {
389         const unsigned mask = key->mtl_diffuse | key->mtl_specular |
390                               key->mtl_ambient | key->mtl_emissive;
391         if ((mask & 0x1) && !key->color0in_one)
392             vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
393         if ((mask & 0x2) && !key->color1in_zero)
394             vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
395 
396         vs->mtlD = MATERIAL_CONST(1);
397         vs->mtlA = MATERIAL_CONST(2);
398         vs->mtlS = MATERIAL_CONST(3);
399         vs->mtlE = MATERIAL_CONST(5);
400         if (key->mtl_diffuse  == 1) vs->mtlD = vs->aCol[0]; else
401         if (key->mtl_diffuse  == 2) vs->mtlD = vs->aCol[1];
402         if (key->mtl_ambient  == 1) vs->mtlA = vs->aCol[0]; else
403         if (key->mtl_ambient  == 2) vs->mtlA = vs->aCol[1];
404         if (key->mtl_specular == 1) vs->mtlS = vs->aCol[0]; else
405         if (key->mtl_specular == 2) vs->mtlS = vs->aCol[1];
406         if (key->mtl_emissive == 1) vs->mtlE = vs->aCol[0]; else
407         if (key->mtl_emissive == 2) vs->mtlE = vs->aCol[1];
408     } else {
409         if (!key->color0in_one) vs->aCol[0] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 0));
410         if (!key->color1in_zero) vs->aCol[1] = build_vs_add_input(vs, NINE_DECLUSAGE_i(COLOR, 1));
411     }
412 
413     if (key->vertexpointsize)
414         vs->aPsz = build_vs_add_input(vs, NINE_DECLUSAGE_PSIZE);
415 
416     if (key->vertexblend_indexed || key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES))
417         vs->aInd = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDINDICES);
418     if (key->vertexblend || key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT))
419         vs->aWgt = build_vs_add_input(vs, NINE_DECLUSAGE_BLENDWEIGHT);
420     if (key->vertextween) {
421         vs->aVtx1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(POSITION,1));
422         vs->aNrm1 = build_vs_add_input(vs, NINE_DECLUSAGE_i(NORMAL,1));
423     }
424 
425     /* Declare outputs:
426      */
427     oPos = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); /* HPOS */
428     oCol[0] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0));
429     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
430     if (key->fog || key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
431         oFog = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 16);
432         oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
433     }
434 
435     if (key->vertexpointsize || key->pointscale || device->driver_caps.always_output_pointsize) {
436         oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
437                                        TGSI_WRITEMASK_X, 0, 1);
438         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
439     }
440 
441     if (key->lighting || key->vertexblend)
442         AR = ureg_DECL_address(ureg);
443 
444     /* === Vertex transformation / vertex blending:
445      */
446 
447     if (key->position_t) {
448         if (device->driver_caps.window_space_position_support) {
449             ureg_MOV(ureg, oPos, vs->aVtx);
450         } else {
451             struct ureg_dst tmp = ureg_DECL_temporary(ureg);
452             /* vs->aVtx contains the coordinates buffer wise.
453             * later in the pipeline, clipping, viewport and division
454             * by w (rhw = 1/w) are going to be applied, so do the reverse
455             * of these transformations (except clipping) to have the good
456             * position at the end.*/
457             ureg_MOV(ureg, tmp, vs->aVtx);
458             /* X from [X_min, X_min + width] to [-1, 1], same for Y. Z to [0, 1] */
459             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), ureg_negate(_CONST(101)));
460             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _CONST(100));
461             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
462             /* Y needs to be reversed */
463             ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_negate(ureg_src(tmp)));
464             /* Replace w by 1 if it equals to 0 */
465             ureg_CMP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), ureg_negate(ureg_abs(ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_W))),
466                      ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_W), ureg_imm1f(ureg, 1.0f));
467             /* inverse rhw */
468             ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_W), _W(tmp));
469             /* multiply X, Y, Z by w */
470             ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(tmp), _W(tmp));
471             ureg_MOV(ureg, oPos, ureg_src(tmp));
472             ureg_release_temporary(ureg, tmp);
473         }
474     } else if (key->vertexblend) {
475         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
476         struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
477         struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
478         struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
479         struct ureg_dst sum_blendweights = ureg_DECL_temporary(ureg);
480         struct ureg_src cWM[4];
481 
482         for (i = 160; i <= 195; ++i)
483             ureg_DECL_constant(ureg, i);
484 
485         /* translate world matrix index to constant file index */
486         if (key->vertexblend_indexed) {
487             ureg_MAD(ureg, tmp, vs->aInd, ureg_imm1f(ureg, 4.0f), ureg_imm1f(ureg, 160.0f));
488             ureg_ARL(ureg, AR, ureg_src(tmp));
489         }
490 
491         ureg_MOV(ureg, aVtx_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
492         ureg_MOV(ureg, aNrm_dst, ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 0.0f));
493         ureg_MOV(ureg, sum_blendweights, ureg_imm4f(ureg, 1.0f, 1.0f, 1.0f, 1.0f));
494 
495         for (i = 0; i < key->vertexblend; ++i) {
496             for (c = 0; c < 4; ++c) {
497                 cWM[c] = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, (160 + i * 4) * !key->vertexblend_indexed + c), 0);
498                 if (key->vertexblend_indexed)
499                     cWM[c] = ureg_src_indirect(cWM[c], ureg_scalar(ureg_src(AR), i));
500             }
501 
502             /* multiply by WORLD(index) */
503             ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), cWM[0]);
504             ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), cWM[1], ureg_src(tmp));
505             ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), cWM[2], ureg_src(tmp));
506             ureg_MAD(ureg, tmp, _WWWW(vs->aVtx), cWM[3], ureg_src(tmp));
507 
508             if (has_aNrm) {
509                 /* Note: the spec says the transpose of the inverse of the
510                  * WorldView matrices should be used, but all tests show
511                  * otherwise.
512                  * Only case unknown: D3DVBF_0WEIGHTS */
513                 ureg_MUL(ureg, tmp2, _XXXX(vs->aNrm), cWM[0]);
514                 ureg_MAD(ureg, tmp2, _YYYY(vs->aNrm), cWM[1], ureg_src(tmp2));
515                 ureg_MAD(ureg, tmp2, _ZZZZ(vs->aNrm), cWM[2], ureg_src(tmp2));
516             }
517 
518             if (i < (key->vertexblend - 1)) {
519                 /* accumulate weighted position value */
520                 ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(vs->aWgt, i), ureg_src(aVtx_dst));
521                 if (has_aNrm)
522                     ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(vs->aWgt, i), ureg_src(aNrm_dst));
523                 /* subtract weighted position value for last value */
524                 ureg_ADD(ureg, sum_blendweights, ureg_src(sum_blendweights), ureg_negate(ureg_scalar(vs->aWgt, i)));
525             }
526         }
527 
528         /* the last weighted position is always 1 - sum_of_previous_weights */
529         ureg_MAD(ureg, aVtx_dst, ureg_src(tmp), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aVtx_dst));
530         if (has_aNrm)
531             ureg_MAD(ureg, aNrm_dst, ureg_src(tmp2), ureg_scalar(ureg_src(sum_blendweights), key->vertexblend - 1), ureg_src(aNrm_dst));
532 
533         /* multiply by VIEW_PROJ */
534         ureg_MUL(ureg, tmp, _X(aVtx_dst), _CONST(8));
535         ureg_MAD(ureg, tmp, _Y(aVtx_dst), _CONST(9),  ureg_src(tmp));
536         ureg_MAD(ureg, tmp, _Z(aVtx_dst), _CONST(10), ureg_src(tmp));
537         ureg_MAD(ureg, oPos, _W(aVtx_dst), _CONST(11), ureg_src(tmp));
538 
539         if (need_aVtx)
540             vs->aVtx = ureg_src(aVtx_dst);
541 
542         ureg_release_temporary(ureg, tmp);
543         ureg_release_temporary(ureg, tmp2);
544         ureg_release_temporary(ureg, sum_blendweights);
545         if (!need_aVtx)
546             ureg_release_temporary(ureg, aVtx_dst);
547 
548         if (has_aNrm) {
549             if (key->normalizenormals)
550                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
551             vs->aNrm = ureg_src(aNrm_dst);
552         } else
553             ureg_release_temporary(ureg, aNrm_dst);
554     } else {
555         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
556 
557         if (key->vertextween) {
558             struct ureg_dst aVtx_dst = ureg_DECL_temporary(ureg);
559             ureg_LRP(ureg, aVtx_dst, _XXXX(_CONST(30)), vs->aVtx1, vs->aVtx);
560             vs->aVtx = ureg_src(aVtx_dst);
561             if (has_aNrm) {
562                 struct ureg_dst aNrm_dst = ureg_DECL_temporary(ureg);
563                 ureg_LRP(ureg, aNrm_dst, _XXXX(_CONST(30)), vs->aNrm1, vs->aNrm);
564                 vs->aNrm = ureg_src(aNrm_dst);
565             }
566         }
567 
568         /* position = vertex * WORLD_VIEW_PROJ */
569         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(0));
570         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(1), ureg_src(tmp));
571         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(2), ureg_src(tmp));
572         ureg_MAD(ureg, oPos, _WWWW(vs->aVtx), _CONST(3), ureg_src(tmp));
573         ureg_release_temporary(ureg, tmp);
574 
575         if (need_aVtx) {
576             struct ureg_dst aVtx_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
577             ureg_MUL(ureg, aVtx_dst, _XXXX(vs->aVtx), _CONST(4));
578             ureg_MAD(ureg, aVtx_dst, _YYYY(vs->aVtx), _CONST(5), ureg_src(aVtx_dst));
579             ureg_MAD(ureg, aVtx_dst, _ZZZZ(vs->aVtx), _CONST(6), ureg_src(aVtx_dst));
580             ureg_MAD(ureg, aVtx_dst, _WWWW(vs->aVtx), _CONST(7), ureg_src(aVtx_dst));
581             vs->aVtx = ureg_src(aVtx_dst);
582         }
583         if (has_aNrm) {
584             struct ureg_dst aNrm_dst = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
585             ureg_MUL(ureg, aNrm_dst, _XXXX(vs->aNrm), _CONST(16));
586             ureg_MAD(ureg, aNrm_dst, _YYYY(vs->aNrm), _CONST(17), ureg_src(aNrm_dst));
587             ureg_MAD(ureg, aNrm_dst, _ZZZZ(vs->aNrm), _CONST(18), ureg_src(aNrm_dst));
588             if (key->normalizenormals)
589                ureg_normalize3(ureg, aNrm_dst, ureg_src(aNrm_dst));
590             vs->aNrm = ureg_src(aNrm_dst);
591         }
592     }
593 
594     /* === Process point size:
595      */
596     if (key->vertexpointsize || key->pointscale || device->driver_caps.always_output_pointsize) {
597         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
598         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
599         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
600         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
601         if (key->vertexpointsize) {
602             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
603             ureg_MAX(ureg, tmp_z, _XXXX(vs->aPsz), _XXXX(cPsz1));
604             ureg_MIN(ureg, tmp_z, _Z(tmp), _YYYY(cPsz1));
605         } else {
606             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
607             ureg_MOV(ureg, tmp_z, _ZZZZ(cPsz1));
608         }
609 
610         if (key->pointscale) {
611             struct ureg_src cPsz1 = ureg_DECL_constant(ureg, 26);
612             struct ureg_src cPsz2 = ureg_DECL_constant(ureg, 27);
613 
614             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
615             ureg_RSQ(ureg, tmp_y, _X(tmp));
616             ureg_MUL(ureg, tmp_y, _Y(tmp), _X(tmp));
617             ureg_CMP(ureg, tmp_y, ureg_negate(_Y(tmp)), _Y(tmp), ureg_imm1f(ureg, 0.0f));
618             ureg_MAD(ureg, tmp_x, _Y(tmp), _YYYY(cPsz2), _XXXX(cPsz2));
619             ureg_MAD(ureg, tmp_x, _Y(tmp), _X(tmp), _WWWW(cPsz1));
620             ureg_RSQ(ureg, tmp_x, _X(tmp));
621             ureg_MUL(ureg, tmp_x, _X(tmp), _Z(tmp));
622             ureg_MUL(ureg, tmp_x, _X(tmp), _WWWW(_CONST(100)));
623             ureg_MAX(ureg, tmp_x, _X(tmp), _XXXX(cPsz1));
624             ureg_MIN(ureg, tmp_z, _X(tmp), _YYYY(cPsz1));
625         }
626 
627         ureg_MOV(ureg, oPsz, _Z(tmp));
628         ureg_release_temporary(ureg, tmp);
629     }
630 
631     for (i = 0; i < 8; ++i) {
632         struct ureg_dst tmp, tmp_x, tmp2;
633         struct ureg_dst oTex, input_coord, transformed, t, aVtx_normed;
634         unsigned c, writemask;
635         const unsigned tci = (key->tc_gen >> (i * 3)) & 0x7;
636         const unsigned idx = (key->tc_idx >> (i * 3)) & 0x7;
637         unsigned dim_input = 1 + ((key->tc_dim_input >> (i * 2)) & 0x3);
638         const unsigned dim_output = (key->tc_dim_output >> (i * 3)) & 0x7;
639 
640         /* No texture output of index s */
641         if (tci == NINED3DTSS_TCI_DISABLE)
642             continue;
643         oTex = ureg_DECL_output(ureg, texcoord_sn, i);
644         tmp = ureg_DECL_temporary(ureg);
645         tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
646         input_coord = ureg_DECL_temporary(ureg);
647         transformed = ureg_DECL_temporary(ureg);
648 
649         /* Get the coordinate */
650         switch (tci) {
651         case NINED3DTSS_TCI_PASSTHRU:
652             /* NINED3DTSS_TCI_PASSTHRU => Use texcoord coming from index idx *
653              * Else the idx is used only to determine wrapping mode. */
654             vs->aTex[idx] = build_vs_add_input(vs, NINE_DECLUSAGE_i(TEXCOORD,idx));
655             ureg_MOV(ureg, input_coord, vs->aTex[idx]);
656             break;
657         case NINED3DTSS_TCI_CAMERASPACENORMAL:
658             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aNrm);
659             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
660             dim_input = 4;
661             break;
662         case NINED3DTSS_TCI_CAMERASPACEPOSITION:
663             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), vs->aVtx);
664             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
665             dim_input = 4;
666             break;
667         case NINED3DTSS_TCI_CAMERASPACEREFLECTIONVECTOR:
668             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
669             aVtx_normed = ureg_DECL_temporary(ureg);
670             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
671             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
672             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
673             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
674             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XYZ), ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
675             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
676             ureg_release_temporary(ureg, aVtx_normed);
677             dim_input = 4;
678             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
679             break;
680         case NINED3DTSS_TCI_SPHEREMAP:
681             /* Implement the formula of GL_SPHERE_MAP */
682             tmp.WriteMask = TGSI_WRITEMASK_XYZ;
683             aVtx_normed = ureg_DECL_temporary(ureg);
684             tmp2 = ureg_DECL_temporary(ureg);
685             ureg_normalize3(ureg, aVtx_normed, vs->aVtx);
686             ureg_DP3(ureg, tmp_x, ureg_src(aVtx_normed), vs->aNrm);
687             ureg_MUL(ureg, tmp, vs->aNrm, _X(tmp));
688             ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_src(tmp));
689             ureg_ADD(ureg, tmp, ureg_src(aVtx_normed), ureg_negate(ureg_src(tmp)));
690             /* now tmp = normed(Vtx) - 2 dot3(normed(Vtx), Nrm) Nrm */
691             ureg_MOV(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_XYZ), ureg_src(tmp));
692             ureg_MUL(ureg, tmp2, ureg_src(tmp2), ureg_src(tmp2));
693             ureg_DP3(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_src(tmp2));
694             ureg_RSQ(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2));
695             ureg_MUL(ureg, ureg_writemask(tmp2, TGSI_WRITEMASK_X), ureg_src(tmp2), ureg_imm1f(ureg, 0.5f));
696             /* tmp2 = 0.5 / sqrt(tmp.x^2 + tmp.y^2 + (tmp.z+1)^2)
697              * TODO: z coordinates are a bit different gl vs d3d, should the formula be adapted ? */
698             ureg_MUL(ureg, tmp, ureg_src(tmp), _X(tmp2));
699             ureg_ADD(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_XY), ureg_src(tmp), ureg_imm1f(ureg, 0.5f));
700             ureg_MOV(ureg, ureg_writemask(input_coord, TGSI_WRITEMASK_ZW), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
701             ureg_release_temporary(ureg, aVtx_normed);
702             ureg_release_temporary(ureg, tmp2);
703             dim_input = 4;
704             tmp.WriteMask = TGSI_WRITEMASK_XYZW;
705             break;
706         default:
707             assert(0);
708             break;
709         }
710 
711         /* Apply the transformation */
712         /* dim_output == 0 => do not transform the components.
713          * XYZRHW also disables transformation */
714         if (!dim_output || key->position_t) {
715             ureg_release_temporary(ureg, transformed);
716             transformed = input_coord;
717             writemask = TGSI_WRITEMASK_XYZW;
718         } else {
719             for (c = 0; c < dim_output; c++) {
720                 t = ureg_writemask(transformed, 1 << c);
721                 switch (dim_input) {
722                 /* dim_input = 1 2 3: -> we add trailing 1 to input*/
723                 case 1: ureg_MAD(ureg, t, _X(input_coord), _XXXX(_CONST(128 + i * 4 + c)), _YYYY(_CONST(128 + i * 4 + c)));
724                         break;
725                 case 2: ureg_DP2(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
726                         ureg_ADD(ureg, t, ureg_src(transformed), _ZZZZ(_CONST(128 + i * 4 + c)));
727                         break;
728                 case 3: ureg_DP3(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c));
729                         ureg_ADD(ureg, t, ureg_src(transformed), _WWWW(_CONST(128 + i * 4 + c)));
730                         break;
731                 case 4: ureg_DP4(ureg, t, ureg_src(input_coord), _CONST(128 + i * 4 + c)); break;
732                 default:
733                     assert(0);
734                 }
735             }
736             writemask = (1 << dim_output) - 1;
737             ureg_release_temporary(ureg, input_coord);
738         }
739 
740         ureg_MOV(ureg, ureg_writemask(oTex, writemask), ureg_src(transformed));
741         ureg_release_temporary(ureg, transformed);
742         ureg_release_temporary(ureg, tmp);
743     }
744 
745     /* === Lighting:
746      *
747      * DIRECTIONAL:  Light at infinite distance, parallel rays, no attenuation.
748      * POINT: Finite distance to scene, divergent rays, isotropic, attenuation.
749      * SPOT: Finite distance, divergent rays, angular dependence, attenuation.
750      *
751      * vec3 normal = normalize(in.Normal * NormalMatrix);
752      * vec3 hitDir = light.direction;
753      * float atten = 1.0;
754      *
755      * if (light.type != DIRECTIONAL)
756      * {
757      *     vec3 hitVec = light.position - eyeVertex;
758      *     float d = length(hitVec);
759      *     hitDir = hitVec / d;
760      *     atten = 1 / ((light.atten2 * d + light.atten1) * d + light.atten0);
761      * }
762      *
763      * if (light.type == SPOTLIGHT)
764      * {
765      *     float rho = dp3(-hitDir, light.direction);
766      *     if (rho < cos(light.phi / 2))
767      *         atten = 0;
768      *     if (rho < cos(light.theta / 2))
769      *         atten *= pow(some_func(rho), light.falloff);
770      * }
771      *
772      * float nDotHit = dp3_sat(normal, hitDir);
773      * float powFact = 0.0;
774      *
775      * if (nDotHit > 0.0)
776      * {
777      *     vec3 midVec = normalize(hitDir + eye);
778      *     float nDotMid = dp3_sat(normal, midVec);
779      *     powFact = pow(nDotMid, material.power);
780      * }
781      *
782      * ambient += light.ambient * atten;
783      * diffuse += light.diffuse * atten * nDotHit;
784      * specular += light.specular * atten * powFact;
785      */
786     if (key->lighting) {
787         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
788         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
789         struct ureg_dst tmp_y = ureg_writemask(tmp, TGSI_WRITEMASK_Y);
790         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
791         struct ureg_dst rAtt = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
792         struct ureg_dst rHit = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
793         struct ureg_dst rMid = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
794 
795         struct ureg_dst rCtr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_W);
796 
797         struct ureg_dst AL = ureg_writemask(AR, TGSI_WRITEMASK_X);
798 
799         /* Light.*.Alpha is not used. */
800         struct ureg_dst rD = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
801         struct ureg_dst rA = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_XYZ);
802         struct ureg_dst rS = ureg_DECL_temporary(ureg);
803 
804         struct ureg_src mtlP = _XXXX(MATERIAL_CONST(4));
805 
806         struct ureg_src cLKind = _XXXX(LIGHT_CONST(0));
807         struct ureg_src cLAtt0 = _YYYY(LIGHT_CONST(0));
808         struct ureg_src cLAtt1 = _ZZZZ(LIGHT_CONST(0));
809         struct ureg_src cLAtt2 = _WWWW(LIGHT_CONST(0));
810         struct ureg_src cLColD = _XYZW(LIGHT_CONST(1));
811         struct ureg_src cLColS = _XYZW(LIGHT_CONST(2));
812         struct ureg_src cLColA = _XYZW(LIGHT_CONST(3));
813         struct ureg_src cLPos  = _XYZW(LIGHT_CONST(4));
814         struct ureg_src cLRng  = _WWWW(LIGHT_CONST(4));
815         struct ureg_src cLDir  = _XYZW(LIGHT_CONST(5));
816         struct ureg_src cLFOff = _WWWW(LIGHT_CONST(5));
817         struct ureg_src cLTht  = _XXXX(LIGHT_CONST(6));
818         struct ureg_src cLPhi  = _YYYY(LIGHT_CONST(6));
819         struct ureg_src cLSDiv = _ZZZZ(LIGHT_CONST(6));
820         struct ureg_src cLLast = _WWWW(LIGHT_CONST(7));
821 
822         const unsigned loop_label = l++;
823 
824         /* Declare all light constants to allow indirect addressing */
825         for (i = 32; i < 96; i++)
826             ureg_DECL_constant(ureg, i);
827 
828         ureg_MOV(ureg, rCtr, ureg_imm1f(ureg, 32.0f)); /* &lightconst(0) */
829         ureg_MOV(ureg, rD, ureg_imm1f(ureg, 0.0f));
830         ureg_MOV(ureg, rA, ureg_imm1f(ureg, 0.0f));
831         ureg_MOV(ureg, rS, ureg_imm1f(ureg, 0.0f));
832 
833         /* loop management */
834         ureg_BGNLOOP(ureg, &label[loop_label]);
835         ureg_ARL(ureg, AL, _W(rCtr));
836 
837         /* if (not DIRECTIONAL light): */
838         ureg_SNE(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_DIRECTIONAL));
839         ureg_MOV(ureg, rHit, ureg_negate(cLDir));
840         ureg_MOV(ureg, rAtt, ureg_imm1f(ureg, 1.0f));
841         ureg_IF(ureg, _X(tmp), &label[l++]);
842         {
843             /* hitDir = light.position - eyeVtx
844              * d = length(hitDir)
845              */
846             ureg_ADD(ureg, rHit, cLPos, ureg_negate(vs->aVtx));
847             ureg_DP3(ureg, tmp_x, ureg_src(rHit), ureg_src(rHit));
848             ureg_RSQ(ureg, tmp_y, _X(tmp));
849             ureg_MUL(ureg, tmp_x, _X(tmp), _Y(tmp)); /* length */
850 
851             /* att = 1.0 / (light.att0 + (light.att1 + light.att2 * d) * d) */
852             ureg_MAD(ureg, rAtt, _X(tmp), cLAtt2, cLAtt1);
853             ureg_MAD(ureg, rAtt, _X(tmp), _W(rAtt), cLAtt0);
854             ureg_RCP(ureg, rAtt, _W(rAtt));
855             /* cut-off if distance exceeds Light.Range */
856             ureg_SLT(ureg, tmp_x, _X(tmp), cLRng);
857             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
858         }
859         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
860         ureg_ENDIF(ureg);
861 
862         /* normalize hitDir */
863         ureg_normalize3(ureg, rHit, ureg_src(rHit));
864 
865         /* if (SPOT light) */
866         ureg_SEQ(ureg, tmp_x, cLKind, ureg_imm1f(ureg, D3DLIGHT_SPOT));
867         ureg_IF(ureg, _X(tmp), &label[l++]);
868         {
869             /* rho = dp3(-hitDir, light.spotDir)
870              *
871              * if (rho  > light.ctht2) NOTE: 0 <= phi <= pi, 0 <= theta <= phi
872              *     spotAtt = 1
873              * else
874              * if (rho <= light.cphi2)
875              *     spotAtt = 0
876              * else
877              *     spotAtt = (rho - light.cphi2) / (light.ctht2 - light.cphi2) ^ light.falloff
878              */
879             ureg_DP3(ureg, tmp_y, ureg_negate(ureg_src(rHit)), cLDir); /* rho */
880             ureg_ADD(ureg, tmp_x, _Y(tmp), ureg_negate(cLPhi));
881             ureg_MUL(ureg, tmp_x, _X(tmp), cLSDiv);
882             ureg_POW(ureg, tmp_x, _X(tmp), cLFOff); /* spotAtten */
883             ureg_SGE(ureg, tmp_z, _Y(tmp), cLTht); /* if inside theta && phi */
884             ureg_SGE(ureg, tmp_y, _Y(tmp), cLPhi); /* if inside phi */
885             ureg_MAD(ureg, ureg_saturate(tmp_x), _X(tmp), _Y(tmp), _Z(tmp));
886             ureg_MUL(ureg, rAtt, _W(rAtt), _X(tmp));
887         }
888         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
889         ureg_ENDIF(ureg);
890 
891         /* directional factors, let's not use LIT because of clarity */
892 
893         if (has_aNrm) {
894             if (key->localviewer) {
895                 ureg_normalize3(ureg, rMid, vs->aVtx);
896                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_negate(ureg_src(rMid)));
897             } else {
898                 ureg_ADD(ureg, rMid, ureg_src(rHit), ureg_imm3f(ureg, 0.0f, 0.0f, -1.0f));
899             }
900             ureg_normalize3(ureg, rMid, ureg_src(rMid));
901             ureg_DP3(ureg, ureg_saturate(tmp_x), vs->aNrm, ureg_src(rHit));
902             ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
903             ureg_MUL(ureg, tmp_z, _X(tmp), _Y(tmp));
904             /* Tests show that specular is computed only if (dp3(normal,hitDir) > 0).
905              * For front facing, it is more restrictive than test (dp3(normal,mid) > 0).
906              * No tests were made for backfacing, so add the two conditions */
907             ureg_IF(ureg, _Z(tmp), &label[l++]);
908             {
909                 ureg_DP3(ureg, ureg_saturate(tmp_y), vs->aNrm, ureg_src(rMid));
910                 ureg_POW(ureg, tmp_y, _Y(tmp), mtlP);
911                 ureg_MUL(ureg, tmp_y, _W(rAtt), _Y(tmp)); /* power factor * att */
912                 ureg_MAD(ureg, rS, cLColS, _Y(tmp), ureg_src(rS)); /* accumulate specular */
913             }
914             ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
915             ureg_ENDIF(ureg);
916 
917             ureg_MUL(ureg, tmp_x, _W(rAtt), _X(tmp)); /* dp3(normal,hitDir) * att */
918             ureg_MAD(ureg, rD, cLColD, _X(tmp), ureg_src(rD)); /* accumulate diffuse */
919         }
920 
921         ureg_MAD(ureg, rA, cLColA, _W(rAtt), ureg_src(rA)); /* accumulate ambient */
922 
923         /* break if this was the last light */
924         ureg_IF(ureg, cLLast, &label[l++]);
925         ureg_BRK(ureg);
926         ureg_ENDIF(ureg);
927         ureg_fixup_label(ureg, label[l-1], ureg_get_instruction_number(ureg));
928 
929         ureg_ADD(ureg, rCtr, _W(rCtr), ureg_imm1f(ureg, 8.0f));
930         ureg_fixup_label(ureg, label[loop_label], ureg_get_instruction_number(ureg));
931         ureg_ENDLOOP(ureg, &label[loop_label]);
932 
933         /* Apply to material:
934          *
935          * oCol[0] = (material.emissive + material.ambient * rs.ambient) +
936          *           material.ambient * ambient +
937          *           material.diffuse * diffuse;
938          * oCol[1] = material.specular * specular;
939          */
940         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
941             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), vs->mtlA, _CONST(19));
942         else {
943             ureg_ADD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), ureg_src(rA), _CONST(25));
944             ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_XYZ), vs->mtlA, ureg_src(tmp), vs->mtlE);
945         }
946 
947         ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), ureg_src(rD), vs->mtlD, ureg_src(tmp));
948         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
949         ureg_MUL(ureg, oCol[1], ureg_src(rS), vs->mtlS);
950         ureg_release_temporary(ureg, rAtt);
951         ureg_release_temporary(ureg, rHit);
952         ureg_release_temporary(ureg, rMid);
953         ureg_release_temporary(ureg, rCtr);
954         ureg_release_temporary(ureg, rD);
955         ureg_release_temporary(ureg, rA);
956         ureg_release_temporary(ureg, rS);
957         ureg_release_temporary(ureg, rAtt);
958         ureg_release_temporary(ureg, tmp);
959     } else
960     /* COLOR */
961     if (key->darkness) {
962         if (key->mtl_emissive == 0 && key->mtl_ambient == 0)
963             ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), _CONST(19));
964         else
965             ureg_MAD(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_XYZ), vs->mtlA, _CONST(25), vs->mtlE);
966         ureg_MOV(ureg, ureg_writemask(oCol[0], TGSI_WRITEMASK_W), vs->mtlD);
967         ureg_MOV(ureg, oCol[1], ureg_imm1f(ureg, 0.0f));
968     } else {
969         ureg_MOV(ureg, oCol[0], vs->aCol[0]);
970         ureg_MOV(ureg, oCol[1], vs->aCol[1]);
971     }
972 
973     /* === Process fog.
974      *
975      * exp(x) = ex2(log2(e) * x)
976      */
977     if (key->fog_mode) {
978         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
979         struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
980         struct ureg_dst tmp_z = ureg_writemask(tmp, TGSI_WRITEMASK_Z);
981         if (key->fog_range) {
982             ureg_DP3(ureg, tmp_x, vs->aVtx, vs->aVtx);
983             ureg_RSQ(ureg, tmp_z, _X(tmp));
984             ureg_MUL(ureg, tmp_z, _Z(tmp), _X(tmp));
985         } else {
986             ureg_MOV(ureg, tmp_z, ureg_abs(_ZZZZ(vs->aVtx)));
987         }
988 
989         if (key->fog_mode == D3DFOG_EXP) {
990             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
991             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
992             ureg_EX2(ureg, tmp_x, _X(tmp));
993         } else
994         if (key->fog_mode == D3DFOG_EXP2) {
995             ureg_MUL(ureg, tmp_x, _Z(tmp), _ZZZZ(_CONST(28)));
996             ureg_MUL(ureg, tmp_x, _X(tmp), _X(tmp));
997             ureg_MUL(ureg, tmp_x, _X(tmp), ureg_imm1f(ureg, -1.442695f));
998             ureg_EX2(ureg, tmp_x, _X(tmp));
999         } else
1000         if (key->fog_mode == D3DFOG_LINEAR) {
1001             ureg_ADD(ureg, tmp_x, _XXXX(_CONST(28)), ureg_negate(_Z(tmp)));
1002             ureg_MUL(ureg, ureg_saturate(tmp_x), _X(tmp), _YYYY(_CONST(28)));
1003         }
1004         ureg_MOV(ureg, oFog, _X(tmp));
1005         ureg_release_temporary(ureg, tmp);
1006     } else if (key->fog && !(key->passthrough & (1 << NINE_DECLUSAGE_FOG))) {
1007         ureg_MOV(ureg, oFog, ureg_scalar(vs->aCol[1], TGSI_SWIZZLE_W));
1008     }
1009 
1010     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDWEIGHT)) {
1011         struct ureg_src input;
1012         struct ureg_dst output;
1013         input = vs->aWgt;
1014         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 19);
1015         ureg_MOV(ureg, output, input);
1016     }
1017     if (key->passthrough & (1 << NINE_DECLUSAGE_BLENDINDICES)) {
1018         struct ureg_src input;
1019         struct ureg_dst output;
1020         input = vs->aInd;
1021         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 20);
1022         ureg_MOV(ureg, output, input);
1023     }
1024     if (key->passthrough & (1 << NINE_DECLUSAGE_NORMAL)) {
1025         struct ureg_src input;
1026         struct ureg_dst output;
1027         input = vs->aNrm;
1028         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 21);
1029         ureg_MOV(ureg, output, input);
1030     }
1031     if (key->passthrough & (1 << NINE_DECLUSAGE_TANGENT)) {
1032         struct ureg_src input;
1033         struct ureg_dst output;
1034         input = build_vs_add_input(vs, NINE_DECLUSAGE_TANGENT);
1035         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 22);
1036         ureg_MOV(ureg, output, input);
1037     }
1038     if (key->passthrough & (1 << NINE_DECLUSAGE_BINORMAL)) {
1039         struct ureg_src input;
1040         struct ureg_dst output;
1041         input = build_vs_add_input(vs, NINE_DECLUSAGE_BINORMAL);
1042         output = ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, 23);
1043         ureg_MOV(ureg, output, input);
1044     }
1045     if (key->passthrough & (1 << NINE_DECLUSAGE_FOG)) {
1046         struct ureg_src input;
1047         struct ureg_dst output;
1048         input = build_vs_add_input(vs, NINE_DECLUSAGE_FOG);
1049         input = ureg_scalar(input, TGSI_SWIZZLE_X);
1050         output = oFog;
1051         ureg_MOV(ureg, output, input);
1052     }
1053     if (key->passthrough & (1 << NINE_DECLUSAGE_DEPTH)) {
1054         (void) 0; /* TODO: replace z of position output ? */
1055     }
1056 
1057     /* ucp for ff applies on world coordinates.
1058      * aVtx is in worldview coordinates. */
1059     if (key->ucp) {
1060         struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1061         ureg_MUL(ureg, tmp, _XXXX(vs->aVtx), _CONST(12));
1062         ureg_MAD(ureg, tmp, _YYYY(vs->aVtx), _CONST(13),  ureg_src(tmp));
1063         ureg_MAD(ureg, tmp, _ZZZZ(vs->aVtx), _CONST(14), ureg_src(tmp));
1064         if (!key->clipplane_emulate) {
1065             struct ureg_dst clipVect = ureg_DECL_output(ureg, TGSI_SEMANTIC_CLIPVERTEX, 0);
1066             ureg_ADD(ureg, clipVect, _CONST(15), ureg_src(tmp));
1067         } else {
1068             struct ureg_dst clipdist[2] = {ureg_dst_undef(), ureg_dst_undef()};
1069             int num_clipdist = ffs(key->clipplane_emulate);
1070             ureg_ADD(ureg, tmp, _CONST(15), ureg_src(tmp));
1071             clipdist[0] = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_CLIPDIST, 0,
1072                                                       ((1 << num_clipdist) - 1) & 0xf, 0, 1);
1073             if (num_clipdist >= 5)
1074                 clipdist[1] = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_CLIPDIST, 1,
1075                                                       ((1 << (num_clipdist - 4)) - 1) & 0xf, 0, 1);
1076             ureg_property(ureg, TGSI_PROPERTY_NUM_CLIPDIST_ENABLED, num_clipdist);
1077             for (i = 0; i < num_clipdist; i++) {
1078                 assert(!ureg_dst_is_undef(clipdist[i>>2]));
1079                 if (!(key->clipplane_emulate & (1 << i)))
1080                     ureg_MOV(ureg, ureg_writemask(clipdist[i>>2], 1 << (i & 0x2)), ureg_imm1f(ureg, 0.f));
1081                 else
1082                     ureg_DP4(ureg, ureg_writemask(clipdist[i>>2], 1 << (i & 0x2)),
1083                              ureg_src(tmp), _CONST(196+i));
1084             }
1085         }
1086         ureg_release_temporary(ureg, tmp);
1087     }
1088 
1089     if (key->position_t && device->driver_caps.window_space_position_support)
1090         ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
1091 
1092     ureg_END(ureg);
1093     nine_ureg_tgsi_dump(ureg, false);
1094     return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
1095 }
1096 
1097 /* PS FF constants layout:
1098  *
1099  * CONST[ 0.. 7]      stage[i].D3DTSS_CONSTANT
1100  * CONST[ 8..15].x___ stage[i].D3DTSS_BUMPENVMAT00
1101  * CONST[ 8..15]._y__ stage[i].D3DTSS_BUMPENVMAT01
1102  * CONST[ 8..15].__z_ stage[i].D3DTSS_BUMPENVMAT10
1103  * CONST[ 8..15].___w stage[i].D3DTSS_BUMPENVMAT11
1104  * CONST[16..19].x_z_ stage[i].D3DTSS_BUMPENVLSCALE
1105  * CONST[16..19]._y_w stage[i].D3DTSS_BUMPENVLOFFSET
1106  *
1107  * CONST[20] D3DRS_TEXTUREFACTOR
1108  * CONST[21] D3DRS_FOGCOLOR
1109  * CONST[22].x___ RS.FogEnd
1110  * CONST[22]._y__ 1.0f / (RS.FogEnd - RS.FogStart)
1111  * CONST[22].__z_ RS.FogDensity
1112  * CONST[22].___w Alpha ref
1113  */
1114 struct ps_build_ctx
1115 {
1116     struct ureg_program *ureg;
1117     unsigned color_interpolate_flag;
1118 
1119     struct ureg_src vC[2]; /* DIFFUSE, SPECULAR */
1120     struct ureg_src vT[8]; /* TEXCOORD[i] */
1121     struct ureg_dst rCur; /* D3DTA_CURRENT */
1122     struct ureg_dst rMod;
1123     struct ureg_src rCurSrc;
1124     struct ureg_dst rTmp; /* D3DTA_TEMP */
1125     struct ureg_src rTmpSrc;
1126     struct ureg_dst rTex;
1127     struct ureg_src rTexSrc;
1128     struct ureg_src cBEM[8];
1129     struct ureg_src s[8];
1130 
1131     struct {
1132         unsigned index;
1133         unsigned index_pre_mod;
1134     } stage;
1135 };
1136 
1137 static struct ureg_src
ps_get_ts_arg(struct ps_build_ctx * ps,unsigned ta)1138 ps_get_ts_arg(struct ps_build_ctx *ps, unsigned ta)
1139 {
1140     struct ureg_src reg;
1141 
1142     switch (ta & D3DTA_SELECTMASK) {
1143     case D3DTA_CONSTANT:
1144         reg = ureg_DECL_constant(ps->ureg, ps->stage.index);
1145         break;
1146     case D3DTA_CURRENT:
1147         reg = (ps->stage.index == ps->stage.index_pre_mod) ? ureg_src(ps->rMod) : ps->rCurSrc;
1148         break;
1149     case D3DTA_DIFFUSE:
1150         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 0, ps->color_interpolate_flag);
1151         break;
1152     case D3DTA_SPECULAR:
1153         reg = ureg_DECL_fs_input(ps->ureg, TGSI_SEMANTIC_COLOR, 1, ps->color_interpolate_flag);
1154         break;
1155     case D3DTA_TEMP:
1156         reg = ps->rTmpSrc;
1157         break;
1158     case D3DTA_TEXTURE:
1159         reg = ps->rTexSrc;
1160         break;
1161     case D3DTA_TFACTOR:
1162         reg = ureg_DECL_constant(ps->ureg, 20);
1163         break;
1164     default:
1165         assert(0);
1166         reg = ureg_src_undef();
1167         break;
1168     }
1169     if (ta & D3DTA_COMPLEMENT) {
1170         struct ureg_dst dst = ureg_DECL_temporary(ps->ureg);
1171         ureg_ADD(ps->ureg, dst, ureg_imm1f(ps->ureg, 1.0f), ureg_negate(reg));
1172         reg = ureg_src(dst);
1173     }
1174     if (ta & D3DTA_ALPHAREPLICATE)
1175         reg = _WWWW(reg);
1176     return reg;
1177 }
1178 
1179 static struct ureg_dst
ps_get_ts_dst(struct ps_build_ctx * ps,unsigned ta)1180 ps_get_ts_dst(struct ps_build_ctx *ps, unsigned ta)
1181 {
1182     assert(!(ta & (D3DTA_COMPLEMENT | D3DTA_ALPHAREPLICATE)));
1183 
1184     switch (ta & D3DTA_SELECTMASK) {
1185     case D3DTA_CURRENT:
1186         return ps->rCur;
1187     case D3DTA_TEMP:
1188         return ps->rTmp;
1189     default:
1190         assert(0);
1191         return ureg_dst_undef();
1192     }
1193 }
1194 
/* Report which of the three stage arguments a texture operation is
 * considered to use.
 *
 * top: D3DTOP_* operation
 *
 * Returns a bit mask where bit n stands for argument n; this matches the
 * arg[] indices ps_do_ts_op reads for the select, multiply-add and lerp
 * operations. Every operation without an explicit entry reports arguments
 * 1 and 2.
 */
static uint8_t ps_d3dtop_args_mask(D3DTEXTUREOP top)
{
    const uint8_t uses_arg0 = 1 << 0;
    const uint8_t uses_arg1 = 1 << 1;
    const uint8_t uses_arg2 = 1 << 2;
    uint8_t mask = uses_arg1 | uses_arg2;

    if (top == D3DTOP_DISABLE)
        mask = 0;
    else if (top == D3DTOP_SELECTARG1 || top == D3DTOP_PREMODULATE)
        mask = uses_arg1;
    else if (top == D3DTOP_SELECTARG2)
        mask = uses_arg2;
    else if (top == D3DTOP_MULTIPLYADD || top == D3DTOP_LERP)
        mask = uses_arg0 | uses_arg1 | uses_arg2;

    return mask;
}
1212 
1213 static inline bool
is_MOV_no_op(struct ureg_dst dst,struct ureg_src src)1214 is_MOV_no_op(struct ureg_dst dst, struct ureg_src src)
1215 {
1216     return !dst.WriteMask ||
1217         (dst.File == src.File &&
1218          dst.Index == src.Index &&
1219          !dst.Indirect &&
1220          !dst.Saturate &&
1221          !src.Indirect &&
1222          !src.Negate &&
1223          !src.Absolute &&
1224          (!(dst.WriteMask & TGSI_WRITEMASK_X) || (src.SwizzleX == TGSI_SWIZZLE_X)) &&
1225          (!(dst.WriteMask & TGSI_WRITEMASK_Y) || (src.SwizzleY == TGSI_SWIZZLE_Y)) &&
1226          (!(dst.WriteMask & TGSI_WRITEMASK_Z) || (src.SwizzleZ == TGSI_SWIZZLE_Z)) &&
1227          (!(dst.WriteMask & TGSI_WRITEMASK_W) || (src.SwizzleW == TGSI_SWIZZLE_W)));
1228 
1229 }
1230 
1231 static void
ps_do_ts_op(struct ps_build_ctx * ps,unsigned top,struct ureg_dst dst,struct ureg_src * arg)1232 ps_do_ts_op(struct ps_build_ctx *ps, unsigned top, struct ureg_dst dst, struct ureg_src *arg)
1233 {
1234     struct ureg_program *ureg = ps->ureg;
1235     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1236     struct ureg_dst tmp2 = ureg_DECL_temporary(ureg);
1237     struct ureg_dst tmp_x = ureg_writemask(tmp, TGSI_WRITEMASK_X);
1238 
1239     tmp.WriteMask = dst.WriteMask;
1240 
1241     if (top != D3DTOP_SELECTARG1 && top != D3DTOP_SELECTARG2 &&
1242         top != D3DTOP_MODULATE && top != D3DTOP_PREMODULATE &&
1243         top != D3DTOP_BLENDDIFFUSEALPHA && top != D3DTOP_BLENDTEXTUREALPHA &&
1244         top != D3DTOP_BLENDFACTORALPHA && top != D3DTOP_BLENDCURRENTALPHA &&
1245         top != D3DTOP_BUMPENVMAP && top != D3DTOP_BUMPENVMAPLUMINANCE &&
1246         top != D3DTOP_LERP)
1247         dst = ureg_saturate(dst);
1248 
1249     switch (top) {
1250     case D3DTOP_SELECTARG1:
1251         if (!is_MOV_no_op(dst, arg[1]))
1252             ureg_MOV(ureg, dst, arg[1]);
1253         break;
1254     case D3DTOP_SELECTARG2:
1255         if (!is_MOV_no_op(dst, arg[2]))
1256             ureg_MOV(ureg, dst, arg[2]);
1257         break;
1258     case D3DTOP_MODULATE:
1259         ureg_MUL(ureg, dst, arg[1], arg[2]);
1260         break;
1261     case D3DTOP_MODULATE2X:
1262         ureg_MUL(ureg, tmp, arg[1], arg[2]);
1263         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_src(tmp));
1264         break;
1265     case D3DTOP_MODULATE4X:
1266         ureg_MUL(ureg, tmp, arg[1], arg[2]);
1267         ureg_MUL(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 4.0f));
1268         break;
1269     case D3DTOP_ADD:
1270         ureg_ADD(ureg, dst, arg[1], arg[2]);
1271         break;
1272     case D3DTOP_ADDSIGNED:
1273         ureg_ADD(ureg, tmp, arg[1], arg[2]);
1274         ureg_ADD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, -0.5f));
1275         break;
1276     case D3DTOP_ADDSIGNED2X:
1277         ureg_ADD(ureg, tmp, arg[1], arg[2]);
1278         ureg_MAD(ureg, dst, ureg_src(tmp), ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
1279         break;
1280     case D3DTOP_SUBTRACT:
1281         ureg_ADD(ureg, dst, arg[1], ureg_negate(arg[2]));
1282         break;
1283     case D3DTOP_ADDSMOOTH:
1284         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1285         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], arg[1]);
1286         break;
1287     case D3DTOP_BLENDDIFFUSEALPHA:
1288         ureg_LRP(ureg, dst, _WWWW(ps->vC[0]), arg[1], arg[2]);
1289         break;
1290     case D3DTOP_BLENDTEXTUREALPHA:
1291         /* XXX: alpha taken from previous stage, texture or result ? */
1292         ureg_LRP(ureg, dst, _W(ps->rTex), arg[1], arg[2]);
1293         break;
1294     case D3DTOP_BLENDFACTORALPHA:
1295         ureg_LRP(ureg, dst, _WWWW(_CONST(20)), arg[1], arg[2]);
1296         break;
1297     case D3DTOP_BLENDTEXTUREALPHAPM:
1298         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_W(ps->rTex)));
1299         ureg_MAD(ureg, dst, arg[2], _X(tmp), arg[1]);
1300         break;
1301     case D3DTOP_BLENDCURRENTALPHA:
1302         ureg_LRP(ureg, dst, _WWWW(ps->rCurSrc), arg[1], arg[2]);
1303         break;
1304     case D3DTOP_PREMODULATE:
1305         ureg_MOV(ureg, dst, arg[1]);
1306         ps->stage.index_pre_mod = ps->stage.index + 1;
1307         break;
1308     case D3DTOP_MODULATEALPHA_ADDCOLOR:
1309         ureg_MAD(ureg, dst, _WWWW(arg[1]), arg[2], arg[1]);
1310         break;
1311     case D3DTOP_MODULATECOLOR_ADDALPHA:
1312         ureg_MAD(ureg, dst, arg[1], arg[2], _WWWW(arg[1]));
1313         break;
1314     case D3DTOP_MODULATEINVALPHA_ADDCOLOR:
1315         ureg_ADD(ureg, tmp_x, ureg_imm1f(ureg, 1.0f), ureg_negate(_WWWW(arg[1])));
1316         ureg_MAD(ureg, dst, _X(tmp), arg[2], arg[1]);
1317         break;
1318     case D3DTOP_MODULATEINVCOLOR_ADDALPHA:
1319         ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(arg[1]));
1320         ureg_MAD(ureg, dst, ureg_src(tmp), arg[2], _WWWW(arg[1]));
1321         break;
1322     case D3DTOP_BUMPENVMAP:
1323         break;
1324     case D3DTOP_BUMPENVMAPLUMINANCE:
1325         break;
1326     case D3DTOP_DOTPRODUCT3:
1327         ureg_ADD(ureg, tmp, arg[1], ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1328         ureg_ADD(ureg, tmp2, arg[2] , ureg_imm4f(ureg,-0.5,-0.5,-0.5,-0.5));
1329         ureg_DP3(ureg, tmp, ureg_src(tmp), ureg_src(tmp2));
1330         ureg_MUL(ureg, ureg_saturate(dst), ureg_src(tmp), ureg_imm4f(ureg,4.0,4.0,4.0,4.0));
1331         break;
1332     case D3DTOP_MULTIPLYADD:
1333         ureg_MAD(ureg, dst, arg[1], arg[2], arg[0]);
1334         break;
1335     case D3DTOP_LERP:
1336         ureg_LRP(ureg, dst, arg[0], arg[1], arg[2]);
1337         break;
1338     case D3DTOP_DISABLE:
1339         /* no-op ? */
1340         break;
1341     default:
1342         assert(!"invalid D3DTOP");
1343         break;
1344     }
1345     ureg_release_temporary(ureg, tmp);
1346     ureg_release_temporary(ureg, tmp2);
1347 }
1348 
1349 static void *
nine_ff_build_ps(struct NineDevice9 * device,struct nine_ff_ps_key * key)1350 nine_ff_build_ps(struct NineDevice9 *device, struct nine_ff_ps_key *key)
1351 {
1352     struct ps_build_ctx ps;
1353     struct ureg_program *ureg = ureg_create(PIPE_SHADER_FRAGMENT);
1354     struct ureg_dst oCol;
1355     unsigned s;
1356     const unsigned texcoord_sn = get_texcoord_sn(device->screen);
1357 
1358     memset(&ps, 0, sizeof(ps));
1359     ps.ureg = ureg;
1360     ps.color_interpolate_flag = key->flatshade ? TGSI_INTERPOLATE_CONSTANT : TGSI_INTERPOLATE_PERSPECTIVE;
1361     ps.stage.index_pre_mod = -1;
1362 
1363     ps.vC[0] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 0, ps.color_interpolate_flag);
1364 
1365     ps.rCur = ureg_DECL_temporary(ureg);
1366     ps.rTmp = ureg_DECL_temporary(ureg);
1367     ps.rTex = ureg_DECL_temporary(ureg);
1368     ps.rCurSrc = ureg_src(ps.rCur);
1369     ps.rTmpSrc = ureg_src(ps.rTmp);
1370     ps.rTexSrc = ureg_src(ps.rTex);
1371 
1372     /* Initial values */
1373     ureg_MOV(ureg, ps.rCur, ps.vC[0]);
1374     ureg_MOV(ureg, ps.rTmp, ureg_imm1f(ureg, 0.0f));
1375     ureg_MOV(ureg, ps.rTex, ureg_imm1f(ureg, 0.0f));
1376 
1377     for (s = 0; s < 8; ++s) {
1378         ps.s[s] = ureg_src_undef();
1379 
1380         if (key->ts[s].colorop != D3DTOP_DISABLE) {
1381             if (key->ts[s].colorarg0 == D3DTA_SPECULAR ||
1382                 key->ts[s].colorarg1 == D3DTA_SPECULAR ||
1383                 key->ts[s].colorarg2 == D3DTA_SPECULAR)
1384                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, ps.color_interpolate_flag);
1385 
1386             if (key->ts[s].colorarg0 == D3DTA_TEXTURE ||
1387                 key->ts[s].colorarg1 == D3DTA_TEXTURE ||
1388                 key->ts[s].colorarg2 == D3DTA_TEXTURE ||
1389                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
1390                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
1391                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1392                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1393             }
1394             if (s && (key->ts[s - 1].colorop == D3DTOP_PREMODULATE ||
1395                       key->ts[s - 1].alphaop == D3DTOP_PREMODULATE))
1396                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1397         }
1398 
1399         if (key->ts[s].alphaop != D3DTOP_DISABLE) {
1400             if (key->ts[s].alphaarg0 == D3DTA_SPECULAR ||
1401                 key->ts[s].alphaarg1 == D3DTA_SPECULAR ||
1402                 key->ts[s].alphaarg2 == D3DTA_SPECULAR)
1403                 ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, ps.color_interpolate_flag);
1404 
1405             if (key->ts[s].alphaarg0 == D3DTA_TEXTURE ||
1406                 key->ts[s].alphaarg1 == D3DTA_TEXTURE ||
1407                 key->ts[s].alphaarg2 == D3DTA_TEXTURE ||
1408                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHA ||
1409                 key->ts[s].colorop == D3DTOP_BLENDTEXTUREALPHAPM) {
1410                 ps.s[s] = ureg_DECL_sampler(ureg, s);
1411                 ps.vT[s] = ureg_DECL_fs_input(ureg, texcoord_sn, s, TGSI_INTERPOLATE_PERSPECTIVE);
1412             }
1413         }
1414     }
1415     if (key->specular)
1416         ps.vC[1] = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_COLOR, 1, ps.color_interpolate_flag);
1417 
1418     oCol = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
1419 
1420     /* Run stages.
1421      */
1422     for (s = 0; s < 8; ++s) {
1423         unsigned colorarg[3];
1424         unsigned alphaarg[3];
1425         const uint8_t used_c = ps_d3dtop_args_mask(key->ts[s].colorop);
1426         const uint8_t used_a = ps_d3dtop_args_mask(key->ts[s].alphaop);
1427         struct ureg_dst dst;
1428         struct ureg_src arg[3];
1429 
1430         if (key->ts[s].colorop == D3DTOP_DISABLE) {
1431             assert (key->ts[s].alphaop == D3DTOP_DISABLE);
1432             continue;
1433         }
1434         ps.stage.index = s;
1435 
1436         DBG("STAGE[%u]: colorop=%s alphaop=%s\n", s,
1437             nine_D3DTOP_to_str(key->ts[s].colorop),
1438             nine_D3DTOP_to_str(key->ts[s].alphaop));
1439 
1440         if (!ureg_src_is_undef(ps.s[s])) {
1441             unsigned target;
1442             struct ureg_src texture_coord = ps.vT[s];
1443             struct ureg_dst delta;
1444             switch (key->ts[s].textarget) {
1445             case 0: target = TGSI_TEXTURE_1D; break;
1446             case 1: target = TGSI_TEXTURE_2D; break;
1447             case 2: target = TGSI_TEXTURE_3D; break;
1448             case 3: target = TGSI_TEXTURE_CUBE; break;
1449             /* this is a 2 bit bitfield, do I really need a default case ? */
1450             }
1451 
1452             /* Modify coordinates */
1453             if (s >= 1 &&
1454                 (key->ts[s-1].colorop == D3DTOP_BUMPENVMAP ||
1455                  key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)) {
1456                 delta = ureg_DECL_temporary(ureg);
1457                 /* Du' = D3DTSS_BUMPENVMAT00(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT10(stage s-1)*t(s-1)G */
1458                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _X(ps.rTex), _XXXX(_CONST(8 + s - 1)));
1459                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_X), _Y(ps.rTex), _ZZZZ(_CONST(8 + s - 1)), ureg_src(delta));
1460                 /* Dv' = D3DTSS_BUMPENVMAT01(stage s-1)*t(s-1)R + D3DTSS_BUMPENVMAT11(stage s-1)*t(s-1)G */
1461                 ureg_MUL(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _X(ps.rTex), _YYYY(_CONST(8 + s - 1)));
1462                 ureg_MAD(ureg, ureg_writemask(delta, TGSI_WRITEMASK_Y), _Y(ps.rTex), _WWWW(_CONST(8 + s - 1)), ureg_src(delta));
1463                 texture_coord = ureg_src(ureg_DECL_temporary(ureg));
1464                 ureg_MOV(ureg, ureg_writemask(ureg_dst(texture_coord), ureg_dst(ps.vT[s]).WriteMask), ps.vT[s]);
1465                 ureg_ADD(ureg, ureg_writemask(ureg_dst(texture_coord), TGSI_WRITEMASK_XY), texture_coord, ureg_src(delta));
1466                 /* Prepare luminance multiplier
1467                  * t(s)RGBA = t(s)RGBA * clamp[(t(s-1)B * D3DTSS_BUMPENVLSCALE(stage s-1)) + D3DTSS_BUMPENVLOFFSET(stage s-1)] */
1468                 if (key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE) {
1469                     struct ureg_src bumpenvlscale = ((s-1) & 1) ? _ZZZZ(_CONST(16 + (s-1) / 2)) : _XXXX(_CONST(16 + (s-1) / 2));
1470                     struct ureg_src bumpenvloffset = ((s-1) & 1) ? _WWWW(_CONST(16 + (s-1) / 2)) : _YYYY(_CONST(16 + (s-1) / 2));
1471 
1472                     ureg_MAD(ureg, ureg_saturate(ureg_writemask(delta, TGSI_WRITEMASK_X)), _Z(ps.rTex), bumpenvlscale, bumpenvloffset);
1473                 }
1474             }
1475             if (key->projected & (3 << (s *2))) {
1476                 unsigned dim = 1 + ((key->projected >> (2 * s)) & 3);
1477                 if (dim == 4)
1478                     ureg_TXP(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1479                 else {
1480                     struct ureg_dst tmp = ureg_DECL_temporary(ureg);
1481                     ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(texture_coord, dim-1));
1482                     ureg_MUL(ureg, ps.rTmp, _X(tmp), texture_coord);
1483                     ureg_TEX(ureg, ps.rTex, target, ps.rTmpSrc, ps.s[s]);
1484                     ureg_release_temporary(ureg, tmp);
1485                 }
1486             } else {
1487                 ureg_TEX(ureg, ps.rTex, target, texture_coord, ps.s[s]);
1488             }
1489             if (s >= 1 && key->ts[s-1].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1490                 ureg_MUL(ureg, ps.rTex, ureg_src(ps.rTex), _X(delta));
1491         }
1492 
1493         if (key->ts[s].colorop == D3DTOP_BUMPENVMAP ||
1494             key->ts[s].colorop == D3DTOP_BUMPENVMAPLUMINANCE)
1495             continue;
1496 
1497         dst = ps_get_ts_dst(&ps, key->ts[s].resultarg ? D3DTA_TEMP : D3DTA_CURRENT);
1498 
1499         if (ps.stage.index_pre_mod == ps.stage.index) {
1500             ps.rMod = ureg_DECL_temporary(ureg);
1501             ureg_MUL(ureg, ps.rMod, ps.rCurSrc, ps.rTexSrc);
1502         }
1503 
1504         colorarg[0] = (key->ts[s].colorarg0 | (((key->colorarg_b4[0] >> s) & 0x1) << 4) | ((key->colorarg_b5[0] >> s) << 5)) & 0x3f;
1505         colorarg[1] = (key->ts[s].colorarg1 | (((key->colorarg_b4[1] >> s) & 0x1) << 4) | ((key->colorarg_b5[1] >> s) << 5)) & 0x3f;
1506         colorarg[2] = (key->ts[s].colorarg2 | (((key->colorarg_b4[2] >> s) & 0x1) << 4) | ((key->colorarg_b5[2] >> s) << 5)) & 0x3f;
1507         alphaarg[0] = (key->ts[s].alphaarg0 | ((key->alphaarg_b4[0] >> s) << 4)) & 0x1f;
1508         alphaarg[1] = (key->ts[s].alphaarg1 | ((key->alphaarg_b4[1] >> s) << 4)) & 0x1f;
1509         alphaarg[2] = (key->ts[s].alphaarg2 | ((key->alphaarg_b4[2] >> s) << 4)) & 0x1f;
1510 
1511         if (key->ts[s].colorop != key->ts[s].alphaop ||
1512             colorarg[0] != alphaarg[0] ||
1513             colorarg[1] != alphaarg[1] ||
1514             colorarg[2] != alphaarg[2])
1515             dst.WriteMask = TGSI_WRITEMASK_XYZ;
1516 
1517         /* Special DOTPRODUCT behaviour (see wine tests) */
1518         if (key->ts[s].colorop == D3DTOP_DOTPRODUCT3)
1519             dst.WriteMask = TGSI_WRITEMASK_XYZW;
1520 
1521         if (used_c & 0x1) arg[0] = ps_get_ts_arg(&ps, colorarg[0]);
1522         if (used_c & 0x2) arg[1] = ps_get_ts_arg(&ps, colorarg[1]);
1523         if (used_c & 0x4) arg[2] = ps_get_ts_arg(&ps, colorarg[2]);
1524         ps_do_ts_op(&ps, key->ts[s].colorop, dst, arg);
1525 
1526         if (dst.WriteMask != TGSI_WRITEMASK_XYZW) {
1527             dst.WriteMask = TGSI_WRITEMASK_W;
1528 
1529             if (used_a & 0x1) arg[0] = ps_get_ts_arg(&ps, alphaarg[0]);
1530             if (used_a & 0x2) arg[1] = ps_get_ts_arg(&ps, alphaarg[1]);
1531             if (used_a & 0x4) arg[2] = ps_get_ts_arg(&ps, alphaarg[2]);
1532             ps_do_ts_op(&ps, key->ts[s].alphaop, dst, arg);
1533         }
1534     }
1535 
1536     if (key->specular)
1537         ureg_ADD(ureg, ureg_writemask(ps.rCur, TGSI_WRITEMASK_XYZ), ps.rCurSrc, ps.vC[1]);
1538 
1539     if (key->alpha_test_emulation == PIPE_FUNC_NEVER) {
1540         ureg_KILL(ureg);
1541     } else if (key->alpha_test_emulation != PIPE_FUNC_ALWAYS) {
1542         unsigned cmp_op;
1543         struct ureg_src src[2];
1544         struct ureg_dst tmp = ps.rTmp;
1545         cmp_op = pipe_comp_to_tgsi_opposite(key->alpha_test_emulation);
1546         src[0] = ureg_scalar(ps.rCurSrc, TGSI_SWIZZLE_W); /* Read color alpha channel */
1547         src[1] = _WWWW(_CONST(22)); /* Read alpha ref */
1548         ureg_insn(ureg, cmp_op, &tmp, 1, src, 2, 0);
1549         ureg_KILL_IF(ureg, ureg_negate(ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X))); /* if opposite test passes, discard */
1550     }
1551 
1552     /* Fog.
1553      */
1554     if (key->fog_mode) {
1555         struct ureg_dst rFog = ureg_writemask(ps.rTmp, TGSI_WRITEMASK_X);
1556         struct ureg_src vPos;
1557         if (device->screen->get_param(device->screen,
1558                                       PIPE_CAP_FS_POSITION_IS_SYSVAL)) {
1559             vPos = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
1560         } else {
1561             vPos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 0,
1562                                       TGSI_INTERPOLATE_LINEAR);
1563         }
1564 
1565         /* Source is either W or Z.
1566          * Z is when an orthogonal projection matrix is detected,
1567          * W (WFOG) else.
1568          */
1569         if (!key->fog_source)
1570             ureg_MOV(ureg, rFog, _ZZZZ(vPos));
1571         else
1572             /* Position's w is 1/w */
1573             ureg_RCP(ureg, rFog, _WWWW(vPos));
1574 
1575         if (key->fog_mode == D3DFOG_EXP) {
1576             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1577             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1578             ureg_EX2(ureg, rFog, _X(rFog));
1579         } else
1580         if (key->fog_mode == D3DFOG_EXP2) {
1581             ureg_MUL(ureg, rFog, _X(rFog), _ZZZZ(_CONST(22)));
1582             ureg_MUL(ureg, rFog, _X(rFog), _X(rFog));
1583             ureg_MUL(ureg, rFog, _X(rFog), ureg_imm1f(ureg, -1.442695f));
1584             ureg_EX2(ureg, rFog, _X(rFog));
1585         } else
1586         if (key->fog_mode == D3DFOG_LINEAR) {
1587             ureg_ADD(ureg, rFog, _XXXX(_CONST(22)), ureg_negate(_X(rFog)));
1588             ureg_MUL(ureg, ureg_saturate(rFog), _X(rFog), _YYYY(_CONST(22)));
1589         }
1590         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _X(rFog), ps.rCurSrc, _CONST(21));
1591         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1592     } else
1593     if (key->fog) {
1594         struct ureg_src vFog = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 16, TGSI_INTERPOLATE_PERSPECTIVE);
1595         ureg_LRP(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_XYZ), _XXXX(vFog), ps.rCurSrc, _CONST(21));
1596         ureg_MOV(ureg, ureg_writemask(oCol, TGSI_WRITEMASK_W), ps.rCurSrc);
1597     } else {
1598         ureg_MOV(ureg, oCol, ps.rCurSrc);
1599     }
1600 
1601     ureg_END(ureg);
1602     nine_ureg_tgsi_dump(ureg, false);
1603     return nine_create_shader_with_so_and_destroy(ureg, device->context.pipe, NULL);
1604 }
1605 
1606 static struct NineVertexShader9 *
nine_ff_get_vs(struct NineDevice9 * device)1607 nine_ff_get_vs(struct NineDevice9 *device)
1608 {
1609     const struct nine_context *context = &device->context;
1610     struct NineVertexShader9 *vs;
1611     struct vs_build_ctx bld;
1612     struct nine_ff_vs_key key;
1613     unsigned s, i;
1614     bool has_indexes = false;
1615     bool has_weights = false;
1616     int8_t input_texture_coord[8];
1617 
1618     assert(sizeof(key) <= sizeof(key.value32));
1619 
1620     memset(&key, 0, sizeof(key));
1621     memset(&bld, 0, sizeof(bld));
1622     memset(&input_texture_coord, 0, sizeof(input_texture_coord));
1623 
1624     bld.key = &key;
1625 
1626     /* FIXME: this shouldn't be NULL, but it is on init */
1627     if (context->vdecl) {
1628         key.color0in_one = 1;
1629         key.color1in_zero = 1;
1630         for (i = 0; i < context->vdecl->nelems; i++) {
1631             uint16_t usage = context->vdecl->usage_map[i];
1632             if (usage == NINE_DECLUSAGE_POSITIONT)
1633                 key.position_t = 1;
1634             else if (usage == NINE_DECLUSAGE_i(COLOR, 0))
1635                 key.color0in_one = 0;
1636             else if (usage == NINE_DECLUSAGE_i(COLOR, 1))
1637                 key.color1in_zero = 0;
1638             else if (usage == NINE_DECLUSAGE_i(BLENDINDICES, 0)) {
1639                 has_indexes = true;
1640                 key.passthrough |= 1 << usage;
1641             } else if (usage == NINE_DECLUSAGE_i(BLENDWEIGHT, 0)) {
1642                 has_weights = true;
1643                 key.passthrough |= 1 << usage;
1644             } else if (usage == NINE_DECLUSAGE_i(NORMAL, 0)) {
1645                 key.has_normal = 1;
1646                 key.passthrough |= 1 << usage;
1647             } else if (usage == NINE_DECLUSAGE_PSIZE)
1648                 key.vertexpointsize = 1;
1649             else if (usage % NINE_DECLUSAGE_COUNT == NINE_DECLUSAGE_TEXCOORD) {
1650                 s = usage / NINE_DECLUSAGE_COUNT;
1651                 if (s < 8)
1652                     input_texture_coord[s] = nine_decltype_get_dim(context->vdecl->decls[i].Type);
1653                 else
1654                     DBG("FF given texture coordinate >= 8. Ignoring\n");
1655             } else if (usage < NINE_DECLUSAGE_NONE)
1656                 key.passthrough |= 1 << usage;
1657         }
1658     }
1659     /* ff vs + ps 3.0: some elements are passed to the ps (wine test).
1660      * We do restrict to indices 0 */
1661     key.passthrough &= ~((1 << NINE_DECLUSAGE_POSITION) | (1 << NINE_DECLUSAGE_PSIZE) |
1662                          (1 << NINE_DECLUSAGE_TEXCOORD) | (1 << NINE_DECLUSAGE_POSITIONT) |
1663                          (1 << NINE_DECLUSAGE_TESSFACTOR) | (1 << NINE_DECLUSAGE_SAMPLE));
1664     if (!key.position_t)
1665         key.passthrough = 0;
1666     key.pointscale = !!context->rs[D3DRS_POINTSCALEENABLE];
1667 
1668     key.lighting = !!context->rs[D3DRS_LIGHTING] &&  context->ff.num_lights_active;
1669     key.darkness = !!context->rs[D3DRS_LIGHTING] && !context->ff.num_lights_active;
1670     if (key.position_t) {
1671         key.darkness = 0; /* |= key.lighting; */ /* XXX ? */
1672         key.lighting = 0;
1673     }
1674     if ((key.lighting | key.darkness) && context->rs[D3DRS_COLORVERTEX]) {
1675         uint32_t mask = (key.color0in_one ? 0 : 1) | (key.color1in_zero ? 0 : 2);
1676         key.mtl_diffuse = context->rs[D3DRS_DIFFUSEMATERIALSOURCE] & mask;
1677         key.mtl_ambient = context->rs[D3DRS_AMBIENTMATERIALSOURCE] & mask;
1678         key.mtl_specular = context->rs[D3DRS_SPECULARMATERIALSOURCE] & mask;
1679         key.mtl_emissive = context->rs[D3DRS_EMISSIVEMATERIALSOURCE] & mask;
1680     }
1681     key.fog = !!context->rs[D3DRS_FOGENABLE];
1682     key.fog_mode = (!key.position_t && context->rs[D3DRS_FOGENABLE]) ? context->rs[D3DRS_FOGVERTEXMODE] : 0;
1683     if (key.fog_mode)
1684         key.fog_range = context->rs[D3DRS_RANGEFOGENABLE];
1685 
1686     key.localviewer = !!context->rs[D3DRS_LOCALVIEWER];
1687     key.normalizenormals = !!context->rs[D3DRS_NORMALIZENORMALS];
1688     key.ucp = !!context->rs[D3DRS_CLIPPLANEENABLE];
1689     key.clipplane_emulate = device->driver_caps.emulate_ucp ? (context->rs[D3DRS_CLIPPLANEENABLE] & 0xff) : 0;
1690 
1691     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1692         key.vertexblend_indexed = !!context->rs[D3DRS_INDEXEDVERTEXBLENDENABLE] && has_indexes;
1693 
1694         switch (context->rs[D3DRS_VERTEXBLEND]) {
1695         case D3DVBF_0WEIGHTS: key.vertexblend = key.vertexblend_indexed; break;
1696         case D3DVBF_1WEIGHTS: key.vertexblend = 2; break;
1697         case D3DVBF_2WEIGHTS: key.vertexblend = 3; break;
1698         case D3DVBF_3WEIGHTS: key.vertexblend = 4; break;
1699         case D3DVBF_TWEENING: key.vertextween = 1; break;
1700         default:
1701             assert(!"invalid D3DVBF");
1702             break;
1703         }
1704         if (!has_weights && context->rs[D3DRS_VERTEXBLEND] != D3DVBF_0WEIGHTS)
1705             key.vertexblend = 0; /* TODO: if key.vertexblend_indexed, perhaps it should use 1.0 as weight, or revert to D3DVBF_0WEIGHTS */
1706     }
1707 
1708     for (s = 0; s < 8; ++s) {
1709         unsigned gen = (context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] >> 16) + 1;
1710         unsigned idx = context->ff.tex_stage[s][D3DTSS_TEXCOORDINDEX] & 7;
1711         unsigned dim;
1712 
1713         if (key.position_t && gen > NINED3DTSS_TCI_PASSTHRU)
1714             gen = NINED3DTSS_TCI_PASSTHRU;
1715 
1716         if (!input_texture_coord[idx] && gen == NINED3DTSS_TCI_PASSTHRU)
1717             gen = NINED3DTSS_TCI_DISABLE;
1718 
1719         key.tc_gen |= gen << (s * 3);
1720         key.tc_idx |= idx << (s * 3);
1721         key.tc_dim_input |= ((input_texture_coord[idx]-1) & 0x3) << (s * 2);
1722 
1723         dim = context->ff.tex_stage[s][D3DTSS_TEXTURETRANSFORMFLAGS] & 0x7;
1724         if (dim > 4)
1725             dim = input_texture_coord[idx];
1726         if (dim == 1) /* NV behaviour */
1727             dim = 0;
1728         key.tc_dim_output |= dim << (s * 3);
1729     }
1730 
1731     DBG("VS ff key hash: %x\n", nine_ff_vs_key_hash(&key));
1732     vs = util_hash_table_get(device->ff.ht_vs, &key);
1733     if (vs)
1734         return vs;
1735     NineVertexShader9_new(device, &vs, NULL, nine_ff_build_vs(device, &bld));
1736 
1737     nine_ff_prune_vs(device);
1738     if (vs) {
1739         unsigned n;
1740 
1741         memcpy(&vs->ff_key, &key, sizeof(vs->ff_key));
1742 
1743         _mesa_hash_table_insert(device->ff.ht_vs, &vs->ff_key, vs);
1744         device->ff.num_vs++;
1745 
1746         vs->num_inputs = bld.num_inputs;
1747         for (n = 0; n < bld.num_inputs; ++n)
1748             vs->input_map[n].ndecl = bld.input[n];
1749 
1750         vs->position_t = key.position_t;
1751         vs->point_size = key.vertexpointsize | key.pointscale | device->driver_caps.always_output_pointsize;
1752     }
1753     return vs;
1754 }
1755 
/* Fetch the D3DTS_<n> transform from the fixed-function state of the local
 * `context`. FALSE is passed as the last argument (presumably "do not
 * allocate" -- confirm against nine_state_access_transform). */
#define GET_D3DTS(n) nine_state_access_transform(&context->ff, D3DTS_##n, FALSE)
/* Non-zero when the D3DTS_<n> transform is flagged in the dirty bitmask of
 * state `s`. The mask is built from an unsigned 1 so that selecting bit 31
 * never shifts into the sign bit of an int. */
#define IS_D3DTS_DIRTY(s,n) ((s)->ff.changed.transform[(D3DTS_##n) / 32] & (1u << ((D3DTS_##n) % 32)))
1758 
1759 static struct NinePixelShader9 *
nine_ff_get_ps(struct NineDevice9 * device)1760 nine_ff_get_ps(struct NineDevice9 *device)
1761 {
1762     struct nine_context *context = &device->context;
1763     struct NinePixelShader9 *ps;
1764     struct nine_ff_ps_key key;
1765     unsigned s;
1766     uint8_t sampler_mask = 0;
1767 
1768     assert(sizeof(key) <= sizeof(key.value32));
1769 
1770     memset(&key, 0, sizeof(key));
1771     for (s = 0; s < 8; ++s) {
1772         key.ts[s].colorop = context->ff.tex_stage[s][D3DTSS_COLOROP];
1773         key.ts[s].alphaop = context->ff.tex_stage[s][D3DTSS_ALPHAOP];
1774         const uint8_t used_c = ps_d3dtop_args_mask(key.ts[s].colorop);
1775         const uint8_t used_a = ps_d3dtop_args_mask(key.ts[s].alphaop);
1776         /* MSDN says D3DTOP_DISABLE disables this and all subsequent stages.
1777          * ALPHAOP cannot be enabled if COLOROP is disabled.
1778          * Verified on Windows. */
1779         if (key.ts[s].colorop == D3DTOP_DISABLE) {
1780             key.ts[s].alphaop = D3DTOP_DISABLE; /* DISABLE == 1, avoid degenerate keys */
1781             break;
1782         }
1783 
1784         if (!context->texture[s].enabled &&
1785             ((context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE &&
1786               used_c & 0x1) ||
1787              (context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE &&
1788               used_c & 0x2) ||
1789              (context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE &&
1790               used_c & 0x4))) {
1791             /* Tested on Windows: Invalid texture read disables the stage
1792              * and the subsequent ones, but only for colorop. For alpha,
1793              * it's as if the texture had alpha of 1.0, which is what
1794              * has our dummy texture in that case. Invalid color also
1795              * disabled the following alpha stages. */
1796             key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1797             break;
1798         }
1799 
1800         if (context->ff.tex_stage[s][D3DTSS_COLORARG0] == D3DTA_TEXTURE ||
1801             context->ff.tex_stage[s][D3DTSS_COLORARG1] == D3DTA_TEXTURE ||
1802             context->ff.tex_stage[s][D3DTSS_COLORARG2] == D3DTA_TEXTURE ||
1803             context->ff.tex_stage[s][D3DTSS_ALPHAARG0] == D3DTA_TEXTURE ||
1804             context->ff.tex_stage[s][D3DTSS_ALPHAARG1] == D3DTA_TEXTURE ||
1805             context->ff.tex_stage[s][D3DTSS_ALPHAARG2] == D3DTA_TEXTURE)
1806             sampler_mask |= (1 << s);
1807 
1808         if (key.ts[s].colorop != D3DTOP_DISABLE) {
1809             if (used_c & 0x1) key.ts[s].colorarg0 = context->ff.tex_stage[s][D3DTSS_COLORARG0] & 0x7;
1810             if (used_c & 0x2) key.ts[s].colorarg1 = context->ff.tex_stage[s][D3DTSS_COLORARG1] & 0x7;
1811             if (used_c & 0x4) key.ts[s].colorarg2 = context->ff.tex_stage[s][D3DTSS_COLORARG2] & 0x7;
1812             if (used_c & 0x1) key.colorarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 4) & 0x1) << s;
1813             if (used_c & 0x1) key.colorarg_b5[0] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG0] >> 5) & 0x1) << s;
1814             if (used_c & 0x2) key.colorarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 4) & 0x1) << s;
1815             if (used_c & 0x2) key.colorarg_b5[1] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG1] >> 5) & 0x1) << s;
1816             if (used_c & 0x4) key.colorarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 4) & 0x1) << s;
1817             if (used_c & 0x4) key.colorarg_b5[2] |= ((context->ff.tex_stage[s][D3DTSS_COLORARG2] >> 5) & 0x1) << s;
1818         }
1819         if (key.ts[s].alphaop != D3DTOP_DISABLE) {
1820             if (used_a & 0x1) key.ts[s].alphaarg0 = context->ff.tex_stage[s][D3DTSS_ALPHAARG0] & 0x7;
1821             if (used_a & 0x2) key.ts[s].alphaarg1 = context->ff.tex_stage[s][D3DTSS_ALPHAARG1] & 0x7;
1822             if (used_a & 0x4) key.ts[s].alphaarg2 = context->ff.tex_stage[s][D3DTSS_ALPHAARG2] & 0x7;
1823             if (used_a & 0x1) key.alphaarg_b4[0] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG0] >> 4) & 0x1) << s;
1824             if (used_a & 0x2) key.alphaarg_b4[1] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG1] >> 4) & 0x1) << s;
1825             if (used_a & 0x4) key.alphaarg_b4[2] |= ((context->ff.tex_stage[s][D3DTSS_ALPHAARG2] >> 4) & 0x1) << s;
1826         }
1827         key.ts[s].resultarg = context->ff.tex_stage[s][D3DTSS_RESULTARG] == D3DTA_TEMP;
1828 
1829         if (context->texture[s].enabled) {
1830             switch (context->texture[s].type) {
1831             case D3DRTYPE_TEXTURE:       key.ts[s].textarget = 1; break;
1832             case D3DRTYPE_VOLUMETEXTURE: key.ts[s].textarget = 2; break;
1833             case D3DRTYPE_CUBETEXTURE:   key.ts[s].textarget = 3; break;
1834             default:
1835                 assert(!"unexpected texture type");
1836                 break;
1837             }
1838         } else {
1839             key.ts[s].textarget = 1;
1840         }
1841     }
1842 
1843     /* Note: If colorop is D3DTOP_DISABLE for the first stage
1844      * (which implies alphaop is too), nothing particular happens,
1845      * that is, current is equal to diffuse (which is the case anyway,
1846      * because it is how it is initialized).
1847      * Special case seems if alphaop is D3DTOP_DISABLE and not colorop,
1848      * because then if the resultarg is TEMP, then diffuse alpha is written
1849      * to it. */
1850     if (key.ts[0].colorop != D3DTOP_DISABLE &&
1851         key.ts[0].alphaop == D3DTOP_DISABLE &&
1852         key.ts[0].resultarg != 0) {
1853         key.ts[0].alphaop = D3DTOP_SELECTARG1;
1854         key.ts[0].alphaarg1 = D3DTA_DIFFUSE;
1855     }
1856     /* When no alpha stage writes to current, diffuse alpha is taken.
1857      * Since we initialize current to diffuse, we have the behaviour. */
1858 
1859     /* Last stage always writes to Current */
1860     if (s >= 1)
1861         key.ts[s-1].resultarg = 0;
1862 
1863     key.projected = nine_ff_get_projected_key_ff(context);
1864     key.specular = !!context->rs[D3DRS_SPECULARENABLE];
1865     key.flatshade = context->rs[D3DRS_SHADEMODE] == D3DSHADE_FLAT;
1866 
1867     for (; s < 8; ++s)
1868         key.ts[s].colorop = key.ts[s].alphaop = D3DTOP_DISABLE;
1869     if (context->rs[D3DRS_FOGENABLE])
1870         key.fog_mode = context->rs[D3DRS_FOGTABLEMODE];
1871     key.fog = !!context->rs[D3DRS_FOGENABLE];
1872     if (key.fog_mode && key.fog)
1873         key.fog_source = !context->zfog;
1874     key.alpha_test_emulation = context->rs[NINED3DRS_EMULATED_ALPHATEST] & 0x7;
1875 
1876     DBG("PS ff key hash: %x\n", nine_ff_ps_key_hash(&key));
1877     ps = util_hash_table_get(device->ff.ht_ps, &key);
1878     if (ps)
1879         return ps;
1880     NinePixelShader9_new(device, &ps, NULL, nine_ff_build_ps(device, &key));
1881 
1882     nine_ff_prune_ps(device);
1883     if (ps) {
1884         memcpy(&ps->ff_key, &key, sizeof(ps->ff_key));
1885 
1886         _mesa_hash_table_insert(device->ff.ht_ps, &ps->ff_key, ps);
1887         device->ff.num_ps++;
1888 
1889         ps->rt_mask = 0x1;
1890         ps->sampler_mask = sampler_mask;
1891     }
1892     return ps;
1893 }
1894 
1895 static void
nine_ff_load_vs_transforms(struct NineDevice9 * device)1896 nine_ff_load_vs_transforms(struct NineDevice9 *device)
1897 {
1898     struct nine_context *context = &device->context;
1899     D3DMATRIX T;
1900     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
1901     unsigned i;
1902 
1903     /* TODO: make this nicer, and only upload the ones we need */
1904     /* TODO: use ff.vs_const as storage of W, V, P matrices */
1905 
1906     if (IS_D3DTS_DIRTY(context, WORLD) ||
1907         IS_D3DTS_DIRTY(context, VIEW) ||
1908         IS_D3DTS_DIRTY(context, PROJECTION)) {
1909         /* WVP, WV matrices */
1910         nine_d3d_matrix_matrix_mul(&M[1], GET_D3DTS(WORLD), GET_D3DTS(VIEW));
1911         nine_d3d_matrix_matrix_mul(&M[0], &M[1], GET_D3DTS(PROJECTION));
1912 
1913         /* normal matrix == transpose(inverse(WV)) */
1914         nine_d3d_matrix_inverse(&T, &M[1]);
1915         nine_d3d_matrix_transpose(&M[4], &T);
1916 
1917         /* P matrix */
1918         M[2] = *GET_D3DTS(PROJECTION);
1919 
1920         /* V and W matrix */
1921         nine_d3d_matrix_inverse(&M[3], GET_D3DTS(VIEW));
1922         M[40] = M[1];
1923     }
1924 
1925     if (context->rs[D3DRS_VERTEXBLEND] != D3DVBF_DISABLE) {
1926         /* load other world matrices */
1927         for (i = 1; i <= 8; ++i) {
1928             nine_d3d_matrix_matrix_mul(&M[40 + i], GET_D3DTS(WORLDMATRIX(i)), GET_D3DTS(VIEW));
1929         }
1930     }
1931 
1932     device->ff.vs_const[30 * 4] = asfloat(context->rs[D3DRS_TWEENFACTOR]);
1933 }
1934 
1935 static void
nine_ff_load_lights(struct NineDevice9 * device)1936 nine_ff_load_lights(struct NineDevice9 *device)
1937 {
1938     struct nine_context *context = &device->context;
1939     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1940     unsigned l;
1941 
1942     if (context->changed.group & NINE_STATE_FF_MATERIAL) {
1943         const D3DMATERIAL9 *mtl = &context->ff.material;
1944 
1945         memcpy(&dst[20], &mtl->Diffuse, 4 * sizeof(float));
1946         memcpy(&dst[21], &mtl->Ambient, 4 * sizeof(float));
1947         memcpy(&dst[22], &mtl->Specular, 4 * sizeof(float));
1948         dst[23].x = mtl->Power;
1949         memcpy(&dst[24], &mtl->Emissive, 4 * sizeof(float));
1950         d3dcolor_to_rgba(&dst[25].x, context->rs[D3DRS_AMBIENT]);
1951         dst[19].x = dst[25].x * mtl->Ambient.r + mtl->Emissive.r;
1952         dst[19].y = dst[25].y * mtl->Ambient.g + mtl->Emissive.g;
1953         dst[19].z = dst[25].z * mtl->Ambient.b + mtl->Emissive.b;
1954     }
1955 
1956     if (!(context->changed.group & NINE_STATE_FF_LIGHTING))
1957         return;
1958 
1959     for (l = 0; l < context->ff.num_lights_active; ++l) {
1960         const D3DLIGHT9 *light = &context->ff.light[context->ff.active_light[l]];
1961 
1962         dst[32 + l * 8].x = light->Type;
1963         dst[32 + l * 8].y = light->Attenuation0;
1964         dst[32 + l * 8].z = light->Attenuation1;
1965         dst[32 + l * 8].w = light->Attenuation2;
1966         memcpy(&dst[33 + l * 8].x, &light->Diffuse, sizeof(light->Diffuse));
1967         memcpy(&dst[34 + l * 8].x, &light->Specular, sizeof(light->Specular));
1968         memcpy(&dst[35 + l * 8].x, &light->Ambient, sizeof(light->Ambient));
1969         nine_d3d_vector4_matrix_mul((D3DVECTOR *)&dst[36 + l * 8].x, &light->Position, GET_D3DTS(VIEW));
1970         nine_d3d_vector3_matrix_mul((D3DVECTOR *)&dst[37 + l * 8].x, &light->Direction, GET_D3DTS(VIEW));
1971         dst[36 + l * 8].w = light->Type == D3DLIGHT_DIRECTIONAL ? 1e9f : light->Range;
1972         dst[37 + l * 8].w = light->Falloff;
1973         dst[38 + l * 8].x = cosf(light->Theta * 0.5f);
1974         dst[38 + l * 8].y = cosf(light->Phi * 0.5f);
1975         dst[38 + l * 8].z = 1.0f / (dst[38 + l * 8].x - dst[38 + l * 8].y);
1976         dst[39 + l * 8].w = (float)((l + 1) == context->ff.num_lights_active);
1977     }
1978 }
1979 
1980 static void
nine_ff_load_point_and_fog_params(struct NineDevice9 * device)1981 nine_ff_load_point_and_fog_params(struct NineDevice9 *device)
1982 {
1983     struct nine_context *context = &device->context;
1984     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
1985 
1986     if (!(context->changed.group & NINE_STATE_FF_VS_OTHER))
1987         return;
1988     dst[26].x = asfloat(context->rs[D3DRS_POINTSIZE_MIN]);
1989     dst[26].y = asfloat(context->rs[D3DRS_POINTSIZE_MAX]);
1990     dst[26].z = CLAMP(asfloat(context->rs[D3DRS_POINTSIZE]),
1991                 asfloat(context->rs[D3DRS_POINTSIZE_MIN]),
1992                 asfloat(context->rs[D3DRS_POINTSIZE_MAX]));
1993     dst[26].w = asfloat(context->rs[D3DRS_POINTSCALE_A]);
1994     dst[27].x = asfloat(context->rs[D3DRS_POINTSCALE_B]);
1995     dst[27].y = asfloat(context->rs[D3DRS_POINTSCALE_C]);
1996     dst[28].x = asfloat(context->rs[D3DRS_FOGEND]);
1997     dst[28].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
1998     if (isinf(dst[28].y))
1999         dst[28].y = 0.0f;
2000     dst[28].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
2001     if (device->driver_caps.emulate_ucp)
2002         memcpy(&dst[196], &context->clip.ucp, sizeof(context->clip));
2003 }
2004 
2005 static void
nine_ff_load_tex_matrices(struct NineDevice9 * device)2006 nine_ff_load_tex_matrices(struct NineDevice9 *device)
2007 {
2008     struct nine_context *context = &device->context;
2009     D3DMATRIX *M = (D3DMATRIX *)device->ff.vs_const;
2010     unsigned s;
2011 
2012     if (!(context->ff.changed.transform[0] & 0xff0000))
2013         return;
2014     for (s = 0; s < 8; ++s) {
2015         if (IS_D3DTS_DIRTY(context, TEXTURE0 + s))
2016             nine_d3d_matrix_transpose(&M[32 + s], nine_state_access_transform(&context->ff, D3DTS_TEXTURE0 + s, false));
2017     }
2018 }
2019 
2020 static void
nine_ff_load_ps_params(struct NineDevice9 * device)2021 nine_ff_load_ps_params(struct NineDevice9 *device)
2022 {
2023     struct nine_context *context = &device->context;
2024     struct fvec4 *dst = (struct fvec4 *)device->ff.ps_const;
2025     unsigned s;
2026 
2027     if (!(context->changed.group & NINE_STATE_FF_PS_CONSTS))
2028         return;
2029 
2030     for (s = 0; s < 8; ++s)
2031         d3dcolor_to_rgba(&dst[s].x, context->ff.tex_stage[s][D3DTSS_CONSTANT]);
2032 
2033     for (s = 0; s < 8; ++s) {
2034         dst[8 + s].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT00]);
2035         dst[8 + s].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT01]);
2036         dst[8 + s].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT10]);
2037         dst[8 + s].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVMAT11]);
2038         if (s & 1) {
2039             dst[16 + s / 2].z = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2040             dst[16 + s / 2].w = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2041         } else {
2042             dst[16 + s / 2].x = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLSCALE]);
2043             dst[16 + s / 2].y = asfloat(context->ff.tex_stage[s][D3DTSS_BUMPENVLOFFSET]);
2044         }
2045     }
2046 
2047     d3dcolor_to_rgba(&dst[20].x, context->rs[D3DRS_TEXTUREFACTOR]);
2048     d3dcolor_to_rgba(&dst[21].x, context->rs[D3DRS_FOGCOLOR]);
2049     dst[22].x = asfloat(context->rs[D3DRS_FOGEND]);
2050     dst[22].y = 1.0f / (asfloat(context->rs[D3DRS_FOGEND]) - asfloat(context->rs[D3DRS_FOGSTART]));
2051     dst[22].z = asfloat(context->rs[D3DRS_FOGDENSITY]);
2052     dst[22].w = (float)context->rs[D3DRS_ALPHAREF] / 255.f;
2053 }
2054 
2055 static void
nine_ff_load_viewport_info(struct NineDevice9 * device)2056 nine_ff_load_viewport_info(struct NineDevice9 *device)
2057 {
2058     D3DVIEWPORT9 *viewport = &device->context.viewport;
2059     struct fvec4 *dst = (struct fvec4 *)device->ff.vs_const;
2060     float diffZ = viewport->MaxZ - viewport->MinZ;
2061 
2062     /* Note: the other functions avoids to fill the const again if nothing changed.
2063      * But we don't have much to fill, and adding code to allow that may be complex
2064      * so just fill it always */
2065     dst[100].x = 2.0f / (float)(viewport->Width);
2066     dst[100].y = 2.0f / (float)(viewport->Height);
2067     dst[100].z = (diffZ == 0.0f) ? 0.0f : (1.0f / diffZ);
2068     dst[100].w = (float)(viewport->Width);
2069     dst[101].x = (float)(viewport->X);
2070     dst[101].y = (float)(viewport->Y);
2071     dst[101].z = (float)(viewport->MinZ);
2072 }
2073 
2074 void
nine_ff_update(struct NineDevice9 * device)2075 nine_ff_update(struct NineDevice9 *device)
2076 {
2077     struct nine_context *context = &device->context;
2078     struct pipe_constant_buffer cb;
2079 
2080     DBG("vs=%p ps=%p\n", context->vs, context->ps);
2081 
2082     /* NOTE: the only reference belongs to the hash table */
2083     if (!context->programmable_vs) {
2084         device->ff.vs = nine_ff_get_vs(device);
2085         context->changed.group |= NINE_STATE_VS;
2086     }
2087     if (!context->ps) {
2088         device->ff.ps = nine_ff_get_ps(device);
2089         context->changed.group |= NINE_STATE_PS;
2090     }
2091 
2092     if (!context->programmable_vs) {
2093         nine_ff_load_vs_transforms(device);
2094         nine_ff_load_tex_matrices(device);
2095         nine_ff_load_lights(device);
2096         nine_ff_load_point_and_fog_params(device);
2097         nine_ff_load_viewport_info(device);
2098 
2099         memset(context->ff.changed.transform, 0, sizeof(context->ff.changed.transform));
2100 
2101         cb.buffer_offset = 0;
2102         cb.buffer = NULL;
2103         cb.user_buffer = device->ff.vs_const;
2104         cb.buffer_size = NINE_FF_NUM_VS_CONST * 4 * sizeof(float);
2105 
2106         context->pipe_data.cb_vs_ff = cb;
2107         context->commit |= NINE_STATE_COMMIT_CONST_VS;
2108 
2109         context->changed.group &= ~NINE_STATE_FF_VS;
2110     }
2111 
2112     if (!context->ps) {
2113         nine_ff_load_ps_params(device);
2114 
2115         cb.buffer_offset = 0;
2116         cb.buffer = NULL;
2117         cb.user_buffer = device->ff.ps_const;
2118         cb.buffer_size = NINE_FF_NUM_PS_CONST * 4 * sizeof(float);
2119 
2120         context->pipe_data.cb_ps_ff = cb;
2121         context->commit |= NINE_STATE_COMMIT_CONST_PS;
2122 
2123         context->changed.group &= ~NINE_STATE_FF_PS;
2124     }
2125 }
2126 
2127 
2128 bool
nine_ff_init(struct NineDevice9 * device)2129 nine_ff_init(struct NineDevice9 *device)
2130 {
2131     device->ff.ht_vs = _mesa_hash_table_create(NULL, nine_ff_vs_key_hash,
2132                                                nine_ff_vs_key_comp);
2133     device->ff.ht_ps = _mesa_hash_table_create(NULL, nine_ff_ps_key_hash,
2134                                                nine_ff_ps_key_comp);
2135 
2136     device->ff.ht_fvf = _mesa_hash_table_create(NULL, nine_ff_fvf_key_hash,
2137                                                 nine_ff_fvf_key_comp);
2138 
2139     device->ff.vs_const = CALLOC(NINE_FF_NUM_VS_CONST, 4 * sizeof(float));
2140     device->ff.ps_const = CALLOC(NINE_FF_NUM_PS_CONST, 4 * sizeof(float));
2141 
2142     return device->ff.ht_vs && device->ff.ht_ps &&
2143         device->ff.ht_fvf &&
2144         device->ff.vs_const && device->ff.ps_const;
2145 }
2146 
nine_ff_ht_delete_cb(void * key,void * value,void * data)2147 static enum pipe_error nine_ff_ht_delete_cb(void *key, void *value, void *data)
2148 {
2149     NineUnknown_Unbind(NineUnknown(value));
2150     return PIPE_OK;
2151 }
2152 
2153 void
nine_ff_fini(struct NineDevice9 * device)2154 nine_ff_fini(struct NineDevice9 *device)
2155 {
2156     if (device->ff.ht_vs) {
2157         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2158         _mesa_hash_table_destroy(device->ff.ht_vs, NULL);
2159     }
2160     if (device->ff.ht_ps) {
2161         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2162         _mesa_hash_table_destroy(device->ff.ht_ps, NULL);
2163     }
2164     if (device->ff.ht_fvf) {
2165         util_hash_table_foreach(device->ff.ht_fvf, nine_ff_ht_delete_cb, NULL);
2166         _mesa_hash_table_destroy(device->ff.ht_fvf, NULL);
2167     }
2168     device->ff.vs = NULL; /* destroyed by unbinding from hash table */
2169     device->ff.ps = NULL;
2170 
2171     FREE(device->ff.vs_const);
2172     FREE(device->ff.ps_const);
2173 }
2174 
2175 static void
nine_ff_prune_vs(struct NineDevice9 * device)2176 nine_ff_prune_vs(struct NineDevice9 *device)
2177 {
2178     struct nine_context *context = &device->context;
2179 
2180     if (device->ff.num_vs > 1024) {
2181         /* could destroy the bound one here, so unbind */
2182         context->pipe->bind_vs_state(context->pipe, NULL);
2183         util_hash_table_foreach(device->ff.ht_vs, nine_ff_ht_delete_cb, NULL);
2184         _mesa_hash_table_clear(device->ff.ht_vs, NULL);
2185         device->ff.num_vs = 0;
2186         context->changed.group |= NINE_STATE_VS;
2187     }
2188 }
2189 static void
nine_ff_prune_ps(struct NineDevice9 * device)2190 nine_ff_prune_ps(struct NineDevice9 *device)
2191 {
2192     struct nine_context *context = &device->context;
2193 
2194     if (device->ff.num_ps > 1024) {
2195         /* could destroy the bound one here, so unbind */
2196         context->pipe->bind_fs_state(context->pipe, NULL);
2197         util_hash_table_foreach(device->ff.ht_ps, nine_ff_ht_delete_cb, NULL);
2198         _mesa_hash_table_clear(device->ff.ht_ps, NULL);
2199         device->ff.num_ps = 0;
2200         context->changed.group |= NINE_STATE_PS;
2201     }
2202 }
2203 
2204 /* ========================================================================== */
2205 
2206 /* Matrix multiplication:
2207  *
2208  * in memory: 0 1 2 3 (row major)
2209  *            4 5 6 7
2210  *            8 9 a b
2211  *            c d e f
2212  *
2213  *    cA cB cC cD
2214  * r0             = (r0 * cA) (r0 * cB) . .
2215  * r1             = (r1 * cA) (r1 * cB)
2216  * r2             = (r2 * cA) .
2217  * r3             = (r3 * cA) .
2218  *
2219  *               r: (11) (12) (13) (14)
2220  *                  (21) (22) (23) (24)
2221  *                  (31) (32) (33) (34)
2222  *                  (41) (42) (43) (44)
2223  * l: (11 12 13 14)
2224  *    (21 22 23 24)
2225  *    (31 32 33 34)
2226  *    (41 42 43 44)
2227  *
2228  * v: (x  y  z  1 )
2229  *
2230  * t.xyzw = MUL(v.xxxx, r[0]);
2231  * t.xyzw = MAD(v.yyyy, r[1], t.xyzw);
2232  * t.xyzw = MAD(v.zzzz, r[2], t.xyzw);
2233  * v.xyzw = MAD(v.wwww, r[3], t.xyzw);
2234  *
2235  * v.x = DP4(v, c[0]);
2236  * v.y = DP4(v, c[1]);
2237  * v.z = DP4(v, c[2]);
2238  * v.w = DP4(v, c[3]) = 1
2239  */
2240 
2241 /*
2242 static void
2243 nine_D3DMATRIX_print(const D3DMATRIX *M)
2244 {
2245     DBG("\n(%f %f %f %f)\n"
2246         "(%f %f %f %f)\n"
2247         "(%f %f %f %f)\n"
2248         "(%f %f %f %f)\n",
2249         M->m[0][0], M->m[0][1], M->m[0][2], M->m[0][3],
2250         M->m[1][0], M->m[1][1], M->m[1][2], M->m[1][3],
2251         M->m[2][0], M->m[2][1], M->m[2][2], M->m[2][3],
2252         M->m[3][0], M->m[3][1], M->m[3][2], M->m[3][3]);
2253 }
2254 */
2255 
2256 static inline float
nine_DP4_row_col(const D3DMATRIX * A,int r,const D3DMATRIX * B,int c)2257 nine_DP4_row_col(const D3DMATRIX *A, int r, const D3DMATRIX *B, int c)
2258 {
2259     return A->m[r][0] * B->m[0][c] +
2260            A->m[r][1] * B->m[1][c] +
2261            A->m[r][2] * B->m[2][c] +
2262            A->m[r][3] * B->m[3][c];
2263 }
2264 
2265 static inline float
nine_DP4_vec_col(const D3DVECTOR * v,const D3DMATRIX * M,int c)2266 nine_DP4_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2267 {
2268     return v->x * M->m[0][c] +
2269            v->y * M->m[1][c] +
2270            v->z * M->m[2][c] +
2271            1.0f * M->m[3][c];
2272 }
2273 
2274 static inline float
nine_DP3_vec_col(const D3DVECTOR * v,const D3DMATRIX * M,int c)2275 nine_DP3_vec_col(const D3DVECTOR *v, const D3DMATRIX *M, int c)
2276 {
2277     return v->x * M->m[0][c] +
2278            v->y * M->m[1][c] +
2279            v->z * M->m[2][c];
2280 }
2281 
2282 void
nine_d3d_matrix_matrix_mul(D3DMATRIX * D,const D3DMATRIX * L,const D3DMATRIX * R)2283 nine_d3d_matrix_matrix_mul(D3DMATRIX *D, const D3DMATRIX *L, const D3DMATRIX *R)
2284 {
2285     D->_11 = nine_DP4_row_col(L, 0, R, 0);
2286     D->_12 = nine_DP4_row_col(L, 0, R, 1);
2287     D->_13 = nine_DP4_row_col(L, 0, R, 2);
2288     D->_14 = nine_DP4_row_col(L, 0, R, 3);
2289 
2290     D->_21 = nine_DP4_row_col(L, 1, R, 0);
2291     D->_22 = nine_DP4_row_col(L, 1, R, 1);
2292     D->_23 = nine_DP4_row_col(L, 1, R, 2);
2293     D->_24 = nine_DP4_row_col(L, 1, R, 3);
2294 
2295     D->_31 = nine_DP4_row_col(L, 2, R, 0);
2296     D->_32 = nine_DP4_row_col(L, 2, R, 1);
2297     D->_33 = nine_DP4_row_col(L, 2, R, 2);
2298     D->_34 = nine_DP4_row_col(L, 2, R, 3);
2299 
2300     D->_41 = nine_DP4_row_col(L, 3, R, 0);
2301     D->_42 = nine_DP4_row_col(L, 3, R, 1);
2302     D->_43 = nine_DP4_row_col(L, 3, R, 2);
2303     D->_44 = nine_DP4_row_col(L, 3, R, 3);
2304 }
2305 
2306 void
nine_d3d_vector4_matrix_mul(D3DVECTOR * d,const D3DVECTOR * v,const D3DMATRIX * M)2307 nine_d3d_vector4_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2308 {
2309     d->x = nine_DP4_vec_col(v, M, 0);
2310     d->y = nine_DP4_vec_col(v, M, 1);
2311     d->z = nine_DP4_vec_col(v, M, 2);
2312 }
2313 
2314 void
nine_d3d_vector3_matrix_mul(D3DVECTOR * d,const D3DVECTOR * v,const D3DMATRIX * M)2315 nine_d3d_vector3_matrix_mul(D3DVECTOR *d, const D3DVECTOR *v, const D3DMATRIX *M)
2316 {
2317     d->x = nine_DP3_vec_col(v, M, 0);
2318     d->y = nine_DP3_vec_col(v, M, 1);
2319     d->z = nine_DP3_vec_col(v, M, 2);
2320 }
2321 
2322 void
nine_d3d_matrix_transpose(D3DMATRIX * D,const D3DMATRIX * M)2323 nine_d3d_matrix_transpose(D3DMATRIX *D, const D3DMATRIX *M)
2324 {
2325     unsigned i, j;
2326     for (i = 0; i < 4; ++i)
2327     for (j = 0; j < 4; ++j)
2328         D->m[i][j] = M->m[j][i];
2329 }
2330 
2331 #define _M_ADD_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2332     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2333     if (t > 0.0f) pos += t; else neg += t; } while(0)
2334 
2335 #define _M_SUB_PROD_1i_2j_3k_4l(i,j,k,l) do {            \
2336     float t = M->_1##i * M->_2##j * M->_3##k * M->_4##l; \
2337     if (t > 0.0f) neg -= t; else pos -= t; } while(0)
2338 float
nine_d3d_matrix_det(const D3DMATRIX * M)2339 nine_d3d_matrix_det(const D3DMATRIX *M)
2340 {
2341     float pos = 0.0f;
2342     float neg = 0.0f;
2343 
2344     _M_ADD_PROD_1i_2j_3k_4l(1, 2, 3, 4);
2345     _M_ADD_PROD_1i_2j_3k_4l(1, 3, 4, 2);
2346     _M_ADD_PROD_1i_2j_3k_4l(1, 4, 2, 3);
2347 
2348     _M_ADD_PROD_1i_2j_3k_4l(2, 1, 4, 3);
2349     _M_ADD_PROD_1i_2j_3k_4l(2, 3, 1, 4);
2350     _M_ADD_PROD_1i_2j_3k_4l(2, 4, 3, 1);
2351 
2352     _M_ADD_PROD_1i_2j_3k_4l(3, 1, 2, 4);
2353     _M_ADD_PROD_1i_2j_3k_4l(3, 2, 4, 1);
2354     _M_ADD_PROD_1i_2j_3k_4l(3, 4, 1, 2);
2355 
2356     _M_ADD_PROD_1i_2j_3k_4l(4, 1, 3, 2);
2357     _M_ADD_PROD_1i_2j_3k_4l(4, 2, 1, 3);
2358     _M_ADD_PROD_1i_2j_3k_4l(4, 3, 2, 1);
2359 
2360     _M_SUB_PROD_1i_2j_3k_4l(1, 2, 4, 3);
2361     _M_SUB_PROD_1i_2j_3k_4l(1, 3, 2, 4);
2362     _M_SUB_PROD_1i_2j_3k_4l(1, 4, 3, 2);
2363 
2364     _M_SUB_PROD_1i_2j_3k_4l(2, 1, 3, 4);
2365     _M_SUB_PROD_1i_2j_3k_4l(2, 3, 4, 1);
2366     _M_SUB_PROD_1i_2j_3k_4l(2, 4, 1, 3);
2367 
2368     _M_SUB_PROD_1i_2j_3k_4l(3, 1, 4, 2);
2369     _M_SUB_PROD_1i_2j_3k_4l(3, 2, 1, 4);
2370     _M_SUB_PROD_1i_2j_3k_4l(3, 4, 2, 1);
2371 
2372     _M_SUB_PROD_1i_2j_3k_4l(4, 1, 2, 3);
2373     _M_SUB_PROD_1i_2j_3k_4l(4, 2, 3, 1);
2374     _M_SUB_PROD_1i_2j_3k_4l(4, 3, 1, 2);
2375 
2376     return pos + neg;
2377 }
2378 
2379 /* XXX: Probably better to just use src/mesa/math/m_matrix.c because
2380  * I have no idea where this code came from.
2381  */
2382 void
nine_d3d_matrix_inverse(D3DMATRIX * D,const D3DMATRIX * M)2383 nine_d3d_matrix_inverse(D3DMATRIX *D, const D3DMATRIX *M)
2384 {
2385     int i, k;
2386     float det;
2387 
2388     D->m[0][0] =
2389         M->m[1][1] * M->m[2][2] * M->m[3][3] -
2390         M->m[1][1] * M->m[3][2] * M->m[2][3] -
2391         M->m[1][2] * M->m[2][1] * M->m[3][3] +
2392         M->m[1][2] * M->m[3][1] * M->m[2][3] +
2393         M->m[1][3] * M->m[2][1] * M->m[3][2] -
2394         M->m[1][3] * M->m[3][1] * M->m[2][2];
2395 
2396     D->m[0][1] =
2397        -M->m[0][1] * M->m[2][2] * M->m[3][3] +
2398         M->m[0][1] * M->m[3][2] * M->m[2][3] +
2399         M->m[0][2] * M->m[2][1] * M->m[3][3] -
2400         M->m[0][2] * M->m[3][1] * M->m[2][3] -
2401         M->m[0][3] * M->m[2][1] * M->m[3][2] +
2402         M->m[0][3] * M->m[3][1] * M->m[2][2];
2403 
2404     D->m[0][2] =
2405         M->m[0][1] * M->m[1][2] * M->m[3][3] -
2406         M->m[0][1] * M->m[3][2] * M->m[1][3] -
2407         M->m[0][2] * M->m[1][1] * M->m[3][3] +
2408         M->m[0][2] * M->m[3][1] * M->m[1][3] +
2409         M->m[0][3] * M->m[1][1] * M->m[3][2] -
2410         M->m[0][3] * M->m[3][1] * M->m[1][2];
2411 
2412     D->m[0][3] =
2413        -M->m[0][1] * M->m[1][2] * M->m[2][3] +
2414         M->m[0][1] * M->m[2][2] * M->m[1][3] +
2415         M->m[0][2] * M->m[1][1] * M->m[2][3] -
2416         M->m[0][2] * M->m[2][1] * M->m[1][3] -
2417         M->m[0][3] * M->m[1][1] * M->m[2][2] +
2418         M->m[0][3] * M->m[2][1] * M->m[1][2];
2419 
2420     D->m[1][0] =
2421        -M->m[1][0] * M->m[2][2] * M->m[3][3] +
2422         M->m[1][0] * M->m[3][2] * M->m[2][3] +
2423         M->m[1][2] * M->m[2][0] * M->m[3][3] -
2424         M->m[1][2] * M->m[3][0] * M->m[2][3] -
2425         M->m[1][3] * M->m[2][0] * M->m[3][2] +
2426         M->m[1][3] * M->m[3][0] * M->m[2][2];
2427 
2428     D->m[1][1] =
2429         M->m[0][0] * M->m[2][2] * M->m[3][3] -
2430         M->m[0][0] * M->m[3][2] * M->m[2][3] -
2431         M->m[0][2] * M->m[2][0] * M->m[3][3] +
2432         M->m[0][2] * M->m[3][0] * M->m[2][3] +
2433         M->m[0][3] * M->m[2][0] * M->m[3][2] -
2434         M->m[0][3] * M->m[3][0] * M->m[2][2];
2435 
2436     D->m[1][2] =
2437        -M->m[0][0] * M->m[1][2] * M->m[3][3] +
2438         M->m[0][0] * M->m[3][2] * M->m[1][3] +
2439         M->m[0][2] * M->m[1][0] * M->m[3][3] -
2440         M->m[0][2] * M->m[3][0] * M->m[1][3] -
2441         M->m[0][3] * M->m[1][0] * M->m[3][2] +
2442         M->m[0][3] * M->m[3][0] * M->m[1][2];
2443 
2444     D->m[1][3] =
2445         M->m[0][0] * M->m[1][2] * M->m[2][3] -
2446         M->m[0][0] * M->m[2][2] * M->m[1][3] -
2447         M->m[0][2] * M->m[1][0] * M->m[2][3] +
2448         M->m[0][2] * M->m[2][0] * M->m[1][3] +
2449         M->m[0][3] * M->m[1][0] * M->m[2][2] -
2450         M->m[0][3] * M->m[2][0] * M->m[1][2];
2451 
2452     D->m[2][0] =
2453         M->m[1][0] * M->m[2][1] * M->m[3][3] -
2454         M->m[1][0] * M->m[3][1] * M->m[2][3] -
2455         M->m[1][1] * M->m[2][0] * M->m[3][3] +
2456         M->m[1][1] * M->m[3][0] * M->m[2][3] +
2457         M->m[1][3] * M->m[2][0] * M->m[3][1] -
2458         M->m[1][3] * M->m[3][0] * M->m[2][1];
2459 
2460     D->m[2][1] =
2461        -M->m[0][0] * M->m[2][1] * M->m[3][3] +
2462         M->m[0][0] * M->m[3][1] * M->m[2][3] +
2463         M->m[0][1] * M->m[2][0] * M->m[3][3] -
2464         M->m[0][1] * M->m[3][0] * M->m[2][3] -
2465         M->m[0][3] * M->m[2][0] * M->m[3][1] +
2466         M->m[0][3] * M->m[3][0] * M->m[2][1];
2467 
2468     D->m[2][2] =
2469         M->m[0][0] * M->m[1][1] * M->m[3][3] -
2470         M->m[0][0] * M->m[3][1] * M->m[1][3] -
2471         M->m[0][1] * M->m[1][0] * M->m[3][3] +
2472         M->m[0][1] * M->m[3][0] * M->m[1][3] +
2473         M->m[0][3] * M->m[1][0] * M->m[3][1] -
2474         M->m[0][3] * M->m[3][0] * M->m[1][1];
2475 
2476     D->m[2][3] =
2477        -M->m[0][0] * M->m[1][1] * M->m[2][3] +
2478         M->m[0][0] * M->m[2][1] * M->m[1][3] +
2479         M->m[0][1] * M->m[1][0] * M->m[2][3] -
2480         M->m[0][1] * M->m[2][0] * M->m[1][3] -
2481         M->m[0][3] * M->m[1][0] * M->m[2][1] +
2482         M->m[0][3] * M->m[2][0] * M->m[1][1];
2483 
2484     D->m[3][0] =
2485        -M->m[1][0] * M->m[2][1] * M->m[3][2] +
2486         M->m[1][0] * M->m[3][1] * M->m[2][2] +
2487         M->m[1][1] * M->m[2][0] * M->m[3][2] -
2488         M->m[1][1] * M->m[3][0] * M->m[2][2] -
2489         M->m[1][2] * M->m[2][0] * M->m[3][1] +
2490         M->m[1][2] * M->m[3][0] * M->m[2][1];
2491 
2492     D->m[3][1] =
2493         M->m[0][0] * M->m[2][1] * M->m[3][2] -
2494         M->m[0][0] * M->m[3][1] * M->m[2][2] -
2495         M->m[0][1] * M->m[2][0] * M->m[3][2] +
2496         M->m[0][1] * M->m[3][0] * M->m[2][2] +
2497         M->m[0][2] * M->m[2][0] * M->m[3][1] -
2498         M->m[0][2] * M->m[3][0] * M->m[2][1];
2499 
2500     D->m[3][2] =
2501        -M->m[0][0] * M->m[1][1] * M->m[3][2] +
2502         M->m[0][0] * M->m[3][1] * M->m[1][2] +
2503         M->m[0][1] * M->m[1][0] * M->m[3][2] -
2504         M->m[0][1] * M->m[3][0] * M->m[1][2] -
2505         M->m[0][2] * M->m[1][0] * M->m[3][1] +
2506         M->m[0][2] * M->m[3][0] * M->m[1][1];
2507 
2508     D->m[3][3] =
2509         M->m[0][0] * M->m[1][1] * M->m[2][2] -
2510         M->m[0][0] * M->m[2][1] * M->m[1][2] -
2511         M->m[0][1] * M->m[1][0] * M->m[2][2] +
2512         M->m[0][1] * M->m[2][0] * M->m[1][2] +
2513         M->m[0][2] * M->m[1][0] * M->m[2][1] -
2514         M->m[0][2] * M->m[2][0] * M->m[1][1];
2515 
2516     det =
2517         M->m[0][0] * D->m[0][0] +
2518         M->m[1][0] * D->m[0][1] +
2519         M->m[2][0] * D->m[0][2] +
2520         M->m[3][0] * D->m[0][3];
2521 
2522     if (fabsf(det) < 1e-30) {/* non inversible */
2523         *D = *M; /* wine tests */
2524         return;
2525     }
2526 
2527     det = 1.0 / det;
2528 
2529     for (i = 0; i < 4; i++)
2530     for (k = 0; k < 4; k++)
2531         D->m[i][k] *= det;
2532 
2533 #if defined(DEBUG) || !defined(NDEBUG)
2534     {
2535         D3DMATRIX I;
2536 
2537         nine_d3d_matrix_matrix_mul(&I, D, M);
2538 
2539         for (i = 0; i < 4; ++i)
2540         for (k = 0; k < 4; ++k)
2541             if (fabsf(I.m[i][k] - (float)(i == k)) > 1e-3)
2542                 DBG("Matrix inversion check FAILED !\n");
2543     }
2544 #endif
2545 }
2546