1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007-2008 VMware, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * @file
31 * Position and shader input interpolation.
32 *
33 * @author Jose Fonseca <jfonseca@vmware.com>
34 */
35
36 #include "pipe/p_shader_tokens.h"
37 #include "util/u_debug.h"
38 #include "util/u_memory.h"
39 #include "util/u_math.h"
40 #include "tgsi/tgsi_scan.h"
41 #include "gallivm/lp_bld_debug.h"
42 #include "gallivm/lp_bld_const.h"
43 #include "gallivm/lp_bld_arit.h"
44 #include "gallivm/lp_bld_swizzle.h"
45 #include "gallivm/lp_bld_flow.h"
46 #include "lp_bld_interp.h"
47
48
49 /*
50 * The shader JIT function operates on blocks of quads.
51 * Each block has 2x2 quads and each quad has 2x2 pixels.
52 *
53 * We iterate over the quads in order 0, 1, 2, 3:
54 *
 *   #################
 *   #   |   #   |   #
 *   #---0---#---1---#
 *   #   |   #   |   #
 *   #################
 *   #   |   #   |   #
 *   #---2---#---3---#
 *   #   |   #   |   #
 *   #################
64 *
65 * If we iterate over multiple quads at once, quads 01 and 23 are processed
66 * together.
67 *
68 * Within each quad, we have four pixels which are represented in SOA
69 * order:
70 *
71 * #########
72 * # 0 | 1 #
73 * #---+---#
74 * # 2 | 3 #
75 * #########
76 *
77 * So the green channel (for example) of the four pixels is stored in
78 * a single vector register: {g0, g1, g2, g3}.
79 * The order stays the same even with multiple quads:
80 * 0 1 4 5
81 * 2 3 6 7
82 * is stored as g0..g7
83 */
84
85
86 /**
87 * Do one perspective divide per quad.
88 *
89 * For perspective interpolation, the final attribute value is given
90 *
91 * a' = a/w = a * oow
92 *
93 * where
94 *
95 * a = a0 + dadx*x + dady*y
96 * w = w0 + dwdx*x + dwdy*y
97 * oow = 1/w = 1/(w0 + dwdx*x + dwdy*y)
98 *
99 * Instead of computing the division per pixel, with this macro we compute the
100 * division on the upper left pixel of each quad, and use a linear
101 * approximation in the remaining pixels, given by:
102 *
103 * da'dx = (dadx - dwdx*a)*oow
104 * da'dy = (dady - dwdy*a)*oow
105 *
106 * Ironically, this actually makes things slower -- probably because the
107 * divide hardware unit is rarely used, whereas the multiply unit is typically
108 * already saturated.
109 */
#define PERSPECTIVE_DIVIDE_PER_QUAD 0


/*
 * Per-pixel x/y offsets relative to the upper-left pixel of the block,
 * indexed by vector lane.  Every run of four entries describes one 2x2
 * quad in the SOA pixel order documented at the top of this file.
 */
static const unsigned char quad_offset_x[16] = {
   0, 1, 0, 1,
   2, 3, 2, 3,
   0, 1, 0, 1,
   2, 3, 2, 3
};

static const unsigned char quad_offset_y[16] = {
   0, 0, 1, 1,
   0, 0, 1, 1,
   2, 2, 3, 3,
   2, 2, 3, 3
};
115
116
117 static void
attrib_name(LLVMValueRef val,unsigned attrib,unsigned chan,const char * suffix)118 attrib_name(LLVMValueRef val, unsigned attrib, unsigned chan, const char *suffix)
119 {
120 if(attrib == 0)
121 lp_build_name(val, "pos.%c%s", "xyzw"[chan], suffix);
122 else
123 lp_build_name(val, "input%u.%c%s", attrib - 1, "xyzw"[chan], suffix);
124 }
125
126 static void
calc_offsets(struct lp_build_context * coeff_bld,unsigned quad_start_index,LLVMValueRef * pixoffx,LLVMValueRef * pixoffy)127 calc_offsets(struct lp_build_context *coeff_bld,
128 unsigned quad_start_index,
129 LLVMValueRef *pixoffx,
130 LLVMValueRef *pixoffy)
131 {
132 unsigned i;
133 unsigned num_pix = coeff_bld->type.length;
134 struct gallivm_state *gallivm = coeff_bld->gallivm;
135 LLVMBuilderRef builder = coeff_bld->gallivm->builder;
136 LLVMValueRef nr, pixxf, pixyf;
137
138 *pixoffx = coeff_bld->undef;
139 *pixoffy = coeff_bld->undef;
140
141 for (i = 0; i < num_pix; i++) {
142 nr = lp_build_const_int32(gallivm, i);
143 pixxf = lp_build_const_float(gallivm, quad_offset_x[i % num_pix] +
144 (quad_start_index & 1) * 2);
145 pixyf = lp_build_const_float(gallivm, quad_offset_y[i % num_pix] +
146 (quad_start_index & 2));
147 *pixoffx = LLVMBuildInsertElement(builder, *pixoffx, pixxf, nr, "");
148 *pixoffy = LLVMBuildInsertElement(builder, *pixoffy, pixyf, nr, "");
149 }
150 }
151
152
153 /* Much easier, and significantly less instructions in the per-stamp
154 * part (less than half) but overall more instructions so a loss if
155 * most quads are active. Might be a win though with larger vectors.
156 * No ability to do per-quad divide (doable but not implemented)
157 * Could be made to work with passed in pixel offsets (i.e. active quad merging).
158 */
159 static void
coeffs_init_simple(struct lp_build_interp_soa_context * bld,LLVMValueRef a0_ptr,LLVMValueRef dadx_ptr,LLVMValueRef dady_ptr)160 coeffs_init_simple(struct lp_build_interp_soa_context *bld,
161 LLVMValueRef a0_ptr,
162 LLVMValueRef dadx_ptr,
163 LLVMValueRef dady_ptr)
164 {
165 struct lp_build_context *coeff_bld = &bld->coeff_bld;
166 struct lp_build_context *setup_bld = &bld->setup_bld;
167 struct gallivm_state *gallivm = coeff_bld->gallivm;
168 LLVMBuilderRef builder = gallivm->builder;
169 unsigned attrib;
170
171 for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
172 /*
173 * always fetch all 4 values for performance/simplicity
174 * Note: we do that here because it seems to generate better
175 * code. It generates a lot of moves initially but less
176 * moves later. As far as I can tell this looks like a
177 * llvm issue, instead of simply reloading the values from
178 * the passed in pointers it if it runs out of registers
179 * it spills/reloads them. Maybe some optimization passes
180 * would help.
181 * Might want to investigate this again later.
182 */
183 const unsigned interp = bld->interp[attrib];
184 LLVMValueRef index = lp_build_const_int32(gallivm,
185 attrib * TGSI_NUM_CHANNELS);
186 LLVMValueRef ptr;
187 LLVMValueRef dadxaos = setup_bld->zero;
188 LLVMValueRef dadyaos = setup_bld->zero;
189 LLVMValueRef a0aos = setup_bld->zero;
190
191 switch (interp) {
192 case LP_INTERP_PERSPECTIVE:
193 /* fall-through */
194
195 case LP_INTERP_LINEAR:
196 ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
197 ptr = LLVMBuildBitCast(builder, ptr,
198 LLVMPointerType(setup_bld->vec_type, 0), "");
199 dadxaos = LLVMBuildLoad(builder, ptr, "");
200
201 ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
202 ptr = LLVMBuildBitCast(builder, ptr,
203 LLVMPointerType(setup_bld->vec_type, 0), "");
204 dadyaos = LLVMBuildLoad(builder, ptr, "");
205
206 attrib_name(dadxaos, attrib, 0, ".dadxaos");
207 attrib_name(dadyaos, attrib, 0, ".dadyaos");
208 /* fall-through */
209
210 case LP_INTERP_CONSTANT:
211 case LP_INTERP_FACING:
212 ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
213 ptr = LLVMBuildBitCast(builder, ptr,
214 LLVMPointerType(setup_bld->vec_type, 0), "");
215 a0aos = LLVMBuildLoad(builder, ptr, "");
216 attrib_name(a0aos, attrib, 0, ".a0aos");
217 break;
218
219 case LP_INTERP_POSITION:
220 /* Nothing to do as the position coeffs are already setup in slot 0 */
221 continue;
222
223 default:
224 assert(0);
225 break;
226 }
227 bld->a0aos[attrib] = a0aos;
228 bld->dadxaos[attrib] = dadxaos;
229 bld->dadyaos[attrib] = dadyaos;
230 }
231 }
232
233 /**
234 * Interpolate the shader input attribute values.
235 * This is called for each (group of) quad(s).
236 */
237 static void
attribs_update_simple(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,LLVMValueRef loop_iter,int start,int end)238 attribs_update_simple(struct lp_build_interp_soa_context *bld,
239 struct gallivm_state *gallivm,
240 LLVMValueRef loop_iter,
241 int start,
242 int end)
243 {
244 LLVMBuilderRef builder = gallivm->builder;
245 struct lp_build_context *coeff_bld = &bld->coeff_bld;
246 struct lp_build_context *setup_bld = &bld->setup_bld;
247 LLVMValueRef oow = NULL;
248 unsigned attrib;
249 LLVMValueRef pixoffx;
250 LLVMValueRef pixoffy;
251 LLVMValueRef ptr;
252
253 /* could do this with code-generated passed in pixel offsets too */
254
255 assert(loop_iter);
256 ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, "");
257 pixoffx = LLVMBuildLoad(builder, ptr, "");
258 ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, "");
259 pixoffy = LLVMBuildLoad(builder, ptr, "");
260
261 pixoffx = LLVMBuildFAdd(builder, pixoffx,
262 lp_build_broadcast_scalar(coeff_bld, bld->x), "");
263 pixoffy = LLVMBuildFAdd(builder, pixoffy,
264 lp_build_broadcast_scalar(coeff_bld, bld->y), "");
265
266 for (attrib = start; attrib < end; attrib++) {
267 const unsigned mask = bld->mask[attrib];
268 const unsigned interp = bld->interp[attrib];
269 unsigned chan;
270
271 for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
272 if (mask & (1 << chan)) {
273 LLVMValueRef index;
274 LLVMValueRef dadx = coeff_bld->zero;
275 LLVMValueRef dady = coeff_bld->zero;
276 LLVMValueRef a = coeff_bld->zero;
277
278 index = lp_build_const_int32(gallivm, chan);
279 switch (interp) {
280 case LP_INTERP_PERSPECTIVE:
281 /* fall-through */
282
283 case LP_INTERP_LINEAR:
284 if (attrib == 0 && chan == 0) {
285 dadx = coeff_bld->one;
286 if (bld->pos_offset) {
287 a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
288 }
289 }
290 else if (attrib == 0 && chan == 1) {
291 dady = coeff_bld->one;
292 if (bld->pos_offset) {
293 a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset);
294 }
295 }
296 else {
297 dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
298 coeff_bld->type, bld->dadxaos[attrib],
299 index);
300 dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
301 coeff_bld->type, bld->dadyaos[attrib],
302 index);
303 a = lp_build_extract_broadcast(gallivm, setup_bld->type,
304 coeff_bld->type, bld->a0aos[attrib],
305 index);
306 }
307 /*
308 * a = a0 + (x * dadx + y * dady)
309 */
310 a = lp_build_fmuladd(builder, dadx, pixoffx, a);
311 a = lp_build_fmuladd(builder, dady, pixoffy, a);
312
313 if (interp == LP_INTERP_PERSPECTIVE) {
314 if (oow == NULL) {
315 LLVMValueRef w = bld->attribs[0][3];
316 assert(attrib != 0);
317 assert(bld->mask[0] & TGSI_WRITEMASK_W);
318 oow = lp_build_rcp(coeff_bld, w);
319 }
320 a = lp_build_mul(coeff_bld, a, oow);
321 }
322 break;
323
324 case LP_INTERP_CONSTANT:
325 case LP_INTERP_FACING:
326 a = lp_build_extract_broadcast(gallivm, setup_bld->type,
327 coeff_bld->type, bld->a0aos[attrib],
328 index);
329 break;
330
331 case LP_INTERP_POSITION:
332 assert(attrib > 0);
333 a = bld->attribs[0][chan];
334 break;
335
336 default:
337 assert(0);
338 break;
339 }
340
341 if ((attrib == 0) && (chan == 2) && !bld->depth_clamp){
342 /* FIXME: Depth values can exceed 1.0, due to the fact that
343 * setup interpolation coefficients refer to (0,0) which causes
344 * precision loss. So we must clamp to 1.0 here to avoid artifacts.
345 * Note though values outside [0,1] are perfectly valid with
346 * depth clip disabled.
347 * XXX: If depth clip is disabled but we force depth clamp
348 * we may get values larger than 1.0 in the fs (but not in
349 * depth test). Not sure if that's an issue...
350 * Also, on a similar note, it is not obvious if the depth values
351 * appearing in fs (with depth clip disabled) should be clamped
352 * to [0,1], clamped to near/far or not be clamped at all...
353 */
354 a = lp_build_min(coeff_bld, a, coeff_bld->one);
355 }
356 bld->attribs[attrib][chan] = a;
357 }
358 }
359 }
360 }
361
362 /**
363 * Initialize the bld->a, dadq fields. This involves fetching
364 * those values from the arrays which are passed into the JIT function.
365 */
366 static void
coeffs_init(struct lp_build_interp_soa_context * bld,LLVMValueRef a0_ptr,LLVMValueRef dadx_ptr,LLVMValueRef dady_ptr)367 coeffs_init(struct lp_build_interp_soa_context *bld,
368 LLVMValueRef a0_ptr,
369 LLVMValueRef dadx_ptr,
370 LLVMValueRef dady_ptr)
371 {
372 struct lp_build_context *coeff_bld = &bld->coeff_bld;
373 struct lp_build_context *setup_bld = &bld->setup_bld;
374 struct gallivm_state *gallivm = coeff_bld->gallivm;
375 LLVMBuilderRef builder = gallivm->builder;
376 LLVMValueRef pixoffx, pixoffy;
377 unsigned attrib;
378 unsigned chan;
379 unsigned i;
380
381 pixoffx = coeff_bld->undef;
382 pixoffy = coeff_bld->undef;
383 for (i = 0; i < coeff_bld->type.length; i++) {
384 LLVMValueRef nr = lp_build_const_int32(gallivm, i);
385 LLVMValueRef pixxf = lp_build_const_float(gallivm, quad_offset_x[i]);
386 LLVMValueRef pixyf = lp_build_const_float(gallivm, quad_offset_y[i]);
387 pixoffx = LLVMBuildInsertElement(builder, pixoffx, pixxf, nr, "");
388 pixoffy = LLVMBuildInsertElement(builder, pixoffy, pixyf, nr, "");
389 }
390
391
392 for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
393 const unsigned mask = bld->mask[attrib];
394 const unsigned interp = bld->interp[attrib];
395 LLVMValueRef index = lp_build_const_int32(gallivm,
396 attrib * TGSI_NUM_CHANNELS);
397 LLVMValueRef ptr;
398 LLVMValueRef dadxaos = setup_bld->zero;
399 LLVMValueRef dadyaos = setup_bld->zero;
400 LLVMValueRef a0aos = setup_bld->zero;
401
402 /* always fetch all 4 values for performance/simplicity */
403 switch (interp) {
404 case LP_INTERP_PERSPECTIVE:
405 /* fall-through */
406
407 case LP_INTERP_LINEAR:
408 ptr = LLVMBuildGEP(builder, dadx_ptr, &index, 1, "");
409 ptr = LLVMBuildBitCast(builder, ptr,
410 LLVMPointerType(setup_bld->vec_type, 0), "");
411 dadxaos = LLVMBuildLoad(builder, ptr, "");
412
413 ptr = LLVMBuildGEP(builder, dady_ptr, &index, 1, "");
414 ptr = LLVMBuildBitCast(builder, ptr,
415 LLVMPointerType(setup_bld->vec_type, 0), "");
416 dadyaos = LLVMBuildLoad(builder, ptr, "");
417
418 attrib_name(dadxaos, attrib, 0, ".dadxaos");
419 attrib_name(dadyaos, attrib, 0, ".dadyaos");
420 /* fall-through */
421
422 case LP_INTERP_CONSTANT:
423 case LP_INTERP_FACING:
424 ptr = LLVMBuildGEP(builder, a0_ptr, &index, 1, "");
425 ptr = LLVMBuildBitCast(builder, ptr,
426 LLVMPointerType(setup_bld->vec_type, 0), "");
427 a0aos = LLVMBuildLoad(builder, ptr, "");
428 attrib_name(a0aos, attrib, 0, ".a0aos");
429 break;
430
431 case LP_INTERP_POSITION:
432 /* Nothing to do as the position coeffs are already setup in slot 0 */
433 continue;
434
435 default:
436 assert(0);
437 break;
438 }
439
440 /*
441 * a = a0 + (x * dadx + y * dady)
442 * a0aos is the attrib value at top left corner of stamp
443 */
444 if (interp != LP_INTERP_CONSTANT &&
445 interp != LP_INTERP_FACING) {
446 LLVMValueRef x = lp_build_broadcast_scalar(setup_bld, bld->x);
447 LLVMValueRef y = lp_build_broadcast_scalar(setup_bld, bld->y);
448 a0aos = lp_build_fmuladd(builder, x, dadxaos, a0aos);
449 a0aos = lp_build_fmuladd(builder, y, dadyaos, a0aos);
450 }
451
452 /*
453 * dadq = {0, dadx, dady, dadx + dady}
454 * for two quads (side by side) this is:
455 * {0, dadx, dady, dadx+dady, 2*dadx, 2*dadx+dady, 3*dadx+dady}
456 */
457 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
458 /* this generates a CRAPLOAD of shuffles... */
459 if (mask & (1 << chan)) {
460 LLVMValueRef dadx, dady;
461 LLVMValueRef dadq, dadq2;
462 LLVMValueRef a;
463 LLVMValueRef chan_index = lp_build_const_int32(gallivm, chan);
464
465 if (attrib == 0 && chan == 0) {
466 a = bld->x;
467 if (bld->pos_offset) {
468 a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
469 }
470 a = lp_build_broadcast_scalar(coeff_bld, a);
471 dadx = coeff_bld->one;
472 dady = coeff_bld->zero;
473 }
474 else if (attrib == 0 && chan == 1) {
475 a = bld->y;
476 if (bld->pos_offset) {
477 a = LLVMBuildFAdd(builder, a, lp_build_const_float(gallivm, bld->pos_offset), "");
478 }
479 a = lp_build_broadcast_scalar(coeff_bld, a);
480 dady = coeff_bld->one;
481 dadx = coeff_bld->zero;
482 }
483 else {
484 dadx = lp_build_extract_broadcast(gallivm, setup_bld->type,
485 coeff_bld->type, dadxaos, chan_index);
486 dady = lp_build_extract_broadcast(gallivm, setup_bld->type,
487 coeff_bld->type, dadyaos, chan_index);
488
489 /*
490 * a = {a, a, a, a}
491 */
492 a = lp_build_extract_broadcast(gallivm, setup_bld->type,
493 coeff_bld->type, a0aos, chan_index);
494 }
495
496 dadx = LLVMBuildFMul(builder, dadx, pixoffx, "");
497 dady = LLVMBuildFMul(builder, dady, pixoffy, "");
498 dadq = LLVMBuildFAdd(builder, dadx, dady, "");
499
500 /*
501 * Compute the attrib values on the upper-left corner of each
502 * group of quads.
503 * Note that if we process 2 quads at once this doesn't
504 * really exactly to what we want.
505 * We need to access elem 0 and 2 respectively later if we process
506 * 2 quads at once.
507 */
508
509 if (interp != LP_INTERP_CONSTANT &&
510 interp != LP_INTERP_FACING) {
511 dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
512 a = LLVMBuildFAdd(builder, a, dadq2, "");
513 }
514
515 #if PERSPECTIVE_DIVIDE_PER_QUAD
516 /*
517 * a *= 1 / w
518 */
519
520 /*
521 * XXX since we're only going to access elements 0,2 out of 8
522 * if we have 8-wide vectors we should do the division only 4-wide.
523 * a is really a 2-elements in a 4-wide vector disguised as 8-wide
524 * in this case.
525 */
526 if (interp == LP_INTERP_PERSPECTIVE) {
527 LLVMValueRef w = bld->a[0][3];
528 assert(attrib != 0);
529 assert(bld->mask[0] & TGSI_WRITEMASK_W);
530 if (!bld->oow) {
531 bld->oow = lp_build_rcp(coeff_bld, w);
532 lp_build_name(bld->oow, "oow");
533 }
534 a = lp_build_mul(coeff_bld, a, bld->oow);
535 }
536 #endif
537
538 attrib_name(a, attrib, chan, ".a");
539 attrib_name(dadq, attrib, chan, ".dadq");
540
541 bld->a[attrib][chan] = lp_build_alloca(gallivm,
542 LLVMTypeOf(a), "");
543 LLVMBuildStore(builder, a, bld->a[attrib][chan]);
544 bld->dadq[attrib][chan] = dadq;
545 }
546 }
547 }
548 }
549
550
551 /**
552 * Increment the shader input attribute values.
553 * This is called when we move from one quad to the next.
554 */
555 static void
attribs_update(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,LLVMValueRef loop_iter,int start,int end)556 attribs_update(struct lp_build_interp_soa_context *bld,
557 struct gallivm_state *gallivm,
558 LLVMValueRef loop_iter,
559 int start,
560 int end)
561 {
562 LLVMBuilderRef builder = gallivm->builder;
563 struct lp_build_context *coeff_bld = &bld->coeff_bld;
564 LLVMValueRef oow = NULL;
565 unsigned attrib;
566 unsigned chan;
567
568 for(attrib = start; attrib < end; ++attrib) {
569 const unsigned mask = bld->mask[attrib];
570 const unsigned interp = bld->interp[attrib];
571 for(chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
572 if(mask & (1 << chan)) {
573 LLVMValueRef a;
574 if (interp == LP_INTERP_CONSTANT ||
575 interp == LP_INTERP_FACING) {
576 a = LLVMBuildLoad(builder, bld->a[attrib][chan], "");
577 }
578 else if (interp == LP_INTERP_POSITION) {
579 assert(attrib > 0);
580 a = bld->attribs[0][chan];
581 }
582 else {
583 LLVMValueRef dadq;
584
585 a = bld->a[attrib][chan];
586
587 /*
588 * Broadcast the attribute value for this quad into all elements
589 */
590
591 {
592 /* stored as vector load as float */
593 LLVMTypeRef ptr_type = LLVMPointerType(LLVMFloatTypeInContext(
594 gallivm->context), 0);
595 LLVMValueRef ptr;
596 a = LLVMBuildBitCast(builder, a, ptr_type, "");
597 ptr = LLVMBuildGEP(builder, a, &loop_iter, 1, "");
598 a = LLVMBuildLoad(builder, ptr, "");
599 a = lp_build_broadcast_scalar(&bld->coeff_bld, a);
600 }
601
602 /*
603 * Get the derivatives.
604 */
605
606 dadq = bld->dadq[attrib][chan];
607
608 #if PERSPECTIVE_DIVIDE_PER_QUAD
609 if (interp == LP_INTERP_PERSPECTIVE) {
610 LLVMValueRef dwdq = bld->dadq[0][3];
611
612 if (oow == NULL) {
613 assert(bld->oow);
614 oow = LLVMBuildShuffleVector(coeff_bld->builder,
615 bld->oow, coeff_bld->undef,
616 shuffle, "");
617 }
618
619 dadq = lp_build_sub(coeff_bld,
620 dadq,
621 lp_build_mul(coeff_bld, a, dwdq));
622 dadq = lp_build_mul(coeff_bld, dadq, oow);
623 }
624 #endif
625
626 /*
627 * Add the derivatives
628 */
629
630 a = lp_build_add(coeff_bld, a, dadq);
631
632 #if !PERSPECTIVE_DIVIDE_PER_QUAD
633 if (interp == LP_INTERP_PERSPECTIVE) {
634 if (oow == NULL) {
635 LLVMValueRef w = bld->attribs[0][3];
636 assert(attrib != 0);
637 assert(bld->mask[0] & TGSI_WRITEMASK_W);
638 oow = lp_build_rcp(coeff_bld, w);
639 }
640 a = lp_build_mul(coeff_bld, a, oow);
641 }
642 #endif
643
644 if (attrib == 0 && chan == 2 && !bld->depth_clamp) {
645 /* FIXME: Depth values can exceed 1.0, due to the fact that
646 * setup interpolation coefficients refer to (0,0) which causes
647 * precision loss. So we must clamp to 1.0 here to avoid artifacts.
648 * Note though values outside [0,1] are perfectly valid with
649 * depth clip disabled..
650 * XXX: If depth clip is disabled but we force depth clamp
651 * we may get values larger than 1.0 in the fs (but not in
652 * depth test). Not sure if that's an issue...
653 * Also, on a similar note, it is not obvious if the depth values
654 * appearing in fs (with depth clip disabled) should be clamped
655 * to [0,1], clamped to near/far or not be clamped at all...
656 */
657 a = lp_build_min(coeff_bld, a, coeff_bld->one);
658 }
659
660 attrib_name(a, attrib, chan, "");
661 }
662 bld->attribs[attrib][chan] = a;
663 }
664 }
665 }
666 }
667
668
669 /**
670 * Generate the position vectors.
671 *
672 * Parameter x0, y0 are the integer values with upper left coordinates.
673 */
674 static void
pos_init(struct lp_build_interp_soa_context * bld,LLVMValueRef x0,LLVMValueRef y0)675 pos_init(struct lp_build_interp_soa_context *bld,
676 LLVMValueRef x0,
677 LLVMValueRef y0)
678 {
679 LLVMBuilderRef builder = bld->coeff_bld.gallivm->builder;
680 struct lp_build_context *coeff_bld = &bld->coeff_bld;
681
682 bld->x = LLVMBuildSIToFP(builder, x0, coeff_bld->elem_type, "");
683 bld->y = LLVMBuildSIToFP(builder, y0, coeff_bld->elem_type, "");
684 }
685
686
687 /**
688 * Initialize fragment shader input attribute info.
689 */
690 void
lp_build_interp_soa_init(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,unsigned num_inputs,const struct lp_shader_input * inputs,boolean pixel_center_integer,boolean depth_clamp,LLVMBuilderRef builder,struct lp_type type,LLVMValueRef a0_ptr,LLVMValueRef dadx_ptr,LLVMValueRef dady_ptr,LLVMValueRef x0,LLVMValueRef y0)691 lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
692 struct gallivm_state *gallivm,
693 unsigned num_inputs,
694 const struct lp_shader_input *inputs,
695 boolean pixel_center_integer,
696 boolean depth_clamp,
697 LLVMBuilderRef builder,
698 struct lp_type type,
699 LLVMValueRef a0_ptr,
700 LLVMValueRef dadx_ptr,
701 LLVMValueRef dady_ptr,
702 LLVMValueRef x0,
703 LLVMValueRef y0)
704 {
705 struct lp_type coeff_type;
706 struct lp_type setup_type;
707 unsigned attrib;
708 unsigned chan;
709
710 memset(bld, 0, sizeof *bld);
711
712 memset(&coeff_type, 0, sizeof coeff_type);
713 coeff_type.floating = TRUE;
714 coeff_type.sign = TRUE;
715 coeff_type.width = 32;
716 coeff_type.length = type.length;
717
718 memset(&setup_type, 0, sizeof setup_type);
719 setup_type.floating = TRUE;
720 setup_type.sign = TRUE;
721 setup_type.width = 32;
722 setup_type.length = TGSI_NUM_CHANNELS;
723
724
725 /* XXX: we don't support interpolating into any other types */
726 assert(memcmp(&coeff_type, &type, sizeof coeff_type) == 0);
727
728 lp_build_context_init(&bld->coeff_bld, gallivm, coeff_type);
729 lp_build_context_init(&bld->setup_bld, gallivm, setup_type);
730
731 /* For convenience */
732 bld->pos = bld->attribs[0];
733 bld->inputs = (const LLVMValueRef (*)[TGSI_NUM_CHANNELS]) bld->attribs[1];
734
735 /* Position */
736 bld->mask[0] = TGSI_WRITEMASK_XYZW;
737 bld->interp[0] = LP_INTERP_LINEAR;
738
739 /* Inputs */
740 for (attrib = 0; attrib < num_inputs; ++attrib) {
741 bld->mask[1 + attrib] = inputs[attrib].usage_mask;
742 bld->interp[1 + attrib] = inputs[attrib].interp;
743 }
744 bld->num_attribs = 1 + num_inputs;
745
746 /* Ensure all masked out input channels have a valid value */
747 for (attrib = 0; attrib < bld->num_attribs; ++attrib) {
748 for (chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
749 bld->attribs[attrib][chan] = bld->coeff_bld.undef;
750 }
751 }
752
753 if (pixel_center_integer) {
754 bld->pos_offset = 0.0;
755 } else {
756 bld->pos_offset = 0.5;
757 }
758 bld->depth_clamp = depth_clamp;
759
760 pos_init(bld, x0, y0);
761
762 /*
763 * Simple method (single step interpolation) may be slower if vector length
764 * is just 4, but the results are different (generally less accurate) with
765 * the other method, so always use more accurate version.
766 */
767 if (1) {
768 bld->simple_interp = TRUE;
769 {
770 /* XXX this should use a global static table */
771 unsigned i;
772 unsigned num_loops = 16 / type.length;
773 LLVMValueRef pixoffx, pixoffy, index;
774 LLVMValueRef ptr;
775
776 bld->xoffset_store = lp_build_array_alloca(gallivm,
777 lp_build_vec_type(gallivm, type),
778 lp_build_const_int32(gallivm, num_loops),
779 "");
780 bld->yoffset_store = lp_build_array_alloca(gallivm,
781 lp_build_vec_type(gallivm, type),
782 lp_build_const_int32(gallivm, num_loops),
783 "");
784 for (i = 0; i < num_loops; i++) {
785 index = lp_build_const_int32(gallivm, i);
786 calc_offsets(&bld->coeff_bld, i*type.length/4, &pixoffx, &pixoffy);
787 ptr = LLVMBuildGEP(builder, bld->xoffset_store, &index, 1, "");
788 LLVMBuildStore(builder, pixoffx, ptr);
789 ptr = LLVMBuildGEP(builder, bld->yoffset_store, &index, 1, "");
790 LLVMBuildStore(builder, pixoffy, ptr);
791 }
792 }
793 coeffs_init_simple(bld, a0_ptr, dadx_ptr, dady_ptr);
794 }
795 else {
796 bld->simple_interp = FALSE;
797 coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
798 }
799
800 }
801
802
803 /*
804 * Advance the position and inputs to the given quad within the block.
805 */
806
807 void
lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,LLVMValueRef quad_start_index)808 lp_build_interp_soa_update_inputs_dyn(struct lp_build_interp_soa_context *bld,
809 struct gallivm_state *gallivm,
810 LLVMValueRef quad_start_index)
811 {
812 if (bld->simple_interp) {
813 attribs_update_simple(bld, gallivm, quad_start_index, 1, bld->num_attribs);
814 }
815 else {
816 attribs_update(bld, gallivm, quad_start_index, 1, bld->num_attribs);
817 }
818 }
819
820 void
lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context * bld,struct gallivm_state * gallivm,LLVMValueRef quad_start_index)821 lp_build_interp_soa_update_pos_dyn(struct lp_build_interp_soa_context *bld,
822 struct gallivm_state *gallivm,
823 LLVMValueRef quad_start_index)
824 {
825 if (bld->simple_interp) {
826 attribs_update_simple(bld, gallivm, quad_start_index, 0, 1);
827 }
828 else {
829 attribs_update(bld, gallivm, quad_start_index, 0, 1);
830 }
831 }
832
833