1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
16 * of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 **************************************************************************/
27
28 /**
29 * @file
30 * Texture sampling -- SoA.
31 *
32 * @author Jose Fonseca <jfonseca@vmware.com>
33 * @author Brian Paul <brianp@vmware.com>
34 */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/compiler.h"
40 #include "util/u_debug.h"
41 #include "util/u_dump.h"
42 #include "util/u_memory.h"
43 #include "util/u_math.h"
44 #include "util/format/u_format.h"
45 #include "util/u_cpu_detect.h"
46 #include "util/format_rgb9e5.h"
47 #include "lp_bld_debug.h"
48 #include "lp_bld_type.h"
49 #include "lp_bld_const.h"
50 #include "lp_bld_conv.h"
51 #include "lp_bld_arit.h"
52 #include "lp_bld_bitarit.h"
53 #include "lp_bld_logic.h"
54 #include "lp_bld_printf.h"
55 #include "lp_bld_swizzle.h"
56 #include "lp_bld_flow.h"
57 #include "lp_bld_gather.h"
58 #include "lp_bld_format.h"
59 #include "lp_bld_sample.h"
60 #include "lp_bld_sample_aos.h"
61 #include "lp_bld_struct.h"
62 #include "lp_bld_quad.h"
63 #include "lp_bld_pack.h"
64 #include "lp_bld_intr.h"
65 #include "lp_bld_misc.h"
66
67
68 /**
69 * Generate code to fetch a texel from a texture at int coords (x, y, z).
70 * The computation depends on whether the texture is 1D, 2D or 3D.
71 * The result, texel, will be float vectors:
72 * texel[0] = red values
73 * texel[1] = green values
74 * texel[2] = blue values
75 * texel[3] = alpha values
76 */
static void
lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                          LLVMValueRef width,
                          LLVMValueRef height,
                          LLVMValueRef depth,
                          LLVMValueRef x,
                          LLVMValueRef y,
                          LLVMValueRef z,
                          LLVMValueRef y_stride,
                          LLVMValueRef z_stride,
                          LLVMValueRef data_ptr,
                          LLVMValueRef mipoffsets,
                          LLVMValueRef texel_out[4])
{
   const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
   const unsigned dims = bld->dims;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef i, j;
   LLVMValueRef use_border = NULL;

   /*
    * Build a per-lane mask of coords which must yield the border color:
    * use_border = x < 0 || x >= width || y < 0 || y >= height || ...
    * Only wrap modes that can actually produce out-of-range coords
    * (per lp_sampler_wrap_mode_uses_border_color) contribute to the mask.
    */
   if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
      use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
   }

   if (dims >= 2 &&
       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
      if (use_border) {
         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
      }
      else {
         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
      }
   }

   if (dims == 3 &&
       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
      if (use_border) {
         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
      }
      else {
         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
      }
   }

   /* convert x,y,z coords to linear offset from start of texture, in bytes */
   lp_build_sample_offset(&bld->int_coord_bld,
                          bld->format_desc,
                          x, y, z, y_stride, z_stride,
                          &offset, &i, &j);
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   if (use_border) {
      /* If we can sample the border color, it means that texcoords may
       * lie outside the bounds of the texture image.  We need to do
       * something to prevent reading out of bounds and causing a segfault.
       *
       * Simply AND the texture coords with !use_border.  This will cause
       * coords which are out of bounds to become zero.  Zero's guaranteed
       * to be inside the texture image.
       */
      offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
   }

   /* fetch the texels as float SoA vectors (r/g/b/a) */
   lp_build_fetch_rgba_soa(bld->gallivm,
                           bld->format_desc,
                           bld->texel_type, TRUE,
                           data_ptr, offset,
                           i, j,
                           bld->cache,
                           texel_out);

   /*
    * Note: if we find an app which frequently samples the texture border
    * we might want to implement a true conditional here to avoid sampling
    * the texture whenever possible (since that's quite a bit of code).
    * Ex:
    *   if (use_border) {
    *      texel = border_color;
    *   }
    *   else {
    *      texel = sample_texture(coord);
    *   }
    * As it is now, we always sample the texture, then selectively replace
    * the texel color results with the border color.
    */

   if (use_border) {
      /* select texel color or border color depending on use_border. */
      const struct util_format_description *format_desc = bld->format_desc;
      int chan;
      struct lp_type border_type = bld->texel_type;
      border_type.length = 4;
      /*
       * Only replace channels which are actually present. The others should
       * get optimized away eventually by sampler_view swizzle anyway but it's
       * easier too.
       */
      for (chan = 0; chan < 4; chan++) {
         unsigned chan_s;
         /* reverse-map channel... */
         if (util_format_has_stencil(format_desc)) {
            /* stencil formats: only the first output channel carries data */
            if (chan == 0)
               chan_s = 0;
            else
               break;
         }
         else {
            /* find which swizzle slot maps to this output channel;
             * chan_s == 4 (no match) means the channel is absent */
            for (chan_s = 0; chan_s < 4; chan_s++) {
               if (chan_s == format_desc->swizzle[chan]) {
                  break;
               }
            }
         }
         if (chan_s <= 3) {
            /* use the already clamped color */
            LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
            LLVMValueRef border_chan;

            /* broadcast one channel of the 4-wide clamped border color
             * across the full texel vector */
            border_chan = lp_build_extract_broadcast(bld->gallivm,
                                                     border_type,
                                                     bld->texel_type,
                                                     bld->border_color_clamped,
                                                     idx);
            texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
                                              border_chan, texel_out[chan]);
         }
      }
   }
}
228
229
230 /**
231 * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
232 * (Note that with pot sizes could do this much more easily post-scale
233 * with some bit arithmetic.)
234 */
235 static LLVMValueRef
lp_build_coord_mirror(struct lp_build_sample_context * bld,LLVMValueRef coord,boolean posOnly)236 lp_build_coord_mirror(struct lp_build_sample_context *bld,
237 LLVMValueRef coord, boolean posOnly)
238 {
239 struct lp_build_context *coord_bld = &bld->coord_bld;
240 LLVMValueRef fract;
241 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
242
243 /*
244 * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
245 * it all works out. (The result is in range [-1, 1.0], negative if
246 * the coord is in the "odd" section, otherwise positive.)
247 */
248
249 coord = lp_build_mul(coord_bld, coord, half);
250 fract = lp_build_round(coord_bld, coord);
251 fract = lp_build_sub(coord_bld, coord, fract);
252 coord = lp_build_add(coord_bld, fract, fract);
253
254 if (posOnly) {
255 /*
256 * Theoretically it's not quite 100% accurate because the spec says
257 * that ultimately a scaled coord of -x.0 should map to int coord
258 * -x + 1 with mirroring, not -x (this does not matter for bilinear
259 * filtering).
260 */
261 coord = lp_build_abs(coord_bld, coord);
262 /* kill off NaNs */
263 /* XXX: not safe without arch rounding, fract can be anything. */
264 coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
265 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
266 }
267
268 return coord;
269 }
270
271
272 /**
273 * Helper to compute the first coord and the weight for
274 * linear wrap repeat npot textures
275 */
276 void
lp_build_coord_repeat_npot_linear(struct lp_build_sample_context * bld,LLVMValueRef coord_f,LLVMValueRef length_i,LLVMValueRef length_f,LLVMValueRef * coord0_i,LLVMValueRef * weight_f)277 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
278 LLVMValueRef coord_f,
279 LLVMValueRef length_i,
280 LLVMValueRef length_f,
281 LLVMValueRef *coord0_i,
282 LLVMValueRef *weight_f)
283 {
284 struct lp_build_context *coord_bld = &bld->coord_bld;
285 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
286 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
287 LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
288 int_coord_bld->one);
289 LLVMValueRef mask;
290 /* wrap with normalized floats is just fract */
291 coord_f = lp_build_fract(coord_bld, coord_f);
292 /* mul by size and subtract 0.5 */
293 coord_f = lp_build_mul(coord_bld, coord_f, length_f);
294 coord_f = lp_build_sub(coord_bld, coord_f, half);
295 /*
296 * we avoided the 0.5/length division before the repeat wrap,
297 * now need to fix up edge cases with selects
298 */
299 /*
300 * Note we do a float (unordered) compare so we can eliminate NaNs.
301 * (Otherwise would need fract_safe above).
302 */
303 mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
304 PIPE_FUNC_LESS, coord_f, coord_bld->zero);
305
306 /* convert to int, compute lerp weight */
307 lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
308 *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
309 }
310
311
312 /**
313 * Build LLVM code for texture wrap mode for linear filtering.
314 * \param x0_out returns first integer texcoord
315 * \param x1_out returns second integer texcoord
316 * \param weight_out returns linear interpolation weight
317 */
static void
lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
                            boolean is_gather,
                            LLVMValueRef coord,
                            LLVMValueRef length,
                            LLVMValueRef length_f,
                            LLVMValueRef offset,
                            boolean is_pot,
                            unsigned wrap_mode,
                            LLVMValueRef *x0_out,
                            LLVMValueRef *x1_out,
                            LLVMValueRef *weight_out)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
   LLVMValueRef coord0, coord1, weight;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* mul by size and subtract 0.5 */
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_sub(coord_bld, coord, half);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         /* repeat wrap: pot size lets us just mask with size - 1 */
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
         coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
      }
      else {
         LLVMValueRef mask;
         if (offset) {
            /* npot: apply the texel offset in normalized coord space */
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         lp_build_coord_repeat_npot_linear(bld, coord,
                                           length, length_f,
                                           &coord0, &weight);
         /* coord1 wraps to 0 exactly when coord0 == length - 1 */
         mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                                 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
         coord1 = LLVMBuildAnd(builder,
                               lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
                               mask, "");
      }
      break;

   case PIPE_TEX_WRAP_CLAMP:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }

      /*
       * clamp to [0, length]
       *
       * Unlike some other wrap modes, this should be correct for gather
       * too. GL_CLAMP explicitly does this clamp on the coord prior to
       * actual wrapping (which is per sample).
       */
      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);

      coord = lp_build_sub(coord_bld, coord, half);

      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      {
         /* after the clamp coords are non-negative, so floor can use an
          * unsigned-typed context */
         struct lp_build_context abs_coord_bld = bld->coord_bld;
         abs_coord_bld.type.sign = FALSE;

         if (bld->static_sampler_state->normalized_coords) {
            /* mul by tex size */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }

         /* clamp to length max */
         coord = lp_build_min_ext(coord_bld, coord, length_f,
                                  GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
         if (!is_gather) {
            /* subtract 0.5 */
            coord = lp_build_sub(coord_bld, coord, half);
            /* clamp to [0, length - 0.5] */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
            /* convert to int, compute lerp weight */
            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         } else {
            /*
             * The non-gather path will end up with coords 0, 1 if coord was
             * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
             * really matter what the second coord is). But for gather, we
             * really need to end up with coords 0, 0.
             */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
            coord0 = lp_build_sub(coord_bld, coord, half);
            coord1 = lp_build_add(coord_bld, coord, half);
            /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5] */
            coord0 = lp_build_itrunc(coord_bld, coord0);
            coord1 = lp_build_itrunc(coord_bld, coord1);
            /* gather returns all four texels, no filtering weight needed */
            weight = coord_bld->undef;
         }
         /* coord1 = min(coord1, length-1) */
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
         break;
      }

   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /*
       * We don't need any clamp. Technically, for very large (pos or neg)
       * (or infinite) values, clamp against [-length, length] would be
       * correct, but we don't need to guarantee any specific
       * result for such coords (the ifloor will be undefined, but for modes
       * requiring border all resulting coords are safe).
       */
      coord = lp_build_sub(coord_bld, coord, half);
      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      if (offset) {
         /* apply offset in normalized space, before mirroring */
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length_f);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      if (!is_gather) {
         /* compute mirror function */
         coord = lp_build_coord_mirror(bld, coord, TRUE);

         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_sub(coord_bld, coord, half);

         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);

         /* coord0 = max(coord0, 0) */
         coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
         /* coord1 = min(coord1, length-1) */
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
      } else {
         /*
          * This is pretty reasonable in the end, all what the tests care
          * about is nasty edge cases (scaled coords x.5, so the individual
          * coords are actually integers, which is REALLY tricky to get right
          * due to this working differently both for negative numbers as well
          * as for even/odd cases). But with enough magic it's not too complex
          * after all.
          * Maybe should try a bit arithmetic one though for POT textures...
          */
         LLVMValueRef isNeg;
         /*
          * Wrapping just once still works, even though it means we can
          * get "wrong" sign due to performing mirror in the middle of the
          * two coords (because this can only happen very near the odd/even
          * edges, so both coords will actually end up as 0 or length - 1
          * in the end).
          * For GL4 gather with per-sample offsets we'd need to the mirroring
          * per coord too.
          */
         coord = lp_build_coord_mirror(bld, coord, FALSE);
         coord = lp_build_mul(coord_bld, coord, length_f);

         /*
          * NaNs should be safe here, we'll do away with them with
          * the ones' complement plus min.
          */
         coord0 = lp_build_sub(coord_bld, coord, half);
         coord0 = lp_build_ifloor(coord_bld, coord0);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         /* ones complement for neg numbers (mirror(negX) = X - 1) */
         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
                              coord0, int_coord_bld->zero);
         coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
                              coord1, int_coord_bld->zero);
         coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
         coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);

         weight = coord_bld->undef;
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /*
       * XXX: probably not correct for gather, albeit I'm not
       * entirely sure as it's poorly specified. The wrapping looks
       * correct according to the spec which is against gl 1.2.1,
       * however negative values will be swapped - gl re-specified
       * wrapping with newer versions (no more pre-clamp except with
       * GL_CLAMP).
       */
      coord = lp_build_abs(coord_bld, coord);

      /* clamp to [0, length] */
      coord = lp_build_min_ext(coord_bld, coord, length_f,
                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

      coord = lp_build_sub(coord_bld, coord, half);

      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      {
         /* coords are non-negative after the abs, use unsigned floor */
         struct lp_build_context abs_coord_bld = bld->coord_bld;
         abs_coord_bld.type.sign = FALSE;

         if (bld->static_sampler_state->normalized_coords) {
            /* scale coord to length */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         if (!is_gather) {
            coord = lp_build_abs(coord_bld, coord);

            /* clamp to length max */
            coord = lp_build_min_ext(coord_bld, coord, length_f,
                                     GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
            /* subtract 0.5 */
            coord = lp_build_sub(coord_bld, coord, half);
            /* clamp to [0, length - 0.5] */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);

            /* convert to int, compute lerp weight */
            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            /* coord1 = min(coord1, length-1) */
            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
         } else {
            /*
             * The non-gather path will swap coord0/1 if coord was negative,
             * which is ok for filtering since the filter weight matches
             * accordingly. Also, if coord is close to zero, coord0/1 will
             * be 0 and 1, instead of 0 and 0 (again ok due to filter
             * weight being 0.0). Both issues need to be fixed for gather.
             */
            LLVMValueRef isNeg;

            /*
             * Actually wanted to cheat here and use:
             * coord1 = lp_build_iround(coord_bld, coord);
             * but it's not good enough for some tests (even piglit
             * textureGather is set up in a way so the coords area always
             * .5, that is right at the crossover points).
             * So do ordinary sub/floor, then do ones' complement
             * for negative numbers.
             * (Note can't just do sub|add/abs/itrunc per coord neither -
             * because the spec demands that mirror(3.0) = 3 but
             * mirror(-3.0) = 2.)
             */
            coord = lp_build_sub(coord_bld, coord, half);
            coord0 = lp_build_ifloor(coord_bld, coord);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
                                 int_coord_bld->zero);
            coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
            coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);

            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
                                 int_coord_bld->zero);
            coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);

            weight = coord_bld->undef;
         }
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      {
         if (bld->static_sampler_state->normalized_coords) {
            /* scale coord to length */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /*
          * XXX: probably not correct for gather due to swapped
          * order if coord is negative (same rationale as for
          * MIRROR_CLAMP).
          */
         coord = lp_build_abs(coord_bld, coord);

         /*
          * We don't need any clamp. Technically, for very large
          * (or infinite) values, clamp against length would be
          * correct, but we don't need to guarantee any specific
          * result for such coords (the ifloor will be undefined, but
          * for modes requiring border all resulting coords are safe).
          */
         coord = lp_build_sub(coord_bld, coord, half);

         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      }
      break;

   default:
      assert(0);
      coord0 = NULL;
      coord1 = NULL;
      weight = NULL;
   }

   *x0_out = coord0;
   *x1_out = coord1;
   *weight_out = weight;
}
674
675
676 /**
677 * Build LLVM code for texture wrap mode for nearest filtering.
678 * \param coord the incoming texcoord (nominally in [0,1])
679 * \param length the texture size along one dimension, as int vector
680 * \param length_f the texture size along one dimension, as float vector
681 * \param offset texel offset along one dimension (as int vector)
682 * \param is_pot if TRUE, length is a power of two
683 * \param wrap_mode one of PIPE_TEX_WRAP_x
684 */
static LLVMValueRef
lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
                             LLVMValueRef coord,
                             LLVMValueRef length,
                             LLVMValueRef length_f,
                             LLVMValueRef offset,
                             boolean is_pot,
                             unsigned wrap_mode)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
   LLVMValueRef icoord;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord = lp_build_mul(coord_bld, coord, length_f);
         icoord = lp_build_ifloor(coord_bld, coord);
         if (offset) {
            icoord = lp_build_add(int_coord_bld, icoord, offset);
         }
         /* pot size: wrap is just a mask with size - 1 */
         icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
      }
      else {
         if (offset) {
            /* npot: apply the texel offset in normalized coord space */
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* take fraction, unnormalize */
         coord = lp_build_fract_safe(coord_bld, coord);
         coord = lp_build_mul(coord_bld, coord, length_f);
         /* coord is in [0, length) here, itrunc == ifloor */
         icoord = lp_build_itrunc(coord_bld, coord);
      }
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }

      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* floor */
      /* use itrunc instead since we clamp to 0 anyway */
      icoord = lp_build_itrunc(coord_bld, coord);

      /* clamp to [0, length - 1]. */
      icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
                              length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      /* no clamp necessary, border masking will handle this */
      icoord = lp_build_ifloor(coord_bld, coord);
      if (offset) {
         icoord = lp_build_add(int_coord_bld, icoord, offset);
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      if (offset) {
         /* apply offset in normalized space, before mirroring */
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length_f);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* compute mirror function */
      coord = lp_build_coord_mirror(bld, coord, TRUE);

      /* scale coord to length */
      assert(bld->static_sampler_state->normalized_coords);
      coord = lp_build_mul(coord_bld, coord, length_f);

      /* itrunc == ifloor here */
      icoord = lp_build_itrunc(coord_bld, coord);

      /* clamp to [0, length - 1] */
      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* mirror by folding negative coords onto positive */
      coord = lp_build_abs(coord_bld, coord);

      /* itrunc == ifloor here */
      icoord = lp_build_itrunc(coord_bld, coord);
      /*
       * Use unsigned min due to possible undef values (NaNs, overflow)
       */
      {
         struct lp_build_context abs_coord_bld = *int_coord_bld;
         abs_coord_bld.type.sign = FALSE;
         /* clamp to [0, length - 1] */
         icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      coord = lp_build_abs(coord_bld, coord);

      /* itrunc == ifloor here */
      /* out-of-range coords handled by border masking downstream */
      icoord = lp_build_itrunc(coord_bld, coord);
      break;

   default:
      assert(0);
      icoord = NULL;
   }

   return icoord;
}
822
823
824 /**
825 * Do shadow test/comparison.
826 * \param p shadow ref value
827 * \param texel the texel to compare against
828 */
829 static LLVMValueRef
lp_build_sample_comparefunc(struct lp_build_sample_context * bld,LLVMValueRef p,LLVMValueRef texel)830 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
831 LLVMValueRef p,
832 LLVMValueRef texel)
833 {
834 struct lp_build_context *texel_bld = &bld->texel_bld;
835 LLVMValueRef res;
836
837 if (0) {
838 //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
839 lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
840 }
841
842 /* result = (p FUNC texel) ? 1 : 0 */
843 /*
844 * honor d3d10 floating point rules here, which state that comparisons
845 * are ordered except NOT_EQUAL which is unordered.
846 */
847 if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
848 res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
849 p, texel);
850 }
851 else {
852 res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
853 p, texel);
854 }
855 return res;
856 }
857
858
859 /**
860 * Generate code to sample a mipmap level with nearest filtering.
861 * If sampling a cube texture, r = cube face in [0,5].
862 */
863 static void
lp_build_sample_image_nearest(struct lp_build_sample_context * bld,LLVMValueRef size,LLVMValueRef row_stride_vec,LLVMValueRef img_stride_vec,LLVMValueRef data_ptr,LLVMValueRef mipoffsets,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef colors_out[4])864 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
865 LLVMValueRef size,
866 LLVMValueRef row_stride_vec,
867 LLVMValueRef img_stride_vec,
868 LLVMValueRef data_ptr,
869 LLVMValueRef mipoffsets,
870 const LLVMValueRef *coords,
871 const LLVMValueRef *offsets,
872 LLVMValueRef colors_out[4])
873 {
874 const unsigned dims = bld->dims;
875 LLVMValueRef width_vec;
876 LLVMValueRef height_vec;
877 LLVMValueRef depth_vec;
878 LLVMValueRef flt_size;
879 LLVMValueRef flt_width_vec;
880 LLVMValueRef flt_height_vec;
881 LLVMValueRef flt_depth_vec;
882 LLVMValueRef x, y = NULL, z = NULL;
883
884 lp_build_extract_image_sizes(bld,
885 &bld->int_size_bld,
886 bld->int_coord_type,
887 size,
888 &width_vec, &height_vec, &depth_vec);
889
890 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
891
892 lp_build_extract_image_sizes(bld,
893 &bld->float_size_bld,
894 bld->coord_type,
895 flt_size,
896 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
897
898 /*
899 * Compute integer texcoords.
900 */
901 x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
902 flt_width_vec, offsets[0],
903 bld->static_texture_state->pot_width,
904 bld->static_sampler_state->wrap_s);
905 lp_build_name(x, "tex.x.wrapped");
906
907 if (dims >= 2) {
908 y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
909 flt_height_vec, offsets[1],
910 bld->static_texture_state->pot_height,
911 bld->static_sampler_state->wrap_t);
912 lp_build_name(y, "tex.y.wrapped");
913
914 if (dims == 3) {
915 z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
916 flt_depth_vec, offsets[2],
917 bld->static_texture_state->pot_depth,
918 bld->static_sampler_state->wrap_r);
919 lp_build_name(z, "tex.z.wrapped");
920 }
921 }
922 if (has_layer_coord(bld->static_texture_state->target)) {
923 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
924 /* add cube layer to face */
925 z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
926 }
927 else {
928 z = coords[2];
929 }
930 lp_build_name(z, "tex.z.layer");
931 }
932
933 /*
934 * Get texture colors.
935 */
936 lp_build_sample_texel_soa(bld,
937 width_vec, height_vec, depth_vec,
938 x, y, z,
939 row_stride_vec, img_stride_vec,
940 data_ptr, mipoffsets, colors_out);
941
942 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
943 LLVMValueRef cmpval;
944 cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
945 /* this is really just a AND 1.0, cmpval but llvm is clever enough */
946 colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
947 bld->texel_bld.one, bld->texel_bld.zero);
948 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
949 }
950
951 }
952
953
954 /**
955 * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
956 */
957 static LLVMValueRef
lp_build_masklerp(struct lp_build_context * bld,LLVMValueRef weight,LLVMValueRef mask0,LLVMValueRef mask1)958 lp_build_masklerp(struct lp_build_context *bld,
959 LLVMValueRef weight,
960 LLVMValueRef mask0,
961 LLVMValueRef mask1)
962 {
963 struct gallivm_state *gallivm = bld->gallivm;
964 LLVMBuilderRef builder = gallivm->builder;
965 LLVMValueRef weight2;
966
967 weight2 = lp_build_sub(bld, bld->one, weight);
968 weight = LLVMBuildBitCast(builder, weight,
969 lp_build_int_vec_type(gallivm, bld->type), "");
970 weight2 = LLVMBuildBitCast(builder, weight2,
971 lp_build_int_vec_type(gallivm, bld->type), "");
972 weight = LLVMBuildAnd(builder, weight, mask1, "");
973 weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
974 weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
975 weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
976 return lp_build_add(bld, weight, weight2);
977 }
978
979 /**
980 * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
981 */
982 static LLVMValueRef
lp_build_masklerp2d(struct lp_build_context * bld,LLVMValueRef weight0,LLVMValueRef weight1,LLVMValueRef mask00,LLVMValueRef mask01,LLVMValueRef mask10,LLVMValueRef mask11)983 lp_build_masklerp2d(struct lp_build_context *bld,
984 LLVMValueRef weight0,
985 LLVMValueRef weight1,
986 LLVMValueRef mask00,
987 LLVMValueRef mask01,
988 LLVMValueRef mask10,
989 LLVMValueRef mask11)
990 {
991 LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
992 LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
993 return lp_build_lerp(bld, weight1, val0, val1, 0);
994 }
995
996 /*
997 * this is a bit excessive code for something OpenGL just recommends
998 * but does not require.
999 */
1000 #define ACCURATE_CUBE_CORNERS 1
1001
1002 /**
1003 * Generate code to sample a mipmap level with linear filtering.
1004 * If sampling a cube texture, r = cube face in [0,5].
1005 * If linear_mask is present, only pixels having their mask set
1006 * will receive linear filtering, the rest will use nearest.
1007 */
1008 static void
lp_build_sample_image_linear(struct lp_build_sample_context * bld,boolean is_gather,LLVMValueRef size,LLVMValueRef linear_mask,LLVMValueRef row_stride_vec,LLVMValueRef img_stride_vec,LLVMValueRef data_ptr,LLVMValueRef mipoffsets,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef colors_out[4])1009 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1010 boolean is_gather,
1011 LLVMValueRef size,
1012 LLVMValueRef linear_mask,
1013 LLVMValueRef row_stride_vec,
1014 LLVMValueRef img_stride_vec,
1015 LLVMValueRef data_ptr,
1016 LLVMValueRef mipoffsets,
1017 const LLVMValueRef *coords,
1018 const LLVMValueRef *offsets,
1019 LLVMValueRef colors_out[4])
1020 {
1021 LLVMBuilderRef builder = bld->gallivm->builder;
1022 struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1023 struct lp_build_context *coord_bld = &bld->coord_bld;
1024 struct lp_build_context *texel_bld = &bld->texel_bld;
1025 const unsigned dims = bld->dims;
1026 LLVMValueRef width_vec;
1027 LLVMValueRef height_vec;
1028 LLVMValueRef depth_vec;
1029 LLVMValueRef flt_size;
1030 LLVMValueRef flt_width_vec;
1031 LLVMValueRef flt_height_vec;
1032 LLVMValueRef flt_depth_vec;
1033 LLVMValueRef fall_off[4] = { 0 }, have_corners = NULL;
1034 LLVMValueRef z1 = NULL;
1035 LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1036 LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1037 LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1038 LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1039 LLVMValueRef xs[4], ys[4], zs[4];
1040 LLVMValueRef neighbors[2][2][4];
1041 int chan, texel_index;
1042 boolean seamless_cube_filter, accurate_cube_corners;
1043 unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1044
1045 if (is_gather) {
1046 switch (bld->gather_comp) {
1047 case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
1048 case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
1049 case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
1050 case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
1051 default:
1052 break;
1053 }
1054 }
1055
1056 seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1057 bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1058 bld->static_sampler_state->seamless_cube_map;
1059
1060 /*
1061 * Disable accurate cube corners for integer textures, which should only
1062 * get here in the gather path.
1063 */
1064 accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
1065 !util_format_is_pure_integer(bld->static_texture_state->format);
1066
1067 lp_build_extract_image_sizes(bld,
1068 &bld->int_size_bld,
1069 bld->int_coord_type,
1070 size,
1071 &width_vec, &height_vec, &depth_vec);
1072
1073 flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1074
1075 lp_build_extract_image_sizes(bld,
1076 &bld->float_size_bld,
1077 bld->coord_type,
1078 flt_size,
1079 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1080
1081 /*
1082 * Compute integer texcoords.
1083 */
1084
1085 if (!seamless_cube_filter) {
1086 lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1087 flt_width_vec, offsets[0],
1088 bld->static_texture_state->pot_width,
1089 bld->static_sampler_state->wrap_s,
1090 &x00, &x01, &s_fpart);
1091 lp_build_name(x00, "tex.x0.wrapped");
1092 lp_build_name(x01, "tex.x1.wrapped");
1093 x10 = x00;
1094 x11 = x01;
1095
1096 if (dims >= 2) {
1097 lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1098 flt_height_vec, offsets[1],
1099 bld->static_texture_state->pot_height,
1100 bld->static_sampler_state->wrap_t,
1101 &y00, &y10, &t_fpart);
1102 lp_build_name(y00, "tex.y0.wrapped");
1103 lp_build_name(y10, "tex.y1.wrapped");
1104 y01 = y00;
1105 y11 = y10;
1106
1107 if (dims == 3) {
1108 lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1109 flt_depth_vec, offsets[2],
1110 bld->static_texture_state->pot_depth,
1111 bld->static_sampler_state->wrap_r,
1112 &z00, &z1, &r_fpart);
1113 z01 = z10 = z11 = z00;
1114 lp_build_name(z00, "tex.z0.wrapped");
1115 lp_build_name(z1, "tex.z1.wrapped");
1116 }
1117 }
1118 if (has_layer_coord(bld->static_texture_state->target)) {
1119 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1120 /* add cube layer to face */
1121 z00 = z01 = z10 = z11 = z1 =
1122 lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1123 }
1124 else {
1125 z00 = z01 = z10 = z11 = z1 = coords[2]; /* cube face or layer */
1126 }
1127 lp_build_name(z00, "tex.z0.layer");
1128 lp_build_name(z1, "tex.z1.layer");
1129 }
1130 }
1131 else {
1132 struct lp_build_if_state edge_if;
1133 LLVMTypeRef int1t;
1134 LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1135 LLVMValueRef coord0, coord1, have_edge, have_corner;
1136 LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1137 LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1138 LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1139 LLVMValueRef face = coords[2];
1140 LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1141 LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1142 /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1143 height_vec = width_vec;
1144 flt_height_vec = flt_width_vec;
1145
1146 /* XXX the overflow logic is actually sort of duplicated with trilinear,
1147 * since an overflow in one mip should also have a corresponding overflow
1148 * in another.
1149 */
1150 /* should always have normalized coords, and offsets are undefined */
1151 assert(bld->static_sampler_state->normalized_coords);
1152 /*
1153 * The coords should all be between [0,1] however we can have NaNs,
1154 * which will wreak havoc. In particular the y1_clamped value below
1155 * can be -INT_MAX (on x86) and be propagated right through (probably
1156 * other values might be bogus in the end too).
1157 * So kill off the NaNs here.
1158 */
1159 coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1160 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1161 coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1162 /* instead of clamp, build mask if overflowed */
1163 coord0 = lp_build_sub(coord_bld, coord0, half);
1164 /* convert to int, compute lerp weight */
1165 /* not ideal with AVX (and no AVX2) */
1166 lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1167 x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1168 coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1169 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1170 coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1171 coord1 = lp_build_sub(coord_bld, coord1, half);
1172 lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1173 y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1174
1175 fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1176 fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1177 fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1178 fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1179
1180 fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1181 fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1182 have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1183 have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1184
1185 /* needed for accurate corner filtering branch later, rely on 0 init */
1186 int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1187 have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1188
1189 for (texel_index = 0; texel_index < 4; texel_index++) {
1190 xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1191 ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1192 zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1193 }
1194
1195 lp_build_if(&edge_if, bld->gallivm, have_edge);
1196
1197 have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1198 have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1199 LLVMBuildStore(builder, have_corner, have_corners);
1200
1201 /*
1202 * Need to feed clamped values here for cheap corner handling,
1203 * but only for y coord (as when falling off both edges we only
1204 * fall off the x one) - this should be sufficient.
1205 */
1206 y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1207 y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1208
1209 /*
1210 * Get all possible new coords.
1211 */
1212 lp_build_cube_new_coords(ivec_bld, face,
1213 x0, x1, y0_clamped, y1_clamped,
1214 length_minus_one,
1215 new_faces, new_xcoords, new_ycoords);
1216
1217 /* handle fall off x-, x+ direction */
1218 /* determine new coords, face (not both fall_off vars can be true at same time) */
1219 x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1220 y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1221 x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1222 y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1223 x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1224 y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1225 x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1226 y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1227
1228 z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1229 z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1230
1231 /* handle fall off y-, y+ direction */
1232 /*
1233 * Cheap corner logic: just hack up things so a texel doesn't fall
1234 * off both sides (which means filter weights will be wrong but we'll only
1235 * use valid texels in the filter).
1236 * This means however (y) coords must additionally be clamped (see above).
1237 * This corner handling should be fully OpenGL (but not d3d10) compliant.
1238 */
1239 fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1240 fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1241 fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1242 fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1243
1244 x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1245 y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1246 x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1247 y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1248 x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1249 y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1250 x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1251 y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1252
1253 z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1254 z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1255 z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1256 z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1257
1258 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1259 /* now can add cube layer to face (per sample) */
1260 z00 = lp_build_add(ivec_bld, z00, coords[3]);
1261 z01 = lp_build_add(ivec_bld, z01, coords[3]);
1262 z10 = lp_build_add(ivec_bld, z10, coords[3]);
1263 z11 = lp_build_add(ivec_bld, z11, coords[3]);
1264 }
1265
1266 LLVMBuildStore(builder, x00, xs[0]);
1267 LLVMBuildStore(builder, x01, xs[1]);
1268 LLVMBuildStore(builder, x10, xs[2]);
1269 LLVMBuildStore(builder, x11, xs[3]);
1270 LLVMBuildStore(builder, y00, ys[0]);
1271 LLVMBuildStore(builder, y01, ys[1]);
1272 LLVMBuildStore(builder, y10, ys[2]);
1273 LLVMBuildStore(builder, y11, ys[3]);
1274 LLVMBuildStore(builder, z00, zs[0]);
1275 LLVMBuildStore(builder, z01, zs[1]);
1276 LLVMBuildStore(builder, z10, zs[2]);
1277 LLVMBuildStore(builder, z11, zs[3]);
1278
1279 lp_build_else(&edge_if);
1280
1281 LLVMBuildStore(builder, x0, xs[0]);
1282 LLVMBuildStore(builder, x1, xs[1]);
1283 LLVMBuildStore(builder, x0, xs[2]);
1284 LLVMBuildStore(builder, x1, xs[3]);
1285 LLVMBuildStore(builder, y0, ys[0]);
1286 LLVMBuildStore(builder, y0, ys[1]);
1287 LLVMBuildStore(builder, y1, ys[2]);
1288 LLVMBuildStore(builder, y1, ys[3]);
1289 if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1290 LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1291 LLVMBuildStore(builder, cube_layer, zs[0]);
1292 LLVMBuildStore(builder, cube_layer, zs[1]);
1293 LLVMBuildStore(builder, cube_layer, zs[2]);
1294 LLVMBuildStore(builder, cube_layer, zs[3]);
1295 }
1296 else {
1297 LLVMBuildStore(builder, face, zs[0]);
1298 LLVMBuildStore(builder, face, zs[1]);
1299 LLVMBuildStore(builder, face, zs[2]);
1300 LLVMBuildStore(builder, face, zs[3]);
1301 }
1302
1303 lp_build_endif(&edge_if);
1304
1305 x00 = LLVMBuildLoad(builder, xs[0], "");
1306 x01 = LLVMBuildLoad(builder, xs[1], "");
1307 x10 = LLVMBuildLoad(builder, xs[2], "");
1308 x11 = LLVMBuildLoad(builder, xs[3], "");
1309 y00 = LLVMBuildLoad(builder, ys[0], "");
1310 y01 = LLVMBuildLoad(builder, ys[1], "");
1311 y10 = LLVMBuildLoad(builder, ys[2], "");
1312 y11 = LLVMBuildLoad(builder, ys[3], "");
1313 z00 = LLVMBuildLoad(builder, zs[0], "");
1314 z01 = LLVMBuildLoad(builder, zs[1], "");
1315 z10 = LLVMBuildLoad(builder, zs[2], "");
1316 z11 = LLVMBuildLoad(builder, zs[3], "");
1317 }
1318
1319 if (linear_mask) {
1320 /*
1321 * Whack filter weights into place. Whatever texel had more weight is
1322 * the one which should have been selected by nearest filtering hence
1323 * just use 100% weight for it.
1324 */
1325 struct lp_build_context *c_bld = &bld->coord_bld;
1326 LLVMValueRef w1_mask, w1_weight;
1327 LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1328
1329 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1330 /* this select is really just a "and" */
1331 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1332 s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1333 if (dims >= 2) {
1334 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1335 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1336 t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1337 if (dims == 3) {
1338 w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1339 w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1340 r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1341 }
1342 }
1343 }
1344
1345 /*
1346 * Get texture colors.
1347 */
1348 /* get x0/x1 texels */
1349 lp_build_sample_texel_soa(bld,
1350 width_vec, height_vec, depth_vec,
1351 x00, y00, z00,
1352 row_stride_vec, img_stride_vec,
1353 data_ptr, mipoffsets, neighbors[0][0]);
1354 lp_build_sample_texel_soa(bld,
1355 width_vec, height_vec, depth_vec,
1356 x01, y01, z01,
1357 row_stride_vec, img_stride_vec,
1358 data_ptr, mipoffsets, neighbors[0][1]);
1359
1360 if (dims == 1) {
1361 assert(!is_gather);
1362 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1363 lp_build_reduce_filter(texel_bld,
1364 bld->static_sampler_state->reduction_mode,
1365 0,
1366 4,
1367 s_fpart,
1368 neighbors[0][0],
1369 neighbors[0][1],
1370 colors_out);
1371 }
1372 else {
1373 LLVMValueRef cmpval0, cmpval1;
1374 cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1375 cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1376 /* simplified lerp, AND mask with weight and add */
1377 colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1378 cmpval0, cmpval1);
1379 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1380 }
1381 }
1382 else {
1383 /* 2D/3D texture */
1384 struct lp_build_if_state corner_if;
1385 LLVMValueRef colors0[4], colorss[4] = { 0 };
1386
1387 /* get x0/x1 texels at y1 */
1388 lp_build_sample_texel_soa(bld,
1389 width_vec, height_vec, depth_vec,
1390 x10, y10, z10,
1391 row_stride_vec, img_stride_vec,
1392 data_ptr, mipoffsets, neighbors[1][0]);
1393 lp_build_sample_texel_soa(bld,
1394 width_vec, height_vec, depth_vec,
1395 x11, y11, z11,
1396 row_stride_vec, img_stride_vec,
1397 data_ptr, mipoffsets, neighbors[1][1]);
1398
1399 /*
1400 * To avoid having to duplicate linear_mask / fetch code use
1401 * another branch (with corner condition though edge would work
1402 * as well) here.
1403 */
1404 if (have_corners && accurate_cube_corners &&
1405 bld->static_sampler_state->reduction_mode == PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE) {
1406 LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1407 LLVMValueRef have_corner, one_third;
1408
1409 colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1410 colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1411 colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1412 colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1413
1414 have_corner = LLVMBuildLoad(builder, have_corners, "");
1415
1416 lp_build_if(&corner_if, bld->gallivm, have_corner);
1417
1418 one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1419 1.0f/3.0f);
1420
1421 /* find corner */
1422 c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1423 c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1424 c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1425 c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1426 c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1427 c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1428 c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1429 c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1430
1431 if (!is_gather) {
1432 /*
1433 * we can't use standard 2d lerp as we need per-element weight
1434 * in case of corners, so just calculate bilinear result as
1435 * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1436 * (This is actually less work than using 2d lerp, 7 vs. 9
1437 * instructions, however calculating the weights needs another 6,
1438 * so actually probably not slower than 2d lerp only for 4 channels
1439 * as weights only need to be calculated once - of course fixing
1440 * the weights has additional cost.)
1441 */
1442 LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1443 wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1444 wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1445 w00 = lp_build_mul(coord_bld, wx0, wy0);
1446 w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1447 w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1448 w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1449
1450 /* find corner weight */
1451 c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1452 c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1453 c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1454 c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1455
1456 /*
1457 * add 1/3 of the corner weight to the weight of the 3 other
1458 * samples and null out corner weight.
1459 */
1460 c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1461 w00 = lp_build_add(coord_bld, w00, c_weight);
1462 w00 = lp_build_andnot(coord_bld, w00, c00f);
1463 w01 = lp_build_add(coord_bld, w01, c_weight);
1464 w01 = lp_build_andnot(coord_bld, w01, c01f);
1465 w10 = lp_build_add(coord_bld, w10, c_weight);
1466 w10 = lp_build_andnot(coord_bld, w10, c10f);
1467 w11 = lp_build_add(coord_bld, w11, c_weight);
1468 w11 = lp_build_andnot(coord_bld, w11, c11f);
1469
1470 if (bld->static_sampler_state->compare_mode ==
1471 PIPE_TEX_COMPARE_NONE) {
1472 for (chan = 0; chan < 4; chan++) {
1473 colors0[chan] = lp_build_mul(coord_bld, w00,
1474 neighbors[0][0][chan]);
1475 tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1476 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1477 tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1478 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1479 tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1480 colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1481 }
1482 }
1483 else {
1484 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1485 cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1486 neighbors[0][0][0]);
1487 cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1488 neighbors[0][1][0]);
1489 cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1490 neighbors[1][0][0]);
1491 cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1492 neighbors[1][1][0]);
1493 /*
1494 * inputs to interpolation are just masks so just add
1495 * masked weights together
1496 */
1497 cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1498 coord_bld->vec_type, "");
1499 cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1500 coord_bld->vec_type, "");
1501 cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1502 coord_bld->vec_type, "");
1503 cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1504 coord_bld->vec_type, "");
1505 colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1506 tmp = lp_build_and(coord_bld, w01, cmpval01);
1507 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1508 tmp = lp_build_and(coord_bld, w10, cmpval10);
1509 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1510 tmp = lp_build_and(coord_bld, w11, cmpval11);
1511 colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1512 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1513 }
1514 }
1515 else {
1516 /*
1517 * We don't have any weights to adjust, so instead calculate
1518 * the fourth texel as simply the average of the other 3.
1519 * (This would work for non-gather too, however we'd have
1520 * a boatload more of the select stuff due to there being
1521 * 4 times as many colors as weights.)
1522 */
1523 LLVMValueRef col00, col01, col10, col11;
1524 LLVMValueRef colc, colc0, colc1;
1525 col10 = lp_build_swizzle_soa_channel(texel_bld,
1526 neighbors[1][0], chan_swiz);
1527 col11 = lp_build_swizzle_soa_channel(texel_bld,
1528 neighbors[1][1], chan_swiz);
1529 col01 = lp_build_swizzle_soa_channel(texel_bld,
1530 neighbors[0][1], chan_swiz);
1531 col00 = lp_build_swizzle_soa_channel(texel_bld,
1532 neighbors[0][0], chan_swiz);
1533
1534 /*
1535 * The spec says for comparison filtering, the comparison
1536 * must happen before synthesizing the new value.
1537 * This means all gathered values are always 0 or 1,
1538 * except for the non-existing texel, which can be 0,1/3,2/3,1...
1539 * Seems like we'd be allowed to just return 0 or 1 too, so we
1540 * could simplify and pass down the compare mask values to the
1541 * end (using int arithmetic/compare on the mask values to
1542 * construct the fourth texel) and only there convert to floats
1543 * but it's probably not worth it (it might be easier for the cpu
1544 * but not for the code)...
1545 */
1546 if (bld->static_sampler_state->compare_mode !=
1547 PIPE_TEX_COMPARE_NONE) {
1548 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1549 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1550 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1551 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1552 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1553 col00 = lp_build_select(texel_bld, cmpval00,
1554 texel_bld->one, texel_bld->zero);
1555 col01 = lp_build_select(texel_bld, cmpval01,
1556 texel_bld->one, texel_bld->zero);
1557 col10 = lp_build_select(texel_bld, cmpval10,
1558 texel_bld->one, texel_bld->zero);
1559 col11 = lp_build_select(texel_bld, cmpval11,
1560 texel_bld->one, texel_bld->zero);
1561 }
1562
1563 /*
1564 * Null out corner color.
1565 */
1566 col00 = lp_build_andnot(coord_bld, col00, c00f);
1567 col01 = lp_build_andnot(coord_bld, col01, c01f);
1568 col10 = lp_build_andnot(coord_bld, col10, c10f);
1569 col11 = lp_build_andnot(coord_bld, col11, c11f);
1570
1571 /*
1572 * New corner texel color is all colors added / 3.
1573 */
1574 colc0 = lp_build_add(coord_bld, col00, col01);
1575 colc1 = lp_build_add(coord_bld, col10, col11);
1576 colc = lp_build_add(coord_bld, colc0, colc1);
1577 colc = lp_build_mul(coord_bld, one_third, colc);
1578
1579 /*
1580 * Replace the corner texel color with the new value.
1581 */
1582 col00 = lp_build_select(coord_bld, c00, colc, col00);
1583 col01 = lp_build_select(coord_bld, c01, colc, col01);
1584 col10 = lp_build_select(coord_bld, c10, colc, col10);
1585 col11 = lp_build_select(coord_bld, c11, colc, col11);
1586
1587 colors0[0] = col10;
1588 colors0[1] = col11;
1589 colors0[2] = col01;
1590 colors0[3] = col00;
1591 }
1592
1593 LLVMBuildStore(builder, colors0[0], colorss[0]);
1594 LLVMBuildStore(builder, colors0[1], colorss[1]);
1595 LLVMBuildStore(builder, colors0[2], colorss[2]);
1596 LLVMBuildStore(builder, colors0[3], colorss[3]);
1597
1598 lp_build_else(&corner_if);
1599 }
1600
1601 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1602 if (is_gather) {
1603 /*
1604 * Just assign the red channel (no component selection yet).
1605 * This is a bit hackish, we usually do the swizzle at the
1606 * end of sampling (much less values to swizzle), but this
1607 * obviously cannot work when using gather.
1608 */
1609 colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1610 neighbors[1][0],
1611 chan_swiz);
1612 colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1613 neighbors[1][1],
1614 chan_swiz);
1615 colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1616 neighbors[0][1],
1617 chan_swiz);
1618 colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1619 neighbors[0][0],
1620 chan_swiz);
1621 }
1622 else {
1623 /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1624 lp_build_reduce_filter_2d(texel_bld,
1625 bld->static_sampler_state->reduction_mode,
1626 0,
1627 4,
1628 s_fpart,
1629 t_fpart,
1630 neighbors[0][0],
1631 neighbors[0][1],
1632 neighbors[1][0],
1633 neighbors[1][1],
1634 colors0);
1635 }
1636 }
1637 else {
1638 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1639 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1640 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1641 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1642 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1643
1644 if (is_gather) {
1645 /* more hacks for swizzling, should be X, ONE or ZERO... */
1646 colors0[0] = lp_build_select(texel_bld, cmpval10,
1647 texel_bld->one, texel_bld->zero);
1648 colors0[1] = lp_build_select(texel_bld, cmpval11,
1649 texel_bld->one, texel_bld->zero);
1650 colors0[2] = lp_build_select(texel_bld, cmpval01,
1651 texel_bld->one, texel_bld->zero);
1652 colors0[3] = lp_build_select(texel_bld, cmpval00,
1653 texel_bld->one, texel_bld->zero);
1654 }
1655 else {
1656 colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1657 cmpval00, cmpval01, cmpval10, cmpval11);
1658 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1659 }
1660 }
1661
1662 if (have_corners && accurate_cube_corners &&
1663 bld->static_sampler_state->reduction_mode == PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE) {
1664 LLVMBuildStore(builder, colors0[0], colorss[0]);
1665 LLVMBuildStore(builder, colors0[1], colorss[1]);
1666 LLVMBuildStore(builder, colors0[2], colorss[2]);
1667 LLVMBuildStore(builder, colors0[3], colorss[3]);
1668
1669 lp_build_endif(&corner_if);
1670
1671 colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1672 colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1673 colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1674 colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1675 }
1676
1677 if (dims == 3) {
1678 LLVMValueRef neighbors1[2][2][4];
1679 LLVMValueRef colors1[4];
1680
1681 assert(!is_gather);
1682
1683 /* get x0/x1/y0/y1 texels at z1 */
1684 lp_build_sample_texel_soa(bld,
1685 width_vec, height_vec, depth_vec,
1686 x00, y00, z1,
1687 row_stride_vec, img_stride_vec,
1688 data_ptr, mipoffsets, neighbors1[0][0]);
1689 lp_build_sample_texel_soa(bld,
1690 width_vec, height_vec, depth_vec,
1691 x01, y01, z1,
1692 row_stride_vec, img_stride_vec,
1693 data_ptr, mipoffsets, neighbors1[0][1]);
1694 lp_build_sample_texel_soa(bld,
1695 width_vec, height_vec, depth_vec,
1696 x10, y10, z1,
1697 row_stride_vec, img_stride_vec,
1698 data_ptr, mipoffsets, neighbors1[1][0]);
1699 lp_build_sample_texel_soa(bld,
1700 width_vec, height_vec, depth_vec,
1701 x11, y11, z1,
1702 row_stride_vec, img_stride_vec,
1703 data_ptr, mipoffsets, neighbors1[1][1]);
1704
1705 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1706 /* Bilinear interpolate the four samples from the second Z slice */
1707 lp_build_reduce_filter_2d(texel_bld,
1708 bld->static_sampler_state->reduction_mode,
1709 0,
1710 4,
1711 s_fpart,
1712 t_fpart,
1713 neighbors1[0][0],
1714 neighbors1[0][1],
1715 neighbors1[1][0],
1716 neighbors1[1][1],
1717 colors1);
1718
1719 /* Linearly interpolate the two samples from the two 3D slices */
1720 lp_build_reduce_filter(texel_bld,
1721 bld->static_sampler_state->reduction_mode,
1722 0,
1723 4,
1724 r_fpart,
1725 colors0,
1726 colors1,
1727 colors_out);
1728 }
1729 else {
1730 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1731 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1732 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1733 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1734 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1735 colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1736 cmpval00, cmpval01, cmpval10, cmpval11);
1737 /* Linearly interpolate the two samples from the two 3D slices */
1738 colors_out[0] = lp_build_lerp(texel_bld,
1739 r_fpart,
1740 colors0[0], colors1[0],
1741 0);
1742 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1743 }
1744 }
1745 else {
1746 /* 2D tex */
1747 for (chan = 0; chan < 4; chan++) {
1748 colors_out[chan] = colors0[chan];
1749 }
1750 }
1751 }
1752 if (is_gather) {
1753 /*
1754 * For gather, we can't do our usual channel swizzling done later,
1755 * so do it here. It only really matters for 0/1 swizzles in case
1756 * of comparison filtering, since in this case the results would be
1757 * wrong, without comparison it should all work out alright but it
1758 * can't hurt to do that here, since it will instantly drop all
1759 * calculations above, though it's a rather stupid idea to do
1760 * gather on a channel which will always return 0 or 1 in any case...
1761 */
1762 if (chan_swiz == PIPE_SWIZZLE_1) {
1763 for (chan = 0; chan < 4; chan++) {
1764 colors_out[chan] = texel_bld->one;
1765 }
1766 } else if (chan_swiz == PIPE_SWIZZLE_0) {
1767 for (chan = 0; chan < 4; chan++) {
1768 colors_out[chan] = texel_bld->zero;
1769 }
1770 }
1771 }
1772 }
1773
1774
1775 /**
1776 * Sample the texture/mipmap using given image filter and mip filter.
1777 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1778 * from (vectors or scalars).
1779 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1780 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       boolean is_gather,
                       const LLVMValueRef *coords,
                       const LLVMValueRef *offsets,
                       LLVMValueRef ilevel0,
                       LLVMValueRef ilevel1,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef *colors_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0 = NULL;
   LLVMValueRef size1 = NULL;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0 = NULL;
   LLVMValueRef data_ptr1 = NULL;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0[4], colors1[4];
   unsigned chan;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      /* One mip level for all lanes: address the level's data directly. */
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but slightly less efficient */
      /* Per-lane mip levels: use the base pointer plus per-lane offsets. */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }
   if (img_filter == PIPE_TEX_FILTER_NEAREST) {
      lp_build_sample_image_nearest(bld, size0,
                                    row_stride0_vec, img_stride0_vec,
                                    data_ptr0, mipoff0, coords, offsets,
                                    colors0);
   }
   else {
      assert(img_filter == PIPE_TEX_FILTER_LINEAR);
      lp_build_sample_image_linear(bld, is_gather, size0, NULL,
                                   row_stride0_vec, img_stride0_vec,
                                   data_ptr0, mipoff0, coords, offsets,
                                   colors0);
   }

   /* Store the first level's colors in the output variables */
   /* colors_out[] are allocas; the conditional second-level path below
    * overwrites them only when mip lerping is actually needed. */
   for (chan = 0; chan < 4; chan++) {
      LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;

      /* need_lerp = lod_fpart > 0 */
      /* Unordered compare (UGT): NaN lod_fpart also takes the lerp path. */
      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
                                   lod_fpart, bld->lodf_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
          * We'll do mip filtering if any of the quads (or individual
          * pixel in case of per-pixel lod) need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it (if there's one lod per quad).
          */
         need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
                                      PIPE_FUNC_GREATER,
                                      lod_fpart, bld->lodf_bld.zero);
         need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
         lp_build_name(need_lerp, "need_lerp");
      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         /*
          * We unfortunately need to clamp lod_fpart here since we can get
          * negative values which would screw up filtering if not all
          * lod_fpart values have same sign.
          */
         lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
                                  bld->lodf_bld.zero);
         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }
         if (img_filter == PIPE_TEX_FILTER_NEAREST) {
            lp_build_sample_image_nearest(bld, size1,
                                          row_stride1_vec, img_stride1_vec,
                                          data_ptr1, mipoff1, coords, offsets,
                                          colors1);
         }
         else {
            /* is_gather is FALSE here: gather never lerps between levels. */
            lp_build_sample_image_linear(bld, FALSE, size1, NULL,
                                         row_stride1_vec, img_stride1_vec,
                                         data_ptr1, mipoff1, coords, offsets,
                                         colors1);
         }

         /* interpolate samples from the two mipmap levels */

         /* Widen per-quad/per-lod lerp weights to a full texel-wide vector. */
         if (bld->num_lods != bld->coord_type.length)
            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                              bld->lodf_bld.type,
                                                              bld->texel_bld.type,
                                                              lod_fpart);

         for (chan = 0; chan < 4; chan++) {
            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
                                          colors0[chan], colors1[chan],
                                          0);
            LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
         }
      }
      lp_build_endif(&if_ctx);
   }
}
1913
1914
1915 /**
1916 * Sample the texture/mipmap using given mip filter, and using
1917 * both nearest and linear filtering at the same time depending
1918 * on linear_mask.
1919 * lod can be per quad but linear_mask is always per pixel.
1920 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1921 * from (vectors or scalars).
1922 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1923 */
static void
lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
                            LLVMValueRef linear_mask,
                            unsigned mip_filter,
                            const LLVMValueRef *coords,
                            const LLVMValueRef *offsets,
                            LLVMValueRef ilevel0,
                            LLVMValueRef ilevel1,
                            LLVMValueRef lod_fpart,
                            LLVMValueRef lod_positive,
                            LLVMValueRef *colors_out)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0 = NULL;
   LLVMValueRef size1 = NULL;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0 = NULL;
   LLVMValueRef data_ptr1 = NULL;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0[4], colors1[4];
   unsigned chan;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }

   /* linear_mask selects per pixel between nearest and linear filtering
    * inside the image sampler itself. */
   lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
                                row_stride0_vec, img_stride0_vec,
                                data_ptr0, mipoff0, coords, offsets,
                                colors0);

   /* Store the first level's colors in the output variables */
   for (chan = 0; chan < 4; chan++) {
      LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   }

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;

      /*
       * We'll do mip filtering if any of the quads (or individual
       * pixel in case of per-pixel lod) need it.
       * Note using lod_positive here not lod_fpart since it may be the same
       * condition as that used in the outer "if" in the caller hence llvm
       * should be able to merge the branches in this case.
       */
      need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
      lp_build_name(need_lerp, "need_lerp");

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         /*
          * We unfortunately need to clamp lod_fpart here since we can get
          * negative values which would screw up filtering if not all
          * lod_fpart values have same sign.
          */
         lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
                                  bld->lodf_bld.zero);
         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }

         lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
                                      row_stride1_vec, img_stride1_vec,
                                      data_ptr1, mipoff1, coords, offsets,
                                      colors1);

         /* interpolate samples from the two mipmap levels */

         /* Widen per-quad/per-lod lerp weights to a full texel-wide vector. */
         if (bld->num_lods != bld->coord_type.length)
            lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                              bld->lodf_bld.type,
                                                              bld->texel_bld.type,
                                                              lod_fpart);

         for (chan = 0; chan < 4; chan++) {
            colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
                                          colors0[chan], colors1[chan],
                                          0);
            LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
         }
      }
      lp_build_endif(&if_ctx);
   }
}
2031
2032
2033 /**
2034 * Build (per-coord) layer value.
2035 * Either clamp layer to valid values or fill in optional out_of_bounds
2036 * value and just return value unclamped.
2037 */
2038 static LLVMValueRef
lp_build_layer_coord(struct lp_build_sample_context * bld,unsigned texture_unit,boolean is_cube_array,LLVMValueRef layer,LLVMValueRef * out_of_bounds)2039 lp_build_layer_coord(struct lp_build_sample_context *bld,
2040 unsigned texture_unit,
2041 boolean is_cube_array,
2042 LLVMValueRef layer,
2043 LLVMValueRef *out_of_bounds)
2044 {
2045 LLVMValueRef num_layers;
2046 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2047
2048 num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2049 bld->context_ptr, texture_unit, NULL);
2050
2051 if (out_of_bounds) {
2052 LLVMValueRef out1, out;
2053 assert(!is_cube_array);
2054 num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2055 out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2056 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2057 *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2058 return layer;
2059 }
2060 else {
2061 LLVMValueRef maxlayer;
2062 LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2063 bld->int_bld.one;
2064 maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2065 maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2066 return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2067 }
2068 }
2069
2070 #define WEIGHT_LUT_SIZE 1024
2071
/**
 * Anisotropic filtering via an EWA-style elliptical weighted average
 * (Heckbert, "Fundamentals of Texture Mapping and Image Warping"):
 * project the pixel footprint into texture space as an ellipse, scan its
 * bounding box, and accumulate weighted texels, with weights looked up in
 * bld->aniso_filter_table (assumed to hold WEIGHT_LUT_SIZE floats --
 * verify against the table's producer).
 */
static void
lp_build_sample_aniso(struct lp_build_sample_context *bld,
                      unsigned img_filter,
                      unsigned mip_filter,
                      boolean is_gather,
                      const LLVMValueRef *coords,
                      const LLVMValueRef *offsets,
                      LLVMValueRef ilevel0,
                      LLVMValueRef ilevel1,
                      LLVMValueRef lod_fpart,
                      LLVMValueRef *colors_out)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
   /* Packed per-quad ddx/ddy of the s/t coords. */
   LLVMValueRef ddx_ddy = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, coords[0], coords[1]);
   LLVMValueRef float_size;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
   LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
   unsigned length = bld->coord_bld.type.length;
   unsigned num_quads = length / 4;
   unsigned i;
   LLVMValueRef filter_table = bld->aniso_filter_table;
   LLVMValueRef size0, row_stride0_vec, img_stride0_vec;
   LLVMValueRef data_ptr0, mipoff0 = NULL;

   /* Only level ilevel0 is sampled (no mip lerp on the aniso path). */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }

   float_size = lp_build_int_to_float(&bld->float_size_in_bld, bld->int_size);

   LLVMValueRef float_size_lvl = lp_build_int_to_float(&bld->float_size_bld, size0);
   /* extract width and height into vectors for use later */
   static const unsigned char swizzle15[] = { /* broadcast elems 1 and 5 (heights) */
      1, 1, 1, 1, 5, 5, 5, 5
   };
   static const unsigned char swizzle04[] = { /* broadcast elems 0 and 4 (widths) */
      0, 0, 0, 0, 4, 4, 4, 4
   };
   LLVMValueRef width_dim, height_dim;

   width_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle04, bld->float_size_bld.type.length, bld->coord_bld.type.length);
   height_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle15, bld->float_size_bld.type.length, bld->coord_bld.type.length);


   /* shuffle width/height for ddx/ddy calculations. */
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];

   for (i = 0; i < num_quads; i++) {
      shuffles[i*4+0] = shuffles[i*4+1] = index0;
      shuffles[i*4+2] = shuffles[i*4+3] = index1;
   }

   LLVMValueRef floatdim = LLVMBuildShuffleVector(builder, float_size, float_size,
                                                  LLVMConstVector(shuffles, length), "");

   /* Convert derivatives to texel units of the base level... */
   ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, floatdim);

   /* ...then rescale by 1 / (1 << ilevel0) to the chosen mip level. */
   LLVMValueRef scaling = lp_build_shl(&bld->leveli_bld, bld->leveli_bld.one, ilevel0);
   scaling = lp_build_int_to_float(&bld->levelf_bld, scaling);
   scaling = lp_build_rcp(&bld->levelf_bld, scaling);

   if (bld->num_lods != length) {
      if (bld->levelf_bld.type.length == 1)
         scaling = lp_build_broadcast_scalar(coord_bld,
                                             scaling);
      else
         scaling = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
                                                         bld->levelf_bld.type,
                                                         coord_bld->type,
                                                         scaling);
   }

   ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, scaling);

   static const unsigned char swizzle01[] = { /* broadcast s derivatives */
      0, 1, 0, 1,
   };
   static const unsigned char swizzle23[] = { /* broadcast t derivatives */
      2, 3, 2, 3,
   };

   LLVMValueRef ddx_ddys, ddx_ddyt;
   ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle01);
   ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle23);

   /* compute ellipse coefficients */
   /* * A*x*x + B*x*y + C*y*y = F.*/
   /* float A = vx*vx+vy*vy+1; */
   LLVMValueRef A = lp_build_mul(coord_bld, ddx_ddyt, ddx_ddyt);

   LLVMValueRef Ay = lp_build_swizzle_aos(coord_bld, A, swizzle15);
   A = lp_build_add(coord_bld, A, Ay);
   A = lp_build_add(coord_bld, A, coord_bld->one);
   A = lp_build_swizzle_aos(coord_bld, A, swizzle04);

   /* float B = -2*(ux*vx+uy*vy); */
   LLVMValueRef B = lp_build_mul(coord_bld, ddx_ddys, ddx_ddyt);
   LLVMValueRef By = lp_build_swizzle_aos(coord_bld, B, swizzle15);
   B = lp_build_add(coord_bld, B, By);
   B = lp_build_mul_imm(coord_bld, B, -2);
   B = lp_build_swizzle_aos(coord_bld, B, swizzle04);

   /* float C = ux*ux+uy*uy+1; */
   LLVMValueRef C = lp_build_mul(coord_bld, ddx_ddys, ddx_ddys);
   LLVMValueRef Cy = lp_build_swizzle_aos(coord_bld, C, swizzle15);
   C = lp_build_add(coord_bld, C, Cy);
   C = lp_build_add(coord_bld, C, coord_bld->one);
   C = lp_build_swizzle_aos(coord_bld, C, swizzle04);

   /* float F = A*C-B*B/4.0f; */
   LLVMValueRef F = lp_build_mul(coord_bld, B, B);
   F = lp_build_div(coord_bld, F, lp_build_const_vec(gallivm, coord_bld->type, 4.0));
   LLVMValueRef F_p2 = lp_build_mul(coord_bld, A, C);
   F = lp_build_sub(coord_bld, F_p2, F);

   /* compute ellipse bounding box in texture space */
   /* const float d = -B*B+4.0f*C*A; */
   LLVMValueRef d = lp_build_sub(coord_bld, coord_bld->zero, lp_build_mul(coord_bld, B, B));
   LLVMValueRef d_p2 = lp_build_mul(coord_bld, A, C);
   d_p2 = lp_build_mul_imm(coord_bld, d_p2, 4);
   d = lp_build_add(coord_bld, d, d_p2);

   /* const float box_u = 2.0f / d * sqrtf(d*C*F); */
   /* box_u -> half of bbox width */
   LLVMValueRef temp;
   temp = lp_build_mul(coord_bld, d, C);
   temp = lp_build_mul(coord_bld, temp, F);
   temp = lp_build_sqrt(coord_bld, temp);

   LLVMValueRef box_u = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
   box_u = lp_build_mul(coord_bld, box_u, temp);

   /* const float box_v = 2.0f / d * sqrtf(A*d*F); */
   /* box_v -> half of bbox height */
   temp = lp_build_mul(coord_bld, A, d);
   temp = lp_build_mul(coord_bld, temp, F);
   temp = lp_build_sqrt(coord_bld, temp);

   LLVMValueRef box_v = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
   box_v = lp_build_mul(coord_bld, box_v, temp);

   /* Scale ellipse formula to directly index the Filter Lookup Table.
    * i.e. scale so that F = WEIGHT_LUT_SIZE-1
    */
   LLVMValueRef formScale = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, WEIGHT_LUT_SIZE - 1), F);

   A = lp_build_mul(coord_bld, A, formScale);
   B = lp_build_mul(coord_bld, B, formScale);
   C = lp_build_mul(coord_bld, C, formScale);
   /* F *= formScale; */ /* no need to scale F as we don't use it below here */

   /* Second difference of q along u: q(u+1) - q(u) increases by 2*A. */
   LLVMValueRef ddq = lp_build_mul_imm(coord_bld, A, 2);

   /* Heckbert MS thesis, p. 59; scan over the bounding box of the ellipse
    * and incrementally update the value of Ax^2 + Bxy + Cy^2; when this
    * value, q, is less than F, we're inside the ellipse
    */

   LLVMValueRef float_size0 = lp_build_int_to_float(float_size_bld, bld->int_size);
   LLVMValueRef width0 = lp_build_extract_broadcast(gallivm,
                                                    float_size_bld->type,
                                                    coord_bld->type,
                                                    float_size0, index0);
   LLVMValueRef height0 = lp_build_extract_broadcast(gallivm,
                                                     float_size_bld->type,
                                                     coord_bld->type,
                                                     float_size0, index1);

   /* texture->width0 * scaling */
   width0 = lp_build_mul(coord_bld, width0, scaling);
   /* texture->height0 * scaling */
   height0 = lp_build_mul(coord_bld, height0, scaling);

   /* tex_u = -0.5f * s[j] * texture->width0 * scaling */
   LLVMValueRef tex_u = lp_build_mul(coord_bld, coords[0], width0);
   tex_u = lp_build_add(coord_bld, tex_u, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));

   /* tex_v = -0.5f * t[j] * texture->height0 * scaling */
   LLVMValueRef tex_v = lp_build_mul(coord_bld, coords[1], height0);
   tex_v = lp_build_add(coord_bld, tex_v, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));

   /* const int u0 = (int) floorf(tex_u - box_u); */
   LLVMValueRef u0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_u, box_u)));
   /* const int u1 = (int) ceilf(tex_u + box_u); */
   LLVMValueRef u1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_u, box_u)));

   /* const int v0 = (int) floorf(tex_v - box_v); */
   LLVMValueRef v0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_v, box_v)));
   /* const int v1 = (int) ceilf(tex_v + box_v); */
   LLVMValueRef v1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_v, box_v)));

   /* const float U = u0 - tex_u; */
   LLVMValueRef U = lp_build_sub(coord_bld, lp_build_int_to_float(coord_bld, u0), tex_u);

   /* A * (2 * U + 1) */
   LLVMValueRef dq_base = lp_build_mul_imm(coord_bld, U, 2);
   dq_base = lp_build_add(coord_bld, dq_base, coord_bld->one);
   dq_base = lp_build_mul(coord_bld, dq_base, A);

   /* A * U * U */
   LLVMValueRef q_base = lp_build_mul(coord_bld, U, U);
   q_base = lp_build_mul(coord_bld, q_base, A);

   /* Accumulators (allocas so the hand-built loops below can mutate them):
    * per-channel weighted color sums and the weight denominator. */
   LLVMValueRef colors0[4];
   LLVMValueRef den_store = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "den");

   unsigned chan;
   for (chan = 0; chan < 4; chan++)
      colors0[chan] = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "colors");

   LLVMValueRef q_store, dq_store;
   q_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "q");
   dq_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "dq");

   LLVMValueRef v_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "v_limiter");
   LLVMValueRef u_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "u_limiter");

   LLVMBuildStore(builder, v0, v_limiter);

   /* create an LLVM loop block for the V iterator */
   LLVMBasicBlockRef v_loop_block = lp_build_insert_new_block(gallivm, "vloop");

   LLVMBuildBr(builder, v_loop_block);
   LLVMPositionBuilderAtEnd(builder, v_loop_block);

   LLVMValueRef v_val = LLVMBuildLoad(builder, v_limiter, "");
   LLVMValueRef v_mask = LLVMBuildICmp(builder,
                                       LLVMIntSLE,
                                       v_val,
                                       v1, "");

   /* loop over V values. */
   {
      /* const float V = v - tex_v; */
      LLVMValueRef V = lp_build_sub(coord_bld, lp_build_int_to_float(coord_bld, v_val), tex_v);

      /* float dq = dq_base + B * V; */
      LLVMValueRef dq = lp_build_mul(coord_bld, V, B);
      dq = lp_build_add(coord_bld, dq, dq_base);

      /* float q = (C * V + B * U) * V + q_base */
      LLVMValueRef q = lp_build_mul(coord_bld, C, V);
      q = lp_build_add(coord_bld, q, lp_build_mul(coord_bld, B, U));
      q = lp_build_mul(coord_bld, q, V);
      q = lp_build_add(coord_bld, q, q_base);

      LLVMBuildStore(builder, q, q_store);
      LLVMBuildStore(builder, dq, dq_store);

      LLVMBuildStore(builder, u0, u_limiter);

      /* create an LLVM loop block for the U iterator */
      LLVMBasicBlockRef u_loop_block = lp_build_insert_new_block(gallivm, "uloop");

      LLVMBuildBr(builder, u_loop_block);
      LLVMPositionBuilderAtEnd(builder, u_loop_block);

      LLVMValueRef u_val = LLVMBuildLoad(builder, u_limiter, "");
      LLVMValueRef u_mask = LLVMBuildICmp(builder,
                                          LLVMIntSLE,
                                          u_val,
                                          u1, "");

      /* loop over U values */
      {
         /* q = (int)q */
         q = lp_build_itrunc(coord_bld, LLVMBuildLoad(builder, q_store, ""));

         /*
          * avoid OOB access to filter table, generate a mask for q > 0x3ff
          * (i.e. q outside the WEIGHT_LUT_SIZE table), then clamp/wrap it.
          */
         LLVMValueRef q_mask = LLVMBuildICmp(builder,
                                             LLVMIntSLE,
                                             q,
                                             lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff), "");
         q_mask = LLVMBuildSExt(builder, q_mask, bld->int_coord_bld.vec_type, "");

         q = lp_build_max(&bld->int_coord_bld, q, bld->int_coord_bld.zero);
         q = lp_build_and(&bld->int_coord_bld, q, lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff));

         /* update the offsets to deal with float size. */
         q = lp_build_mul_imm(&bld->int_coord_bld, q, 4);
         filter_table = LLVMBuildBitCast(gallivm->builder, filter_table, LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");

         /* Lookup weights in filter table */
         LLVMValueRef weights = lp_build_gather(gallivm, coord_bld->type.length,
                                                coord_bld->type.width,
                                                lp_elem_type(coord_bld->type),
                                                TRUE, filter_table, q, TRUE);

         /*
          * Mask off the weights here which should ensure no-op for loops
          * where some of the u/v values are not being calculated.
          */
         weights = LLVMBuildBitCast(builder, weights, bld->int_coord_bld.vec_type, "");
         weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, ""));
         weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, v_mask, bld->int_coord_bld.vec_type, ""));
         weights = lp_build_and(&bld->int_coord_bld, weights, q_mask);
         weights = LLVMBuildBitCast(builder, weights, bld->coord_bld.vec_type, "");

         /* if the weights are all 0 avoid doing the sampling at all. */
         struct lp_build_if_state noloadw0;

         LLVMValueRef wnz = LLVMBuildFCmp(gallivm->builder, LLVMRealUNE,
                                          weights, bld->coord_bld.zero, "");
         wnz = LLVMBuildSExt(builder, wnz, bld->int_coord_bld.vec_type, "");
         wnz = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, wnz);
         lp_build_if(&noloadw0, gallivm, wnz);
         LLVMValueRef new_coords[3];
         new_coords[0] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, u_val), width_dim);
         new_coords[1] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, v_val), height_dim);
         new_coords[2] = coords[2];

         /* fetch the texel at (u,v) */
         LLVMValueRef temp_colors[4];
         lp_build_sample_image_nearest(bld, size0,
                                       row_stride0_vec, img_stride0_vec,
                                       data_ptr0, mipoff0, new_coords, offsets,
                                       temp_colors);

         /* multiply colors by weight and add in. */
         for (chan = 0; chan < 4; chan++) {
            LLVMValueRef tcolor = LLVMBuildLoad(builder, colors0[chan], "");

            tcolor = lp_build_add(&bld->texel_bld, tcolor, lp_build_mul(&bld->texel_bld, temp_colors[chan], weights));
            LLVMBuildStore(builder, tcolor, colors0[chan]);
         }

         /* den += weight; */
         LLVMValueRef den = LLVMBuildLoad(builder, den_store, "");
         den = lp_build_add(&bld->texel_bld, den, weights);
         LLVMBuildStore(builder, den, den_store);

         lp_build_endif(&noloadw0);
         /* q += dq; */
         /* dq += ddq; */
         q = LLVMBuildLoad(builder, q_store, "");
         dq = LLVMBuildLoad(builder, dq_store, "");
         q = lp_build_add(coord_bld, q, dq);
         dq = lp_build_add(coord_bld, dq, ddq);
         LLVMBuildStore(builder, q, q_store);
         LLVMBuildStore(builder, dq, dq_store);
      }
      /* u += 1 */
      u_val = LLVMBuildLoad(builder, u_limiter, "");
      u_val = lp_build_add(&bld->int_coord_bld, u_val, bld->int_coord_bld.one);
      LLVMBuildStore(builder, u_val, u_limiter);

      /* Keep looping while any lane still has u <= u1. */
      u_mask = LLVMBuildICmp(builder,
                             LLVMIntSLE,
                             u_val,
                             u1, "");
      LLVMValueRef u_end_cond = LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, "");
      u_end_cond = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, u_end_cond);

      LLVMBasicBlockRef u_end_loop = lp_build_insert_new_block(gallivm, "u_end_loop");

      LLVMBuildCondBr(builder, u_end_cond,
                      u_loop_block, u_end_loop);

      LLVMPositionBuilderAtEnd(builder, u_end_loop);

   }

   /* v += 1 */
   v_val = LLVMBuildLoad(builder, v_limiter, "");
   v_val = lp_build_add(&bld->int_coord_bld, v_val, bld->int_coord_bld.one);
   LLVMBuildStore(builder, v_val, v_limiter);

   /* Keep looping while any lane still has v <= v1. */
   v_mask = LLVMBuildICmp(builder,
                          LLVMIntSLE,
                          v_val,
                          v1, "");
   LLVMValueRef v_end_cond = LLVMBuildSExt(builder, v_mask, bld->int_coord_bld.vec_type, "");
   v_end_cond = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, v_end_cond);

   LLVMBasicBlockRef v_end_loop = lp_build_insert_new_block(gallivm, "v_end_loop");

   LLVMBuildCondBr(builder, v_end_cond,
                   v_loop_block, v_end_loop);

   LLVMPositionBuilderAtEnd(builder, v_end_loop);

   /* Normalize: color = sum(weight * texel) / sum(weight). */
   LLVMValueRef den = LLVMBuildLoad(builder, den_store, "");

   for (chan = 0; chan < 4; chan++)
      colors0[chan] = lp_build_div(&bld->texel_bld, LLVMBuildLoad(builder, colors0[chan], ""), den);
   LLVMValueRef den0 = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_EQUAL, den, bld->coord_bld.zero);

   LLVMValueRef den0_any = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, den0);

   /* Lanes with zero total weight (degenerate ellipse) fall back to a
    * plain bilinear sample so we never emit 0/0 results. */
   struct lp_build_if_state den0_fallback;
   lp_build_if(&den0_fallback, gallivm, den0_any);

   LLVMValueRef colors_den0[4];
   lp_build_sample_image_linear(bld, false, size0, NULL,
                                row_stride0_vec, img_stride0_vec,
                                data_ptr0, mipoff0, coords, offsets,
                                colors_den0);
   for (chan = 0; chan < 4; chan++) {
      LLVMValueRef chan_val = lp_build_select(&bld->texel_bld, den0, colors_den0[chan], colors0[chan]);
      LLVMBuildStore(builder, chan_val, colors_out[chan]);
   }
   lp_build_else(&den0_fallback);
   for (chan = 0; chan < 4; chan++)
      LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
   lp_build_endif(&den0_fallback);
}
2493
2494 /**
2495 * Calculate cube face, lod, mip levels.
2496 */
2497 static void
lp_build_sample_common(struct lp_build_sample_context * bld,boolean is_lodq,unsigned texture_index,unsigned sampler_index,LLVMValueRef * coords,const struct lp_derivatives * derivs,LLVMValueRef lod_bias,LLVMValueRef explicit_lod,LLVMValueRef * lod_pos_or_zero,LLVMValueRef * lod,LLVMValueRef * lod_fpart,LLVMValueRef * ilevel0,LLVMValueRef * ilevel1)2498 lp_build_sample_common(struct lp_build_sample_context *bld,
2499 boolean is_lodq,
2500 unsigned texture_index,
2501 unsigned sampler_index,
2502 LLVMValueRef *coords,
2503 const struct lp_derivatives *derivs, /* optional */
2504 LLVMValueRef lod_bias, /* optional */
2505 LLVMValueRef explicit_lod, /* optional */
2506 LLVMValueRef *lod_pos_or_zero,
2507 LLVMValueRef *lod,
2508 LLVMValueRef *lod_fpart,
2509 LLVMValueRef *ilevel0,
2510 LLVMValueRef *ilevel1)
2511 {
2512 const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2513 const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2514 const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2515 const unsigned target = bld->static_texture_state->target;
2516 const bool aniso = bld->static_sampler_state->aniso;
2517 LLVMValueRef first_level, cube_rho = NULL;
2518 LLVMValueRef lod_ipart = NULL;
2519 struct lp_derivatives cube_derivs;
2520
2521 /*
2522 printf("%s mip %d min %d mag %d\n", __FUNCTION__,
2523 mip_filter, min_filter, mag_filter);
2524 */
2525
2526 /*
2527 * Choose cube face, recompute texcoords for the chosen face and
2528 * compute rho here too (as it requires transform of derivatives).
2529 */
2530 if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2531 boolean need_derivs;
2532 need_derivs = ((min_filter != mag_filter ||
2533 mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2534 !bld->static_sampler_state->min_max_lod_equal &&
2535 !explicit_lod);
2536 lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2537 derivs = &cube_derivs;
2538 if (target == PIPE_TEXTURE_CUBE_ARRAY && !is_lodq) {
2539 /* calculate cube layer coord now */
2540 LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2541 LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2542 layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2543 coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2544 /* because of seamless filtering can't add it to face (coords[2]) here. */
2545 }
2546 }
2547 else if ((target == PIPE_TEXTURE_1D_ARRAY ||
2548 target == PIPE_TEXTURE_2D_ARRAY) && !is_lodq) {
2549 coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2550 coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2551 }
2552
2553 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2554 /*
2555 * Clamp p coords to [0,1] for fixed function depth texture format here.
2556 * Technically this is not entirely correct for unorm depth as the ref value
2557 * should be converted to the depth format (quantization!) and comparison
2558 * then done in texture format. This would actually help performance (since
2559 * only need to do it once and could save the per-sample conversion of texels
2560 * to floats instead), but it would need more messy code (would need to push
2561 * at least some bits down to actual fetch so conversion could be skipped,
2562 * and would have ugly interaction with border color, would need to convert
2563 * border color to that format too or do some other tricks to make it work).
2564 */
2565 const struct util_format_description *format_desc = bld->format_desc;
2566 unsigned chan_type;
2567 /* not entirely sure we couldn't end up with non-valid swizzle here */
2568 chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2569 format_desc->channel[format_desc->swizzle[0]].type :
2570 UTIL_FORMAT_TYPE_FLOAT;
2571 if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2572 coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2573 bld->coord_bld.zero, bld->coord_bld.one);
2574 }
2575 }
2576
2577 /*
2578 * Compute the level of detail (float).
2579 */
2580 if (min_filter != mag_filter ||
2581 mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2582 LLVMValueRef max_aniso = NULL;
2583
2584 if (aniso)
2585 max_aniso = bld->dynamic_state->max_aniso(bld->dynamic_state,
2586 bld->gallivm,
2587 bld->context_ptr,
2588 sampler_index);
2589
2590 /* Need to compute lod either to choose mipmap levels or to
2591 * distinguish between minification/magnification with one mipmap level.
2592 */
2593 lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2594 coords[0], coords[1], coords[2], cube_rho,
2595 derivs, lod_bias, explicit_lod,
2596 mip_filter, max_aniso, lod,
2597 &lod_ipart, lod_fpart, lod_pos_or_zero);
2598 if (is_lodq) {
2599 LLVMValueRef last_level;
2600 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2601 bld->gallivm,
2602 bld->context_ptr,
2603 texture_index, NULL);
2604 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2605 bld->gallivm,
2606 bld->context_ptr,
2607 texture_index, NULL);
2608 last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2609 last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2610 last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2611
2612 switch (mip_filter) {
2613 case PIPE_TEX_MIPFILTER_NONE:
2614 *lod_fpart = bld->lodf_bld.zero;
2615 break;
2616 case PIPE_TEX_MIPFILTER_NEAREST:
2617 *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2618 FALLTHROUGH;
2619 case PIPE_TEX_MIPFILTER_LINEAR:
2620 *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2621 bld->lodf_bld.zero, last_level);
2622 break;
2623 }
2624 return;
2625 }
2626
2627 } else {
2628 lod_ipart = bld->lodi_bld.zero;
2629 *lod_pos_or_zero = bld->lodi_bld.zero;
2630 }
2631
2632 if ((bld->num_lods != bld->num_mips || bld->num_lods == 1) &&
2633 bld->lodi_bld.type.length != 1) {
2634 /* only makes sense if there's just a single mip level */
2635 assert(bld->num_mips == 1);
2636 lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2637 }
2638
2639 /*
2640 * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2641 */
2642
2643 if (aniso) {
2644 lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2645 return;
2646 }
2647
2648 switch (mip_filter) {
2649 default:
2650 debug_assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2651 #if defined(NDEBUG) || defined(DEBUG)
2652 FALLTHROUGH;
2653 #endif
2654 case PIPE_TEX_MIPFILTER_NONE:
2655 /* always use mip level 0 */
2656 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2657 bld->gallivm, bld->context_ptr,
2658 texture_index, NULL);
2659 first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2660 *ilevel0 = first_level;
2661 break;
2662 case PIPE_TEX_MIPFILTER_NEAREST:
2663 assert(lod_ipart);
2664 lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2665 break;
2666 case PIPE_TEX_MIPFILTER_LINEAR:
2667 assert(lod_ipart);
2668 assert(*lod_fpart);
2669 lp_build_linear_mip_levels(bld, texture_index,
2670 lod_ipart, lod_fpart,
2671 ilevel0, ilevel1);
2672 break;
2673 }
2674 }
2675
2676 static void
lp_build_clamp_border_color(struct lp_build_sample_context * bld,unsigned sampler_unit)2677 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2678 unsigned sampler_unit)
2679 {
2680 struct gallivm_state *gallivm = bld->gallivm;
2681 LLVMBuilderRef builder = gallivm->builder;
2682 LLVMValueRef border_color_ptr =
2683 bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2684 bld->context_ptr, sampler_unit);
2685 LLVMValueRef border_color;
2686 const struct util_format_description *format_desc = bld->format_desc;
2687 struct lp_type vec4_type = bld->texel_type;
2688 struct lp_build_context vec4_bld;
2689 LLVMValueRef min_clamp = NULL;
2690 LLVMValueRef max_clamp = NULL;
2691
2692 /*
2693 * For normalized format need to clamp border color (technically
2694 * probably should also quantize the data). Really sucks doing this
2695 * here but can't avoid at least for now since this is part of
2696 * sampler state and texture format is part of sampler_view state.
2697 * GL expects also expects clamping for uint/sint formats too so
2698 * do that as well (d3d10 can't end up here with uint/sint since it
2699 * only supports them with ld).
2700 */
2701 vec4_type.length = 4;
2702 lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2703
2704 /*
2705 * Vectorized clamping of border color. Loading is a bit of a hack since
2706 * we just cast the pointer to float array to pointer to vec4
2707 * (int or float).
2708 */
2709 border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2710 lp_build_const_int32(gallivm, 0));
2711 border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2712 LLVMPointerType(vec4_bld.vec_type, 0), "");
2713 border_color = LLVMBuildLoad(builder, border_color_ptr, "");
2714 /* we don't have aligned type in the dynamic state unfortunately */
2715 LLVMSetAlignment(border_color, 4);
2716
2717 /*
2718 * Instead of having some incredibly complex logic which will try to figure out
2719 * clamping necessary for each channel, simply use the first channel, and treat
2720 * mixed signed/unsigned normalized formats specially.
2721 * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
2722 * good reason.)
2723 */
2724 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2725 int chan;
2726 /* d/s needs special handling because both present means just sampling depth */
2727 if (util_format_is_depth_and_stencil(format_desc->format)) {
2728 chan = format_desc->swizzle[0];
2729 }
2730 else {
2731 chan = util_format_get_first_non_void_channel(format_desc->format);
2732 }
2733 if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2734 unsigned chan_type = format_desc->channel[chan].type;
2735 unsigned chan_norm = format_desc->channel[chan].normalized;
2736 unsigned chan_pure = format_desc->channel[chan].pure_integer;
2737 if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2738 if (chan_norm) {
2739 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2740 max_clamp = vec4_bld.one;
2741 }
2742 else if (chan_pure) {
2743 /*
2744 * Border color was stored as int, hence need min/max clamp
2745 * only if chan has less than 32 bits..
2746 */
2747 unsigned chan_size = format_desc->channel[chan].size;
2748 if (chan_size < 32) {
2749 min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2750 0 - (1 << (chan_size - 1)));
2751 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2752 (1 << (chan_size - 1)) - 1);
2753 }
2754 }
2755 /* TODO: no idea about non-pure, non-normalized! */
2756 }
2757 else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2758 if (chan_norm) {
2759 min_clamp = vec4_bld.zero;
2760 max_clamp = vec4_bld.one;
2761 }
2762 /*
2763 * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
2764 * we use Z32_FLOAT_S8X24 to imply sampling depth component
2765 * and ignoring stencil, which will blow up here if we try to
2766 * do a uint clamp in a float texel build...
2767 * And even if we had that format, mesa st also thinks using z24s8
2768 * means depth sampling ignoring stencil.
2769 */
2770 else if (chan_pure) {
2771 /*
2772 * Border color was stored as uint, hence never need min
2773 * clamp, and only need max clamp if chan has less than 32 bits.
2774 */
2775 unsigned chan_size = format_desc->channel[chan].size;
2776 if (chan_size < 32) {
2777 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2778 (1 << chan_size) - 1);
2779 }
2780 /* TODO: no idea about non-pure, non-normalized! */
2781 }
2782 }
2783 else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2784 /* TODO: I have no idea what clamp this would need if any! */
2785 }
2786 }
2787 /* mixed plain formats (or different pure size) */
2788 switch (format_desc->format) {
2789 case PIPE_FORMAT_B10G10R10A2_UINT:
2790 case PIPE_FORMAT_R10G10B10A2_UINT:
2791 {
2792 unsigned max10 = (1 << 10) - 1;
2793 max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2794 max10, (1 << 2) - 1, NULL);
2795 }
2796 break;
2797 case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2798 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2799 -1.0F, 0.0F, NULL);
2800 max_clamp = vec4_bld.one;
2801 break;
2802 case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2803 case PIPE_FORMAT_R5SG5SB6U_NORM:
2804 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2805 0.0F, 0.0F, NULL);
2806 max_clamp = vec4_bld.one;
2807 break;
2808 default:
2809 break;
2810 }
2811 }
2812 else {
2813 /* cannot figure this out from format description */
2814 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2815 /* s3tc formats are always unorm */
2816 min_clamp = vec4_bld.zero;
2817 max_clamp = vec4_bld.one;
2818 }
2819 else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2820 format_desc->layout == UTIL_FORMAT_LAYOUT_ETC ||
2821 format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
2822 switch (format_desc->format) {
2823 case PIPE_FORMAT_RGTC1_UNORM:
2824 case PIPE_FORMAT_RGTC2_UNORM:
2825 case PIPE_FORMAT_LATC1_UNORM:
2826 case PIPE_FORMAT_LATC2_UNORM:
2827 case PIPE_FORMAT_ETC1_RGB8:
2828 case PIPE_FORMAT_BPTC_RGBA_UNORM:
2829 case PIPE_FORMAT_BPTC_SRGBA:
2830 min_clamp = vec4_bld.zero;
2831 max_clamp = vec4_bld.one;
2832 break;
2833 case PIPE_FORMAT_RGTC1_SNORM:
2834 case PIPE_FORMAT_RGTC2_SNORM:
2835 case PIPE_FORMAT_LATC1_SNORM:
2836 case PIPE_FORMAT_LATC2_SNORM:
2837 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2838 max_clamp = vec4_bld.one;
2839 break;
2840 case PIPE_FORMAT_BPTC_RGB_FLOAT:
2841 /* not sure if we should clamp to max half float? */
2842 break;
2843 case PIPE_FORMAT_BPTC_RGB_UFLOAT:
2844 min_clamp = vec4_bld.zero;
2845 break;
2846 default:
2847 assert(0);
2848 break;
2849 }
2850 }
2851 /*
2852 * all others from subsampled/other group, though we don't care
2853 * about yuv (and should not have any from zs here)
2854 */
2855 else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2856 switch (format_desc->format) {
2857 case PIPE_FORMAT_R8G8_B8G8_UNORM:
2858 case PIPE_FORMAT_G8R8_G8B8_UNORM:
2859 case PIPE_FORMAT_G8R8_B8R8_UNORM:
2860 case PIPE_FORMAT_R8G8_R8B8_UNORM:
2861 case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2862 min_clamp = vec4_bld.zero;
2863 max_clamp = vec4_bld.one;
2864 break;
2865 case PIPE_FORMAT_R8G8Bx_SNORM:
2866 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2867 max_clamp = vec4_bld.one;
2868 break;
2869 /*
2870 * Note smallfloat formats usually don't need clamping
2871 * (they still have infinite range) however this is not
2872 * true for r11g11b10 and r9g9b9e5, which can't represent
2873 * negative numbers (and additionally r9g9b9e5 can't represent
2874 * very large numbers). d3d10 seems happy without clamping in
2875 * this case, but gl spec is pretty clear: "for floating
2876 * point and integer formats, border values are clamped to
2877 * the representable range of the format" so do that here.
2878 */
2879 case PIPE_FORMAT_R11G11B10_FLOAT:
2880 min_clamp = vec4_bld.zero;
2881 break;
2882 case PIPE_FORMAT_R9G9B9E5_FLOAT:
2883 min_clamp = vec4_bld.zero;
2884 max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2885 break;
2886 default:
2887 assert(0);
2888 break;
2889 }
2890 }
2891 }
2892
2893 if (min_clamp) {
2894 border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2895 }
2896 if (max_clamp) {
2897 border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2898 }
2899
2900 bld->border_color_clamped = border_color;
2901 }
2902
2903
2904 /**
2905 * General texture sampling codegen.
2906 * This function handles texture sampling for all texture targets (1D,
2907 * 2D, 3D, cube) and all filtering modes.
2908 */
2909 static void
lp_build_sample_general(struct lp_build_sample_context * bld,unsigned sampler_unit,boolean is_gather,const LLVMValueRef * coords,const LLVMValueRef * offsets,LLVMValueRef lod_positive,LLVMValueRef lod_fpart,LLVMValueRef ilevel0,LLVMValueRef ilevel1,LLVMValueRef * colors_out)2910 lp_build_sample_general(struct lp_build_sample_context *bld,
2911 unsigned sampler_unit,
2912 boolean is_gather,
2913 const LLVMValueRef *coords,
2914 const LLVMValueRef *offsets,
2915 LLVMValueRef lod_positive,
2916 LLVMValueRef lod_fpart,
2917 LLVMValueRef ilevel0,
2918 LLVMValueRef ilevel1,
2919 LLVMValueRef *colors_out)
2920 {
2921 LLVMBuilderRef builder = bld->gallivm->builder;
2922 const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2923 const unsigned mip_filter = sampler_state->min_mip_filter;
2924 const unsigned min_filter = sampler_state->min_img_filter;
2925 const unsigned mag_filter = sampler_state->mag_img_filter;
2926 LLVMValueRef texels[4];
2927 unsigned chan;
2928
2929 /* if we need border color, (potentially) clamp it now */
2930 if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2931 min_filter,
2932 mag_filter) ||
2933 (bld->dims > 1 &&
2934 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2935 min_filter,
2936 mag_filter)) ||
2937 (bld->dims > 2 &&
2938 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2939 min_filter,
2940 mag_filter))) {
2941 lp_build_clamp_border_color(bld, sampler_unit);
2942 }
2943
2944
2945 /*
2946 * Get/interpolate texture colors.
2947 */
2948
2949 for (chan = 0; chan < 4; ++chan) {
2950 texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2951 lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2952 }
2953
2954 if (sampler_state->aniso) {
2955 lp_build_sample_aniso(bld, PIPE_TEX_FILTER_NEAREST, mip_filter,
2956 false, coords, offsets, ilevel0,
2957 ilevel1, lod_fpart, texels);
2958 } else if (min_filter == mag_filter) {
2959 /* no need to distinguish between minification and magnification */
2960 lp_build_sample_mipmap(bld, min_filter, mip_filter,
2961 is_gather,
2962 coords, offsets,
2963 ilevel0, ilevel1, lod_fpart,
2964 texels);
2965 }
2966 else {
2967 /*
2968 * Could also get rid of the if-logic and always use mipmap_both, both
2969 * for the single lod and multi-lod case if nothing really uses this.
2970 */
2971 if (bld->num_lods == 1) {
2972 /* Emit conditional to choose min image filter or mag image filter
2973 * depending on the lod being > 0 or <= 0, respectively.
2974 */
2975 struct lp_build_if_state if_ctx;
2976
2977 lod_positive = LLVMBuildTrunc(builder, lod_positive,
2978 LLVMInt1TypeInContext(bld->gallivm->context),
2979 "lod_pos");
2980
2981 lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2982 {
2983 /* Use the minification filter */
2984 lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
2985 coords, offsets,
2986 ilevel0, ilevel1, lod_fpart,
2987 texels);
2988 }
2989 lp_build_else(&if_ctx);
2990 {
2991 /* Use the magnification filter */
2992 lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2993 FALSE,
2994 coords, offsets,
2995 ilevel0, NULL, NULL,
2996 texels);
2997 }
2998 lp_build_endif(&if_ctx);
2999 }
3000 else {
3001 LLVMValueRef need_linear, linear_mask;
3002 unsigned mip_filter_for_nearest;
3003 struct lp_build_if_state if_ctx;
3004
3005 if (min_filter == PIPE_TEX_FILTER_LINEAR) {
3006 linear_mask = lod_positive;
3007 mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
3008 }
3009 else {
3010 linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
3011 mip_filter_for_nearest = mip_filter;
3012 }
3013 need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
3014 linear_mask);
3015 lp_build_name(need_linear, "need_linear");
3016
3017 if (bld->num_lods != bld->coord_type.length) {
3018 linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
3019 bld->lodi_type,
3020 bld->int_coord_type,
3021 linear_mask);
3022 }
3023
3024 lp_build_if(&if_ctx, bld->gallivm, need_linear);
3025 {
3026 /*
3027 * Do sampling with both filters simultaneously. This means using
3028 * a linear filter and doing some tricks (with weights) for the pixels
3029 * which need nearest filter.
3030 * Note that it's probably rare some pixels need nearest and some
3031 * linear filter but the fixups required for the nearest pixels
3032 * aren't all that complicated so just always run a combined path
3033 * if at least some pixels require linear.
3034 */
3035 lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
3036 coords, offsets,
3037 ilevel0, ilevel1,
3038 lod_fpart, lod_positive,
3039 texels);
3040 }
3041 lp_build_else(&if_ctx);
3042 {
3043 /*
3044 * All pixels require just nearest filtering, which is way
3045 * cheaper than linear, hence do a separate path for that.
3046 */
3047 lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
3048 mip_filter_for_nearest, FALSE,
3049 coords, offsets,
3050 ilevel0, ilevel1, lod_fpart,
3051 texels);
3052 }
3053 lp_build_endif(&if_ctx);
3054 }
3055 }
3056
3057 for (chan = 0; chan < 4; ++chan) {
3058 colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
3059 lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
3060 }
3061 }
3062
3063
3064 /**
3065 * Texel fetch function.
3066 * In contrast to general sampling there is no filtering, no coord minification,
3067 * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
3068 * directly to be applied to the selected mip level (after adding texel offsets).
3069 * This function handles texel fetch for all targets where texel fetch is supported
3070 * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
3071 */
3072 static void
lp_build_fetch_texel(struct lp_build_sample_context * bld,unsigned texture_unit,LLVMValueRef ms_index,const LLVMValueRef * coords,LLVMValueRef explicit_lod,const LLVMValueRef * offsets,LLVMValueRef * colors_out)3073 lp_build_fetch_texel(struct lp_build_sample_context *bld,
3074 unsigned texture_unit,
3075 LLVMValueRef ms_index,
3076 const LLVMValueRef *coords,
3077 LLVMValueRef explicit_lod,
3078 const LLVMValueRef *offsets,
3079 LLVMValueRef *colors_out)
3080 {
3081 struct lp_build_context *perquadi_bld = &bld->lodi_bld;
3082 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
3083 unsigned dims = bld->dims, chan;
3084 unsigned target = bld->static_texture_state->target;
3085 boolean out_of_bound_ret_zero = TRUE;
3086 LLVMValueRef size, ilevel;
3087 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
3088 LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
3089 LLVMValueRef width, height, depth, i, j;
3090 LLVMValueRef offset, out_of_bounds, out1;
3091
3092 out_of_bounds = int_coord_bld->zero;
3093
3094 if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
3095 if (bld->num_mips != int_coord_bld->type.length) {
3096 ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
3097 perquadi_bld->type, explicit_lod, 0);
3098 }
3099 else {
3100 ilevel = explicit_lod;
3101 }
3102 lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
3103 out_of_bound_ret_zero ? &out_of_bounds : NULL);
3104 }
3105 else {
3106 assert(bld->num_mips == 1);
3107 if (bld->static_texture_state->target != PIPE_BUFFER) {
3108 ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
3109 bld->context_ptr, texture_unit, NULL);
3110 }
3111 else {
3112 ilevel = lp_build_const_int32(bld->gallivm, 0);
3113 }
3114 }
3115 lp_build_mipmap_level_sizes(bld, ilevel,
3116 &size,
3117 &row_stride_vec, &img_stride_vec);
3118 lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
3119 size, &width, &height, &depth);
3120
3121 if (target == PIPE_TEXTURE_1D_ARRAY ||
3122 target == PIPE_TEXTURE_2D_ARRAY) {
3123 if (out_of_bound_ret_zero) {
3124 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
3125 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3126 }
3127 else {
3128 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
3129 }
3130 }
3131
3132 /* This is a lot like border sampling */
3133 if (offsets[0]) {
3134 /*
3135 * coords are really unsigned, offsets are signed, but I don't think
3136 * exceeding 31 bits is possible
3137 */
3138 x = lp_build_add(int_coord_bld, x, offsets[0]);
3139 }
3140 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
3141 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3142 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
3143 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3144
3145 if (dims >= 2) {
3146 if (offsets[1]) {
3147 y = lp_build_add(int_coord_bld, y, offsets[1]);
3148 }
3149 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
3150 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3151 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
3152 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3153
3154 if (dims >= 3) {
3155 if (offsets[2]) {
3156 z = lp_build_add(int_coord_bld, z, offsets[2]);
3157 }
3158 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
3159 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3160 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
3161 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3162 }
3163 }
3164
3165 lp_build_sample_offset(int_coord_bld,
3166 bld->format_desc,
3167 x, y, z, row_stride_vec, img_stride_vec,
3168 &offset, &i, &j);
3169
3170 if (bld->static_texture_state->target != PIPE_BUFFER) {
3171 offset = lp_build_add(int_coord_bld, offset,
3172 lp_build_get_mip_offsets(bld, ilevel));
3173 }
3174
3175 if (bld->fetch_ms) {
3176 LLVMValueRef num_samples;
3177 num_samples = bld->dynamic_state->num_samples(bld->dynamic_state, bld->gallivm,
3178 bld->context_ptr, texture_unit, NULL);
3179 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, ms_index, int_coord_bld->zero);
3180 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3181 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(int_coord_bld, num_samples));
3182 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3183 offset = lp_build_add(int_coord_bld, offset,
3184 lp_build_mul(int_coord_bld, bld->sample_stride, ms_index));
3185 }
3186
3187 offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
3188
3189 lp_build_fetch_rgba_soa(bld->gallivm,
3190 bld->format_desc,
3191 bld->texel_type, TRUE,
3192 bld->base_ptr, offset,
3193 i, j,
3194 bld->cache,
3195 colors_out);
3196
3197 if (out_of_bound_ret_zero) {
3198 /*
3199 * Only needed for ARB_robust_buffer_access_behavior and d3d10.
3200 * Could use min/max above instead of out-of-bounds comparisons
3201 * if we don't care about the result returned for out-of-bounds.
3202 */
3203 for (chan = 0; chan < 4; chan++) {
3204 colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
3205 bld->texel_bld.zero, colors_out[chan]);
3206 }
3207 }
3208 }
3209
3210
3211 /**
3212 * Just set texels to white instead of actually sampling the texture.
3213 * For debugging.
3214 */
3215 void
lp_build_sample_nop(struct gallivm_state * gallivm,struct lp_type type,const LLVMValueRef * coords,LLVMValueRef texel_out[4])3216 lp_build_sample_nop(struct gallivm_state *gallivm,
3217 struct lp_type type,
3218 const LLVMValueRef *coords,
3219 LLVMValueRef texel_out[4])
3220 {
3221 LLVMValueRef one = lp_build_one(gallivm, type);
3222 unsigned chan;
3223
3224 for (chan = 0; chan < 4; chan++) {
3225 texel_out[chan] = one;
3226 }
3227 }
3228
3229 static struct lp_type
lp_build_texel_type(struct lp_type texel_type,const struct util_format_description * format_desc)3230 lp_build_texel_type(struct lp_type texel_type,
3231 const struct util_format_description *format_desc)
3232 {
3233 /* always using the first channel hopefully should be safe,
3234 * if not things WILL break in other places anyway.
3235 */
3236 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
3237 format_desc->channel[0].pure_integer) {
3238 if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
3239 texel_type = lp_type_int_vec(texel_type.width, texel_type.width * texel_type.length);
3240 } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
3241 texel_type = lp_type_uint_vec(texel_type.width, texel_type.width * texel_type.length);
3242 }
3243 }
3244 else if (util_format_has_stencil(format_desc) &&
3245 !util_format_has_depth(format_desc)) {
3246 /* for stencil only formats, sample stencil (uint) */
3247 texel_type = lp_type_uint_vec(texel_type.width, texel_type.width * texel_type.length);
3248 }
3249 return texel_type;
3250 }
3251
3252
3253 /**
3254 * Build the actual texture sampling code.
3255 * 'texel' will return a vector of four LLVMValueRefs corresponding to
3256 * R, G, B, A.
3257 * \param type vector float type to use for coords, etc.
3258 * \param sample_key
3259 * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
3260 */
3261 static void
lp_build_sample_soa_code(struct gallivm_state * gallivm,const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,struct lp_type type,unsigned sample_key,unsigned texture_index,unsigned sampler_index,LLVMValueRef context_ptr,LLVMValueRef thread_data_ptr,const LLVMValueRef * coords,const LLVMValueRef * offsets,const struct lp_derivatives * derivs,LLVMValueRef lod,LLVMValueRef ms_index,LLVMValueRef aniso_filter_table,LLVMValueRef texel_out[4])3262 lp_build_sample_soa_code(struct gallivm_state *gallivm,
3263 const struct lp_static_texture_state *static_texture_state,
3264 const struct lp_static_sampler_state *static_sampler_state,
3265 struct lp_sampler_dynamic_state *dynamic_state,
3266 struct lp_type type,
3267 unsigned sample_key,
3268 unsigned texture_index,
3269 unsigned sampler_index,
3270 LLVMValueRef context_ptr,
3271 LLVMValueRef thread_data_ptr,
3272 const LLVMValueRef *coords,
3273 const LLVMValueRef *offsets,
3274 const struct lp_derivatives *derivs, /* optional */
3275 LLVMValueRef lod, /* optional */
3276 LLVMValueRef ms_index, /* optional */
3277 LLVMValueRef aniso_filter_table,
3278 LLVMValueRef texel_out[4])
3279 {
3280 unsigned target = static_texture_state->target;
3281 unsigned dims = texture_dims(target);
3282 unsigned num_quads = type.length / 4;
3283 unsigned mip_filter, min_img_filter, mag_img_filter, i;
3284 struct lp_build_sample_context bld;
3285 struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
3286 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
3287 LLVMBuilderRef builder = gallivm->builder;
3288 LLVMValueRef tex_width, newcoords[5];
3289 enum lp_sampler_lod_property lod_property;
3290 enum lp_sampler_lod_control lod_control;
3291 enum lp_sampler_op_type op_type;
3292 LLVMValueRef lod_bias = NULL;
3293 LLVMValueRef explicit_lod = NULL;
3294 boolean op_is_tex, op_is_lodq, op_is_gather, fetch_ms;
3295
3296 if (0) {
3297 enum pipe_format fmt = static_texture_state->format;
3298 debug_printf("Sample from %s\n", util_format_name(fmt));
3299 }
3300
3301 lod_property = (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
3302 LP_SAMPLER_LOD_PROPERTY_SHIFT;
3303 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3304 LP_SAMPLER_LOD_CONTROL_SHIFT;
3305 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3306 LP_SAMPLER_OP_TYPE_SHIFT;
3307 fetch_ms = !!(sample_key & LP_SAMPLER_FETCH_MS);
3308
3309 op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
3310 op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
3311 op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
3312
3313 if (lod_control == LP_SAMPLER_LOD_BIAS) {
3314 lod_bias = lod;
3315 assert(lod);
3316 assert(derivs == NULL);
3317 }
3318 else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3319 explicit_lod = lod;
3320 assert(lod);
3321 assert(derivs == NULL);
3322 }
3323 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3324 assert(derivs);
3325 assert(lod == NULL);
3326 }
3327 else {
3328 assert(derivs == NULL);
3329 assert(lod == NULL);
3330 }
3331
3332 if (static_texture_state->format == PIPE_FORMAT_NONE) {
3333 /*
3334 * If there's nothing bound, format is NONE, and we must return
3335 * all zero as mandated by d3d10 in this case.
3336 */
3337 unsigned chan;
3338 LLVMValueRef zero = lp_build_zero(gallivm, type);
3339 for (chan = 0; chan < 4; chan++) {
3340 texel_out[chan] = zero;
3341 }
3342 return;
3343 }
3344
3345 assert(type.floating);
3346
3347 /* Setup our build context */
3348 memset(&bld, 0, sizeof bld);
3349 bld.gallivm = gallivm;
3350 bld.context_ptr = context_ptr;
3351 bld.aniso_filter_table = aniso_filter_table;
3352 bld.static_sampler_state = &derived_sampler_state;
3353 bld.static_texture_state = static_texture_state;
3354 bld.dynamic_state = dynamic_state;
3355 bld.format_desc = util_format_description(static_texture_state->format);
3356 bld.dims = dims;
3357
3358 if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
3359 bld.no_quad_lod = TRUE;
3360 }
3361 if (!(gallivm_perf & GALLIVM_PERF_RHO_APPROX) || op_is_lodq) {
3362 bld.no_rho_approx = TRUE;
3363 }
3364 if (!(gallivm_perf & GALLIVM_PERF_BRILINEAR) || op_is_lodq || lod_bias || explicit_lod) {
3365 bld.no_brilinear = TRUE;
3366 }
3367
3368 bld.vector_width = lp_type_width(type);
3369
3370 bld.float_type = lp_type_float(32);
3371 bld.int_type = lp_type_int(32);
3372 bld.coord_type = type;
3373 bld.int_coord_type = lp_int_type(type);
3374 bld.float_size_in_type = lp_type_float(32);
3375 bld.float_size_in_type.length = dims > 1 ? 4 : 1;
3376 bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
3377
3378 bld.texel_type = lp_build_texel_type(type, bld.format_desc);
3379
3380 if (!static_texture_state->level_zero_only ||
3381 !static_sampler_state->max_lod_pos || op_is_lodq) {
3382 derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
3383 } else {
3384 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3385 }
3386 if (op_is_gather) {
3387 /*
3388 * gather4 is exactly like GL_LINEAR filtering but in the end skipping
3389 * the actual filtering. Using mostly the same paths, so cube face
3390 * selection, coord wrapping etc. all naturally uses the same code.
3391 */
3392 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3393 derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
3394 derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
3395 }
3396 mip_filter = derived_sampler_state.min_mip_filter;
3397
3398 if (0) {
3399 debug_printf(" .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
3400 }
3401
3402 if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3403 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
3404 {
3405 /*
3406 * Seamless filtering ignores wrap modes.
3407 * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
3408 * bilinear it's not correct but way better than using for instance repeat.
3409 * Note we even set this for non-seamless. Technically GL allows any wrap
3410 * mode, which made sense when supporting true borders (can get seamless
3411 * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
3412 * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
3413 * up the sampler state (as it makes it texture dependent).
3414 */
3415 derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
3416 derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
3417 }
3418 /*
3419 * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
3420 * so AoS path could be used. Not sure it's worth the trouble...
3421 */
3422
3423 min_img_filter = derived_sampler_state.min_img_filter;
3424 mag_img_filter = derived_sampler_state.mag_img_filter;
3425
3426
3427 /*
3428 * This is all a bit complicated different paths are chosen for performance
3429 * reasons.
3430 * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
3431 * everything (the last two options are equivalent for 4-wide case).
3432 * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
3433 * lod is calculated then the lod value extracted afterwards so making this
3434 * case basically the same as far as lod handling is concerned for the
3435 * further sample/filter code as the 1 lod for everything case.
3436 * Different lod handling mostly shows up when building mipmap sizes
3437 * (lp_build_mipmap_level_sizes() and friends) and also in filtering
3438 * (getting the fractional part of the lod to the right texels).
3439 */
3440
3441 /*
3442 * There are other situations where at least the multiple int lods could be
3443 * avoided like min and max lod being equal.
3444 */
3445 bld.num_mips = bld.num_lods = 1;
3446
3447 if (bld.no_quad_lod && bld.no_rho_approx &&
3448 ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
3449 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3450 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
3451 op_is_lodq)) {
3452 /*
3453 * special case for using per-pixel lod even for implicit lod,
3454 * which is generally never required (ok by APIs) except to please
3455 * some (somewhat broken imho) tests (because per-pixel face selection
3456 * can cause derivatives to be different for pixels outside the primitive
3457 * due to the major axis division even if pre-project derivatives are
3458 * looking normal).
3459 * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
3460 * cube maps we do indeed get per-pixel lod values).
3461 */
3462 bld.num_mips = type.length;
3463 bld.num_lods = type.length;
3464 }
3465 else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
3466 (explicit_lod || lod_bias || derivs)) {
3467 if ((!op_is_tex && target != PIPE_BUFFER) ||
3468 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3469 bld.num_mips = type.length;
3470 bld.num_lods = type.length;
3471 }
3472 else if (op_is_tex && min_img_filter != mag_img_filter) {
3473 bld.num_mips = 1;
3474 bld.num_lods = type.length;
3475 }
3476 }
3477 /* TODO: for true scalar_lod should only use 1 lod value */
3478 else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
3479 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3480 bld.num_mips = num_quads;
3481 bld.num_lods = num_quads;
3482 }
3483 else if (op_is_tex && min_img_filter != mag_img_filter) {
3484 bld.num_mips = 1;
3485 bld.num_lods = num_quads;
3486 }
3487
3488 bld.fetch_ms = fetch_ms;
3489 if (op_is_gather)
3490 bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
3491 bld.lodf_type = type;
3492 /* we want native vector size to be able to use our intrinsics */
3493 if (bld.num_lods != type.length) {
3494 /* TODO: this currently always has to be per-quad or per-element */
3495 bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
3496 }
3497 bld.lodi_type = lp_int_type(bld.lodf_type);
3498 bld.levelf_type = bld.lodf_type;
3499 if (bld.num_mips == 1) {
3500 bld.levelf_type.length = 1;
3501 }
3502 bld.leveli_type = lp_int_type(bld.levelf_type);
3503 bld.float_size_type = bld.float_size_in_type;
3504 /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
3505 * with per-element lod that is w0/h0/d0/_/w1/h1/d1_/... so up to 8x4f32 */
3506 if (bld.num_mips > 1) {
3507 bld.float_size_type.length = bld.num_mips == type.length ?
3508 bld.num_mips * bld.float_size_in_type.length :
3509 type.length;
3510 }
3511 bld.int_size_type = lp_int_type(bld.float_size_type);
3512
3513 lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
3514 lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3515 lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3516 lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3517 lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3518 lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3519 lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3520 lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3521 lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3522 lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3523 lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3524 lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3525 lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3526 lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3527
3528 /* Get the dynamic state */
3529 tex_width = dynamic_state->width(dynamic_state, gallivm,
3530 context_ptr, texture_index, NULL);
3531 bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3532 context_ptr, texture_index, NULL);
3533 bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3534 context_ptr, texture_index, NULL);
3535 bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3536 context_ptr, texture_index, NULL);
3537 bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3538 context_ptr, texture_index, NULL);
3539
3540 if (fetch_ms)
3541 bld.sample_stride = lp_build_broadcast_scalar(&bld.int_coord_bld, dynamic_state->sample_stride(dynamic_state, gallivm,
3542 context_ptr, texture_index, NULL));
3543 /* Note that mip_offsets is an array[level] of offsets to texture images */
3544
3545 if (dynamic_state->cache_ptr && thread_data_ptr) {
3546 bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3547 thread_data_ptr, texture_index);
3548 }
3549
3550 /* width, height, depth as single int vector */
3551 if (dims <= 1) {
3552 bld.int_size = tex_width;
3553 }
3554 else {
3555 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3556 tex_width,
3557 LLVMConstInt(i32t, 0, 0), "");
3558 if (dims >= 2) {
3559 LLVMValueRef tex_height =
3560 dynamic_state->height(dynamic_state, gallivm,
3561 context_ptr, texture_index, NULL);
3562 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3563 tex_height,
3564 LLVMConstInt(i32t, 1, 0), "");
3565 if (dims >= 3) {
3566 LLVMValueRef tex_depth =
3567 dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3568 texture_index, NULL);
3569 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3570 tex_depth,
3571 LLVMConstInt(i32t, 2, 0), "");
3572 }
3573 }
3574 }
3575
3576 for (i = 0; i < 5; i++) {
3577 newcoords[i] = coords[i];
3578 }
3579
3580 if (util_format_is_pure_integer(static_texture_state->format) &&
3581 !util_format_has_depth(bld.format_desc) && op_is_tex &&
3582 (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3583 static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3584 static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3585 /*
3586 * Bail if impossible filtering is specified (the awkard additional
3587 * depth check is because it is legal in gallium to have things like S8Z24
3588 * here which would say it's pure int despite such formats should sample
3589 * the depth component).
3590 * In GL such filters make the texture incomplete, this makes it robust
3591 * against gallium frontends which set this up regardless (we'd crash in the
3592 * lerp later otherwise).
3593 * At least in some apis it may be legal to use such filters with lod
3594 * queries and/or gather (at least for gather d3d10 says only the wrap
3595 * bits are really used hence filter bits are likely simply ignored).
3596 * For fetch, we don't get valid samplers either way here.
3597 */
3598 unsigned chan;
3599 LLVMValueRef zero = lp_build_zero(gallivm, type);
3600 for (chan = 0; chan < 4; chan++) {
3601 texel_out[chan] = zero;
3602 }
3603 return;
3604 }
3605
3606 if (0) {
3607 /* For debug: no-op texture sampling */
3608 lp_build_sample_nop(gallivm,
3609 bld.texel_type,
3610 newcoords,
3611 texel_out);
3612 }
3613
3614 else if (op_type == LP_SAMPLER_OP_FETCH) {
3615 lp_build_fetch_texel(&bld, texture_index, ms_index, newcoords,
3616 lod, offsets,
3617 texel_out);
3618 }
3619
3620 else {
3621 LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3622 LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3623 boolean use_aos;
3624
3625 use_aos = util_format_fits_8unorm(bld.format_desc) &&
3626 op_is_tex &&
3627 /* not sure this is strictly needed or simply impossible */
3628 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3629 derived_sampler_state.aniso == 0 &&
3630 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3631
3632 use_aos &= bld.num_lods <= num_quads ||
3633 derived_sampler_state.min_img_filter ==
3634 derived_sampler_state.mag_img_filter;
3635
3636 if(gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
3637 use_aos = 0;
3638 }
3639
3640 if (dims > 1) {
3641 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3642 if (dims > 2) {
3643 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3644 }
3645 }
3646 if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3647 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3648 derived_sampler_state.seamless_cube_map &&
3649 (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3650 derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3651 /* theoretically possible with AoS filtering but not implemented (complex!) */
3652 use_aos = 0;
3653 }
3654
3655 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3656 !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3657 debug_printf("%s: using floating point linear filtering for %s\n",
3658 __FUNCTION__, bld.format_desc->short_name);
3659 debug_printf(" min_img %d mag_img %d mip %d target %d seamless %d"
3660 " wraps %d wrapt %d wrapr %d\n",
3661 derived_sampler_state.min_img_filter,
3662 derived_sampler_state.mag_img_filter,
3663 derived_sampler_state.min_mip_filter,
3664 static_texture_state->target,
3665 derived_sampler_state.seamless_cube_map,
3666 derived_sampler_state.wrap_s,
3667 derived_sampler_state.wrap_t,
3668 derived_sampler_state.wrap_r);
3669 }
3670
3671 lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3672 newcoords,
3673 derivs, lod_bias, explicit_lod,
3674 &lod_positive, &lod, &lod_fpart,
3675 &ilevel0, &ilevel1);
3676
3677 if (op_is_lodq) {
3678 texel_out[0] = lod_fpart;
3679 texel_out[1] = lod;
3680 texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3681 return;
3682 }
3683
3684 if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3685 /* The aos path doesn't do seamless filtering so simply add cube layer
3686 * to face now.
3687 */
3688 newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3689 }
3690
3691 /*
3692 * we only try 8-wide sampling with soa or if we have AVX2
3693 * as it appears to be a loss with just AVX)
3694 */
3695 if (num_quads == 1 || !use_aos ||
3696 (util_get_cpu_caps()->has_avx2 &&
3697 (bld.num_lods == 1 ||
3698 derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3699 if (use_aos) {
3700 /* do sampling/filtering with fixed pt arithmetic */
3701 lp_build_sample_aos(&bld, sampler_index,
3702 newcoords[0], newcoords[1],
3703 newcoords[2],
3704 offsets, lod_positive, lod_fpart,
3705 ilevel0, ilevel1,
3706 texel_out);
3707 }
3708
3709 else {
3710 lp_build_sample_general(&bld, sampler_index,
3711 op_type == LP_SAMPLER_OP_GATHER,
3712 newcoords, offsets,
3713 lod_positive, lod_fpart,
3714 ilevel0, ilevel1,
3715 texel_out);
3716 }
3717 }
3718 else {
3719 unsigned j;
3720 struct lp_build_sample_context bld4;
3721 struct lp_type type4 = type;
3722 unsigned i;
3723 LLVMValueRef texelout4[4];
3724 LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3725
3726 type4.length = 4;
3727
3728 /* Setup our build context */
3729 memset(&bld4, 0, sizeof bld4);
3730 bld4.no_quad_lod = bld.no_quad_lod;
3731 bld4.no_rho_approx = bld.no_rho_approx;
3732 bld4.no_brilinear = bld.no_brilinear;
3733 bld4.gallivm = bld.gallivm;
3734 bld4.context_ptr = bld.context_ptr;
3735 bld4.aniso_filter_table = aniso_filter_table;
3736 bld4.static_texture_state = bld.static_texture_state;
3737 bld4.static_sampler_state = bld.static_sampler_state;
3738 bld4.dynamic_state = bld.dynamic_state;
3739 bld4.format_desc = bld.format_desc;
3740 bld4.dims = bld.dims;
3741 bld4.row_stride_array = bld.row_stride_array;
3742 bld4.img_stride_array = bld.img_stride_array;
3743 bld4.base_ptr = bld.base_ptr;
3744 bld4.mip_offsets = bld.mip_offsets;
3745 bld4.int_size = bld.int_size;
3746 bld4.cache = bld.cache;
3747
3748 bld4.vector_width = lp_type_width(type4);
3749
3750 bld4.float_type = lp_type_float(32);
3751 bld4.int_type = lp_type_int(32);
3752 bld4.coord_type = type4;
3753 bld4.int_coord_type = lp_int_type(type4);
3754 bld4.float_size_in_type = lp_type_float(32);
3755 bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3756 bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3757 bld4.texel_type = bld.texel_type;
3758 bld4.texel_type.length = 4;
3759
3760 bld4.num_mips = bld4.num_lods = 1;
3761 if (bld4.no_quad_lod && bld4.no_rho_approx &&
3762 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3763 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3764 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3765 bld4.num_mips = type4.length;
3766 bld4.num_lods = type4.length;
3767 }
3768 if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3769 (explicit_lod || lod_bias || derivs)) {
3770 if ((!op_is_tex && target != PIPE_BUFFER) ||
3771 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3772 bld4.num_mips = type4.length;
3773 bld4.num_lods = type4.length;
3774 }
3775 else if (op_is_tex && min_img_filter != mag_img_filter) {
3776 bld4.num_mips = 1;
3777 bld4.num_lods = type4.length;
3778 }
3779 }
3780
3781 /* we want native vector size to be able to use our intrinsics */
3782 bld4.lodf_type = type4;
3783 if (bld4.num_lods != type4.length) {
3784 bld4.lodf_type.length = 1;
3785 }
3786 bld4.lodi_type = lp_int_type(bld4.lodf_type);
3787 bld4.levelf_type = type4;
3788 if (bld4.num_mips != type4.length) {
3789 bld4.levelf_type.length = 1;
3790 }
3791 bld4.leveli_type = lp_int_type(bld4.levelf_type);
3792 bld4.float_size_type = bld4.float_size_in_type;
3793 if (bld4.num_mips > 1) {
3794 bld4.float_size_type.length = bld4.num_mips == type4.length ?
3795 bld4.num_mips * bld4.float_size_in_type.length :
3796 type4.length;
3797 }
3798 bld4.int_size_type = lp_int_type(bld4.float_size_type);
3799
3800 lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3801 lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3802 lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3803 lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3804 lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3805 lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3806 lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3807 lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3808 lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3809 lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3810 lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3811 lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3812 lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3813 lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3814
3815 for (i = 0; i < num_quads; i++) {
3816 LLVMValueRef s4, t4, r4;
3817 LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3818 LLVMValueRef ilevel04, ilevel14 = NULL;
3819 LLVMValueRef offsets4[4] = { NULL };
3820 unsigned num_lods = bld4.num_lods;
3821
3822 s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3823 t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3824 r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3825
3826 if (offsets[0]) {
3827 offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3828 if (dims > 1) {
3829 offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3830 if (dims > 2) {
3831 offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3832 }
3833 }
3834 }
3835 lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3836 ilevel04 = bld.num_mips == 1 ? ilevel0 :
3837 lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3838 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3839 ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3840 lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3841 }
3842
3843 if (use_aos) {
3844 /* do sampling/filtering with fixed pt arithmetic */
3845 lp_build_sample_aos(&bld4, sampler_index,
3846 s4, t4, r4, offsets4,
3847 lod_positive4, lod_fpart4,
3848 ilevel04, ilevel14,
3849 texelout4);
3850 }
3851
3852 else {
3853 /* this path is currently unreachable and hence might break easily... */
3854 LLVMValueRef newcoords4[5];
3855 newcoords4[0] = s4;
3856 newcoords4[1] = t4;
3857 newcoords4[2] = r4;
3858 newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3859 newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3860
3861 lp_build_sample_general(&bld4, sampler_index,
3862 op_type == LP_SAMPLER_OP_GATHER,
3863 newcoords4, offsets4,
3864 lod_positive4, lod_fpart4,
3865 ilevel04, ilevel14,
3866 texelout4);
3867 }
3868 for (j = 0; j < 4; j++) {
3869 texelouttmp[j][i] = texelout4[j];
3870 }
3871 }
3872
3873 for (j = 0; j < 4; j++) {
3874 texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3875 }
3876 }
3877 }
3878
3879 if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3880 apply_sampler_swizzle(&bld, texel_out);
3881 }
3882
3883 /*
3884 * texel type can be a (32bit) int/uint (for pure int formats only),
3885 * however we are expected to always return floats (storage is untyped).
3886 */
3887 if (!bld.texel_type.floating) {
3888 unsigned chan;
3889 for (chan = 0; chan < 4; chan++) {
3890 texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3891 lp_build_vec_type(gallivm, type), "");
3892 }
3893 }
3894 }
3895
3896
3897 #define USE_TEX_FUNC_CALL 1
3898
3899 #define LP_MAX_TEX_FUNC_ARGS 32
3900
3901 static inline void
get_target_info(enum pipe_texture_target target,unsigned * num_coords,unsigned * num_derivs,unsigned * num_offsets,unsigned * layer)3902 get_target_info(enum pipe_texture_target target,
3903 unsigned *num_coords, unsigned *num_derivs,
3904 unsigned *num_offsets, unsigned *layer)
3905 {
3906 unsigned dims = texture_dims(target);
3907 *num_coords = dims;
3908 *num_offsets = dims;
3909 *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3910 target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3911 *layer = has_layer_coord(target) ? 2: 0;
3912 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3913 /*
3914 * dims doesn't include r coord for cubes - this is handled
3915 * by layer instead, but need to fix up for cube arrays...
3916 */
3917 *layer = 3;
3918 *num_coords = 3;
3919 }
3920 }
3921
3922
3923 /**
3924 * Generate the function body for a texture sampling function.
3925 */
3926 static void
lp_build_sample_gen_func(struct gallivm_state * gallivm,const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,struct lp_type type,unsigned texture_index,unsigned sampler_index,LLVMValueRef function,unsigned num_args,unsigned sample_key,bool has_aniso_filter_table)3927 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3928 const struct lp_static_texture_state *static_texture_state,
3929 const struct lp_static_sampler_state *static_sampler_state,
3930 struct lp_sampler_dynamic_state *dynamic_state,
3931 struct lp_type type,
3932 unsigned texture_index,
3933 unsigned sampler_index,
3934 LLVMValueRef function,
3935 unsigned num_args,
3936 unsigned sample_key,
3937 bool has_aniso_filter_table)
3938 {
3939 LLVMBuilderRef old_builder;
3940 LLVMBasicBlockRef block;
3941 LLVMValueRef coords[5];
3942 LLVMValueRef offsets[3] = { NULL };
3943 LLVMValueRef lod = NULL;
3944 LLVMValueRef ms_index = NULL;
3945 LLVMValueRef context_ptr;
3946 LLVMValueRef thread_data_ptr = NULL;
3947 LLVMValueRef aniso_filter_table = NULL;
3948 LLVMValueRef texel_out[4];
3949 struct lp_derivatives derivs;
3950 struct lp_derivatives *deriv_ptr = NULL;
3951 unsigned num_param = 0;
3952 unsigned i, num_coords, num_derivs, num_offsets, layer;
3953 enum lp_sampler_lod_control lod_control;
3954 enum lp_sampler_op_type op_type;
3955 boolean need_cache = FALSE;
3956
3957 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3958 LP_SAMPLER_LOD_CONTROL_SHIFT;
3959
3960 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3961 LP_SAMPLER_OP_TYPE_SHIFT;
3962
3963 get_target_info(static_texture_state->target,
3964 &num_coords, &num_derivs, &num_offsets, &layer);
3965
3966 /* lod query doesn't take a layer */
3967 if (layer && op_type == LP_SAMPLER_OP_LODQ)
3968 layer = 0;
3969
3970 if (dynamic_state->cache_ptr) {
3971 const struct util_format_description *format_desc;
3972 format_desc = util_format_description(static_texture_state->format);
3973 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
3974 need_cache = TRUE;
3975 }
3976 }
3977
3978 /* "unpack" arguments */
3979 context_ptr = LLVMGetParam(function, num_param++);
3980 if (has_aniso_filter_table)
3981 aniso_filter_table = LLVMGetParam(function, num_param++);
3982 if (need_cache) {
3983 thread_data_ptr = LLVMGetParam(function, num_param++);
3984 }
3985 for (i = 0; i < num_coords; i++) {
3986 coords[i] = LLVMGetParam(function, num_param++);
3987 }
3988 for (i = num_coords; i < 5; i++) {
3989 /* This is rather unfortunate... */
3990 coords[i] = lp_build_undef(gallivm, type);
3991 }
3992 if (layer) {
3993 coords[layer] = LLVMGetParam(function, num_param++);
3994 }
3995 if (sample_key & LP_SAMPLER_SHADOW) {
3996 coords[4] = LLVMGetParam(function, num_param++);
3997 }
3998 if (sample_key & LP_SAMPLER_FETCH_MS) {
3999 ms_index = LLVMGetParam(function, num_param++);
4000 }
4001 if (sample_key & LP_SAMPLER_OFFSETS) {
4002 for (i = 0; i < num_offsets; i++) {
4003 offsets[i] = LLVMGetParam(function, num_param++);
4004 }
4005 }
4006 if (lod_control == LP_SAMPLER_LOD_BIAS ||
4007 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4008 lod = LLVMGetParam(function, num_param++);
4009 }
4010 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4011 for (i = 0; i < num_derivs; i++) {
4012 derivs.ddx[i] = LLVMGetParam(function, num_param++);
4013 derivs.ddy[i] = LLVMGetParam(function, num_param++);
4014 }
4015 deriv_ptr = &derivs;
4016 }
4017
4018 assert(num_args == num_param);
4019
4020 /*
4021 * Function body
4022 */
4023
4024 old_builder = gallivm->builder;
4025 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
4026 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
4027 LLVMPositionBuilderAtEnd(gallivm->builder, block);
4028
4029 lp_build_sample_soa_code(gallivm,
4030 static_texture_state,
4031 static_sampler_state,
4032 dynamic_state,
4033 type,
4034 sample_key,
4035 texture_index,
4036 sampler_index,
4037 context_ptr,
4038 thread_data_ptr,
4039 coords,
4040 offsets,
4041 deriv_ptr,
4042 lod,
4043 ms_index,
4044 aniso_filter_table,
4045 texel_out);
4046
4047 LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
4048
4049 LLVMDisposeBuilder(gallivm->builder);
4050 gallivm->builder = old_builder;
4051
4052 gallivm_verify_function(gallivm, function);
4053 }
4054
4055
4056 /**
4057 * Call the matching function for texture sampling.
4058 * If there's no match, generate a new one.
4059 */
4060 static void
lp_build_sample_soa_func(struct gallivm_state * gallivm,const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,const struct lp_sampler_params * params,int texture_index,int sampler_index,LLVMValueRef * tex_ret)4061 lp_build_sample_soa_func(struct gallivm_state *gallivm,
4062 const struct lp_static_texture_state *static_texture_state,
4063 const struct lp_static_sampler_state *static_sampler_state,
4064 struct lp_sampler_dynamic_state *dynamic_state,
4065 const struct lp_sampler_params *params,
4066 int texture_index, int sampler_index,
4067 LLVMValueRef *tex_ret)
4068 {
4069 LLVMBuilderRef builder = gallivm->builder;
4070 LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
4071 LLVMGetInsertBlock(builder)));
4072 LLVMValueRef function, inst;
4073 LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
4074 LLVMBasicBlockRef bb;
4075 unsigned num_args = 0;
4076 char func_name[64];
4077 unsigned i, num_coords, num_derivs, num_offsets, layer;
4078 unsigned sample_key = params->sample_key;
4079 const LLVMValueRef *coords = params->coords;
4080 const LLVMValueRef *offsets = params->offsets;
4081 const struct lp_derivatives *derivs = params->derivs;
4082 enum lp_sampler_lod_control lod_control;
4083 enum lp_sampler_op_type op_type;
4084 boolean need_cache = FALSE;
4085
4086 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
4087 LP_SAMPLER_LOD_CONTROL_SHIFT;
4088
4089 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
4090 LP_SAMPLER_OP_TYPE_SHIFT;
4091
4092 get_target_info(static_texture_state->target,
4093 &num_coords, &num_derivs, &num_offsets, &layer);
4094
4095 /* lod query doesn't take a layer */
4096 if (layer && op_type == LP_SAMPLER_OP_LODQ)
4097 layer = 0;
4098
4099 if (dynamic_state->cache_ptr) {
4100 const struct util_format_description *format_desc;
4101 format_desc = util_format_description(static_texture_state->format);
4102 if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
4103 need_cache = TRUE;
4104 }
4105 }
4106 /*
4107 * texture function matches are found by name.
4108 * Thus the name has to include both the texture and sampler unit
4109 * (which covers all static state) plus the actual texture function
4110 * (including things like offsets, shadow coord, lod control).
4111 * Additionally lod_property has to be included too.
4112 */
4113
4114 snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
4115 texture_index, sampler_index, sample_key);
4116
4117 function = LLVMGetNamedFunction(module, func_name);
4118
4119 if(!function) {
4120 LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
4121 LLVMTypeRef ret_type;
4122 LLVMTypeRef function_type;
4123 LLVMTypeRef val_type[4];
4124 unsigned num_param = 0;
4125
4126 /*
4127 * Generate the function prototype.
4128 */
4129
4130 arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
4131 if (params->aniso_filter_table)
4132 arg_types[num_param++] = LLVMTypeOf(params->aniso_filter_table);
4133 if (need_cache) {
4134 arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
4135 }
4136 for (i = 0; i < num_coords; i++) {
4137 arg_types[num_param++] = LLVMTypeOf(coords[0]);
4138 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
4139 }
4140 if (layer) {
4141 arg_types[num_param++] = LLVMTypeOf(coords[layer]);
4142 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
4143 }
4144 if (sample_key & LP_SAMPLER_SHADOW) {
4145 arg_types[num_param++] = LLVMTypeOf(coords[0]);
4146 }
4147 if (sample_key & LP_SAMPLER_FETCH_MS) {
4148 arg_types[num_param++] = LLVMTypeOf(params->ms_index);
4149 }
4150 if (sample_key & LP_SAMPLER_OFFSETS) {
4151 for (i = 0; i < num_offsets; i++) {
4152 arg_types[num_param++] = LLVMTypeOf(offsets[0]);
4153 assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
4154 }
4155 }
4156 if (lod_control == LP_SAMPLER_LOD_BIAS ||
4157 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4158 arg_types[num_param++] = LLVMTypeOf(params->lod);
4159 }
4160 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4161 for (i = 0; i < num_derivs; i++) {
4162 arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
4163 arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
4164 assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
4165 assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
4166 }
4167 }
4168
4169 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
4170 lp_build_vec_type(gallivm, params->type);
4171 ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
4172 function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
4173 function = LLVMAddFunction(module, func_name, function_type);
4174
4175 for (i = 0; i < num_param; ++i) {
4176 if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
4177
4178 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
4179 }
4180 }
4181
4182 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
4183 LLVMSetLinkage(function, LLVMInternalLinkage);
4184
4185 lp_build_sample_gen_func(gallivm,
4186 static_texture_state,
4187 static_sampler_state,
4188 dynamic_state,
4189 params->type,
4190 texture_index,
4191 sampler_index,
4192 function,
4193 num_param,
4194 sample_key,
4195 params->aniso_filter_table ? true : false);
4196 }
4197
4198 num_args = 0;
4199 args[num_args++] = params->context_ptr;
4200 if (params->aniso_filter_table)
4201 args[num_args++] = params->aniso_filter_table;
4202 if (need_cache) {
4203 args[num_args++] = params->thread_data_ptr;
4204 }
4205 for (i = 0; i < num_coords; i++) {
4206 args[num_args++] = coords[i];
4207 }
4208 if (layer) {
4209 args[num_args++] = coords[layer];
4210 }
4211 if (sample_key & LP_SAMPLER_SHADOW) {
4212 args[num_args++] = coords[4];
4213 }
4214 if (sample_key & LP_SAMPLER_FETCH_MS) {
4215 args[num_args++] = params->ms_index;
4216 }
4217 if (sample_key & LP_SAMPLER_OFFSETS) {
4218 for (i = 0; i < num_offsets; i++) {
4219 args[num_args++] = offsets[i];
4220 }
4221 }
4222 if (lod_control == LP_SAMPLER_LOD_BIAS ||
4223 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4224 args[num_args++] = params->lod;
4225 }
4226 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4227 for (i = 0; i < num_derivs; i++) {
4228 args[num_args++] = derivs->ddx[i];
4229 args[num_args++] = derivs->ddy[i];
4230 }
4231 }
4232
4233 assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
4234
4235 *tex_ret = LLVMBuildCall(builder, function, args, num_args, "");
4236 bb = LLVMGetInsertBlock(builder);
4237 inst = LLVMGetLastInstruction(bb);
4238 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
4239
4240 }
4241
4242
4243 /**
4244 * Build texture sampling code.
4245 * Either via a function call or inline it directly.
4246 */
4247 void
lp_build_sample_soa(const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_state,struct gallivm_state * gallivm,const struct lp_sampler_params * params)4248 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
4249 const struct lp_static_sampler_state *static_sampler_state,
4250 struct lp_sampler_dynamic_state *dynamic_state,
4251 struct gallivm_state *gallivm,
4252 const struct lp_sampler_params *params)
4253 {
4254 boolean use_tex_func = FALSE;
4255
4256 /*
4257 * Do not use a function call if the sampling is "simple enough".
4258 * We define this by
4259 * a) format
4260 * b) no mips (either one level only or no mip filter)
4261 * No mips will definitely make the code smaller, though
4262 * the format requirement is a bit iffy - there's some (SoA) formats
4263 * which definitely generate less code. This does happen to catch
4264 * some important cases though which are hurt quite a bit by using
4265 * a call (though not really because of the call overhead but because
4266 * they are reusing the same texture unit with some of the same
4267 * parameters).
4268 * Ideally we'd let llvm recognize this stuff by doing IPO passes.
4269 */
4270
4271 if (USE_TEX_FUNC_CALL) {
4272 const struct util_format_description *format_desc;
4273 boolean simple_format;
4274 boolean simple_tex;
4275 enum lp_sampler_op_type op_type;
4276 format_desc = util_format_description(static_texture_state->format);
4277 simple_format = !format_desc ||
4278 (util_format_is_rgba8_variant(format_desc) &&
4279 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
4280
4281 op_type = (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
4282 LP_SAMPLER_OP_TYPE_SHIFT;
4283 simple_tex =
4284 op_type != LP_SAMPLER_OP_TEXTURE ||
4285 ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
4286 static_texture_state->level_zero_only == TRUE) &&
4287 static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
4288
4289 use_tex_func = format_desc && !(simple_format && simple_tex);
4290 }
4291
4292 if (use_tex_func) {
4293 LLVMValueRef tex_ret;
4294 lp_build_sample_soa_func(gallivm,
4295 static_texture_state,
4296 static_sampler_state,
4297 dynamic_state,
4298 params, params->texture_index, params->sampler_index, &tex_ret);
4299
4300 for (unsigned i = 0; i < 4; i++) {
4301 params->texel[i] = LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
4302 }
4303 }
4304 else {
4305 lp_build_sample_soa_code(gallivm,
4306 static_texture_state,
4307 static_sampler_state,
4308 dynamic_state,
4309 params->type,
4310 params->sample_key,
4311 params->texture_index,
4312 params->sampler_index,
4313 params->context_ptr,
4314 params->thread_data_ptr,
4315 params->coords,
4316 params->offsets,
4317 params->derivs,
4318 params->lod,
4319 params->ms_index,
4320 params->aniso_filter_table,
4321 params->texel);
4322 }
4323 }
4324
4325
/**
 * Build code for a texture size / sample-count query (txq / resinfo style).
 *
 * Writes the per-dimension sizes for the queried mip level (plus the
 * layer count for array targets, and for is_sviewinfo queries the mip
 * level count in channel 3) into params->sizes_out[0..3], each value
 * broadcast to params->int_type.
 */
void
lp_build_size_query_soa(struct gallivm_state *gallivm,
                        const struct lp_static_texture_state *static_state,
                        struct lp_sampler_dynamic_state *dynamic_state,
                        const struct lp_sampler_size_query_params *params)
{
   LLVMValueRef lod, level = 0, size;
   LLVMValueRef first_level = NULL;
   int dims, i;
   boolean has_array;
   unsigned num_lods = 1;
   struct lp_build_context bld_int_vec4;
   LLVMValueRef context_ptr = params->context_ptr;
   unsigned texture_unit = params->texture_unit;
   unsigned target = params->target;
   LLVMValueRef texture_unit_offset = params->texture_unit_offset;

   if (static_state->format == PIPE_FORMAT_NONE) {
      /*
       * If there's nothing bound, format is NONE, and we must return
       * all zero as mandated by d3d10 in this case.
       */
      unsigned chan;
      LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
      for (chan = 0; chan < 4; chan++) {
         params->sizes_out[chan] = zero;
      }
      return;
   }

   /*
    * Do some sanity verification about bound texture and shader dcl target.
    * Not entirely sure what's possible but assume array/non-array
    * always compatible (probably not ok for OpenGL but d3d10 has no
    * distinction of arrays at the resource level).
    * Everything else looks bogus (though not entirely sure about rect/2d).
    * Currently disabled because it causes assertion failures if there's
    * nothing bound (or rather a dummy texture, not that this case would
    * return the right values).
    */
   if (0 && static_state->target != target) {
      if (static_state->target == PIPE_TEXTURE_1D)
         assert(target == PIPE_TEXTURE_1D_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
         assert(target == PIPE_TEXTURE_1D);
      else if (static_state->target == PIPE_TEXTURE_2D)
         assert(target == PIPE_TEXTURE_2D_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
         assert(target == PIPE_TEXTURE_2D);
      else if (static_state->target == PIPE_TEXTURE_CUBE)
         assert(target == PIPE_TEXTURE_CUBE_ARRAY);
      else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
         assert(target == PIPE_TEXTURE_CUBE);
      else
         assert(0);
   }

   dims = texture_dims(target);

   has_array = has_layer_coord(target);

   assert(!params->int_type.floating);

   /* All per-channel size math is done on a 4x32 int vector. */
   lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));

   if (params->samples_only) {
      /* Sample-count query: just broadcast the scalar count and return. */
      params->sizes_out[0] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
                                                dynamic_state->num_samples(dynamic_state, gallivm,
                                                                           context_ptr, texture_unit,
                                                                           texture_unit_offset));
      return;
   }
   if (params->explicit_lod) {
      /* FIXME: this needs to honor per-element lod */
      lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
                                    lp_build_const_int32(gallivm, 0), "");
      first_level = dynamic_state->first_level(dynamic_state, gallivm,
                                               context_ptr, texture_unit, texture_unit_offset);
      /* The shader-supplied lod is relative to the first (base) level. */
      level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
      lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
   } else {
      lod = bld_int_vec4.zero;
   }

   /* Gather width/height/depth into one int4 so minify works per-channel. */
   size = bld_int_vec4.undef;

   size = LLVMBuildInsertElement(gallivm->builder, size,
                                 dynamic_state->width(dynamic_state, gallivm,
                                                      context_ptr, texture_unit, texture_unit_offset),
                                 lp_build_const_int32(gallivm, 0), "");

   if (dims >= 2) {
      size = LLVMBuildInsertElement(gallivm->builder, size,
                                    dynamic_state->height(dynamic_state, gallivm,
                                                          context_ptr, texture_unit, texture_unit_offset),
                                    lp_build_const_int32(gallivm, 1), "");
   }

   if (dims >= 3) {
      size = LLVMBuildInsertElement(gallivm->builder, size,
                                    dynamic_state->depth(dynamic_state, gallivm,
                                                         context_ptr, texture_unit, texture_unit_offset),
                                    lp_build_const_int32(gallivm, 2), "");
   }

   /* Scale the base-level sizes down to the requested mip level. */
   size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);

   if (has_array) {
      LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
                                                 context_ptr, texture_unit, texture_unit_offset);
      if (target == PIPE_TEXTURE_CUBE_ARRAY) {
         /*
          * It looks like GL wants number of cubes, d3d10.1 has it undefined?
          * Could avoid this by passing in number of cubes instead of total
          * number of layers (might make things easier elsewhere too).
          */
         LLVMValueRef six = lp_build_const_int32(gallivm, 6);
         layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
      }
      size = LLVMBuildInsertElement(gallivm->builder, size, layers,
                                    lp_build_const_int32(gallivm, dims), "");
   }

   /*
    * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
    * if level is out of bounds (note this can't cover unbound texture
    * here, which also requires returning zero).
    */
   if (params->explicit_lod && params->is_sviewinfo) {
      LLVMValueRef last_level, out, out1;
      struct lp_build_context leveli_bld;

      /* everything is scalar for now */
      lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
      last_level = dynamic_state->last_level(dynamic_state, gallivm,
                                             context_ptr, texture_unit, texture_unit_offset);

      /* out of bounds if level < first_level or level > last_level */
      out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
      out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
      out = lp_build_or(&leveli_bld, out, out1);
      if (num_lods == 1) {
         out = lp_build_broadcast_scalar(&bld_int_vec4, out);
      }
      else {
         /* TODO */
         assert(0);
      }
      /* Zero the size channels wherever the level was out of range. */
      size = lp_build_andnot(&bld_int_vec4, size, out);
   }
   /* Broadcast each computed channel out to the requested vector type. */
   for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
      params->sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, params->int_type,
                                                        size,
                                                        lp_build_const_int32(gallivm, i));
   }
   if (params->is_sviewinfo) {
      /* Pad the remaining channels with zero (d3d10 sviewinfo semantics). */
      for (; i < 4; i++) {
         params->sizes_out[i] = lp_build_const_vec(gallivm, params->int_type, 0.0);
      }
   }

   /*
    * if there's no explicit_lod (buffers, rects) queries requiring nr of
    * mips would be illegal.
    */
   if (params->is_sviewinfo && params->explicit_lod) {
      struct lp_build_context bld_int_scalar;
      LLVMValueRef num_levels;
      lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));

      if (static_state->level_zero_only) {
         num_levels = bld_int_scalar.one;
      }
      else {
         LLVMValueRef last_level;

         last_level = dynamic_state->last_level(dynamic_state, gallivm,
                                                context_ptr, texture_unit, texture_unit_offset);
         /* number of mip levels = last_level - first_level + 1 */
         num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
         num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
      }
      params->sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, params->int_type),
                                                num_levels);
   }
}
4510
/**
 * Emit a per-lane loop performing an atomic image operation.
 *
 * Only single-channel 32-bit formats (R32 uint/sint/float) are handled;
 * any other format just yields an all-zero result.  For each active
 * lane (exec_mask set and not out_of_bounds) a scalar atomic cmpxchg
 * (LP_IMG_ATOMIC_CAS, using rgba2_in as the swap value) or atomicrmw
 * 'op' is performed on the lane's texel address; the previous memory
 * values are gathered back into the vector stored in atomic_result[0].
 */
static void
lp_build_do_atomic_soa(struct gallivm_state *gallivm,
                       const struct util_format_description *format_desc,
                       struct lp_type type,
                       LLVMValueRef exec_mask,
                       LLVMValueRef base_ptr,
                       LLVMValueRef offset,
                       LLVMValueRef out_of_bounds,
                       unsigned img_op,
                       LLVMAtomicRMWBinOp op,
                       const LLVMValueRef rgba_in[4],
                       const LLVMValueRef rgba2_in[4],
                       LLVMValueRef atomic_result[4])
{
   enum pipe_format format = format_desc->format;

   /* Unsupported formats: return zero, no atomic performed. */
   if (format != PIPE_FORMAT_R32_UINT && format != PIPE_FORMAT_R32_SINT && format != PIPE_FORMAT_R32_FLOAT) {
      atomic_result[0] = lp_build_zero(gallivm, type);
      return;
   }

   /* Accumulator for the per-lane previous values (int32 x length). */
   LLVMValueRef atom_res = lp_build_alloca(gallivm,
                                           LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length), "");

   /* Turn the per-lane byte offsets into per-lane addresses. */
   offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
   struct lp_build_loop_state loop_state;
   lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
   struct lp_build_if_state ifthen;
   LLVMValueRef cond;
   LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];

   /* Only lanes that are executing AND in bounds perform the atomic. */
   LLVMValueRef should_store_mask = LLVMBuildAnd(gallivm->builder, exec_mask, LLVMBuildNot(gallivm->builder, out_of_bounds, ""), "store_mask");
   assert(exec_mask);

   cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask, lp_build_const_int_vec(gallivm, type, 0), "");
   cond = LLVMBuildExtractElement(gallivm->builder, cond, loop_state.counter, "");
   lp_build_if(&ifthen, gallivm, cond);

   /* Extract this lane's operand and texel address, viewed as int32. */
   LLVMValueRef data = LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
   LLVMValueRef cast_base_ptr = LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
   cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr, LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
   data = LLVMBuildBitCast(gallivm->builder, data, LLVMInt32TypeInContext(gallivm->context), "");

   if (img_op == LP_IMG_ATOMIC_CAS) {
      /* cmpxchg: compare against 'data', swap in this lane's rgba2 value. */
      LLVMValueRef cas_src_ptr = LLVMBuildExtractElement(gallivm->builder, packed2, loop_state.counter, "");
      LLVMValueRef cas_src = LLVMBuildBitCast(gallivm->builder, cas_src_ptr, LLVMInt32TypeInContext(gallivm->context), "");
      data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
                                    cas_src,
                                    LLVMAtomicOrderingSequentiallyConsistent,
                                    LLVMAtomicOrderingSequentiallyConsistent,
                                    false);
      /* cmpxchg returns {old value, success}; keep the old value. */
      data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
   } else {
      data = LLVMBuildAtomicRMW(gallivm->builder, op,
                                cast_base_ptr, data,
                                LLVMAtomicOrderingSequentiallyConsistent,
                                false);
   }

   /* Insert the previous value into the result accumulator. */
   LLVMValueRef temp_res = LLVMBuildLoad(gallivm->builder, atom_res, "");
   temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data, loop_state.counter, "");
   LLVMBuildStore(gallivm->builder, temp_res, atom_res);

   lp_build_endif(&ifthen);
   lp_build_loop_end_cond(&loop_state, lp_build_const_int32(gallivm, type.length),
                          NULL, LLVMIntUGE);
   atomic_result[0] = LLVMBuildLoad(gallivm->builder, atom_res, "");
}
4579
4580 static void
lp_build_img_op_no_format(struct gallivm_state * gallivm,const struct lp_img_params * params,LLVMValueRef outdata[4])4581 lp_build_img_op_no_format(struct gallivm_state *gallivm,
4582 const struct lp_img_params *params,
4583 LLVMValueRef outdata[4])
4584 {
4585 /*
4586 * If there's nothing bound, format is NONE, and we must return
4587 * all zero as mandated by d3d10 in this case.
4588 */
4589 if (params->img_op != LP_IMG_STORE) {
4590 LLVMValueRef zero = lp_build_zero(gallivm, params->type);
4591 for (unsigned chan = 0; chan < (params->img_op == LP_IMG_LOAD ? 4 : 1); chan++) {
4592 outdata[chan] = zero;
4593 }
4594 }
4595 }
4596
/**
 * Build code for an image load, store or atomic operation.
 *
 * Computes the per-lane texel offsets from the integer coordinates,
 * performs bounds checking against the image dimensions (and the
 * sample count for multisampled access), then dispatches to the
 * fetch/store/atomic helpers.  Out-of-bounds loads return zero (one
 * in alpha for formats whose alpha swizzle is constant 1);
 * out-of-bounds stores and atomics are masked off.
 */
void
lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
                    struct lp_sampler_dynamic_state *dynamic_state,
                    struct gallivm_state *gallivm,
                    const struct lp_img_params *params,
                    LLVMValueRef outdata[4])
{
   unsigned target = params->target;
   unsigned dims = texture_dims(target);
   /** regular scalar int type */
   struct lp_type int_coord_type;
   struct lp_build_context int_coord_bld;
   const struct util_format_description *format_desc = util_format_description(static_texture_state->format);
   LLVMValueRef x = params->coords[0], y = params->coords[1], z = params->coords[2];
   LLVMValueRef ms_index = params->ms_index;
   LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
   int_coord_type = lp_uint_type(params->type);
   lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);

   /* Nothing bound: d3d10 requires all-zero results. */
   if (static_texture_state->format == PIPE_FORMAT_NONE) {
      lp_build_img_op_no_format(gallivm, params, outdata);
      return;
   }
   LLVMValueRef offset, i, j;

   /* Fetch the dynamic layout/addressing state for this image. */
   LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
                                                       params->context_ptr, params->image_index, NULL);
   LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
                                                       params->context_ptr, params->image_index, NULL);
   LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
                                                   params->context_ptr, params->image_index, NULL);
   LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
                                             params->context_ptr, params->image_index, NULL);
   LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
                                               params->context_ptr, params->image_index, NULL);
   LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
                                             params->context_ptr, params->image_index, NULL);
   LLVMValueRef num_samples = NULL, sample_stride = NULL;
   if (ms_index) {
      num_samples = dynamic_state->num_samples(dynamic_state, gallivm,
                                               params->context_ptr, params->image_index, NULL);
      sample_stride = dynamic_state->sample_stride(dynamic_state, gallivm,
                                                   params->context_ptr, params->image_index, NULL);
   }

   boolean layer_coord = has_layer_coord(target);

   /* Broadcast the scalar sizes/strides to the coordinate vector width. */
   width = lp_build_broadcast_scalar(&int_coord_bld, width);
   if (dims >= 2) {
      height = lp_build_broadcast_scalar(&int_coord_bld, height);
      row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
   }
   if (dims >= 3 || layer_coord) {
      depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
      img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
   }

   /* Accumulate a per-lane out-of-bounds mask across all dimensions. */
   LLVMValueRef out_of_bounds = int_coord_bld.zero;
   LLVMValueRef out1;
   out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
   out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);

   if (dims >= 2) {
      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
   }
   if (dims >= 3) {
      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
   }
   /* Compute the per-lane byte offset (and subtexel i/j for compressed). */
   lp_build_sample_offset(&int_coord_bld,
                          format_desc,
                          x, y, z, row_stride_vec, img_stride_vec,
                          &offset, &i, &j);

   if (ms_index) {
      /* Sample index is bounds-checked too, then added to the offset. */
      out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(&int_coord_bld, num_samples));
      out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);

      offset = lp_build_add(&int_coord_bld, offset,
                            lp_build_mul(&int_coord_bld, lp_build_broadcast_scalar(&int_coord_bld, sample_stride),
                                         ms_index));
   }
   if (params->img_op == LP_IMG_LOAD) {
      struct lp_type texel_type = lp_build_texel_type(params->type, format_desc);

      /* Clamp out-of-bounds lanes to offset 0 so the fetch stays safe. */
      offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
      struct lp_build_context texel_bld;
      lp_build_context_init(&texel_bld, gallivm, texel_type);
      lp_build_fetch_rgba_soa(gallivm,
                              format_desc,
                              texel_type, TRUE,
                              base_ptr, offset,
                              i, j,
                              NULL,
                              outdata);

      /* Out-of-bounds lanes read zero in rgb... */
      for (unsigned chan = 0; chan < 3; chan++) {
         outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
                                         texel_bld.zero, outdata[chan]);
      }
      /* ...and zero or one in alpha, matching the format's alpha swizzle. */
      if (format_desc->swizzle[3] == PIPE_SWIZZLE_1)
         outdata[3] = lp_build_select(&texel_bld, out_of_bounds,
                                      texel_bld.one, outdata[3]);
      else
         outdata[3] = lp_build_select(&texel_bld, out_of_bounds,
                                      texel_bld.zero, outdata[3]);
   } else if (params->img_op == LP_IMG_STORE) {
      lp_build_store_rgba_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
                              params->indata);
   } else {
      lp_build_do_atomic_soa(gallivm, format_desc, params->type, params->exec_mask, base_ptr, offset, out_of_bounds,
                             params->img_op, params->op, params->indata, params->indata2, outdata);
   }
}
4712
4713 /*
 * These functions are for indirect texture access support.
4715 *
4716 * Indirect textures are implemented using a switch statement, that
4717 * takes the texture index and jumps to the sampler functions for
4718 * that texture unit.
4719 */
4720
4721 /*
4722 * Initialise an indexed sampler switch block.
4723 *
4724 * This sets up the switch_info state and adds the LLVM flow control pieces.
4725 */
4726 void
lp_build_sample_array_init_soa(struct lp_build_sample_array_switch * switch_info,struct gallivm_state * gallivm,const struct lp_sampler_params * params,LLVMValueRef idx,unsigned base,unsigned range)4727 lp_build_sample_array_init_soa(struct lp_build_sample_array_switch *switch_info,
4728 struct gallivm_state *gallivm,
4729 const struct lp_sampler_params *params,
4730 LLVMValueRef idx,
4731 unsigned base, unsigned range)
4732 {
4733 switch_info->gallivm = gallivm;
4734 switch_info->params = *params;
4735 switch_info->base = base;
4736 switch_info->range = range;
4737
4738 /* for generating the switch functions we don't want the texture index offset */
4739 switch_info->params.texture_index_offset = 0;
4740
4741 LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4742 switch_info->merge_ref = lp_build_insert_new_block(gallivm, "texmerge");
4743
4744 switch_info->switch_ref = LLVMBuildSwitch(gallivm->builder, idx,
4745 switch_info->merge_ref, range - base);
4746
4747 LLVMTypeRef val_type[4];
4748 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
4749 lp_build_vec_type(gallivm, params->type);
4750 LLVMTypeRef ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
4751
4752 LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4753
4754 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4755
4756 switch_info->phi = LLVMBuildPhi(gallivm->builder, ret_type, "");
4757 LLVMAddIncoming(switch_info->phi, &undef_val, &initial_block, 1);
4758 }
4759
4760 /*
4761 * Add an individual entry to the indirect texture switch.
4762 *
4763 * This builds the sample function and links a case for it into the switch statement.
4764 */
4765 void
lp_build_sample_array_case_soa(struct lp_build_sample_array_switch * switch_info,int idx,const struct lp_static_texture_state * static_texture_state,const struct lp_static_sampler_state * static_sampler_state,struct lp_sampler_dynamic_state * dynamic_texture_state)4766 lp_build_sample_array_case_soa(struct lp_build_sample_array_switch *switch_info,
4767 int idx,
4768 const struct lp_static_texture_state *static_texture_state,
4769 const struct lp_static_sampler_state *static_sampler_state,
4770 struct lp_sampler_dynamic_state *dynamic_texture_state)
4771 {
4772 struct gallivm_state *gallivm = switch_info->gallivm;
4773 LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "texblock");
4774 LLVMValueRef tex_ret;
4775
4776 LLVMAddCase(switch_info->switch_ref, LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), idx, 0), this_block);
4777 LLVMPositionBuilderAtEnd(gallivm->builder, this_block);
4778
4779 lp_build_sample_soa_func(gallivm, static_texture_state,
4780 static_sampler_state, dynamic_texture_state, &switch_info->params, idx, idx,
4781 &tex_ret);
4782
4783 LLVMAddIncoming(switch_info->phi, &tex_ret, &this_block, 1);
4784 LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
4785 }
4786
4787 /*
4788 * Finish a switch statement.
4789 *
 * This extracts the results from the switch.
4791 */
lp_build_sample_array_fini_soa(struct lp_build_sample_array_switch * switch_info)4792 void lp_build_sample_array_fini_soa(struct lp_build_sample_array_switch *switch_info)
4793 {
4794 struct gallivm_state *gallivm = switch_info->gallivm;
4795
4796 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4797 for (unsigned i = 0; i < 4; i++)
4798 switch_info->params.texel[i] = LLVMBuildExtractValue(gallivm->builder, switch_info->phi, i, "");
4799 }
4800
4801 void
lp_build_image_op_switch_soa(struct lp_build_img_op_array_switch * switch_info,struct gallivm_state * gallivm,const struct lp_img_params * params,LLVMValueRef idx,unsigned base,unsigned range)4802 lp_build_image_op_switch_soa(struct lp_build_img_op_array_switch *switch_info,
4803 struct gallivm_state *gallivm,
4804 const struct lp_img_params *params,
4805 LLVMValueRef idx,
4806 unsigned base, unsigned range)
4807 {
4808 switch_info->gallivm = gallivm;
4809 switch_info->params = *params;
4810 switch_info->base = base;
4811 switch_info->range = range;
4812
4813 /* for generating the switch functions we don't want the texture index offset */
4814 switch_info->params.image_index_offset = 0;
4815
4816 LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4817 switch_info->merge_ref = lp_build_insert_new_block(gallivm, "imgmerge");
4818
4819 switch_info->switch_ref = LLVMBuildSwitch(gallivm->builder, idx,
4820 switch_info->merge_ref, range - base);
4821
4822 if (params->img_op != LP_IMG_STORE) {
4823 LLVMTypeRef ret_type = lp_build_vec_type(gallivm, params->type);
4824 LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4825
4826 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4827
4828 for (unsigned i = 0; i < ((params->img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4829 switch_info->phi[i] = LLVMBuildPhi(gallivm->builder, ret_type, "");
4830 LLVMAddIncoming(switch_info->phi[i], &undef_val, &initial_block, 1);
4831 }
4832 }
4833 }
4834
/*
 * Add an individual entry to the indirect image switch.
 *
 * Emits the image op inline in a new case block and wires its results
 * (if any) into the merge phis.
 */
void
lp_build_image_op_array_case(struct lp_build_img_op_array_switch *switch_info,
                             int idx,
                             const struct lp_static_texture_state *static_texture_state,
                             struct lp_sampler_dynamic_state *dynamic_state)
{
   struct gallivm_state *gallivm = switch_info->gallivm;
   LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "img");
   LLVMValueRef tex_ret[4];

   /* Route this image index to the new case block. */
   LLVMAddCase(switch_info->switch_ref, lp_build_const_int32(gallivm, idx), this_block);
   LLVMPositionBuilderAtEnd(gallivm->builder, this_block);

   switch_info->params.image_index = idx;

   lp_build_img_op_soa(static_texture_state, dynamic_state, switch_info->gallivm, &switch_info->params, tex_ret);
   if (switch_info->params.img_op != LP_IMG_STORE) {
      /* Normalize each result channel to the expected vector type. */
      for (unsigned i = 0; i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++)
         tex_ret[i] = LLVMBuildBitCast(gallivm->builder, tex_ret[i], lp_build_vec_type(gallivm, switch_info->params.type), "");

      /*
       * The image op may have emitted additional blocks; the phi
       * incoming edge must come from the block we actually ended in.
       */
      this_block = LLVMGetInsertBlock(gallivm->builder);
      for (unsigned i = 0; i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
         LLVMAddIncoming(switch_info->phi[i], &tex_ret[i], &this_block, 1);
      }
   }
   LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
}
4862
lp_build_image_op_array_fini_soa(struct lp_build_img_op_array_switch * switch_info)4863 void lp_build_image_op_array_fini_soa(struct lp_build_img_op_array_switch *switch_info)
4864 {
4865 struct gallivm_state *gallivm = switch_info->gallivm;
4866
4867 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4868
4869 if (switch_info->params.img_op != LP_IMG_STORE) {
4870 for (unsigned i = 0; i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4871 switch_info->params.outdata[i] = switch_info->phi[i];
4872 }
4873 }
4874 }
4875