/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Texture sampling -- SoA.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <brianp@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "pipe/p_shader_tokens.h"
#include "util/compiler.h"
#include "util/u_debug.h"
#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/format/u_format.h"
#include "util/u_cpu_detect.h"
#include "util/format_rgb9e5.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_logic.h"
#include "lp_bld_printf.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_flow.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"
#include "lp_bld_struct.h"
#include "lp_bld_quad.h"
#include "lp_bld_pack.h"
#include "lp_bld_intr.h"
#include "lp_bld_misc.h"


/**
 * Generate code to fetch a texel from a texture at int coords (x, y, z).
 * The computation depends on whether the texture is 1D, 2D or 3D.
 * The result, texel, will be float vectors:
 *   texel[0] = red values
 *   texel[1] = green values
 *   texel[2] = blue values
 *   texel[3] = alpha values
 */
static void
lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                          LLVMValueRef width,
                          LLVMValueRef height,
                          LLVMValueRef depth,
                          LLVMValueRef x,
                          LLVMValueRef y,
                          LLVMValueRef z,
                          LLVMValueRef y_stride,
                          LLVMValueRef z_stride,
                          LLVMValueRef data_ptr,
                          LLVMValueRef mipoffsets,
                          LLVMValueRef texel_out[4])
{
   const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
   const unsigned dims = bld->dims;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef i, j;
   LLVMValueRef use_border = NULL;

   /* use_border = x < 0 || x >= width || y < 0 || y >= height */
   if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
      use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
   }

   if (dims >= 2 &&
       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
      if (use_border) {
         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
      }
      else {
         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
      }
   }

   if (dims == 3 &&
       lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
                                              static_state->min_img_filter,
                                              static_state->mag_img_filter)) {
      LLVMValueRef b1, b2;
      b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
      b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
      if (use_border) {
         use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
         use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
      }
      else {
         use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
      }
   }

   /* convert x,y,z coords to linear offset from start of texture, in bytes */
   lp_build_sample_offset(&bld->int_coord_bld,
                          bld->format_desc,
                          x, y, z, y_stride, z_stride,
                          &offset, &i, &j);
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   if (use_border) {
      /* If we can sample the border color, it means that texcoords may
       * lie outside the bounds of the texture image. We need to do
       * something to prevent reading out of bounds and causing a segfault.
       *
       * Simply AND the texture coords with !use_border. This will cause
       * coords which are out of bounds to become zero. Zero is guaranteed
       * to be inside the texture image.
       */
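      /* (andnot computes offset & ~use_border: lanes whose mask is ~0 end
       * up with offset 0, i.e. the first texel, whose value gets replaced
       * with the border color further below anyway.)
       */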
      offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
   }

   lp_build_fetch_rgba_soa(bld->gallivm,
                           bld->format_desc,
                           bld->texel_type, TRUE,
                           data_ptr, offset,
                           i, j,
                           bld->cache,
                           texel_out);

   /*
    * Note: if we find an app which frequently samples the texture border
    * we might want to implement a true conditional here to avoid sampling
    * the texture whenever possible (since that's quite a bit of code).
    * Ex:
    *   if (use_border) {
    *      texel = border_color;
    *   }
    *   else {
    *      texel = sample_texture(coord);
    *   }
    * As it is now, we always sample the texture, then selectively replace
    * the texel color results with the border color.
    */

   if (use_border) {
      /* select texel color or border color depending on use_border. */
      const struct util_format_description *format_desc = bld->format_desc;
      int chan;
      struct lp_type border_type = bld->texel_type;
      border_type.length = 4;
      /*
       * Only replace channels which are actually present. The others should
       * get optimized away eventually by the sampler_view swizzle anyway,
       * and it's easier this way too.
       */
      for (chan = 0; chan < 4; chan++) {
         unsigned chan_s;
         /* reverse-map channel... */
         if (util_format_has_stencil(format_desc)) {
            if (chan == 0)
               chan_s = 0;
            else
               break;
         }
         else {
            for (chan_s = 0; chan_s < 4; chan_s++) {
               if (chan_s == format_desc->swizzle[chan]) {
                  break;
               }
            }
         }
         if (chan_s <= 3) {
            /* use the already clamped color */
            LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
            LLVMValueRef border_chan;

            border_chan = lp_build_extract_broadcast(bld->gallivm,
                                                     border_type,
                                                     bld->texel_type,
                                                     bld->border_color_clamped,
                                                     idx);
            texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
                                              border_chan, texel_out[chan]);
         }
      }
   }
}


/**
 * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
 * (Note that with pot sizes this could be done much more easily post-scale
 * with some bit arithmetic.)
 */
static LLVMValueRef
lp_build_coord_mirror(struct lp_build_sample_context *bld,
                      LLVMValueRef coord, boolean posOnly)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMValueRef fract;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);

   /*
    * We can just use 2*(x - round(0.5*x)) to do all the mirroring,
    * it all works out. (The result is in range [-1, 1.0], negative if
    * the coord is in the "odd" section, otherwise positive.)
    */
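   /*
    * Worked example: coord = 2.3 gives 2*(1.15 - round(1.15)) = 0.3
    * (even section, unmirrored), while coord = 1.3 gives
    * 2*(0.65 - round(0.65)) = -0.7, i.e. 0.7 after the abs below
    * (odd section, mirrored: 2.0 - 1.3 = 0.7).
    */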

   coord = lp_build_mul(coord_bld, coord, half);
   fract = lp_build_round(coord_bld, coord);
   fract = lp_build_sub(coord_bld, coord, fract);
   coord = lp_build_add(coord_bld, fract, fract);

   if (posOnly) {
      /*
       * Theoretically it's not quite 100% accurate because the spec says
       * that ultimately a scaled coord of -x.0 should map to int coord
       * -x + 1 with mirroring, not -x (this does not matter for bilinear
       * filtering).
       */
      coord = lp_build_abs(coord_bld, coord);
      /* kill off NaNs */
      /* XXX: not safe without arch rounding, fract can be anything. */
      coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
   }

   return coord;
}


/**
 * Helper to compute the first coord and the weight for
 * linear wrap repeat npot textures
 */
void
lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
                                  LLVMValueRef coord_f,
                                  LLVMValueRef length_i,
                                  LLVMValueRef length_f,
                                  LLVMValueRef *coord0_i,
                                  LLVMValueRef *weight_f)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
                                                int_coord_bld->one);
   LLVMValueRef mask;
   /* wrap with normalized floats is just fract */
   coord_f = lp_build_fract(coord_bld, coord_f);
   /* mul by size and subtract 0.5 */
   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
   coord_f = lp_build_sub(coord_bld, coord_f, half);
   /*
    * we avoided the 0.5/length division before the repeat wrap,
    * so we now need to fix up edge cases with selects
    */
   /*
    * Note we do a float (unordered) compare so we can eliminate NaNs.
    * (Otherwise we would need fract_safe above.)
    */
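   /*
    * Worked example: coord_f = 0.05 with length 5 scales to 0.25 - 0.5 =
    * -0.25, so ifloor gives coord0 = -1 with weight 0.75; the select below
    * then wraps coord0 to length - 1 = 4 (and the caller wraps coord0 + 1
    * back to 0), i.e. texels 4 and 0 get blended with weight 0.75.
    */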
   mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                           PIPE_FUNC_LESS, coord_f, coord_bld->zero);

   /* convert to int, compute lerp weight */
   lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
}


/**
 * Build LLVM code for texture wrap mode for linear filtering.
 * \param x0_out  returns first integer texcoord
 * \param x1_out  returns second integer texcoord
 * \param weight_out  returns linear interpolation weight
 */
static void
lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
                            boolean is_gather,
                            LLVMValueRef coord,
                            LLVMValueRef length,
                            LLVMValueRef length_f,
                            LLVMValueRef offset,
                            boolean is_pot,
                            unsigned wrap_mode,
                            LLVMValueRef *x0_out,
                            LLVMValueRef *x1_out,
                            LLVMValueRef *weight_out)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
   LLVMValueRef coord0, coord1, weight;

   switch (wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* mul by size and subtract 0.5 */
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_sub(coord_bld, coord, half);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         /* repeat wrap */
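         /* (for pot sizes the AND with length - 1 is the wrap itself:
          * e.g. with length 8, -1 & 7 = 7 and 8 & 7 = 0)
          */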
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
         coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
      }
      else {
         LLVMValueRef mask;
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         lp_build_coord_repeat_npot_linear(bld, coord,
                                           length, length_f,
                                           &coord0, &weight);
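         /* wrap coord1 = coord0 + 1 back to 0 at the edge: the NOTEQUAL
          * mask below is ~0 (keeping the incremented value) everywhere
          * except where coord0 == length - 1, where ANDing with 0 yields
          * texel 0.
          */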
         mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                                 PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
         coord1 = LLVMBuildAnd(builder,
                               lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
                               mask, "");
      }
      break;

   case PIPE_TEX_WRAP_CLAMP:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }

      /*
       * clamp to [0, length]
       *
       * Unlike some other wrap modes, this should be correct for gather
       * too. GL_CLAMP explicitly does this clamp on the coord prior to
       * actual wrapping (which is per sample).
       */
      coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);

      coord = lp_build_sub(coord_bld, coord, half);

      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      {
         struct lp_build_context abs_coord_bld = bld->coord_bld;
         abs_coord_bld.type.sign = FALSE;

         if (bld->static_sampler_state->normalized_coords) {
            /* mul by tex size */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }

         /* clamp to length max */
         coord = lp_build_min_ext(coord_bld, coord, length_f,
                                  GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
         if (!is_gather) {
            /* subtract 0.5 */
            coord = lp_build_sub(coord_bld, coord, half);
            /* clamp to [0, length - 0.5] */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
            /* convert to int, compute lerp weight */
            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         } else {
            /*
             * The non-gather path will end up with coords 0, 1 if coord was
             * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
             * really matter what the second coord is). But for gather, we
             * really need to end up with coords 0, 0.
             */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);
            coord0 = lp_build_sub(coord_bld, coord, half);
            coord1 = lp_build_add(coord_bld, coord, half);
            /* values range in ([-0.5, length_f - 0.5], [0.5, length_f + 0.5]) */
            coord0 = lp_build_itrunc(coord_bld, coord0);
            coord1 = lp_build_itrunc(coord_bld, coord1);
            weight = coord_bld->undef;
         }
         /* coord1 = min(coord1, length-1) */
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
         break;
      }

   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /*
       * We don't need any clamp. Technically, for very large (pos or neg)
       * or infinite values, clamping against [-length, length] would be
       * correct, but we don't need to guarantee any specific
       * result for such coords (the ifloor will be undefined, but for modes
       * requiring border all resulting coords are safe).
       */
      coord = lp_build_sub(coord_bld, coord, half);
      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length_f);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      if (!is_gather) {
         /* compute mirror function */
         coord = lp_build_coord_mirror(bld, coord, TRUE);

         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_sub(coord_bld, coord, half);

         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);

         /* coord0 = max(coord0, 0) */
         coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
         /* coord1 = min(coord1, length-1) */
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
      } else {
         /*
          * This is pretty reasonable in the end, all the tests care
          * about is nasty edge cases (scaled coords x.5, so the individual
          * coords are actually integers, which is REALLY tricky to get right
          * because it works differently for negative numbers as well as
          * for even/odd cases). But with enough magic it's not too complex
          * after all.
          * Maybe we should try a bit-arithmetic version for POT textures...
          */
         LLVMValueRef isNeg;
         /*
          * Wrapping just once still works, even though it means we can
          * get "wrong" sign due to performing mirror in the middle of the
          * two coords (because this can only happen very near the odd/even
          * edges, so both coords will actually end up as 0 or length - 1
          * in the end).
          * For GL4 gather with per-sample offsets we'd need to do the
          * mirroring per coord too.
          */
         coord = lp_build_coord_mirror(bld, coord, FALSE);
         coord = lp_build_mul(coord_bld, coord, length_f);

         /*
          * NaNs should be safe here, we'll do away with them with
          * the ones' complement plus min.
          */
         coord0 = lp_build_sub(coord_bld, coord, half);
         coord0 = lp_build_ifloor(coord_bld, coord0);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         /* ones' complement for neg numbers (mirror(negX) = X - 1) */
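         /* (the XOR with the all-ones compare mask is ~x, so e.g.
          * coord0 = -1 maps to 0 and -3 maps to 2)
          */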
         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
                              coord0, int_coord_bld->zero);
         coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
         isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
                              coord1, int_coord_bld->zero);
         coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
         coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
         coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);

         weight = coord_bld->undef;
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /*
       * XXX: probably not correct for gather, albeit I'm not
       * entirely sure as it's poorly specified. The wrapping looks
       * correct according to the spec, which is against gl 1.2.1;
       * however negative values will be swapped - gl re-specified
       * wrapping in newer versions (no more pre-clamp except with
       * GL_CLAMP).
       */
      coord = lp_build_abs(coord_bld, coord);

      /* clamp to [0, length] */
      coord = lp_build_min_ext(coord_bld, coord, length_f,
                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);

      coord = lp_build_sub(coord_bld, coord, half);

      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      {
         struct lp_build_context abs_coord_bld = bld->coord_bld;
         abs_coord_bld.type.sign = FALSE;

         if (bld->static_sampler_state->normalized_coords) {
            /* scale coord to length */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         if (!is_gather) {
            coord = lp_build_abs(coord_bld, coord);

            /* clamp to length max */
            coord = lp_build_min_ext(coord_bld, coord, length_f,
                                     GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
            /* subtract 0.5 */
            coord = lp_build_sub(coord_bld, coord, half);
            /* clamp to [0, length - 0.5] */
            coord = lp_build_max(coord_bld, coord, coord_bld->zero);

            /* convert to int, compute lerp weight */
            lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            /* coord1 = min(coord1, length-1) */
            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
         } else {
            /*
             * The non-gather path will swap coord0/1 if coord was negative,
             * which is ok for filtering since the filter weight matches
             * accordingly. Also, if coord is close to zero, coord0/1 will
             * be 0 and 1, instead of 0 and 0 (again ok due to filter
             * weight being 0.0). Both issues need to be fixed for gather.
             */
            LLVMValueRef isNeg;

            /*
             * Actually wanted to cheat here and use:
             * coord1 = lp_build_iround(coord_bld, coord);
             * but it's not good enough for some tests (even piglit
             * textureGather is set up in a way so the coords are always
             * .5, that is, right at the crossover points).
             * So do ordinary sub/floor, then do ones' complement
             * for negative numbers.
             * (Note we can't just do sub|add/abs/itrunc per coord either -
             * because the spec demands that mirror(3.0) = 3 but
             * mirror(-3.0) = 2.)
             */
            coord = lp_build_sub(coord_bld, coord, half);
            coord0 = lp_build_ifloor(coord_bld, coord);
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
                                 int_coord_bld->zero);
            coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
            coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);

            isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
                                 int_coord_bld->zero);
            coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
            coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);

            weight = coord_bld->undef;
         }
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      {
         if (bld->static_sampler_state->normalized_coords) {
            /* scale coord to length */
            coord = lp_build_mul(coord_bld, coord, length_f);
         }
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /*
          * XXX: probably not correct for gather due to swapped
          * order if coord is negative (same rationale as for
          * MIRROR_CLAMP).
          */
         coord = lp_build_abs(coord_bld, coord);

         /*
          * We don't need any clamp. Technically, for very large
          * (or infinite) values, clamp against length would be
          * correct, but we don't need to guarantee any specific
          * result for such coords (the ifloor will be undefined, but
          * for modes requiring border all resulting coords are safe).
          */
         coord = lp_build_sub(coord_bld, coord, half);

         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
      }
      break;

   default:
      assert(0);
      coord0 = NULL;
      coord1 = NULL;
      weight = NULL;
   }

   *x0_out = coord0;
   *x1_out = coord1;
   *weight_out = weight;
}


/**
 * Build LLVM code for texture wrap mode for nearest filtering.
 * \param coord  the incoming texcoord (nominally in [0,1])
 * \param length  the texture size along one dimension, as int vector
 * \param length_f  the texture size along one dimension, as float vector
 * \param offset  texel offset along one dimension (as int vector)
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 */
static LLVMValueRef
lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
                             LLVMValueRef coord,
                             LLVMValueRef length,
                             LLVMValueRef length_f,
                             LLVMValueRef offset,
                             boolean is_pot,
                             unsigned wrap_mode)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
   LLVMValueRef icoord;

   switch (wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord = lp_build_mul(coord_bld, coord, length_f);
         icoord = lp_build_ifloor(coord_bld, coord);
         if (offset) {
            icoord = lp_build_add(int_coord_bld, icoord, offset);
         }
         icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
      }
      else {
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* take fraction, unnormalize */
         coord = lp_build_fract_safe(coord_bld, coord);
         coord = lp_build_mul(coord_bld, coord, length_f);
         icoord = lp_build_itrunc(coord_bld, coord);
      }
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }

      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* floor */
      /* use itrunc instead since we clamp to 0 anyway */
      icoord = lp_build_itrunc(coord_bld, coord);

      /* clamp to [0, length - 1]. */
      icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
                              length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      /* no clamp necessary, border masking will handle this */
      icoord = lp_build_ifloor(coord_bld, coord);
      if (offset) {
         icoord = lp_build_add(int_coord_bld, icoord, offset);
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_REPEAT:
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         offset = lp_build_div(coord_bld, offset, length_f);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* compute mirror function */
      coord = lp_build_coord_mirror(bld, coord, TRUE);

      /* scale coord to length */
      assert(bld->static_sampler_state->normalized_coords);
      coord = lp_build_mul(coord_bld, coord, length_f);

      /* itrunc == ifloor here */
      icoord = lp_build_itrunc(coord_bld, coord);

      /* clamp to [0, length - 1] */
      icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      coord = lp_build_abs(coord_bld, coord);

      /* itrunc == ifloor here */
      icoord = lp_build_itrunc(coord_bld, coord);
      /*
       * Use unsigned min due to possible undef values (NaNs, overflow)
       */
      {
         struct lp_build_context abs_coord_bld = *int_coord_bld;
         abs_coord_bld.type.sign = FALSE;
         /* clamp to [0, length - 1] */
         icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
      }
      break;

   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      if (bld->static_sampler_state->normalized_coords) {
         /* scale coord to length */
         coord = lp_build_mul(coord_bld, coord, length_f);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      coord = lp_build_abs(coord_bld, coord);

      /* itrunc == ifloor here */
      icoord = lp_build_itrunc(coord_bld, coord);
      break;

   default:
      assert(0);
      icoord = NULL;
   }

   return icoord;
}


/**
 * Do shadow test/comparison.
 * \param p  shadow ref value
 * \param texel  the texel to compare against
 */
static LLVMValueRef
lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
                            LLVMValueRef p,
                            LLVMValueRef texel)
{
   struct lp_build_context *texel_bld = &bld->texel_bld;
   LLVMValueRef res;

   if (0) {
      //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
      lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
   }

   /* result = (p FUNC texel) ? 1 : 0 */
   /*
    * honor d3d10 floating point rules here, which state that comparisons
    * are ordered except NOT_EQUAL which is unordered.
    */
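   /* (i.e. a NaN input makes the ordered comparisons yield false, while
    * NOT_EQUAL, being unordered, yields true)
    */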
   if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
      res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
                                 p, texel);
   }
   else {
      res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
                         p, texel);
   }
   return res;
}


/**
 * Generate code to sample a mipmap level with nearest filtering.
 * If sampling a cube texture, r = cube face in [0,5].
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef mipoffsets,
                              const LLVMValueRef *coords,
                              const LLVMValueRef *offsets,
                              LLVMValueRef colors_out[4])
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec;
   LLVMValueRef height_vec;
   LLVMValueRef depth_vec;
   LLVMValueRef flt_size;
   LLVMValueRef flt_width_vec;
   LLVMValueRef flt_height_vec;
   LLVMValueRef flt_depth_vec;
   LLVMValueRef x, y = NULL, z = NULL;

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                size,
                                &width_vec, &height_vec, &depth_vec);

   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);

   /*
    * Compute integer texcoords.
    */
   x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
                                    flt_width_vec, offsets[0],
                                    bld->static_texture_state->pot_width,
                                    bld->static_sampler_state->wrap_s);
   lp_build_name(x, "tex.x.wrapped");

   if (dims >= 2) {
      y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
                                       flt_height_vec, offsets[1],
                                       bld->static_texture_state->pot_height,
                                       bld->static_sampler_state->wrap_t);
      lp_build_name(y, "tex.y.wrapped");

      if (dims == 3) {
         z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
                                          flt_depth_vec, offsets[2],
                                          bld->static_texture_state->pot_depth,
                                          bld->static_sampler_state->wrap_r);
         lp_build_name(z, "tex.z.wrapped");
      }
   }
   if (has_layer_coord(bld->static_texture_state->target)) {
      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
         /* add cube layer to face */
         z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
      }
      else {
         z = coords[2];
      }
      lp_build_name(z, "tex.z.layer");
   }

   /*
    * Get texture colors.
    */
   lp_build_sample_texel_soa(bld,
                             width_vec, height_vec, depth_vec,
                             x, y, z,
                             row_stride_vec, img_stride_vec,
                             data_ptr, mipoffsets, colors_out);

   if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
      LLVMValueRef cmpval;
      cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
      /* this is really just an AND of 1.0 with cmpval, but llvm is clever enough */
      colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
                                      bld->texel_bld.one, bld->texel_bld.zero);
      colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
   }
}


/**
 * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
 */
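/*
 * Since mask0/mask1 are all-zeros or all-ones bit patterns they can mask
 * the float weights directly: the result is
 * (mask0 ? (1 - weight) : 0) + (mask1 ? weight : 0),
 * i.e. the ordinary lerp of the 0.0/1.0 comparison results.
 */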
static LLVMValueRef
lp_build_masklerp(struct lp_build_context *bld,
                  LLVMValueRef weight,
                  LLVMValueRef mask0,
                  LLVMValueRef mask1)
{
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef weight2;

   weight2 = lp_build_sub(bld, bld->one, weight);
   weight = LLVMBuildBitCast(builder, weight,
                             lp_build_int_vec_type(gallivm, bld->type), "");
   weight2 = LLVMBuildBitCast(builder, weight2,
                              lp_build_int_vec_type(gallivm, bld->type), "");
   weight = LLVMBuildAnd(builder, weight, mask1, "");
   weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
   weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
   weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
   return lp_build_add(bld, weight, weight2);
}

/**
 * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
 */
static LLVMValueRef
lp_build_masklerp2d(struct lp_build_context *bld,
                    LLVMValueRef weight0,
                    LLVMValueRef weight1,
                    LLVMValueRef mask00,
                    LLVMValueRef mask01,
                    LLVMValueRef mask10,
                    LLVMValueRef mask11)
{
   LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
   LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
   return lp_build_lerp(bld, weight1, val0, val1, 0);
}

/*
 * This is a somewhat excessive amount of code for something OpenGL
 * merely recommends but does not require.
 */
#define ACCURATE_CUBE_CORNERS 1

/**
 * Generate code to sample a mipmap level with linear filtering.
 * If sampling a cube texture, r = cube face in [0,5].
 * If linear_mask is present, only pixels having their mask set
 * will receive linear filtering, the rest will use nearest.
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             boolean is_gather,
                             LLVMValueRef size,
                             LLVMValueRef linear_mask,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef mipoffsets,
                             const LLVMValueRef *coords,
                             const LLVMValueRef *offsets,
                             LLVMValueRef colors_out[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context *ivec_bld = &bld->int_coord_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *texel_bld = &bld->texel_bld;
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec;
   LLVMValueRef height_vec;
   LLVMValueRef depth_vec;
   LLVMValueRef flt_size;
   LLVMValueRef flt_width_vec;
   LLVMValueRef flt_height_vec;
   LLVMValueRef flt_depth_vec;
   LLVMValueRef fall_off[4] = { 0 }, have_corners = NULL;
   LLVMValueRef z1 = NULL;
   LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
   LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
   LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
   LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
   LLVMValueRef xs[4], ys[4], zs[4];
   LLVMValueRef neighbors[2][2][4];
   int chan, texel_index;
   boolean seamless_cube_filter, accurate_cube_corners;
   unsigned chan_swiz = bld->static_texture_state->swizzle_r;

   if (is_gather) {
      switch (bld->gather_comp) {
      case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
      case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
      case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
      case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
      default:
         break;
      }
   }

   seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
                           bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
                          bld->static_sampler_state->seamless_cube_map;

   /*
    * Disable accurate cube corners for integer textures, which should only
    * get here in the gather path.
    */
   accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
                           !util_format_is_pure_integer(bld->static_texture_state->format);

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                size,
                                &width_vec, &height_vec, &depth_vec);

   flt_size = lp_build_int_to_float(&bld->float_size_bld, size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &flt_width_vec, &flt_height_vec, &flt_depth_vec);

   LLVMTypeRef int1t = LLVMInt1TypeInContext(bld->gallivm->context);

   /*
    * Compute integer texcoords.
    */

   if (!seamless_cube_filter) {
      lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
                                  flt_width_vec, offsets[0],
                                  bld->static_texture_state->pot_width,
                                  bld->static_sampler_state->wrap_s,
                                  &x00, &x01, &s_fpart);
      lp_build_name(x00, "tex.x0.wrapped");
      lp_build_name(x01, "tex.x1.wrapped");
      x10 = x00;
      x11 = x01;

      if (dims >= 2) {
         lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
                                     flt_height_vec, offsets[1],
                                     bld->static_texture_state->pot_height,
                                     bld->static_sampler_state->wrap_t,
                                     &y00, &y10, &t_fpart);
         lp_build_name(y00, "tex.y0.wrapped");
         lp_build_name(y10, "tex.y1.wrapped");
         y01 = y00;
         y11 = y10;

         if (dims == 3) {
            lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
                                        flt_depth_vec, offsets[2],
                                        bld->static_texture_state->pot_depth,
                                        bld->static_sampler_state->wrap_r,
                                        &z00, &z1, &r_fpart);
            z01 = z10 = z11 = z00;
            lp_build_name(z00, "tex.z0.wrapped");
            lp_build_name(z1, "tex.z1.wrapped");
         }
      }
      if (has_layer_coord(bld->static_texture_state->target)) {
         if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
            /* add cube layer to face */
            z00 = z01 = z10 = z11 = z1 =
               lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
         }
         else {
            z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
         }
         lp_build_name(z00, "tex.z0.layer");
         lp_build_name(z1, "tex.z1.layer");
      }
   }
   else {
      struct lp_build_if_state edge_if;
      LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
      LLVMValueRef coord0, coord1, have_edge, have_corner;
      LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
      LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
      LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
      LLVMValueRef face = coords[2];
      LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
      LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
      /* XXX drop height calcs. Could (should) do this without seamless filtering too */
      height_vec = width_vec;
      flt_height_vec = flt_width_vec;

      /* XXX the overflow logic is actually sort of duplicated with trilinear,
       * since an overflow in one mip should also have a corresponding overflow
       * in another.
       */
      /* should always have normalized coords, and offsets are undefined */
      assert(bld->static_sampler_state->normalized_coords);
      /*
       * The coords should all be between [0,1] however we can have NaNs,
       * which will wreak havoc. In particular the y1_clamped value below
       * can be -INT_MAX (on x86) and be propagated right through (probably
       * other values might be bogus in the end too).
       * So kill off the NaNs here.
       */
      coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
      coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
      /* instead of clamp, build mask if overflowed */
      coord0 = lp_build_sub(coord_bld, coord0, half);
      /* convert to int, compute lerp weight */
      /* not ideal with AVX (and no AVX2) */
      lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
      x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
      coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
      coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
      coord1 = lp_build_sub(coord_bld, coord1, half);
      lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
      y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);

      fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
      fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
      fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
      fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);

      fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
      fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
      have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
      have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
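      /* (have_edge is now a scalar i1: the edge branch below is taken
       * whenever any lane of the vector fell off its face)
       */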

      /* needed for accurate corner filtering branch later, rely on 0 init */
      have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");

      for (texel_index = 0; texel_index < 4; texel_index++) {
         xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
         ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
         zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
      }
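      /* (the per-texel coords go through allocas so both the edge and the
       * non-edge branch below can fill them in; they are reloaded after
       * the endif)
       */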

      lp_build_if(&edge_if, bld->gallivm, have_edge);

      have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
      have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
      LLVMBuildStore(builder, have_corner, have_corners);

      /*
       * Need to feed clamped values here for cheap corner handling,
       * but only for y coord (as when falling off both edges we only
       * fall off the x one) - this should be sufficient.
       */
      y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
      y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);

      /*
       * Get all possible new coords.
       */
      lp_build_cube_new_coords(ivec_bld, face,
                               x0, x1, y0_clamped, y1_clamped,
                               length_minus_one,
                               new_faces, new_xcoords, new_ycoords);

      /* handle fall off x-, x+ direction */
      /* determine new coords, face (the two fall_off vars can't both be true at the same time) */
      x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
      y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
      x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
      y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
      x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
      y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
      x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
      y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);

      z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
      z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);

      /* handle fall off y-, y+ direction */
      /*
       * Cheap corner logic: just hack up things so a texel doesn't fall
       * off both sides (which means filter weights will be wrong but we'll only
       * use valid texels in the filter).
       * This means however (y) coords must additionally be clamped (see above).
       * This corner handling should be fully OpenGL (but not d3d10) compliant.
       */
      fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
      fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
      fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
      fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);

      x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
      y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
      x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
      y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
      x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
      y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
      x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
      y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);

      z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
      z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
      z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
      z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);

      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
         /* now can add cube layer to face (per sample) */
         z00 = lp_build_add(ivec_bld, z00, coords[3]);
         z01 = lp_build_add(ivec_bld, z01, coords[3]);
         z10 = lp_build_add(ivec_bld, z10, coords[3]);
         z11 = lp_build_add(ivec_bld, z11, coords[3]);
      }

      LLVMBuildStore(builder, x00, xs[0]);
      LLVMBuildStore(builder, x01, xs[1]);
      LLVMBuildStore(builder, x10, xs[2]);
      LLVMBuildStore(builder, x11, xs[3]);
      LLVMBuildStore(builder, y00, ys[0]);
      LLVMBuildStore(builder, y01, ys[1]);
      LLVMBuildStore(builder, y10, ys[2]);
      LLVMBuildStore(builder, y11, ys[3]);
      LLVMBuildStore(builder, z00, zs[0]);
      LLVMBuildStore(builder, z01, zs[1]);
      LLVMBuildStore(builder, z10, zs[2]);
      LLVMBuildStore(builder, z11, zs[3]);

      lp_build_else(&edge_if);

      LLVMBuildStore(builder, x0, xs[0]);
      LLVMBuildStore(builder, x1, xs[1]);
      LLVMBuildStore(builder, x0, xs[2]);
      LLVMBuildStore(builder, x1, xs[3]);
      LLVMBuildStore(builder, y0, ys[0]);
      LLVMBuildStore(builder, y0, ys[1]);
      LLVMBuildStore(builder, y1, ys[2]);
      LLVMBuildStore(builder, y1, ys[3]);
      if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
         LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
         LLVMBuildStore(builder, cube_layer, zs[0]);
         LLVMBuildStore(builder, cube_layer, zs[1]);
         LLVMBuildStore(builder, cube_layer, zs[2]);
         LLVMBuildStore(builder, cube_layer, zs[3]);
      }
      else {
         LLVMBuildStore(builder, face, zs[0]);
         LLVMBuildStore(builder, face, zs[1]);
         LLVMBuildStore(builder, face, zs[2]);
         LLVMBuildStore(builder, face, zs[3]);
      }

      lp_build_endif(&edge_if);

      LLVMTypeRef type = ivec_bld->vec_type;
      x00 = LLVMBuildLoad2(builder, type, xs[0], "");
      x01 = LLVMBuildLoad2(builder, type, xs[1], "");
      x10 = LLVMBuildLoad2(builder, type, xs[2], "");
      x11 = LLVMBuildLoad2(builder, type, xs[3], "");
      y00 = LLVMBuildLoad2(builder, type, ys[0], "");
      y01 = LLVMBuildLoad2(builder, type, ys[1], "");
      y10 = LLVMBuildLoad2(builder, type, ys[2], "");
      y11 = LLVMBuildLoad2(builder, type, ys[3], "");
      z00 = LLVMBuildLoad2(builder, type, zs[0], "");
      z01 = LLVMBuildLoad2(builder, type, zs[1], "");
      z10 = LLVMBuildLoad2(builder, type, zs[2], "");
      z11 = LLVMBuildLoad2(builder, type, zs[3], "");
   }

   if (linear_mask) {
      /*
       * Whack filter weights into place. Whatever texel had more weight is
       * the one which should have been selected by nearest filtering, hence
       * just use 100% weight for it.
       */
      struct lp_build_context *c_bld = &bld->coord_bld;
      LLVMValueRef w1_mask, w1_weight;
      LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);

      w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
      /* this select is really just an "and" */
      w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
      s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
      if (dims >= 2) {
         w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
         w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
         t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
         if (dims == 3) {
            w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
            w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
            r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
         }
      }
   }

   /*
    * Get texture colors.
    */
   /* get x0/x1 texels */
   lp_build_sample_texel_soa(bld,
                             width_vec, height_vec, depth_vec,
                             x00, y00, z00,
                             row_stride_vec, img_stride_vec,
                             data_ptr, mipoffsets, neighbors[0][0]);
   lp_build_sample_texel_soa(bld,
                             width_vec, height_vec, depth_vec,
                             x01, y01, z01,
                             row_stride_vec, img_stride_vec,
                             data_ptr, mipoffsets, neighbors[0][1]);

   if (dims == 1) {
      assert(!is_gather);
      if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
         lp_build_reduce_filter(texel_bld,
                                bld->static_sampler_state->reduction_mode,
                                0,
                                4,
                                s_fpart,
                                neighbors[0][0],
                                neighbors[0][1],
                                colors_out);
      }
      else {
         LLVMValueRef cmpval0, cmpval1;
         cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
         cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
         /* simplified lerp, AND mask with weight and add */
         colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
                                           cmpval0, cmpval1);
         colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
      }
   }
   else {
      /* 2D/3D texture */
      struct lp_build_if_state corner_if;
      LLVMValueRef colors0[4], colorss[4] = { 0 };

      /* get x0/x1 texels at y1 */
      lp_build_sample_texel_soa(bld,
                                width_vec, height_vec, depth_vec,
                                x10, y10, z10,
                                row_stride_vec, img_stride_vec,
                                data_ptr, mipoffsets, neighbors[1][0]);
      lp_build_sample_texel_soa(bld,
                                width_vec, height_vec, depth_vec,
                                x11, y11, z11,
                                row_stride_vec, img_stride_vec,
                                data_ptr, mipoffsets, neighbors[1][1]);

      /*
       * To avoid having to duplicate the linear_mask / fetch code, use
       * another branch here (on the corner condition, though edge would
       * work as well).
       */
      if (have_corners && accurate_cube_corners &&
          bld->static_sampler_state->reduction_mode == PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE) {
         LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
         LLVMValueRef have_corner, one_third;

         colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
         colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
         colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
         colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");

         have_corner = LLVMBuildLoad2(builder, int1t, have_corners, "");

         lp_build_if(&corner_if, bld->gallivm, have_corner);

         one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
                                        1.0f/3.0f);

         /* find corner */
         c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
         c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
         c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
         c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
         c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
         c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
         c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
         c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");

         if (!is_gather) {
            /*
             * We can't use the standard 2d lerp as we need per-element
             * weights in case of corners, so just calculate the bilinear
             * result as w00*s00 + w01*s01 + w10*s10 + w11*s11.
             * (This is actually less work than using a 2d lerp, 7 vs. 9
             * instructions; however calculating the weights needs another 6,
             * so it's probably only not slower than 2d lerp when doing all
             * 4 channels, as the weights need to be calculated just once -
             * of course fixing up the weights has additional cost.)
             */
            LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
            wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
            wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
            w00 = lp_build_mul(coord_bld, wx0, wy0);
            w01 = lp_build_mul(coord_bld, s_fpart, wy0);
            w10 = lp_build_mul(coord_bld, wx0, t_fpart);
            w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);

            /* find corner weight */
            c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
            c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
            c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
            c_weight = lp_build_select(coord_bld, c11, w11, c_weight);

            /*
             * add 1/3 of the corner weight to the weight of the 3 other
             * samples and null out corner weight.
             */
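            /* (the four bilinear weights always sum to 1, so redistributing
             * the corner weight in thirds keeps the total weight at 1)
             */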
            c_weight = lp_build_mul(coord_bld, c_weight, one_third);
            w00 = lp_build_add(coord_bld, w00, c_weight);
            w00 = lp_build_andnot(coord_bld, w00, c00f);
            w01 = lp_build_add(coord_bld, w01, c_weight);
            w01 = lp_build_andnot(coord_bld, w01, c01f);
            w10 = lp_build_add(coord_bld, w10, c_weight);
            w10 = lp_build_andnot(coord_bld, w10, c10f);
            w11 = lp_build_add(coord_bld, w11, c_weight);
            w11 = lp_build_andnot(coord_bld, w11, c11f);

            if (bld->static_sampler_state->compare_mode ==
                PIPE_TEX_COMPARE_NONE) {
               for (chan = 0; chan < 4; chan++) {
                  colors0[chan] = lp_build_mul(coord_bld, w00,
                                               neighbors[0][0][chan]);
                  tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
                  tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
                  tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
                  colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
               }
            }
            else {
               LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
               cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
                                                      neighbors[0][0][0]);
               cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
                                                      neighbors[0][1][0]);
               cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
                                                      neighbors[1][0][0]);
               cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
                                                      neighbors[1][1][0]);
               /*
                * inputs to interpolation are just masks so just add
                * masked weights together
                */
               cmpval00 = LLVMBuildBitCast(builder, cmpval00,
                                           coord_bld->vec_type, "");
               cmpval01 = LLVMBuildBitCast(builder, cmpval01,
                                           coord_bld->vec_type, "");
               cmpval10 = LLVMBuildBitCast(builder, cmpval10,
                                           coord_bld->vec_type, "");
               cmpval11 = LLVMBuildBitCast(builder, cmpval11,
                                           coord_bld->vec_type, "");
               colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
               tmp = lp_build_and(coord_bld, w01, cmpval01);
               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
               tmp = lp_build_and(coord_bld, w10, cmpval10);
               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
               tmp = lp_build_and(coord_bld, w11, cmpval11);
               colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
               colors0[1] = colors0[2] = colors0[3] = colors0[0];
            }
         }
         else {
1517 /*
1518 * We don't have any weights to adjust, so instead calculate
1519 * the fourth texel as simply the average of the other 3.
1520 * (This would work for non-gather too, however we'd need
1521 * far more of the select logic since there are 4 times as
1522 * many colors as weights.)
1523 */
1524 LLVMValueRef col00, col01, col10, col11;
1525 LLVMValueRef colc, colc0, colc1;
1526 col10 = lp_build_swizzle_soa_channel(texel_bld,
1527 neighbors[1][0], chan_swiz);
1528 col11 = lp_build_swizzle_soa_channel(texel_bld,
1529 neighbors[1][1], chan_swiz);
1530 col01 = lp_build_swizzle_soa_channel(texel_bld,
1531 neighbors[0][1], chan_swiz);
1532 col00 = lp_build_swizzle_soa_channel(texel_bld,
1533 neighbors[0][0], chan_swiz);
1534
1535 /*
1536 * The spec says for comparison filtering, the comparison
1537 * must happen before synthesizing the new value.
1538 * This means all gathered values are always 0 or 1,
1539 * except for the non-existing texel, which can be 0,1/3,2/3,1...
1540 * It seems we'd be allowed to just return 0 or 1 too, so we
1541 * could simplify and pass the compare mask values down to the
1542 * end (using int arithmetic/compares on the mask values to
1543 * construct the fourth texel) and only convert to floats there,
1544 * but it's probably not worth it (it might be cheaper for the
1545 * cpu but not simpler in the code)...
1546 */
1547 if (bld->static_sampler_state->compare_mode !=
1548 PIPE_TEX_COMPARE_NONE) {
1549 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1550 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1551 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1552 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1553 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1554 col00 = lp_build_select(texel_bld, cmpval00,
1555 texel_bld->one, texel_bld->zero);
1556 col01 = lp_build_select(texel_bld, cmpval01,
1557 texel_bld->one, texel_bld->zero);
1558 col10 = lp_build_select(texel_bld, cmpval10,
1559 texel_bld->one, texel_bld->zero);
1560 col11 = lp_build_select(texel_bld, cmpval11,
1561 texel_bld->one, texel_bld->zero);
1562 }
1563
1564 /*
1565 * Null out corner color.
1566 */
1567 col00 = lp_build_andnot(coord_bld, col00, c00f);
1568 col01 = lp_build_andnot(coord_bld, col01, c01f);
1569 col10 = lp_build_andnot(coord_bld, col10, c10f);
1570 col11 = lp_build_andnot(coord_bld, col11, c11f);
1571
1572 /*
1573 * New corner texel color is all colors added / 3.
1574 */
1575 colc0 = lp_build_add(coord_bld, col00, col01);
1576 colc1 = lp_build_add(coord_bld, col10, col11);
1577 colc = lp_build_add(coord_bld, colc0, colc1);
1578 colc = lp_build_mul(coord_bld, one_third, colc);
1579
1580 /*
1581 * Replace the corner texel color with the new value.
1582 */
1583 col00 = lp_build_select(coord_bld, c00, colc, col00);
1584 col01 = lp_build_select(coord_bld, c01, colc, col01);
1585 col10 = lp_build_select(coord_bld, c10, colc, col10);
1586 col11 = lp_build_select(coord_bld, c11, colc, col11);
1587
1588 colors0[0] = col10;
1589 colors0[1] = col11;
1590 colors0[2] = col01;
1591 colors0[3] = col00;
1592 }
1593
1594 LLVMBuildStore(builder, colors0[0], colorss[0]);
1595 LLVMBuildStore(builder, colors0[1], colorss[1]);
1596 LLVMBuildStore(builder, colors0[2], colorss[2]);
1597 LLVMBuildStore(builder, colors0[3], colorss[3]);
1598
1599 lp_build_else(&corner_if);
1600 }
1601
1602 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1603 if (is_gather) {
1604 /*
1605 * Just assign the red channel (no component selection yet).
1606 * This is a bit hackish; we usually do the swizzle at the
1607 * end of sampling (far fewer values to swizzle), but that
1608 * obviously cannot work when using gather.
1609 */
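            /*
             * Note the assignment order below matches the GL/D3D gather
             * convention: result (x,y,z,w) = texels at (i0,j1), (i1,j1),
             * (i1,j0), (i0,j0), with neighbors indexed as [j][i].
             */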
1610 colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1611 neighbors[1][0],
1612 chan_swiz);
1613 colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1614 neighbors[1][1],
1615 chan_swiz);
1616 colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1617 neighbors[0][1],
1618 chan_swiz);
1619 colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1620 neighbors[0][0],
1621 chan_swiz);
1622 }
1623 else {
1624 /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1625 lp_build_reduce_filter_2d(texel_bld,
1626 bld->static_sampler_state->reduction_mode,
1627 0,
1628 4,
1629 s_fpart,
1630 t_fpart,
1631 neighbors[0][0],
1632 neighbors[0][1],
1633 neighbors[1][0],
1634 neighbors[1][1],
1635 colors0);
1636 }
1637 }
1638 else {
1639 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1640 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1641 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1642 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1643 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1644
1645 if (is_gather) {
1646 /* more hacks for swizzling, should be X, ONE or ZERO... */
1647 colors0[0] = lp_build_select(texel_bld, cmpval10,
1648 texel_bld->one, texel_bld->zero);
1649 colors0[1] = lp_build_select(texel_bld, cmpval11,
1650 texel_bld->one, texel_bld->zero);
1651 colors0[2] = lp_build_select(texel_bld, cmpval01,
1652 texel_bld->one, texel_bld->zero);
1653 colors0[3] = lp_build_select(texel_bld, cmpval00,
1654 texel_bld->one, texel_bld->zero);
1655 }
1656 else {
1657 colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1658 cmpval00, cmpval01, cmpval10, cmpval11);
1659 colors0[1] = colors0[2] = colors0[3] = colors0[0];
1660 }
1661 }
1662
1663 if (have_corners && accurate_cube_corners &&
1664 bld->static_sampler_state->reduction_mode == PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE) {
1665 LLVMBuildStore(builder, colors0[0], colorss[0]);
1666 LLVMBuildStore(builder, colors0[1], colorss[1]);
1667 LLVMBuildStore(builder, colors0[2], colorss[2]);
1668 LLVMBuildStore(builder, colors0[3], colorss[3]);
1669
1670 lp_build_endif(&corner_if);
1671
1672 colors0[0] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[0], "");
1673 colors0[1] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[1], "");
1674 colors0[2] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[2], "");
1675 colors0[3] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[3], "");
1676 }
1677
1678 if (dims == 3) {
1679 LLVMValueRef neighbors1[2][2][4];
1680 LLVMValueRef colors1[4];
1681
1682 assert(!is_gather);
1683
1684 /* get x0/x1/y0/y1 texels at z1 */
1685 lp_build_sample_texel_soa(bld,
1686 width_vec, height_vec, depth_vec,
1687 x00, y00, z1,
1688 row_stride_vec, img_stride_vec,
1689 data_ptr, mipoffsets, neighbors1[0][0]);
1690 lp_build_sample_texel_soa(bld,
1691 width_vec, height_vec, depth_vec,
1692 x01, y01, z1,
1693 row_stride_vec, img_stride_vec,
1694 data_ptr, mipoffsets, neighbors1[0][1]);
1695 lp_build_sample_texel_soa(bld,
1696 width_vec, height_vec, depth_vec,
1697 x10, y10, z1,
1698 row_stride_vec, img_stride_vec,
1699 data_ptr, mipoffsets, neighbors1[1][0]);
1700 lp_build_sample_texel_soa(bld,
1701 width_vec, height_vec, depth_vec,
1702 x11, y11, z1,
1703 row_stride_vec, img_stride_vec,
1704 data_ptr, mipoffsets, neighbors1[1][1]);
1705
1706 if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1707 /* Bilinear interpolate the four samples from the second Z slice */
1708 lp_build_reduce_filter_2d(texel_bld,
1709 bld->static_sampler_state->reduction_mode,
1710 0,
1711 4,
1712 s_fpart,
1713 t_fpart,
1714 neighbors1[0][0],
1715 neighbors1[0][1],
1716 neighbors1[1][0],
1717 neighbors1[1][1],
1718 colors1);
1719
1720 /* Linearly interpolate the two samples from the two 3D slices */
1721 lp_build_reduce_filter(texel_bld,
1722 bld->static_sampler_state->reduction_mode,
1723 0,
1724 4,
1725 r_fpart,
1726 colors0,
1727 colors1,
1728 colors_out);
1729 }
1730 else {
1731 LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1732 cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][0][0]);
1733 cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[0][1][0]);
1734 cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][0][0]);
1735 cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors1[1][1][0]);
1736 colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1737 cmpval00, cmpval01, cmpval10, cmpval11);
1738 /* Linearly interpolate the two samples from the two 3D slices */
1739 colors_out[0] = lp_build_lerp(texel_bld,
1740 r_fpart,
1741 colors0[0], colors1[0],
1742 0);
1743 colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1744 }
1745 }
1746 else {
1747 /* 2D tex */
1748 for (chan = 0; chan < 4; chan++) {
1749 colors_out[chan] = colors0[chan];
1750 }
1751 }
1752 }
1753 if (is_gather) {
1754 /*
1755 * For gather, we can't do our usual channel swizzling done later,
1756 * so do it here. It only really matters for 0/1 swizzles with
1757 * comparison filtering, since there the results would otherwise
1758 * be wrong; without comparison it should all work out anyway, but
1759 * doing it here can't hurt since it lets llvm drop all the
1760 * calculations above (though gathering from a channel which will
1761 * always return 0 or 1 is a rather pointless thing to do anyway...)
1762 */
1763 if (chan_swiz == PIPE_SWIZZLE_1) {
1764 for (chan = 0; chan < 4; chan++) {
1765 colors_out[chan] = texel_bld->one;
1766 }
1767 } else if (chan_swiz == PIPE_SWIZZLE_0) {
1768 for (chan = 0; chan < 4; chan++) {
1769 colors_out[chan] = texel_bld->zero;
1770 }
1771 }
1772 }
1773 }
1774
1775
1776 /**
1777 * Sample the texture/mipmap using given image filter and mip filter.
1778 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1779 * from (vectors or scalars).
1780 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1781 */
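/*
 * In scalar terms the mip-linear path below computes (sketch):
 *    color = lerp(lod_fpart, sample(ilevel0), sample(ilevel1))
 * and the second-level sample plus lerp is skipped entirely when all
 * lod_fpart values are <= 0.
 */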
1782 static void
1783 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1784 unsigned img_filter,
1785 unsigned mip_filter,
1786 boolean is_gather,
1787 const LLVMValueRef *coords,
1788 const LLVMValueRef *offsets,
1789 LLVMValueRef ilevel0,
1790 LLVMValueRef ilevel1,
1791 LLVMValueRef lod_fpart,
1792 LLVMValueRef *colors_out)
1793 {
1794 LLVMBuilderRef builder = bld->gallivm->builder;
1795 LLVMValueRef size0 = NULL;
1796 LLVMValueRef size1 = NULL;
1797 LLVMValueRef row_stride0_vec = NULL;
1798 LLVMValueRef row_stride1_vec = NULL;
1799 LLVMValueRef img_stride0_vec = NULL;
1800 LLVMValueRef img_stride1_vec = NULL;
1801 LLVMValueRef data_ptr0 = NULL;
1802 LLVMValueRef data_ptr1 = NULL;
1803 LLVMValueRef mipoff0 = NULL;
1804 LLVMValueRef mipoff1 = NULL;
1805 LLVMValueRef colors0[4], colors1[4];
1806 unsigned chan;
1807
1808 /* sample the first mipmap level */
1809 lp_build_mipmap_level_sizes(bld, ilevel0,
1810 &size0,
1811 &row_stride0_vec, &img_stride0_vec);
1812 if (bld->num_mips == 1) {
1813 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1814 }
1815 else {
1816 /* This path should work for num_mips == 1 too but is slightly less efficient */
1817 data_ptr0 = bld->base_ptr;
1818 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1819 }
1820 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1821 lp_build_sample_image_nearest(bld, size0,
1822 row_stride0_vec, img_stride0_vec,
1823 data_ptr0, mipoff0, coords, offsets,
1824 colors0);
1825 }
1826 else {
1827 assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1828 lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1829 row_stride0_vec, img_stride0_vec,
1830 data_ptr0, mipoff0, coords, offsets,
1831 colors0);
1832 }
1833
1834 /* Store the first level's colors in the output variables */
1835 for (chan = 0; chan < 4; chan++) {
1836 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1837 }
1838
1839 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1840 struct lp_build_if_state if_ctx;
1841 LLVMValueRef need_lerp;
1842
1843 /* need_lerp = lod_fpart > 0 */
1844 if (bld->num_lods == 1) {
1845 need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1846 lod_fpart, bld->lodf_bld.zero,
1847 "need_lerp");
1848 }
1849 else {
1850 /*
1851 * We'll do mip filtering if any of the quads (or individual
1852 * pixels in case of per-pixel lod) need it.
1853 * It might be better to split the vectors here and only fetch/filter
1854 * quads which need it (if there's one lod per quad).
1855 */
1856 need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1857 PIPE_FUNC_GREATER,
1858 lod_fpart, bld->lodf_bld.zero);
1859 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1860 lp_build_name(need_lerp, "need_lerp");
1861 }
1862
1863 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1864 {
1865 /*
1866 * We unfortunately need to clamp lod_fpart here since we can get
1867 * negative values which would screw up filtering if not all
1868 * lod_fpart values have the same sign.
1869 */
1870 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1871 bld->lodf_bld.zero);
1872 /* sample the second mipmap level */
1873 lp_build_mipmap_level_sizes(bld, ilevel1,
1874 &size1,
1875 &row_stride1_vec, &img_stride1_vec);
1876 if (bld->num_mips == 1) {
1877 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1878 }
1879 else {
1880 data_ptr1 = bld->base_ptr;
1881 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1882 }
1883 if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1884 lp_build_sample_image_nearest(bld, size1,
1885 row_stride1_vec, img_stride1_vec,
1886 data_ptr1, mipoff1, coords, offsets,
1887 colors1);
1888 }
1889 else {
1890 lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1891 row_stride1_vec, img_stride1_vec,
1892 data_ptr1, mipoff1, coords, offsets,
1893 colors1);
1894 }
1895
1896 /* interpolate samples from the two mipmap levels */
1897
1898 if (bld->num_lods != bld->coord_type.length)
1899 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1900 bld->lodf_bld.type,
1901 bld->texel_bld.type,
1902 lod_fpart);
1903
1904 for (chan = 0; chan < 4; chan++) {
1905 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1906 colors0[chan], colors1[chan],
1907 0);
1908 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1909 }
1910 }
1911 lp_build_endif(&if_ctx);
1912 }
1913 }
1914
1915
1916 /**
1917 * Sample the texture/mipmap using given mip filter, and using
1918 * both nearest and linear filtering at the same time depending
1919 * on linear_mask.
1920 * lod can be per quad but linear_mask is always per pixel.
1921 * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1922 * from (vectors or scalars).
1923 * If we're using nearest miplevel sampling the '1' values will be null/unused.
1924 */
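/*
 * Sketch of the approach (an assumption based on the call sites below):
 * lp_build_sample_image_linear() receives the per-pixel linear_mask and
 * degrades the filter weights to 0/1 for pixels whose mask bit is not
 * set, so a single linear pass effectively covers both filters.
 */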
1925 static void
1926 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1927 LLVMValueRef linear_mask,
1928 unsigned mip_filter,
1929 const LLVMValueRef *coords,
1930 const LLVMValueRef *offsets,
1931 LLVMValueRef ilevel0,
1932 LLVMValueRef ilevel1,
1933 LLVMValueRef lod_fpart,
1934 LLVMValueRef lod_positive,
1935 LLVMValueRef *colors_out)
1936 {
1937 LLVMBuilderRef builder = bld->gallivm->builder;
1938 LLVMValueRef size0 = NULL;
1939 LLVMValueRef size1 = NULL;
1940 LLVMValueRef row_stride0_vec = NULL;
1941 LLVMValueRef row_stride1_vec = NULL;
1942 LLVMValueRef img_stride0_vec = NULL;
1943 LLVMValueRef img_stride1_vec = NULL;
1944 LLVMValueRef data_ptr0 = NULL;
1945 LLVMValueRef data_ptr1 = NULL;
1946 LLVMValueRef mipoff0 = NULL;
1947 LLVMValueRef mipoff1 = NULL;
1948 LLVMValueRef colors0[4], colors1[4];
1949 unsigned chan;
1950
1951 /* sample the first mipmap level */
1952 lp_build_mipmap_level_sizes(bld, ilevel0,
1953 &size0,
1954 &row_stride0_vec, &img_stride0_vec);
1955 if (bld->num_mips == 1) {
1956 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1957 }
1958 else {
1959 /* This path should work for num_mips == 1 too but is slightly less efficient */
1960 data_ptr0 = bld->base_ptr;
1961 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1962 }
1963
1964 lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1965 row_stride0_vec, img_stride0_vec,
1966 data_ptr0, mipoff0, coords, offsets,
1967 colors0);
1968
1969 /* Store the first level's colors in the output variables */
1970 for (chan = 0; chan < 4; chan++) {
1971 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1972 }
1973
1974 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1975 struct lp_build_if_state if_ctx;
1976 LLVMValueRef need_lerp;
1977
1978 /*
1979 * We'll do mip filtering if any of the quads (or individual
1980 * pixels in case of per-pixel lod) need it.
1981 * Note we use lod_positive here, not lod_fpart, since it may be
1982 * the same condition as the one used in the outer "if" in the
1983 * caller, hence llvm should be able to merge the branches here.
1984 */
1985 need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1986 lp_build_name(need_lerp, "need_lerp");
1987
1988 lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1989 {
1990 /*
1991 * We unfortunately need to clamp lod_fpart here since we can get
1992 * negative values which would screw up filtering if not all
1993 * lod_fpart values have the same sign.
1994 */
1995 lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1996 bld->lodf_bld.zero);
1997 /* sample the second mipmap level */
1998 lp_build_mipmap_level_sizes(bld, ilevel1,
1999 &size1,
2000 &row_stride1_vec, &img_stride1_vec);
2001 if (bld->num_mips == 1) {
2002 data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
2003 }
2004 else {
2005 data_ptr1 = bld->base_ptr;
2006 mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
2007 }
2008
2009 lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
2010 row_stride1_vec, img_stride1_vec,
2011 data_ptr1, mipoff1, coords, offsets,
2012 colors1);
2013
2014 /* interpolate samples from the two mipmap levels */
2015
2016 if (bld->num_lods != bld->coord_type.length)
2017 lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2018 bld->lodf_bld.type,
2019 bld->texel_bld.type,
2020 lod_fpart);
2021
2022 for (chan = 0; chan < 4; chan++) {
2023 colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
2024 colors0[chan], colors1[chan],
2025 0);
2026 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
2027 }
2028 }
2029 lp_build_endif(&if_ctx);
2030 }
2031 }
2032
2033
2034 /**
2035 * Build (per-coord) layer value.
2036 * If out_of_bounds is NULL, clamp the layer to valid values;
2037 * otherwise fill in *out_of_bounds and return the layer unclamped.
2038 */
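/*
 * Scalar sketch (illustrative): the clamped variant computes
 *    layer = CLAMP(layer, 0, num_layers - 1)
 * (num_layers - 6 for cube arrays, keeping a whole face set in range),
 * while the unclamped variant instead fills in
 *    *out_of_bounds = (layer < 0) || (layer >= num_layers).
 */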
2039 static LLVMValueRef
2040 lp_build_layer_coord(struct lp_build_sample_context *bld,
2041 unsigned texture_unit,
2042 boolean is_cube_array,
2043 LLVMValueRef layer,
2044 LLVMValueRef *out_of_bounds)
2045 {
2046 LLVMValueRef num_layers;
2047 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2048
2049 num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2050 bld->context_ptr, texture_unit, NULL);
2051
2052 if (out_of_bounds) {
2053 LLVMValueRef out1, out;
2054 assert(!is_cube_array);
2055 num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2056 out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2057 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2058 *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2059 return layer;
2060 }
2061 else {
2062 LLVMValueRef maxlayer;
2063 LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2064 bld->int_bld.one;
2065 maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2066 maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2067 return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2068 }
2069 }
2070
2071 #define WEIGHT_LUT_SIZE 1024
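/*
 * The aniso filter table (bld->aniso_filter_table, built elsewhere) is
 * assumed to hold WEIGHT_LUT_SIZE float weights indexed by the ellipse
 * value q scaled to [0, WEIGHT_LUT_SIZE-1], following a Gaussian-like
 * falloff so texels near the ellipse edge contribute less.
 */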
2072
2073 static void
2074 lp_build_sample_aniso(struct lp_build_sample_context *bld,
2075 unsigned img_filter,
2076 unsigned mip_filter,
2077 boolean is_gather,
2078 const LLVMValueRef *coords,
2079 const LLVMValueRef *offsets,
2080 LLVMValueRef ilevel0,
2081 LLVMValueRef ilevel1,
2082 LLVMValueRef lod_fpart,
2083 LLVMValueRef *colors_out)
2084 {
2085 struct gallivm_state *gallivm = bld->gallivm;
2086 LLVMBuilderRef builder = gallivm->builder;
2087 struct lp_build_context *coord_bld = &bld->coord_bld;
2088 struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
2089 LLVMValueRef ddx_ddy = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, coords[0], coords[1]);
2090 LLVMValueRef float_size;
2091 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2092 LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2093 LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
2094 const unsigned length = bld->coord_bld.type.length;
2095 const unsigned num_quads = length / 4;
2096 LLVMValueRef filter_table = bld->aniso_filter_table;
2097 LLVMValueRef size0, row_stride0_vec, img_stride0_vec;
2098 LLVMValueRef data_ptr0, mipoff0 = NULL;
2099
2100 lp_build_mipmap_level_sizes(bld, ilevel0,
2101 &size0,
2102 &row_stride0_vec, &img_stride0_vec);
2103 if (bld->num_mips == 1) {
2104 data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
2105 }
2106 else {
2107 /* This path should work for num_mips == 1 too but is slightly less efficient */
2108 data_ptr0 = bld->base_ptr;
2109 mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
2110 }
2111
2112 float_size = lp_build_int_to_float(&bld->float_size_in_bld, bld->int_size);
2113
2114 LLVMValueRef float_size_lvl = lp_build_int_to_float(&bld->float_size_bld, size0);
2115 /* extract width and height into vectors for use later */
2116 static const unsigned char swizzle15[] = { /* broadcast height (elems 1/5) */
2117 1, 1, 1, 1, 5, 5, 5, 5
2118 };
2119 static const unsigned char swizzle04[] = { /* broadcast width (elems 0/4) */
2120 0, 0, 0, 0, 4, 4, 4, 4
2121 };
2122 LLVMValueRef width_dim, height_dim;
2123
2124 width_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle04,
2125 bld->float_size_bld.type.length,
2126 bld->coord_bld.type.length);
2127 height_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle15,
2128 bld->float_size_bld.type.length,
2129 bld->coord_bld.type.length);
2130
2131
2132 /* shuffle width/height for ddx/ddy calculations. */
2133 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
2134
2135 for (unsigned i = 0; i < num_quads; i++) {
2136 shuffles[i*4+0] = shuffles[i*4+1] = index0;
2137 shuffles[i*4+2] = shuffles[i*4+3] = index1;
2138 }
2139
2140 LLVMValueRef floatdim =
2141 LLVMBuildShuffleVector(builder, float_size, float_size,
2142 LLVMConstVector(shuffles, length), "");
2143
2144 ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, floatdim);
2145
2146 LLVMValueRef scaling =
2147 lp_build_shl(&bld->leveli_bld, bld->leveli_bld.one, ilevel0);
2148 scaling = lp_build_int_to_float(&bld->levelf_bld, scaling);
2149 scaling = lp_build_rcp(&bld->levelf_bld, scaling);
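   /* i.e. scaling = 1.0f / (1 << ilevel0): maps level-0 texel space to
    * the chosen mip level (sketch). */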
2150
2151 if (bld->num_lods != length) {
2152 if (bld->levelf_bld.type.length == 1)
2153 scaling = lp_build_broadcast_scalar(coord_bld,
2154 scaling);
2155 else
2156 scaling = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2157 bld->levelf_bld.type,
2158 coord_bld->type,
2159 scaling);
2160 }
2161
2162 ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, scaling);
2163
2164 static const unsigned char swizzle01[] = { /* select the s derivatives */
2165 0, 1, 0, 1,
2166 };
2167 static const unsigned char swizzle23[] = {
2168 2, 3, 2, 3,
2169 };
2170
2171 LLVMValueRef ddx_ddys, ddx_ddyt;
2172 ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle01);
2173 ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle23);
2174
2175 /* compute ellipse coefficients */
2176 /* A*x*x + B*x*y + C*y*y = F */
2177 /* float A = vx*vx+vy*vy+1; */
2178 LLVMValueRef A = lp_build_mul(coord_bld, ddx_ddyt, ddx_ddyt);
2179
2180 LLVMValueRef Ay = lp_build_swizzle_aos(coord_bld, A, swizzle15);
2181 A = lp_build_add(coord_bld, A, Ay);
2182 A = lp_build_add(coord_bld, A, coord_bld->one);
2183 A = lp_build_swizzle_aos(coord_bld, A, swizzle04);
2184
2185 /* float B = -2*(ux*vx+uy*vy); */
2186 LLVMValueRef B = lp_build_mul(coord_bld, ddx_ddys, ddx_ddyt);
2187 LLVMValueRef By = lp_build_swizzle_aos(coord_bld, B, swizzle15);
2188 B = lp_build_add(coord_bld, B, By);
2189 B = lp_build_mul_imm(coord_bld, B, -2);
2190 B = lp_build_swizzle_aos(coord_bld, B, swizzle04);
2191
2192 /* float C = ux*ux+uy*uy+1; */
2193 LLVMValueRef C = lp_build_mul(coord_bld, ddx_ddys, ddx_ddys);
2194 LLVMValueRef Cy = lp_build_swizzle_aos(coord_bld, C, swizzle15);
2195 C = lp_build_add(coord_bld, C, Cy);
2196 C = lp_build_add(coord_bld, C, coord_bld->one);
2197 C = lp_build_swizzle_aos(coord_bld, C, swizzle04);
2198
2199 /* float F = A*C-B*B/4.0f; */
2200 LLVMValueRef F = lp_build_mul(coord_bld, B, B);
2201 F = lp_build_div(coord_bld, F, lp_build_const_vec(gallivm, coord_bld->type, 4.0));
2202 LLVMValueRef F_p2 = lp_build_mul(coord_bld, A, C);
2203 F = lp_build_sub(coord_bld, F_p2, F);
2204
2205 /* compute ellipse bounding box in texture space */
2206 /* const float d = -B*B+4.0f*C*A; */
2207 LLVMValueRef d = lp_build_sub(coord_bld, coord_bld->zero, lp_build_mul(coord_bld, B, B));
2208 LLVMValueRef d_p2 = lp_build_mul(coord_bld, A, C);
2209 d_p2 = lp_build_mul_imm(coord_bld, d_p2, 4);
2210 d = lp_build_add(coord_bld, d, d_p2);
2211
2212 /* const float box_u = 2.0f / d * sqrtf(d*C*F); */
2213 /* box_u -> half of bbox width */
2214 LLVMValueRef temp;
2215 temp = lp_build_mul(coord_bld, d, C);
2216 temp = lp_build_mul(coord_bld, temp, F);
2217 temp = lp_build_sqrt(coord_bld, temp);
2218
2219 LLVMValueRef box_u = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
2220 box_u = lp_build_mul(coord_bld, box_u, temp);
2221
2222 /* const float box_v = 2.0f / d * sqrtf(A*d*F); */
2223 /* box_v -> half of bbox height */
2224 temp = lp_build_mul(coord_bld, A, d);
2225 temp = lp_build_mul(coord_bld, temp, F);
2226 temp = lp_build_sqrt(coord_bld, temp);
2227
2228 LLVMValueRef box_v = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
2229 box_v = lp_build_mul(coord_bld, box_v, temp);
2230
2231 /* Scale ellipse formula to directly index the Filter Lookup Table.
2232 * i.e. scale so that F = WEIGHT_LUT_SIZE-1
2233 */
2234 LLVMValueRef formScale = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, WEIGHT_LUT_SIZE - 1), F);
2235
2236 A = lp_build_mul(coord_bld, A, formScale);
2237 B = lp_build_mul(coord_bld, B, formScale);
2238 C = lp_build_mul(coord_bld, C, formScale);
2239 /* F *= formScale; */ /* no need to scale F as we don't use it below here */
2240
2241 LLVMValueRef ddq = lp_build_mul_imm(coord_bld, A, 2);
2242
2243 /* Heckbert MS thesis, p. 59; scan over the bounding box of the ellipse
2244 * and incrementally update the value of A*x^2 + B*x*y + C*y^2; when this
2245 * value, q, is less than F, we're inside the ellipse
2246 */
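   /*
    * Scalar sketch of the scan implemented below (illustrative only):
    *
    *    U = u0 - tex_u;
    *    for (v = v0; v <= v1; v++) {
    *       V  = v - tex_v;
    *       q  = (C * V + B * U) * V + A * U * U;
    *       dq = A * (2 * U + 1) + B * V;
    *       for (u = u0; u <= u1; u++) {
    *          if (q < F) {
    *             w = weight_lut[(int)q];
    *             sum += w * texel(u, v);
    *             den += w;
    *          }
    *          q += dq;
    *          dq += ddq;    (with ddq = 2 * A)
    *       }
    *    }
    *    color = sum / den;
    *
    * In the code A, B, C have been pre-scaled by formScale, so q indexes
    * the weight table directly and the q < F test becomes the q <= 1023
    * mask in the inner loop below.
    */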
2247
2248 LLVMValueRef float_size0 = lp_build_int_to_float(float_size_bld, bld->int_size);
2249 LLVMValueRef width0 = lp_build_extract_broadcast(gallivm,
2250 float_size_bld->type,
2251 coord_bld->type,
2252 float_size0, index0);
2253 LLVMValueRef height0 = lp_build_extract_broadcast(gallivm,
2254 float_size_bld->type,
2255 coord_bld->type,
2256 float_size0, index1);
2257
2258 /* texture->width0 * scaling */
2259 width0 = lp_build_mul(coord_bld, width0, scaling);
2260 /* texture->height0 * scaling */
2261 height0 = lp_build_mul(coord_bld, height0, scaling);
2262
2263 /* tex_u = -0.5f + s[j] * texture->width0 * scaling */
2264 LLVMValueRef tex_u = lp_build_mul(coord_bld, coords[0], width0);
2265 tex_u = lp_build_add(coord_bld, tex_u, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));
2266
2267 /* tex_v = -0.5f + t[j] * texture->height0 * scaling */
2268 LLVMValueRef tex_v = lp_build_mul(coord_bld, coords[1], height0);
2269 tex_v = lp_build_add(coord_bld, tex_v, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));
2270
2271 /* const int u0 = (int) floorf(tex_u - box_u); */
2272 LLVMValueRef u0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_u, box_u)));
2273 /* const int u1 = (int) ceilf(tex_u + box_u); */
2274 LLVMValueRef u1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_u, box_u)));
2275
2276 /* const int v0 = (int) floorf(tex_v - box_v); */
2277 LLVMValueRef v0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_v, box_v)));
2278 /* const int v1 = (int) ceilf(tex_v + box_v); */
2279 LLVMValueRef v1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_v, box_v)));
2280
2281 /* const float U = u0 - tex_u; */
2282 LLVMValueRef U = lp_build_sub(coord_bld, lp_build_int_to_float(coord_bld, u0), tex_u);
2283
2284 /* A * (2 * U + 1) */
2285 LLVMValueRef dq_base = lp_build_mul_imm(coord_bld, U, 2);
2286 dq_base = lp_build_add(coord_bld, dq_base, coord_bld->one);
2287 dq_base = lp_build_mul(coord_bld, dq_base, A);
2288
2289 /* A * U * U */
2290 LLVMValueRef q_base = lp_build_mul(coord_bld, U, U);
2291 q_base = lp_build_mul(coord_bld, q_base, A);
2292
2293 LLVMValueRef colors0[4];
2294 LLVMValueRef den_store = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "den");
2295
2296 for (unsigned chan = 0; chan < 4; chan++)
2297 colors0[chan] = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "colors");
2298
2299 LLVMValueRef q_store, dq_store;
2300 q_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "q");
2301 dq_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "dq");
2302
2303 LLVMValueRef v_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "v_limiter");
2304 LLVMValueRef u_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "u_limiter");
2305
2306 LLVMBuildStore(builder, v0, v_limiter);
2307
2308 /* create an LLVM loop block for the V iterator */
2309 LLVMBasicBlockRef v_loop_block = lp_build_insert_new_block(gallivm, "vloop");
2310
2311 LLVMBuildBr(builder, v_loop_block);
2312 LLVMPositionBuilderAtEnd(builder, v_loop_block);
2313
2314 LLVMValueRef v_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type, v_limiter, "");
2315 LLVMValueRef v_mask = LLVMBuildICmp(builder, LLVMIntSLE, v_val, v1, "");
2316
2317 /* loop over V values. */
2318 {
2319 /* const float V = v - tex_v; */
2320 LLVMValueRef V =
2321 lp_build_sub(coord_bld,
2322 lp_build_int_to_float(coord_bld, v_val), tex_v);
2323
2324 /* float dq = dq_base + B * V; */
2325 LLVMValueRef dq = lp_build_mul(coord_bld, V, B);
2326 dq = lp_build_add(coord_bld, dq, dq_base);
2327
2328 /* float q = (C * V + B * U) * V + q_base */
2329 LLVMValueRef q = lp_build_mul(coord_bld, C, V);
2330 q = lp_build_add(coord_bld, q, lp_build_mul(coord_bld, B, U));
2331 q = lp_build_mul(coord_bld, q, V);
2332 q = lp_build_add(coord_bld, q, q_base);
2333
2334 LLVMBuildStore(builder, q, q_store);
2335 LLVMBuildStore(builder, dq, dq_store);
2336
2337 LLVMBuildStore(builder, u0, u_limiter);
2338
2339 /* create an LLVM loop block for the U iterator */
2340 LLVMBasicBlockRef u_loop_block = lp_build_insert_new_block(gallivm, "uloop");
2341
2342 LLVMBuildBr(builder, u_loop_block);
2343 LLVMPositionBuilderAtEnd(builder, u_loop_block);
2344
2345 LLVMValueRef u_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type,
2346 u_limiter, "");
2347 LLVMValueRef u_mask = LLVMBuildICmp(builder,
2348 LLVMIntSLE,
2349 u_val,
2350 u1, "");
2351
2352 /* loop over U values */
2353 {
2354 /* q = (int)q */
2355 q = lp_build_itrunc(coord_bld,
2356 LLVMBuildLoad2(builder, bld->coord_bld.vec_type,
2357 q_store, ""));
2358
2359 /*
2360 * avoid OOB access to the filter table: generate a mask for
2361 * q <= 1023 (larger q gets zero weight), then clamp q into range.
2362 */
2363 LLVMValueRef q_mask = LLVMBuildICmp(builder,
2364 LLVMIntSLE,
2365 q,
2366 lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff), "");
2367 q_mask = LLVMBuildSExt(builder, q_mask, bld->int_coord_bld.vec_type, "");
2368
2369 q = lp_build_max(&bld->int_coord_bld, q, bld->int_coord_bld.zero);
2370 q = lp_build_and(&bld->int_coord_bld, q, lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff));
2371
2372 /* scale the index into a byte offset (table entries are 4-byte floats). */
2373 q = lp_build_mul_imm(&bld->int_coord_bld, q, 4);
2374 filter_table = LLVMBuildBitCast(gallivm->builder, filter_table, LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
2375
2376 /* Lookup weights in filter table */
2377 LLVMValueRef weights = lp_build_gather(gallivm, coord_bld->type.length,
2378 coord_bld->type.width,
2379 lp_elem_type(coord_bld->type),
2380 TRUE, filter_table, q, TRUE);
2381
2382 /*
2383 * Mask off the weights here, which should make the iterations
2384 * where some of the u/v values are out of range a no-op.
2385 */
2386 weights = LLVMBuildBitCast(builder, weights, bld->int_coord_bld.vec_type, "");
2387 weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, ""));
2388 weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, v_mask, bld->int_coord_bld.vec_type, ""));
2389 weights = lp_build_and(&bld->int_coord_bld, weights, q_mask);
2390 weights = LLVMBuildBitCast(builder, weights, bld->coord_bld.vec_type, "");
2391
2392 /* if the weights are all 0 avoid doing the sampling at all. */
2393 struct lp_build_if_state noloadw0;
2394
2395 LLVMValueRef wnz = LLVMBuildFCmp(gallivm->builder, LLVMRealUNE,
2396 weights, bld->coord_bld.zero, "");
2397 wnz = LLVMBuildSExt(builder, wnz, bld->int_coord_bld.vec_type, "");
2398 wnz = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, wnz);
2399 lp_build_if(&noloadw0, gallivm, wnz);
2400 LLVMValueRef new_coords[4];
2401 new_coords[0] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, u_val), width_dim);
2402 new_coords[1] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, v_val), height_dim);
2403 new_coords[2] = coords[2];
2404 new_coords[3] = coords[3];
2405
2406 /* fetch the texels at this (u, v) position */
2407 LLVMValueRef temp_colors[4];
2408 lp_build_sample_image_nearest(bld, size0,
2409 row_stride0_vec, img_stride0_vec,
2410 data_ptr0, mipoff0, new_coords, offsets,
2411 temp_colors);
2412
2413 for (unsigned chan = 0; chan < 4; chan++) {
2414 LLVMValueRef tcolor = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, colors0[chan], "");
2415
2416 tcolor = lp_build_add(&bld->texel_bld, tcolor, lp_build_mul(&bld->texel_bld, temp_colors[chan], weights));
2417 LLVMBuildStore(builder, tcolor, colors0[chan]);
2418 }
2419
2420 /* multiply colors by weight and add in. */
2421 /* den += weight; */
2422 LLVMValueRef den = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, den_store, "");
2423 den = lp_build_add(&bld->texel_bld, den, weights);
2424 LLVMBuildStore(builder, den, den_store);
2425
2426 lp_build_endif(&noloadw0);
2427 /* q += dq; */
2428 /* dq += ddq; */
2429 q = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, q_store, "");
2430 dq = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, dq_store, "");
2431 q = lp_build_add(coord_bld, q, dq);
2432 dq = lp_build_add(coord_bld, dq, ddq);
2433 LLVMBuildStore(builder, q, q_store);
2434 LLVMBuildStore(builder, dq, dq_store);
2435 }
2436 /* u += 1 */
2437 u_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type, u_limiter, "");
2438 u_val = lp_build_add(&bld->int_coord_bld, u_val, bld->int_coord_bld.one);
2439 LLVMBuildStore(builder, u_val, u_limiter);
2440
2441 u_mask = LLVMBuildICmp(builder,
2442 LLVMIntSLE,
2443 u_val,
2444 u1, "");
2445 LLVMValueRef u_end_cond = LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, "");
2446 u_end_cond = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, u_end_cond);
2447
2448 LLVMBasicBlockRef u_end_loop = lp_build_insert_new_block(gallivm, "u_end_loop");
2449
2450 LLVMBuildCondBr(builder, u_end_cond,
2451 u_loop_block, u_end_loop);
2452
2453 LLVMPositionBuilderAtEnd(builder, u_end_loop);
2454
2455 }
2456
2457 /* v += 1 */
2458 v_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type, v_limiter, "");
2459 v_val = lp_build_add(&bld->int_coord_bld, v_val, bld->int_coord_bld.one);
2460 LLVMBuildStore(builder, v_val, v_limiter);
2461
2462 v_mask = LLVMBuildICmp(builder,
2463 LLVMIntSLE,
2464 v_val,
2465 v1, "");
2466 LLVMValueRef v_end_cond = LLVMBuildSExt(builder, v_mask,
2467 bld->int_coord_bld.vec_type, "");
2468 v_end_cond = lp_build_any_true_range(&bld->coord_bld,
2469 bld->coord_bld.type.length, v_end_cond);
2470
2471 LLVMBasicBlockRef v_end_loop = lp_build_insert_new_block(gallivm, "v_end_loop");
2472
2473 LLVMBuildCondBr(builder, v_end_cond,
2474 v_loop_block, v_end_loop);
2475
2476 LLVMPositionBuilderAtEnd(builder, v_end_loop);
2477
2478 LLVMValueRef den = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, den_store, "");
2479
2480 for (unsigned chan = 0; chan < 4; chan++) {
2481 colors0[chan] =
2482 lp_build_div(&bld->texel_bld,
2483 LLVMBuildLoad2(builder, bld->texel_bld.vec_type,
2484 colors0[chan], ""), den);
2485 }
2486
2487 LLVMValueRef den0 = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_EQUAL,
2488 den, bld->coord_bld.zero);
2489
2490 LLVMValueRef den0_any =
2491 lp_build_any_true_range(&bld->coord_bld,
2492 bld->coord_bld.type.length, den0);
2493
2494 struct lp_build_if_state den0_fallback;
2495 lp_build_if(&den0_fallback, gallivm, den0_any);
2496 {
2497 LLVMValueRef colors_den0[4];
2498 lp_build_sample_image_linear(bld, false, size0, NULL,
2499 row_stride0_vec, img_stride0_vec,
2500 data_ptr0, mipoff0, coords, offsets,
2501 colors_den0);
2502 for (unsigned chan = 0; chan < 4; chan++) {
2503 LLVMValueRef chan_val =
2504 lp_build_select(&bld->texel_bld, den0,
2505 colors_den0[chan], colors0[chan]);
2506 LLVMBuildStore(builder, chan_val, colors_out[chan]);
2507 }
2508 }
2509 lp_build_else(&den0_fallback);
2510 {
2511 for (unsigned chan = 0; chan < 4; chan++) {
2512 LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
2513 }
2514 }
2515 lp_build_endif(&den0_fallback);
2516 }
2517
2518
2519 /**
2520 * Calculate cube face, lod, mip levels.
2521 */
2522 static void
2523 lp_build_sample_common(struct lp_build_sample_context *bld,
2524 boolean is_lodq,
2525 unsigned texture_index,
2526 unsigned sampler_index,
2527 LLVMValueRef *coords,
2528 const struct lp_derivatives *derivs, /* optional */
2529 LLVMValueRef lod_bias, /* optional */
2530 LLVMValueRef explicit_lod, /* optional */
2531 LLVMValueRef *lod_pos_or_zero,
2532 LLVMValueRef *lod,
2533 LLVMValueRef *lod_fpart,
2534 LLVMValueRef *ilevel0,
2535 LLVMValueRef *ilevel1)
2536 {
2537 const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2538 const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2539 const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2540 const unsigned target = bld->static_texture_state->target;
2541 const bool aniso = bld->static_sampler_state->aniso;
2542 LLVMValueRef first_level, cube_rho = NULL;
2543 LLVMValueRef lod_ipart = NULL;
2544 struct lp_derivatives cube_derivs;
2545
2546 /*
2547 printf("%s mip %d min %d mag %d\n", __FUNCTION__,
2548 mip_filter, min_filter, mag_filter);
2549 */
2550
2551 /*
2552 * Choose cube face, recompute texcoords for the chosen face and
2553 * compute rho here too (as it requires transform of derivatives).
2554 */
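   /*
    * Sketch of the face math (standard cube mapping, illustrative):
    * pick the coordinate with the largest magnitude as the major axis
    * ma, then the 2d face coords are sc/|ma| * 0.5 + 0.5 and
    * tc/|ma| * 0.5 + 0.5, with sc/tc chosen per face.
    */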
2555 if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2556 boolean need_derivs;
2557 need_derivs = ((min_filter != mag_filter ||
2558 mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2559 !bld->static_sampler_state->min_max_lod_equal &&
2560 !explicit_lod);
2561 lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2562 derivs = &cube_derivs;
2563 if (target == PIPE_TEXTURE_CUBE_ARRAY && !is_lodq) {
2564 /* calculate cube layer coord now */
2565 LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2566 LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2567 layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2568 coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2569 /* because of seamless filtering we can't add it to the face (coords[2]) here. */
2570 }
2571 }
2572 else if ((target == PIPE_TEXTURE_1D_ARRAY ||
2573 target == PIPE_TEXTURE_2D_ARRAY) && !is_lodq) {
2574 coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2575 coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2576 }
2577
2578 if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2579 /*
2580 * Clamp p coords to [0,1] for fixed function depth texture format here.
2581 * Technically this is not entirely correct for unorm depth as the ref
2582 * value should be converted to the depth format (quantization!) and
2583 * comparison then done in texture format. This would actually help
2584 * performance (it would only need to be done once, and could save
2585 * the per-sample conversion of texels to floats), but it would need
2586 * more messy code (would need to push at least some bits down to actual
2587 * fetch so conversion could be skipped, and would have ugly interaction
2588 * with border color, would need to convert border color to that format
2589 * too or do some other tricks to make it work).
2590 */
2591 const struct util_format_description *format_desc = bld->format_desc;
2592 unsigned chan_type;
2593 /* not entirely sure we couldn't end up with non-valid swizzle here */
2594 chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2595 format_desc->channel[format_desc->swizzle[0]].type :
2596 UTIL_FORMAT_TYPE_FLOAT;
2597 if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2598 coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2599 bld->coord_bld.zero, bld->coord_bld.one);
2600 }
2601 }
2602
2603 /*
2604 * Compute the level of detail (float).
2605 */
2606 if (min_filter != mag_filter ||
2607 mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2608 LLVMValueRef max_aniso = NULL;
2609
2610 if (aniso)
2611 max_aniso = bld->dynamic_state->max_aniso(bld->dynamic_state,
2612 bld->gallivm,
2613 bld->context_ptr,
2614 sampler_index);
2615
2616 /* Need to compute lod either to choose mipmap levels or to
2617 * distinguish between minification/magnification with one mipmap level.
2618 */
2619 lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2620 coords[0], coords[1], coords[2], cube_rho,
2621 derivs, lod_bias, explicit_lod,
2622 mip_filter, max_aniso, lod,
2623 &lod_ipart, lod_fpart, lod_pos_or_zero);
2624 if (is_lodq) {
2625 LLVMValueRef last_level;
2626 last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2627 bld->gallivm,
2628 bld->context_ptr,
2629 texture_index, NULL);
2630 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2631 bld->gallivm,
2632 bld->context_ptr,
2633 texture_index, NULL);
2634 last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2635 last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2636 last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2637
2638 switch (mip_filter) {
2639 case PIPE_TEX_MIPFILTER_NONE:
2640 *lod_fpart = bld->lodf_bld.zero;
2641 break;
2642 case PIPE_TEX_MIPFILTER_NEAREST:
2643 *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2644 FALLTHROUGH;
2645 case PIPE_TEX_MIPFILTER_LINEAR:
2646 *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2647 bld->lodf_bld.zero, last_level);
2648 break;
2649 }
2650 return;
2651 }
2652 } else {
2653 lod_ipart = bld->lodi_bld.zero;
2654 *lod_pos_or_zero = bld->lodi_bld.zero;
2655 }
2656
2657 if ((bld->num_lods != bld->num_mips || bld->num_lods == 1) &&
2658 bld->lodi_bld.type.length != 1) {
2659 /* only makes sense if there's just a single mip level */
2660 assert(bld->num_mips == 1);
2661 lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2662 }
2663
2664 /*
2665 * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2666 */
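   /*
    * Roughly (sketch): NONE -> ilevel0 = first_level;
    * NEAREST -> ilevel0 = round(lod); LINEAR -> ilevel0 = floor(lod)
    * and ilevel1 = ilevel0 + 1; all clamped to the texture's
    * [first_level, last_level], with lod_fpart providing the blend
    * factor for the LINEAR case.
    */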
2667
2668 if (aniso) {
2669 lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2670 return;
2671 }
2672
2673 switch (mip_filter) {
2674 default:
2675 assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2676 case PIPE_TEX_MIPFILTER_NONE:
2677 /* always use mip level 0 */
2678 first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2679 bld->gallivm, bld->context_ptr,
2680 texture_index, NULL);
2681 first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2682 *ilevel0 = first_level;
2683 break;
2684 case PIPE_TEX_MIPFILTER_NEAREST:
2685 assert(lod_ipart);
2686 lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2687 break;
2688 case PIPE_TEX_MIPFILTER_LINEAR:
2689 assert(lod_ipart);
2690 assert(*lod_fpart);
2691 lp_build_linear_mip_levels(bld, texture_index,
2692 lod_ipart, lod_fpart,
2693 ilevel0, ilevel1);
2694 break;
2695 }
2696 }
2697
2698
2699 static void
2700 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2701 unsigned sampler_unit)
2702 {
2703 struct gallivm_state *gallivm = bld->gallivm;
2704 LLVMBuilderRef builder = gallivm->builder;
2705 LLVMValueRef border_color_ptr =
2706 bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2707 bld->context_ptr, sampler_unit);
2708 LLVMValueRef border_color;
2709 const struct util_format_description *format_desc = bld->format_desc;
2710 struct lp_type vec4_type = bld->texel_type;
2711 struct lp_build_context vec4_bld;
2712 LLVMValueRef min_clamp = NULL;
2713 LLVMValueRef max_clamp = NULL;
2714
2715 /*
2716 * For normalized formats we need to clamp the border color
2717 * (technically we probably should also quantize the data). It really
2718 * sucks doing this here, but it can't be avoided for now, since the
2719 * border color is part of the sampler state while the texture format
2720 * is part of the sampler_view state. GL expects clamping for
2721 * uint/sint formats too, so do that as well (d3d10 can't end up here
2722 * with uint/sint since it only supports them with ld).
2723 */
2724 vec4_type.length = 4;
2725 lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2726
2727 /*
2728 * Vectorized clamping of border color. Loading is a bit of a hack, since
2729 * we just cast the pointer to the float array to a pointer to a vec4
2730 * (int or float).
2731 */
2732 border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2733 lp_build_const_int32(gallivm, 0));
2734 border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2735 LLVMPointerType(vec4_bld.vec_type, 0), "");
2736 border_color = LLVMBuildLoad2(builder, vec4_bld.vec_type, border_color_ptr, "");
2737 /* we don't have aligned type in the dynamic state unfortunately */
2738 LLVMSetAlignment(border_color, 4);
2739
2740 /*
2741 * Instead of having some incredibly complex logic trying to figure
2742 * out the clamping necessary for each channel, simply use the first channel,
2743 * and treat mixed signed/unsigned normalized formats specially. (Mixed
2744 * non-normalized, which wouldn't work at all here, do not exist for a good
2745 * reason.)
2746 */
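   /*
    * E.g. (sketch) the resulting clamps are: snorm -> [-1,1],
    * unorm -> [0,1], 8-bit sint -> [-128,127], 8-bit uint -> [0,255];
    * full 32-bit pure integers need no clamp at all since the border
    * color is already stored at that width.
    */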
2747 if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2748 int chan;
2749 /* d/s needs special handling: when both are present it means just sampling depth */
2750 if (util_format_is_depth_and_stencil(format_desc->format)) {
2751 chan = format_desc->swizzle[0];
2752 }
2753 else {
2754 chan = util_format_get_first_non_void_channel(format_desc->format);
2755 }
2756 if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2757 unsigned chan_type = format_desc->channel[chan].type;
2758 unsigned chan_norm = format_desc->channel[chan].normalized;
2759 unsigned chan_pure = format_desc->channel[chan].pure_integer;
2760 if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2761 if (chan_norm) {
2762 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2763 max_clamp = vec4_bld.one;
2764 }
2765 else if (chan_pure) {
2766 /*
2767 * Border color was stored as int, hence we need a min/max clamp
2768 * only if chan has less than 32 bits.
2769 */
2770 unsigned chan_size = format_desc->channel[chan].size;
2771 if (chan_size < 32) {
2772 min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2773 0 - (1 << (chan_size - 1)));
2774 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2775 (1 << (chan_size - 1)) - 1);
2776 }
2777 }
2778 /* TODO: no idea about non-pure, non-normalized! */
2779 }
2780 else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2781 if (chan_norm) {
2782 min_clamp = vec4_bld.zero;
2783 max_clamp = vec4_bld.one;
2784 }
2785 /*
2786 * Need an ugly hack here, because we don't have Z32_FLOAT_X8X24
2787 * we use Z32_FLOAT_S8X24 to imply sampling depth component
2788 * and ignoring stencil, which will blow up here if we try to
2789 * do a uint clamp in a float texel build...
2790 * And even if we had that format, mesa st also thinks using z24s8
2791 * means depth sampling ignoring stencil.
2792 */
2793 else if (chan_pure) {
2794 /*
2795 * Border color was stored as uint, hence we never need a min
2796 * clamp, and only need a max clamp if chan has less than 32 bits.
2797 */
2798 unsigned chan_size = format_desc->channel[chan].size;
2799 if (chan_size < 32) {
2800 max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2801 (1 << chan_size) - 1);
2802 }
2803 /* TODO: no idea about non-pure, non-normalized! */
2804 }
2805 }
2806 else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2807 /* TODO: I have no idea what clamp this would need if any! */
2808 }
2809 }
2810 /* mixed plain formats (or different pure size) */
2811 switch (format_desc->format) {
2812 case PIPE_FORMAT_B10G10R10A2_UINT:
2813 case PIPE_FORMAT_R10G10B10A2_UINT:
2814 {
2815 unsigned max10 = (1 << 10) - 1;
2816 max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2817 max10, (1 << 2) - 1, NULL);
2818 }
2819 break;
2820 case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2821 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2822 -1.0F, 0.0F, NULL);
2823 max_clamp = vec4_bld.one;
2824 break;
2825 case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2826 case PIPE_FORMAT_R5SG5SB6U_NORM:
2827 min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2828 0.0F, 0.0F, NULL);
2829 max_clamp = vec4_bld.one;
2830 break;
2831 default:
2832 break;
2833 }
2834 }
2835 else {
2836 /* cannot figure this out from format description */
2837 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2838 /* s3tc formats are always unorm */
2839 min_clamp = vec4_bld.zero;
2840 max_clamp = vec4_bld.one;
2841 }
2842 else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2843 format_desc->layout == UTIL_FORMAT_LAYOUT_ETC ||
2844 format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
2845 switch (format_desc->format) {
2846 case PIPE_FORMAT_RGTC1_UNORM:
2847 case PIPE_FORMAT_RGTC2_UNORM:
2848 case PIPE_FORMAT_LATC1_UNORM:
2849 case PIPE_FORMAT_LATC2_UNORM:
2850 case PIPE_FORMAT_ETC1_RGB8:
2851 case PIPE_FORMAT_BPTC_RGBA_UNORM:
2852 case PIPE_FORMAT_BPTC_SRGBA:
2853 min_clamp = vec4_bld.zero;
2854 max_clamp = vec4_bld.one;
2855 break;
2856 case PIPE_FORMAT_RGTC1_SNORM:
2857 case PIPE_FORMAT_RGTC2_SNORM:
2858 case PIPE_FORMAT_LATC1_SNORM:
2859 case PIPE_FORMAT_LATC2_SNORM:
2860 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2861 max_clamp = vec4_bld.one;
2862 break;
2863 case PIPE_FORMAT_BPTC_RGB_FLOAT:
2864 /* not sure if we should clamp to max half float? */
2865 break;
2866 case PIPE_FORMAT_BPTC_RGB_UFLOAT:
2867 min_clamp = vec4_bld.zero;
2868 break;
2869 default:
2870 assert(0);
2871 break;
2872 }
2873 }
2874 /*
2875 * all others from subsampled/other group, though we don't care
2876 * about yuv (and should not have any from zs here)
2877 */
2878 else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
2879 switch (format_desc->format) {
2880 case PIPE_FORMAT_R8G8_B8G8_UNORM:
2881 case PIPE_FORMAT_G8R8_G8B8_UNORM:
2882 case PIPE_FORMAT_G8R8_B8R8_UNORM:
2883 case PIPE_FORMAT_R8G8_R8B8_UNORM:
2884 case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2885 min_clamp = vec4_bld.zero;
2886 max_clamp = vec4_bld.one;
2887 break;
2888 case PIPE_FORMAT_R8G8Bx_SNORM:
2889 min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2890 max_clamp = vec4_bld.one;
2891 break;
2892 /*
2893 * Note smallfloat formats usually don't need clamping
2894 * (they still have infinite range) however this is not
2895 * true for r11g11b10 and r9g9b9e5, which can't represent
2896 * negative numbers (and additionally r9g9b9e5 can't represent
2897 * very large numbers). d3d10 seems happy without clamping in
2898 * this case, but gl spec is pretty clear: "for floating
2899 * point and integer formats, border values are clamped to
2900 * the representable range of the format" so do that here.
2901 */
2902 case PIPE_FORMAT_R11G11B10_FLOAT:
2903 min_clamp = vec4_bld.zero;
2904 break;
2905 case PIPE_FORMAT_R9G9B9E5_FLOAT:
2906 min_clamp = vec4_bld.zero;
2907 max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2908 break;
2909 default:
2910 assert(0);
2911 break;
2912 }
2913 }
2914 }
2915
2916 if (min_clamp) {
2917 border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2918 }
2919 if (max_clamp) {
2920 border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2921 }
2922
2923 bld->border_color_clamped = border_color;
2924 }
2925
2926
2927 /**
2928 * General texture sampling codegen.
2929 * This function handles texture sampling for all texture targets (1D,
2930 * 2D, 3D, cube) and all filtering modes.
2931 */
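/*
 * Dispatch overview (sketch): aniso enabled -> the EWA path;
 * min_filter == mag_filter -> a single lp_build_sample_mipmap() call;
 * otherwise branch on lod_positive when there is one lod, or run the
 * combined nearest+linear path when the lod varies per pixel.
 */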
2932 static void
2933 lp_build_sample_general(struct lp_build_sample_context *bld,
2934 unsigned sampler_unit,
2935 boolean is_gather,
2936 const LLVMValueRef *coords,
2937 const LLVMValueRef *offsets,
2938 LLVMValueRef lod_positive,
2939 LLVMValueRef lod_fpart,
2940 LLVMValueRef ilevel0,
2941 LLVMValueRef ilevel1,
2942 LLVMValueRef *colors_out)
2943 {
2944 LLVMBuilderRef builder = bld->gallivm->builder;
2945 const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2946 const unsigned mip_filter = sampler_state->min_mip_filter;
2947 const unsigned min_filter = sampler_state->min_img_filter;
2948 const unsigned mag_filter = sampler_state->mag_img_filter;
2949 LLVMValueRef texels[4];
2950 unsigned chan;
2951
2952 /* if we need border color, (potentially) clamp it now */
2953 if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2954 min_filter,
2955 mag_filter) ||
2956 (bld->dims > 1 &&
2957 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2958 min_filter,
2959 mag_filter)) ||
2960 (bld->dims > 2 &&
2961 lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2962 min_filter,
2963 mag_filter))) {
2964 lp_build_clamp_border_color(bld, sampler_unit);
2965 }
2966
2967
2968 /*
2969 * Get/interpolate texture colors.
2970 */
2971
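/*
 * The four texel channels accumulate through allocas rather than SSA
 * values, since the filtering paths below are emitted under if/else
 * control flow and their results must merge at the loads at the end.
 */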
2972 for (chan = 0; chan < 4; ++chan) {
2973 texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2974 lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2975 }
2976
2977 if (sampler_state->aniso) {
2978 lp_build_sample_aniso(bld, PIPE_TEX_FILTER_NEAREST, mip_filter,
2979 false, coords, offsets, ilevel0,
2980 ilevel1, lod_fpart, texels);
2981 } else if (min_filter == mag_filter) {
2982 /* no need to distinguish between minification and magnification */
2983 lp_build_sample_mipmap(bld, min_filter, mip_filter,
2984 is_gather,
2985 coords, offsets,
2986 ilevel0, ilevel1, lod_fpart,
2987 texels);
2988 }
2989 else {
2990 /*
2991 * Could also get rid of the if-logic and always use mipmap_both, for
2992 * both the single-lod and multi-lod cases, if nothing really uses this.
2993 */
2994 if (bld->num_lods == 1) {
2995 /* Emit conditional to choose min image filter or mag image filter
2996 * depending on the lod being > 0 or <= 0, respectively.
2997 */
2998 struct lp_build_if_state if_ctx;
2999
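/*
 * lp_build_if wants a scalar i1 condition; lod_positive arrives as a
 * 0/~0 i32 mask, so the trunc keeps just its low bit.
 */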
3000 lod_positive = LLVMBuildTrunc(builder, lod_positive,
3001 LLVMInt1TypeInContext(bld->gallivm->context),
3002 "lod_pos");
3003
3004 lp_build_if(&if_ctx, bld->gallivm, lod_positive);
3005 {
3006 /* Use the minification filter */
3007 lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
3008 coords, offsets,
3009 ilevel0, ilevel1, lod_fpart,
3010 texels);
3011 }
3012 lp_build_else(&if_ctx);
3013 {
3014 /* Use the magnification filter */
3015 lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
3016 FALSE,
3017 coords, offsets,
3018 ilevel0, NULL, NULL,
3019 texels);
3020 }
3021 lp_build_endif(&if_ctx);
3022 }
3023 else {
3024 LLVMValueRef need_linear, linear_mask;
3025 unsigned mip_filter_for_nearest;
3026 struct lp_build_if_state if_ctx;
3027
3028 if (min_filter == PIPE_TEX_FILTER_LINEAR) {
3029 linear_mask = lod_positive;
3030 mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
3031 }
3032 else {
3033 linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
3034 mip_filter_for_nearest = mip_filter;
3035 }
3036 need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
3037 linear_mask);
3038 lp_build_name(need_linear, "need_linear");
3039
3040 if (bld->num_lods != bld->coord_type.length) {
3041 linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
3042 bld->lodi_type,
3043 bld->int_coord_type,
3044 linear_mask);
3045 }
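/*
 * linear_mask now matches the coord vector width (one lane per pixel),
 * so the combined path below can apply its nearest-filter fixups per
 * pixel.
 */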
3046
3047 lp_build_if(&if_ctx, bld->gallivm, need_linear);
3048 {
3049 /*
3050 * Do sampling with both filters simultaneously. This means using
3051 * a linear filter and doing some tricks (with weights) for the
3052 * pixels which need the nearest filter.
3053 * Note that it's probably rare that some pixels need nearest and
3054 * some linear filtering, but the fixups required for the nearest
3055 * pixels aren't all that complicated, so just always run the
3056 * combined path if at least some pixels require linear.
3057 */
3058 lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
3059 coords, offsets,
3060 ilevel0, ilevel1,
3061 lod_fpart, lod_positive,
3062 texels);
3063 }
3064 lp_build_else(&if_ctx);
3065 {
3066 /*
3067 * All pixels require just nearest filtering, which is way
3068 * cheaper than linear, hence do a separate path for that.
3069 */
3070 lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
3071 mip_filter_for_nearest, FALSE,
3072 coords, offsets,
3073 ilevel0, ilevel1, lod_fpart,
3074 texels);
3075 }
3076 lp_build_endif(&if_ctx);
3077 }
3078 }
3079
3080 for (chan = 0; chan < 4; ++chan) {
3081 colors_out[chan] = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, texels[chan], "");
3082 lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
3083 }
3084 }
3085
3086
3087 /**
3088 * Texel fetch function. In contrast to general sampling there is no
3089 * filtering, no coord minification, lod (if any) is always an explicit
3090 * uint, and coords are uints (in texel units) applied directly to the
3091 * selected mip level (after adding texel offsets). This function handles
3092 * texel fetch for all targets where texel fetch is supported (no cube
3093 * maps, but 1d, 2d, 3d are supported; arrays and buffers should be too).
3094 */
3095 static void
3096 lp_build_fetch_texel(struct lp_build_sample_context *bld,
3097 unsigned texture_unit,
3098 LLVMValueRef ms_index,
3099 const LLVMValueRef *coords,
3100 LLVMValueRef explicit_lod,
3101 const LLVMValueRef *offsets,
3102 LLVMValueRef *colors_out)
3103 {
3104 struct lp_build_context *perquadi_bld = &bld->lodi_bld;
3105 struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
3106 unsigned dims = bld->dims, chan;
3107 unsigned target = bld->static_texture_state->target;
3108 boolean out_of_bound_ret_zero = TRUE;
3109 LLVMValueRef size, ilevel;
3110 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
3111 LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
3112 LLVMValueRef width, height, depth, i, j;
3113 LLVMValueRef offset, out_of_bounds, out1;
3114
3115 out_of_bounds = int_coord_bld->zero;
3116
3117 if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
3118 if (bld->num_mips != int_coord_bld->type.length) {
3119 ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
3120 perquadi_bld->type, explicit_lod, 0);
3121 }
3122 else {
3123 ilevel = explicit_lod;
3124 }
3125 lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
3126 out_of_bound_ret_zero ? &out_of_bounds : NULL);
3127 }
3128 else {
3129 assert(bld->num_mips == 1);
3130 if (bld->static_texture_state->target != PIPE_BUFFER) {
3131 ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
3132 bld->context_ptr, texture_unit, NULL);
3133 }
3134 else {
3135 ilevel = lp_build_const_int32(bld->gallivm, 0);
3136 }
3137 }
3138 lp_build_mipmap_level_sizes(bld, ilevel,
3139 &size,
3140 &row_stride_vec, &img_stride_vec);
3141 lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
3142 size, &width, &height, &depth);
3143
3144 if (target == PIPE_TEXTURE_1D_ARRAY ||
3145 target == PIPE_TEXTURE_2D_ARRAY) {
3146 if (out_of_bound_ret_zero) {
3147 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
3148 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3149 }
3150 else {
3151 z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
3152 }
3153 }
3154
3155 /* This is a lot like border sampling */
3156 if (offsets[0]) {
3157 /*
3158 * coords are really unsigned, offsets are signed, but I don't think
3159 * exceeding 31 bits is possible
3160 */
3161 x = lp_build_add(int_coord_bld, x, offsets[0]);
3162 }
3163 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
3164 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3165 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
3166 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3167
3168 if (dims >= 2) {
3169 if (offsets[1]) {
3170 y = lp_build_add(int_coord_bld, y, offsets[1]);
3171 }
3172 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
3173 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3174 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
3175 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3176
3177 if (dims >= 3) {
3178 if (offsets[2]) {
3179 z = lp_build_add(int_coord_bld, z, offsets[2]);
3180 }
3181 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
3182 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3183 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
3184 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3185 }
3186 }
3187
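/*
 * Roughly, ignoring block compression (where x/y are first converted to
 * block coords and i/j locate the texel within its block), this computes
 *    offset = z * img_stride + y * row_stride + x * bytes_per_texel
 */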
3188 lp_build_sample_offset(int_coord_bld,
3189 bld->format_desc,
3190 x, y, z, row_stride_vec, img_stride_vec,
3191 &offset, &i, &j);
3192
3193 if (bld->static_texture_state->target != PIPE_BUFFER) {
3194 offset = lp_build_add(int_coord_bld, offset,
3195 lp_build_get_mip_offsets(bld, ilevel));
3196 }
3197
3198 if (bld->fetch_ms) {
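/*
 * Multisample resources store the samples as separate planes,
 * sample_stride bytes apart; range-check ms_index like a coordinate.
 */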
3199 LLVMValueRef num_samples;
3200 num_samples = bld->dynamic_state->num_samples(bld->dynamic_state, bld->gallivm,
3201 bld->context_ptr, texture_unit, NULL);
3202 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, ms_index, int_coord_bld->zero);
3203 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3204 out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(int_coord_bld, num_samples));
3205 out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3206 offset = lp_build_add(int_coord_bld, offset,
3207 lp_build_mul(int_coord_bld, bld->sample_stride, ms_index));
3208 }
3209
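/*
 * Force a zero offset for out-of-bounds lanes: the fetch below then only
 * touches valid memory, and the bogus texels are replaced afterwards.
 */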
3210 offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
3211
3212 lp_build_fetch_rgba_soa(bld->gallivm,
3213 bld->format_desc,
3214 bld->texel_type, TRUE,
3215 bld->base_ptr, offset,
3216 i, j,
3217 bld->cache,
3218 colors_out);
3219
3220 if (out_of_bound_ret_zero) {
3221 /*
3222 * Only needed for ARB_robust_buffer_access_behavior and d3d10.
3223 * Could use min/max above instead of out-of-bounds comparisons
3224 * if we don't care about the result returned for out-of-bounds.
3225 */
3226 LLVMValueRef oob[4] = {
3227 bld->texel_bld.zero,
3228 bld->texel_bld.zero,
3229 bld->texel_bld.zero,
3230 bld->texel_bld.zero,
3231 };
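/*
 * Run the zeros through the format swizzle first, presumably so channels
 * the format defines as constant one (e.g. missing alpha) still return 1
 * rather than 0 for out-of-bounds texels.
 */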
3232 lp_build_format_swizzle_soa(bld->format_desc, &bld->texel_bld, oob, oob);
3233 for (chan = 0; chan < 4; chan++) {
3234 colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
3235 oob[chan], colors_out[chan]);
3236 }
3237 }
3238 }
3239
3240
3241 /**
3242 * Just set texels to white instead of actually sampling the texture.
3243 * For debugging.
3244 */
3245 void
3246 lp_build_sample_nop(struct gallivm_state *gallivm,
3247 struct lp_type type,
3248 const LLVMValueRef *coords,
3249 LLVMValueRef texel_out[4])
3250 {
3251 LLVMValueRef one = lp_build_one(gallivm, type);
3252 for (unsigned chan = 0; chan < 4; chan++) {
3253 texel_out[chan] = one;
3254 }
3255 }
3256
3257
3258 static struct lp_type
3259 lp_build_texel_type(struct lp_type texel_type,
3260 const struct util_format_description *format_desc)
3261 {
3262 /* Always using the first channel should hopefully be safe;
3263 * if not, things WILL break in other places anyway.
3264 */
3265 if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
3266 format_desc->channel[0].pure_integer) {
3267 if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
3268 texel_type = lp_type_int_vec(texel_type.width, texel_type.width * texel_type.length);
3269 } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
3270 texel_type = lp_type_uint_vec(texel_type.width, texel_type.width * texel_type.length);
3271 }
3272 }
3273 else if (util_format_has_stencil(format_desc) &&
3274 !util_format_has_depth(format_desc)) {
3275 /* for stencil only formats, sample stencil (uint) */
3276 texel_type = lp_type_uint_vec(texel_type.width, texel_type.width * texel_type.length);
3277 }
3278 return texel_type;
3279 }
3280
3281
3282 /**
3283 * Build the actual texture sampling code.
3284 * 'texel' will return a vector of four LLVMValueRefs corresponding to
3285 * R, G, B, A.
3286 * \param type vector float type to use for coords, etc.
3287 * \param sample_key
3288 * \param derivs partial derivatives of (s,t,r,q) with respect to x and y
3289 */
3290 static void
3291 lp_build_sample_soa_code(struct gallivm_state *gallivm,
3292 const struct lp_static_texture_state *static_texture_state,
3293 const struct lp_static_sampler_state *static_sampler_state,
3294 struct lp_sampler_dynamic_state *dynamic_state,
3295 struct lp_type type,
3296 unsigned sample_key,
3297 unsigned texture_index,
3298 unsigned sampler_index,
3299 LLVMValueRef context_ptr,
3300 LLVMValueRef thread_data_ptr,
3301 const LLVMValueRef *coords,
3302 const LLVMValueRef *offsets,
3303 const struct lp_derivatives *derivs, /* optional */
3304 LLVMValueRef lod, /* optional */
3305 LLVMValueRef ms_index, /* optional */
3306 LLVMValueRef aniso_filter_table,
3307 LLVMValueRef texel_out[4])
3308 {
3309 assert(static_texture_state);
3310 assert(static_texture_state->format < PIPE_FORMAT_COUNT);
3311 assert(static_sampler_state);
3312
3313 const enum pipe_texture_target target = static_texture_state->target;
3314 const unsigned dims = texture_dims(target);
3315 const unsigned num_quads = type.length / 4;
3316 struct lp_build_sample_context bld;
3317 struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
3318 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
3319 LLVMBuilderRef builder = gallivm->builder;
3320
3321 if (0) {
3322 enum pipe_format fmt = static_texture_state->format;
3323 debug_printf("Sample from %s\n", util_format_name(fmt));
3324 }
3325
3326 const enum lp_sampler_lod_property lod_property =
3327 (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
3328 LP_SAMPLER_LOD_PROPERTY_SHIFT;
3329 const enum lp_sampler_lod_control lod_control =
3330 (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3331 LP_SAMPLER_LOD_CONTROL_SHIFT;
3332 const enum lp_sampler_op_type op_type =
3333 (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3334 LP_SAMPLER_OP_TYPE_SHIFT;
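/*
 * sample_key packs op type, lod control, lod property and related flags
 * into one word, so a single integer identifies the variant being built.
 */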
3335
3336 const boolean fetch_ms = !!(sample_key & LP_SAMPLER_FETCH_MS);
3337 const boolean op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
3338 const boolean op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
3339 const boolean op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
3340
3341 LLVMValueRef lod_bias = NULL;
3342 LLVMValueRef explicit_lod = NULL;
3343 if (lod_control == LP_SAMPLER_LOD_BIAS) {
3344 lod_bias = lod;
3345 assert(lod);
3346 assert(derivs == NULL);
3347 }
3348 else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3349 explicit_lod = lod;
3350 assert(lod);
3351 assert(derivs == NULL);
3352 }
3353 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3354 assert(derivs);
3355 assert(lod == NULL);
3356 }
3357 else {
3358 assert(derivs == NULL);
3359 assert(lod == NULL);
3360 }
3361
3362 if (static_texture_state->format == PIPE_FORMAT_NONE) {
3363 /*
3364 * If there's nothing bound, format is NONE, and we must return
3365 * all zero as mandated by d3d10 in this case.
3366 */
3367 LLVMValueRef zero = lp_build_zero(gallivm, type);
3368 for (unsigned chan = 0; chan < 4; chan++) {
3369 texel_out[chan] = zero;
3370 }
3371 return;
3372 }
3373
3374 assert(type.floating);
3375
3376 /* Setup our build context */
3377 memset(&bld, 0, sizeof bld);
3378 bld.gallivm = gallivm;
3379 bld.context_ptr = context_ptr;
3380 bld.aniso_filter_table = aniso_filter_table;
3381 bld.static_sampler_state = &derived_sampler_state;
3382 bld.static_texture_state = static_texture_state;
3383 bld.dynamic_state = dynamic_state;
3384 bld.format_desc = util_format_description(static_texture_state->format);
3385 bld.dims = dims;
3386
3387 if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
3388 bld.no_quad_lod = TRUE;
3389 }
3390 if (!(gallivm_perf & GALLIVM_PERF_RHO_APPROX) || op_is_lodq) {
3391 bld.no_rho_approx = TRUE;
3392 }
3393 if (!(gallivm_perf & GALLIVM_PERF_BRILINEAR) || op_is_lodq || lod_bias || explicit_lod) {
3394 bld.no_brilinear = TRUE;
3395 }
3396
3397 bld.vector_width = lp_type_width(type);
3398
3399 bld.float_type = lp_type_float(32);
3400 bld.int_type = lp_type_int(32);
3401 bld.coord_type = type;
3402 bld.int_coord_type = lp_int_type(type);
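/* (w, h, d, _) as a 4-wide size vector for 2D/3D textures, a scalar for 1D. */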
3403 bld.float_size_in_type = lp_type_float(32);
3404 bld.float_size_in_type.length = dims > 1 ? 4 : 1;
3405 bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
3406
3407 bld.texel_type = lp_build_texel_type(type, bld.format_desc);
3408
3409 if (!static_texture_state->level_zero_only ||
3410 !static_sampler_state->max_lod_pos || op_is_lodq) {
3411 derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
3412 } else {
3413 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3414 }
3415
3416 if (op_is_gather) {
3417 /*
3418 * gather4 is exactly like GL_LINEAR filtering but skips the actual
3419 * filtering at the end. It uses mostly the same paths, so cube face
3420 * selection, coord wrapping etc. all naturally use the same code.
3421 */
3422 derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3423 derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
3424 derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
3425 }
3426
3427 const enum pipe_tex_mipfilter mip_filter =
3428 derived_sampler_state.min_mip_filter;
3429
3430 if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3431 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3432 /*
3433 * Seamless filtering ignores wrap modes.
3434 * Setting to CLAMP_TO_EDGE is correct for nearest filtering; for
3435 * bilinear it's not correct, but way better than using for instance
3436 * repeat. Note we even set this for non-seamless. Technically GL
3437 * allows any wrap mode, which made sense when supporting true borders
3438 * (can get seamless effect with border and CLAMP_TO_BORDER), but
3439 * gallium doesn't support borders and d3d9 requires wrap modes to be
3440 * ignored and it's a pain to fix up the sampler state (as it makes it
3441 * texture dependent).
3442 */
3443 derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
3444 derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
3445 }
3446
3447 /*
3448 * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
3449 * so the AoS path could be used. Not sure it's worth the trouble...
3450 */
3451 const enum pipe_tex_filter min_img_filter =
3452 derived_sampler_state.min_img_filter;
3453 const enum pipe_tex_filter mag_img_filter =
3454 derived_sampler_state.mag_img_filter;
3455
3456 /*
3457 * This is all a bit complicated; different paths are chosen for
3458 * performance reasons.
3459 * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
3460 * everything (the last two options are equivalent for the 4-wide case).
3461 * If there's per-quad lod but we split to 4-wide so we can use AoS, the
3462 * per-quad lod is calculated and the lod value extracted afterwards, making
3463 * this case basically the same, as far as lod handling in the further
3464 * sample/filter code is concerned, as the 1-lod-for-everything case.
3465 * Different lod handling mostly shows up when building mipmap sizes
3466 * (lp_build_mipmap_level_sizes() and friends) and also in filtering
3467 * (getting the fractional part of the lod to the right texels).
3468 */
3469
3470 /*
3471 * There are other situations where at least the multiple int lods could be
3472 * avoided, like min and max lod being equal.
3473 */
3474 bld.num_mips = bld.num_lods = 1;
3475
3476 if (bld.no_quad_lod && bld.no_rho_approx &&
3477 ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
3478 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3479 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
3480 op_is_lodq)) {
3481 /*
3482 * special case for using per-pixel lod even for implicit lod,
3483 * which is generally never required (ok by APIs) except to please
3484 * some (somewhat broken imho) tests (because per-pixel face selection
3485 * can cause derivatives to be different for pixels outside the primitive
3486 * due to the major axis division even if pre-project derivatives are
3487 * looking normal).
3488 * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
3489 * cube maps we do indeed get per-pixel lod values).
3490 */
3491 bld.num_mips = type.length;
3492 bld.num_lods = type.length;
3493 }
3494 else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
3495 (explicit_lod || lod_bias || derivs)) {
3496 if ((!op_is_tex && target != PIPE_BUFFER) ||
3497 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3498 bld.num_mips = type.length;
3499 bld.num_lods = type.length;
3500 }
3501 else if (op_is_tex && min_img_filter != mag_img_filter) {
3502 bld.num_mips = 1;
3503 bld.num_lods = type.length;
3504 }
3505 }
3506 /* TODO: for true scalar_lod should only use 1 lod value */
3507 else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
3508 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3509 bld.num_mips = num_quads;
3510 bld.num_lods = num_quads;
3511 }
3512 else if (op_is_tex && min_img_filter != mag_img_filter) {
3513 bld.num_mips = 1;
3514 bld.num_lods = num_quads;
3515 }
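/*
 * At this point num_mips/num_lods are each 1, num_quads or type.length:
 * one value shared by all pixels, one per quad, or one per element.
 */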
3516
3517 bld.fetch_ms = fetch_ms;
3518 if (op_is_gather)
3519 bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
3520 bld.lodf_type = type;
3521 /* we want native vector size to be able to use our intrinsics */
3522 if (bld.num_lods != type.length) {
3523 /* TODO: this currently always has to be per-quad or per-element */
3524 bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
3525 }
3526 bld.lodi_type = lp_int_type(bld.lodf_type);
3527 bld.levelf_type = bld.lodf_type;
3528 if (bld.num_mips == 1) {
3529 bld.levelf_type.length = 1;
3530 }
3531 bld.leveli_type = lp_int_type(bld.levelf_type);
3532 bld.float_size_type = bld.float_size_in_type;
3533
3534 /* Note: size vectors may not be native. They contain minified w/h/d/_
3535 * values, with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/..., so up
3536 * to 8x4 f32.
3537 */
3538 if (bld.num_mips > 1) {
3539 bld.float_size_type.length = bld.num_mips == type.length ?
3540 bld.num_mips * bld.float_size_in_type.length :
3541 type.length;
3542 }
3543 bld.int_size_type = lp_int_type(bld.float_size_type);
3544
3545 lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
3546 lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3547 lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3548 lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3549 lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3550 lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3551 lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3552 lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3553 lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3554 lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3555 lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3556 lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3557 lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3558 lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3559
3560 /* Get the dynamic state */
3561 LLVMValueRef tex_width = dynamic_state->width(dynamic_state, gallivm,
3562 context_ptr, texture_index,
3563 NULL);
3564 bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3565 context_ptr, texture_index, NULL);
3566 bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3567 context_ptr, texture_index, NULL);
3568 bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3569 context_ptr, texture_index, NULL);
3570 bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3571 context_ptr, texture_index, NULL);
3572
3573 if (fetch_ms) {
3574 bld.sample_stride =
3575 lp_build_broadcast_scalar(&bld.int_coord_bld,
3576 dynamic_state->sample_stride(dynamic_state,
3577 gallivm,
3578 context_ptr,
3579 texture_index,
3580 NULL));
3581 }
3582
3583 /* Note that mip_offsets is an array[level] of offsets to texture images */
3584
3585 if (dynamic_state->cache_ptr && thread_data_ptr) {
3586 bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3587 thread_data_ptr, texture_index);
3588 }
3589
3590 /* width, height, depth as single int vector */
3591 if (dims <= 1) {
3592 bld.int_size = tex_width;
3593 }
3594 else {
3595 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3596 tex_width,
3597 LLVMConstInt(i32t, 0, 0), "");
3598 if (dims >= 2) {
3599 LLVMValueRef tex_height =
3600 dynamic_state->height(dynamic_state, gallivm,
3601 context_ptr, texture_index, NULL);
3602 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3603 tex_height,
3604 LLVMConstInt(i32t, 1, 0), "");
3605 if (dims >= 3) {
3606 LLVMValueRef tex_depth =
3607 dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3608 texture_index, NULL);
3609 bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3610 tex_depth,
3611 LLVMConstInt(i32t, 2, 0), "");
3612 }
3613 }
3614 }
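/*
 * bld.int_size now holds (w, h, d) in its first dims lanes; the
 * remaining lanes stay undef.
 */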
3615
3616 LLVMValueRef newcoords[5];
3617 for (unsigned i = 0; i < 5; i++) {
3618 newcoords[i] = coords[i];
3619 }
3620
3621 if (util_format_is_pure_integer(static_texture_state->format) &&
3622 !util_format_has_depth(bld.format_desc) && op_is_tex &&
3623 (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3624 static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3625 static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3626 /*
3627 * Bail if impossible filtering is specified (the awkward additional
3628 * depth check is because it is legal in gallium to have things like
3629 * S8Z24 here which would say it's pure int even though such formats
3630 * should sample the depth component).
3631 * In GL such filters make the texture incomplete; this makes it robust
3632 * against gallium frontends which set this up regardless (we'd crash in
3633 * the lerp later otherwise).
3634 * At least in some apis it may be legal to use such filters with lod
3635 * queries and/or gather (at least for gather d3d10 says only the wrap
3636 * bits are really used, hence the filter bits are likely simply ignored).
3637 * For fetch, we don't get valid samplers either way here.
3638 */
3639 LLVMValueRef zero = lp_build_zero(gallivm, type);
3640 for (unsigned chan = 0; chan < 4; chan++) {
3641 texel_out[chan] = zero;
3642 }
3643 return;
3644 }
3645
3646 if (0) {
3647 /* For debug: no-op texture sampling */
3648 lp_build_sample_nop(gallivm,
3649 bld.texel_type,
3650 newcoords,
3651 texel_out);
3652 } else if (op_type == LP_SAMPLER_OP_FETCH) {
3653 lp_build_fetch_texel(&bld, texture_index, ms_index, newcoords,
3654 lod, offsets, texel_out);
3655 } else {
3656 LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3657 LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
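/*
 * The AoS path does its filtering in (8-bit) fixed point, hence only
 * formats which fit in unorm8 qualify, and only without depth compare
 * or anisotropic filtering.
 */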
3658 boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
3659 op_is_tex &&
3660 /* not sure this is strictly needed or simply impossible */
3661 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3662 derived_sampler_state.aniso == 0 &&
3663 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3664
3665 use_aos &= bld.num_lods <= num_quads ||
3666 derived_sampler_state.min_img_filter ==
3667 derived_sampler_state.mag_img_filter;
3668
3669 if (gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
3670 use_aos = 0;
3671 }
3672
3673 if (dims > 1) {
3674 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3675 if (dims > 2) {
3676 use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3677 }
3678 }
3679 if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3680 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3681 derived_sampler_state.seamless_cube_map &&
3682 (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3683 derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3684 /* theoretically possible with AoS filtering but not implemented (complex!) */
3685 use_aos = 0;
3686 }
3687
3688 if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3689 !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3690 debug_printf("%s: using floating point linear filtering for %s\n",
3691 __FUNCTION__, bld.format_desc->short_name);
3692 debug_printf(" min_img %d mag_img %d mip %d target %d seamless %d"
3693 " wraps %d wrapt %d wrapr %d\n",
3694 derived_sampler_state.min_img_filter,
3695 derived_sampler_state.mag_img_filter,
3696 derived_sampler_state.min_mip_filter,
3697 static_texture_state->target,
3698 derived_sampler_state.seamless_cube_map,
3699 derived_sampler_state.wrap_s,
3700 derived_sampler_state.wrap_t,
3701 derived_sampler_state.wrap_r);
3702 }
3703
3704 lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3705 newcoords, derivs, lod_bias, explicit_lod,
3706 &lod_positive, &lod, &lod_fpart,
3707 &ilevel0, &ilevel1);
3708
3709 if (op_is_lodq) {
3710 texel_out[0] = lod_fpart;
3711 texel_out[1] = lod;
3712 texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3713 return;
3714 }
3715
3716 if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3717 /* The aos path doesn't do seamless filtering so simply add cube layer
3718 * to face now.
3719 */
3720 newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3721 }
3722
3723 /*
3724 * We only try 8-wide sampling with soa, or if we have AVX2
3725 * (as it appears to be a loss with just AVX).
3726 */
3727 if (num_quads == 1 || !use_aos ||
3728 (util_get_cpu_caps()->has_avx2 &&
3729 (bld.num_lods == 1 ||
3730 derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3731 if (use_aos) {
3732 /* do sampling/filtering with fixed pt arithmetic */
3733 lp_build_sample_aos(&bld, sampler_index,
3734 newcoords[0], newcoords[1],
3735 newcoords[2],
3736 offsets, lod_positive, lod_fpart,
3737 ilevel0, ilevel1,
3738 texel_out);
3739 } else {
3740 lp_build_sample_general(&bld, sampler_index,
3741 op_type == LP_SAMPLER_OP_GATHER,
3742 newcoords, offsets,
3743 lod_positive, lod_fpart,
3744 ilevel0, ilevel1,
3745 texel_out);
3746 }
3747 }
3748 else {
3749 struct lp_build_sample_context bld4;
3750 struct lp_type type4 = type;
3751 LLVMValueRef texelout4[4];
3752 LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3753
3754 type4.length = 4;
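/*
 * 8-wide AoS sampling is not profitable here, so split the work into
 * 4-wide quads, sample each with a separate 4-wide build context, and
 * concatenate the per-quad results at the end.
 */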
3755
3756 /* Setup our build context */
3757 memset(&bld4, 0, sizeof bld4);
3758 bld4.no_quad_lod = bld.no_quad_lod;
3759 bld4.no_rho_approx = bld.no_rho_approx;
3760 bld4.no_brilinear = bld.no_brilinear;
3761 bld4.gallivm = bld.gallivm;
3762 bld4.context_ptr = bld.context_ptr;
3763 bld4.aniso_filter_table = aniso_filter_table;
3764 bld4.static_texture_state = bld.static_texture_state;
3765 bld4.static_sampler_state = bld.static_sampler_state;
3766 bld4.dynamic_state = bld.dynamic_state;
3767 bld4.format_desc = bld.format_desc;
3768 bld4.dims = bld.dims;
3769 bld4.row_stride_array = bld.row_stride_array;
3770 bld4.img_stride_array = bld.img_stride_array;
3771 bld4.base_ptr = bld.base_ptr;
3772 bld4.mip_offsets = bld.mip_offsets;
3773 bld4.int_size = bld.int_size;
3774 bld4.cache = bld.cache;
3775
3776 bld4.vector_width = lp_type_width(type4);
3777
3778 bld4.float_type = lp_type_float(32);
3779 bld4.int_type = lp_type_int(32);
3780 bld4.coord_type = type4;
3781 bld4.int_coord_type = lp_int_type(type4);
3782 bld4.float_size_in_type = lp_type_float(32);
3783 bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3784 bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3785 bld4.texel_type = bld.texel_type;
3786 bld4.texel_type.length = 4;
3787
3788 bld4.num_mips = bld4.num_lods = 1;
3789 if (bld4.no_quad_lod && bld4.no_rho_approx &&
3790 (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3791 static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3792 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3793 bld4.num_mips = type4.length;
3794 bld4.num_lods = type4.length;
3795 }
3796 if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3797 (explicit_lod || lod_bias || derivs)) {
3798 if ((!op_is_tex && target != PIPE_BUFFER) ||
3799 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3800 bld4.num_mips = type4.length;
3801 bld4.num_lods = type4.length;
3802 }
3803 else if (op_is_tex && min_img_filter != mag_img_filter) {
3804 bld4.num_mips = 1;
3805 bld4.num_lods = type4.length;
3806 }
3807 }
3808
3809 /* we want native vector size to be able to use our intrinsics */
3810 bld4.lodf_type = type4;
3811 if (bld4.num_lods != type4.length) {
3812 bld4.lodf_type.length = 1;
3813 }
3814 bld4.lodi_type = lp_int_type(bld4.lodf_type);
3815 bld4.levelf_type = type4;
3816 if (bld4.num_mips != type4.length) {
3817 bld4.levelf_type.length = 1;
3818 }
3819 bld4.leveli_type = lp_int_type(bld4.levelf_type);
3820 bld4.float_size_type = bld4.float_size_in_type;
3821 if (bld4.num_mips > 1) {
3822 bld4.float_size_type.length = bld4.num_mips == type4.length ?
3823 bld4.num_mips * bld4.float_size_in_type.length :
3824 type4.length;
3825 }
3826 bld4.int_size_type = lp_int_type(bld4.float_size_type);
3827
3828 lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3829 lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3830 lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3831 lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3832 lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3833 lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3834 lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3835 lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3836 lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3837 lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3838 lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3839 lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3840 lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3841 lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3842
3843 for (unsigned i = 0; i < num_quads; i++) {
3844 LLVMValueRef s4, t4, r4;
3845 LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3846 LLVMValueRef ilevel04, ilevel14 = NULL;
3847 LLVMValueRef offsets4[4] = { NULL };
3848 unsigned num_lods = bld4.num_lods;
3849
3850 s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3851 t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3852 r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3853
3854 if (offsets[0]) {
3855 offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3856 if (dims > 1) {
3857 offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3858 if (dims > 2) {
3859 offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3860 }
3861 }
3862 }
3863 lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3864 ilevel04 = bld.num_mips == 1 ? ilevel0 :
3865 lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3866 if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3867 ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3868 lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3869 }
3870
3871 if (use_aos) {
3872 /* do sampling/filtering with fixed pt arithmetic */
3873 lp_build_sample_aos(&bld4, sampler_index,
3874 s4, t4, r4, offsets4,
3875 lod_positive4, lod_fpart4,
3876 ilevel04, ilevel14,
3877 texelout4);
3878 }
3879
3880 else {
3881 /* this path is currently unreachable and hence might break easily... */
3882 LLVMValueRef newcoords4[5];
3883 newcoords4[0] = s4;
3884 newcoords4[1] = t4;
3885 newcoords4[2] = r4;
3886 newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3887 newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3888
3889 lp_build_sample_general(&bld4, sampler_index,
3890 op_type == LP_SAMPLER_OP_GATHER,
3891 newcoords4, offsets4,
3892 lod_positive4, lod_fpart4,
3893 ilevel04, ilevel14,
3894 texelout4);
3895 }
3896 for (unsigned j = 0; j < 4; j++) {
3897 texelouttmp[j][i] = texelout4[j];
3898 }
3899 }
3900
3901 for (unsigned j = 0; j < 4; j++) {
3902 texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3903 }
3904 }
3905 }
3906
3907 if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3908 apply_sampler_swizzle(&bld, texel_out);
3909 }
3910
3911 /*
3912 * texel type can be a (32bit) int/uint (for pure int formats only);
3913 * however, we are expected to always return floats (storage is untyped).
3914 */
3915 if (!bld.texel_type.floating) {
3916 unsigned chan;
3917 for (chan = 0; chan < 4; chan++) {
3918 texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3919 lp_build_vec_type(gallivm, type), "");
3920 }
3921 }
3922 }
3923
3924
3925 #define USE_TEX_FUNC_CALL 1
3926
3927 #define LP_MAX_TEX_FUNC_ARGS 32
3928
3929 static inline void
3930 get_target_info(enum pipe_texture_target target,
3931 unsigned *num_coords, unsigned *num_derivs,
3932 unsigned *num_offsets, unsigned *layer)
3933 {
3934 unsigned dims = texture_dims(target);
3935 *num_coords = dims;
3936 *num_offsets = dims;
3937 *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3938 target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3939 *layer = has_layer_coord(target) ? 2 : 0;
3940 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3941 /*
3942 * dims doesn't include r coord for cubes - this is handled
3943 * by layer instead, but we need to fix up for cube arrays...
3944 */
3945 *layer = 3;
3946 *num_coords = 3;
3947 }
3948 }
3949
3950
3951 /**
3952 * Generate the function body for a texture sampling function.
3953 */
3954 static void
3955 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3956 const struct lp_static_texture_state *static_texture_state,
3957 const struct lp_static_sampler_state *static_sampler_state,
3958 struct lp_sampler_dynamic_state *dynamic_state,
3959 struct lp_type type,
3960 unsigned texture_index,
3961 unsigned sampler_index,
3962 LLVMValueRef function,
3963 unsigned num_args,
3964 unsigned sample_key,
3965 bool has_aniso_filter_table)
3966 {
3967 LLVMBuilderRef old_builder;
3968 LLVMBasicBlockRef block;
3969 LLVMValueRef coords[5];
3970 LLVMValueRef offsets[3] = { NULL };
3971 LLVMValueRef lod = NULL;
3972 LLVMValueRef ms_index = NULL;
3973 LLVMValueRef context_ptr;
3974 LLVMValueRef thread_data_ptr = NULL;
3975 LLVMValueRef aniso_filter_table = NULL;
3976 LLVMValueRef texel_out[4];
3977 struct lp_derivatives derivs;
3978 struct lp_derivatives *deriv_ptr = NULL;
3979 unsigned num_param = 0;
3980 unsigned num_coords, num_derivs, num_offsets, layer;
3981 enum lp_sampler_lod_control lod_control;
3982 enum lp_sampler_op_type op_type;
3983 boolean need_cache = FALSE;
3984
3985 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3986 LP_SAMPLER_LOD_CONTROL_SHIFT;
3987
3988 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3989 LP_SAMPLER_OP_TYPE_SHIFT;
3990
3991 get_target_info(static_texture_state->target,
3992 &num_coords, &num_derivs, &num_offsets, &layer);
3993
3994 /* lod query doesn't take a layer */
3995 if (layer && op_type == LP_SAMPLER_OP_LODQ)
3996 layer = 0;
3997
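/*
 * Only S3TC-layout formats go through the per-thread decode cache, so
 * the extra thread data argument is needed just for those.
 */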
3998 if (dynamic_state->cache_ptr) {
3999 const struct util_format_description *format_desc;
4000 format_desc = util_format_description(static_texture_state->format);
4001 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
4002 need_cache = TRUE;
4003 }
4004 }
4005
4006 /* "unpack" arguments */
4007 context_ptr = LLVMGetParam(function, num_param++);
4008 if (has_aniso_filter_table)
4009 aniso_filter_table = LLVMGetParam(function, num_param++);
4010 if (need_cache) {
4011 thread_data_ptr = LLVMGetParam(function, num_param++);
4012 }
4013 for (unsigned i = 0; i < num_coords; i++) {
4014 coords[i] = LLVMGetParam(function, num_param++);
4015 }
4016 for (unsigned i = num_coords; i < 5; i++) {
4017 /* This is rather unfortunate... */
4018 coords[i] = lp_build_undef(gallivm, type);
4019 }
4020 if (layer) {
4021 coords[layer] = LLVMGetParam(function, num_param++);
4022 }
4023 if (sample_key & LP_SAMPLER_SHADOW) {
4024 coords[4] = LLVMGetParam(function, num_param++);
4025 }
4026 if (sample_key & LP_SAMPLER_FETCH_MS) {
4027 ms_index = LLVMGetParam(function, num_param++);
4028 }
4029 if (sample_key & LP_SAMPLER_OFFSETS) {
4030 for (unsigned i = 0; i < num_offsets; i++) {
4031 offsets[i] = LLVMGetParam(function, num_param++);
4032 }
4033 }
4034 if (lod_control == LP_SAMPLER_LOD_BIAS ||
4035 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4036 lod = LLVMGetParam(function, num_param++);
4037 }
4038 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4039 for (unsigned i = 0; i < num_derivs; i++) {
4040 derivs.ddx[i] = LLVMGetParam(function, num_param++);
4041 derivs.ddy[i] = LLVMGetParam(function, num_param++);
4042 }
4043 deriv_ptr = &derivs;
4044 }
4045
4046 assert(num_args == num_param);
4047
4048 /*
4049 * Function body
4050 */
4051
4052 old_builder = gallivm->builder;
4053 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
4054 gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
4055 LLVMPositionBuilderAtEnd(gallivm->builder, block);
4056
4057 lp_build_sample_soa_code(gallivm,
4058 static_texture_state,
4059 static_sampler_state,
4060 dynamic_state,
4061 type,
4062 sample_key,
4063 texture_index,
4064 sampler_index,
4065 context_ptr,
4066 thread_data_ptr,
4067 coords,
4068 offsets,
4069 deriv_ptr,
4070 lod,
4071 ms_index,
4072 aniso_filter_table,
4073 texel_out);
4074
4075 LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
4076
4077 LLVMDisposeBuilder(gallivm->builder);
4078 gallivm->builder = old_builder;
4079
4080 gallivm_verify_function(gallivm, function);
4081 }
4082
4083
4084 /**
4085 * Call the matching function for texture sampling.
4086 * If there's no match, generate a new one.
4087 */
4088 static void
4089 lp_build_sample_soa_func(struct gallivm_state *gallivm,
4090 const struct lp_static_texture_state *static_texture_state,
4091 const struct lp_static_sampler_state *static_sampler_state,
4092 struct lp_sampler_dynamic_state *dynamic_state,
4093 const struct lp_sampler_params *params,
4094 int texture_index, int sampler_index,
4095 LLVMValueRef *tex_ret)
4096 {
4097 LLVMBuilderRef builder = gallivm->builder;
4098 LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
4099 LLVMGetInsertBlock(builder)));
4100 LLVMValueRef function, inst;
4101 LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
4102 LLVMBasicBlockRef bb;
4103 unsigned num_args = 0;
4104 char func_name[64];
4105 unsigned i, num_coords, num_derivs, num_offsets, layer;
4106 unsigned sample_key = params->sample_key;
4107 const LLVMValueRef *coords = params->coords;
4108 const LLVMValueRef *offsets = params->offsets;
4109 const struct lp_derivatives *derivs = params->derivs;
4110 enum lp_sampler_lod_control lod_control;
4111 enum lp_sampler_op_type op_type;
4112 boolean need_cache = FALSE;
4113
4114 lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
4115 LP_SAMPLER_LOD_CONTROL_SHIFT;
4116
4117 op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
4118 LP_SAMPLER_OP_TYPE_SHIFT;
4119
4120 get_target_info(static_texture_state->target,
4121 &num_coords, &num_derivs, &num_offsets, &layer);
4122
4123 /* lod query doesn't take a layer */
4124 if (layer && op_type == LP_SAMPLER_OP_LODQ)
4125 layer = 0;
4126
4127 if (dynamic_state->cache_ptr) {
4128 const struct util_format_description *format_desc;
4129 format_desc = util_format_description(static_texture_state->format);
4130 if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
4131 need_cache = TRUE;
4132 }
4133 }
4134 /*
4135 * texture function matches are found by name.
4136 * Thus the name has to include both the texture and sampler unit
4137 * (which covers all static state) plus the actual texture function
4138 * (including things like offsets, shadow coord, lod control).
4139 * The lod_property has to be included as well.
4140 */
4141
4142 snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
4143 texture_index, sampler_index, sample_key);
4144
4145 function = LLVMGetNamedFunction(module, func_name);
4146
4147 LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
4148 LLVMTypeRef ret_type;
4149 LLVMTypeRef val_type[4];
4150 unsigned num_param = 0;
4151
4152 /*
4153 * Generate the function prototype.
4154 */
4155
4156 arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
4157 if (params->aniso_filter_table)
4158 arg_types[num_param++] = LLVMTypeOf(params->aniso_filter_table);
4159 if (need_cache) {
4160 arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
4161 }
4162 for (i = 0; i < num_coords; i++) {
4163 arg_types[num_param++] = LLVMTypeOf(coords[0]);
4164 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
4165 }
4166 if (layer) {
4167 arg_types[num_param++] = LLVMTypeOf(coords[layer]);
4168 assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
4169 }
4170 if (sample_key & LP_SAMPLER_SHADOW) {
4171 arg_types[num_param++] = LLVMTypeOf(coords[0]);
4172 }
4173 if (sample_key & LP_SAMPLER_FETCH_MS) {
4174 arg_types[num_param++] = LLVMTypeOf(params->ms_index);
4175 }
4176 if (sample_key & LP_SAMPLER_OFFSETS) {
4177 for (i = 0; i < num_offsets; i++) {
4178 arg_types[num_param++] = LLVMTypeOf(offsets[0]);
4179 assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
4180 }
4181 }
4182 if (lod_control == LP_SAMPLER_LOD_BIAS ||
4183 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4184 arg_types[num_param++] = LLVMTypeOf(params->lod);
4185 }
4186 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4187 for (i = 0; i < num_derivs; i++) {
4188 arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
4189 arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
4190 assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
4191 assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
4192 }
4193 }
4194
4195 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
4196 lp_build_vec_type(gallivm, params->type);
4197 ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
4198 LLVMTypeRef function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
4199
4200 if (!function) {
4201 function = LLVMAddFunction(module, func_name, function_type);
4202
4203 for (i = 0; i < num_param; ++i) {
4204 if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
4205
4206 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
4207 }
4208 }
4209
4210 LLVMSetFunctionCallConv(function, LLVMFastCallConv);
4211 LLVMSetLinkage(function, LLVMInternalLinkage);
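/*
 * fastcc frees LLVM from ABI constraints on argument passing, and
 * internal linkage lets it discard the function once all call sites
 * have been inlined.
 */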
4212
4213 lp_build_sample_gen_func(gallivm,
4214 static_texture_state,
4215 static_sampler_state,
4216 dynamic_state,
4217 params->type,
4218 texture_index,
4219 sampler_index,
4220 function,
4221 num_param,
4222 sample_key,
4223 params->aniso_filter_table ? true : false);
4224 }
4225
4226 num_args = 0;
4227 args[num_args++] = params->context_ptr;
4228 if (params->aniso_filter_table)
4229 args[num_args++] = params->aniso_filter_table;
4230 if (need_cache) {
4231 args[num_args++] = params->thread_data_ptr;
4232 }
4233 for (i = 0; i < num_coords; i++) {
4234 args[num_args++] = coords[i];
4235 }
4236 if (layer) {
4237 args[num_args++] = coords[layer];
4238 }
4239 if (sample_key & LP_SAMPLER_SHADOW) {
4240 args[num_args++] = coords[4];
4241 }
4242 if (sample_key & LP_SAMPLER_FETCH_MS) {
4243 args[num_args++] = params->ms_index;
4244 }
4245 if (sample_key & LP_SAMPLER_OFFSETS) {
4246 for (i = 0; i < num_offsets; i++) {
4247 args[num_args++] = offsets[i];
4248 }
4249 }
4250 if (lod_control == LP_SAMPLER_LOD_BIAS ||
4251 lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4252 args[num_args++] = params->lod;
4253 }
4254 else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4255 for (i = 0; i < num_derivs; i++) {
4256 args[num_args++] = derivs->ddx[i];
4257 args[num_args++] = derivs->ddy[i];
4258 }
4259 }
4260
4261 assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
4262
4263 *tex_ret = LLVMBuildCall2(builder, function_type, function, args, num_args, "");
4264 bb = LLVMGetInsertBlock(builder);
4265 inst = LLVMGetLastInstruction(bb);
4266 LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
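/*
 * LLVM requires the call site calling convention to match the callee's,
 * so tag the call fastcc as well.
 */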
4267
4268 }
4269
4270
4271 /**
4272 * Build texture sampling code.
4273 * Either via a function call or inline it directly.
4274 */
4275 void
4276 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
4277 const struct lp_static_sampler_state *static_sampler_state,
4278 struct lp_sampler_dynamic_state *dynamic_state,
4279 struct gallivm_state *gallivm,
4280 const struct lp_sampler_params *params)
4281 {
4282 boolean use_tex_func = FALSE;
4283
4284 /*
4285 * Do not use a function call if the sampling is "simple enough".
4286 * We define this by
4287 * a) format
4288 * b) no mips (either one level only or no mip filter).
4289 * No mips will definitely make the code smaller, though
4290 * the format requirement is a bit iffy - there are some (SoA) formats
4291 * which definitely generate less code. This does happen to catch
4292 * some important cases, though, which are hurt quite a bit by using
4293 * a call (not really because of the call overhead, but because
4294 * they reuse the same texture unit with some of the same
4295 * parameters).
4296 * Ideally we'd let llvm recognize this stuff by doing IPO passes.
4297 */
4298
4299 if (USE_TEX_FUNC_CALL) {
4300 const struct util_format_description *format_desc =
4301 util_format_description(static_texture_state->format);
4302 const boolean simple_format =
4303 (util_format_is_rgba8_variant(format_desc) &&
4304 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
4305 const enum lp_sampler_op_type op_type =
4306 (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
4307 LP_SAMPLER_OP_TYPE_SHIFT;
4308 const boolean simple_tex =
4309 op_type != LP_SAMPLER_OP_TEXTURE ||
4310 ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
4311 static_texture_state->level_zero_only == TRUE) &&
4312 static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
4313
4314 use_tex_func = !(simple_format && simple_tex);
4315 }
4316
4317 if (use_tex_func) {
4318 LLVMValueRef tex_ret;
4319 lp_build_sample_soa_func(gallivm,
4320 static_texture_state,
4321 static_sampler_state,
4322 dynamic_state,
4323 params, params->texture_index,
4324 params->sampler_index, &tex_ret);
4325
4326 for (unsigned i = 0; i < 4; i++) {
4327 params->texel[i] =
4328 LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
4329 }
4330 }
4331 else {
4332 lp_build_sample_soa_code(gallivm,
4333 static_texture_state,
4334 static_sampler_state,
4335 dynamic_state,
4336 params->type,
4337 params->sample_key,
4338 params->texture_index,
4339 params->sampler_index,
4340 params->context_ptr,
4341 params->thread_data_ptr,
4342 params->coords,
4343 params->offsets,
4344 params->derivs,
4345 params->lod,
4346 params->ms_index,
4347 params->aniso_filter_table,
4348 params->texel);
4349 }
4350 }
4351
4352
4353 void
4354 lp_build_size_query_soa(struct gallivm_state *gallivm,
4355 const struct lp_static_texture_state *static_state,
4356 struct lp_sampler_dynamic_state *dynamic_state,
4357 const struct lp_sampler_size_query_params *params)
4358 {
4359 LLVMValueRef lod, level = 0, size;
4360 LLVMValueRef first_level = NULL;
4361 unsigned num_lods = 1;
4362 struct lp_build_context bld_int_vec4;
4363 LLVMValueRef context_ptr = params->context_ptr;
4364 unsigned texture_unit = params->texture_unit;
4365 unsigned target = params->target;
4366 LLVMValueRef texture_unit_offset = params->texture_unit_offset;
4367
4368 if (static_state->format == PIPE_FORMAT_NONE) {
4369 /*
4370 * If there's nothing bound, format is NONE, and we must return
4371 * all zero as mandated by d3d10 in this case.
4372 */
4373 LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
4374 for (unsigned chan = 0; chan < 4; chan++) {
4375 params->sizes_out[chan] = zero;
4376 }
4377 return;
4378 }
4379
4380 /*
4381 * Do some sanity verification about bound texture and shader dcl target.
4382 * Not entirely sure what's possible but assume array/non-array
4383 * always compatible (probably not ok for OpenGL but d3d10 has no
4384 * distinction of arrays at the resource level).
4385 * Everything else looks bogus (though not entirely sure about rect/2d).
4386 * Currently disabled because it causes assertion failures if there's
4387 * nothing bound (or rather a dummy texture, not that this case would
4388 * return the right values).
4389 */
4390 if (0 && static_state->target != target) {
4391 if (static_state->target == PIPE_TEXTURE_1D)
4392 assert(target == PIPE_TEXTURE_1D_ARRAY);
4393 else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
4394 assert(target == PIPE_TEXTURE_1D);
4395 else if (static_state->target == PIPE_TEXTURE_2D)
4396 assert(target == PIPE_TEXTURE_2D_ARRAY);
4397 else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
4398 assert(target == PIPE_TEXTURE_2D);
4399 else if (static_state->target == PIPE_TEXTURE_CUBE)
4400 assert(target == PIPE_TEXTURE_CUBE_ARRAY);
4401 else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
4402 assert(target == PIPE_TEXTURE_CUBE);
4403 else
4404 assert(0);
4405 }
4406
4407 const unsigned dims = texture_dims(target);
4408
4409 const boolean has_array = has_layer_coord(target);
4410
4411 assert(!params->int_type.floating);
4412
4413 lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
4414
4415 if (params->samples_only) {
4416 params->sizes_out[0] =
4417 lp_build_broadcast(gallivm,
4418 lp_build_vec_type(gallivm, params->int_type),
4419 dynamic_state->num_samples(dynamic_state, gallivm,
4420 context_ptr,
4421 texture_unit,
4422 texture_unit_offset));
4423 return;
4424 }
4425
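/*
 * With an explicit lod the level is made absolute (first_level + lod)
 * and broadcast, feeding the vec4 minify below.
 */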
4426 if (params->explicit_lod) {
4427 /* FIXME: this needs to honor per-element lod */
4428 lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
4429 lp_build_const_int32(gallivm, 0), "");
4430 first_level = dynamic_state->first_level(dynamic_state, gallivm,
4431 context_ptr, texture_unit,
4432 texture_unit_offset);
4433 level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
4434 lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
4435 } else {
4436 lod = bld_int_vec4.zero;
4437 }
4438
4439 size = bld_int_vec4.undef;
4440
4441 size = LLVMBuildInsertElement(gallivm->builder, size,
4442 dynamic_state->width(dynamic_state, gallivm,
4443 context_ptr,
4444 texture_unit,
4445 texture_unit_offset),
4446 lp_build_const_int32(gallivm, 0), "");
4447
4448 if (dims >= 2) {
4449 size = LLVMBuildInsertElement(gallivm->builder, size,
4450 dynamic_state->height(dynamic_state,
4451 gallivm, context_ptr,
4452 texture_unit,
4453 texture_unit_offset),
4454 lp_build_const_int32(gallivm, 1), "");
4455 }
4456
4457 if (dims >= 3) {
4458 size = LLVMBuildInsertElement(gallivm->builder, size,
4459 dynamic_state->depth(dynamic_state, gallivm,
4460 context_ptr,
4461 texture_unit,
4462 texture_unit_offset),
4463 lp_build_const_int32(gallivm, 2), "");
4464 }
4465
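/*
 * E.g. a 64x16 2D texture at lod 2 minifies to 16x4: each dimension is
 * max(size >> lod, 1), so a 64x4 texture at lod 4 would yield 4x1.
 */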
4466 size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
4467
4468 if (has_array) {
4469 LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
4470 context_ptr, texture_unit,
4471 texture_unit_offset);
4472 if (target == PIPE_TEXTURE_CUBE_ARRAY) {
4473 /*
4474 * It looks like GL wants the number of cubes, whereas d3d10.1 leaves
4475 * it undefined. Could avoid this by passing in the number of cubes
4476 * instead of the total number of layers (might make things easier
4477 * elsewhere too).
4478 LLVMValueRef six = lp_build_const_int32(gallivm, 6);
4479 layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
4480 }
4481 size = LLVMBuildInsertElement(gallivm->builder, size, layers,
4482 lp_build_const_int32(gallivm, dims), "");
4483 }
4484
4485 /*
4486 * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
4487 * if level is out of bounds (note this can't cover unbound texture
4488 * here, which also requires returning zero).
4489 */
4490 if (params->explicit_lod && params->is_sviewinfo) {
4491 LLVMValueRef last_level, out, out1;
4492 struct lp_build_context leveli_bld;
4493
4494 /* everything is scalar for now */
4495 lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
4496 last_level = dynamic_state->last_level(dynamic_state, gallivm,
4497 context_ptr, texture_unit,
4498 texture_unit_offset);
4499
4500 out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
4501 out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
4502 out = lp_build_or(&leveli_bld, out, out1);
4503 if (num_lods == 1) {
4504 out = lp_build_broadcast_scalar(&bld_int_vec4, out);
4505 }
4506 else {
4507 /* TODO */
4508 assert(0);
4509 }
4510 size = lp_build_andnot(&bld_int_vec4, size, out);
4511 }
4512
4513 unsigned i;
4514 for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
4515 params->sizes_out[i] =
4516 lp_build_extract_broadcast(gallivm, bld_int_vec4.type,
4517 params->int_type,
4518 size,
4519 lp_build_const_int32(gallivm, i));
4520 }
4521 if (params->is_sviewinfo) {
4522 for (; i < 4; i++) {
4523 params->sizes_out[i] = lp_build_const_vec(gallivm,
4524 params->int_type, 0.0);
4525 }
4526 }
4527
4528 /*
4529 * If there's no explicit_lod (buffers, rects), queries requiring the
4530 * number of mip levels would be illegal.
4531 */
4532 if (params->is_sviewinfo && params->explicit_lod) {
4533 struct lp_build_context bld_int_scalar;
4534 lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
4535
4536 LLVMValueRef num_levels;
4537 if (static_state->level_zero_only) {
4538 num_levels = bld_int_scalar.one;
4539 }
4540 else {
4541 LLVMValueRef last_level;
4542
4543 last_level = dynamic_state->last_level(dynamic_state, gallivm,
4544 context_ptr, texture_unit,
4545 texture_unit_offset);
4546 num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
4547 num_levels = lp_build_add(&bld_int_scalar, num_levels,
4548 bld_int_scalar.one);
4549 }
4550 params->sizes_out[3] =
4551 lp_build_broadcast(gallivm,
4552 lp_build_vec_type(gallivm, params->int_type),
4553 num_levels);
4554 }
4555 }
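
/*
 * A minimal usage sketch for the size query above. This is a hypothetical
 * caller; the field setup is illustrative only:
 *
 *    struct lp_sampler_size_query_params params;
 *    memset(&params, 0, sizeof params);
 *    params.int_type = lp_type_int_vec(32, 32 * 8);  // 8-wide i32 lanes
 *    params.target = PIPE_TEXTURE_2D;
 *    params.texture_unit = 0;
 *    params.context_ptr = context_ptr;
 *    params.explicit_lod = lod_vec;       // NULL for buffers/rects
 *    params.is_sviewinfo = TRUE;
 *    lp_build_size_query_soa(gallivm, &static_state, &dynamic_state, &params);
 *    // params.sizes_out[0..1]: per-lane width/height at the given lod,
 *    // params.sizes_out[3]: number of mip levels (sviewinfo only).
 */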
4556
4557
4558 static void
4559 lp_build_do_atomic_soa(struct gallivm_state *gallivm,
4560 const struct util_format_description *format_desc,
4561 struct lp_type type,
4562 LLVMValueRef exec_mask,
4563 LLVMValueRef base_ptr,
4564 LLVMValueRef offset,
4565 LLVMValueRef out_of_bounds,
4566 unsigned img_op,
4567 LLVMAtomicRMWBinOp op,
4568 const LLVMValueRef rgba_in[4],
4569 const LLVMValueRef rgba2_in[4],
4570 LLVMValueRef atomic_result[4])
4571 {
4572 const enum pipe_format format = format_desc->format;
4573
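/*
 * Image atomics are only handled for 32-bit r32 formats; for anything
 * else just return zero.
 */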
4574 if (format != PIPE_FORMAT_R32_UINT &&
4575 format != PIPE_FORMAT_R32_SINT &&
4576 format != PIPE_FORMAT_R32_FLOAT) {
4577 atomic_result[0] = lp_build_zero(gallivm, type);
4578 return;
4579 }
4580
4581 LLVMTypeRef atom_res_elem_type =
4582 LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length);
4583 LLVMValueRef atom_res = lp_build_alloca(gallivm, atom_res_elem_type, "");
4584
4585 offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
4586
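/*
 * There are no vector-wide atomics, so scalarize: loop over the SIMD
 * lanes and issue one 32-bit scalar atomic per active lane, gathering
 * the old values back into the result vector.
 */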
4587 struct lp_build_loop_state loop_state;
4588 lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
4589 struct lp_build_if_state ifthen;
4590 LLVMValueRef cond;
4591 LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
4592
4593 LLVMValueRef should_store_mask =
4594 LLVMBuildAnd(gallivm->builder, exec_mask,
4595 LLVMBuildNot(gallivm->builder, out_of_bounds, ""),
4596 "store_mask");
4597 assert(exec_mask);
4598
4599 cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask,
4600 lp_build_const_int_vec(gallivm, type, 0), "");
4601 cond = LLVMBuildExtractElement(gallivm->builder, cond,
4602 loop_state.counter, "");
4603 lp_build_if(&ifthen, gallivm, cond);
4604
4605 LLVMValueRef data =
4606 LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
4607 LLVMValueRef cast_base_ptr =
4608 LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
4609 cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr,
4610 LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
4611 data = LLVMBuildBitCast(gallivm->builder, data,
4612 LLVMInt32TypeInContext(gallivm->context), "");
4613
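/* For CAS, rgba_in carries the comparand and rgba2_in the swap value. */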
4614 if (img_op == LP_IMG_ATOMIC_CAS) {
4615 LLVMValueRef cas_src_ptr =
4616 LLVMBuildExtractElement(gallivm->builder, packed2,
4617 loop_state.counter, "");
4618 LLVMValueRef cas_src =
4619 LLVMBuildBitCast(gallivm->builder, cas_src_ptr,
4620 LLVMInt32TypeInContext(gallivm->context), "");
4621 data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
4622 cas_src,
4623 LLVMAtomicOrderingSequentiallyConsistent,
4624 LLVMAtomicOrderingSequentiallyConsistent,
4625 false);
4626 data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
4627 } else {
4628 data = LLVMBuildAtomicRMW(gallivm->builder, op,
4629 cast_base_ptr, data,
4630 LLVMAtomicOrderingSequentiallyConsistent,
4631 false);
4632 }
4633
4634 LLVMValueRef temp_res =
4635 LLVMBuildLoad2(gallivm->builder, atom_res_elem_type, atom_res, "");
4636 temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data,
4637 loop_state.counter, "");
4638 LLVMBuildStore(gallivm->builder, temp_res, atom_res);
4639
4640 lp_build_endif(&ifthen);
4641 lp_build_loop_end_cond(&loop_state,
4642 lp_build_const_int32(gallivm, type.length),
4643 NULL, LLVMIntUGE);
4644 atomic_result[0] = LLVMBuildLoad2(gallivm->builder, atom_res_elem_type,
4645 atom_res, "");
4646 }
4647
4648
4649 static void
4650 lp_build_img_op_no_format(struct gallivm_state *gallivm,
4651 const struct lp_img_params *params,
4652 LLVMValueRef outdata[4])
4653 {
4654 /*
4655 * If there's nothing bound, format is NONE, and we must return
4656 * all zero as mandated by d3d10 in this case.
4657 */
4658 if (params->img_op != LP_IMG_STORE) {
4659 LLVMValueRef zero = lp_build_zero(gallivm, params->type);
4660 for (unsigned chan = 0; chan < (params->img_op == LP_IMG_LOAD ? 4 : 1);
4661 chan++) {
4662 outdata[chan] = zero;
4663 }
4664 }
4665 }
4666
4667
4668 void
4669 lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
4670 struct lp_sampler_dynamic_state *dynamic_state,
4671 struct gallivm_state *gallivm,
4672 const struct lp_img_params *params,
4673 LLVMValueRef outdata[4])
4674 {
4675 const enum pipe_texture_target target = params->target;
4676 const unsigned dims = texture_dims(target);
4677 /** unsigned integer vector type for coords (same width/length as params->type) */
4678 struct lp_type int_coord_type;
4679 struct lp_build_context int_coord_bld;
4680 const struct util_format_description *format_desc =
4681 util_format_description(static_texture_state->format);
4682 LLVMValueRef x = params->coords[0], y = params->coords[1],
4683 z = params->coords[2];
4684 LLVMValueRef ms_index = params->ms_index;
4685 LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
4686
4687 int_coord_type = lp_uint_type(params->type);
4688 lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);
4689
4690 if (static_texture_state->format == PIPE_FORMAT_NONE) {
4691 lp_build_img_op_no_format(gallivm, params, outdata);
4692 return;
4693 }
4694
4695 LLVMValueRef offset, i, j;
4696
4697 LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
4698 params->context_ptr,
4699 params->image_index, NULL);
4700 LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
4701 params->context_ptr,
4702 params->image_index, NULL);
4703 LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
4704 params->context_ptr,
4705 params->image_index, NULL);
4706 LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
4707 params->context_ptr,
4708 params->image_index, NULL);
4709 LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
4710 params->context_ptr,
4711 params->image_index, NULL);
4712 LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
4713 params->context_ptr,
4714 params->image_index, NULL);
4715 LLVMValueRef num_samples = NULL, sample_stride = NULL;
4716 if (ms_index) {
4717 num_samples = dynamic_state->num_samples(dynamic_state, gallivm,
4718 params->context_ptr,
4719 params->image_index, NULL);
4720 sample_stride = dynamic_state->sample_stride(dynamic_state, gallivm,
4721 params->context_ptr,
4722 params->image_index, NULL);
4723 }
4724
4725 boolean layer_coord = has_layer_coord(target);
4726
4727 width = lp_build_broadcast_scalar(&int_coord_bld, width);
4728 if (dims >= 2) {
4729 height = lp_build_broadcast_scalar(&int_coord_bld, height);
4730 row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
4731 }
4732 if (dims >= 3 || layer_coord) {
4733 depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
4734 img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
4735 }
4736
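/*
 * Build a per-lane out-of-bounds mask: any lane with a coordinate
 * outside the image gets masked off (loads return zero/one per the
 * swizzle, stores and atomics are skipped for that lane).
 */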
4737 LLVMValueRef out_of_bounds = int_coord_bld.zero;
4738 LLVMValueRef out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
4739 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4740
4741 if (dims >= 2) {
4742 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
4743 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4744 }
4745 if (dims >= 3 || layer_coord) {
4746 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
4747 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4748 }
4749 lp_build_sample_offset(&int_coord_bld,
4750 format_desc,
4751 x, y, z, row_stride_vec, img_stride_vec,
4752 &offset, &i, &j);
4753
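/*
 * For multisampled images, reject lanes whose sample index is out of
 * range and advance the offset by sample_stride * ms_index.
 */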
4754 if (ms_index) {
4755 out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, ms_index,
4756 lp_build_broadcast_scalar(&int_coord_bld,
4757 num_samples));
4758 out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4759
4760 offset =
4761 lp_build_add(&int_coord_bld, offset,
4762 lp_build_mul(&int_coord_bld,
4763 lp_build_broadcast_scalar(&int_coord_bld,
4764 sample_stride),
4765 ms_index));
4766 }
4767 if (params->img_op == LP_IMG_LOAD) {
4768 struct lp_type texel_type = lp_build_texel_type(params->type, format_desc);
4769
4770 offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
4771 struct lp_build_context texel_bld;
4772 lp_build_context_init(&texel_bld, gallivm, texel_type);
4773 lp_build_fetch_rgba_soa(gallivm,
4774 format_desc,
4775 texel_type, TRUE,
4776 base_ptr, offset,
4777 i, j,
4778 NULL,
4779 outdata);
4780
4781 for (unsigned chan = 0; chan < 3; chan++) {
4782 outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
4783 texel_bld.zero, outdata[chan]);
4784 }
4785 if (format_desc->swizzle[3] == PIPE_SWIZZLE_1)
4786 outdata[3] = lp_build_select(&texel_bld, out_of_bounds,
4787 texel_bld.one, outdata[3]);
4788 else
4789 outdata[3] = lp_build_select(&texel_bld, out_of_bounds,
4790 texel_bld.zero, outdata[3]);
4791 } else if (params->img_op == LP_IMG_STORE) {
4792 lp_build_store_rgba_soa(gallivm, format_desc, params->type,
4793 params->exec_mask, base_ptr, offset,
4794 out_of_bounds, params->indata);
4795 } else {
4796 lp_build_do_atomic_soa(gallivm, format_desc, params->type,
4797 params->exec_mask, base_ptr, offset,
4798 out_of_bounds, params->img_op, params->op,
4799 params->indata, params->indata2, outdata);
4800 }
4801 }
4802
4803
4804 /*
4805 * These functions are for indirect texture access support.
4806 *
4807 * Indirect textures are implemented using a switch statement that
4808 * takes the texture index and jumps to the sampler function built
4809 * for that texture unit.
4810 */
4811
4812 /*
4813 * Initialise an indexed sampler switch block.
4814 *
4815 * This sets up the switch_info state and adds the LLVM flow control pieces.
4816 */
4817 void
4818 lp_build_sample_array_init_soa(struct lp_build_sample_array_switch *switch_info,
4819 struct gallivm_state *gallivm,
4820 const struct lp_sampler_params *params,
4821 LLVMValueRef idx,
4822 unsigned base, unsigned range)
4823 {
4824 switch_info->gallivm = gallivm;
4825 switch_info->params = *params;
4826 switch_info->base = base;
4827 switch_info->range = range;
4828
4829 /* For generating the switch functions we don't want the texture index
4830 * offset.
4831 */
4832 switch_info->params.texture_index_offset = 0;
4833
4834 LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4835 switch_info->merge_ref = lp_build_insert_new_block(gallivm, "texmerge");
4836
4837 switch_info->switch_ref = LLVMBuildSwitch(gallivm->builder, idx,
4838 switch_info->merge_ref,
4839 range - base);
4840
4841 LLVMTypeRef val_type[4];
4842 val_type[0] = val_type[1] = val_type[2] = val_type[3] =
4843 lp_build_vec_type(gallivm, params->type);
4844
4845 LLVMTypeRef ret_type =
4846 LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
4847
4848 LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4849
4850 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4851
4852 switch_info->phi = LLVMBuildPhi(gallivm->builder, ret_type, "");
4853 LLVMAddIncoming(switch_info->phi, &undef_val, &initial_block, 1);
4854 }
4855
4856
4857 /*
4858 * Add an individual entry to the indirect texture switch.
4859 *
4860 * This builds the sample function and links a case for it into the switch
4861 * statement.
4862 */
4863 void
4864 lp_build_sample_array_case_soa(struct lp_build_sample_array_switch *switch_info,
4865 int idx,
4866 const struct lp_static_texture_state *static_texture_state,
4867 const struct lp_static_sampler_state *static_sampler_state,
4868 struct lp_sampler_dynamic_state *dynamic_texture_state)
4869 {
4870 struct gallivm_state *gallivm = switch_info->gallivm;
4871 LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "texblock");
4872
4873 LLVMAddCase(switch_info->switch_ref,
4874 LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), idx, 0),
4875 this_block);
4876 LLVMPositionBuilderAtEnd(gallivm->builder, this_block);
4877
4878 LLVMValueRef tex_ret;
4879 lp_build_sample_soa_func(gallivm, static_texture_state,
4880 static_sampler_state, dynamic_texture_state,
4881 &switch_info->params, idx, idx, &tex_ret);
4882
4883 LLVMAddIncoming(switch_info->phi, &tex_ret, &this_block, 1);
4884 LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
4885 }
4886
4887
4888 /*
4889 * Finish a switch statement.
4890 *
4891 * This handles extracting the results from the switch.
4892 */
4893 void
4894 lp_build_sample_array_fini_soa(struct lp_build_sample_array_switch *switch_info)
4895 {
4896 struct gallivm_state *gallivm = switch_info->gallivm;
4897
4898 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4899 for (unsigned i = 0; i < 4; i++) {
4900 switch_info->params.texel[i] =
4901 LLVMBuildExtractValue(gallivm->builder, switch_info->phi, i, "");
4902 }
4903 }
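
/*
 * A minimal sketch of the expected init/case/fini sequence. This is a
 * hypothetical caller; the state arrays are illustrative only:
 *
 *    struct lp_build_sample_array_switch sw;
 *    lp_build_sample_array_init_soa(&sw, gallivm, &params, idx_val,
 *                                   0, num_textures);
 *    for (unsigned u = 0; u < num_textures; u++)
 *       lp_build_sample_array_case_soa(&sw, u, &tex_state[u],
 *                                      &samp_state[u], dynamic_state);
 *    lp_build_sample_array_fini_soa(&sw);
 *    // sw.params.texel[0..3] now hold the sampled RGBA vectors.
 */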
4904
4905
4906 void
4907 lp_build_image_op_switch_soa(struct lp_build_img_op_array_switch *switch_info,
4908 struct gallivm_state *gallivm,
4909 const struct lp_img_params *params,
4910 LLVMValueRef idx,
4911 unsigned base, unsigned range)
4912 {
4913 switch_info->gallivm = gallivm;
4914 switch_info->params = *params;
4915 switch_info->base = base;
4916 switch_info->range = range;
4917
4918 /* For generating the switch functions we don't want the image index
4919 * offset.
4920 */
4921 switch_info->params.image_index_offset = 0;
4922
4923 LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4924 switch_info->merge_ref = lp_build_insert_new_block(gallivm, "imgmerge");
4925
4926 switch_info->switch_ref =
4927 LLVMBuildSwitch(gallivm->builder, idx,
4928 switch_info->merge_ref, range - base);
4929
4930 if (params->img_op != LP_IMG_STORE) {
4931 LLVMTypeRef ret_type = lp_build_vec_type(gallivm, params->type);
4932 LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4933
4934 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4935
4936 for (unsigned i = 0; i < ((params->img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4937 switch_info->phi[i] = LLVMBuildPhi(gallivm->builder, ret_type, "");
4938 LLVMAddIncoming(switch_info->phi[i], &undef_val, &initial_block, 1);
4939 }
4940 }
4941 }
4942
4943
4944 void
4945 lp_build_image_op_array_case(struct lp_build_img_op_array_switch *switch_info,
4946 int idx,
4947 const struct lp_static_texture_state *static_texture_state,
4948 struct lp_sampler_dynamic_state *dynamic_state)
4949 {
4950 struct gallivm_state *gallivm = switch_info->gallivm;
4951 LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "img");
4952 LLVMValueRef tex_ret[4];
4953
4954 LLVMAddCase(switch_info->switch_ref,
4955 lp_build_const_int32(gallivm, idx), this_block);
4956 LLVMPositionBuilderAtEnd(gallivm->builder, this_block);
4957
4958 switch_info->params.image_index = idx;
4959
4960 lp_build_img_op_soa(static_texture_state, dynamic_state,
4961 switch_info->gallivm, &switch_info->params, tex_ret);
4962
4963 if (switch_info->params.img_op != LP_IMG_STORE) {
4964 for (unsigned i = 0;
4965 i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4966 tex_ret[i] =
4967 LLVMBuildBitCast(gallivm->builder, tex_ret[i],
4968 lp_build_vec_type(gallivm,
4969 switch_info->params.type), "");
4970 }
4971
4972 this_block = LLVMGetInsertBlock(gallivm->builder);
4973 for (unsigned i = 0;
4974 i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4975 LLVMAddIncoming(switch_info->phi[i], &tex_ret[i], &this_block, 1);
4976 }
4977 }
4978 LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
4979 }
4980
4981
4982 void
4983 lp_build_image_op_array_fini_soa(struct lp_build_img_op_array_switch *switch_info)
4984 {
4985 struct gallivm_state *gallivm = switch_info->gallivm;
4986
4987 LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4988
4989 if (switch_info->params.img_op != LP_IMG_STORE) {
4990 for (unsigned i = 0;
4991 i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4992 switch_info->params.outdata[i] = switch_info->phi[i];
4993 }
4994 }
4995 }
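
/*
 * The image-op switch follows the same init/case/fini pattern as the
 * sampler switch above (hypothetical caller sketch):
 *
 *    struct lp_build_img_op_array_switch isw;
 *    lp_build_image_op_switch_soa(&isw, gallivm, &img_params, idx_val,
 *                                 0, num_images);
 *    for (unsigned u = 0; u < num_images; u++)
 *       lp_build_image_op_array_case(&isw, u, &img_state[u], dynamic_state);
 *    lp_build_image_op_array_fini_soa(&isw);
 *    // For loads, isw.params.outdata[0..3] hold the per-lane results.
 */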
4996