1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27 
28 /**
29  * @file
30  * Texture sampling -- SoA.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  * @author Brian Paul <brianp@vmware.com>
34  */
35 
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/compiler.h"
40 #include "util/u_debug.h"
41 #include "util/u_dump.h"
42 #include "util/u_memory.h"
43 #include "util/u_math.h"
44 #include "util/format/u_format.h"
45 #include "util/u_cpu_detect.h"
46 #include "util/format_rgb9e5.h"
47 #include "lp_bld_debug.h"
48 #include "lp_bld_type.h"
49 #include "lp_bld_const.h"
50 #include "lp_bld_conv.h"
51 #include "lp_bld_arit.h"
52 #include "lp_bld_bitarit.h"
53 #include "lp_bld_logic.h"
54 #include "lp_bld_printf.h"
55 #include "lp_bld_swizzle.h"
56 #include "lp_bld_flow.h"
57 #include "lp_bld_gather.h"
58 #include "lp_bld_format.h"
59 #include "lp_bld_sample.h"
60 #include "lp_bld_sample_aos.h"
61 #include "lp_bld_struct.h"
62 #include "lp_bld_quad.h"
63 #include "lp_bld_pack.h"
64 #include "lp_bld_intr.h"
65 #include "lp_bld_misc.h"
66 
67 
68 /**
69  * Generate code to fetch a texel from a texture at int coords (x, y, z).
70  * The computation depends on whether the texture is 1D, 2D or 3D.
71  * The result, texel, will be float vectors:
72  *   texel[0] = red values
73  *   texel[1] = green values
74  *   texel[2] = blue values
75  *   texel[3] = alpha values
76  */
77 static void
78 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
79                           LLVMValueRef width,
80                           LLVMValueRef height,
81                           LLVMValueRef depth,
82                           LLVMValueRef x,
83                           LLVMValueRef y,
84                           LLVMValueRef z,
85                           LLVMValueRef y_stride,
86                           LLVMValueRef z_stride,
87                           LLVMValueRef data_ptr,
88                           LLVMValueRef mipoffsets,
89                           LLVMValueRef texel_out[4])
90 {
91    const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
92    const unsigned dims = bld->dims;
93    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
94    LLVMBuilderRef builder = bld->gallivm->builder;
95    LLVMValueRef offset;
96    LLVMValueRef i, j;
97    LLVMValueRef use_border = NULL;
98 
99    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
100    if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
101                                               static_state->min_img_filter,
102                                               static_state->mag_img_filter)) {
103       LLVMValueRef b1, b2;
104       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
105       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
106       use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
107    }
108 
109    if (dims >= 2 &&
110        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
111                                               static_state->min_img_filter,
112                                               static_state->mag_img_filter)) {
113       LLVMValueRef b1, b2;
114       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
115       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
116       if (use_border) {
117          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
118          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
119       }
120       else {
121          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
122       }
123    }
124 
125    if (dims == 3 &&
126        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
127                                               static_state->min_img_filter,
128                                               static_state->mag_img_filter)) {
129       LLVMValueRef b1, b2;
130       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
131       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
132       if (use_border) {
133          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
134          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
135       }
136       else {
137          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
138       }
139    }
140 
141    /* convert x,y,z coords to linear offset from start of texture, in bytes */
142    lp_build_sample_offset(&bld->int_coord_bld,
143                           bld->format_desc,
144                           x, y, z, y_stride, z_stride,
145                           &offset, &i, &j);
146    if (mipoffsets) {
147       offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
148    }
149 
150    if (use_border) {
151       /* If we can sample the border color, it means that texcoords may
152        * lie outside the bounds of the texture image.  We need to do
153        * something to prevent reading out of bounds and causing a segfault.
154        *
155        * Simply AND the computed texel offset with !use_border.  This forces
156        * out-of-bounds offsets to zero, and offset zero is guaranteed
157        * to lie inside the texture image.
158        */
159       offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
160    }
161 
162    lp_build_fetch_rgba_soa(bld->gallivm,
163                            bld->format_desc,
164                            bld->texel_type, TRUE,
165                            data_ptr, offset,
166                            i, j,
167                            bld->cache,
168                            texel_out);
169 
170    /*
171     * Note: if we find an app which frequently samples the texture border
172     * we might want to implement a true conditional here to avoid sampling
173     * the texture whenever possible (since that's quite a bit of code).
174     * Ex:
175     *   if (use_border) {
176     *      texel = border_color;
177     *   }
178     *   else {
179     *      texel = sample_texture(coord);
180     *   }
181     * As it is now, we always sample the texture, then selectively replace
182     * the texel color results with the border color.
183     */
184 
185    if (use_border) {
186       /* select texel color or border color depending on use_border. */
187       const struct util_format_description *format_desc = bld->format_desc;
188       int chan;
189       struct lp_type border_type = bld->texel_type;
190       border_type.length = 4;
191       /*
192        * Only replace channels which are actually present. The others should
193  * get optimized away eventually by the sampler_view swizzle anyway, but
194  * it's also simpler this way.
195        */
196       for (chan = 0; chan < 4; chan++) {
197          unsigned chan_s;
198          /* reverse-map channel... */
199          if (util_format_has_stencil(format_desc)) {
200             if (chan == 0)
201                chan_s = 0;
202             else
203                break;
204          }
205          else {
206             for (chan_s = 0; chan_s < 4; chan_s++) {
207                if (chan_s == format_desc->swizzle[chan]) {
208                   break;
209                }
210             }
211          }
212          if (chan_s <= 3) {
213             /* use the already clamped color */
214             LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
215             LLVMValueRef border_chan;
216 
217             border_chan = lp_build_extract_broadcast(bld->gallivm,
218                                                      border_type,
219                                                      bld->texel_type,
220                                                      bld->border_color_clamped,
221                                                      idx);
222             texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
223                                               border_chan, texel_out[chan]);
224          }
225       }
226    }
227 }
228 
229 
230 /**
231  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR_REPEAT mode.
232  * (Note that with POT sizes this could be done much more easily post-scale
233  * with some bit arithmetic.)
234  */
235 static LLVMValueRef
236 lp_build_coord_mirror(struct lp_build_sample_context *bld,
237                       LLVMValueRef coord, boolean posOnly)
238 {
239    struct lp_build_context *coord_bld = &bld->coord_bld;
240    LLVMValueRef fract;
241    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
242 
243    /*
244     * We can just use 2*(x - round(0.5*x)) to do all the mirroring;
245     * it all works out. (The result is in range [-1.0, 1.0], negative if
246     * the coord is in the "odd" section, otherwise positive.)
247     */
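   /*
    * Example: coord = 1.3  ->  0.5 * 1.3 = 0.65, round(0.65) = 1.0,
    * 2 * (0.65 - 1.0) = -0.7; with posOnly the abs() below gives 0.7,
    * i.e. 1.3 mirrors back to 0.7 as expected.
    */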
248 
249    coord = lp_build_mul(coord_bld, coord, half);
250    fract = lp_build_round(coord_bld, coord);
251    fract = lp_build_sub(coord_bld, coord, fract);
252    coord = lp_build_add(coord_bld, fract, fract);
253 
254    if (posOnly) {
255       /*
256        * Theoretically it's not quite 100% accurate because the spec says
257        * that ultimately a scaled coord of -x.0 should map to int coord
258        * -x + 1 with mirroring, not -x (this does not matter for bilinear
259        * filtering).
260        */
261       coord = lp_build_abs(coord_bld, coord);
262       /* kill off NaNs */
263       /* XXX: not safe without arch rounding, fract can be anything. */
264       coord = lp_build_max_ext(coord_bld, coord, coord_bld->zero,
265                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
266    }
267 
268    return coord;
269 }
270 
271 
272 /**
273  * Helper to compute the first texel coord and the lerp weight for
274  * linear filtering with the repeat wrap mode on NPOT textures.
275  */
276 void
277 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
278                                   LLVMValueRef coord_f,
279                                   LLVMValueRef length_i,
280                                   LLVMValueRef length_f,
281                                   LLVMValueRef *coord0_i,
282                                   LLVMValueRef *weight_f)
283 {
284    struct lp_build_context *coord_bld = &bld->coord_bld;
285    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
286    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
287    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
288                                                 int_coord_bld->one);
289    LLVMValueRef mask;
290    /* wrap with normalized floats is just fract */
291    coord_f = lp_build_fract(coord_bld, coord_f);
292    /* mul by size and subtract 0.5 */
293    coord_f = lp_build_mul(coord_bld, coord_f, length_f);
294    coord_f = lp_build_sub(coord_bld, coord_f, half);
295    /*
296     * we avoided the 0.5/length division before the repeat wrap,
297     * now need to fix up edge cases with selects
298     */
299    /*
300     * Note we do a float (unordered) compare so we can eliminate NaNs.
301     * (Otherwise would need fract_safe above).
302     */
303    mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
304                            PIPE_FUNC_LESS, coord_f, coord_bld->zero);
305 
306    /* convert to int, compute lerp weight */
307    lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
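   /*
    * If the scaled coord fell in [-0.5, 0) the ifloor above gave -1;
    * replace it with length - 1, the correct wrapped neighbor for REPEAT.
    */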
308    *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
309 }
310 
311 
312 /**
313  * Build LLVM code for texture wrap mode for linear filtering.
314  * \param x0_out  returns first integer texcoord
315  * \param x1_out  returns second integer texcoord
316  * \param weight_out  returns linear interpolation weight
317  */
318 static void
319 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
320                             boolean is_gather,
321                             LLVMValueRef coord,
322                             LLVMValueRef length,
323                             LLVMValueRef length_f,
324                             LLVMValueRef offset,
325                             boolean is_pot,
326                             unsigned wrap_mode,
327                             LLVMValueRef *x0_out,
328                             LLVMValueRef *x1_out,
329                             LLVMValueRef *weight_out)
330 {
331    struct lp_build_context *coord_bld = &bld->coord_bld;
332    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
333    LLVMBuilderRef builder = bld->gallivm->builder;
334    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
335    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
336    LLVMValueRef coord0, coord1, weight;
337 
338    switch(wrap_mode) {
339    case PIPE_TEX_WRAP_REPEAT:
340       if (is_pot) {
341          /* mul by size and subtract 0.5 */
342          coord = lp_build_mul(coord_bld, coord, length_f);
343          coord = lp_build_sub(coord_bld, coord, half);
344          if (offset) {
345             offset = lp_build_int_to_float(coord_bld, offset);
346             coord = lp_build_add(coord_bld, coord, offset);
347          }
348          /* convert to int, compute lerp weight */
349          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
350          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
351          /* repeat wrap */
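         /* length is a power of two, so AND with length-1 is mod length
          * (and handles a negative coord0 from the -0.5 bias correctly too) */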
352          coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
353          coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
354       }
355       else {
356          LLVMValueRef mask;
357          if (offset) {
358             offset = lp_build_int_to_float(coord_bld, offset);
359             offset = lp_build_div(coord_bld, offset, length_f);
360             coord = lp_build_add(coord_bld, coord, offset);
361          }
362          lp_build_coord_repeat_npot_linear(bld, coord,
363                                            length, length_f,
364                                            &coord0, &weight);
365          mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
366                                  PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
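         /* coord1 = coord0 + 1, except it wraps back to 0 when coord0 == length - 1
          * (the mask is all-zeros in that case, so the AND clears the result) */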
367          coord1 = LLVMBuildAnd(builder,
368                                lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
369                                mask, "");
370       }
371       break;
372 
373    case PIPE_TEX_WRAP_CLAMP:
374       if (bld->static_sampler_state->normalized_coords) {
375          /* scale coord to length */
376          coord = lp_build_mul(coord_bld, coord, length_f);
377       }
378       if (offset) {
379          offset = lp_build_int_to_float(coord_bld, offset);
380          coord = lp_build_add(coord_bld, coord, offset);
381       }
382 
383       /*
384        * clamp to [0, length]
385        *
386        * Unlike some other wrap modes, this should be correct for gather
387        * too. GL_CLAMP explicitly does this clamp on the coord prior to
388        * actual wrapping (which is per sample).
389        */
390       coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
391 
392       coord = lp_build_sub(coord_bld, coord, half);
393 
394       /* convert to int, compute lerp weight */
395       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
396       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
397       break;
398 
399    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
400       {
401          struct lp_build_context abs_coord_bld = bld->coord_bld;
402          abs_coord_bld.type.sign = FALSE;
403 
404          if (bld->static_sampler_state->normalized_coords) {
405             /* mul by tex size */
406             coord = lp_build_mul(coord_bld, coord, length_f);
407          }
408          if (offset) {
409             offset = lp_build_int_to_float(coord_bld, offset);
410             coord = lp_build_add(coord_bld, coord, offset);
411          }
412 
413          /* clamp to length max */
414          coord = lp_build_min_ext(coord_bld, coord, length_f,
415                                   GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
416          if (!is_gather) {
417             /* subtract 0.5 */
418             coord = lp_build_sub(coord_bld, coord, half);
419             /* clamp to [0, length - 0.5] */
420             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
421             /* convert to int, compute lerp weight */
422             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
423             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
424          } else {
425             /*
426              * The non-gather path will end up with coords 0, 1 if coord was
427              * smaller than 0.5 (with corresponding weight 0.0 so it doesn't
428              * really matter what the second coord is). But for gather, we
429              * really need to end up with coords 0, 0.
430              */
431             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
432             coord0 = lp_build_sub(coord_bld, coord, half);
433             coord1 = lp_build_add(coord_bld, coord, half);
434             /* Values range ([-0.5, length_f - 0.5], [0.5, length_f + 0.5]) */
435             coord0 = lp_build_itrunc(coord_bld, coord0);
436             coord1 = lp_build_itrunc(coord_bld, coord1);
437             weight = coord_bld->undef;
438          }
439          /* coord1 = min(coord1, length-1) */
440          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
441          break;
442       }
443 
444    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
445       if (bld->static_sampler_state->normalized_coords) {
446          /* scale coord to length */
447          coord = lp_build_mul(coord_bld, coord, length_f);
448       }
449       if (offset) {
450          offset = lp_build_int_to_float(coord_bld, offset);
451          coord = lp_build_add(coord_bld, coord, offset);
452       }
453       /*
454        * We don't need any clamp. Technically, for very large (positive or
455        * negative) or infinite values, a clamp against [-length, length] would be
456        * correct, but we don't need to guarantee any specific
457        * result for such coords (the ifloor will be undefined, but for modes
458        * requiring border all resulting coords are safe).
459        */
460       coord = lp_build_sub(coord_bld, coord, half);
461       /* convert to int, compute lerp weight */
462       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
463       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
464       break;
465 
466    case PIPE_TEX_WRAP_MIRROR_REPEAT:
467       if (offset) {
468          offset = lp_build_int_to_float(coord_bld, offset);
469          offset = lp_build_div(coord_bld, offset, length_f);
470          coord = lp_build_add(coord_bld, coord, offset);
471       }
472       if (!is_gather) {
473          /* compute mirror function */
474          coord = lp_build_coord_mirror(bld, coord, TRUE);
475 
476          /* scale coord to length */
477          coord = lp_build_mul(coord_bld, coord, length_f);
478          coord = lp_build_sub(coord_bld, coord, half);
479 
480          /* convert to int, compute lerp weight */
481          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
482          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
483 
484          /* coord0 = max(coord0, 0) */
485          coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
486          /* coord1 = min(coord1, length-1) */
487          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
488       } else {
489          /*
490           * This is pretty reasonable in the end; all the tests really care
491           * about are nasty edge cases (scaled coords of x.5, so the individual
492           * coords are actually integers, which is REALLY tricky to get right
493           * since it works differently both for negative numbers and for
494           * even/odd cases). But with enough magic it's not too complex
495           * after all.
496           * Maybe a bit-arithmetic variant would be worth trying for POT textures...
497           */
498          LLVMValueRef isNeg;
499          /*
500           * Wrapping just once still works, even though it means we can
501           * get "wrong" sign due to performing mirror in the middle of the
502           * two coords (because this can only happen very near the odd/even
503           * edges, so both coords will actually end up as 0 or length - 1
504           * in the end).
505           * For GL4 gather with per-sample offsets we'd need to do the
506           * mirroring per coord too.
507           */
508          coord = lp_build_coord_mirror(bld, coord, FALSE);
509          coord = lp_build_mul(coord_bld, coord, length_f);
510 
511          /*
512           * NaNs should be safe here, we'll do away with them with
513           * the ones' complement plus min.
514           */
515          coord0 = lp_build_sub(coord_bld, coord, half);
516          coord0 = lp_build_ifloor(coord_bld, coord0);
517          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
518          /* ones complement for neg numbers (mirror(negX) = X - 1)  */
519          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
520                               coord0, int_coord_bld->zero);
521          coord0 = lp_build_xor(int_coord_bld, coord0, isNeg);
522          isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS,
523                               coord1, int_coord_bld->zero);
524          coord1 = lp_build_xor(int_coord_bld, coord1, isNeg);
525          coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
526          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
527 
528          weight = coord_bld->undef;
529       }
530       break;
531 
532    case PIPE_TEX_WRAP_MIRROR_CLAMP:
533       if (bld->static_sampler_state->normalized_coords) {
534          /* scale coord to length */
535          coord = lp_build_mul(coord_bld, coord, length_f);
536       }
537       if (offset) {
538          offset = lp_build_int_to_float(coord_bld, offset);
539          coord = lp_build_add(coord_bld, coord, offset);
540       }
541       /*
542        * XXX: probably not correct for gather, though I'm not
543        * entirely sure as it's poorly specified. The wrapping looks
544        * correct according to the extension spec (which is written
545        * against GL 1.2.1), however negative values will be swapped -
546        * GL re-specified wrapping in newer versions (no more pre-clamp
547        * except with GL_CLAMP).
548        */
549       coord = lp_build_abs(coord_bld, coord);
550 
551       /* clamp to [0, length] */
552       coord = lp_build_min_ext(coord_bld, coord, length_f,
553                                GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
554 
555       coord = lp_build_sub(coord_bld, coord, half);
556 
557       /* convert to int, compute lerp weight */
558       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
559       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
560       break;
561 
562    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
563       {
564          struct lp_build_context abs_coord_bld = bld->coord_bld;
565          abs_coord_bld.type.sign = FALSE;
566 
567          if (bld->static_sampler_state->normalized_coords) {
568             /* scale coord to length */
569             coord = lp_build_mul(coord_bld, coord, length_f);
570          }
571          if (offset) {
572             offset = lp_build_int_to_float(coord_bld, offset);
573             coord = lp_build_add(coord_bld, coord, offset);
574          }
575          if (!is_gather) {
576             coord = lp_build_abs(coord_bld, coord);
577 
578             /* clamp to length max */
579             coord = lp_build_min_ext(coord_bld, coord, length_f,
580                                      GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
581             /* subtract 0.5 */
582             coord = lp_build_sub(coord_bld, coord, half);
583             /* clamp to [0, length - 0.5] */
584             coord = lp_build_max(coord_bld, coord, coord_bld->zero);
585 
586             /* convert to int, compute lerp weight */
587             lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
588             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
589             /* coord1 = min(coord1, length-1) */
590             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
591          } else {
592             /*
593              * The non-gather path will swap coord0/1 if coord was negative,
594              * which is ok for filtering since the filter weight matches
595              * accordingly. Also, if coord is close to zero, coord0/1 will
596              * be 0 and 1, instead of 0 and 0 (again ok due to filter
597              * weight being 0.0). Both issues need to be fixed for gather.
598              */
599             LLVMValueRef isNeg;
600 
601             /*
602              * Actually wanted to cheat here and use:
603              * coord1 = lp_build_iround(coord_bld, coord);
604              * but it's not good enough for some tests (even piglit
605              * textureGather is set up in a way so the coords are always
606              * x.5, that is right at the crossover points).
607              * So do ordinary sub/floor, then do ones' complement
608              * for negative numbers.
609              * (Note we can't just do sub|add/abs/itrunc per coord either -
610              * because the spec demands that mirror(3.0) = 3 but
611              * mirror(-3.0) = 2.)
612              */
613             coord = lp_build_sub(coord_bld, coord, half);
614             coord0 = lp_build_ifloor(coord_bld, coord);
615             coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
616             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord0,
617                                  int_coord_bld->zero);
618             coord0 = lp_build_xor(int_coord_bld, isNeg, coord0);
619             coord0 = lp_build_min(int_coord_bld, coord0, length_minus_one);
620 
621             isNeg = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, coord1,
622                                  int_coord_bld->zero);
623             coord1 = lp_build_xor(int_coord_bld, isNeg, coord1);
624             coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
625 
626             weight = coord_bld->undef;
627          }
628       }
629       break;
630 
631    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
632       {
633          if (bld->static_sampler_state->normalized_coords) {
634             /* scale coord to length */
635             coord = lp_build_mul(coord_bld, coord, length_f);
636          }
637          if (offset) {
638             offset = lp_build_int_to_float(coord_bld, offset);
639             coord = lp_build_add(coord_bld, coord, offset);
640          }
641          /*
642           * XXX: probably not correct for gather due to swapped
643           * order if coord is negative (same rationale as for
644           * MIRROR_CLAMP).
645           */
646          coord = lp_build_abs(coord_bld, coord);
647 
648          /*
649           * We don't need any clamp. Technically, for very large
650           * (or infinite) values, clamp against length would be
651           * correct, but we don't need to guarantee any specific
652           * result for such coords (the ifloor will be undefined, but
653           * for modes requiring border all resulting coords are safe).
654           */
655          coord = lp_build_sub(coord_bld, coord, half);
656 
657          /* convert to int, compute lerp weight */
658          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
659          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
660       }
661       break;
662 
663    default:
664       assert(0);
665       coord0 = NULL;
666       coord1 = NULL;
667       weight = NULL;
668    }
669 
670    *x0_out = coord0;
671    *x1_out = coord1;
672    *weight_out = weight;
673 }
674 
675 
676 /**
677  * Build LLVM code for texture wrap mode for nearest filtering.
678  * \param coord  the incoming texcoord (nominally in [0,1])
679  * \param length  the texture size along one dimension, as int vector
680  * \param length_f  the texture size along one dimension, as float vector
681  * \param offset  texel offset along one dimension (as int vector)
682  * \param is_pot  if TRUE, length is a power of two
683  * \param wrap_mode  one of PIPE_TEX_WRAP_x
684  */
685 static LLVMValueRef
686 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
687                              LLVMValueRef coord,
688                              LLVMValueRef length,
689                              LLVMValueRef length_f,
690                              LLVMValueRef offset,
691                              boolean is_pot,
692                              unsigned wrap_mode)
693 {
694    struct lp_build_context *coord_bld = &bld->coord_bld;
695    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
696    LLVMBuilderRef builder = bld->gallivm->builder;
697    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
698    LLVMValueRef icoord;
699 
700    switch(wrap_mode) {
701    case PIPE_TEX_WRAP_REPEAT:
702       if (is_pot) {
703          coord = lp_build_mul(coord_bld, coord, length_f);
704          icoord = lp_build_ifloor(coord_bld, coord);
705          if (offset) {
706             icoord = lp_build_add(int_coord_bld, icoord, offset);
707          }
708          icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
709       }
710       else {
711           if (offset) {
712              offset = lp_build_int_to_float(coord_bld, offset);
713              offset = lp_build_div(coord_bld, offset, length_f);
714              coord = lp_build_add(coord_bld, coord, offset);
715           }
716           /* take fraction, unnormalize */
717           coord = lp_build_fract_safe(coord_bld, coord);
718           coord = lp_build_mul(coord_bld, coord, length_f);
719           icoord = lp_build_itrunc(coord_bld, coord);
720       }
721       break;
722 
723    case PIPE_TEX_WRAP_CLAMP:
724    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
725       if (bld->static_sampler_state->normalized_coords) {
726          /* scale coord to length */
727          coord = lp_build_mul(coord_bld, coord, length_f);
728       }
729 
730       if (offset) {
731          offset = lp_build_int_to_float(coord_bld, offset);
732          coord = lp_build_add(coord_bld, coord, offset);
733       }
734       /* floor */
735       /* use itrunc instead since we clamp to 0 anyway */
736       icoord = lp_build_itrunc(coord_bld, coord);
737 
738       /* clamp to [0, length - 1]. */
739       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
740                               length_minus_one);
741       break;
742 
743    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
744       if (bld->static_sampler_state->normalized_coords) {
745          /* scale coord to length */
746          coord = lp_build_mul(coord_bld, coord, length_f);
747       }
748       /* no clamp necessary, border masking will handle this */
749       icoord = lp_build_ifloor(coord_bld, coord);
750       if (offset) {
751          icoord = lp_build_add(int_coord_bld, icoord, offset);
752       }
753       break;
754 
755    case PIPE_TEX_WRAP_MIRROR_REPEAT:
756       if (offset) {
757          offset = lp_build_int_to_float(coord_bld, offset);
758          offset = lp_build_div(coord_bld, offset, length_f);
759          coord = lp_build_add(coord_bld, coord, offset);
760       }
761       /* compute mirror function */
762       coord = lp_build_coord_mirror(bld, coord, TRUE);
763 
764       /* scale coord to length */
765       assert(bld->static_sampler_state->normalized_coords);
766       coord = lp_build_mul(coord_bld, coord, length_f);
767 
768       /* itrunc == ifloor here */
769       icoord = lp_build_itrunc(coord_bld, coord);
770 
771       /* clamp to [0, length - 1] */
772       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
773       break;
774 
775    case PIPE_TEX_WRAP_MIRROR_CLAMP:
776    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
777       if (bld->static_sampler_state->normalized_coords) {
778          /* scale coord to length */
779          coord = lp_build_mul(coord_bld, coord, length_f);
780       }
781       if (offset) {
782          offset = lp_build_int_to_float(coord_bld, offset);
783          coord = lp_build_add(coord_bld, coord, offset);
784       }
785       coord = lp_build_abs(coord_bld, coord);
786 
787       /* itrunc == ifloor here */
788       icoord = lp_build_itrunc(coord_bld, coord);
789       /*
790        * Use unsigned min due to possible undef values (NaNs, overflow)
791        */
792       {
793          struct lp_build_context abs_coord_bld = *int_coord_bld;
794          abs_coord_bld.type.sign = FALSE;
795          /* clamp to [0, length - 1] */
796          icoord = lp_build_min(&abs_coord_bld, icoord, length_minus_one);
797       }
798       break;
799 
800    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
801       if (bld->static_sampler_state->normalized_coords) {
802          /* scale coord to length */
803          coord = lp_build_mul(coord_bld, coord, length_f);
804       }
805       if (offset) {
806          offset = lp_build_int_to_float(coord_bld, offset);
807          coord = lp_build_add(coord_bld, coord, offset);
808       }
809       coord = lp_build_abs(coord_bld, coord);
810 
811       /* itrunc == ifloor here */
812       icoord = lp_build_itrunc(coord_bld, coord);
813       break;
814 
815    default:
816       assert(0);
817       icoord = NULL;
818    }
819 
820    return icoord;
821 }
822 
823 
824 /**
825  * Do shadow test/comparison.
826  * \param p shadow ref value
827  * \param texel  the texel to compare against
828  */
829 static LLVMValueRef
830 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
831                             LLVMValueRef p,
832                             LLVMValueRef texel)
833 {
834    struct lp_build_context *texel_bld = &bld->texel_bld;
835    LLVMValueRef res;
836 
837    if (0) {
838       //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
839       lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
840    }
841 
842    /* result = (p FUNC texel) ? 1 : 0 */
843    /*
844     * honor d3d10 floating point rules here, which state that comparisons
845     * are ordered except NOT_EQUAL which is unordered.
846     */
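   /* e.g. compare_func == PIPE_FUNC_LEQUAL with p = 0.5, texel = 0.7 yields an
    * all-ones lane mask; the caller then selects 1.0 via lp_build_select. */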
847    if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
848       res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
849                                  p, texel);
850    }
851    else {
852       res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
853                          p, texel);
854    }
855    return res;
856 }
857 
858 
859 /**
860  * Generate code to sample a mipmap level with nearest filtering.
861  * If sampling a cube texture, r = cube face in [0,5].
862  */
863 static void
864 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
865                               LLVMValueRef size,
866                               LLVMValueRef row_stride_vec,
867                               LLVMValueRef img_stride_vec,
868                               LLVMValueRef data_ptr,
869                               LLVMValueRef mipoffsets,
870                               const LLVMValueRef *coords,
871                               const LLVMValueRef *offsets,
872                               LLVMValueRef colors_out[4])
873 {
874    const unsigned dims = bld->dims;
875    LLVMValueRef width_vec;
876    LLVMValueRef height_vec;
877    LLVMValueRef depth_vec;
878    LLVMValueRef flt_size;
879    LLVMValueRef flt_width_vec;
880    LLVMValueRef flt_height_vec;
881    LLVMValueRef flt_depth_vec;
882    LLVMValueRef x, y = NULL, z = NULL;
883 
884    lp_build_extract_image_sizes(bld,
885                                 &bld->int_size_bld,
886                                 bld->int_coord_type,
887                                 size,
888                                 &width_vec, &height_vec, &depth_vec);
889 
890    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
891 
892    lp_build_extract_image_sizes(bld,
893                                 &bld->float_size_bld,
894                                 bld->coord_type,
895                                 flt_size,
896                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
897 
898    /*
899     * Compute integer texcoords.
900     */
901    x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
902                                     flt_width_vec, offsets[0],
903                                     bld->static_texture_state->pot_width,
904                                     bld->static_sampler_state->wrap_s);
905    lp_build_name(x, "tex.x.wrapped");
906 
907    if (dims >= 2) {
908       y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
909                                        flt_height_vec, offsets[1],
910                                        bld->static_texture_state->pot_height,
911                                        bld->static_sampler_state->wrap_t);
912       lp_build_name(y, "tex.y.wrapped");
913 
914       if (dims == 3) {
915          z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
916                                           flt_depth_vec, offsets[2],
917                                           bld->static_texture_state->pot_depth,
918                                           bld->static_sampler_state->wrap_r);
919          lp_build_name(z, "tex.z.wrapped");
920       }
921    }
922    if (has_layer_coord(bld->static_texture_state->target)) {
923       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
924          /* add cube layer to face */
925          z = lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
926       }
927       else {
928          z = coords[2];
929       }
930       lp_build_name(z, "tex.z.layer");
931    }
932 
933    /*
934     * Get texture colors.
935     */
936    lp_build_sample_texel_soa(bld,
937                              width_vec, height_vec, depth_vec,
938                              x, y, z,
939                              row_stride_vec, img_stride_vec,
940                              data_ptr, mipoffsets, colors_out);
941 
942    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
943       LLVMValueRef cmpval;
944       cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
945       /* this is really just an AND of 1.0 with cmpval, but llvm is clever enough */
946       colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
947                                       bld->texel_bld.one, bld->texel_bld.zero);
948       colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
949    }
950 
951 }
952 
953 
954 /**
955  * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
956  */
957 static LLVMValueRef
958 lp_build_masklerp(struct lp_build_context *bld,
959                  LLVMValueRef weight,
960                  LLVMValueRef mask0,
961                  LLVMValueRef mask1)
962 {
963    struct gallivm_state *gallivm = bld->gallivm;
964    LLVMBuilderRef builder = gallivm->builder;
965    LLVMValueRef weight2;
966 
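   /*
    * mask0/mask1 are either all-zeros or all-ones per lane, so the usual
    * lerp weight*mask1 + (1 - weight)*mask0 reduces to bitwise ANDs of the
    * float weight bits with the integer masks, followed by a float add.
    */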
967    weight2 = lp_build_sub(bld, bld->one, weight);
968    weight = LLVMBuildBitCast(builder, weight,
969                               lp_build_int_vec_type(gallivm, bld->type), "");
970    weight2 = LLVMBuildBitCast(builder, weight2,
971                               lp_build_int_vec_type(gallivm, bld->type), "");
972    weight = LLVMBuildAnd(builder, weight, mask1, "");
973    weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
974    weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
975    weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
976    return lp_build_add(bld, weight, weight2);
977 }
978 
979 /**
980  * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
981  */
982 static LLVMValueRef
983 lp_build_masklerp2d(struct lp_build_context *bld,
984                     LLVMValueRef weight0,
985                     LLVMValueRef weight1,
986                     LLVMValueRef mask00,
987                     LLVMValueRef mask01,
988                     LLVMValueRef mask10,
989                     LLVMValueRef mask11)
990 {
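   /* masklerp along the first axis for both rows, then a normal lerp
    * between the two rows with the second weight */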
991    LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
992    LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
993    return lp_build_lerp(bld, weight1, val0, val1, 0);
994 }
995 
996 /*
997  * This is a fair amount of code for something OpenGL merely recommends
998  * but does not require.
999  */
1000 #define ACCURATE_CUBE_CORNERS 1
1001 
1002 /**
1003  * Generate code to sample a mipmap level with linear filtering.
1004  * If sampling a cube texture, r = cube face in [0,5].
1005  * If linear_mask is present, only pixels having their mask set
1006  * will receive linear filtering, the rest will use nearest.
1007  */
1008 static void
1009 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
1010                              boolean is_gather,
1011                              LLVMValueRef size,
1012                              LLVMValueRef linear_mask,
1013                              LLVMValueRef row_stride_vec,
1014                              LLVMValueRef img_stride_vec,
1015                              LLVMValueRef data_ptr,
1016                              LLVMValueRef mipoffsets,
1017                              const LLVMValueRef *coords,
1018                              const LLVMValueRef *offsets,
1019                              LLVMValueRef colors_out[4])
1020 {
1021    LLVMBuilderRef builder = bld->gallivm->builder;
1022    struct lp_build_context *ivec_bld = &bld->int_coord_bld;
1023    struct lp_build_context *coord_bld = &bld->coord_bld;
1024    struct lp_build_context *texel_bld = &bld->texel_bld;
1025    const unsigned dims = bld->dims;
1026    LLVMValueRef width_vec;
1027    LLVMValueRef height_vec;
1028    LLVMValueRef depth_vec;
1029    LLVMValueRef flt_size;
1030    LLVMValueRef flt_width_vec;
1031    LLVMValueRef flt_height_vec;
1032    LLVMValueRef flt_depth_vec;
1033    LLVMValueRef fall_off[4] = { 0 }, have_corners = NULL;
1034    LLVMValueRef z1 = NULL;
1035    LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
1036    LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
1037    LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
1038    LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
1039    LLVMValueRef xs[4], ys[4], zs[4];
1040    LLVMValueRef neighbors[2][2][4];
1041    int chan, texel_index;
1042    boolean seamless_cube_filter, accurate_cube_corners;
1043    unsigned chan_swiz = bld->static_texture_state->swizzle_r;
1044 
1045    if (is_gather) {
1046       switch (bld->gather_comp) {
1047       case 0: chan_swiz = bld->static_texture_state->swizzle_r; break;
1048       case 1: chan_swiz = bld->static_texture_state->swizzle_g; break;
1049       case 2: chan_swiz = bld->static_texture_state->swizzle_b; break;
1050       case 3: chan_swiz = bld->static_texture_state->swizzle_a; break;
1051       default:
1052 	 break;
1053       }
1054    }
1055 
1056    seamless_cube_filter = (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
1057                            bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
1058                           bld->static_sampler_state->seamless_cube_map;
1059 
1060    /*
1061     * Disable accurate cube corners for integer textures, which should only
1062     * get here in the gather path.
1063     */
1064    accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter &&
1065      !util_format_is_pure_integer(bld->static_texture_state->format);
1066 
1067    lp_build_extract_image_sizes(bld,
1068                                 &bld->int_size_bld,
1069                                 bld->int_coord_type,
1070                                 size,
1071                                 &width_vec, &height_vec, &depth_vec);
1072 
1073    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
1074 
1075    lp_build_extract_image_sizes(bld,
1076                                 &bld->float_size_bld,
1077                                 bld->coord_type,
1078                                 flt_size,
1079                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
1080 
1081    LLVMTypeRef int1t = LLVMInt1TypeInContext(bld->gallivm->context);
1082 
1083    /*
1084     * Compute integer texcoords.
1085     */
1086 
1087    if (!seamless_cube_filter) {
1088       lp_build_sample_wrap_linear(bld, is_gather, coords[0], width_vec,
1089                                   flt_width_vec, offsets[0],
1090                                   bld->static_texture_state->pot_width,
1091                                   bld->static_sampler_state->wrap_s,
1092                                   &x00, &x01, &s_fpart);
1093       lp_build_name(x00, "tex.x0.wrapped");
1094       lp_build_name(x01, "tex.x1.wrapped");
1095       x10 = x00;
1096       x11 = x01;
1097 
1098       if (dims >= 2) {
1099          lp_build_sample_wrap_linear(bld, is_gather, coords[1], height_vec,
1100                                      flt_height_vec, offsets[1],
1101                                      bld->static_texture_state->pot_height,
1102                                      bld->static_sampler_state->wrap_t,
1103                                      &y00, &y10, &t_fpart);
1104          lp_build_name(y00, "tex.y0.wrapped");
1105          lp_build_name(y10, "tex.y1.wrapped");
1106          y01 = y00;
1107          y11 = y10;
1108 
1109          if (dims == 3) {
1110             lp_build_sample_wrap_linear(bld, is_gather, coords[2], depth_vec,
1111                                         flt_depth_vec, offsets[2],
1112                                         bld->static_texture_state->pot_depth,
1113                                         bld->static_sampler_state->wrap_r,
1114                                         &z00, &z1, &r_fpart);
1115             z01 = z10 = z11 = z00;
1116             lp_build_name(z00, "tex.z0.wrapped");
1117             lp_build_name(z1, "tex.z1.wrapped");
1118          }
1119       }
1120       if (has_layer_coord(bld->static_texture_state->target)) {
1121          if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1122             /* add cube layer to face */
1123             z00 = z01 = z10 = z11 = z1 =
1124                lp_build_add(&bld->int_coord_bld, coords[2], coords[3]);
1125          }
1126          else {
1127             z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
1128          }
1129          lp_build_name(z00, "tex.z0.layer");
1130          lp_build_name(z1, "tex.z1.layer");
1131       }
1132    }
1133    else {
1134       struct lp_build_if_state edge_if;
1135       LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
1136       LLVMValueRef coord0, coord1, have_edge, have_corner;
1137       LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
1138       LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
1139       LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
1140       LLVMValueRef face = coords[2];
1141       LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
1142       LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
1143       /* XXX drop height calcs. Could (should) do this without seamless filtering too */
1144       height_vec = width_vec;
1145       flt_height_vec = flt_width_vec;
1146 
1147       /* XXX the overflow logic is actually sort of duplicated with trilinear,
1148        * since an overflow in one mip should also have a corresponding overflow
1149        * in another.
1150        */
1151       /* should always have normalized coords, and offsets are undefined */
1152       assert(bld->static_sampler_state->normalized_coords);
1153       /*
1154        * The coords should all be in [0,1], however we can have NaNs,
1155        * which will wreak havoc. In particular the y1_clamped value below
1156        * can be -INT_MAX (on x86) and be propagated right through (probably
1157        * other values might be bogus in the end too).
1158        * So kill off the NaNs here.
1159        */
1160       coord0 = lp_build_max_ext(coord_bld, coords[0], coord_bld->zero,
1161                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1162       coord0 = lp_build_mul(coord_bld, coord0, flt_width_vec);
1163       /* instead of clamp, build mask if overflowed */
1164       coord0 = lp_build_sub(coord_bld, coord0, half);
1165       /* convert to int, compute lerp weight */
1166       /* not ideal with AVX (and no AVX2) */
1167       lp_build_ifloor_fract(coord_bld, coord0, &x0, &s_fpart);
1168       x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
1169       coord1 = lp_build_max_ext(coord_bld, coords[1], coord_bld->zero,
1170                                 GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
1171       coord1 = lp_build_mul(coord_bld, coord1, flt_height_vec);
1172       coord1 = lp_build_sub(coord_bld, coord1, half);
1173       lp_build_ifloor_fract(coord_bld, coord1, &y0, &t_fpart);
1174       y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
1175 
1176       fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
1177       fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
1178       fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
1179       fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
1180 
1181       fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
1182       fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
1183       have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
1184       have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
1185 
1186       /* needed for accurate corner filtering branch later, rely on 0 init */
1187       have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
1188 
1189       for (texel_index = 0; texel_index < 4; texel_index++) {
1190          xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
1191          ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
1192          zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
1193       }
1194 
1195       lp_build_if(&edge_if, bld->gallivm, have_edge);
1196 
1197       have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
1198       have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
1199       LLVMBuildStore(builder, have_corner, have_corners);
1200 
1201       /*
1202        * Need to feed clamped values here for cheap corner handling,
1203        * but only for y coord (as when falling off both edges we only
1204        * fall off the x one) - this should be sufficient.
1205        */
1206       y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
1207       y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1208 
1209       /*
1210        * Get all possible new coords.
1211        */
1212       lp_build_cube_new_coords(ivec_bld, face,
1213                                x0, x1, y0_clamped, y1_clamped,
1214                                length_minus_one,
1215                                new_faces, new_xcoords, new_ycoords);
1216 
1217       /* handle fall off x-, x+ direction */
1218       /* determine new coords, face (the two fall_off vars cannot both be true at the same time) */
1219       x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1220       y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1221       x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1222       y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1223       x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1224       y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1225       x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1226       y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1227 
1228       z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1229       z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1230 
1231       /* handle fall off y-, y+ direction */
1232       /*
1233        * Cheap corner logic: just hack up things so a texel doesn't fall
1234        * off both sides (which means filter weights will be wrong but we'll only
1235        * use valid texels in the filter).
1236        * This means however (y) coords must additionally be clamped (see above).
1237        * This corner handling should be fully OpenGL (but not d3d10) compliant.
1238        */
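      /*
       * Illustrative scalar sketch (not the generated code) of the selection
       * below for the lower-left sample in the y- direction; the other
       * samples and the y+ direction are analogous:
       *
       *    if (y0 < 0 && !(x0 < 0)) {
       *       x00 = new_xcoords[2][0];   // coords on the face across the y- edge
       *       y00 = new_ycoords[2][0];
       *       z00 = new_faces[2];
       *    }
       *
       * The "&& !(x0 < 0)" is the andnot: a texel falling off a corner keeps
       * the x-direction result chosen above, using the pre-clamped y.
       */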
1239       fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1240       fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1241       fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1242       fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1243 
1244       x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1245       y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1246       x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1247       y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1248       x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1249       y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1250       x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1251       y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1252 
1253       z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1254       z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1255       z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1256       z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1257 
1258       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1259          /* now can add cube layer to face (per sample) */
1260          z00 = lp_build_add(ivec_bld, z00, coords[3]);
1261          z01 = lp_build_add(ivec_bld, z01, coords[3]);
1262          z10 = lp_build_add(ivec_bld, z10, coords[3]);
1263          z11 = lp_build_add(ivec_bld, z11, coords[3]);
1264       }
1265 
1266       LLVMBuildStore(builder, x00, xs[0]);
1267       LLVMBuildStore(builder, x01, xs[1]);
1268       LLVMBuildStore(builder, x10, xs[2]);
1269       LLVMBuildStore(builder, x11, xs[3]);
1270       LLVMBuildStore(builder, y00, ys[0]);
1271       LLVMBuildStore(builder, y01, ys[1]);
1272       LLVMBuildStore(builder, y10, ys[2]);
1273       LLVMBuildStore(builder, y11, ys[3]);
1274       LLVMBuildStore(builder, z00, zs[0]);
1275       LLVMBuildStore(builder, z01, zs[1]);
1276       LLVMBuildStore(builder, z10, zs[2]);
1277       LLVMBuildStore(builder, z11, zs[3]);
1278 
1279       lp_build_else(&edge_if);
1280 
1281       LLVMBuildStore(builder, x0, xs[0]);
1282       LLVMBuildStore(builder, x1, xs[1]);
1283       LLVMBuildStore(builder, x0, xs[2]);
1284       LLVMBuildStore(builder, x1, xs[3]);
1285       LLVMBuildStore(builder, y0, ys[0]);
1286       LLVMBuildStore(builder, y0, ys[1]);
1287       LLVMBuildStore(builder, y1, ys[2]);
1288       LLVMBuildStore(builder, y1, ys[3]);
1289       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
1290          LLVMValueRef cube_layer = lp_build_add(ivec_bld, face, coords[3]);
1291          LLVMBuildStore(builder, cube_layer, zs[0]);
1292          LLVMBuildStore(builder, cube_layer, zs[1]);
1293          LLVMBuildStore(builder, cube_layer, zs[2]);
1294          LLVMBuildStore(builder, cube_layer, zs[3]);
1295       }
1296       else {
1297          LLVMBuildStore(builder, face, zs[0]);
1298          LLVMBuildStore(builder, face, zs[1]);
1299          LLVMBuildStore(builder, face, zs[2]);
1300          LLVMBuildStore(builder, face, zs[3]);
1301       }
1302 
1303       lp_build_endif(&edge_if);
1304 
1305       LLVMTypeRef type = ivec_bld->vec_type;
1306       x00 = LLVMBuildLoad2(builder, type, xs[0], "");
1307       x01 = LLVMBuildLoad2(builder, type, xs[1], "");
1308       x10 = LLVMBuildLoad2(builder, type, xs[2], "");
1309       x11 = LLVMBuildLoad2(builder, type, xs[3], "");
1310       y00 = LLVMBuildLoad2(builder, type, ys[0], "");
1311       y01 = LLVMBuildLoad2(builder, type, ys[1], "");
1312       y10 = LLVMBuildLoad2(builder, type, ys[2], "");
1313       y11 = LLVMBuildLoad2(builder, type, ys[3], "");
1314       z00 = LLVMBuildLoad2(builder, type, zs[0], "");
1315       z01 = LLVMBuildLoad2(builder, type, zs[1], "");
1316       z10 = LLVMBuildLoad2(builder, type, zs[2], "");
1317       z11 = LLVMBuildLoad2(builder, type, zs[3], "");
1318    }
1319 
1320    if (linear_mask) {
1321       /*
1322        * Whack filter weights into place. Whatever texel had more weight is
1323        * the one which should have been selected by nearest filtering hence
1324        * just use 100% weight for it.
1325        */
1326       struct lp_build_context *c_bld = &bld->coord_bld;
1327       LLVMValueRef w1_mask, w1_weight;
1328       LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1329 
1330       w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1331       /* this select is really just an "and" */
1332       w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1333       s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1334       if (dims >= 2) {
1335          w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1336          w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1337          t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1338          if (dims == 3) {
1339             w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1340             w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1341             r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1342          }
1343       }
1344    }
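   /*
    * Illustrative scalar sketch of the weight snapping above (not the
    * generated code): lanes that requested nearest filtering get their lerp
    * weight forced to 0 or 1, so the shared linear fetch/filter path below
    * degenerates to a nearest lookup for them.
    *
    *    float snap_weight(bool linear, float fpart)
    *    {
    *       return linear ? fpart : (fpart > 0.5f ? 1.0f : 0.0f);
    *    }
    */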
1345 
1346    /*
1347     * Get texture colors.
1348     */
1349    /* get x0/x1 texels */
1350    lp_build_sample_texel_soa(bld,
1351                              width_vec, height_vec, depth_vec,
1352                              x00, y00, z00,
1353                              row_stride_vec, img_stride_vec,
1354                              data_ptr, mipoffsets, neighbors[0][0]);
1355    lp_build_sample_texel_soa(bld,
1356                              width_vec, height_vec, depth_vec,
1357                              x01, y01, z01,
1358                              row_stride_vec, img_stride_vec,
1359                              data_ptr, mipoffsets, neighbors[0][1]);
1360 
1361    if (dims == 1) {
1362       assert(!is_gather);
1363       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1364          lp_build_reduce_filter(texel_bld,
1365                                 bld->static_sampler_state->reduction_mode,
1366                                 0,
1367                                 4,
1368                                 s_fpart,
1369                                 neighbors[0][0],
1370                                 neighbors[0][1],
1371                                 colors_out);
1372       }
1373       else {
1374          LLVMValueRef cmpval0, cmpval1;
1375          cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1376          cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1377          /* simplified lerp, AND mask with weight and add */
1378          colors_out[0] = lp_build_masklerp(texel_bld, s_fpart,
1379                                            cmpval0, cmpval1);
1380          colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1381       }
1382    }
1383    else {
1384       /* 2D/3D texture */
1385       struct lp_build_if_state corner_if;
1386       LLVMValueRef colors0[4], colorss[4] = { 0 };
1387 
1388       /* get x0/x1 texels at y1 */
1389       lp_build_sample_texel_soa(bld,
1390                                 width_vec, height_vec, depth_vec,
1391                                 x10, y10, z10,
1392                                 row_stride_vec, img_stride_vec,
1393                                 data_ptr, mipoffsets, neighbors[1][0]);
1394       lp_build_sample_texel_soa(bld,
1395                                 width_vec, height_vec, depth_vec,
1396                                 x11, y11, z11,
1397                                 row_stride_vec, img_stride_vec,
1398                                 data_ptr, mipoffsets, neighbors[1][1]);
1399 
1400       /*
1401        * To avoid having to duplicate the linear_mask / fetch code, use
1402        * another branch here (keyed on the corner condition, though the
1403        * edge condition would work as well).
1404        */
1405       if (have_corners && accurate_cube_corners &&
1406           bld->static_sampler_state->reduction_mode == PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE) {
1407          LLVMValueRef c00, c01, c10, c11, c00f, c01f, c10f, c11f;
1408          LLVMValueRef have_corner, one_third;
1409 
1410          colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs0");
1411          colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs1");
1412          colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs2");
1413          colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs3");
1414 
1415          have_corner = LLVMBuildLoad2(builder, int1t, have_corners, "");
1416 
1417          lp_build_if(&corner_if, bld->gallivm, have_corner);
1418 
1419          one_third = lp_build_const_vec(bld->gallivm, coord_bld->type,
1420                                         1.0f/3.0f);
1421 
1422          /* find corner */
1423          c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1424          c00f = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1425          c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1426          c01f = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1427          c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1428          c10f = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1429          c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1430          c11f = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1431 
1432          if (!is_gather) {
1433             /*
1434              * we can't use standard 2d lerp as we need per-element weight
1435              * in case of corners, so just calculate bilinear result as
1436              * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1437              * (This is actually less work than using 2d lerp, 7 vs. 9
1438              * instructions, though calculating the weights needs another 6.
1439              * Since the weights only need to be calculated once for all 4
1440              * channels it's probably still not slower than 2d lerp - of
1441              * course fixing up the weights has additional cost.)
1442              */
1443             LLVMValueRef w00, w01, w10, w11, wx0, wy0, c_weight, tmp;
1444             wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1445             wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1446             w00 = lp_build_mul(coord_bld, wx0, wy0);
1447             w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1448             w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1449             w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1450 
1451             /* find corner weight */
1452             c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1453             c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1454             c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1455             c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1456 
1457             /*
1458              * add 1/3 of the corner weight to the weight of the 3 other
1459              * samples and null out corner weight.
1460              */
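            /*
             * Scalar sketch of the redistribution below (illustrative only):
             *
             *    wc = weight of the non-existent corner texel;
             *    w00 += wc / 3;   if (corner == 00) w00 = 0;
             *    w01 += wc / 3;   if (corner == 01) w01 = 0;
             *    w10 += wc / 3;   if (corner == 10) w10 = 0;
             *    w11 += wc / 3;   if (corner == 11) w11 = 0;
             *
             * so the three valid texels share the corner weight equally and
             * the weights still sum to 1.
             */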
1461             c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1462             w00 = lp_build_add(coord_bld, w00, c_weight);
1463             w00 = lp_build_andnot(coord_bld, w00, c00f);
1464             w01 = lp_build_add(coord_bld, w01, c_weight);
1465             w01 = lp_build_andnot(coord_bld, w01, c01f);
1466             w10 = lp_build_add(coord_bld, w10, c_weight);
1467             w10 = lp_build_andnot(coord_bld, w10, c10f);
1468             w11 = lp_build_add(coord_bld, w11, c_weight);
1469             w11 = lp_build_andnot(coord_bld, w11, c11f);
1470 
1471             if (bld->static_sampler_state->compare_mode ==
1472                 PIPE_TEX_COMPARE_NONE) {
1473                for (chan = 0; chan < 4; chan++) {
1474                   colors0[chan] = lp_build_mul(coord_bld, w00,
1475                                                neighbors[0][0][chan]);
1476                   tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1477                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1478                   tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1479                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1480                   tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1481                   colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1482                }
1483             }
1484             else {
1485                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1486                cmpval00 = lp_build_sample_comparefunc(bld, coords[4],
1487                                                       neighbors[0][0][0]);
1488                cmpval01 = lp_build_sample_comparefunc(bld, coords[4],
1489                                                       neighbors[0][1][0]);
1490                cmpval10 = lp_build_sample_comparefunc(bld, coords[4],
1491                                                       neighbors[1][0][0]);
1492                cmpval11 = lp_build_sample_comparefunc(bld, coords[4],
1493                                                       neighbors[1][1][0]);
1494                /*
1495                 * inputs to interpolation are just masks so just add
1496                 * masked weights together
1497                 */
1498                cmpval00 = LLVMBuildBitCast(builder, cmpval00,
1499                                            coord_bld->vec_type, "");
1500                cmpval01 = LLVMBuildBitCast(builder, cmpval01,
1501                                            coord_bld->vec_type, "");
1502                cmpval10 = LLVMBuildBitCast(builder, cmpval10,
1503                                            coord_bld->vec_type, "");
1504                cmpval11 = LLVMBuildBitCast(builder, cmpval11,
1505                                            coord_bld->vec_type, "");
1506                colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1507                tmp = lp_build_and(coord_bld, w01, cmpval01);
1508                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1509                tmp = lp_build_and(coord_bld, w10, cmpval10);
1510                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1511                tmp = lp_build_and(coord_bld, w11, cmpval11);
1512                colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1513                colors0[1] = colors0[2] = colors0[3] = colors0[0];
1514             }
1515          }
1516          else {
1517             /*
1518              * We don't have any weights to adjust, so instead calculate
1519              * the fourth texel as simply the average of the other 3.
1520              * (This would work for non-gather too, however we'd have
1521              * a boatload more of the select stuff due to there being
1522              * 4 times as many colors as weights.)
1523              */
1524             LLVMValueRef col00, col01, col10, col11;
1525             LLVMValueRef colc, colc0, colc1;
1526             col10 = lp_build_swizzle_soa_channel(texel_bld,
1527                                                  neighbors[1][0], chan_swiz);
1528             col11 = lp_build_swizzle_soa_channel(texel_bld,
1529                                                  neighbors[1][1], chan_swiz);
1530             col01 = lp_build_swizzle_soa_channel(texel_bld,
1531                                                  neighbors[0][1], chan_swiz);
1532             col00 = lp_build_swizzle_soa_channel(texel_bld,
1533                                                  neighbors[0][0], chan_swiz);
1534 
1535             /*
1536              * The spec says for comparison filtering, the comparison
1537              * must happen before synthesizing the new value.
1538              * This means all gathered values are always 0 or 1,
1539              * except for the non-existing texel, which can be 0,1/3,2/3,1...
1540              * Seems like we'd be allowed to just return 0 or 1 too, so we
1541              * could simplify and pass down the compare mask values to the
1542              * end (using int arithmetic/compare on the mask values to
1543              * construct the fourth texel) and only there convert to floats
1544              * but it's probably not worth it (it might be easier for the cpu
1545              * but not for the code)...
1546              */
1547             if (bld->static_sampler_state->compare_mode !=
1548                 PIPE_TEX_COMPARE_NONE) {
1549                LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1550                cmpval00 = lp_build_sample_comparefunc(bld, coords[4], col00);
1551                cmpval01 = lp_build_sample_comparefunc(bld, coords[4], col01);
1552                cmpval10 = lp_build_sample_comparefunc(bld, coords[4], col10);
1553                cmpval11 = lp_build_sample_comparefunc(bld, coords[4], col11);
1554                col00 = lp_build_select(texel_bld, cmpval00,
1555                                        texel_bld->one, texel_bld->zero);
1556                col01 = lp_build_select(texel_bld, cmpval01,
1557                                        texel_bld->one, texel_bld->zero);
1558                col10 = lp_build_select(texel_bld, cmpval10,
1559                                        texel_bld->one, texel_bld->zero);
1560                col11 = lp_build_select(texel_bld, cmpval11,
1561                                        texel_bld->one, texel_bld->zero);
1562             }
1563 
1564             /*
1565              * Null out corner color.
1566              */
1567             col00 = lp_build_andnot(coord_bld, col00, c00f);
1568             col01 = lp_build_andnot(coord_bld, col01, c01f);
1569             col10 = lp_build_andnot(coord_bld, col10, c10f);
1570             col11 = lp_build_andnot(coord_bld, col11, c11f);
1571 
1572             /*
1573              * New corner texel color is all colors added / 3.
1574              */
1575             colc0 = lp_build_add(coord_bld, col00, col01);
1576             colc1 = lp_build_add(coord_bld, col10, col11);
1577             colc = lp_build_add(coord_bld, colc0, colc1);
1578             colc = lp_build_mul(coord_bld, one_third, colc);
1579 
1580             /*
1581              * Replace the corner texel color with the new value.
1582              */
1583             col00 = lp_build_select(coord_bld, c00, colc, col00);
1584             col01 = lp_build_select(coord_bld, c01, colc, col01);
1585             col10 = lp_build_select(coord_bld, c10, colc, col10);
1586             col11 = lp_build_select(coord_bld, c11, colc, col11);
1587 
1588             colors0[0] = col10;
1589             colors0[1] = col11;
1590             colors0[2] = col01;
1591             colors0[3] = col00;
1592          }
1593 
1594          LLVMBuildStore(builder, colors0[0], colorss[0]);
1595          LLVMBuildStore(builder, colors0[1], colorss[1]);
1596          LLVMBuildStore(builder, colors0[2], colorss[2]);
1597          LLVMBuildStore(builder, colors0[3], colorss[3]);
1598 
1599          lp_build_else(&corner_if);
1600       }
1601 
1602       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1603          if (is_gather) {
1604             /*
1605              * Just assign the red channel (no component selection yet).
1606              * This is a bit hackish, we usually do the swizzle at the
1607              * end of sampling (far fewer values to swizzle), but this
1608              * obviously cannot work when using gather.
1609              */
1610             colors0[0] = lp_build_swizzle_soa_channel(texel_bld,
1611                                                       neighbors[1][0],
1612                                                       chan_swiz);
1613             colors0[1] = lp_build_swizzle_soa_channel(texel_bld,
1614                                                       neighbors[1][1],
1615                                                       chan_swiz);
1616             colors0[2] = lp_build_swizzle_soa_channel(texel_bld,
1617                                                       neighbors[0][1],
1618                                                       chan_swiz);
1619             colors0[3] = lp_build_swizzle_soa_channel(texel_bld,
1620                                                       neighbors[0][0],
1621                                                       chan_swiz);
1622          }
1623          else {
1624             /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1625             lp_build_reduce_filter_2d(texel_bld,
1626                                       bld->static_sampler_state->reduction_mode,
1627                                       0,
1628                                       4,
1629                                       s_fpart,
1630                                       t_fpart,
1631                                       neighbors[0][0],
1632                                       neighbors[0][1],
1633                                       neighbors[1][0],
1634                                       neighbors[1][1],
1635                                       colors0);
1636          }
1637       }
1638       else {
1639          LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1640          cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1641          cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1642          cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1643          cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1644 
1645          if (is_gather) {
1646             /* more hacks for swizzling, should be X, ONE or ZERO... */
1647             colors0[0] = lp_build_select(texel_bld, cmpval10,
1648                                          texel_bld->one, texel_bld->zero);
1649             colors0[1] = lp_build_select(texel_bld, cmpval11,
1650                                          texel_bld->one, texel_bld->zero);
1651             colors0[2] = lp_build_select(texel_bld, cmpval01,
1652                                          texel_bld->one, texel_bld->zero);
1653             colors0[3] = lp_build_select(texel_bld, cmpval00,
1654                                          texel_bld->one, texel_bld->zero);
1655          }
1656          else {
1657             colors0[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1658                                              cmpval00, cmpval01, cmpval10, cmpval11);
1659             colors0[1] = colors0[2] = colors0[3] = colors0[0];
1660          }
1661       }
1662 
1663       if (have_corners && accurate_cube_corners &&
1664           bld->static_sampler_state->reduction_mode == PIPE_TEX_REDUCTION_WEIGHTED_AVERAGE) {
1665          LLVMBuildStore(builder, colors0[0], colorss[0]);
1666          LLVMBuildStore(builder, colors0[1], colorss[1]);
1667          LLVMBuildStore(builder, colors0[2], colorss[2]);
1668          LLVMBuildStore(builder, colors0[3], colorss[3]);
1669 
1670          lp_build_endif(&corner_if);
1671 
1672          colors0[0] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[0], "");
1673          colors0[1] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[1], "");
1674          colors0[2] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[2], "");
1675          colors0[3] = LLVMBuildLoad2(builder, coord_bld->vec_type, colorss[3], "");
1676       }
1677 
1678       if (dims == 3) {
1679          LLVMValueRef neighbors1[2][2][4];
1680          LLVMValueRef colors1[4];
1681 
1682          assert(!is_gather);
1683 
1684          /* get x0/x1/y0/y1 texels at z1 */
1685          lp_build_sample_texel_soa(bld,
1686                                    width_vec, height_vec, depth_vec,
1687                                    x00, y00, z1,
1688                                    row_stride_vec, img_stride_vec,
1689                                    data_ptr, mipoffsets, neighbors1[0][0]);
1690          lp_build_sample_texel_soa(bld,
1691                                    width_vec, height_vec, depth_vec,
1692                                    x01, y01, z1,
1693                                    row_stride_vec, img_stride_vec,
1694                                    data_ptr, mipoffsets, neighbors1[0][1]);
1695          lp_build_sample_texel_soa(bld,
1696                                    width_vec, height_vec, depth_vec,
1697                                    x10, y10, z1,
1698                                    row_stride_vec, img_stride_vec,
1699                                    data_ptr, mipoffsets, neighbors1[1][0]);
1700          lp_build_sample_texel_soa(bld,
1701                                    width_vec, height_vec, depth_vec,
1702                                    x11, y11, z1,
1703                                    row_stride_vec, img_stride_vec,
1704                                    data_ptr, mipoffsets, neighbors1[1][1]);
1705 
1706          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1707             /* Bilinear interpolate the four samples from the second Z slice */
1708             lp_build_reduce_filter_2d(texel_bld,
1709                                       bld->static_sampler_state->reduction_mode,
1710                                       0,
1711                                       4,
1712                                       s_fpart,
1713                                       t_fpart,
1714                                       neighbors1[0][0],
1715                                       neighbors1[0][1],
1716                                       neighbors1[1][0],
1717                                       neighbors1[1][1],
1718                                       colors1);
1719 
1720             /* Linearly interpolate the two samples from the two 3D slices */
1721             lp_build_reduce_filter(texel_bld,
1722                                    bld->static_sampler_state->reduction_mode,
1723                                    0,
1724                                    4,
1725                                    r_fpart,
1726                                    colors0,
1727                                    colors1,
1728                                    colors_out);
1729          }
1730          else {
1731             LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1732             cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1733             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1734             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1735             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1736             colors1[0] = lp_build_masklerp2d(texel_bld, s_fpart, t_fpart,
1737                                              cmpval00, cmpval01, cmpval10, cmpval11);
1738             /* Linearly interpolate the two samples from the two 3D slices */
1739             colors_out[0] = lp_build_lerp(texel_bld,
1740                                           r_fpart,
1741                                           colors0[0], colors1[0],
1742                                           0);
1743             colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1744          }
1745       }
1746       else {
1747          /* 2D tex */
1748          for (chan = 0; chan < 4; chan++) {
1749             colors_out[chan] = colors0[chan];
1750          }
1751       }
1752    }
1753    if (is_gather) {
1754       /*
1755        * For gather, we can't do our usual channel swizzling done later,
1756        * so do it here. It only really matters for 0/1 swizzles in case
1757        * of comparison filtering, since in that case the results would be
1758        * wrong. Without comparison it should all work out alright anyway,
1759        * but it can't hurt to do it here, since it will instantly drop all
1760        * the calculations above (though it's a rather stupid idea to do
1761        * gather on a channel which will always return 0 or 1 in any case...).
1762        */
1763       if (chan_swiz == PIPE_SWIZZLE_1) {
1764          for (chan = 0; chan < 4; chan++) {
1765             colors_out[chan] = texel_bld->one;
1766          }
1767       } else if (chan_swiz == PIPE_SWIZZLE_0) {
1768          for (chan = 0; chan < 4; chan++) {
1769             colors_out[chan] = texel_bld->zero;
1770          }
1771       }
1772    }
1773 }
1774 
1775 
1776 /**
1777  * Sample the texture/mipmap using given image filter and mip filter.
1778  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1779  * from (vectors or scalars).
1780  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1781  */
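/*
 * Rough scalar sketch of the flow below (illustrative only, helper names are
 * made up): sample level 0, and only fetch/lerp level 1 when mip filtering is
 * linear and the fractional lod is positive for at least one lane:
 *
 *    color = sample_level(ilevel0, coords);
 *    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR && any(lod_fpart > 0)) {
 *       lod_fpart = max(lod_fpart, 0);
 *       color = lerp(lod_fpart, color, sample_level(ilevel1, coords));
 *    }
 */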
1782 static void
1783 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1784                        unsigned img_filter,
1785                        unsigned mip_filter,
1786                        boolean is_gather,
1787                        const LLVMValueRef *coords,
1788                        const LLVMValueRef *offsets,
1789                        LLVMValueRef ilevel0,
1790                        LLVMValueRef ilevel1,
1791                        LLVMValueRef lod_fpart,
1792                        LLVMValueRef *colors_out)
1793 {
1794    LLVMBuilderRef builder = bld->gallivm->builder;
1795    LLVMValueRef size0 = NULL;
1796    LLVMValueRef size1 = NULL;
1797    LLVMValueRef row_stride0_vec = NULL;
1798    LLVMValueRef row_stride1_vec = NULL;
1799    LLVMValueRef img_stride0_vec = NULL;
1800    LLVMValueRef img_stride1_vec = NULL;
1801    LLVMValueRef data_ptr0 = NULL;
1802    LLVMValueRef data_ptr1 = NULL;
1803    LLVMValueRef mipoff0 = NULL;
1804    LLVMValueRef mipoff1 = NULL;
1805    LLVMValueRef colors0[4], colors1[4];
1806    unsigned chan;
1807 
1808    /* sample the first mipmap level */
1809    lp_build_mipmap_level_sizes(bld, ilevel0,
1810                                &size0,
1811                                &row_stride0_vec, &img_stride0_vec);
1812    if (bld->num_mips == 1) {
1813       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1814    }
1815    else {
1816       /* This path should work for num_lods == 1 too, but it is slightly less efficient */
1817       data_ptr0 = bld->base_ptr;
1818       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1819    }
1820    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1821       lp_build_sample_image_nearest(bld, size0,
1822                                     row_stride0_vec, img_stride0_vec,
1823                                     data_ptr0, mipoff0, coords, offsets,
1824                                     colors0);
1825    }
1826    else {
1827       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1828       lp_build_sample_image_linear(bld, is_gather, size0, NULL,
1829                                    row_stride0_vec, img_stride0_vec,
1830                                    data_ptr0, mipoff0, coords, offsets,
1831                                    colors0);
1832    }
1833 
1834    /* Store the first level's colors in the output variables */
1835    for (chan = 0; chan < 4; chan++) {
1836        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1837    }
1838 
1839    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1840       struct lp_build_if_state if_ctx;
1841       LLVMValueRef need_lerp;
1842 
1843       /* need_lerp = lod_fpart > 0 */
1844       if (bld->num_lods == 1) {
1845          need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1846                                    lod_fpart, bld->lodf_bld.zero,
1847                                    "need_lerp");
1848       }
1849       else {
1850          /*
1851           * We'll do mip filtering if any of the quads (or individual
1852           * pixels in case of per-pixel lod) need it.
1853           * It might be better to split the vectors here and only fetch/filter
1854           * quads which need it (if there's one lod per quad).
1855           */
1856          need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1857                                       PIPE_FUNC_GREATER,
1858                                       lod_fpart, bld->lodf_bld.zero);
1859          need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1860          lp_build_name(need_lerp, "need_lerp");
1861       }
1862 
1863       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1864       {
1865          /*
1866           * We unfortunately need to clamp lod_fpart here since we can get
1867           * negative values which would screw up filtering if not all
1868           * lod_fpart values have the same sign.
1869           */
1870          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1871                                   bld->lodf_bld.zero);
1872          /* sample the second mipmap level */
1873          lp_build_mipmap_level_sizes(bld, ilevel1,
1874                                      &size1,
1875                                      &row_stride1_vec, &img_stride1_vec);
1876          if (bld->num_mips == 1) {
1877             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1878          }
1879          else {
1880             data_ptr1 = bld->base_ptr;
1881             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1882          }
1883          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1884             lp_build_sample_image_nearest(bld, size1,
1885                                           row_stride1_vec, img_stride1_vec,
1886                                           data_ptr1, mipoff1, coords, offsets,
1887                                           colors1);
1888          }
1889          else {
1890             lp_build_sample_image_linear(bld, FALSE, size1, NULL,
1891                                          row_stride1_vec, img_stride1_vec,
1892                                          data_ptr1, mipoff1, coords, offsets,
1893                                          colors1);
1894          }
1895 
1896          /* interpolate samples from the two mipmap levels */
1897 
1898          if (bld->num_lods != bld->coord_type.length)
1899             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1900                                                               bld->lodf_bld.type,
1901                                                               bld->texel_bld.type,
1902                                                               lod_fpart);
1903 
1904          for (chan = 0; chan < 4; chan++) {
1905             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1906                                           colors0[chan], colors1[chan],
1907                                           0);
1908             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1909          }
1910       }
1911       lp_build_endif(&if_ctx);
1912    }
1913 }
1914 
1915 
1916 /**
1917  * Sample the texture/mipmap using given mip filter, and using
1918  * both nearest and linear filtering at the same time depending
1919  * on linear_mask.
1920  * lod can be per quad but linear_mask is always per pixel.
1921  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1922  * from (vectors or scalars).
1923  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1924  */
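/*
 * Compared to lp_build_sample_mipmap() above, this variant always takes the
 * linear fetch path and relies on lp_build_sample_image_linear() snapping the
 * filter weights to 0/1 for the lanes not set in linear_mask, so a single
 * pass can serve a per-pixel mix of nearest and linear filtering.
 */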
1925 static void
1926 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1927                             LLVMValueRef linear_mask,
1928                             unsigned mip_filter,
1929                             const LLVMValueRef *coords,
1930                             const LLVMValueRef *offsets,
1931                             LLVMValueRef ilevel0,
1932                             LLVMValueRef ilevel1,
1933                             LLVMValueRef lod_fpart,
1934                             LLVMValueRef lod_positive,
1935                             LLVMValueRef *colors_out)
1936 {
1937    LLVMBuilderRef builder = bld->gallivm->builder;
1938    LLVMValueRef size0 = NULL;
1939    LLVMValueRef size1 = NULL;
1940    LLVMValueRef row_stride0_vec = NULL;
1941    LLVMValueRef row_stride1_vec = NULL;
1942    LLVMValueRef img_stride0_vec = NULL;
1943    LLVMValueRef img_stride1_vec = NULL;
1944    LLVMValueRef data_ptr0 = NULL;
1945    LLVMValueRef data_ptr1 = NULL;
1946    LLVMValueRef mipoff0 = NULL;
1947    LLVMValueRef mipoff1 = NULL;
1948    LLVMValueRef colors0[4], colors1[4];
1949    unsigned chan;
1950 
1951    /* sample the first mipmap level */
1952    lp_build_mipmap_level_sizes(bld, ilevel0,
1953                                &size0,
1954                                &row_stride0_vec, &img_stride0_vec);
1955    if (bld->num_mips == 1) {
1956       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1957    }
1958    else {
1959       /* This path should work for num_lods == 1 too, but it is slightly less efficient */
1960       data_ptr0 = bld->base_ptr;
1961       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1962    }
1963 
1964    lp_build_sample_image_linear(bld, FALSE, size0, linear_mask,
1965                                 row_stride0_vec, img_stride0_vec,
1966                                 data_ptr0, mipoff0, coords, offsets,
1967                                 colors0);
1968 
1969    /* Store the first level's colors in the output variables */
1970    for (chan = 0; chan < 4; chan++) {
1971        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1972    }
1973 
1974    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1975       struct lp_build_if_state if_ctx;
1976       LLVMValueRef need_lerp;
1977 
1978       /*
1979        * We'll do mip filtering if any of the quads (or individual
1980        * pixels in case of per-pixel lod) need it.
1981        * Note using lod_positive here not lod_fpart since it may be the same
1982        * condition as that used in the outer "if" in the caller hence llvm
1983        * should be able to merge the branches in this case.
1984        */
1985       need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1986       lp_build_name(need_lerp, "need_lerp");
1987 
1988       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1989       {
1990          /*
1991           * We unfortunately need to clamp lod_fpart here since we can get
1992           * negative values which would screw up filtering if not all
1993           * lod_fpart values have the same sign.
1994           */
1995          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1996                                   bld->lodf_bld.zero);
1997          /* sample the second mipmap level */
1998          lp_build_mipmap_level_sizes(bld, ilevel1,
1999                                      &size1,
2000                                      &row_stride1_vec, &img_stride1_vec);
2001          if (bld->num_mips == 1) {
2002             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
2003          }
2004          else {
2005             data_ptr1 = bld->base_ptr;
2006             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
2007          }
2008 
2009          lp_build_sample_image_linear(bld, FALSE, size1, linear_mask,
2010                                       row_stride1_vec, img_stride1_vec,
2011                                       data_ptr1, mipoff1, coords, offsets,
2012                                       colors1);
2013 
2014          /* interpolate samples from the two mipmap levels */
2015 
2016          if (bld->num_lods != bld->coord_type.length)
2017             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2018                                                               bld->lodf_bld.type,
2019                                                               bld->texel_bld.type,
2020                                                               lod_fpart);
2021 
2022          for (chan = 0; chan < 4; chan++) {
2023             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
2024                                           colors0[chan], colors1[chan],
2025                                           0);
2026             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
2027          }
2028       }
2029       lp_build_endif(&if_ctx);
2030    }
2031 }
2032 
2033 
2034 /**
2035  * Build (per-coord) layer value.
2036  * Either clamp layer to valid values or fill in optional out_of_bounds
2037  * value and just return value unclamped.
2038  */
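/*
 * Scalar sketch of the two behaviours (illustrative only):
 *
 *    if (out_of_bounds)       // report out-of-bounds, return layer unclamped
 *       *out_of_bounds = layer < 0 || layer >= num_layers;
 *    else                     // clamp to the last valid (start) layer
 *       layer = CLAMP(layer, 0, num_layers - (is_cube_array ? 6 : 1));
 */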
2039 static LLVMValueRef
2040 lp_build_layer_coord(struct lp_build_sample_context *bld,
2041                      unsigned texture_unit,
2042                      boolean is_cube_array,
2043                      LLVMValueRef layer,
2044                      LLVMValueRef *out_of_bounds)
2045 {
2046    LLVMValueRef num_layers;
2047    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2048 
2049    num_layers = bld->dynamic_state->depth(bld->dynamic_state, bld->gallivm,
2050                                           bld->context_ptr, texture_unit, NULL);
2051 
2052    if (out_of_bounds) {
2053       LLVMValueRef out1, out;
2054       assert(!is_cube_array);
2055       num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
2056       out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
2057       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
2058       *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
2059       return layer;
2060    }
2061    else {
2062       LLVMValueRef maxlayer;
2063       LLVMValueRef s = is_cube_array ? lp_build_const_int32(bld->gallivm, 6) :
2064                                        bld->int_bld.one;
2065       maxlayer = lp_build_sub(&bld->int_bld, num_layers, s);
2066       maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
2067       return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
2068    }
2069 }
2070 
2071 #define WEIGHT_LUT_SIZE 1024
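/*
 * The filter table (bld->aniso_filter_table) is assumed to hold
 * WEIGHT_LUT_SIZE precomputed float weights, indexed by the ellipse value q
 * after scaling so that the ellipse boundary F maps to WEIGHT_LUT_SIZE-1;
 * it is built elsewhere and only read here.
 */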
2072 
2073 static void
2074 lp_build_sample_aniso(struct lp_build_sample_context *bld,
2075                       unsigned img_filter,
2076                       unsigned mip_filter,
2077                       boolean is_gather,
2078                       const LLVMValueRef *coords,
2079                       const LLVMValueRef *offsets,
2080                       LLVMValueRef ilevel0,
2081                       LLVMValueRef ilevel1,
2082                       LLVMValueRef lod_fpart,
2083                       LLVMValueRef *colors_out)
2084 {
2085    struct gallivm_state *gallivm = bld->gallivm;
2086    LLVMBuilderRef builder = gallivm->builder;
2087    struct lp_build_context *coord_bld = &bld->coord_bld;
2088    struct lp_build_context *float_size_bld = &bld->float_size_in_bld;
2089    LLVMValueRef ddx_ddy = lp_build_packed_ddx_ddy_twocoord(&bld->coord_bld, coords[0], coords[1]);
2090    LLVMValueRef float_size;
2091    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2092    LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
2093    LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0);
2094    const unsigned length = bld->coord_bld.type.length;
2095    const unsigned num_quads = length / 4;
2096    LLVMValueRef filter_table = bld->aniso_filter_table;
2097    LLVMValueRef size0, row_stride0_vec, img_stride0_vec;
2098    LLVMValueRef data_ptr0, mipoff0 = NULL;
2099 
2100    lp_build_mipmap_level_sizes(bld, ilevel0,
2101                                &size0,
2102                                &row_stride0_vec, &img_stride0_vec);
2103    if (bld->num_mips == 1) {
2104       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
2105    }
2106    else {
2107       /* This path should work for num_lods == 1 too, but it is slightly less efficient */
2108       data_ptr0 = bld->base_ptr;
2109       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
2110    }
2111 
2112    float_size = lp_build_int_to_float(&bld->float_size_in_bld, bld->int_size);
2113 
2114    LLVMValueRef float_size_lvl = lp_build_int_to_float(&bld->float_size_bld, size0);
2115    /* extract width and height into vectors for use later */
2116    static const unsigned char swizzle15[] = { /* splat elements 1 / 5 */
2117       1, 1, 1, 1, 5, 5, 5, 5
2118    };
2119    static const unsigned char swizzle04[] = { /* splat elements 0 / 4 */
2120       0, 0, 0, 0, 4, 4, 4, 4
2121    };
2122    LLVMValueRef width_dim, height_dim;
2123 
2124    width_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle04,
2125                                       bld->float_size_bld.type.length,
2126                                       bld->coord_bld.type.length);
2127    height_dim = lp_build_swizzle_aos_n(gallivm, float_size_lvl, swizzle15,
2128                                        bld->float_size_bld.type.length,
2129                                        bld->coord_bld.type.length);
2130 
2131 
2132    /* shuffle width/height for ddx/ddy calculations. */
2133    LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
2134 
2135    for (unsigned i = 0; i < num_quads; i++) {
2136       shuffles[i*4+0] = shuffles[i*4+1] = index0;
2137       shuffles[i*4+2] = shuffles[i*4+3] = index1;
2138    }
2139 
2140    LLVMValueRef floatdim =
2141       LLVMBuildShuffleVector(builder, float_size, float_size,
2142                              LLVMConstVector(shuffles, length), "");
2143 
2144    ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, floatdim);
2145 
2146    LLVMValueRef scaling =
2147       lp_build_shl(&bld->leveli_bld, bld->leveli_bld.one, ilevel0);
2148    scaling = lp_build_int_to_float(&bld->levelf_bld, scaling);
2149    scaling = lp_build_rcp(&bld->levelf_bld, scaling);
2150 
2151    if (bld->num_lods != length) {
2152       if (bld->levelf_bld.type.length == 1)
2153          scaling = lp_build_broadcast_scalar(coord_bld,
2154                                              scaling);
2155       else
2156          scaling = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2157                                                          bld->levelf_bld.type,
2158                                                          coord_bld->type,
2159                                                          scaling);
2160    }
2161 
2162    ddx_ddy = lp_build_mul(coord_bld, ddx_ddy, scaling);
2163 
2164    static const unsigned char swizzle01[] = { /* repeat elements 0, 1 */
2165       0, 1, 0, 1,
2166    };
2167    static const unsigned char swizzle23[] = {
2168       2, 3, 2, 3,
2169    };
2170 
2171    LLVMValueRef ddx_ddys, ddx_ddyt;
2172    ddx_ddys = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle01);
2173    ddx_ddyt = lp_build_swizzle_aos(coord_bld, ddx_ddy, swizzle23);
2174 
2175    /* compute ellipse coefficients */
2176    /* A*x*x + B*x*y + C*y*y = F */
2177    /* float A = vx*vx+vy*vy+1; */
2178    LLVMValueRef A = lp_build_mul(coord_bld, ddx_ddyt, ddx_ddyt);
2179 
2180    LLVMValueRef Ay = lp_build_swizzle_aos(coord_bld, A, swizzle15);
2181    A = lp_build_add(coord_bld, A, Ay);
2182    A = lp_build_add(coord_bld, A, coord_bld->one);
2183    A = lp_build_swizzle_aos(coord_bld, A, swizzle04);
2184 
2185    /* float B = -2*(ux*vx+uy*vy); */
2186    LLVMValueRef B = lp_build_mul(coord_bld, ddx_ddys, ddx_ddyt);
2187    LLVMValueRef By = lp_build_swizzle_aos(coord_bld, B, swizzle15);
2188    B = lp_build_add(coord_bld, B, By);
2189    B = lp_build_mul_imm(coord_bld, B, -2);
2190    B = lp_build_swizzle_aos(coord_bld, B, swizzle04);
2191 
2192    /* float C = ux*ux+uy*uy+1; */
2193    LLVMValueRef C = lp_build_mul(coord_bld, ddx_ddys, ddx_ddys);
2194    LLVMValueRef Cy = lp_build_swizzle_aos(coord_bld, C, swizzle15);
2195    C = lp_build_add(coord_bld, C, Cy);
2196    C = lp_build_add(coord_bld, C, coord_bld->one);
2197    C = lp_build_swizzle_aos(coord_bld, C, swizzle04);
2198 
2199    /* float F = A*C-B*B/4.0f; */
2200    LLVMValueRef F = lp_build_mul(coord_bld, B, B);
2201    F = lp_build_div(coord_bld, F, lp_build_const_vec(gallivm, coord_bld->type, 4.0));
2202    LLVMValueRef F_p2 = lp_build_mul(coord_bld, A, C);
2203    F = lp_build_sub(coord_bld, F_p2, F);
2204 
2205    /* compute ellipse bounding box in texture space */
2206    /* const float d = -B*B+4.0f*C*A; */
2207    LLVMValueRef d = lp_build_sub(coord_bld, coord_bld->zero, lp_build_mul(coord_bld, B, B));
2208    LLVMValueRef d_p2 = lp_build_mul(coord_bld, A, C);
2209    d_p2 = lp_build_mul_imm(coord_bld, d_p2, 4);
2210    d = lp_build_add(coord_bld, d, d_p2);
2211 
2212    /* const float box_u = 2.0f / d * sqrtf(d*C*F); */
2213    /* box_u -> half of bbox width  */
2214    LLVMValueRef temp;
2215    temp = lp_build_mul(coord_bld, d, C);
2216    temp = lp_build_mul(coord_bld, temp, F);
2217    temp = lp_build_sqrt(coord_bld, temp);
2218 
2219    LLVMValueRef box_u = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
2220    box_u = lp_build_mul(coord_bld, box_u, temp);
2221 
2222    /* const float box_v = 2.0f / d * sqrtf(A*d*F); */
2223    /* box_v -> half of bbox height */
2224    temp = lp_build_mul(coord_bld, A, d);
2225    temp = lp_build_mul(coord_bld, temp, F);
2226    temp = lp_build_sqrt(coord_bld, temp);
2227 
2228    LLVMValueRef box_v = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, 2.0), d);
2229    box_v = lp_build_mul(coord_bld, box_v, temp);
2230 
2231    /* Scale ellipse formula to directly index the Filter Lookup Table.
2232     * i.e. scale so that F = WEIGHT_LUT_SIZE-1
2233     */
2234    LLVMValueRef formScale = lp_build_div(coord_bld, lp_build_const_vec(gallivm, coord_bld->type, WEIGHT_LUT_SIZE - 1), F);
2235 
2236    A = lp_build_mul(coord_bld, A, formScale);
2237    B = lp_build_mul(coord_bld, B, formScale);
2238    C = lp_build_mul(coord_bld, C, formScale);
2239    /* F *= formScale; */ /* no need to scale F as we don't use it below here */
2240 
2241    LLVMValueRef ddq = lp_build_mul_imm(coord_bld, A, 2);
2242 
2243    /* Heckbert MS thesis, p. 59; scan over the bounding box of the ellipse
2244     * and incrementally update the value of Ax^2+Bxy+Cy^2; when this
2245     * value, q, is less than F, we're inside the ellipse
2246     */
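   /*
    * Scalar reference for the vectorized u/v loops below (illustrative only;
    * the accumulator names are made up):
    *
    *    for (v = v0; v <= v1; v++) {
    *       float V = v - tex_v;
    *       float dq = dq_base + B * V;
    *       float q  = (C * V + B * U) * V + q_base;
    *       for (u = u0; u <= u1; u++, q += dq, dq += ddq) {
    *          if (q < WEIGHT_LUT_SIZE) {   // inside the (scaled) ellipse
    *             float w = weight_lut[MAX2((int)q, 0)];
    *             den   += w;
    *             color += w * texel(u, v);
    *          }
    *       }
    *    }
    */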
2247 
2248    LLVMValueRef float_size0 = lp_build_int_to_float(float_size_bld, bld->int_size);
2249    LLVMValueRef width0 = lp_build_extract_broadcast(gallivm,
2250                                                     float_size_bld->type,
2251                                                     coord_bld->type,
2252                                                     float_size0, index0);
2253    LLVMValueRef height0 = lp_build_extract_broadcast(gallivm,
2254                                                      float_size_bld->type,
2255                                                      coord_bld->type,
2256                                                      float_size0, index1);
2257 
2258    /* texture->width0 * scaling */
2259    width0 = lp_build_mul(coord_bld, width0, scaling);
2260    /* texture->height0 * scaling */
2261    height0 = lp_build_mul(coord_bld, height0, scaling);
2262 
2263    /* tex_u = -0.5f * s[j] * texture->width0 * scaling */
2264    LLVMValueRef tex_u = lp_build_mul(coord_bld, coords[0], width0);
2265    tex_u = lp_build_add(coord_bld, tex_u, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));
2266 
2267    /* tex_v = -0.5f * t[j] * texture->height0 * scaling */
2268    LLVMValueRef tex_v = lp_build_mul(coord_bld, coords[1], height0);
2269    tex_v = lp_build_add(coord_bld, tex_v, lp_build_const_vec(gallivm, coord_bld->type, -0.5f));
2270 
2271    /* const int u0 = (int) floorf(tex_u - box_u); */
2272    LLVMValueRef u0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_u, box_u)));
2273    /* const int u1 = (int) ceilf(tex_u + box_u); */
2274    LLVMValueRef u1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_u, box_u)));
2275 
2276    /* const int v0 = (int) floorf(tex_v - box_v); */
2277    LLVMValueRef v0 = lp_build_itrunc(coord_bld, lp_build_floor(coord_bld, lp_build_sub(coord_bld, tex_v, box_v)));
2278    /* const int v1 = (int) ceilf(tex_v + box_v); */
2279    LLVMValueRef v1 = lp_build_itrunc(coord_bld, lp_build_ceil(coord_bld, lp_build_add(coord_bld, tex_v, box_v)));
2280 
2281    /* const float U = u0 - tex_u; */
2282    LLVMValueRef U = lp_build_sub(coord_bld, lp_build_int_to_float(coord_bld, u0), tex_u);
2283 
2284    /* A * (2 * U + 1) */
2285    LLVMValueRef dq_base = lp_build_mul_imm(coord_bld, U, 2);
2286    dq_base = lp_build_add(coord_bld, dq_base, coord_bld->one);
2287    dq_base = lp_build_mul(coord_bld, dq_base, A);
2288 
2289    /* A * U * U */
2290    LLVMValueRef q_base = lp_build_mul(coord_bld, U, U);
2291    q_base = lp_build_mul(coord_bld, q_base, A);
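   /*
    * The u/v loops below step q = A*u*u + B*u*v + C*v*v incrementally:
    * advancing u by one adds dq = A*(2*u + 1) + B*v, and dq itself grows by
    * ddq = 2*A per step; dq_base and q_base above are just those terms
    * evaluated at the starting offset U, with the B*V part added once V is
    * known inside the v loop.
    */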
2292 
2293    LLVMValueRef colors0[4];
2294    LLVMValueRef den_store = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "den");
2295 
2296    for (unsigned chan = 0; chan < 4; chan++)
2297       colors0[chan] = lp_build_alloca(gallivm, bld->texel_bld.vec_type, "colors");
2298 
2299    LLVMValueRef q_store, dq_store;
2300    q_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "q");
2301    dq_store = lp_build_alloca(gallivm, bld->coord_bld.vec_type, "dq");
2302 
2303    LLVMValueRef v_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "v_limiter");
2304    LLVMValueRef u_limiter = lp_build_alloca(gallivm, bld->int_coord_bld.vec_type, "u_limiter");
2305 
2306    LLVMBuildStore(builder, v0, v_limiter);
2307 
2308    /* create an LLVM loop block for the V iterator */
2309    LLVMBasicBlockRef v_loop_block = lp_build_insert_new_block(gallivm, "vloop");
2310 
2311    LLVMBuildBr(builder, v_loop_block);
2312    LLVMPositionBuilderAtEnd(builder, v_loop_block);
2313 
2314    LLVMValueRef v_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type, v_limiter, "");
2315    LLVMValueRef v_mask = LLVMBuildICmp(builder, LLVMIntSLE, v_val, v1, "");
2316 
2317    /* loop over V values. */
2318    {
2319       /*  const float V = v - tex_v; */
2320       LLVMValueRef V =
2321          lp_build_sub(coord_bld,
2322                       lp_build_int_to_float(coord_bld, v_val), tex_v);
2323 
2324       /* float dq = dq_base + B * V; */
2325       LLVMValueRef dq = lp_build_mul(coord_bld, V, B);
2326       dq = lp_build_add(coord_bld, dq, dq_base);
2327 
2328       /* float q = (C * V + B * U) * V + q_base */
2329       LLVMValueRef q = lp_build_mul(coord_bld, C, V);
2330       q = lp_build_add(coord_bld, q, lp_build_mul(coord_bld, B, U));
2331       q = lp_build_mul(coord_bld, q, V);
2332       q = lp_build_add(coord_bld, q, q_base);
2333 
2334       LLVMBuildStore(builder, q, q_store);
2335       LLVMBuildStore(builder, dq, dq_store);
2336 
2337       LLVMBuildStore(builder, u0, u_limiter);
2338 
2339       /* create an LLVM loop block for the U iterator */
2340       LLVMBasicBlockRef u_loop_block = lp_build_insert_new_block(gallivm, "uloop");
2341 
2342       LLVMBuildBr(builder, u_loop_block);
2343       LLVMPositionBuilderAtEnd(builder, u_loop_block);
2344 
2345       LLVMValueRef u_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type,
2346                                           u_limiter, "");
2347       LLVMValueRef u_mask = LLVMBuildICmp(builder,
2348                                           LLVMIntSLE,
2349                                           u_val,
2350                                           u1, "");
2351 
2352       /* loop over U values */
2353       {
2354          /* q = (int)q */
2355          q = lp_build_itrunc(coord_bld,
2356                              LLVMBuildLoad2(builder, bld->coord_bld.vec_type,
2357                                             q_store, ""));
2358 
2359          /*
2360           * avoid OOB access to the filter table: generate a mask for q > 1023
2361           * (past the end of the table), then clamp q into range.
2362           */
2363          LLVMValueRef q_mask = LLVMBuildICmp(builder,
2364                                              LLVMIntSLE,
2365                                              q,
2366                                              lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff), "");
2367          q_mask = LLVMBuildSExt(builder, q_mask, bld->int_coord_bld.vec_type, "");
2368 
2369          q = lp_build_max(&bld->int_coord_bld, q, bld->int_coord_bld.zero);
2370          q = lp_build_and(&bld->int_coord_bld, q, lp_build_const_int_vec(gallivm, bld->int_coord_bld.type, 0x3ff));
2371 
2372          /* update the offsets to deal with float size. */
2373          q = lp_build_mul_imm(&bld->int_coord_bld, q, 4);
2374          filter_table = LLVMBuildBitCast(gallivm->builder, filter_table, LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0), "");
2375 
2376          /* Lookup weights in filter table */
2377          LLVMValueRef weights = lp_build_gather(gallivm, coord_bld->type.length,
2378                                                 coord_bld->type.width,
2379                                                 lp_elem_type(coord_bld->type),
2380                                                 TRUE, filter_table, q, TRUE);
2381 
2382          /*
2383           * Mask off the weights here which should ensure no-op for loops
2384           * where some of the u/v values are not being calculated.
2385           */
2386          weights = LLVMBuildBitCast(builder, weights, bld->int_coord_bld.vec_type, "");
2387          weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, ""));
2388          weights = lp_build_and(&bld->int_coord_bld, weights, LLVMBuildSExt(builder, v_mask, bld->int_coord_bld.vec_type, ""));
2389          weights = lp_build_and(&bld->int_coord_bld, weights, q_mask);
2390          weights = LLVMBuildBitCast(builder, weights, bld->coord_bld.vec_type, "");
2391 
2392          /* if the weights are all 0 avoid doing the sampling at all. */
2393          struct lp_build_if_state noloadw0;
2394 
2395          LLVMValueRef wnz = LLVMBuildFCmp(gallivm->builder, LLVMRealUNE,
2396                                           weights, bld->coord_bld.zero, "");
2397          wnz = LLVMBuildSExt(builder, wnz, bld->int_coord_bld.vec_type, "");
2398          wnz = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, wnz);
2399          lp_build_if(&noloadw0, gallivm, wnz);
2400          LLVMValueRef new_coords[4];
2401          new_coords[0] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, u_val), width_dim);
2402          new_coords[1] = lp_build_div(coord_bld, lp_build_int_to_float(coord_bld, v_val), height_dim);
2403          new_coords[2] = coords[2];
2404          new_coords[3] = coords[3];
2405 
2406          /* fetch the texel at (u, v) with nearest filtering */
2407          LLVMValueRef temp_colors[4];
2408          lp_build_sample_image_nearest(bld, size0,
2409                                        row_stride0_vec, img_stride0_vec,
2410                                        data_ptr0, mipoff0, new_coords, offsets,
2411                                        temp_colors);
2412 
2413          for (unsigned chan = 0; chan < 4; chan++) {
2414             LLVMValueRef tcolor = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, colors0[chan], "");
2415 
2416             tcolor = lp_build_add(&bld->texel_bld, tcolor, lp_build_mul(&bld->texel_bld, temp_colors[chan], weights));
2417             LLVMBuildStore(builder, tcolor, colors0[chan]);
2418          }
2419 
2420          /* multiply colors by weight and add in (done in the loop above). */
2421          /* den += weight; */
2422          LLVMValueRef den = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, den_store, "");
2423          den = lp_build_add(&bld->texel_bld, den, weights);
2424          LLVMBuildStore(builder, den, den_store);
2425 
2426          lp_build_endif(&noloadw0);
2427          /* q += dq; */
2428          /* dq += ddq; */
2429          q = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, q_store, "");
2430          dq = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, dq_store, "");
2431          q = lp_build_add(coord_bld, q, dq);
2432          dq = lp_build_add(coord_bld, dq, ddq);
2433          LLVMBuildStore(builder, q, q_store);
2434          LLVMBuildStore(builder, dq, dq_store);
2435       }
2436       /* u += 1 */
2437       u_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type, u_limiter, "");
2438       u_val = lp_build_add(&bld->int_coord_bld, u_val, bld->int_coord_bld.one);
2439       LLVMBuildStore(builder, u_val, u_limiter);
2440 
2441       u_mask = LLVMBuildICmp(builder,
2442                              LLVMIntSLE,
2443                              u_val,
2444                              u1, "");
2445       LLVMValueRef u_end_cond = LLVMBuildSExt(builder, u_mask, bld->int_coord_bld.vec_type, "");
2446       u_end_cond = lp_build_any_true_range(&bld->coord_bld, bld->coord_bld.type.length, u_end_cond);
2447 
2448       LLVMBasicBlockRef u_end_loop = lp_build_insert_new_block(gallivm, "u_end_loop");
2449 
2450       LLVMBuildCondBr(builder, u_end_cond,
2451                       u_loop_block, u_end_loop);
2452 
2453       LLVMPositionBuilderAtEnd(builder, u_end_loop);
2454 
2455    }
2456 
2457    /* v += 1 */
2458    v_val = LLVMBuildLoad2(builder, bld->int_coord_bld.vec_type, v_limiter, "");
2459    v_val = lp_build_add(&bld->int_coord_bld, v_val, bld->int_coord_bld.one);
2460    LLVMBuildStore(builder, v_val, v_limiter);
2461 
2462    v_mask = LLVMBuildICmp(builder,
2463                           LLVMIntSLE,
2464                           v_val,
2465                           v1, "");
2466    LLVMValueRef v_end_cond = LLVMBuildSExt(builder, v_mask,
2467                                            bld->int_coord_bld.vec_type, "");
2468    v_end_cond = lp_build_any_true_range(&bld->coord_bld,
2469                                         bld->coord_bld.type.length, v_end_cond);
2470 
2471    LLVMBasicBlockRef v_end_loop = lp_build_insert_new_block(gallivm, "v_end_loop");
2472 
2473    LLVMBuildCondBr(builder, v_end_cond,
2474                    v_loop_block, v_end_loop);
2475 
2476    LLVMPositionBuilderAtEnd(builder, v_end_loop);
2477 
2478    LLVMValueRef den = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, den_store, "");
2479 
2480    for (unsigned chan = 0; chan < 4; chan++) {
2481       colors0[chan] =
2482          lp_build_div(&bld->texel_bld,
2483                       LLVMBuildLoad2(builder, bld->texel_bld.vec_type,
2484                                      colors0[chan], ""), den);
2485    }
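
   /*
    * At this point each channel holds sum(weight_i * texel_i) / sum(weight_i),
    * i.e. the weighted average over the sampled footprint.
    */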
2486 
2487    LLVMValueRef den0 = lp_build_cmp(&bld->coord_bld, PIPE_FUNC_EQUAL,
2488                                     den, bld->coord_bld.zero);
2489 
2490    LLVMValueRef den0_any =
2491       lp_build_any_true_range(&bld->coord_bld,
2492                               bld->coord_bld.type.length, den0);
2493 
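   /*
    * If any lane ended up with a zero total weight (no texel of the footprint
    * contributed), fall back to a plain linear sample below, selecting
    * per-lane between the fallback and the accumulated result.
    */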
2494    struct lp_build_if_state den0_fallback;
2495    lp_build_if(&den0_fallback, gallivm, den0_any);
2496    {
2497       LLVMValueRef colors_den0[4];
2498       lp_build_sample_image_linear(bld, false, size0, NULL,
2499                                    row_stride0_vec, img_stride0_vec,
2500                                    data_ptr0, mipoff0, coords, offsets,
2501                                    colors_den0);
2502       for (unsigned chan = 0; chan < 4; chan++) {
2503          LLVMValueRef chan_val =
2504             lp_build_select(&bld->texel_bld, den0,
2505                             colors_den0[chan], colors0[chan]);
2506          LLVMBuildStore(builder, chan_val, colors_out[chan]);
2507       }
2508    }
2509    lp_build_else(&den0_fallback);
2510    {
2511       for (unsigned chan = 0; chan < 4; chan++) {
2512          LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
2513       }
2514    }
2515    lp_build_endif(&den0_fallback);
2516 }
2517 
2518 
2519 /**
2520  * Calculate cube face, lod, mip levels.
2521  */
2522 static void
2523 lp_build_sample_common(struct lp_build_sample_context *bld,
2524                        boolean is_lodq,
2525                        unsigned texture_index,
2526                        unsigned sampler_index,
2527                        LLVMValueRef *coords,
2528                        const struct lp_derivatives *derivs, /* optional */
2529                        LLVMValueRef lod_bias, /* optional */
2530                        LLVMValueRef explicit_lod, /* optional */
2531                        LLVMValueRef *lod_pos_or_zero,
2532                        LLVMValueRef *lod,
2533                        LLVMValueRef *lod_fpart,
2534                        LLVMValueRef *ilevel0,
2535                        LLVMValueRef *ilevel1)
2536 {
2537    const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
2538    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
2539    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
2540    const unsigned target = bld->static_texture_state->target;
2541    const bool aniso = bld->static_sampler_state->aniso;
2542    LLVMValueRef first_level, cube_rho = NULL;
2543    LLVMValueRef lod_ipart = NULL;
2544    struct lp_derivatives cube_derivs;
2545 
2546    /*
2547    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
2548           mip_filter, min_filter, mag_filter);
2549    */
2550 
2551    /*
2552     * Choose cube face, recompute texcoords for the chosen face and
2553     * compute rho here too (as it requires transform of derivatives).
2554     */
2555    if (target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY) {
2556       boolean need_derivs;
2557       need_derivs = ((min_filter != mag_filter ||
2558                       mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
2559                       !bld->static_sampler_state->min_max_lod_equal &&
2560                       !explicit_lod);
2561       lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
2562       derivs = &cube_derivs;
2563       if (target == PIPE_TEXTURE_CUBE_ARRAY && !is_lodq) {
2564          /* calculate cube layer coord now */
2565          LLVMValueRef layer = lp_build_iround(&bld->coord_bld, coords[3]);
2566          LLVMValueRef six = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 6);
2567          layer = lp_build_mul(&bld->int_coord_bld, layer, six);
2568          coords[3] = lp_build_layer_coord(bld, texture_index, TRUE, layer, NULL);
2569          /* because of seamless filtering we can't add it to the face (coords[2]) here. */
2570       }
2571    }
2572    else if ((target == PIPE_TEXTURE_1D_ARRAY ||
2573              target == PIPE_TEXTURE_2D_ARRAY) && !is_lodq) {
2574       coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
2575       coords[2] = lp_build_layer_coord(bld, texture_index, FALSE, coords[2], NULL);
2576    }
2577 
2578    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
2579       /*
2580        * Clamp p coords to [0,1] for fixed function depth texture format here.
2581        * Technically this is not entirely correct for unorm depth as the ref
2582        * value should be converted to the depth format (quantization!) and
2583        * comparison then done in texture format. This would actually help
2584        * performance (since only need to do it once and could save the
2585        * per-sample conversion of texels to floats instead), but it would need
2586        * more messy code (would need to push at least some bits down to actual
2587        * fetch so conversion could be skipped, and would have ugly interaction
2588        * with border color, would need to convert border color to that format
2589        * too or do some other tricks to make it work).
2590        */
2591       const struct util_format_description *format_desc = bld->format_desc;
2592       unsigned chan_type;
2593       /* not entirely sure we couldn't end up with non-valid swizzle here */
2594       chan_type = format_desc->swizzle[0] <= PIPE_SWIZZLE_W ?
2595                      format_desc->channel[format_desc->swizzle[0]].type :
2596                      UTIL_FORMAT_TYPE_FLOAT;
2597       if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
2598          coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
2599                                     bld->coord_bld.zero, bld->coord_bld.one);
2600       }
2601    }
2602 
2603    /*
2604     * Compute the level of detail (float).
2605     */
2606    if (min_filter != mag_filter ||
2607        mip_filter != PIPE_TEX_MIPFILTER_NONE || is_lodq) {
2608       LLVMValueRef max_aniso = NULL;
2609 
2610       if (aniso)
2611          max_aniso = bld->dynamic_state->max_aniso(bld->dynamic_state,
2612                                                    bld->gallivm,
2613                                                    bld->context_ptr,
2614                                                    sampler_index);
2615 
2616       /* Need to compute lod either to choose mipmap levels or to
2617        * distinguish between minification/magnification with one mipmap level.
2618        */
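      /*
       * lp_build_lod_selector() splits the resulting lod into an integer part
       * (used below to pick the mip level(s)) and a fractional part (used as
       * the blend factor between two levels for MIPFILTER_LINEAR).
       */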
2619       lp_build_lod_selector(bld, is_lodq, texture_index, sampler_index,
2620                             coords[0], coords[1], coords[2], cube_rho,
2621                             derivs, lod_bias, explicit_lod,
2622                             mip_filter, max_aniso, lod,
2623                             &lod_ipart, lod_fpart, lod_pos_or_zero);
2624       if (is_lodq) {
2625          LLVMValueRef last_level;
2626          last_level = bld->dynamic_state->last_level(bld->dynamic_state,
2627                                                      bld->gallivm,
2628                                                      bld->context_ptr,
2629                                                      texture_index, NULL);
2630          first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2631                                                        bld->gallivm,
2632                                                        bld->context_ptr,
2633                                                        texture_index, NULL);
2634          last_level = lp_build_sub(&bld->int_bld, last_level, first_level);
2635          last_level = lp_build_int_to_float(&bld->float_bld, last_level);
2636          last_level = lp_build_broadcast_scalar(&bld->lodf_bld, last_level);
2637 
2638          switch (mip_filter) {
2639          case PIPE_TEX_MIPFILTER_NONE:
2640             *lod_fpart = bld->lodf_bld.zero;
2641             break;
2642          case PIPE_TEX_MIPFILTER_NEAREST:
2643             *lod_fpart = lp_build_round(&bld->lodf_bld, *lod_fpart);
2644             FALLTHROUGH;
2645          case PIPE_TEX_MIPFILTER_LINEAR:
2646             *lod_fpart = lp_build_clamp(&bld->lodf_bld, *lod_fpart,
2647                                         bld->lodf_bld.zero, last_level);
2648             break;
2649          }
2650          return;
2651       }
2652    } else {
2653       lod_ipart = bld->lodi_bld.zero;
2654       *lod_pos_or_zero = bld->lodi_bld.zero;
2655    }
2656 
2657    if ((bld->num_lods != bld->num_mips || bld->num_lods == 1) &&
2658        bld->lodi_bld.type.length != 1) {
2659       /* only makes sense if there's just a single mip level */
2660       assert(bld->num_mips == 1);
2661       lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
2662    }
2663 
2664    /*
2665     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
2666     */
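   /*
    * Summary of the cases handled below: MIPFILTER_NONE always uses the first
    * (base) level; MIPFILTER_NEAREST selects a single level from the integer
    * lod; MIPFILTER_LINEAR picks two adjacent levels plus the fractional lod
    * used to blend between them.  The aniso path only needs a single level.
    */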
2667 
2668    if (aniso) {
2669       lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2670       return;
2671    }
2672 
2673    switch (mip_filter) {
2674    default:
2675       assert(0 && "bad mip_filter value in lp_build_sample_soa()");
2676    case PIPE_TEX_MIPFILTER_NONE:
2677       /* always use mip level 0 */
2678       first_level = bld->dynamic_state->first_level(bld->dynamic_state,
2679                                                     bld->gallivm, bld->context_ptr,
2680                                                     texture_index, NULL);
2681       first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
2682       *ilevel0 = first_level;
2683       break;
2684    case PIPE_TEX_MIPFILTER_NEAREST:
2685       assert(lod_ipart);
2686       lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
2687       break;
2688    case PIPE_TEX_MIPFILTER_LINEAR:
2689       assert(lod_ipart);
2690       assert(*lod_fpart);
2691       lp_build_linear_mip_levels(bld, texture_index,
2692                                  lod_ipart, lod_fpart,
2693                                  ilevel0, ilevel1);
2694       break;
2695    }
2696 }
2697 
2698 
2699 static void
2700 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
2701                             unsigned sampler_unit)
2702 {
2703    struct gallivm_state *gallivm = bld->gallivm;
2704    LLVMBuilderRef builder = gallivm->builder;
2705    LLVMValueRef border_color_ptr =
2706       bld->dynamic_state->border_color(bld->dynamic_state, gallivm,
2707                                        bld->context_ptr, sampler_unit);
2708    LLVMValueRef border_color;
2709    const struct util_format_description *format_desc = bld->format_desc;
2710    struct lp_type vec4_type = bld->texel_type;
2711    struct lp_build_context vec4_bld;
2712    LLVMValueRef min_clamp = NULL;
2713    LLVMValueRef max_clamp = NULL;
2714 
2715    /*
2716     * For normalized formats we need to clamp the border color (technically
2717     * we probably should also quantize the data). Doing this here is
2718     * unfortunate but unavoidable for now, since the clamp depends on both
2719     * the sampler state (border color) and the sampler_view state (format).
2720     * GL also expects clamping for uint/sint formats, so do that as well
2721     * (d3d10 can't end up here with uint/sint since it only supports them
2722     * with ld).
2723     */
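   /*
    * For example (a sketch of the effect of the code below): with an R8_SNORM
    * texture, a border color of (2.0, 0, 0, 1) gets clamped to (1.0, 0, 0, 1)
    * here, since snorm channels can only represent [-1, 1].
    */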
2724    vec4_type.length = 4;
2725    lp_build_context_init(&vec4_bld, gallivm, vec4_type);
2726 
2727    /*
2728     * Vectorized clamping of border color. Loading is a bit of a hack since
2729     * we just cast the pointer to float array to pointer to vec4
2730     * (int or float).
2731     */
2732    border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
2733                                              lp_build_const_int32(gallivm, 0));
2734    border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
2735                                        LLVMPointerType(vec4_bld.vec_type, 0), "");
2736    border_color = LLVMBuildLoad2(builder, vec4_bld.vec_type, border_color_ptr, "");
2737    /* we don't have aligned type in the dynamic state unfortunately */
2738    LLVMSetAlignment(border_color, 4);
2739 
2740    /*
2741     * Instead of some incredibly complex logic trying to figure out the
2742     * clamping necessary for each channel, simply use the first channel,
2743     * and treat mixed signed/unsigned normalized formats specially.  (Mixed
2744     * non-normalized formats, which wouldn't work at all here, do not exist
2745     * for a good reason.)
2746     */
2747    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
2748       int chan;
2749       /* d/s needs special handling because both present means just sampling depth */
2750       if (util_format_is_depth_and_stencil(format_desc->format)) {
2751          chan = format_desc->swizzle[0];
2752       }
2753       else {
2754          chan = util_format_get_first_non_void_channel(format_desc->format);
2755       }
2756       if (chan >= 0 && chan <= PIPE_SWIZZLE_W) {
2757          unsigned chan_type = format_desc->channel[chan].type;
2758          unsigned chan_norm = format_desc->channel[chan].normalized;
2759          unsigned chan_pure = format_desc->channel[chan].pure_integer;
2760          if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
2761             if (chan_norm) {
2762                min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2763                max_clamp = vec4_bld.one;
2764             }
2765             else if (chan_pure) {
2766               /*
2767                * Border color was stored as int, hence we need a min/max clamp
2768                * only if the channel has fewer than 32 bits.
2769                */
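               /* e.g. an 8-bit pure sint channel gets clamped to [-128, 127]. */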
2770                unsigned chan_size = format_desc->channel[chan].size;
2771                if (chan_size < 32) {
2772                   min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2773                                                      0 - (1 << (chan_size - 1)));
2774                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2775                                                      (1 << (chan_size - 1)) - 1);
2776                }
2777             }
2778             /* TODO: no idea about non-pure, non-normalized! */
2779          }
2780          else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
2781             if (chan_norm) {
2782                min_clamp = vec4_bld.zero;
2783                max_clamp = vec4_bld.one;
2784             }
2785             /*
2786              * Need an ugly hack here: because we don't have Z32_FLOAT_X8X24
2787              * we use Z32_FLOAT_S8X24 to imply sampling the depth component
2788              * and ignoring stencil, which will blow up here if we try to
2789              * do a uint clamp in a float texel build...
2790              * And even if we had that format, mesa st also thinks using z24s8
2791              * means depth sampling ignoring stencil.
2792              */
2793             else if (chan_pure) {
2794                /*
2795                 * Border color was stored as uint, hence never need min
2796                 * clamp, and only need max clamp if chan has less than 32 bits.
2797                 */
2798                unsigned chan_size = format_desc->channel[chan].size;
2799                if (chan_size < 32) {
2800                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
2801                                                      (1 << chan_size) - 1);
2802                }
2803                /* TODO: no idea about non-pure, non-normalized! */
2804             }
2805          }
2806          else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
2807             /* TODO: I have no idea what clamp this would need if any! */
2808          }
2809       }
2810       /* mixed plain formats (or different pure size) */
2811       switch (format_desc->format) {
2812       case PIPE_FORMAT_B10G10R10A2_UINT:
2813       case PIPE_FORMAT_R10G10B10A2_UINT:
2814          {
2815             unsigned max10 = (1 << 10) - 1;
2816             max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
2817                                            max10, (1 << 2) - 1, NULL);
2818          }
2819          break;
2820       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
2821          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2822                                         -1.0F, 0.0F, NULL);
2823          max_clamp = vec4_bld.one;
2824          break;
2825       case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
2826       case PIPE_FORMAT_R5SG5SB6U_NORM:
2827          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
2828                                         0.0F, 0.0F, NULL);
2829          max_clamp = vec4_bld.one;
2830          break;
2831       default:
2832          break;
2833       }
2834    }
2835    else {
2836       /* cannot figure this out from format description */
2837       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
2838          /* s3tc formats are always unorm */
2839          min_clamp = vec4_bld.zero;
2840          max_clamp = vec4_bld.one;
2841       }
2842       else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
2843                format_desc->layout == UTIL_FORMAT_LAYOUT_ETC ||
2844                format_desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
2845          switch (format_desc->format) {
2846          case PIPE_FORMAT_RGTC1_UNORM:
2847          case PIPE_FORMAT_RGTC2_UNORM:
2848          case PIPE_FORMAT_LATC1_UNORM:
2849          case PIPE_FORMAT_LATC2_UNORM:
2850          case PIPE_FORMAT_ETC1_RGB8:
2851          case PIPE_FORMAT_BPTC_RGBA_UNORM:
2852          case PIPE_FORMAT_BPTC_SRGBA:
2853             min_clamp = vec4_bld.zero;
2854             max_clamp = vec4_bld.one;
2855             break;
2856          case PIPE_FORMAT_RGTC1_SNORM:
2857          case PIPE_FORMAT_RGTC2_SNORM:
2858          case PIPE_FORMAT_LATC1_SNORM:
2859          case PIPE_FORMAT_LATC2_SNORM:
2860             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2861             max_clamp = vec4_bld.one;
2862             break;
2863          case PIPE_FORMAT_BPTC_RGB_FLOAT:
2864             /* not sure if we should clamp to max half float? */
2865             break;
2866          case PIPE_FORMAT_BPTC_RGB_UFLOAT:
2867             min_clamp = vec4_bld.zero;
2868             break;
2869          default:
2870             assert(0);
2871             break;
2872          }
2873       }
2874       /*
2875        * all others from subsampled/other group, though we don't care
2876        * about yuv (and should not have any from zs here)
2877        */
2878       else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV) {
2879          switch (format_desc->format) {
2880          case PIPE_FORMAT_R8G8_B8G8_UNORM:
2881          case PIPE_FORMAT_G8R8_G8B8_UNORM:
2882          case PIPE_FORMAT_G8R8_B8R8_UNORM:
2883          case PIPE_FORMAT_R8G8_R8B8_UNORM:
2884          case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
2885             min_clamp = vec4_bld.zero;
2886             max_clamp = vec4_bld.one;
2887             break;
2888          case PIPE_FORMAT_R8G8Bx_SNORM:
2889             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
2890             max_clamp = vec4_bld.one;
2891             break;
2892             /*
2893              * Note smallfloat formats usually don't need clamping
2894              * (they still have infinite range) however this is not
2895              * true for r11g11b10 and r9g9b9e5, which can't represent
2896              * negative numbers (and additionally r9g9b9e5 can't represent
2897              * very large numbers). d3d10 seems happy without clamping in
2898              * this case, but gl spec is pretty clear: "for floating
2899              * point and integer formats, border values are clamped to
2900              * the representable range of the format" so do that here.
2901              */
2902          case PIPE_FORMAT_R11G11B10_FLOAT:
2903             min_clamp = vec4_bld.zero;
2904             break;
2905          case PIPE_FORMAT_R9G9B9E5_FLOAT:
2906             min_clamp = vec4_bld.zero;
2907             max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2908             break;
2909          default:
2910             assert(0);
2911             break;
2912          }
2913       }
2914    }
2915 
2916    if (min_clamp) {
2917       border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2918    }
2919    if (max_clamp) {
2920       border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2921    }
2922 
2923    bld->border_color_clamped = border_color;
2924 }
2925 
2926 
2927 /**
2928  * General texture sampling codegen.
2929  * This function handles texture sampling for all texture targets (1D,
2930  * 2D, 3D, cube) and all filtering modes.
2931  */
2932 static void
2933 lp_build_sample_general(struct lp_build_sample_context *bld,
2934                         unsigned sampler_unit,
2935                         boolean is_gather,
2936                         const LLVMValueRef *coords,
2937                         const LLVMValueRef *offsets,
2938                         LLVMValueRef lod_positive,
2939                         LLVMValueRef lod_fpart,
2940                         LLVMValueRef ilevel0,
2941                         LLVMValueRef ilevel1,
2942                         LLVMValueRef *colors_out)
2943 {
2944    LLVMBuilderRef builder = bld->gallivm->builder;
2945    const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2946    const unsigned mip_filter = sampler_state->min_mip_filter;
2947    const unsigned min_filter = sampler_state->min_img_filter;
2948    const unsigned mag_filter = sampler_state->mag_img_filter;
2949    LLVMValueRef texels[4];
2950    unsigned chan;
2951 
2952    /* if we need border color, (potentially) clamp it now */
2953    if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2954                                               min_filter,
2955                                               mag_filter) ||
2956        (bld->dims > 1 &&
2957            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2958                                                   min_filter,
2959                                                   mag_filter)) ||
2960        (bld->dims > 2 &&
2961            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2962                                                   min_filter,
2963                                                   mag_filter))) {
2964       lp_build_clamp_border_color(bld, sampler_unit);
2965    }
2966 
2967 
2968    /*
2969     * Get/interpolate texture colors.
2970     */
2971 
2972    for (chan = 0; chan < 4; ++chan) {
2973      texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2974      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2975    }
2976 
2977    if (sampler_state->aniso) {
2978       lp_build_sample_aniso(bld, PIPE_TEX_FILTER_NEAREST, mip_filter,
2979                             false, coords, offsets, ilevel0,
2980                             ilevel1, lod_fpart, texels);
2981    } else if (min_filter == mag_filter) {
2982       /* no need to distinguish between minification and magnification */
2983       lp_build_sample_mipmap(bld, min_filter, mip_filter,
2984                              is_gather,
2985                              coords, offsets,
2986                              ilevel0, ilevel1, lod_fpart,
2987                              texels);
2988    }
2989    else {
2990       /*
2991        * Could also get rid of the if-logic and always use mipmap_both, both
2992        * for the single lod and multi-lod case if nothing really uses this.
2993        */
2994       if (bld->num_lods == 1) {
2995          /* Emit conditional to choose min image filter or mag image filter
2996           * depending on the lod being > 0 or <= 0, respectively.
2997           */
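         /* Scalar equivalent: filter = lod_positive ? min_filter : mag_filter; */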
2998          struct lp_build_if_state if_ctx;
2999 
3000          lod_positive = LLVMBuildTrunc(builder, lod_positive,
3001                                        LLVMInt1TypeInContext(bld->gallivm->context),
3002                                        "lod_pos");
3003 
3004          lp_build_if(&if_ctx, bld->gallivm, lod_positive);
3005          {
3006             /* Use the minification filter */
3007             lp_build_sample_mipmap(bld, min_filter, mip_filter, FALSE,
3008                                    coords, offsets,
3009                                    ilevel0, ilevel1, lod_fpart,
3010                                    texels);
3011          }
3012          lp_build_else(&if_ctx);
3013          {
3014             /* Use the magnification filter */
3015             lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
3016                                    FALSE,
3017                                    coords, offsets,
3018                                    ilevel0, NULL, NULL,
3019                                    texels);
3020          }
3021          lp_build_endif(&if_ctx);
3022       }
3023       else {
3024          LLVMValueRef need_linear, linear_mask;
3025          unsigned mip_filter_for_nearest;
3026          struct lp_build_if_state if_ctx;
3027 
3028          if (min_filter == PIPE_TEX_FILTER_LINEAR) {
3029             linear_mask = lod_positive;
3030             mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
3031          }
3032          else {
3033             linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
3034             mip_filter_for_nearest = mip_filter;
3035          }
3036          need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
3037                                                linear_mask);
3038          lp_build_name(need_linear, "need_linear");
3039 
3040          if (bld->num_lods != bld->coord_type.length) {
3041             linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
3042                                                                 bld->lodi_type,
3043                                                                 bld->int_coord_type,
3044                                                                 linear_mask);
3045          }
3046 
3047          lp_build_if(&if_ctx, bld->gallivm, need_linear);
3048          {
3049             /*
3050              * Do sampling with both filters simultaneously. This means using
3051              * a linear filter and doing some tricks (with weights) for the
3052              * pixels which need the nearest filter.
3053              * It's probably rare that some pixels need nearest and some need
3054              * linear filtering, but the fixups required for the nearest pixels
3055              * aren't all that complicated, so just always run a combined path
3056              * if at least some pixels require linear.
3057              */
3058             lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
3059                                         coords, offsets,
3060                                         ilevel0, ilevel1,
3061                                         lod_fpart, lod_positive,
3062                                         texels);
3063          }
3064          lp_build_else(&if_ctx);
3065          {
3066             /*
3067              * All pixels require just nearest filtering, which is way
3068              * cheaper than linear, hence do a separate path for that.
3069              */
3070             lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
3071                                    mip_filter_for_nearest, FALSE,
3072                                    coords, offsets,
3073                                    ilevel0, ilevel1, lod_fpart,
3074                                    texels);
3075          }
3076          lp_build_endif(&if_ctx);
3077       }
3078    }
3079 
3080    for (chan = 0; chan < 4; ++chan) {
3081      colors_out[chan] = LLVMBuildLoad2(builder, bld->texel_bld.vec_type, texels[chan], "");
3082      lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
3083    }
3084 }
3085 
3086 
3087 /**
3088  * Texel fetch function.  In contrast to general sampling there is no
3089  * filtering, no coord minification, lod (if any) is always explicit uint,
3090  * coords are uints (in terms of texel units) directly to be applied to the
3091  * selected mip level (after adding texel offsets).  This function handles
3092  * texel fetch for all targets where texel fetch is supported (no cube maps,
3093  * but 1d, 2d, 3d are supported, arrays and buffers should be too).
3094  */
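/*
 * (This is roughly the path used for GLSL texelFetch() / d3d10 "ld" style
 * operations, hence the explicit integer coords and explicit lod.)
 */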
3095 static void
3096 lp_build_fetch_texel(struct lp_build_sample_context *bld,
3097                      unsigned texture_unit,
3098                      LLVMValueRef ms_index,
3099                      const LLVMValueRef *coords,
3100                      LLVMValueRef explicit_lod,
3101                      const LLVMValueRef *offsets,
3102                      LLVMValueRef *colors_out)
3103 {
3104    struct lp_build_context *perquadi_bld = &bld->lodi_bld;
3105    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
3106    unsigned dims = bld->dims, chan;
3107    unsigned target = bld->static_texture_state->target;
3108    boolean out_of_bound_ret_zero = TRUE;
3109    LLVMValueRef size, ilevel;
3110    LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
3111    LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
3112    LLVMValueRef width, height, depth, i, j;
3113    LLVMValueRef offset, out_of_bounds, out1;
3114 
3115    out_of_bounds = int_coord_bld->zero;
3116 
3117    if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
3118       if (bld->num_mips != int_coord_bld->type.length) {
3119          ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
3120                                             perquadi_bld->type, explicit_lod, 0);
3121       }
3122       else {
3123          ilevel = explicit_lod;
3124       }
3125       lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
3126                                  out_of_bound_ret_zero ? &out_of_bounds : NULL);
3127    }
3128    else {
3129       assert(bld->num_mips == 1);
3130       if (bld->static_texture_state->target != PIPE_BUFFER) {
3131          ilevel = bld->dynamic_state->first_level(bld->dynamic_state, bld->gallivm,
3132                                                   bld->context_ptr, texture_unit, NULL);
3133       }
3134       else {
3135          ilevel = lp_build_const_int32(bld->gallivm, 0);
3136       }
3137    }
3138    lp_build_mipmap_level_sizes(bld, ilevel,
3139                                &size,
3140                                &row_stride_vec, &img_stride_vec);
3141    lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
3142                                 size, &width, &height, &depth);
3143 
3144    if (target == PIPE_TEXTURE_1D_ARRAY ||
3145        target == PIPE_TEXTURE_2D_ARRAY) {
3146       if (out_of_bound_ret_zero) {
3147          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, &out1);
3148          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3149       }
3150       else {
3151          z = lp_build_layer_coord(bld, texture_unit, FALSE, z, NULL);
3152       }
3153    }
3154 
3155    /* This is a lot like border sampling */
3156    if (offsets[0]) {
3157       /*
3158        * coords are really unsigned, offsets are signed, but I don't think
3159        * exceeding 31 bits is possible
3160        */
3161       x = lp_build_add(int_coord_bld, x, offsets[0]);
3162    }
3163    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
3164    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3165    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
3166    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3167 
3168    if (dims >= 2) {
3169       if (offsets[1]) {
3170          y = lp_build_add(int_coord_bld, y, offsets[1]);
3171       }
3172       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
3173       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3174       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
3175       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3176 
3177       if (dims >= 3) {
3178          if (offsets[2]) {
3179             z = lp_build_add(int_coord_bld, z, offsets[2]);
3180          }
3181          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
3182          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3183          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
3184          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3185       }
3186    }
3187 
3188    lp_build_sample_offset(int_coord_bld,
3189                           bld->format_desc,
3190                           x, y, z, row_stride_vec, img_stride_vec,
3191                           &offset, &i, &j);
3192 
3193    if (bld->static_texture_state->target != PIPE_BUFFER) {
3194       offset = lp_build_add(int_coord_bld, offset,
3195                             lp_build_get_mip_offsets(bld, ilevel));
3196    }
3197 
3198    if (bld->fetch_ms) {
3199       LLVMValueRef num_samples;
3200       num_samples = bld->dynamic_state->num_samples(bld->dynamic_state, bld->gallivm,
3201                                                     bld->context_ptr, texture_unit, NULL);
3202       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, ms_index, int_coord_bld->zero);
3203       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3204       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, ms_index, lp_build_broadcast_scalar(int_coord_bld, num_samples));
3205       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
3206       offset = lp_build_add(int_coord_bld, offset,
3207                             lp_build_mul(int_coord_bld, bld->sample_stride, ms_index));
3208    }
3209 
3210    offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
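
   /*
    * out_of_bounds lanes have their offset forced to 0 here so the fetch
    * below stays within the resource; their result is then replaced (via the
    * selects at the end of this function) with the format-swizzled zero color.
    */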
3211 
3212    lp_build_fetch_rgba_soa(bld->gallivm,
3213                            bld->format_desc,
3214                            bld->texel_type, TRUE,
3215                            bld->base_ptr, offset,
3216                            i, j,
3217                            bld->cache,
3218                            colors_out);
3219 
3220    if (out_of_bound_ret_zero) {
3221       /*
3222        * Only needed for ARB_robust_buffer_access_behavior and d3d10.
3223        * Could use min/max above instead of out-of-bounds comparisons
3224        * if we don't care about the result returned for out-of-bounds.
3225        */
3226       LLVMValueRef oob[4] = {
3227          bld->texel_bld.zero,
3228          bld->texel_bld.zero,
3229          bld->texel_bld.zero,
3230          bld->texel_bld.zero,
3231       };
3232       lp_build_format_swizzle_soa(bld->format_desc, &bld->texel_bld, oob, oob);
3233       for (chan = 0; chan < 4; chan++) {
3234          colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
3235                                             oob[chan], colors_out[chan]);
3236       }
3237    }
3238 }
3239 
3240 
3241 /**
3242  * Just set texels to white instead of actually sampling the texture.
3243  * For debugging.
3244  */
3245 void
3246 lp_build_sample_nop(struct gallivm_state *gallivm,
3247                     struct lp_type type,
3248                     const LLVMValueRef *coords,
3249                     LLVMValueRef texel_out[4])
3250 {
3251    LLVMValueRef one = lp_build_one(gallivm, type);
3252    for (unsigned chan = 0; chan < 4; chan++) {
3253       texel_out[chan] = one;
3254    }
3255 }
3256 
3257 
3258 static struct lp_type
3259 lp_build_texel_type(struct lp_type texel_type,
3260                     const struct util_format_description *format_desc)
3261 {
3262    /* Always using the first channel should hopefully be safe;
3263     * if not, things WILL break in other places anyway.
3264     */
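   /*
    * For example (effect of the checks below): R32G32B32A32_SINT texels are
    * returned as signed int vectors, R8G8B8A8_UINT as unsigned int vectors,
    * and stencil-only formats such as S8_UINT as unsigned int as well.
    */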
3265    if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
3266        format_desc->channel[0].pure_integer) {
3267       if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
3268          texel_type = lp_type_int_vec(texel_type.width, texel_type.width * texel_type.length);
3269       } else if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
3270          texel_type = lp_type_uint_vec(texel_type.width, texel_type.width * texel_type.length);
3271       }
3272    }
3273    else if (util_format_has_stencil(format_desc) &&
3274        !util_format_has_depth(format_desc)) {
3275       /* for stencil only formats, sample stencil (uint) */
3276       texel_type = lp_type_uint_vec(texel_type.width, texel_type.width * texel_type.length);
3277    }
3278    return texel_type;
3279 }
3280 
3281 
3282 /**
3283  * Build the actual texture sampling code.
3284  * 'texel' will return a vector of four LLVMValueRefs corresponding to
3285  * R, G, B, A.
3286  * \param type  vector float type to use for coords, etc.
3287  * \param sample_key
3288  * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
3289  */
3290 static void
3291 lp_build_sample_soa_code(struct gallivm_state *gallivm,
3292                          const struct lp_static_texture_state *static_texture_state,
3293                          const struct lp_static_sampler_state *static_sampler_state,
3294                          struct lp_sampler_dynamic_state *dynamic_state,
3295                          struct lp_type type,
3296                          unsigned sample_key,
3297                          unsigned texture_index,
3298                          unsigned sampler_index,
3299                          LLVMValueRef context_ptr,
3300                          LLVMValueRef thread_data_ptr,
3301                          const LLVMValueRef *coords,
3302                          const LLVMValueRef *offsets,
3303                          const struct lp_derivatives *derivs, /* optional */
3304                          LLVMValueRef lod, /* optional */
3305                          LLVMValueRef ms_index, /* optional */
3306                          LLVMValueRef aniso_filter_table,
3307                          LLVMValueRef texel_out[4])
3308 {
3309    assert(static_texture_state);
3310    assert(static_texture_state->format < PIPE_FORMAT_COUNT);
3311    assert(static_sampler_state);
3312 
3313    const enum pipe_texture_target target = static_texture_state->target;
3314    const unsigned dims = texture_dims(target);
3315    const unsigned num_quads = type.length / 4;
3316    struct lp_build_sample_context bld;
3317    struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
3318    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
3319    LLVMBuilderRef builder = gallivm->builder;
3320 
3321    if (0) {
3322       enum pipe_format fmt = static_texture_state->format;
3323       debug_printf("Sample from %s\n", util_format_name(fmt));
3324    }
3325 
3326    const enum lp_sampler_lod_property lod_property =
3327       (sample_key & LP_SAMPLER_LOD_PROPERTY_MASK) >>
3328       LP_SAMPLER_LOD_PROPERTY_SHIFT;
3329    const enum lp_sampler_lod_control lod_control =
3330       (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3331       LP_SAMPLER_LOD_CONTROL_SHIFT;
3332    const enum lp_sampler_op_type op_type =
3333       (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3334       LP_SAMPLER_OP_TYPE_SHIFT;
3335 
3336    const boolean fetch_ms = !!(sample_key & LP_SAMPLER_FETCH_MS);
3337    const boolean op_is_tex = op_type == LP_SAMPLER_OP_TEXTURE;
3338    const boolean op_is_lodq = op_type == LP_SAMPLER_OP_LODQ;
3339    const boolean op_is_gather = op_type == LP_SAMPLER_OP_GATHER;
3340 
3341    LLVMValueRef lod_bias = NULL;
3342    LLVMValueRef explicit_lod = NULL;
3343    if (lod_control == LP_SAMPLER_LOD_BIAS) {
3344       lod_bias = lod;
3345       assert(lod);
3346       assert(derivs == NULL);
3347    }
3348    else if (lod_control == LP_SAMPLER_LOD_EXPLICIT) {
3349       explicit_lod = lod;
3350       assert(lod);
3351       assert(derivs == NULL);
3352    }
3353    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
3354       assert(derivs);
3355       assert(lod == NULL);
3356    }
3357    else {
3358       assert(derivs == NULL);
3359       assert(lod == NULL);
3360    }
3361 
3362    if (static_texture_state->format == PIPE_FORMAT_NONE) {
3363       /*
3364        * If there's nothing bound, format is NONE, and we must return
3365        * all zero as mandated by d3d10 in this case.
3366        */
3367       LLVMValueRef zero = lp_build_zero(gallivm, type);
3368       for (unsigned chan = 0; chan < 4; chan++) {
3369          texel_out[chan] = zero;
3370       }
3371       return;
3372    }
3373 
3374    assert(type.floating);
3375 
3376    /* Setup our build context */
3377    memset(&bld, 0, sizeof bld);
3378    bld.gallivm = gallivm;
3379    bld.context_ptr = context_ptr;
3380    bld.aniso_filter_table = aniso_filter_table;
3381    bld.static_sampler_state = &derived_sampler_state;
3382    bld.static_texture_state = static_texture_state;
3383    bld.dynamic_state = dynamic_state;
3384    bld.format_desc = util_format_description(static_texture_state->format);
3385    bld.dims = dims;
3386 
3387    if (gallivm_perf & GALLIVM_PERF_NO_QUAD_LOD || op_is_lodq) {
3388       bld.no_quad_lod = TRUE;
3389    }
3390    if (!(gallivm_perf & GALLIVM_PERF_RHO_APPROX) || op_is_lodq) {
3391       bld.no_rho_approx = TRUE;
3392    }
3393    if (!(gallivm_perf & GALLIVM_PERF_BRILINEAR) || op_is_lodq || lod_bias || explicit_lod) {
3394       bld.no_brilinear = TRUE;
3395    }
3396 
3397    bld.vector_width = lp_type_width(type);
3398 
3399    bld.float_type = lp_type_float(32);
3400    bld.int_type = lp_type_int(32);
3401    bld.coord_type = type;
3402    bld.int_coord_type = lp_int_type(type);
3403    bld.float_size_in_type = lp_type_float(32);
3404    bld.float_size_in_type.length = dims > 1 ? 4 : 1;
3405    bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
3406 
3407    bld.texel_type = lp_build_texel_type(type, bld.format_desc);
3408 
3409    if (!static_texture_state->level_zero_only ||
3410        !static_sampler_state->max_lod_pos || op_is_lodq) {
3411       derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
3412    } else {
3413       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3414    }
3415 
3416    if (op_is_gather) {
3417       /*
3418        * gather4 is exactly like GL_LINEAR filtering but in the end skipping
3419        * the actual filtering. Using mostly the same paths, so cube face
3420        * selection, coord wrapping etc. all naturally uses the same code.
3421        */
3422       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3423       derived_sampler_state.min_img_filter = PIPE_TEX_FILTER_LINEAR;
3424       derived_sampler_state.mag_img_filter = PIPE_TEX_FILTER_LINEAR;
3425    }
3426 
3427    const enum pipe_tex_mipfilter mip_filter =
3428       derived_sampler_state.min_mip_filter;
3429 
3430    if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3431        static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3432       /*
3433        * Seamless filtering ignores wrap modes.
3434        * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
3435        * bilinear it's not correct but way better than using for instance
3436        * repeat.  Note we even set this for non-seamless. Technically GL
3437        * allows any wrap mode, which made sense when supporting true borders
3438        * (can get seamless effect with border and CLAMP_TO_BORDER), but
3439        * gallium doesn't support borders and d3d9 requires wrap modes to be
3440        * ignored and it's a pain to fix up the sampler state (as it makes it
3441        * texture dependent).
3442        */
3443       derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
3444       derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
3445    }
3446 
3447    /*
3448     * We could force CLAMP to CLAMP_TO_EDGE here if min/mag filter is nearest,
3449     * so AoS path could be used. Not sure it's worth the trouble...
3450     */
3451    const enum pipe_tex_filter min_img_filter =
3452       derived_sampler_state.min_img_filter;
3453    const enum pipe_tex_filter mag_img_filter =
3454       derived_sampler_state.mag_img_filter;
3455 
3456    /*
3457     * This is all a bit complicated; different paths are chosen for performance
3458     * reasons.
3459     * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
3460     * everything (the last two options are equivalent for the 4-wide case).
3461     * If there's a per-quad lod but we split to 4-wide so we can use AoS, the
3462     * per-quad lod is calculated and the lod value extracted afterwards, making
3463     * this case basically the same, as far as lod handling in the further
3464     * sample/filter code is concerned, as the "1 lod for everything" case.
3465     * Different lod handling mostly shows up when building mipmap sizes
3466     * (lp_build_mipmap_level_sizes() and friends) and also in filtering
3467     * (getting the fractional part of the lod to the right texels).
3468     */
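   /*
    * As a concrete illustration (assuming an 8-wide vector): num_lods ends up
    * as 1 (one lod for everything), 2 (one lod per 2x2 quad) or 8 (one lod per
    * element), depending on which of the cases below is selected.
    */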
3469 
3470    /*
3471     * There are other situations where at least the multiple int lods could be
3472     * avoided like min and max lod being equal.
3473     */
3474    bld.num_mips = bld.num_lods = 1;
3475 
3476    if (bld.no_quad_lod && bld.no_rho_approx &&
3477        ((mip_filter != PIPE_TEX_MIPFILTER_NONE && op_is_tex &&
3478          (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3479           static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)) ||
3480         op_is_lodq)) {
3481       /*
3482        * special case for using per-pixel lod even for implicit lod,
3483        * which is generally never required (ok by APIs) except to please
3484        * some (somewhat broken imho) tests (because per-pixel face selection
3485        * can cause derivatives to be different for pixels outside the primitive
3486        * due to the major axis division even if pre-project derivatives are
3487        * looking normal).
3488        * For lodq, we do it to simply avoid scalar pack / unpack (albeit for
3489        * cube maps we do indeed get per-pixel lod values).
3490        */
3491       bld.num_mips = type.length;
3492       bld.num_lods = type.length;
3493    }
3494    else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
3495        (explicit_lod || lod_bias || derivs)) {
3496       if ((!op_is_tex && target != PIPE_BUFFER) ||
3497           (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3498          bld.num_mips = type.length;
3499          bld.num_lods = type.length;
3500       }
3501       else if (op_is_tex && min_img_filter != mag_img_filter) {
3502          bld.num_mips = 1;
3503          bld.num_lods = type.length;
3504       }
3505    }
3506    /* TODO: for true scalar_lod should only use 1 lod value */
3507    else if ((!op_is_tex && explicit_lod && target != PIPE_BUFFER) ||
3508             (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3509       bld.num_mips = num_quads;
3510       bld.num_lods = num_quads;
3511    }
3512    else if (op_is_tex && min_img_filter != mag_img_filter) {
3513       bld.num_mips = 1;
3514       bld.num_lods = num_quads;
3515    }
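   /*
    * Rough illustration (values for illustration only, not from any
    * particular caller): with an 8-wide type (type.length == 8,
    * num_quads == 2), per-element lod gives num_lods == 8, per-quad lod
    * gives num_lods == 2, and the single-lod case gives num_lods == 1.
    */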
3516 
3517    bld.fetch_ms = fetch_ms;
3518    if (op_is_gather)
3519       bld.gather_comp = (sample_key & LP_SAMPLER_GATHER_COMP_MASK) >> LP_SAMPLER_GATHER_COMP_SHIFT;
3520    bld.lodf_type = type;
3521    /* we want native vector size to be able to use our intrinsics */
3522    if (bld.num_lods != type.length) {
3523       /* TODO: this currently always has to be per-quad or per-element */
3524       bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
3525    }
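   /*
    * E.g. (illustrative): for type.length == 8 with per-quad lods this yields
    * lodf_type.length == 4 (one native 4-wide vector), while for a 4-wide
    * type it collapses to a scalar.
    */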
3526    bld.lodi_type = lp_int_type(bld.lodf_type);
3527    bld.levelf_type = bld.lodf_type;
3528    if (bld.num_mips == 1) {
3529       bld.levelf_type.length = 1;
3530    }
3531    bld.leveli_type = lp_int_type(bld.levelf_type);
3532    bld.float_size_type = bld.float_size_in_type;
3533 
3534    /* Note: size vectors may not be native. They contain minified w/h/d/_
3535     * values, with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/... so up to
3536     * 8x4f32.
3537     */
3538    if (bld.num_mips > 1) {
3539       bld.float_size_type.length = bld.num_mips == type.length ?
3540                                       bld.num_mips * bld.float_size_in_type.length :
3541                                       type.length;
3542    }
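   /*
    * For example (illustrative only): a 3D texture sampled with an 8-wide
    * type and per-quad mips (num_mips == 2, float_size_in_type.length == 4)
    * ends up with float_size_type.length == 8, i.e. w0/h0/d0/_/w1/h1/d1/_.
    */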
3543    bld.int_size_type = lp_int_type(bld.float_size_type);
3544 
3545    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
3546    lp_build_context_init(&bld.float_vec_bld, gallivm, type);
3547    lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
3548    lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
3549    lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
3550    lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
3551    lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
3552    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
3553    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
3554    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
3555    lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
3556    lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
3557    lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
3558    lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
3559 
3560    /* Get the dynamic state */
3561    LLVMValueRef tex_width = dynamic_state->width(dynamic_state, gallivm,
3562                                                  context_ptr, texture_index,
3563                                                  NULL);
3564    bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm,
3565                                                     context_ptr, texture_index, NULL);
3566    bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm,
3567                                                     context_ptr, texture_index, NULL);
3568    bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
3569                                           context_ptr, texture_index, NULL);
3570    bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm,
3571                                                 context_ptr, texture_index, NULL);
3572 
3573    if (fetch_ms) {
3574       bld.sample_stride =
3575          lp_build_broadcast_scalar(&bld.int_coord_bld,
3576                                    dynamic_state->sample_stride(dynamic_state,
3577                                                                 gallivm,
3578                                                                 context_ptr,
3579                                                                 texture_index,
3580                                                                 NULL));
3581    }
3582 
3583    /* Note that mip_offsets is an array[level] of offsets to texture images */
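   /*
    * Roughly speaking (a sketch, not the exact code below), a texel address
    * is formed as:
    *   base_ptr + mip_offsets[level] + z * img_stride[level]
    *            + y * row_stride[level] + x * texel/block size in bytes
    * with the cube face / array layer folded into z via img_stride.
    */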
3584 
3585    if (dynamic_state->cache_ptr && thread_data_ptr) {
3586       bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
3587                                            thread_data_ptr, texture_index);
3588    }
3589 
3590    /* width, height, depth as single int vector */
3591    if (dims <= 1) {
3592       bld.int_size = tex_width;
3593    }
3594    else {
3595       bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
3596                                             tex_width,
3597                                             LLVMConstInt(i32t, 0, 0), "");
3598       if (dims >= 2) {
3599          LLVMValueRef tex_height =
3600             dynamic_state->height(dynamic_state, gallivm,
3601                                   context_ptr, texture_index, NULL);
3602          bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3603                                                tex_height,
3604                                                LLVMConstInt(i32t, 1, 0), "");
3605          if (dims >= 3) {
3606             LLVMValueRef tex_depth =
3607                dynamic_state->depth(dynamic_state, gallivm, context_ptr,
3608                                     texture_index, NULL);
3609             bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
3610                                                   tex_depth,
3611                                                   LLVMConstInt(i32t, 2, 0), "");
3612          }
3613       }
3614    }
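   /*
    * E.g. for a 2D texture this gives <width, height, undef, undef>, for a
    * 3D texture <width, height, depth, undef>.
    */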
3615 
3616    LLVMValueRef newcoords[5];
3617    for (unsigned i = 0; i < 5; i++) {
3618       newcoords[i] = coords[i];
3619    }
3620 
3621    if (util_format_is_pure_integer(static_texture_state->format) &&
3622        !util_format_has_depth(bld.format_desc) && op_is_tex &&
3623        (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
3624         static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3625         static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3626       /*
3627        * Bail if impossible filtering is specified (the awkward additional
3628        * depth check is because it is legal in gallium to have things like
3629        * S8Z24 here, which claim to be pure int even though such formats
3630        * should sample the depth component).
3631        * In GL such filters make the texture incomplete; this makes it robust
3632        * against gallium frontends which set this up regardless (we'd crash in
3633        * the lerp later otherwise).
3634        * At least in some APIs it may be legal to use such filters with lod
3635        * queries and/or gather (at least for gather, d3d10 says only the wrap
3636        * bits are really used, hence the filter bits are likely simply ignored).
3637        * For fetch, we don't get valid samplers either way here.
3638        */
3639       LLVMValueRef zero = lp_build_zero(gallivm, type);
3640       for (unsigned chan = 0; chan < 4; chan++) {
3641          texel_out[chan] = zero;
3642       }
3643       return;
3644    }
3645 
3646    if (0) {
3647       /* For debug: no-op texture sampling */
3648       lp_build_sample_nop(gallivm,
3649                           bld.texel_type,
3650                           newcoords,
3651                           texel_out);
3652    } else if (op_type == LP_SAMPLER_OP_FETCH) {
3653       lp_build_fetch_texel(&bld, texture_index, ms_index, newcoords,
3654                            lod, offsets, texel_out);
3655    } else {
3656       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
3657       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL, lod = NULL;
3658       boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
3659                 op_is_tex &&
3660                 /* not sure this is strictly needed or simply impossible */
3661                 derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
3662                 derived_sampler_state.aniso == 0 &&
3663                 lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
3664 
3665       use_aos &= bld.num_lods <= num_quads ||
3666                  derived_sampler_state.min_img_filter ==
3667                     derived_sampler_state.mag_img_filter;
3668 
3669       if (gallivm_perf & GALLIVM_PERF_NO_AOS_SAMPLING) {
3670          use_aos = 0;
3671       }
3672 
3673       if (dims > 1) {
3674          use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
3675          if (dims > 2) {
3676             use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
3677          }
3678       }
3679       if ((static_texture_state->target == PIPE_TEXTURE_CUBE ||
3680            static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3681           derived_sampler_state.seamless_cube_map &&
3682           (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
3683            derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
3684          /* theoretically possible with AoS filtering but not implemented (complex!) */
3685          use_aos = 0;
3686       }
3687 
3688       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
3689           !use_aos && util_format_fits_8unorm(bld.format_desc)) {
3690          debug_printf("%s: using floating point linear filtering for %s\n",
3691                       __FUNCTION__, bld.format_desc->short_name);
3692          debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
3693                       "  wraps %d  wrapt %d  wrapr %d\n",
3694                       derived_sampler_state.min_img_filter,
3695                       derived_sampler_state.mag_img_filter,
3696                       derived_sampler_state.min_mip_filter,
3697                       static_texture_state->target,
3698                       derived_sampler_state.seamless_cube_map,
3699                       derived_sampler_state.wrap_s,
3700                       derived_sampler_state.wrap_t,
3701                       derived_sampler_state.wrap_r);
3702       }
3703 
3704       lp_build_sample_common(&bld, op_is_lodq, texture_index, sampler_index,
3705                              newcoords, derivs, lod_bias, explicit_lod,
3706                              &lod_positive, &lod, &lod_fpart,
3707                              &ilevel0, &ilevel1);
3708 
3709       if (op_is_lodq) {
3710          texel_out[0] = lod_fpart;
3711          texel_out[1] = lod;
3712          texel_out[2] = texel_out[3] = bld.coord_bld.zero;
3713          return;
3714       }
3715 
3716       if (use_aos && static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) {
3717          /* The aos path doesn't do seamless filtering so simply add cube layer
3718           * to face now.
3719           */
3720          newcoords[2] = lp_build_add(&bld.int_coord_bld, newcoords[2], newcoords[3]);
3721       }
3722 
3723       /*
3724        * We only try 8-wide sampling with soa, or if we have AVX2,
3725        * as it appears to be a loss with just AVX.
3726        */
3727       if (num_quads == 1 || !use_aos ||
3728           (util_get_cpu_caps()->has_avx2 &&
3729            (bld.num_lods == 1 ||
3730             derived_sampler_state.min_img_filter == derived_sampler_state.mag_img_filter))) {
3731          if (use_aos) {
3732             /* do sampling/filtering with fixed pt arithmetic */
3733             lp_build_sample_aos(&bld, sampler_index,
3734                                 newcoords[0], newcoords[1],
3735                                 newcoords[2],
3736                                 offsets, lod_positive, lod_fpart,
3737                                 ilevel0, ilevel1,
3738                                 texel_out);
3739          } else {
3740             lp_build_sample_general(&bld, sampler_index,
3741                                     op_type == LP_SAMPLER_OP_GATHER,
3742                                     newcoords, offsets,
3743                                     lod_positive, lod_fpart,
3744                                     ilevel0, ilevel1,
3745                                     texel_out);
3746          }
3747       }
3748       else {
3749          struct lp_build_sample_context bld4;
3750          struct lp_type type4 = type;
3751          LLVMValueRef texelout4[4];
3752          LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
3753 
3754          type4.length = 4;
3755 
3756          /* Setup our build context */
3757          memset(&bld4, 0, sizeof bld4);
3758          bld4.no_quad_lod = bld.no_quad_lod;
3759          bld4.no_rho_approx = bld.no_rho_approx;
3760          bld4.no_brilinear = bld.no_brilinear;
3761          bld4.gallivm = bld.gallivm;
3762          bld4.context_ptr = bld.context_ptr;
3763          bld4.aniso_filter_table = aniso_filter_table;
3764          bld4.static_texture_state = bld.static_texture_state;
3765          bld4.static_sampler_state = bld.static_sampler_state;
3766          bld4.dynamic_state = bld.dynamic_state;
3767          bld4.format_desc = bld.format_desc;
3768          bld4.dims = bld.dims;
3769          bld4.row_stride_array = bld.row_stride_array;
3770          bld4.img_stride_array = bld.img_stride_array;
3771          bld4.base_ptr = bld.base_ptr;
3772          bld4.mip_offsets = bld.mip_offsets;
3773          bld4.int_size = bld.int_size;
3774          bld4.cache = bld.cache;
3775 
3776          bld4.vector_width = lp_type_width(type4);
3777 
3778          bld4.float_type = lp_type_float(32);
3779          bld4.int_type = lp_type_int(32);
3780          bld4.coord_type = type4;
3781          bld4.int_coord_type = lp_int_type(type4);
3782          bld4.float_size_in_type = lp_type_float(32);
3783          bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
3784          bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
3785          bld4.texel_type = bld.texel_type;
3786          bld4.texel_type.length = 4;
3787 
3788          bld4.num_mips = bld4.num_lods = 1;
3789          if (bld4.no_quad_lod && bld4.no_rho_approx &&
3790              (static_texture_state->target == PIPE_TEXTURE_CUBE ||
3791               static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY) &&
3792              (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3793             bld4.num_mips = type4.length;
3794             bld4.num_lods = type4.length;
3795          }
3796          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
3797              (explicit_lod || lod_bias || derivs)) {
3798             if ((!op_is_tex && target != PIPE_BUFFER) ||
3799                 (op_is_tex && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
3800                bld4.num_mips = type4.length;
3801                bld4.num_lods = type4.length;
3802             }
3803             else if (op_is_tex && min_img_filter != mag_img_filter) {
3804                bld4.num_mips = 1;
3805                bld4.num_lods = type4.length;
3806             }
3807          }
3808 
3809          /* we want native vector size to be able to use our intrinsics */
3810          bld4.lodf_type = type4;
3811          if (bld4.num_lods != type4.length) {
3812             bld4.lodf_type.length = 1;
3813          }
3814          bld4.lodi_type = lp_int_type(bld4.lodf_type);
3815          bld4.levelf_type = type4;
3816          if (bld4.num_mips != type4.length) {
3817             bld4.levelf_type.length = 1;
3818          }
3819          bld4.leveli_type = lp_int_type(bld4.levelf_type);
3820          bld4.float_size_type = bld4.float_size_in_type;
3821          if (bld4.num_mips > 1) {
3822             bld4.float_size_type.length = bld4.num_mips == type4.length ?
3823                                             bld4.num_mips * bld4.float_size_in_type.length :
3824                                             type4.length;
3825          }
3826          bld4.int_size_type = lp_int_type(bld4.float_size_type);
3827 
3828          lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
3829          lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
3830          lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
3831          lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
3832          lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
3833          lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
3834          lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
3835          lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
3836          lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
3837          lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
3838          lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
3839          lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
3840          lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
3841          lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
3842 
3843          for (unsigned i = 0; i < num_quads; i++) {
3844             LLVMValueRef s4, t4, r4;
3845             LLVMValueRef lod_positive4, lod_fpart4 = NULL;
3846             LLVMValueRef ilevel04, ilevel14 = NULL;
3847             LLVMValueRef offsets4[4] = { NULL };
3848             unsigned num_lods = bld4.num_lods;
3849 
3850             s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
3851             t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
3852             r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
3853 
3854             if (offsets[0]) {
3855                offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
3856                if (dims > 1) {
3857                   offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
3858                   if (dims > 2) {
3859                      offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
3860                   }
3861                }
3862             }
3863             lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
3864             ilevel04 = bld.num_mips == 1 ? ilevel0 :
3865                           lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
3866             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
3867                ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
3868                lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
3869             }
3870 
3871             if (use_aos) {
3872                /* do sampling/filtering with fixed pt arithmetic */
3873                lp_build_sample_aos(&bld4, sampler_index,
3874                                    s4, t4, r4, offsets4,
3875                                    lod_positive4, lod_fpart4,
3876                                    ilevel04, ilevel14,
3877                                    texelout4);
3878             }
3879 
3880             else {
3881                /* this path is currently unreachable and hence might break easily... */
3882                LLVMValueRef newcoords4[5];
3883                newcoords4[0] = s4;
3884                newcoords4[1] = t4;
3885                newcoords4[2] = r4;
3886                newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
3887                newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
3888 
3889                lp_build_sample_general(&bld4, sampler_index,
3890                                        op_type == LP_SAMPLER_OP_GATHER,
3891                                        newcoords4, offsets4,
3892                                        lod_positive4, lod_fpart4,
3893                                        ilevel04, ilevel14,
3894                                        texelout4);
3895             }
3896             for (unsigned j = 0; j < 4; j++) {
3897                texelouttmp[j][i] = texelout4[j];
3898             }
3899          }
3900 
3901          for (unsigned j = 0; j < 4; j++) {
3902             texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
3903          }
3904       }
3905    }
3906 
3907    if (target != PIPE_BUFFER && op_type != LP_SAMPLER_OP_GATHER) {
3908       apply_sampler_swizzle(&bld, texel_out);
3909    }
3910 
3911    /*
3912     * texel type can be a (32bit) int/uint (for pure int formats only),
3913     * however we are expected to always return floats (storage is untyped).
3914     */
3915    if (!bld.texel_type.floating) {
3916       unsigned chan;
3917       for (chan = 0; chan < 4; chan++) {
3918          texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
3919                                             lp_build_vec_type(gallivm, type), "");
3920       }
3921    }
3922 }
3923 
3924 
3925 #define USE_TEX_FUNC_CALL 1
3926 
3927 #define LP_MAX_TEX_FUNC_ARGS 32
3928 
3929 static inline void
3930 get_target_info(enum pipe_texture_target target,
3931                 unsigned *num_coords, unsigned *num_derivs,
3932                 unsigned *num_offsets, unsigned *layer)
3933 {
3934    unsigned dims = texture_dims(target);
3935    *num_coords = dims;
3936    *num_offsets = dims;
3937    *num_derivs = (target == PIPE_TEXTURE_CUBE ||
3938                   target == PIPE_TEXTURE_CUBE_ARRAY) ? 3 : dims;
3939    *layer = has_layer_coord(target) ? 2: 0;
3940    if (target == PIPE_TEXTURE_CUBE_ARRAY) {
3941       /*
3942        * dims doesn't include the r coord for cubes - this is handled
3943        * by layer instead, but we need to fix things up for cube arrays...
3944        */
3945       *layer = 3;
3946       *num_coords = 3;
3947    }
3948 }
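
/*
 * For example, PIPE_TEXTURE_2D_ARRAY yields num_coords = 2, num_offsets = 2,
 * num_derivs = 2 and layer = 2, while PIPE_TEXTURE_CUBE_ARRAY yields
 * num_coords = 3, num_offsets = 2, num_derivs = 3 and layer = 3.
 */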
3949 
3950 
3951 /**
3952  * Generate the function body for a texture sampling function.
3953  */
3954 static void
3955 lp_build_sample_gen_func(struct gallivm_state *gallivm,
3956                          const struct lp_static_texture_state *static_texture_state,
3957                          const struct lp_static_sampler_state *static_sampler_state,
3958                          struct lp_sampler_dynamic_state *dynamic_state,
3959                          struct lp_type type,
3960                          unsigned texture_index,
3961                          unsigned sampler_index,
3962                          LLVMValueRef function,
3963                          unsigned num_args,
3964                          unsigned sample_key,
3965                          bool has_aniso_filter_table)
3966 {
3967    LLVMBuilderRef old_builder;
3968    LLVMBasicBlockRef block;
3969    LLVMValueRef coords[5];
3970    LLVMValueRef offsets[3] = { NULL };
3971    LLVMValueRef lod = NULL;
3972    LLVMValueRef ms_index = NULL;
3973    LLVMValueRef context_ptr;
3974    LLVMValueRef thread_data_ptr = NULL;
3975    LLVMValueRef aniso_filter_table = NULL;
3976    LLVMValueRef texel_out[4];
3977    struct lp_derivatives derivs;
3978    struct lp_derivatives *deriv_ptr = NULL;
3979    unsigned num_param = 0;
3980    unsigned num_coords, num_derivs, num_offsets, layer;
3981    enum lp_sampler_lod_control lod_control;
3982    enum lp_sampler_op_type op_type;
3983    boolean need_cache = FALSE;
3984 
3985    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
3986                     LP_SAMPLER_LOD_CONTROL_SHIFT;
3987 
3988    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
3989                     LP_SAMPLER_OP_TYPE_SHIFT;
3990 
3991    get_target_info(static_texture_state->target,
3992                    &num_coords, &num_derivs, &num_offsets, &layer);
3993 
3994    /* lod query doesn't take a layer */
3995    if (layer && op_type == LP_SAMPLER_OP_LODQ)
3996       layer = 0;
3997 
3998    if (dynamic_state->cache_ptr) {
3999       const struct util_format_description *format_desc;
4000       format_desc = util_format_description(static_texture_state->format);
4001       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
4002          need_cache = TRUE;
4003       }
4004    }
4005 
4006    /* "unpack" arguments */
4007    context_ptr = LLVMGetParam(function, num_param++);
4008    if (has_aniso_filter_table)
4009       aniso_filter_table = LLVMGetParam(function, num_param++);
4010    if (need_cache) {
4011       thread_data_ptr = LLVMGetParam(function, num_param++);
4012    }
4013    for (unsigned i = 0; i < num_coords; i++) {
4014       coords[i] = LLVMGetParam(function, num_param++);
4015    }
4016    for (unsigned i = num_coords; i < 5; i++) {
4017       /* This is rather unfortunate... */
4018       coords[i] = lp_build_undef(gallivm, type);
4019    }
4020    if (layer) {
4021       coords[layer] = LLVMGetParam(function, num_param++);
4022    }
4023    if (sample_key & LP_SAMPLER_SHADOW) {
4024       coords[4] = LLVMGetParam(function, num_param++);
4025    }
4026    if (sample_key & LP_SAMPLER_FETCH_MS) {
4027       ms_index = LLVMGetParam(function, num_param++);
4028    }
4029    if (sample_key & LP_SAMPLER_OFFSETS) {
4030       for (unsigned i = 0; i < num_offsets; i++) {
4031          offsets[i] = LLVMGetParam(function, num_param++);
4032       }
4033    }
4034    if (lod_control == LP_SAMPLER_LOD_BIAS ||
4035        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4036       lod = LLVMGetParam(function, num_param++);
4037    }
4038    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4039       for (unsigned i = 0; i < num_derivs; i++) {
4040          derivs.ddx[i] = LLVMGetParam(function, num_param++);
4041          derivs.ddy[i] = LLVMGetParam(function, num_param++);
4042       }
4043       deriv_ptr = &derivs;
4044    }
4045 
4046    assert(num_args == num_param);
4047 
4048    /*
4049     * Function body
4050     */
4051 
4052    old_builder = gallivm->builder;
4053    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
4054    gallivm->builder = LLVMCreateBuilderInContext(gallivm->context);
4055    LLVMPositionBuilderAtEnd(gallivm->builder, block);
4056 
4057    lp_build_sample_soa_code(gallivm,
4058                             static_texture_state,
4059                             static_sampler_state,
4060                             dynamic_state,
4061                             type,
4062                             sample_key,
4063                             texture_index,
4064                             sampler_index,
4065                             context_ptr,
4066                             thread_data_ptr,
4067                             coords,
4068                             offsets,
4069                             deriv_ptr,
4070                             lod,
4071                             ms_index,
4072                             aniso_filter_table,
4073                             texel_out);
4074 
4075    LLVMBuildAggregateRet(gallivm->builder, texel_out, 4);
4076 
4077    LLVMDisposeBuilder(gallivm->builder);
4078    gallivm->builder = old_builder;
4079 
4080    gallivm_verify_function(gallivm, function);
4081 }
4082 
4083 
4084 /**
4085  * Call the matching function for texture sampling.
4086  * If there's no match, generate a new one.
4087  */
4088 static void
4089 lp_build_sample_soa_func(struct gallivm_state *gallivm,
4090                          const struct lp_static_texture_state *static_texture_state,
4091                          const struct lp_static_sampler_state *static_sampler_state,
4092                          struct lp_sampler_dynamic_state *dynamic_state,
4093                          const struct lp_sampler_params *params,
4094                          int texture_index, int sampler_index,
4095                          LLVMValueRef *tex_ret)
4096 {
4097    LLVMBuilderRef builder = gallivm->builder;
4098    LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(
4099                              LLVMGetInsertBlock(builder)));
4100    LLVMValueRef function, inst;
4101    LLVMValueRef args[LP_MAX_TEX_FUNC_ARGS];
4102    LLVMBasicBlockRef bb;
4103    unsigned num_args = 0;
4104    char func_name[64];
4105    unsigned i, num_coords, num_derivs, num_offsets, layer;
4106    unsigned sample_key = params->sample_key;
4107    const LLVMValueRef *coords = params->coords;
4108    const LLVMValueRef *offsets = params->offsets;
4109    const struct lp_derivatives *derivs = params->derivs;
4110    enum lp_sampler_lod_control lod_control;
4111    enum lp_sampler_op_type op_type;
4112    boolean need_cache = FALSE;
4113 
4114    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
4115                     LP_SAMPLER_LOD_CONTROL_SHIFT;
4116 
4117    op_type = (sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
4118                     LP_SAMPLER_OP_TYPE_SHIFT;
4119 
4120    get_target_info(static_texture_state->target,
4121                    &num_coords, &num_derivs, &num_offsets, &layer);
4122 
4123    /* lod query doesn't take a layer */
4124    if (layer && op_type == LP_SAMPLER_OP_LODQ)
4125       layer = 0;
4126 
4127    if (dynamic_state->cache_ptr) {
4128       const struct util_format_description *format_desc;
4129       format_desc = util_format_description(static_texture_state->format);
4130       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
4131          need_cache = TRUE;
4132       }
4133    }
4134    /*
4135     * Texture function matches are found by name.
4136     * Thus the name has to include both the texture and sampler unit
4137     * (which covers all static state) plus the actual texture function
4138     * (including things like offsets, shadow coord, lod control).
4139     * Additionally, lod_property has to be included.
4140     */
4141 
4142    snprintf(func_name, sizeof(func_name), "texfunc_res_%d_sam_%d_%x",
4143             texture_index, sampler_index, sample_key);
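   /* E.g. texture unit 2 with sampler unit 1 and a sample_key of 0x40 would
    * produce "texfunc_res_2_sam_1_40" (key value purely illustrative).
    */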
4144 
4145    function = LLVMGetNamedFunction(module, func_name);
4146 
4147    LLVMTypeRef arg_types[LP_MAX_TEX_FUNC_ARGS];
4148    LLVMTypeRef ret_type;
4149    LLVMTypeRef val_type[4];
4150    unsigned num_param = 0;
4151 
4152    /*
4153     * Generate the function prototype.
4154     */
4155 
4156    arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
4157    if (params->aniso_filter_table)
4158       arg_types[num_param++] = LLVMTypeOf(params->aniso_filter_table);
4159    if (need_cache) {
4160       arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
4161    }
4162    for (i = 0; i < num_coords; i++) {
4163       arg_types[num_param++] = LLVMTypeOf(coords[0]);
4164       assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
4165    }
4166    if (layer) {
4167       arg_types[num_param++] = LLVMTypeOf(coords[layer]);
4168       assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[layer]));
4169    }
4170    if (sample_key & LP_SAMPLER_SHADOW) {
4171       arg_types[num_param++] = LLVMTypeOf(coords[0]);
4172    }
4173    if (sample_key & LP_SAMPLER_FETCH_MS) {
4174       arg_types[num_param++] = LLVMTypeOf(params->ms_index);
4175    }
4176    if (sample_key & LP_SAMPLER_OFFSETS) {
4177       for (i = 0; i < num_offsets; i++) {
4178          arg_types[num_param++] = LLVMTypeOf(offsets[0]);
4179          assert(LLVMTypeOf(offsets[0]) == LLVMTypeOf(offsets[i]));
4180       }
4181    }
4182    if (lod_control == LP_SAMPLER_LOD_BIAS ||
4183        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4184       arg_types[num_param++] = LLVMTypeOf(params->lod);
4185    }
4186    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4187       for (i = 0; i < num_derivs; i++) {
4188          arg_types[num_param++] = LLVMTypeOf(derivs->ddx[i]);
4189          arg_types[num_param++] = LLVMTypeOf(derivs->ddy[i]);
4190          assert(LLVMTypeOf(derivs->ddx[0]) == LLVMTypeOf(derivs->ddx[i]));
4191          assert(LLVMTypeOf(derivs->ddy[0]) == LLVMTypeOf(derivs->ddy[i]));
4192       }
4193    }
4194 
4195    val_type[0] = val_type[1] = val_type[2] = val_type[3] =
4196          lp_build_vec_type(gallivm, params->type);
4197    ret_type = LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
4198    LLVMTypeRef function_type = LLVMFunctionType(ret_type, arg_types, num_param, 0);
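   /*
    * The generated function returns the four texel channels as a struct of
    * vectors; callers (see lp_build_sample_soa()) unpack the result with
    * LLVMBuildExtractValue.
    */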
4199 
4200    if (!function) {
4201       function = LLVMAddFunction(module, func_name, function_type);
4202 
4203       for (i = 0; i < num_param; ++i) {
4204          if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) {
4205 
4206             lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
4207          }
4208       }
4209 
4210       LLVMSetFunctionCallConv(function, LLVMFastCallConv);
4211       LLVMSetLinkage(function, LLVMInternalLinkage);
4212 
4213       lp_build_sample_gen_func(gallivm,
4214                                static_texture_state,
4215                                static_sampler_state,
4216                                dynamic_state,
4217                                params->type,
4218                                texture_index,
4219                                sampler_index,
4220                                function,
4221                                num_param,
4222                                sample_key,
4223                                params->aniso_filter_table ? true : false);
4224    }
4225 
4226    num_args = 0;
4227    args[num_args++] = params->context_ptr;
4228    if (params->aniso_filter_table)
4229       args[num_args++] = params->aniso_filter_table;
4230    if (need_cache) {
4231       args[num_args++] = params->thread_data_ptr;
4232    }
4233    for (i = 0; i < num_coords; i++) {
4234       args[num_args++] = coords[i];
4235    }
4236    if (layer) {
4237       args[num_args++] = coords[layer];
4238    }
4239    if (sample_key & LP_SAMPLER_SHADOW) {
4240       args[num_args++] = coords[4];
4241    }
4242    if (sample_key & LP_SAMPLER_FETCH_MS) {
4243       args[num_args++] = params->ms_index;
4244    }
4245    if (sample_key & LP_SAMPLER_OFFSETS) {
4246       for (i = 0; i < num_offsets; i++) {
4247          args[num_args++] = offsets[i];
4248       }
4249    }
4250    if (lod_control == LP_SAMPLER_LOD_BIAS ||
4251        lod_control == LP_SAMPLER_LOD_EXPLICIT) {
4252       args[num_args++] = params->lod;
4253    }
4254    else if (lod_control == LP_SAMPLER_LOD_DERIVATIVES) {
4255       for (i = 0; i < num_derivs; i++) {
4256          args[num_args++] = derivs->ddx[i];
4257          args[num_args++] = derivs->ddy[i];
4258       }
4259    }
4260 
4261    assert(num_args <= LP_MAX_TEX_FUNC_ARGS);
4262 
4263    *tex_ret = LLVMBuildCall2(builder, function_type, function, args, num_args, "");
4264    bb = LLVMGetInsertBlock(builder);
4265    inst = LLVMGetLastInstruction(bb);
4266    LLVMSetInstructionCallConv(inst, LLVMFastCallConv);
4267 
4268 }
4269 
4270 
4271 /**
4272  * Build texture sampling code.
4273  * Either via a function call or inline it directly.
4274  */
4275 void
4276 lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
4277                     const struct lp_static_sampler_state *static_sampler_state,
4278                     struct lp_sampler_dynamic_state *dynamic_state,
4279                     struct gallivm_state *gallivm,
4280                     const struct lp_sampler_params *params)
4281 {
4282    boolean use_tex_func = FALSE;
4283 
4284    /*
4285     * Do not use a function call if the sampling is "simple enough".
4286     * We define this by
4287     * a) format
4288     * b) no mips (either one level only or no mip filter)
4289     * No mips will definitely make the code smaller, though
4290     * the format requirement is a bit iffy - there are some (SoA) formats
4291     * which definitely generate less code. It does, however, happen to catch
4292     * some important cases which are hurt quite a bit by using
4293     * a call (not really because of the call overhead but because
4294     * they reuse the same texture unit with some of the same
4295     * parameters).
4296     * Ideally we'd let llvm recognize this stuff by doing IPO passes.
4297     */
4298 
4299    if (USE_TEX_FUNC_CALL) {
4300       const struct util_format_description *format_desc =
4301          util_format_description(static_texture_state->format);
4302       const boolean simple_format =
4303          (util_format_is_rgba8_variant(format_desc) &&
4304          format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
4305       const enum lp_sampler_op_type op_type =
4306          (params->sample_key & LP_SAMPLER_OP_TYPE_MASK) >>
4307          LP_SAMPLER_OP_TYPE_SHIFT;
4308       const boolean simple_tex =
4309          op_type != LP_SAMPLER_OP_TEXTURE ||
4310            ((static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE ||
4311              static_texture_state->level_zero_only == TRUE) &&
4312             static_sampler_state->min_img_filter == static_sampler_state->mag_img_filter);
4313 
4314       use_tex_func = !(simple_format && simple_tex);
4315    }
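
   /*
    * For instance (illustrative): a plain RGBA8 texture sampled without mip
    * filtering and with matching min/mag filters is inlined, whereas a
    * mipmapped or non-rgba8 (e.g. compressed) texture goes through the
    * generated function call.
    */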
4316 
4317    if (use_tex_func) {
4318       LLVMValueRef tex_ret;
4319       lp_build_sample_soa_func(gallivm,
4320                                static_texture_state,
4321                                static_sampler_state,
4322                                dynamic_state,
4323                                params, params->texture_index,
4324                                params->sampler_index, &tex_ret);
4325 
4326       for (unsigned i = 0; i < 4; i++) {
4327          params->texel[i] =
4328             LLVMBuildExtractValue(gallivm->builder, tex_ret, i, "");
4329       }
4330    }
4331    else {
4332       lp_build_sample_soa_code(gallivm,
4333                                static_texture_state,
4334                                static_sampler_state,
4335                                dynamic_state,
4336                                params->type,
4337                                params->sample_key,
4338                                params->texture_index,
4339                                params->sampler_index,
4340                                params->context_ptr,
4341                                params->thread_data_ptr,
4342                                params->coords,
4343                                params->offsets,
4344                                params->derivs,
4345                                params->lod,
4346                                params->ms_index,
4347                                params->aniso_filter_table,
4348                                params->texel);
4349    }
4350 }
4351 
4352 
4353 void
4354 lp_build_size_query_soa(struct gallivm_state *gallivm,
4355                         const struct lp_static_texture_state *static_state,
4356                         struct lp_sampler_dynamic_state *dynamic_state,
4357                         const struct lp_sampler_size_query_params *params)
4358 {
4359    LLVMValueRef lod, level = 0, size;
4360    LLVMValueRef first_level = NULL;
4361    unsigned num_lods = 1;
4362    struct lp_build_context bld_int_vec4;
4363    LLVMValueRef context_ptr = params->context_ptr;
4364    unsigned texture_unit = params->texture_unit;
4365    unsigned target = params->target;
4366    LLVMValueRef texture_unit_offset = params->texture_unit_offset;
4367 
4368    if (static_state->format == PIPE_FORMAT_NONE) {
4369       /*
4370        * If there's nothing bound, format is NONE, and we must return
4371        * all zero as mandated by d3d10 in this case.
4372        */
4373       LLVMValueRef zero = lp_build_const_vec(gallivm, params->int_type, 0.0F);
4374       for (unsigned chan = 0; chan < 4; chan++) {
4375          params->sizes_out[chan] = zero;
4376       }
4377       return;
4378    }
4379 
4380    /*
4381     * Do some sanity verification about bound texture and shader dcl target.
4382     * Not entirely sure what's possible but assume array/non-array
4383     * always compatible (probably not ok for OpenGL but d3d10 has no
4384     * distinction of arrays at the resource level).
4385     * Everything else looks bogus (though not entirely sure about rect/2d).
4386     * Currently disabled because it causes assertion failures if there's
4387     * nothing bound (or rather a dummy texture, not that this case would
4388     * return the right values).
4389     */
4390    if (0 && static_state->target != target) {
4391       if (static_state->target == PIPE_TEXTURE_1D)
4392          assert(target == PIPE_TEXTURE_1D_ARRAY);
4393       else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
4394          assert(target == PIPE_TEXTURE_1D);
4395       else if (static_state->target == PIPE_TEXTURE_2D)
4396          assert(target == PIPE_TEXTURE_2D_ARRAY);
4397       else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
4398          assert(target == PIPE_TEXTURE_2D);
4399       else if (static_state->target == PIPE_TEXTURE_CUBE)
4400          assert(target == PIPE_TEXTURE_CUBE_ARRAY);
4401       else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
4402          assert(target == PIPE_TEXTURE_CUBE);
4403       else
4404          assert(0);
4405    }
4406 
4407    const unsigned dims = texture_dims(target);
4408 
4409    const boolean has_array = has_layer_coord(target);
4410 
4411    assert(!params->int_type.floating);
4412 
4413    lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
4414 
4415    if (params->samples_only) {
4416       params->sizes_out[0] =
4417          lp_build_broadcast(gallivm,
4418                             lp_build_vec_type(gallivm, params->int_type),
4419                             dynamic_state->num_samples(dynamic_state, gallivm,
4420                                                        context_ptr,
4421                                                        texture_unit,
4422                                                        texture_unit_offset));
4423       return;
4424    }
4425 
4426    if (params->explicit_lod) {
4427       /* FIXME: this needs to honor per-element lod */
4428       lod = LLVMBuildExtractElement(gallivm->builder, params->explicit_lod,
4429                                     lp_build_const_int32(gallivm, 0), "");
4430       first_level = dynamic_state->first_level(dynamic_state, gallivm,
4431                                                context_ptr, texture_unit,
4432                                                texture_unit_offset);
4433       level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
4434       lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
4435    } else {
4436       lod = bld_int_vec4.zero;
4437    }
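   /*
    * Note the query lod is relative to first_level, hence the absolute level
    * used for minification below is lod + first_level (or the base level if
    * no explicit lod is given).
    */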
4438 
4439    size = bld_int_vec4.undef;
4440 
4441    size = LLVMBuildInsertElement(gallivm->builder, size,
4442                                  dynamic_state->width(dynamic_state, gallivm,
4443                                                       context_ptr,
4444                                                       texture_unit,
4445                                                       texture_unit_offset),
4446                                  lp_build_const_int32(gallivm, 0), "");
4447 
4448    if (dims >= 2) {
4449       size = LLVMBuildInsertElement(gallivm->builder, size,
4450                                     dynamic_state->height(dynamic_state,
4451                                                           gallivm, context_ptr,
4452                                                           texture_unit,
4453                                                           texture_unit_offset),
4454                                     lp_build_const_int32(gallivm, 1), "");
4455    }
4456 
4457    if (dims >= 3) {
4458       size = LLVMBuildInsertElement(gallivm->builder, size,
4459                                     dynamic_state->depth(dynamic_state, gallivm,
4460                                                          context_ptr,
4461                                                          texture_unit,
4462                                                          texture_unit_offset),
4463                                     lp_build_const_int32(gallivm, 2), "");
4464    }
4465 
4466    size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
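   /* lp_build_minify computes roughly max(size >> lod, 1) per component;
    * e.g. a 256x128 texture at lod 2 gives 64x32 (illustrative).
    */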
4467 
4468    if (has_array) {
4469       LLVMValueRef layers = dynamic_state->depth(dynamic_state, gallivm,
4470                                                  context_ptr, texture_unit,
4471                                                  texture_unit_offset);
4472       if (target == PIPE_TEXTURE_CUBE_ARRAY) {
4473          /*
4474           * It looks like GL wants number of cubes, d3d10.1 has it undefined?
4475           * Could avoid this by passing in number of cubes instead of total
4476           * number of layers (might make things easier elsewhere too).
4477           */
4478          LLVMValueRef six = lp_build_const_int32(gallivm, 6);
4479          layers = LLVMBuildSDiv(gallivm->builder, layers, six, "");
4480       }
4481       size = LLVMBuildInsertElement(gallivm->builder, size, layers,
4482                                     lp_build_const_int32(gallivm, dims), "");
4483    }
4484 
4485    /*
4486     * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
4487     * if level is out of bounds (note this can't cover unbound texture
4488     * here, which also requires returning zero).
4489     */
4490    if (params->explicit_lod && params->is_sviewinfo) {
4491       LLVMValueRef last_level, out, out1;
4492       struct lp_build_context leveli_bld;
4493 
4494       /* everything is scalar for now */
4495       lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
4496       last_level = dynamic_state->last_level(dynamic_state, gallivm,
4497                                              context_ptr, texture_unit,
4498                                              texture_unit_offset);
4499 
4500       out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
4501       out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
4502       out = lp_build_or(&leveli_bld, out, out1);
4503       if (num_lods == 1) {
4504          out = lp_build_broadcast_scalar(&bld_int_vec4, out);
4505       }
4506       else {
4507          /* TODO */
4508          assert(0);
4509       }
4510       size = lp_build_andnot(&bld_int_vec4, size, out);
4511    }
4512 
4513    unsigned i;
4514    for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
4515       params->sizes_out[i] =
4516          lp_build_extract_broadcast(gallivm, bld_int_vec4.type,
4517                                     params->int_type,
4518                                     size,
4519                                     lp_build_const_int32(gallivm, i));
4520    }
4521    if (params->is_sviewinfo) {
4522       for (; i < 4; i++) {
4523          params->sizes_out[i] = lp_build_const_vec(gallivm,
4524                                                    params->int_type, 0.0);
4525       }
4526    }
4527 
4528    /*
4529     * If there's no explicit_lod (buffers, rects), queries requiring the
4530     * number of mips would be illegal.
4531     */
4532    if (params->is_sviewinfo && params->explicit_lod) {
4533       struct lp_build_context bld_int_scalar;
4534       lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
4535 
4536       LLVMValueRef num_levels;
4537       if (static_state->level_zero_only) {
4538          num_levels = bld_int_scalar.one;
4539       }
4540       else {
4541          LLVMValueRef last_level;
4542 
4543          last_level = dynamic_state->last_level(dynamic_state, gallivm,
4544                                                 context_ptr, texture_unit,
4545                                                 texture_unit_offset);
4546          num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
4547          num_levels = lp_build_add(&bld_int_scalar, num_levels,
4548                                    bld_int_scalar.one);
4549       }
4550       params->sizes_out[3] =
4551          lp_build_broadcast(gallivm,
4552                             lp_build_vec_type(gallivm, params->int_type),
4553                             num_levels);
4554    }
4555 }
4556 
4557 
4558 static void
4559 lp_build_do_atomic_soa(struct gallivm_state *gallivm,
4560                        const struct util_format_description *format_desc,
4561                        struct lp_type type,
4562                        LLVMValueRef exec_mask,
4563                        LLVMValueRef base_ptr,
4564                        LLVMValueRef offset,
4565                        LLVMValueRef out_of_bounds,
4566                        unsigned img_op,
4567                        LLVMAtomicRMWBinOp op,
4568                        const LLVMValueRef rgba_in[4],
4569                        const LLVMValueRef rgba2_in[4],
4570                        LLVMValueRef atomic_result[4])
4571 {
4572    const enum pipe_format format = format_desc->format;
4573 
4574    if (format != PIPE_FORMAT_R32_UINT &&
4575        format != PIPE_FORMAT_R32_SINT &&
4576        format != PIPE_FORMAT_R32_FLOAT) {
4577       atomic_result[0] = lp_build_zero(gallivm, type);
4578       return;
4579    }
4580 
4581    LLVMTypeRef atom_res_elem_type =
4582       LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), type.length);
4583    LLVMValueRef atom_res = lp_build_alloca(gallivm, atom_res_elem_type, "");
4584 
4585    offset = LLVMBuildGEP(gallivm->builder, base_ptr, &offset, 1, "");
4586 
4587    struct lp_build_loop_state loop_state;
4588    lp_build_loop_begin(&loop_state, gallivm, lp_build_const_int32(gallivm, 0));
4589    struct lp_build_if_state ifthen;
4590    LLVMValueRef cond;
4591    LLVMValueRef packed = rgba_in[0], packed2 = rgba2_in[0];
4592 
4593    LLVMValueRef should_store_mask =
4594       LLVMBuildAnd(gallivm->builder, exec_mask,
4595                    LLVMBuildNot(gallivm->builder, out_of_bounds, ""),
4596                    "store_mask");
4597    assert(exec_mask);
4598 
4599    cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE, should_store_mask,
4600                         lp_build_const_int_vec(gallivm, type, 0), "");
4601    cond = LLVMBuildExtractElement(gallivm->builder, cond,
4602                                   loop_state.counter, "");
4603    lp_build_if(&ifthen, gallivm, cond);
4604 
4605    LLVMValueRef data =
4606       LLVMBuildExtractElement(gallivm->builder, packed, loop_state.counter, "");
4607    LLVMValueRef cast_base_ptr =
4608       LLVMBuildExtractElement(gallivm->builder, offset, loop_state.counter, "");
4609    cast_base_ptr = LLVMBuildBitCast(gallivm->builder, cast_base_ptr,
4610               LLVMPointerType(LLVMInt32TypeInContext(gallivm->context), 0), "");
4611    data = LLVMBuildBitCast(gallivm->builder, data,
4612                            LLVMInt32TypeInContext(gallivm->context), "");
4613 
4614    if (img_op == LP_IMG_ATOMIC_CAS) {
4615       LLVMValueRef cas_src_ptr =
4616          LLVMBuildExtractElement(gallivm->builder, packed2,
4617                                  loop_state.counter, "");
4618       LLVMValueRef cas_src =
4619          LLVMBuildBitCast(gallivm->builder, cas_src_ptr,
4620                           LLVMInt32TypeInContext(gallivm->context), "");
4621       data = LLVMBuildAtomicCmpXchg(gallivm->builder, cast_base_ptr, data,
4622                                     cas_src,
4623                                     LLVMAtomicOrderingSequentiallyConsistent,
4624                                     LLVMAtomicOrderingSequentiallyConsistent,
4625                                     false);
4626       data = LLVMBuildExtractValue(gallivm->builder, data, 0, "");
4627    } else {
4628       data = LLVMBuildAtomicRMW(gallivm->builder, op,
4629                                 cast_base_ptr, data,
4630                                 LLVMAtomicOrderingSequentiallyConsistent,
4631                                 false);
4632    }
4633 
4634    LLVMValueRef temp_res =
4635       LLVMBuildLoad2(gallivm->builder, atom_res_elem_type, atom_res, "");
4636    temp_res = LLVMBuildInsertElement(gallivm->builder, temp_res, data,
4637                                      loop_state.counter, "");
4638    LLVMBuildStore(gallivm->builder, temp_res, atom_res);
4639 
4640    lp_build_endif(&ifthen);
4641    lp_build_loop_end_cond(&loop_state,
4642                           lp_build_const_int32(gallivm, type.length),
4643                           NULL, LLVMIntUGE);
4644    atomic_result[0] = LLVMBuildLoad2(gallivm->builder, atom_res_elem_type,
4645                                      atom_res, "");
4646 }
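
/*
 * Note: the loop above scalarizes the atomic, since LLVM's atomicrmw /
 * cmpxchg instructions operate on scalars; each active lane performs its own
 * 32-bit atomic on its computed address and the per-lane results are
 * re-assembled into a vector.
 */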
4647 
4648 
4649 static void
4650 lp_build_img_op_no_format(struct gallivm_state *gallivm,
4651                           const struct lp_img_params *params,
4652                           LLVMValueRef outdata[4])
4653 {
4654    /*
4655     * If there's nothing bound, format is NONE, and we must return
4656     * all zero as mandated by d3d10 in this case.
4657     */
4658    if (params->img_op != LP_IMG_STORE) {
4659       LLVMValueRef zero = lp_build_zero(gallivm, params->type);
4660       for (unsigned chan = 0; chan < (params->img_op == LP_IMG_LOAD ? 4 : 1);
4661            chan++) {
4662          outdata[chan] = zero;
4663       }
4664    }
4665 }
4666 
4667 
4668 void
4669 lp_build_img_op_soa(const struct lp_static_texture_state *static_texture_state,
4670                     struct lp_sampler_dynamic_state *dynamic_state,
4671                     struct gallivm_state *gallivm,
4672                     const struct lp_img_params *params,
4673                     LLVMValueRef outdata[4])
4674 {
4675    const enum pipe_texture_target target = params->target;
4676    const unsigned dims = texture_dims(target);
4677    /** unsigned integer coordinate type (same vector width as params->type) */
4678    struct lp_type int_coord_type;
4679    struct lp_build_context int_coord_bld;
4680    const struct util_format_description *format_desc =
4681       util_format_description(static_texture_state->format);
4682    LLVMValueRef x = params->coords[0], y = params->coords[1],
4683       z = params->coords[2];
4684    LLVMValueRef ms_index = params->ms_index;
4685    LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
4686 
4687    int_coord_type = lp_uint_type(params->type);
4688    lp_build_context_init(&int_coord_bld, gallivm, int_coord_type);
4689 
4690    if (static_texture_state->format == PIPE_FORMAT_NONE) {
4691       lp_build_img_op_no_format(gallivm, params, outdata);
4692       return;
4693 
4694    }
4695    LLVMValueRef offset, i, j;
4696 
4697    LLVMValueRef row_stride = dynamic_state->row_stride(dynamic_state, gallivm,
4698                                                        params->context_ptr,
4699                                                        params->image_index, NULL);
4700    LLVMValueRef img_stride = dynamic_state->img_stride(dynamic_state, gallivm,
4701                                                        params->context_ptr,
4702                                                        params->image_index, NULL);
4703    LLVMValueRef base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm,
4704                                                    params->context_ptr,
4705                                                    params->image_index, NULL);
4706    LLVMValueRef width = dynamic_state->width(dynamic_state, gallivm,
4707                                              params->context_ptr,
4708                                              params->image_index, NULL);
4709    LLVMValueRef height = dynamic_state->height(dynamic_state, gallivm,
4710                                                params->context_ptr,
4711                                                params->image_index, NULL);
4712    LLVMValueRef depth = dynamic_state->depth(dynamic_state, gallivm,
4713                                              params->context_ptr,
4714                                              params->image_index, NULL);
4715    LLVMValueRef num_samples = NULL, sample_stride = NULL;
4716    if (ms_index) {
4717       num_samples = dynamic_state->num_samples(dynamic_state, gallivm,
4718                                                params->context_ptr,
4719                                                params->image_index, NULL);
4720       sample_stride = dynamic_state->sample_stride(dynamic_state, gallivm,
4721                                                    params->context_ptr,
4722                                                    params->image_index, NULL);
4723    }
4724 
4725    boolean layer_coord = has_layer_coord(target);
4726 
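   /* Broadcast the scalar sizes and strides to the coordinate vector width. */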
4727    width = lp_build_broadcast_scalar(&int_coord_bld, width);
4728    if (dims >= 2) {
4729       height = lp_build_broadcast_scalar(&int_coord_bld, height);
4730       row_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, row_stride);
4731    }
4732    if (dims >= 3 || layer_coord) {
4733       depth = lp_build_broadcast_scalar(&int_coord_bld, depth);
4734       img_stride_vec = lp_build_broadcast_scalar(&int_coord_bld, img_stride);
4735    }
4736 
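   /* Build a per-lane out-of-bounds mask: lanes whose coordinates are >=
    * the image size must not fault; loads return zero (or one in alpha for
    * formats without alpha) and stores/atomics are dropped for them.
    */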
4737    LLVMValueRef out_of_bounds = int_coord_bld.zero;
4738    LLVMValueRef out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
4739    out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4740 
4741    if (dims >= 2) {
4742       out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
4743       out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4744    }
4745    if (dims >= 3 || layer_coord) {
4746       out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
4747       out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4748    }
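   /* Convert x/y/z and the strides into a per-lane byte offset; i/j are the
    * within-block coordinates used by block-compressed formats.
    */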
4749    lp_build_sample_offset(&int_coord_bld,
4750                           format_desc,
4751                           x, y, z, row_stride_vec, img_stride_vec,
4752                           &offset, &i, &j);
4753 
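   /* For multisampled images, range-check the sample index and advance the
    * offset by sample_stride * ms_index.
    */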
4754    if (ms_index) {
4755       out1 = lp_build_cmp(&int_coord_bld, PIPE_FUNC_GEQUAL, ms_index,
4756                           lp_build_broadcast_scalar(&int_coord_bld,
4757                                                     num_samples));
4758       out_of_bounds = lp_build_or(&int_coord_bld, out_of_bounds, out1);
4759 
4760       offset =
4761          lp_build_add(&int_coord_bld, offset,
4762                       lp_build_mul(&int_coord_bld,
4763                                    lp_build_broadcast_scalar(&int_coord_bld,
4764                                                              sample_stride),
4765                                    ms_index));
4766    }
4767    if (params->img_op == LP_IMG_LOAD) {
4768       struct lp_type texel_type = lp_build_texel_type(params->type, format_desc);
4769 
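      /* Zero the offset for out-of-bounds lanes so the fetch itself stays
       * within the resource; the fetched values for those lanes are
       * overwritten by the selects below.
       */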
4770       offset = lp_build_andnot(&int_coord_bld, offset, out_of_bounds);
4771       struct lp_build_context texel_bld;
4772       lp_build_context_init(&texel_bld, gallivm, texel_type);
4773       lp_build_fetch_rgba_soa(gallivm,
4774                               format_desc,
4775                               texel_type, TRUE,
4776                               base_ptr, offset,
4777                               i, j,
4778                               NULL,
4779                               outdata);
4780 
4781       for (unsigned chan = 0; chan < 3; chan++) {
4782          outdata[chan] = lp_build_select(&texel_bld, out_of_bounds,
4783                                          texel_bld.zero, outdata[chan]);
4784       }
4785       if (format_desc->swizzle[3] == PIPE_SWIZZLE_1)
4786          outdata[3] = lp_build_select(&texel_bld, out_of_bounds,
4787                                       texel_bld.one, outdata[3]);
4788       else
4789          outdata[3] = lp_build_select(&texel_bld, out_of_bounds,
4790                                       texel_bld.zero, outdata[3]);
4791    } else if (params->img_op == LP_IMG_STORE) {
4792       lp_build_store_rgba_soa(gallivm, format_desc, params->type,
4793                               params->exec_mask, base_ptr, offset,
4794                               out_of_bounds, params->indata);
4795    } else {
4796       lp_build_do_atomic_soa(gallivm, format_desc, params->type,
4797                              params->exec_mask, base_ptr, offset,
4798                              out_of_bounds, params->img_op, params->op,
4799                              params->indata, params->indata2, outdata);
4800    }
4801 }
4802 
4803 
4804 /*
4805  * These functions are for indirect texture access support.
4806  *
4807  * Indirect textures are implemented using a switch statement that
4808  * takes the texture index and jumps to the sampler function for
4809  * that texture unit.
4810  */
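/*
 * Illustrative caller-side sketch (hypothetical names, not code from this
 * file): the expected sequence is one init, one case per texture unit in
 * [base, range), then fini, after which switch_info->params.texel[] holds
 * the sampled results:
 *
 *    struct lp_build_sample_array_switch sw;
 *    lp_build_sample_array_init_soa(&sw, gallivm, params, unit_idx,
 *                                   base, range);
 *    for (unsigned u = base; u < range; u++)
 *       lp_build_sample_array_case_soa(&sw, u, &tex_state[u],
 *                                      &sampler_state[u], dynamic_state);
 *    lp_build_sample_array_fini_soa(&sw);
 *
 * Here params is the caller's lp_sampler_params pointer, unit_idx the
 * run-time texture index (an LLVMValueRef), and tex_state/sampler_state/
 * dynamic_state whatever per-unit state the caller tracks.
 */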
4811 
4812 /*
4813  * Initialise an indexed sampler switch block.
4814  *
4815  * This sets up the switch_info state and adds the LLVM flow control pieces.
4816  */
4817 void
4818 lp_build_sample_array_init_soa(struct lp_build_sample_array_switch *switch_info,
4819                            struct gallivm_state *gallivm,
4820                            const struct lp_sampler_params *params,
4821                            LLVMValueRef idx,
4822                            unsigned base, unsigned range)
4823 {
4824    switch_info->gallivm = gallivm;
4825    switch_info->params = *params;
4826    switch_info->base = base;
4827    switch_info->range = range;
4828 
4829    /* for generating the switch functions we don't want the texture index
4830     * offset
4831     */
4832    switch_info->params.texture_index_offset = 0;
4833 
4834    LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4835    switch_info->merge_ref = lp_build_insert_new_block(gallivm, "texmerge");
4836 
4837    switch_info->switch_ref = LLVMBuildSwitch(gallivm->builder, idx,
4838                                              switch_info->merge_ref,
4839                                              range - base);
4840 
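   /* The merge block yields a struct of four texel vectors through a phi;
    * the entry block contributes undef for indices that hit no case.
    */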
4841    LLVMTypeRef val_type[4];
4842    val_type[0] = val_type[1] = val_type[2] = val_type[3] =
4843       lp_build_vec_type(gallivm, params->type);
4844 
4845    LLVMTypeRef ret_type =
4846       LLVMStructTypeInContext(gallivm->context, val_type, 4, 0);
4847 
4848    LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4849 
4850    LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4851 
4852    switch_info->phi = LLVMBuildPhi(gallivm->builder, ret_type, "");
4853    LLVMAddIncoming(switch_info->phi, &undef_val, &initial_block, 1);
4854 }
4855 
4856 
4857 /*
4858  * Add an individual entry to the indirect texture switch.
4859  *
4860  * This builds the sample function and links a case for it into the switch
4861  * statement.
4862  */
4863 void
4864 lp_build_sample_array_case_soa(struct lp_build_sample_array_switch *switch_info,
4865                            int idx,
4866                            const struct lp_static_texture_state *static_texture_state,
4867                            const struct lp_static_sampler_state *static_sampler_state,
4868                            struct lp_sampler_dynamic_state *dynamic_texture_state)
4869 {
4870    struct gallivm_state *gallivm = switch_info->gallivm;
4871    LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "texblock");
4872 
4873    LLVMAddCase(switch_info->switch_ref,
4874                LLVMConstInt(LLVMInt32TypeInContext(gallivm->context), idx, 0),
4875                this_block);
4876    LLVMPositionBuilderAtEnd(gallivm->builder, this_block);
4877 
4878    LLVMValueRef tex_ret;
4879    lp_build_sample_soa_func(gallivm, static_texture_state,
4880                             static_sampler_state, dynamic_texture_state,
4881                             &switch_info->params, idx, idx, &tex_ret);
4882 
4883    LLVMAddIncoming(switch_info->phi, &tex_ret, &this_block, 1);
4884    LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
4885 }
4886 
4887 
4888 /*
4889  * Finish the indirect texture switch statement.
4890  *
4891  * This extracts the texel results from the switch's merge phi.
4892  */
4893 void
4894 lp_build_sample_array_fini_soa(struct lp_build_sample_array_switch *switch_info)
4895 {
4896    struct gallivm_state *gallivm = switch_info->gallivm;
4897 
4898    LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4899    for (unsigned i = 0; i < 4; i++) {
4900       switch_info->params.texel[i] =
4901          LLVMBuildExtractValue(gallivm->builder, switch_info->phi, i, "");
4902    }
4903 }
4904 
4905 
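/*
 * Indirect image ops use the same switch pattern as the texture path above:
 * lp_build_image_op_switch_soa() opens the switch, one
 * lp_build_image_op_array_case() per image unit adds a case, and
 * lp_build_image_op_array_fini_soa() collects the per-channel results from
 * the phis (four channels for loads, one for atomics, none for stores).
 */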
4906 void
4907 lp_build_image_op_switch_soa(struct lp_build_img_op_array_switch *switch_info,
4908                              struct gallivm_state *gallivm,
4909                              const struct lp_img_params *params,
4910                              LLVMValueRef idx,
4911                              unsigned base, unsigned range)
4912 {
4913    switch_info->gallivm = gallivm;
4914    switch_info->params = *params;
4915    switch_info->base = base;
4916    switch_info->range = range;
4917 
4918    /* for generating the switch functions we don't want the image index
4919     * offset
4920     */
4921    switch_info->params.image_index_offset = 0;
4922 
4923    LLVMBasicBlockRef initial_block = LLVMGetInsertBlock(gallivm->builder);
4924    switch_info->merge_ref = lp_build_insert_new_block(gallivm, "imgmerge");
4925 
4926    switch_info->switch_ref =
4927       LLVMBuildSwitch(gallivm->builder, idx,
4928                       switch_info->merge_ref, range - base);
4929 
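   /* Stores produce no result, so no phis are needed; loads merge four
    * channel vectors and atomics a single result vector.
    */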
4930    if (params->img_op != LP_IMG_STORE) {
4931       LLVMTypeRef ret_type = lp_build_vec_type(gallivm, params->type);
4932       LLVMValueRef undef_val = LLVMGetUndef(ret_type);
4933 
4934       LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4935 
4936       for (unsigned i = 0; i < ((params->img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4937          switch_info->phi[i] = LLVMBuildPhi(gallivm->builder, ret_type, "");
4938          LLVMAddIncoming(switch_info->phi[i], &undef_val, &initial_block, 1);
4939       }
4940    }
4941 }
4942 
4943 
4944 void
4945 lp_build_image_op_array_case(struct lp_build_img_op_array_switch *switch_info,
4946                             int idx,
4947                             const struct lp_static_texture_state *static_texture_state,
4948                             struct lp_sampler_dynamic_state *dynamic_state)
4949 {
4950    struct gallivm_state *gallivm = switch_info->gallivm;
4951    LLVMBasicBlockRef this_block = lp_build_insert_new_block(gallivm, "img");
4952    LLVMValueRef tex_ret[4];
4953 
4954    LLVMAddCase(switch_info->switch_ref,
4955                lp_build_const_int32(gallivm, idx), this_block);
4956    LLVMPositionBuilderAtEnd(gallivm->builder, this_block);
4957 
4958    switch_info->params.image_index = idx;
4959 
4960    lp_build_img_op_soa(static_texture_state, dynamic_state,
4961                        switch_info->gallivm, &switch_info->params, tex_ret);
4962 
4963    if (switch_info->params.img_op != LP_IMG_STORE) {
4964       for (unsigned i = 0;
4965            i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4966          tex_ret[i] =
4967             LLVMBuildBitCast(gallivm->builder, tex_ret[i],
4968                              lp_build_vec_type(gallivm,
4969                                                switch_info->params.type), "");
4970       }
4971 
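      /* lp_build_img_op_soa() may have ended in a different basic block;
       * use the current block as the incoming edge for the result phis.
       */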
4972       this_block = LLVMGetInsertBlock(gallivm->builder);
4973       for (unsigned i = 0;
4974            i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4975          LLVMAddIncoming(switch_info->phi[i], &tex_ret[i], &this_block, 1);
4976       }
4977    }
4978    LLVMBuildBr(gallivm->builder, switch_info->merge_ref);
4979 }
4980 
4981 
4982 void
4983 lp_build_image_op_array_fini_soa(struct lp_build_img_op_array_switch *switch_info)
4984 {
4985    struct gallivm_state *gallivm = switch_info->gallivm;
4986 
4987    LLVMPositionBuilderAtEnd(gallivm->builder, switch_info->merge_ref);
4988 
4989    if (switch_info->params.img_op != LP_IMG_STORE) {
4990       for (unsigned i = 0;
4991            i < ((switch_info->params.img_op == LP_IMG_LOAD) ? 4 : 1); i++) {
4992          switch_info->params.outdata[i] = switch_info->phi[i];
4993       }
4994    }
4995 }
4996