/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * Copyright 2007 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Code generate the whole fragment pipeline.
 *
 * The fragment pipeline consists of the following stages:
 * - early depth test
 * - fragment shader
 * - alpha test
 * - depth/stencil test
 * - blending
 *
 * This file has only the glue to assemble the fragment pipeline.  The actual
 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
 * lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we
 * muster the LLVM JIT execution engine to create a function that follows an
 * established binary interface and that can be called from C directly.
 *
 * A big source of complexity here is that we often want to run different
 * stages with different data types and precisions. For example, the fragment
 * shader typically needs to be done in floats, but the depth/stencil test
 * and blending are better done in the types that most closely match the
 * depth/stencil and color buffers, respectively.
 *
 * Since the width of a SIMD vector register stays the same regardless of the
 * element type, different types imply different numbers of elements, so we
 * must code generate more instances of the stages with larger types to be
 * able to feed/consume the stages with smaller types.
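 * For example, with 128-bit SIMD registers a 32-bit float type gives 4-wide
 * vectors while an 8-bit unorm color type gives 16-wide vectors, so four
 * fragment shader result vectors are needed to feed one blend invocation.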
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include <limits.h>
#include "pipe/p_defines.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_pointer.h"
#include "util/format/u_format.h"
#include "util/u_dump.h"
#include "util/u_string.h"
#include "util/u_dual_blend.h"
#include "util/u_upload_mgr.h"
#include "util/os_time.h"
#include "pipe/p_shader_tokens.h"
#include "draw/draw_context.h"
#include "nir/tgsi_to_nir.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_tgsi.h"
#include "gallivm/lp_bld_nir.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_pack.h"
#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_quad.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_jit_sample.h"

#include "lp_bld_alpha.h"
#include "lp_bld_blend.h"
#include "lp_bld_depth.h"
#include "lp_bld_interp.h"
#include "lp_context.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_setup.h"
#include "lp_state.h"
#include "lp_tex_sample.h"
#include "lp_flush.h"
#include "lp_state_fs.h"
#include "lp_rast.h"
#include "nir/nir_to_tgsi_info.h"

#include "lp_screen.h"
#include "compiler/nir/nir_serialize.h"
#include "util/mesa-sha1.h"


/** Fragment shader number (for debugging) */
static unsigned fs_no = 0;


static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMTypeRef base_type,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef *dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment);
/**
 * Checks if a format description is an arithmetic format
 *
 * A format which has irregular channel sizes such as R3_G3_B2 or R5_G6_B5.
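 * For example, R5_G6_B5 qualifies because its 5/6/5-bit channels differ in
 * size and are not multiples of 8 bits, while R8G8B8A8 (four equal 8-bit
 * channels) does not.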
 */
static inline bool
is_arithmetic_format(const struct util_format_description *format_desc)
{
   bool arith = false;

   for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
      arith |= (format_desc->channel[i].size % 8) != 0;
   }

   return arith;
}


/**
 * Checks if this format requires special handling due to required expansion
 * to floats for blending, and furthermore has "natural" packed AoS ->
 * unpacked SoA conversion.
 */
static inline bool
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
      return true;
   }
   return false;
}


/**
 * Retrieves the type representing the memory layout for a format
 *
 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
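 * For an arithmetic format such as R5_G6_B5 the result is a single value
 * whose width is the sum of the channel sizes (16 bits, length 1).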
 */
static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                             struct lp_type *type)
{
   if (format_expands_to_float_soa(format_desc)) {
      /* just make this a uint with width of block */
      type->floating = false;
      type->fixed = false;
      type->sign = false;
      type->norm = false;
      type->width = format_desc->block.bits;
      type->length = 1;
      return;
   }

   int chan = util_format_get_first_non_void_channel(format_desc->format);

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;

   if (is_arithmetic_format(format_desc)) {
      type->width = 0;
      type->length = 1;

      for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
         type->width += format_desc->channel[i].size;
      }
   } else {
      type->width = format_desc->channel[chan].size;
      type->length = format_desc->nr_channels;
   }
}


/**
 * Expand the relevant bits of mask_input to an n*4-dword mask for the
 * n*4 pixels in n 2x2 quads.  This will set the n*4 elements of the
 * quad mask vector to 0 or ~0.
 * Quads are grouped 01, 23 in two-quad mode, hence only 0 and 2 are valid
 * first_quad arguments when the fs length is 8.
 *
 * \param first_quad  which quad(s) of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
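 *
 * The 16 mask bits for one sample are laid out row-major over the 4x4
 * stamp (bit = 4*y + x), so e.g. quad 0 covers bits 0, 1, 4 and 5.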
 */
static LLVMValueRef
generate_quad_mask(struct gallivm_state *gallivm,
                   struct lp_type fs_type,
                   unsigned first_quad,
                   unsigned sample,
                   LLVMValueRef mask_input) /* int64 */
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef bits[16];
   LLVMValueRef mask, bits_vec;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length <= ARRAY_SIZE(bits));
   struct lp_type mask_type = lp_int_type(fs_type);

   /*
    * mask_input >>= (quad * 4)
    */
   int shift;
   switch (first_quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      assert(fs_type.length == 4);
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      assert(fs_type.length == 4);
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder, mask_input,
                              lp_build_const_int64(gallivm, 16 * sample), "");
   mask_input = LLVMBuildTrunc(builder, mask_input, i32t, "");
   mask_input = LLVMBuildAnd(builder, mask_input,
                             lp_build_const_int32(gallivm, 0xffff), "");
   mask_input = LLVMBuildLShr(builder, mask_input,
                              LLVMConstInt(i32t, shift, 0), "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
   mask = lp_build_broadcast(gallivm,
                             lp_build_vec_type(gallivm, mask_type),
                             mask_input);

   for (int i = 0; i < fs_type.length / 4; i++) {
      unsigned j = 2 * (i % 2) + (i / 2) * 8;
      bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
      bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
      bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
      bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
   }
   bits_vec = LLVMConstVector(bits, fs_type.length);
   mask = LLVMBuildAnd(builder, mask, bits_vec, "");

   /*
    * mask = mask == bits ? ~0 : 0
    */
   mask = lp_build_compare(gallivm,
                           mask_type, PIPE_FUNC_EQUAL,
                           mask, bits_vec);

   return mask;
}


#define EARLY_DEPTH_TEST  0x1
#define LATE_DEPTH_TEST   0x2
#define EARLY_DEPTH_WRITE 0x4
#define LATE_DEPTH_WRITE  0x8
#define EARLY_DEPTH_TEST_INFERRED  0x10 /* only with EARLY_DEPTH_TEST */
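
/*
 * Broadly (see how depth_mode is chosen in generate_fs_loop() below):
 * early test + early write when the shader has no side effects and writes
 * neither depth nor stencil; EARLY_DEPTH_TEST_INFERRED (possibly with a
 * late write) when alpha test, discard or an output sample mask may still
 * kill fragments after the early test; late test + late write otherwise.
 */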

static unsigned
get_cbuf_location(nir_variable *var, unsigned slot)
{
   return (var->data.location - FRAG_RESULT_DATA0) + var->data.index + slot;
}

static int
find_output_by_frag_result(struct nir_shader *shader,
                           gl_frag_result frag_result)
{
   nir_foreach_shader_out_variable(var, shader) {
      int slots = nir_variable_count_slots(var, var->type);
      for (unsigned s = 0; s < slots; s++) {
         if (var->data.location + var->data.index + s == frag_result)
            return var->data.driver_location + s;
      }
   }

   return -1;
}

/**
 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
 */
static LLVMValueRef
lp_llvm_viewport(LLVMTypeRef context_type,
                 LLVMValueRef context_ptr,
                 struct gallivm_state *gallivm,
                 LLVMValueRef viewport_index)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ptr;
   LLVMValueRef res;
   struct lp_type viewport_type =
      lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);
   LLVMTypeRef vtype = lp_build_vec_type(gallivm, viewport_type);

   ptr = lp_jit_context_viewports(gallivm, context_type, context_ptr);
   ptr = LLVMBuildPointerCast(builder, ptr,
            LLVMPointerType(vtype, 0), "");

   res = lp_build_pointer_get2(builder, vtype, ptr, viewport_index);

   return res;
}


static LLVMValueRef
lp_build_depth_clamp(struct gallivm_state *gallivm,
                     LLVMBuilderRef builder,
                     bool depth_clamp,
                     bool restrict_depth,
                     struct lp_type type,
                     LLVMTypeRef context_type,
                     LLVMValueRef context_ptr,
                     LLVMTypeRef thread_data_type,
                     LLVMValueRef thread_data_ptr,
                     LLVMValueRef z)
{
   LLVMValueRef viewport, min_depth, max_depth;
   LLVMValueRef viewport_index;
   struct lp_build_context f32_bld;

   assert(type.floating);
   lp_build_context_init(&f32_bld, gallivm, type);

   if (restrict_depth)
      z = lp_build_clamp(&f32_bld, z, f32_bld.zero, f32_bld.one);

   if (!depth_clamp)
      return z;

   /*
    * Assumes clamping of the viewport index will occur in setup/gs. Value
    * is passed through the rasterization stage via lp_rast_shader_inputs.
    *
    * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
    *      semantics.
    */
   viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
                                                                   thread_data_type,
                                                                   thread_data_ptr);

   /*
    * Load the min and max depth from the lp_jit_context.viewports
    * array of lp_jit_viewport structures.
    */
   viewport = lp_llvm_viewport(context_type, context_ptr, gallivm, viewport_index);

   /* viewports[viewport_index].min_depth */
   min_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
   min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);

   /* viewports[viewport_index].max_depth */
   max_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
   max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);

   /*
    * Clamp to the min and max depth values for the given viewport.
    */
   return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
}


static LLVMValueRef
lp_build_alpha_to_coverage_dither(struct gallivm_state *gallivm,
                                  struct lp_type type,
                                  unsigned coverage_samples,
                                  const LLVMValueRef *pos,
                                  LLVMValueRef alpha)
{
   LLVMBuilderRef builder = gallivm->builder;
   /* Standard ordered dithering 2x2 threshold matrix. */
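   /* These are the classic 2x2 Bayer thresholds (n + 0.5) / 4 for
    * n in {0, 2, 3, 1}, scaled by 1/coverage_samples; they are indexed
    * below by ((y & 1) << 1) | (x & 1). */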
   LLVMValueRef elems[] = {
      lp_build_const_elem(gallivm, type, 0.125 / coverage_samples),
      lp_build_const_elem(gallivm, type, 0.625 / coverage_samples),
      lp_build_const_elem(gallivm, type, 0.875 / coverage_samples),
      lp_build_const_elem(gallivm, type, 0.375 / coverage_samples),
   };
   LLVMValueRef dither_thresholds = LLVMConstVector(elems, ARRAY_SIZE(elems));
   /* Get a two bit mask, where each bit is even/odd on X and Y. */
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef frag_int_pos_x = LLVMBuildFPToSI(builder, pos[0], int_vec_type, "frag_int_pos_x");
   LLVMValueRef frag_int_pos_y = LLVMBuildFPToSI(builder, pos[1], int_vec_type, "frag_int_pos_y");
   LLVMValueRef odd_bitmask = lp_build_const_int_vec(gallivm, type, 1);
   LLVMValueRef dither_index = LLVMBuildOr(builder, LLVMBuildAnd(builder, frag_int_pos_x, odd_bitmask, ""),
                                           LLVMBuildShl(builder, LLVMBuildAnd(builder, frag_int_pos_y, odd_bitmask, ""),
                                                        lp_build_const_int_vec(gallivm, type, 1), ""), "dither_index");
   /* Use the bit mask as an index in the threshold matrix, subtract it from the alpha value. */
   LLVMValueRef offsets = LLVMGetUndef(lp_build_vec_type(gallivm, type));
   for (unsigned i = 0; i < type.length; i++) {
      LLVMValueRef index = lp_build_const_int32(gallivm, i);
      offsets = LLVMBuildInsertElement(builder, offsets,
                                       LLVMBuildExtractElement(builder, dither_thresholds,
                                                               LLVMBuildExtractElement(builder, dither_index,
                                                                                       index, "threshold"),
                                                               ""), index, "");
   }
   /* Alpha value is only used in a comparison, no need to clamp to [0, 1]. */
   return LLVMBuildFSub(builder, alpha, offsets, "");
}


static void
lp_build_sample_alpha_to_coverage(struct gallivm_state *gallivm,
                                  struct lp_type type,
                                  unsigned coverage_samples,
                                  LLVMValueRef num_loop,
                                  LLVMValueRef loop_counter,
                                  LLVMTypeRef coverage_mask_type,
                                  LLVMValueRef coverage_mask_store,
                                  LLVMValueRef alpha)
{
   struct lp_build_context bld;
   LLVMBuilderRef builder = gallivm->builder;
   float step = 1.0 / coverage_samples;
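   /* Sample s stays covered only if alpha > s * step; e.g. with 4 coverage
    * samples an alpha of 0.6 keeps samples 0, 1 and 2 (thresholds 0.0,
    * 0.25, 0.5) and drops sample 3 (threshold 0.75). */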

   lp_build_context_init(&bld, gallivm, type);
   for (unsigned s = 0; s < coverage_samples; s++) {
      LLVMValueRef alpha_ref_value = lp_build_const_vec(gallivm, type, step * s);
      LLVMValueRef test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value);

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, lp_build_const_int32(gallivm, s), num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_counter, "");
      LLVMValueRef s_mask_ptr = LLVMBuildGEP2(builder, coverage_mask_type,
                                              coverage_mask_store, &s_mask_idx, 1, "");
      LLVMValueRef s_mask = LLVMBuildLoad2(builder, coverage_mask_type, s_mask_ptr, "");
      s_mask = LLVMBuildAnd(builder, s_mask, test, "");
      LLVMBuildStore(builder, s_mask, s_mask_ptr);
   }
}


struct lp_build_fs_llvm_iface {
   struct lp_build_fs_iface base;
   struct lp_build_interp_soa_context *interp;
   struct lp_build_for_loop_state *loop_state;
   LLVMTypeRef mask_type;
   LLVMValueRef mask_store;
   LLVMValueRef sample_id;
   LLVMValueRef color_ptr_ptr;
   LLVMValueRef color_stride_ptr;
   LLVMValueRef color_sample_stride_ptr;
   LLVMValueRef zs_base_ptr;
   LLVMValueRef zs_stride;
   LLVMValueRef zs_sample_stride;
   const struct lp_fragment_shader_variant_key *key;
};


static LLVMValueRef
fs_interp(const struct lp_build_fs_iface *iface,
          struct lp_build_context *bld,
          unsigned attrib, unsigned chan,
          bool centroid, bool sample,
          LLVMValueRef attrib_indir,
          LLVMValueRef offsets[2])
{
   struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
   struct lp_build_interp_soa_context *interp = fs_iface->interp;
   unsigned loc = TGSI_INTERPOLATE_LOC_CENTER;
   if (centroid)
      loc = TGSI_INTERPOLATE_LOC_CENTROID;
   if (sample)
      loc = TGSI_INTERPOLATE_LOC_SAMPLE;

   return lp_build_interp_soa(interp, bld->gallivm, fs_iface->loop_state->counter,
                              fs_iface->mask_type, fs_iface->mask_store,
                              attrib, chan, loc, attrib_indir, offsets);
}


/**
 * Convert depth-stencil format to a single component one, returning
 * PIPE_FORMAT_NONE if it doesn't contain the required component.
 */
static enum pipe_format
select_zs_component_format(enum pipe_format format,
                           bool fetch_stencil)
{
   const struct util_format_description *desc = util_format_description(format);
   if (fetch_stencil && !util_format_has_stencil(desc))
      return PIPE_FORMAT_NONE;
   if (!fetch_stencil && !util_format_has_depth(desc))
      return PIPE_FORMAT_NONE;

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      return fetch_stencil ? PIPE_FORMAT_X24S8_UINT : PIPE_FORMAT_Z24X8_UNORM;
   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
      return fetch_stencil ? PIPE_FORMAT_S8X24_UINT : PIPE_FORMAT_X8Z24_UNORM;
   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
      return fetch_stencil ? PIPE_FORMAT_X32_S8X24_UINT : format;
   default:
      return format;
   }
}

static void
fs_fb_fetch(const struct lp_build_fs_iface *iface,
            struct lp_build_context *bld,
            int location,
            LLVMValueRef result[4])
{
   struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
   LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
   LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
   const struct lp_fragment_shader_variant_key *key = fs_iface->key;

   LLVMValueRef buf_ptr;
   LLVMValueRef stride;
   enum pipe_format buf_format;

   const bool fetch_stencil = location == FRAG_RESULT_STENCIL;
   const bool fetch_zs = fetch_stencil || location == FRAG_RESULT_DEPTH;
   if (fetch_zs) {
      buf_ptr = fs_iface->zs_base_ptr;
      stride = fs_iface->zs_stride;
      buf_format = select_zs_component_format(key->zsbuf_format, fetch_stencil);
   } else {
      assert(location >= FRAG_RESULT_DATA0 && location <= FRAG_RESULT_DATA7);
      const int cbuf = location - FRAG_RESULT_DATA0;
      LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);

      buf_ptr = LLVMBuildLoad2(builder, int8p_type,
                               LLVMBuildGEP2(builder, int8p_type,
                                             fs_iface->color_ptr_ptr, &index, 1, ""), "");
      stride = LLVMBuildLoad2(builder, int32_type,
                              LLVMBuildGEP2(builder, int32_type,
                                            fs_iface->color_stride_ptr, &index, 1, ""), "");
      buf_format = key->cbuf_format[cbuf];
   }

   const struct util_format_description *out_format_desc = util_format_description(buf_format);
   if (out_format_desc->format == PIPE_FORMAT_NONE) {
      result[0] = result[1] = result[2] = result[3] = bld->undef;
      return;
   }

   unsigned block_size = bld->type.length;
   unsigned block_height = key->resource_1d ? 1 : 2;
   unsigned block_width = block_size / block_height;

   if (key->multisample) {
      LLVMValueRef sample_stride;

      if (fetch_zs) {
         sample_stride = fs_iface->zs_sample_stride;
      } else {
         LLVMValueRef index = lp_build_const_int32(gallivm, location - FRAG_RESULT_DATA0);
         sample_stride = LLVMBuildLoad2(builder, int32_type,
                                       LLVMBuildGEP2(builder,
                                                     int32_type,
                                                     fs_iface->color_sample_stride_ptr,
                                                     &index, 1, ""), "");
      }

      LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, fs_iface->sample_id, "");
      buf_ptr = LLVMBuildGEP2(builder, int8_type,
                              buf_ptr, &sample_offset, 1, "");
   }

   /* The fragment shader executes on 4x4 blocks; depending on the vector
    * width it takes 4 or 2 iterations per block.  Only move to the next
    * row pair once the top rows are complete: 8-wide takes 1 iteration
    * per row pair, 4-wide takes 2. */
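   /* E.g. with 4-wide vectors iterations 0..3 start at block offsets
    * (0,0), (2,0), (0,2), (2,2); with 8-wide vectors iteration i starts
    * at row 2*i. */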
   LLVMValueRef x_offset = NULL, y_offset = NULL;
   if (!key->resource_1d) {
      LLVMValueRef counter = fs_iface->loop_state->counter;

      if (block_size == 4) {
         x_offset = LLVMBuildShl(builder,
                                 LLVMBuildAnd(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""),
                                 lp_build_const_int32(gallivm, 1), "");
         counter = LLVMBuildLShr(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), "");
      }
      y_offset = LLVMBuildMul(builder, counter, lp_build_const_int32(gallivm, 2), "");
   }

   LLVMValueRef offsets[4 * 4];
   for (unsigned i = 0; i < block_size; i++) {
      unsigned x = i % block_width;
      unsigned y = i / block_width;

      if (block_size == 8) {
         /* remap the raw slots into the fragment shader execution mode. */
         /* this math took me way too long to work out, I'm sure it's
          * overkill.
          */
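         /* Lanes 0..7 map to (x, y) = (0,0) (1,0) (0,1) (1,1)
          * (2,0) (3,0) (2,1) (3,1): each group of four lanes forms
          * one 2x2 pixel quad. */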
         x = (i & 1) + ((i >> 2) << 1);
         if (!key->resource_1d)
            y = (i & 2) >> 1;
      }

      LLVMValueRef x_val;
      if (x_offset) {
         x_val = LLVMBuildAdd(builder, lp_build_const_int32(gallivm, x), x_offset, "");
         x_val = LLVMBuildMul(builder, x_val, lp_build_const_int32(gallivm, out_format_desc->block.bits / 8), "");
      } else {
         x_val = lp_build_const_int32(gallivm, x * (out_format_desc->block.bits / 8));
      }

      LLVMValueRef y_val = lp_build_const_int32(gallivm, y);
      if (y_offset)
         y_val = LLVMBuildAdd(builder, y_val, y_offset, "");
      y_val = LLVMBuildMul(builder, y_val, stride, "");

      offsets[i] = LLVMBuildAdd(builder, x_val, y_val, "");
   }
   LLVMValueRef offset = lp_build_gather_values(gallivm, offsets, block_size);

   struct lp_type texel_type = bld->type;
   if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       out_format_desc->channel[0].pure_integer) {
      if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
         texel_type = lp_type_int_vec(bld->type.width, bld->type.width * bld->type.length);
      } else if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
      }
   } else if (fetch_stencil) {
      texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
   }

   lp_build_fetch_rgba_soa(gallivm, out_format_desc, texel_type,
                           true, buf_ptr, offset,
                           NULL, NULL, NULL, result);
}

/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 */
static void
generate_fs_loop(struct gallivm_state *gallivm,
                 struct lp_fragment_shader *shader,
                 const struct lp_fragment_shader_variant_key *key,
                 LLVMBuilderRef builder,
                 struct lp_type type,
                 LLVMTypeRef context_type,
                 LLVMValueRef context_ptr,
                 LLVMTypeRef resources_type,
                 LLVMValueRef resources_ptr,
                 LLVMTypeRef sample_pos_type,
                 LLVMValueRef sample_pos_array,
                 LLVMValueRef num_loop,
                 struct lp_build_interp_soa_context *interp,
                 const struct lp_build_sampler_soa *sampler,
                 const struct lp_build_image_soa *image,
                 LLVMTypeRef mask_type,
                 LLVMValueRef mask_store,
                 LLVMValueRef (*out_color)[4],
                 LLVMValueRef depth_base_ptr,
                 LLVMValueRef depth_stride,
                 LLVMValueRef depth_sample_stride,
                 LLVMValueRef color_ptr_ptr,
                 LLVMValueRef color_stride_ptr,
                 LLVMValueRef color_sample_stride_ptr,
                 LLVMValueRef facing,
                 LLVMTypeRef thread_data_type,
                 LLVMValueRef thread_data_ptr)
{
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef mask_ptr = NULL, mask_val = NULL;
   LLVMValueRef z;
   LLVMValueRef z_value, s_value;
   LLVMValueRef z_fb, s_fb;
   LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples);
   LLVMValueRef z_out = NULL, s_out = NULL;
   struct lp_build_for_loop_state loop_state, sample_loop_state = {0};
   struct lp_build_mask_context mask;
   struct nir_shader *nir = shader->base.ir.nir;
   const bool dual_source_blend = key->blend.rt[0].blend_enable &&
                                  util_blend_state_is_dual(&key->blend, 0);
   const bool post_depth_coverage = nir->info.fs.post_depth_coverage;

   struct lp_bld_tgsi_system_values system_values;

   memset(&system_values, 0, sizeof(system_values));

   /* truncate then sign extend. */
   system_values.front_facing =
      LLVMBuildTrunc(gallivm->builder, facing,
                     LLVMInt1TypeInContext(gallivm->context), "");
   system_values.front_facing =
      LLVMBuildSExt(gallivm->builder, system_values.front_facing,
                    LLVMInt32TypeInContext(gallivm->context), "");
   system_values.view_index =
      lp_jit_thread_data_raster_state_view_index(gallivm,
                                                 thread_data_type,
                                                 thread_data_ptr);

   unsigned depth_mode;
   const struct util_format_description *zs_format_desc = NULL;
   if (key->depth.enabled ||
       key->stencil[0].enabled) {
      zs_format_desc = util_format_description(key->zsbuf_format);

      if (nir->info.fs.early_fragment_tests || nir->info.fs.post_depth_coverage) {
         depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
      } else if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
                 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) &&
                 !nir->info.fs.uses_fbfetch_output && !nir->info.writes_memory) {
         if (key->alpha.enabled ||
             key->blend.alpha_to_coverage ||
             nir->info.fs.uses_discard ||
             nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
            /* With alpha test and kill, can do the depth test early
             * and hopefully eliminate some quads.  But need to do a
             * special deferred depth write once the final mask value
             * is known. This only works though if there's either no
             * stencil test or the stencil value isn't written.
             */
            if (key->stencil[0].enabled && (key->stencil[0].writemask ||
                                            (key->stencil[1].enabled &&
                                             key->stencil[1].writemask)))
               depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
            else
               depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
         } else {
            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
         }
      } else {
         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
      }

      if (!(key->depth.enabled && key->depth.writemask) &&
          !(key->stencil[0].enabled && (key->stencil[0].writemask ||
                                        (key->stencil[1].enabled &&
                                         key->stencil[1].writemask))))
         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
   } else {
      depth_mode = 0;
   }

   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, int_type);

   LLVMValueRef stencil_refs[2];
   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_type, context_ptr);
   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_type, context_ptr);
   /* convert scalar stencil refs into vectors */
   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);

   LLVMValueRef consts_ptr = lp_jit_resources_constants(gallivm, resources_type, resources_ptr);

   LLVMValueRef ssbo_ptr = lp_jit_resources_ssbos(gallivm, resources_type, resources_ptr);

   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
   memset(outputs, 0, sizeof outputs);

   /* Allocate color storage for each fragment sample */
   LLVMValueRef color_store_size = num_loop;
   if (key->min_samples > 1)
      color_store_size = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, key->min_samples), "");

   for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
                                                       lp_build_vec_type(gallivm,
                                                                         type),
                                                       color_store_size, "color");
      }
   }
   if (dual_source_blend) {
      assert(key->nr_cbufs <= 1);
      for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[1][chan] = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm,
                                                                      type),
                                                    color_store_size, "color1");
      }
   }
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      z_out = lp_build_array_alloca(gallivm,
                                    lp_build_vec_type(gallivm, type),
                                    color_store_size, "depth");
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      s_out = lp_build_array_alloca(gallivm,
                                    lp_build_vec_type(gallivm, type),
                                    color_store_size, "stencil");
   }

   lp_build_for_loop_begin(&loop_state, gallivm,
                           lp_build_const_int32(gallivm, 0),
                           LLVMIntULT,
                           num_loop,
                           lp_build_const_int32(gallivm, 1));

   LLVMValueRef sample_mask_in;
   if (key->multisample) {
      sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
      /* create shader execution mask by combining all sample masks. */
      for (unsigned s = 0; s < key->coverage_samples; s++) {
         LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
         s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
         LLVMValueRef s_mask = lp_build_pointer_get2(builder, mask_type, mask_store, s_mask_idx);
         if (s == 0)
            mask_val = s_mask;
         else
            mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");

         LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1ll << s)), "");
         sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
      }
   } else {
      sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
      mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
                              &loop_state.counter, 1, "mask_ptr");
      mask_val = LLVMBuildLoad2(builder, mask_type, mask_ptr, "");

      LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
      sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
   }

   /* 'mask' will control execution based on quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, gallivm, type, mask_val);

   if (!(depth_mode & EARLY_DEPTH_TEST))
      lp_build_mask_check(&mask);

   /* Create storage for recombining sample masks after early Z pass. */
   LLVMValueRef s_mask_or = lp_build_alloca(gallivm, int_vec_type, "cov_mask_early_depth");
   LLVMBuildStore(builder, LLVMConstNull(int_vec_type), s_mask_or);

   /* Create storage for post depth sample mask */
   LLVMValueRef post_depth_sample_mask_in = NULL;
   if (post_depth_coverage)
      post_depth_sample_mask_in = lp_build_alloca(gallivm, int_vec_type, "post_depth_sample_mask_in");

   LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
   LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
   LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
   LLVMTypeRef z_type = NULL, z_fb_type = NULL;

   /* Run early depth once per sample */
   if (key->multisample) {

      if (zs_format_desc) {
         struct lp_type zs_type = lp_depth_type(zs_format_desc, type.length);
         struct lp_type z_type = zs_type;
         struct lp_type s_type = zs_type;
         if (zs_format_desc->block.bits < type.width)
            z_type.width = type.width;
         if (zs_format_desc->block.bits == 8) {
            s_type.width = type.width;
         } else if (zs_format_desc->block.bits > 32) {
            z_type.width = z_type.width / 2;
            s_type.width = s_type.width / 2;
            s_type.floating = 0;
         }
         z_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
                                                      zs_samples, "z_sample_store");
         s_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
                                                      zs_samples, "s_sample_store");
         z_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, z_type),
                                            zs_samples, "z_fb_store");
         s_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, s_type),
                                            zs_samples, "s_fb_store");
      }
      lp_build_for_loop_begin(&sample_loop_state, gallivm,
                              lp_build_const_int32(gallivm, 0),
                              LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
                              lp_build_const_int32(gallivm, 1));

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
      s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");

      s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
      s_mask = LLVMBuildAnd(builder, s_mask, mask_val, "");
   }


   /* For multisample, Z needs to be interpolated at the sample points for testing. */
   lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter,
                                      key->multisample
                                      ? sample_loop_state.counter : NULL);
   z = interp->pos[2];

   LLVMValueRef depth_ptr = depth_base_ptr;
   if (key->multisample) {
      LLVMValueRef sample_offset =
         LLVMBuildMul(builder, sample_loop_state.counter,
                      depth_sample_stride, "");
      depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
                                depth_ptr, &sample_offset, 1, "");
   }

   if (depth_mode & EARLY_DEPTH_TEST) {
      z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
                               key->restrict_depth_values, type,
                               context_type, context_ptr,
                               thread_data_type, thread_data_ptr, z);

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);
      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  key->multisample ? NULL : &mask,
                                  &s_mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !key->multisample,
                                  key->restrict_depth_values);

      if (depth_mode & EARLY_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
      /*
       * Note: if stencil is enabled, the mask check must come after the ds
       * write, not after the stencil test; otherwise new stencil values may
       * not get written if all fragments got killed by the depth/stencil
       * test.
       */
      if (key->stencil[0].enabled && !key->multisample)
         lp_build_mask_check(&mask);

      if (key->multisample) {
         z_fb_type = LLVMTypeOf(z_fb);
         z_type = LLVMTypeOf(z_value);
         lp_build_pointer_set(builder, z_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, z_value, lp_build_int_vec_type(gallivm, type), ""));
         lp_build_pointer_set(builder, s_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, s_value, lp_build_int_vec_type(gallivm, type), ""));
         lp_build_pointer_set(builder, z_fb_store, sample_loop_state.counter, z_fb);
         lp_build_pointer_set(builder, s_fb_store, sample_loop_state.counter, s_fb);
      }
      if (key->occlusion_count && !(depth_mode & EARLY_DEPTH_TEST_INFERRED)) {
         LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
         lp_build_name(counter, "counter");
         lp_build_occlusion_count(gallivm, type,
                                 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
      }
   }

   if (key->multisample) {
      /*
       * Store the post-early Z coverage mask.
       * Recombine the resulting coverage masks post early Z into the fragment
       * shader execution mask.
       */
      LLVMValueRef tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
      tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
      LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);

      if (post_depth_coverage) {
         LLVMValueRef mask_bit_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
         LLVMValueRef post_depth_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
         mask_bit_idx = LLVMBuildAnd(builder, s_mask, lp_build_broadcast(gallivm, int_vec_type, mask_bit_idx), "");
         post_depth_mask_in = LLVMBuildOr(builder, post_depth_mask_in, mask_bit_idx, "");
         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
      }

      LLVMBuildStore(builder, s_mask, s_mask_ptr);

      lp_build_for_loop_end(&sample_loop_state);

      /* Recombine all the coverage masks into the shader exec mask. */
      tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
      lp_build_mask_update(&mask, tmp_s_mask_or);

      if (key->min_samples == 1) {
         /* For multisample, Z needs to be re-interpolated at the pixel center. */
         lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
         z = interp->pos[2];
         lp_build_mask_update(&mask, tmp_s_mask_or);
      }
   } else {
      if (post_depth_coverage) {
         LLVMValueRef post_depth_mask_in = LLVMBuildAnd(builder, lp_build_mask_value(&mask), lp_build_const_int_vec(gallivm, type, 1), "");
         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
      }
   }

   LLVMValueRef out_sample_mask_storage = NULL;
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
      out_sample_mask_storage = lp_build_alloca(gallivm, int_vec_type, "write_mask");
      if (key->min_samples > 1)
         LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
   }

   if (post_depth_coverage) {
      system_values.sample_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
   } else {
      system_values.sample_mask_in = sample_mask_in;
   }
   if (key->multisample && key->min_samples > 1) {
      lp_build_for_loop_begin(&sample_loop_state, gallivm,
                              lp_build_const_int32(gallivm, 0),
                              LLVMIntULT,
                              lp_build_const_int32(gallivm, key->min_samples),
                              lp_build_const_int32(gallivm, 1));

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
      s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
      s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
      lp_build_mask_force(&mask, s_mask);
      lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
      system_values.sample_id = sample_loop_state.counter;
      system_values.sample_mask_in = LLVMBuildAnd(builder, system_values.sample_mask_in,
                                                  lp_build_broadcast(gallivm, int_vec_type,
                                                                     LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "")), "");
   } else {
      system_values.sample_id = lp_build_const_int32(gallivm, 0);
   }
   system_values.sample_pos = sample_pos_array;
   system_values.sample_pos_type = sample_pos_type;

   lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter,
                                         mask_type, mask_store, sample_loop_state.counter);

   struct lp_build_fs_llvm_iface fs_iface = {
     .base.interp_fn = fs_interp,
     .base.fb_fetch = fs_fb_fetch,
     .interp = interp,
     .loop_state = &loop_state,
     .sample_id = system_values.sample_id,
     .mask_type = mask_type,
     .mask_store = mask_store,
     .color_ptr_ptr = color_ptr_ptr,
     .color_stride_ptr = color_stride_ptr,
     .color_sample_stride_ptr = color_sample_stride_ptr,
     .zs_base_ptr = depth_base_ptr,
     .zs_stride = depth_stride,
     .zs_sample_stride = depth_sample_stride,
     .key = key,
   };

   struct lp_build_tgsi_params params;
   memset(&params, 0, sizeof(params));

   params.type = type;
   params.mask = &mask;
   params.fs_iface = &fs_iface.base;
   params.consts_ptr = consts_ptr;
   params.system_values = &system_values;
   params.inputs = interp->inputs;
   params.num_inputs = interp->num_attribs - 1;
   params.context_type = context_type;
   params.context_ptr = context_ptr;
   params.resources_type = resources_type;
   params.resources_ptr = resources_ptr;
   params.thread_data_type = thread_data_type;
   params.thread_data_ptr = thread_data_ptr;
   params.sampler = sampler;
   params.info = &shader->info.base;
   params.ssbo_ptr = ssbo_ptr;
   params.image = image;

   /* Build the actual shader */
   lp_build_nir_soa(gallivm, nir, &params, outputs);

   /*
    * Must not count ps invocations if there's a null shader.
    * (It would be ok to count with a null shader if there are d/s tests,
    * but only if there are d/s buffers too, which is different from
    * implicit rasterization disable, which must not depend on the d/s
    * buffers.)
    * Could disable this if there's no stats query, but it's maybe not
    * worth it.
    */
1116    if (shader->info.base.num_instructions > 1) {
1117       LLVMValueRef invocs = lp_jit_thread_data_ps_invocations(gallivm, thread_data_type, thread_data_ptr);
1118       lp_build_occlusion_count(gallivm, type, lp_build_mask_value(&mask), invocs);
1119    }
1120 
1121    /* Alpha test */
1122    if (key->alpha.enabled) {
1123       int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1124 
1125       if (color0 != -1 && outputs[color0][3]) {
1126          const struct util_format_description *cbuf_format_desc;
1127          LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1128          LLVMValueRef alpha_ref_value;
1129 
1130          alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_type, context_ptr);
1131          alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);
1132 
1133          cbuf_format_desc = util_format_description(key->cbuf_format[0]);
1134 
1135          lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
1136                              &mask, alpha, alpha_ref_value,
1137                              ((depth_mode & LATE_DEPTH_TEST) != 0) && !key->multisample);
1138       }
1139    }
1140 
1141    /* Alpha to coverage */
1142    if (key->blend.alpha_to_coverage) {
1143       int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1144 
1145       if (color0 != -1 && outputs[color0][3]) {
1146          LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1147 
1148          if (key->blend.alpha_to_coverage_dither) {
1149             alpha = lp_build_alpha_to_coverage_dither(gallivm, type, key->coverage_samples,
1150                                                       interp->pos, alpha);
1151          }
1152 
1153          if (!key->multisample) {
1154             lp_build_alpha_to_coverage(gallivm, type,
1155                                        &mask, alpha,
1156                                        key->blend.alpha_to_coverage_dither,
1157                                        (depth_mode & LATE_DEPTH_TEST) != 0);
1158          } else {
1159             lp_build_sample_alpha_to_coverage(gallivm, type, key->coverage_samples, num_loop,
1160                                               loop_state.counter,
1161                                               mask_type, mask_store, alpha);
1162          }
1163       }
1164    }
1165 
1166    if (key->blend.alpha_to_one) {
1167       nir_foreach_shader_out_variable(var, nir) {
1168          if (var->data.location < FRAG_RESULT_DATA0)
1169             continue;
1170          int slots = nir_variable_count_slots(var, var->type);
1171          for (unsigned s = 0; s < slots; s++) {
1172             unsigned cbuf = get_cbuf_location(var, s);
1173             if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend))
1174                if (outputs[cbuf][3]) {
1175                   LLVMBuildStore(builder, lp_build_const_vec(gallivm, type, 1.0),
1176                                  outputs[cbuf][3]);
1177                }
1178          }
1179       }
1180    }
1181 
1182    if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
1183       LLVMValueRef output_smask = NULL;
1184       int smaski = find_output_by_frag_result(nir, FRAG_RESULT_SAMPLE_MASK);
1185 
1186       struct lp_build_context smask_bld;
1187       lp_build_context_init(&smask_bld, gallivm, int_type);
1188 
1189       assert(smaski >= 0);
1190       output_smask = LLVMBuildLoad2(builder, vec_type, outputs[smaski][0], "smask");
1191       output_smask = LLVMBuildBitCast(builder, output_smask, smask_bld.vec_type, "");
1192       if (!key->multisample && key->no_ms_sample_mask_out) {
1193          output_smask = lp_build_and(&smask_bld, output_smask, smask_bld.one);
1194          output_smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, output_smask, smask_bld.zero);
1195          lp_build_mask_update(&mask, output_smask);
1196       }
1197 
1198       if (key->min_samples > 1) {
1199          /* only the bit corresponding to this sample is to be used. */
1200          LLVMValueRef tmp_mask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "tmp_mask");
1201          LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1202          LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, lp_build_broadcast(gallivm, int_vec_type, out_smask_idx), "");
1203          output_smask = LLVMBuildOr(builder, tmp_mask, smask_bit, "");
1204       }
1205 
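      /* Stash the shader's output sample mask; the per-sample loops further
       * down AND it against the coverage mask (when min_samples > 1 it has
       * been accumulated one sample bit at a time above).
       */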
      LLVMBuildStore(builder, output_smask, out_sample_mask_storage);
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      int pos0 = find_output_by_frag_result(nir, FRAG_RESULT_DEPTH);

      LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[pos0][2], "");
      LLVMValueRef idx = loop_state.counter;
      if (key->min_samples > 1)
         idx = LLVMBuildAdd(builder, idx,
                            LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
      LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
      LLVMBuildStore(builder, out, ptr);
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      int sten_out = find_output_by_frag_result(nir, FRAG_RESULT_STENCIL);

      LLVMValueRef out = LLVMBuildLoad2(builder, vec_type,
                                        outputs[sten_out][1], "output.s");
      LLVMValueRef idx = loop_state.counter;
      if (key->min_samples > 1)
         idx = LLVMBuildAdd(builder, idx,
                            LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
      LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
      LLVMBuildStore(builder, out, ptr);
   }

   bool has_cbuf0_write = false;
   /* Color write - per fragment sample */
   nir_foreach_shader_out_variable(var, nir) {
      if (var->data.location < FRAG_RESULT_DATA0)
         continue;
      int slots = nir_variable_count_slots(var, var->type);

      for (unsigned s = 0; s < slots; s++) {
         unsigned cbuf = get_cbuf_location(var, s);
         unsigned attrib = var->data.driver_location + s;
         if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)) {
            if (cbuf == 0) {
               /* XXX: there is an edge case with FB fetch where gl_FragColor and
                * gl_LastFragData[0] are used together. This creates both
                * FRAG_RESULT_COLOR and FRAG_RESULT_DATA* output variables. This
                * loop then writes to cbuf 0 twice, overwriting the correct value
                * from gl_FragColor with some garbage. This case is exercised in
                * one of the dEQP tests.  A similar bug can happen if
                * gl_SecondaryFragColorEXT and gl_LastFragData[1] are mixed in
                * the same fashion...  This workaround will break if
                * gl_LastFragData[0] goes in the outputs list before
                * gl_FragColor. This doesn't seem to happen though.
                */
               if (has_cbuf0_write)
                  continue;
               has_cbuf0_write = true;
            }

            for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
               if (outputs[attrib][chan]) {
                  /* XXX: just initialize outputs to point at colors[] and
                   * skip this.
                   */
                  LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[attrib][chan], "");
                  LLVMValueRef color_ptr;
                  LLVMValueRef color_idx = loop_state.counter;
                  if (key->min_samples > 1)
                     color_idx = LLVMBuildAdd(builder, color_idx,
                                              LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
                  color_ptr = LLVMBuildGEP2(builder, vec_type, out_color[cbuf][chan],
                                            &color_idx, 1, "");
                  lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
                  LLVMBuildStore(builder, out, color_ptr);
               }
            }
         }
      }
   }

   if (key->multisample && key->min_samples > 1) {
      LLVMBuildStore(builder, lp_build_mask_value(&mask), s_mask_ptr);
      lp_build_for_loop_end(&sample_loop_state);
   }

   if (key->multisample) {
      /* execute depth test for each sample */
      lp_build_for_loop_begin(&sample_loop_state, gallivm,
                              lp_build_const_int32(gallivm, 0),
                              LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
                              lp_build_const_int32(gallivm, 1));

      /* load the per-sample coverage mask */
      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
      s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
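      /* Note: mask_store is laid out sample-major, so the entry for this
       * iteration lives at sample index * num_loop + fragment loop counter.
       */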

      /* combine the execution mask post fragment shader with the coverage mask. */
      s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
      if (key->min_samples == 1)
         s_mask = LLVMBuildAnd(builder, s_mask, lp_build_mask_value(&mask), "");

      /* if the shader writes the sample mask use that, but only if this isn't
       * genuine early-depth, to avoid breaking the occlusion query */
      if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
          (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & (EARLY_DEPTH_TEST_INFERRED)))) {
         LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
         out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
         LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
         LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
         LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
         smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");

         s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
      }
   }

   depth_ptr = depth_base_ptr;
   if (key->multisample) {
      LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
      depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
                                depth_ptr, &sample_offset, 1, "");
   }

   /* Late Z test */
   if (depth_mode & LATE_DEPTH_TEST) {
      if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
         LLVMValueRef idx = loop_state.counter;
         if (key->min_samples > 1)
            idx = LLVMBuildAdd(builder, idx,
                               LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
         LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
         z = LLVMBuildLoad2(builder, vec_type, ptr, "output.z");
      } else {
         if (key->multisample) {
            lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL);
            z = interp->pos[2];
         }
      }

      /*
       * Clamp according to ARB_depth_clamp semantics.
       */
      z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
                               key->restrict_depth_values, type,
                               context_type, context_ptr,
                               thread_data_type, thread_data_ptr, z);

      if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
         LLVMValueRef idx = loop_state.counter;
         if (key->min_samples > 1)
            idx = LLVMBuildAdd(builder, idx,
                               LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
         LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
         stencil_refs[0] = LLVMBuildLoad2(builder, vec_type, ptr, "output.s");
         /* there's only one value, and spec says to discard additional bits */
         LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
         stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
         stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
         stencil_refs[1] = stencil_refs[0];
      }

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);

      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  key->multisample ? NULL : &mask,
                                  &s_mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  false,
                                  key->restrict_depth_values);
      /* Late Z write */
      if (depth_mode & LATE_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
   } else if ((depth_mode & EARLY_DEPTH_TEST) &&
              (depth_mode & LATE_DEPTH_WRITE)) {
      /* Need to apply a reduced mask to the depth write.  Reload the
       * depth value, update from zs_value with the new mask value and
       * write that out.
       */
      if (key->multisample) {
         z_value = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_sample_value_store, sample_loop_state.counter), z_type, "");
         s_value = lp_build_pointer_get2(builder, int_vec_type, s_sample_value_store, sample_loop_state.counter);
         z_fb = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_fb_store, sample_loop_state.counter), z_fb_type, "");
         s_fb = lp_build_pointer_get2(builder, int_vec_type, s_fb_store, sample_loop_state.counter);
      }
      lp_build_depth_stencil_write_swizzled(gallivm, type,
                                            zs_format_desc, key->resource_1d,
                                            key->multisample ? s_mask : lp_build_mask_value(&mask), z_fb, s_fb, loop_state.counter,
                                            depth_ptr, depth_stride,
                                            z_value, s_value);
   }

   if (key->occlusion_count && (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & EARLY_DEPTH_TEST_INFERRED))) {
      LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
      lp_build_name(counter, "counter");

      lp_build_occlusion_count(gallivm, type,
                               key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
   }

   /* if this is genuine early-depth in the shader, write the sample mask now,
    * after the occlusion count has been updated
    */
   if (key->multisample &&
       nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
       (depth_mode & (EARLY_DEPTH_TEST_INFERRED | EARLY_DEPTH_TEST)) == EARLY_DEPTH_TEST) {
      /* if the shader writes the sample mask use that */
      LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
      out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
      LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
      LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
      LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
      smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");

      s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
   }


   if (key->multisample) {
      /* store the sample mask for this loop */
      LLVMBuildStore(builder, s_mask, s_mask_ptr);
      lp_build_for_loop_end(&sample_loop_state);
   }

   mask_val = lp_build_mask_end(&mask);
   if (!key->multisample)
      LLVMBuildStore(builder, mask_val, mask_ptr);
   lp_build_for_loop_end(&loop_state);
}


/**
 * This function reorders pixels from the fragment shader SoA layout to the
 * AoS memory layout.
 *
 * Fragment Shader outputs pixels in small 2x2 blocks
 *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
 *
 * However in memory pixels are stored in rows
 *  e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
 *
 * @param type            fragment shader type (4x or 8x float)
 * @param num_fs          number of fs_src
 * @param dst_channels    number of output channels
 * @param fs_src          output from fragment shader
 * @param dst             pointer to store result
 * @param pad_inline      is channel padding inline or at end of row
 * @return                the number of dsts
 */
static int
generate_fs_twiddle(struct gallivm_state *gallivm,
                    struct lp_type type,
                    unsigned num_fs,
                    unsigned dst_channels,
                    LLVMValueRef fs_src[][4],
                    LLVMValueRef* dst,
                    bool pad_inline)
{
   LLVMValueRef src[16];
   unsigned pixels = type.length / 4;
   unsigned src_channels = dst_channels < 3 ? dst_channels : 4;
   unsigned src_count = num_fs * src_channels;

   assert(pixels == 2 || pixels == 1);
   assert(num_fs * src_channels <= ARRAY_SIZE(src));

   /*
    * Transpose from SoA -> AoS
    */
   for (unsigned i = 0; i < num_fs; ++i) {
      lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels,
                               &src[i * src_channels]);
   }

   /*
    * Pick transformation options
    */
   bool swizzle_pad = false;
   bool twiddle = false;
   bool split = false;
   unsigned reorder_group = 0;

   if (dst_channels == 1) {
      twiddle = true;
      if (pixels == 2) {
         split = true;
      }
   } else if (dst_channels == 2) {
      if (pixels == 1) {
         reorder_group = 1;
      }
   } else if (dst_channels > 2) {
      if (pixels == 1) {
         reorder_group = 2;
      } else {
         twiddle = true;
      }

      if (!pad_inline && dst_channels == 3 && pixels > 1) {
         swizzle_pad = true;
      }
   }
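
   /* For example (illustrative): an 8-wide FS (pixels == 2) writing 4
    * channels ends up with twiddle set, whereas a 4-wide FS (pixels == 1)
    * with the same channel count only needs reorder_group == 2; both are
    * applied below.
    */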

   /*
    * Split the src in half
    */
   if (split) {
      for (unsigned i = num_fs; i > 0; --i) {
         src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
         src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
      }

      src_count *= 2;
      type.length = 4;
   }

   /*
    * Ensure pixels are in memory order
    */
   if (reorder_group) {
      /* Twiddle pixels by reordering the array, e.g.:
       *
       * src_count =  8 -> 0 2 1 3 4 6 5 7
       * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
       */
      const unsigned reorder_sw[] = { 0, 2, 1, 3 };

      for (unsigned i = 0; i < src_count; ++i) {
         unsigned group = i / reorder_group;
         unsigned block = (group / 4) * 4 * reorder_group;
         unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
         dst[i] = src[j];
      }
   } else if (twiddle) {
      /* Twiddle pixels across elements of array */
      /*
       * XXX: we should avoid this in some cases, but would need to tell
       * lp_build_conv to reorder (or deal with it ourselves).
       */
      lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
   } else {
      /* Do nothing */
      memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
   }

   /*
    * Moves any padding between pixels to the end
    * e.g. RGBXRGBX -> RGBRGBXX
    */
   if (swizzle_pad) {
      unsigned char swizzles[16];
      unsigned elems = pixels * dst_channels;

      for (unsigned i = 0; i < type.length; ++i) {
         if (i < elems)
            swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
         else
            swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
      }

      for (unsigned i = 0; i < src_count; ++i) {
         dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles,
                                         type.length, type.length);
      }
   }

   return src_count;
}


/*
 * Untwiddle and transpose, much like the above.
 * However, this is after conversion, so we get packed vectors.
 * At this time we only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data;
 * the vectors will look like:
 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
 * be swizzled here). Extending to 16bit should be trivial.
 * Should also be extended to handle twice wide vectors with AVX2...
 */
static void
fs_twiddle_transpose(struct gallivm_state *gallivm,
                     struct lp_type type,
                     LLVMValueRef *src,
                     unsigned src_count,
                     LLVMValueRef *dst)
{
   struct lp_type type64, type16, type32;
   LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef tmp[4], shuf[8];
   for (unsigned j = 0; j < 2; j++) {
      shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
      shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
      shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
      shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
   }
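   /* shuf now holds the 0,2,1,3 pattern for each group of four, i.e. it
    * swaps the middle two elements of every quad; this is the untwiddle
    * pattern used by the shuffles below.
    */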

   assert(src_count == 4 || src_count == 2 || src_count == 1);
   assert(type.width == 8);
   assert(type.length == 16);

   type8_t = lp_build_vec_type(gallivm, type);

   type64 = type;
   type64.length /= 8;
   type64.width *= 8;
   type64_t = lp_build_vec_type(gallivm, type64);

   type16 = type;
   type16.length /= 2;
   type16.width *= 2;
   type16_t = lp_build_vec_type(gallivm, type16);

   type32 = type;
   type32.length /= 4;
   type32.width *= 4;
   type32_t = lp_build_vec_type(gallivm, type32);

   lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);

   if (src_count == 1) {
      /* transpose was no-op, just untwiddle */
      LLVMValueRef shuf_vec;
      shuf_vec = LLVMConstVector(shuf, 8);
      tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
      tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
      dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
   } else if (src_count == 2) {
      LLVMValueRef shuf_vec;
      shuf_vec = LLVMConstVector(shuf, 4);

      for (unsigned i = 0; i < 2; i++) {
         tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
         tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
         dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
      }
   } else {
      for (unsigned j = 0; j < 2; j++) {
         LLVMValueRef lo, hi, lo2, hi2;
1658           /*
1659           * Note that if we only really have 3 valid channels (rgb)
1660           * and we don't need alpha we could substitute a undef here
1661           * for the respective channel (causing llvm to drop conversion
1662           * for alpha).
1663           */
         /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
         lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
         hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
         lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
         hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
         dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
         dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
      }
   }
}


/**
 * Load an unswizzled block of pixels from memory
 */
static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMTypeRef base_type,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef* dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   const unsigned row_size = dst_count / block_height;

   /* Ensure block exactly fits into dst */
   assert((block_width * block_height) % dst_count == 0);

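   /* Each iteration loads one vector of dst_type; (x, y) is its position
    * within the block and bx/by the corresponding byte offsets, with by
    * scaled by the row stride.
    */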
   for (unsigned i = 0; i < dst_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef dst_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      dst_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
      dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");

      dst[i] = LLVMBuildLoad2(builder,
                              lp_build_vec_type(gallivm, dst_type),
                              dst_ptr, "");

      LLVMSetAlignment(dst[i], dst_alignment);
   }
}


/**
 * Store an unswizzled block of pixels to memory
 */
static void
store_unswizzled_block(struct gallivm_state *gallivm,
                       LLVMTypeRef base_type,
                       LLVMValueRef base_ptr,
                       LLVMValueRef stride,
                       unsigned block_width,
                       unsigned block_height,
                       LLVMValueRef src[],   // [src_count]
                       struct lp_type src_type,
                       unsigned src_count,
                       unsigned src_alignment)
{
   LLVMBuilderRef builder = gallivm->builder;
   const unsigned row_size = src_count / block_height;

   /* Ensure src exactly fits into block */
   assert((block_width * block_height) % src_count == 0);

   for (unsigned i = 0; i < src_count; ++i) {
      unsigned x = i % row_size;
      unsigned y = i / row_size;

      LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
      LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");

      LLVMValueRef gep[2];
      LLVMValueRef src_ptr;

      gep[0] = lp_build_const_int32(gallivm, 0);
      gep[1] = LLVMBuildAdd(builder, bx, by, "");

      src_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
      src_ptr = LLVMBuildBitCast(builder, src_ptr,
                                 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");

      src_ptr = LLVMBuildStore(builder, src[i], src_ptr);

      LLVMSetAlignment(src_ptr, src_alignment);
   }
}



/**
 * Retrieves the type for a format which is usable in the blending code.
 *
 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
 */
static inline void
lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
                               struct lp_type* type)
{
   if (format_expands_to_float_soa(format_desc)) {
      /* always use ordinary floats for blending */
      type->floating = true;
      type->fixed = false;
      type->sign = true;
      type->norm = false;
      type->width = 32;
      type->length = 4;
      return;
   }

   const int chan = util_format_get_first_non_void_channel(format_desc->format);

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;
   type->width    = format_desc->channel[chan].size;
   type->length   = format_desc->nr_channels;

   for (unsigned i = 1; i < format_desc->nr_channels; ++i) {
      if (format_desc->channel[i].size > type->width)
         type->width = format_desc->channel[i].size;
   }

   if (type->floating) {
      type->width = 32;
   } else {
      if (type->width <= 8) {
         type->width = 8;
      } else if (type->width <= 16) {
         type->width = 16;
      } else {
         type->width = 32;
      }
   }

   if (is_arithmetic_format(format_desc) && type->length == 3) {
      type->length = 4;
   }
}


/**
 * Scale a normalized value from src_bits to dst_bits.
 *
 * The exact calculation is
 *
 *    dst = iround(src * dst_mask / src_mask)
 *
 *  or with integer rounding
 *
 *    dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
 *
 *  where
 *
 *    src_mask = (1 << src_bits) - 1
 *    dst_mask = (1 << dst_bits) - 1
 *
 * but we try to avoid division and multiplication through shifts.
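 *
 * For example (illustration only): expanding 5 bits to 8 bits below uses
 * bit replication, dst = (src << 3) | (src >> 2), so 0x1f maps to 0xff.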
 */
static inline LLVMValueRef
scale_bits(struct gallivm_state *gallivm,
           int src_bits,
           int dst_bits,
           LLVMValueRef src,
           struct lp_type src_type)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef result = src;

   if (dst_bits < src_bits) {
      int delta_bits = src_bits - dst_bits;

      if (delta_bits <= dst_bits) {

         if (dst_bits == 4) {
            struct lp_type flt_type =
               lp_type_float_vec(32, src_type.length * 32);

            result = lp_build_unsigned_norm_to_float(gallivm, src_bits,
                                                     flt_type, src);
            result = lp_build_clamped_float_to_unsigned_norm(gallivm, flt_type,
                                                             dst_bits, result);
            result = LLVMBuildTrunc(gallivm->builder, result,
                                    lp_build_int_vec_type(gallivm, src_type),
                                    "");
            return result;
         }

         /*
          * Approximate the rescaling with a single shift.
          *
          * This gives the wrong rounding.
          */

         result = LLVMBuildLShr(builder, src,
                                lp_build_const_int_vec(gallivm, src_type,
                                                       delta_bits),
                                "");
      } else {
         /*
          * Try more accurate rescaling.
          */

         /*
          * Drop the least significant bits to make space for the
          * multiplication.
          *
          * XXX: A better approach would be to use a wider integer type as
          * intermediate.  But this is enough to convert alpha from 16bits ->
          * 2 when rendering to PIPE_FORMAT_R10G10B10A2_UNORM.
          */
         result = LLVMBuildLShr(builder, src,
                                lp_build_const_int_vec(gallivm, src_type,
                                                       dst_bits),
                                "");


         result = LLVMBuildMul(builder, result,
                               lp_build_const_int_vec(gallivm, src_type,
                                                      (1LL << dst_bits) - 1),
                               "");

         /*
          * Add a rounding term before the division.
          *
          * TODO: Handle signed integers too.
          */
         if (!src_type.sign) {
            result = LLVMBuildAdd(builder, result,
                                  lp_build_const_int_vec(gallivm, src_type,
                                                    (1LL << (delta_bits - 1))),
                                  "");
         }

         /*
          * Approximate the division by src_mask with a src_bits shift.
          *
          * Given the src has already been shifted by dst_bits, all we need
          * to do is to shift by the difference.
          */

         result = LLVMBuildLShr(builder,
                                result,
                                lp_build_const_int_vec(gallivm, src_type, delta_bits),
                                "");
      }

   } else if (dst_bits > src_bits) {
      /* Scale up bits */
      int db = dst_bits - src_bits;

      /* Shift left by difference in bits */
      result = LLVMBuildShl(builder,
                            src,
                            lp_build_const_int_vec(gallivm, src_type, db),
                            "");

      if (db <= src_bits) {
         /* Enough bits in src to fill the remainder */
         LLVMValueRef lower = LLVMBuildLShr(builder,
                                            src,
                                            lp_build_const_int_vec(gallivm, src_type, src_bits - db),
                                            "");

         result = LLVMBuildOr(builder, result, lower, "");
      } else if (db > src_bits) {
         /* Need to repeatedly copy src bits to fill remainder in dst */
         unsigned n;

         for (n = src_bits; n < dst_bits; n *= 2) {
            LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);

            result = LLVMBuildOr(builder,
                                 result,
                                 LLVMBuildLShr(builder, result, shuv, ""),
                                 "");
         }
      }
   }

   return result;
}

/**
 * If RT is a smallfloat (needing denorms) format
 */
static inline int
have_smallfloat_format(struct lp_type dst_type,
                       enum pipe_format format)
{
   return ((dst_type.floating && dst_type.width != 32) ||
    /* due to format handling hacks this format doesn't have floating set
     * here (and actually has width set to 32 too) so special case this.
     */
    (format == PIPE_FORMAT_R11G11B10_FLOAT));
}


/**
 * Convert from memory format to blending format
 *
 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
 */
static void
convert_to_blend_type(struct gallivm_state *gallivm,
                      unsigned block_size,
                      const struct util_format_description *src_fmt,
                      struct lp_type src_type,
                      struct lp_type dst_type,
                      LLVMValueRef* src, // and dst
                      unsigned num_srcs)
{
   LLVMValueRef *dst = src;
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type blend_type;
   struct lp_type mem_type;
   unsigned i, j;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * full custom path for packed floats and srgb formats - none of the later
    * functions would do anything useful, and given the lp_type representation
    * they can't be fixed. Should really have some SoA blend path for these
    * kinds of formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      LLVMValueRef tmpsrc[4];
      /*
       * This is pretty suboptimal for this case; blending in SoA would be much
       * better, since conversion gets us SoA values which we then need to convert back.
       */
      assert(src_type.width == 32 || src_type.width == 16);
      assert(dst_type.floating);
      assert(dst_type.width == 32);
      assert(dst_type.length % 4 == 0);
      assert(num_srcs % 4 == 0);

      if (src_type.width == 16) {
         /* expand 4x16bit values to 4x32bit */
         struct lp_type type32x4 = src_type;
         LLVMTypeRef ltype32x4;
         unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
         type32x4.width = 32;
         ltype32x4 = lp_build_vec_type(gallivm, type32x4);
         for (i = 0; i < num_fetch; i++) {
            src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
         }
         src_type.width = 32;
      }
      for (i = 0; i < 4; i++) {
         tmpsrc[i] = src[i];
      }
      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4];
         LLVMValueRef tmps = tmpsrc[i];
         if (dst_type.length == 8) {
            LLVMValueRef shuffles[8];
            unsigned j;
            /* fetch was 4 values but need 8-wide output values */
            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
            /*
             * for 8-wide AoS the transpose would give us the wrong order, not
             * matching the incoming converted fs values and mask. ARGH.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
            }
            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
                                          LLVMConstVector(shuffles, 8), "");
         }
         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
         } else {
            lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
         }
         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   /* Is the format arithmetic */
   is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
   is_arith &= !(mem_type.width == 16 && mem_type.floating);
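   /* E.g. (illustrative) R3G3B2 is packed into a single byte in memory but
    * expands to a byte per channel for blending, so the arithmetic unpack
    * below applies; half-float formats are instead handled by the special
    * case further down.
    */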

   /* Pad if necessary */
   if (!is_arith && src_type.length < dst_type.length) {
      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
      }

      src_type.length = dst_type.length;
   }

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      assert(blend_type.width == 32 && blend_type.floating);
      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
      is_arith = false;
   }

   if (!is_arith) {
      return;
   }

   src_type.width = blend_type.width * blend_type.length;
   blend_type.length *= pixels;
   src_type.length *= pixels / (src_type.length / mem_type.length);

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans;
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
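      /* Widen the packed pixel value first so each channel extracted below
       * can be shifted into its own blend_type.width-sized lane.
       */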

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
#if UTIL_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = (blend_type.length / pixels) - j - 1;
#endif

         mask = (1 << src_fmt->channel[j].size) - 1;

         /* Extract bits from source */
         chans = LLVMBuildLShr(builder,
                               dst[i],
                               lp_build_const_int_vec(gallivm, src_type, sa),
                               "");

         chans = LLVMBuildAnd(builder,
                              chans,
                              lp_build_const_int_vec(gallivm, src_type, mask),
                              "");

         /* Scale bits */
         if (src_type.norm) {
            chans = scale_bits(gallivm, src_fmt->channel[j].size,
                               blend_type.width, chans, src_type);
         }

         /* Insert bits into correct position */
         chans = LLVMBuildShl(builder,
                              chans,
                              lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
                              "");

         if (j == 0) {
            res = chans;
         } else {
            res = LLVMBuildOr(builder, res, chans, "");
         }
      }

      dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
   }
}


/**
 * Convert from blending format to memory format
 *
 * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
 */
static void
convert_from_blend_type(struct gallivm_state *gallivm,
                        unsigned block_size,
                        const struct util_format_description *src_fmt,
                        struct lp_type src_type,
                        struct lp_type dst_type,
                        LLVMValueRef* src, // and dst
                        unsigned num_srcs)
{
   LLVMValueRef* dst = src;
   unsigned i, j, k;
   struct lp_type mem_type;
   struct lp_type blend_type;
   LLVMBuilderRef builder = gallivm->builder;
   unsigned pixels = block_size / num_srcs;
   bool is_arith;

   /*
    * full custom path for packed floats and srgb formats - none of the later
    * functions would do anything useful, and given the lp_type representation
    * they can't be fixed. Should really have some SoA blend path for these
    * kinds of formats rather than hacking them in here.
    */
   if (format_expands_to_float_soa(src_fmt)) {
      /*
       * This is pretty suboptimal for this case; blending in SoA would be much
       * better - we need to transpose the AoS values back to SoA values for
       * conversion/packing.
       */
      assert(src_type.floating);
      assert(src_type.width == 32);
      assert(src_type.length % 4 == 0);
      assert(dst_type.width == 32 || dst_type.width == 16);

      for (i = 0; i < num_srcs / 4; i++) {
         LLVMValueRef tmpsoa[4], tmpdst;
         lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
         /* really really need SoA here */

         if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
            tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
         } else {
            tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
                                                   src_type, tmpsoa);
         }

         if (src_type.length == 8) {
            LLVMValueRef tmpaos, shuffles[8];
            unsigned j;
            /*
             * for 8-wide AoS the transpose has given us the wrong order, not
             * matching the output order. HMPF. Also need to split the output
             * values manually.
             */
            for (j = 0; j < 4; j++) {
               shuffles[j * 2] = lp_build_const_int32(gallivm, j);
               shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
            }
            tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
                                            LLVMConstVector(shuffles, 8), "");
            src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
            src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
         } else {
            src[i] = tmpdst;
         }
      }
      if (dst_type.width == 16) {
         struct lp_type type16x8 = dst_type;
         struct lp_type type32x4 = dst_type;
         LLVMTypeRef ltype16x4, ltypei64, ltypei128;
         unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
         type16x8.length = 8;
         type32x4.width = 32;
         ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
         ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
         ltype16x4 = lp_build_vec_type(gallivm, dst_type);
         /* We could do vector truncation but it doesn't generate very good code */
         for (i = 0; i < num_fetch; i++) {
            src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
                                    src[i], lp_build_zero(gallivm, type32x4));
            src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
            src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
            src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
         }
      }
      return;
   }

   lp_mem_type_from_format_desc(src_fmt, &mem_type);
   lp_blend_type_from_format_desc(src_fmt, &blend_type);

   is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);

   /* Special case for half-floats */
   if (mem_type.width == 16 && mem_type.floating) {
      int length = dst_type.length;
      assert(blend_type.width == 32 && blend_type.floating);

      dst_type.length = src_type.length;

      lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);

      dst_type.length = length;
      is_arith = false;
   }

   /* Remove any padding */
   if (!is_arith && (src_type.length % mem_type.length)) {
      src_type.length -= (src_type.length % mem_type.length);

      for (i = 0; i < num_srcs; ++i) {
         dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
      }
   }

   /* No bit arithmetic to do */
   if (!is_arith) {
      return;
   }

   src_type.length = pixels;
   src_type.width = blend_type.length * blend_type.width;
   dst_type.length = pixels;

   for (i = 0; i < num_srcs; ++i) {
      LLVMValueRef chans;
      LLVMValueRef res = NULL;

      dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");

      for (j = 0; j < src_fmt->nr_channels; ++j) {
         unsigned mask = 0;
         unsigned sa = src_fmt->channel[j].shift;
         unsigned sz_a = src_fmt->channel[j].size;
#if UTIL_ARCH_LITTLE_ENDIAN
         unsigned from_lsb = j;
#else
         unsigned from_lsb = blend_type.length - j - 1;
#endif

         assert(blend_type.width > src_fmt->channel[j].size);

         for (k = 0; k < blend_type.width; ++k) {
            mask |= 1 << k;
         }

         /* Extract bits */
         chans = LLVMBuildLShr(builder,
                               dst[i],
                               lp_build_const_int_vec(gallivm, src_type,
                                                      from_lsb * blend_type.width),
                               "");

         chans = LLVMBuildAnd(builder,
                              chans,
                              lp_build_const_int_vec(gallivm, src_type, mask),
                              "");

         /* Scale down bits, or clamp non-normalized integers to the channel's range */
         if (src_type.norm) {
            chans = scale_bits(gallivm, blend_type.width,
                               src_fmt->channel[j].size, chans, src_type);
         } else if (!src_type.floating && sz_a < blend_type.width) {
            LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
            LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans, mask_val, "");
            chans = LLVMBuildSelect(builder, mask, mask_val, chans, "");
         }

         /* Insert bits */
         chans = LLVMBuildShl(builder,
                              chans,
                              lp_build_const_int_vec(gallivm, src_type, sa),
                              "");

         sa += src_fmt->channel[j].size;

         if (j == 0) {
            res = chans;
         } else {
            res = LLVMBuildOr(builder, res, chans, "");
         }
      }

      assert (dst_type.width != 24);

      dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
   }
}


/**
 * Convert alpha to same blend type as src
 */
static void
convert_alpha(struct gallivm_state *gallivm,
              struct lp_type row_type,
              struct lp_type alpha_type,
              const unsigned block_size,
              const unsigned block_height,
              const unsigned src_count,
              const unsigned dst_channels,
              const bool pad_inline,
              LLVMValueRef* src_alpha)
{
   LLVMBuilderRef builder = gallivm->builder;
   const unsigned length = row_type.length;
   row_type.length = alpha_type.length;

   /* Twiddle the alpha to match pixels */
   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);

   /*
    * TODO this should use a single lp_build_conv call for
    * src_count == 1 && dst_channels == 1 case (dropping the concat below)
    */
   for (unsigned i = 0; i < block_height; ++i) {
      lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1,
                    &src_alpha[i], 1);
   }

   alpha_type = row_type;
   row_type.length = length;

   /* If there's only one channel we only need the single alpha value per pixel */
   if (src_count == 1 && dst_channels == 1) {
      lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height,
                        src_alpha, src_count);
   } else {
      /* If there are more srcs than rows then we need to split alpha up */
      if (src_count > block_height) {
         for (unsigned i = src_count; i > 0; --i) {
            unsigned pixels = block_size / src_count;
            unsigned idx = i - 1;

            src_alpha[idx] =
               lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
                                      (idx * pixels) % 4, pixels);
         }
      }

      /* If there is a src for each pixel, broadcast the alpha across the
       * whole row
       */
      if (src_count == block_size) {
         for (unsigned i = 0; i < src_count; ++i) {
            src_alpha[i] = lp_build_broadcast(gallivm,
                              lp_build_vec_type(gallivm, row_type), src_alpha[i]);
         }
      } else {
         unsigned pixels = block_size / src_count;
         unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
         unsigned alpha_span = 1;
         LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

         /* Check if we need 2 src_alphas for our shuffles */
         if (pixels > alpha_type.length) {
            alpha_span = 2;
         }

         /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
         for (unsigned j = 0; j < row_type.length; ++j) {
            if (j < pixels * channels) {
               shuffles[j] = lp_build_const_int32(gallivm, j / channels);
            } else {
               shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
            }
         }

         for (unsigned i = 0; i < src_count; ++i) {
            unsigned idx1 = i, idx2 = i;

            if (alpha_span > 1) {
               idx1 *= alpha_span;
               idx2 = idx1 + 1;
            }

            src_alpha[i] = LLVMBuildShuffleVector(builder,
                                                  src_alpha[idx1],
                                                  src_alpha[idx2],
                                                  LLVMConstVector(shuffles, row_type.length),
                                                  "");
         }
      }
   }
}


/**
 * Generates the blend function for unswizzled colour buffers.
 * Also generates the read & write from the colour buffer.
 */
static void
generate_unswizzled_blend(struct gallivm_state *gallivm,
                          unsigned rt,
                          struct lp_fragment_shader_variant *variant,
                          enum pipe_format out_format,
                          unsigned int num_fs,
                          struct lp_type fs_type,
                          LLVMValueRef* fs_mask,
                          LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
                          LLVMTypeRef context_type,
                          LLVMValueRef context_ptr,
                          LLVMTypeRef color_type,
                          LLVMValueRef color_ptr,
                          LLVMValueRef stride,
                          unsigned partial_mask,
                          bool do_branch)
{
   const unsigned alpha_channel = 3;
   const unsigned block_width = LP_RASTER_BLOCK_SIZE;
   const unsigned block_height = LP_RASTER_BLOCK_SIZE;
   const unsigned block_size = block_width * block_height;
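   /* i.e. one LP_RASTER_BLOCK_SIZE x LP_RASTER_BLOCK_SIZE (currently 4x4)
    * block of pixels is blended per call.
    */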
   const unsigned lp_integer_vector_width = 128;

   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
   LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
   LLVMValueRef src_alpha[4 * 4];
   LLVMValueRef src1_alpha[4 * 4] = { NULL };
   LLVMValueRef src_mask[4 * 4];
   LLVMValueRef src[4 * 4];
   LLVMValueRef src1[4 * 4];
   LLVMValueRef dst[4 * 4];

   struct lp_build_mask_context mask_ctx;

   unsigned char swizzle[TGSI_NUM_CHANNELS];
   unsigned src_channels = TGSI_NUM_CHANNELS;

   const struct util_format_description *out_format_desc =
      util_format_description(out_format);

   bool pad_inline = is_arithmetic_format(out_format_desc);
   const bool dual_source_blend =
      variant->key.blend.rt[0].blend_enable &&
      util_blend_state_is_dual(&variant->key.blend, 0);

   const bool is_1d = variant->key.resource_1d;
   const unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
   LLVMValueRef fpstate = NULL;

   LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);

   /* Get type from output format */
   struct lp_type row_type, dst_type;
   lp_blend_type_from_format_desc(out_format_desc, &row_type);
   lp_mem_type_from_format_desc(out_format_desc, &dst_type);

2499    /*
2500     * Technically this code should go into lp_build_smallfloat_to_float
2501     * and lp_build_float_to_smallfloat, but due to
2502     * http://llvm.org/bugs/show_bug.cgi?id=6393
2503     * llvm reorders the mxcsr intrinsics in a way that breaks the code.
2504     * So the ordering is important here, and there shouldn't be any
2505     * llvm IR instructions in this function before this point, otherwise
2506     * half-float format conversions won't work (again due to llvm
2507     * bug #6393).
2508     */
2509    if (have_smallfloat_format(dst_type, out_format)) {
2510       /* We need to make sure that denorms are ok for half float
2511          conversions */
2512       fpstate = lp_build_fpstate_get(gallivm);
2513       lp_build_fpstate_set_denorms_zero(gallivm, false);
2514    }
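   /*
    * Editor's sketch (an assumption, matching the save/restore pattern used
    * in this function): the MXCSR handling amounts to
    *
    *    fpstate = lp_build_fpstate_get(gallivm);            // save MXCSR
    *    lp_build_fpstate_set_denorms_zero(gallivm, false);  // allow denorms
    *    ... emit all conversion/blend code ...
    *    lp_build_fpstate_set(gallivm, fpstate);             // restore MXCSR
    *
    * with the matching restore at the very end of this function.
    */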
2515 
2516    struct lp_type mask_type = lp_int32_vec4_type();
2517    mask_type.length = fs_type.length;
2518 
2519    for (unsigned i = num_fs; i < num_fullblock_fs; i++) {
2520       fs_mask[i] = lp_build_zero(gallivm, mask_type);
2521    }
2522 
2523    /* Do not bother executing code when the mask is empty. */
2524    if (do_branch) {
2525       LLVMValueRef check_mask =
2526          LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
2527 
2528       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2529          check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
2530       }
2531 
2532       lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
2533       lp_build_mask_check(&mask_ctx);
2534    }
2535 
2536    partial_mask |= !variant->opaque;
2537    LLVMValueRef i32_zero = lp_build_const_int32(gallivm, 0);
2538 
2539    LLVMValueRef undef_src_val = lp_build_undef(gallivm, fs_type);
2540 
2541    row_type.length = fs_type.length;
2542    unsigned vector_width =
2543       dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
2544 
2545    /* Compute correct swizzle and count channels */
2546    memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
2547    unsigned dst_channels = 0;
2548 
2549    bool has_alpha = false;
2550    for (unsigned i = 0; i < TGSI_NUM_CHANNELS; ++i) {
2551       /* Ensure channel is used */
2552       if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
2553          continue;
2554       }
2555 
2556       /* Ensure not already written to (happens in the case of GL_ALPHA) */
2557       if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
2558          continue;
2559       }
2560 
2561       /* Ensure we haven't already found all channels */
2562       if (dst_channels >= out_format_desc->nr_channels) {
2563          continue;
2564       }
2565 
2566       swizzle[out_format_desc->swizzle[i]] = i;
2567       ++dst_channels;
2568 
2569       if (i == alpha_channel) {
2570          has_alpha = true;
2571       }
2572    }
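   /*
    * Worked example (illustrative, not from the original source): for
    * PIPE_FORMAT_B8G8R8A8_UNORM the format description swizzle is {2,1,0,3},
    * so the loop above produces swizzle = {2,1,0,3} (memory channel 0 takes
    * shader output channel 2, i.e. blue), dst_channels = 4 and
    * has_alpha = true.
    */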
2573 
2574    if (format_expands_to_float_soa(out_format_desc)) {
2575       /*
2576        * the code above can't work for layout_other
2577        * for srgb it would sort of work but we short-circuit swizzles, etc.
2578        * as that is done as part of unpack / pack.
2579        */
2580       dst_channels = 4; /* HACK: this is really a fake 4, but we need it due to the transpose stuff later */
2581       has_alpha = true;
2582       swizzle[0] = 0;
2583       swizzle[1] = 1;
2584       swizzle[2] = 2;
2585       swizzle[3] = 3;
2586       pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
2587    }
2588 
2589    /* If 3 channels then pad to include alpha for 4 element transpose */
2590    if (dst_channels == 3) {
2591       assert(!has_alpha);
2592       for (unsigned i = 0; i < TGSI_NUM_CHANNELS; i++) {
2593          if (swizzle[i] > TGSI_NUM_CHANNELS)
2594             swizzle[i] = 3;
2595       }
2596       if (out_format_desc->nr_channels == 4) {
2597          dst_channels = 4;
2598          /*
2599           * We use the alpha from the color conversion, not a separate one.
2600           * We had to include it for the transpose, hence it will get
2601           * converted too (albeit when doing the transpose after conversion,
2602           * that would no longer necessarily be the case).
2603           * (It works only with 4-channel dsts, e.g. rgbx formats, because
2604           * otherwise we really have padding, not alpha, included.)
2605           */
2606          has_alpha = true;
2607       }
2608    }
2609 
2610    /*
2611     * Load shader output
2612     */
2613    for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2614       /* Always load alpha for use in blending */
2615       LLVMValueRef alpha;
2616       if (i < num_fs) {
2617          alpha = LLVMBuildLoad2(builder, fs_vec_type,
2618                                 fs_out_color[rt][alpha_channel][i], "");
2619       } else {
2620          alpha = undef_src_val;
2621       }
2622 
2623       /* Load each channel */
2624       for (unsigned j = 0; j < dst_channels; ++j) {
2625          assert(swizzle[j] < 4);
2626          if (i < num_fs) {
2627             fs_src[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2628                                           fs_out_color[rt][swizzle[j]][i], "");
2629          } else {
2630             fs_src[i][j] = undef_src_val;
2631          }
2632       }
2633 
2634       /* If 3 channels then pad to include alpha for 4 element transpose */
2635       /*
2636        * XXX If we include that here, maybe we could actually use it
2637        * instead of the separate alpha for blending?
2638        * (Difficult though, since we actually convert pad channels, not alpha.)
2639        */
2640       if (dst_channels == 3 && !has_alpha) {
2641          fs_src[i][3] = alpha;
2642       }
2643 
2644       /* We split the row_mask and row_alpha as we want a 128-bit interleave */
2645       if (fs_type.length == 8) {
2646          src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i],
2647                                                      0, src_channels);
2648          src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i],
2649                                                      src_channels,
2650                                                      src_channels);
2651 
2652          src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha,
2653                                                      0, src_channels);
2654          src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2655                                                      src_channels,
2656                                                      src_channels);
2657       } else {
2658          src_mask[i] = fs_mask[i];
2659          src_alpha[i] = alpha;
2660       }
2661    }
2662    if (dual_source_blend) {
2663       /* Same as above except with different src/dst; skip masks and comments... */
2664       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2665          LLVMValueRef alpha;
2666          if (i < num_fs) {
2667             alpha = LLVMBuildLoad2(builder, fs_vec_type,
2668                                    fs_out_color[1][alpha_channel][i], "");
2669          } else {
2670             alpha = undef_src_val;
2671          }
2672 
2673          for (unsigned j = 0; j < dst_channels; ++j) {
2674             assert(swizzle[j] < 4);
2675             if (i < num_fs) {
2676                fs_src1[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2677                                               fs_out_color[1][swizzle[j]][i], "");
2678             } else {
2679                fs_src1[i][j] = undef_src_val;
2680             }
2681          }
2682          if (dst_channels == 3 && !has_alpha) {
2683             fs_src1[i][3] = alpha;
2684          }
2685          if (fs_type.length == 8) {
2686             src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2687             src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2688                                                          src_channels, src_channels);
2689          } else {
2690             src1_alpha[i] = alpha;
2691          }
2692       }
2693    }
2694 
2695    if (util_format_is_pure_integer(out_format)) {
2696       /*
2697        * In this case fs_type was really ints or uints disguised as floats,
2698        * fix that up now.
2699        */
2700       fs_type.floating = 0;
2701       fs_type.sign = dst_type.sign;
2702       fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2703       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2704          for (unsigned j = 0; j < dst_channels; ++j) {
2705             fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
2706                                             fs_vec_type, "");
2707          }
2708          if (dst_channels == 3 && !has_alpha) {
2709             fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
2710                                             fs_vec_type, "");
2711          }
2712       }
2713    }
2714 
2715    /*
2716     * We actually should generally do the conversion first (for non-1d
2717     * cases) when the blend format is 8 or 16 bits. The reason is obvious:
2718     * there are 2 or 4 times fewer vectors to deal with for the interleave...
2719     * Albeit for the AVX (not AVX2) case there's no benefit with 16-bit
2720     * vectors (it can do 32-bit unpack with 256-bit vectors, but 8/16-bit
2721     * unpack only with 128-bit vectors).
2722     * Note: for 16-bit sizes we'd really need matching pack conversion code.
2723     */
2724    bool twiddle_after_convert = false;
2725    if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
2726       twiddle_after_convert = true;
2727    }
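   /*
    * Illustrative numbers (an editor's estimate): with 8-wide fp32 FS
    * vectors and an 8-bit UNORM target, converting before the twiddle packs
    * four float vectors into a single i8 vector, so the transpose below has
    * 4x fewer vectors to shuffle.
    */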
2728 
2729    /*
2730     * Pixel twiddle from fragment shader order to memory order
2731     */
2732    unsigned src_count;
2733    if (!twiddle_after_convert) {
2734       src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
2735                                       dst_channels, fs_src, src, pad_inline);
2736       if (dual_source_blend) {
2737          generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
2738                              fs_src1, src1, pad_inline);
2739       }
2740    } else {
2741       src_count = num_fullblock_fs * dst_channels;
2742       /*
2743        * We reorder things a bit here, so the cases for 4-wide and 8-wide
2744        * (AVX) turn out the same later when untwiddling/transpose (albeit
2745        * for true AVX2 path untwiddle needs to be different).
2746        * For now just order by colors first (so we can use unpack later).
2747        */
2748       for (unsigned j = 0; j < num_fullblock_fs; j++) {
2749          for (unsigned i = 0; i < dst_channels; i++) {
2750             src[i*num_fullblock_fs + j] = fs_src[j][i];
2751             if (dual_source_blend) {
2752                src1[i*num_fullblock_fs + j] = fs_src1[j][i];
2753             }
2754          }
2755       }
2756    }
2757 
2758    src_channels = dst_channels < 3 ? dst_channels : 4;
2759    if (src_count != num_fullblock_fs * src_channels) {
2760       unsigned ds = src_count / (num_fullblock_fs * src_channels);
2761       row_type.length /= ds;
2762       fs_type.length = row_type.length;
2763       fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2764    }
2765 
2766    struct lp_type blend_type = row_type;
2767    mask_type.length = 4;
2768 
2769    /* Convert src to row_type */
2770    if (dual_source_blend) {
2771       struct lp_type old_row_type = row_type;
2772       lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2773       src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type,
2774                                      src1, src_count, src1);
2775    } else {
2776       src_count = lp_build_conv_auto(gallivm, fs_type, &row_type,
2777                                      src, src_count, src);
2778    }
2779 
2780    /* If the rows are not an SSE vector, combine them to become SSE size! */
2781    if ((row_type.width * row_type.length) % 128) {
2782       unsigned bits = row_type.width * row_type.length;
2783       unsigned combined;
2784 
2785       assert(src_count >= (vector_width / bits));
2786 
2787       const unsigned dst_count = src_count / (vector_width / bits);
2788 
2789       combined = lp_build_concat_n(gallivm, row_type, src, src_count,
2790                                    src, dst_count);
2791       if (dual_source_blend) {
2792          lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
2793       }
2794 
2795       row_type.length *= combined;
2796       src_count /= combined;
2797 
2798       bits = row_type.width * row_type.length;
2799       assert(bits == 128 || bits == 256);
2800    }
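   /*
    * Worked example (illustrative): with row_type = 8 x i8 (64 bits) and
    * vector_width = 128, lp_build_concat_n merges pairs of rows, so
    * combined = 2, row_type.length becomes 16 and src_count is halved,
    * restoring full 128-bit rows.
    */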
2801 
2802    if (twiddle_after_convert) {
2803       fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
2804       if (dual_source_blend) {
2805          fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
2806       }
2807    }
2808 
2809    /*
2810     * Blend Colour conversion
2811     */
2812    LLVMValueRef blend_color =
2813       lp_jit_context_f_blend_color(gallivm, context_type, context_ptr);
2814    blend_color = LLVMBuildPointerCast(builder, blend_color,
2815                                       LLVMPointerType(fs_vec_type, 0),
2816                                       "");
2817    blend_color = LLVMBuildLoad2(builder, fs_vec_type,
2818                                 LLVMBuildGEP2(builder, fs_vec_type,
2819                                               blend_color,
2820                                               &i32_zero, 1, ""), "");
2821 
2822    /* Convert */
2823    lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1,
2824                  &blend_color, 1);
2825 
2826    if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
2827       /*
2828        * Since blending is done with floats, there was no conversion.
2829        * However, the rules for fixed-point renderbuffers still apply,
2830        * that is, we must clamp inputs to 0.0/1.0.
2831        * (This would apply to separate alpha conversion too, but we
2832        * currently force has_alpha to be true.)
2833        * TODO: should skip this with "fake" blend, since post-blend
2834        * conversion will clamp anyway.
2835        * TODO: could also skip this if fragment color clamping is enabled.
2836        * We don't support it natively, however, so it gets baked into the
2837        * shader and we can't really tell here.
2838        */
2839       struct lp_build_context f32_bld;
2840       assert(row_type.floating);
2841       lp_build_context_init(&f32_bld, gallivm, row_type);
2842       for (unsigned i = 0; i < src_count; i++) {
2843          src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
2844       }
2845       if (dual_source_blend) {
2846          for (unsigned i = 0; i < src_count; i++) {
2847             src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
2848          }
2849       }
2850       /* probably can't be different from row_type, but better safe than sorry... */
2851       lp_build_context_init(&f32_bld, gallivm, blend_type);
2852       blend_color = lp_build_clamp(&f32_bld, blend_color,
2853                                    f32_bld.zero, f32_bld.one);
2854    }
2855 
2856    /* Extract alpha */
2857    LLVMValueRef blend_alpha =
2858       lp_build_extract_broadcast(gallivm, blend_type, row_type,
2859                                  blend_color,
2860                                  lp_build_const_int32(gallivm, 3));
2861 
2862    /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
2863    pad_inline &= (dst_channels * (block_size / src_count) * row_type.width)
2864       != vector_width;
2865    if (pad_inline) {
2866       /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
2867       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2868                                            TGSI_NUM_CHANNELS, row_type.length);
2869    } else {
2870       /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
2871       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2872                                            dst_channels, row_type.length);
2873    }
2874 
2875    /*
2876     * Mask conversion
2877     */
2878    lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0],
2879                        block_height, &src_mask[0]);
2880 
2881    if (src_count < block_height) {
2882       lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
2883    } else if (src_count > block_height) {
2884       for (unsigned i = src_count; i > 0; --i) {
2885          unsigned pixels = block_size / src_count;
2886          unsigned idx = i - 1;
2887 
2888          src_mask[idx] = lp_build_extract_range(gallivm,
2889                                                 src_mask[(idx * pixels) / 4],
2890                                                 (idx * pixels) % 4, pixels);
2891       }
2892    }
2893 
2894    assert(mask_type.width == 32);
2895 
2896    for (unsigned i = 0; i < src_count; ++i) {
2897       unsigned pixels = block_size / src_count;
2898       unsigned pixel_width = row_type.width * dst_channels;
2899 
2900       if (pixel_width == 24) {
2901          mask_type.width = 8;
2902          mask_type.length = vector_width / mask_type.width;
2903       } else {
2904          mask_type.length = pixels;
2905          mask_type.width = row_type.width * dst_channels;
2906 
2907          /*
2908           * If mask_type width is smaller than 32bit, this doesn't quite
2909           * generate the most efficient code (could use some pack).
2910           */
2911          src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
2912                                         lp_build_int_vec_type(gallivm,
2913                                                               mask_type), "");
2914 
2915          mask_type.length *= dst_channels;
2916          mask_type.width /= dst_channels;
2917       }
2918 
2919       src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
2920                                      lp_build_int_vec_type(gallivm, mask_type),
2921                                      "");
2922       src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
2923    }
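   /*
    * Worked example (illustrative): for an R8G8B8A8 target with
    * row_type = 16 x i8 and src_count = 4, each src_mask[i] starts as
    * 4 x i32 (one 32-bit mask per pixel); the IntCast keeps it at 4 x i32
    * (pixel_width == 32), and the final bitcast turns it into 16 x i8,
    * one mask byte per colour channel.
    */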
2924 
2925    /*
2926     * Alpha conversion
2927     */
2928    if (!has_alpha) {
2929       struct lp_type alpha_type = fs_type;
2930       alpha_type.length = 4;
2931       convert_alpha(gallivm, row_type, alpha_type,
2932                     block_size, block_height,
2933                     src_count, dst_channels,
2934                     pad_inline, src_alpha);
2935       if (dual_source_blend) {
2936          convert_alpha(gallivm, row_type, alpha_type,
2937                        block_size, block_height,
2938                        src_count, dst_channels,
2939                        pad_inline, src1_alpha);
2940       }
2941    }
2942 
2943 
2944    /*
2945     * Load dst from memory
2946     */
2947    unsigned dst_count;
2948    if (src_count < block_height) {
2949       dst_count = block_height;
2950    } else {
2951       dst_count = src_count;
2952    }
2953 
2954    dst_type.length *= block_size / dst_count;
2955 
2956    if (format_expands_to_float_soa(out_format_desc)) {
2957       /*
2958        * We need multiple values at once for the conversion, so we might as
2959        * well load them vectorized here too instead of concatenating later.
2960        * (We still need concatenation later for 8-wide vectors.)
2961        */
2962       dst_count = block_height;
2963       dst_type.length = block_width;
2964    }
2965 
2966    /*
2967     * Compute the alignment of the destination pointer in bytes.
2968     * We fetch 1-4 pixels; if the format has pot alignment, those fetches
2969     * are always aligned by MIN2(16, fetch_width), except for buffers (not
2970     * 1d textures, but we can't distinguish them here), so we need to stick
2971     * with per-pixel alignment in that case.
2972     */
2973    unsigned dst_alignment;
2974    if (is_1d) {
2975       dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
2976    } else {
2977       dst_alignment = dst_type.length * dst_type.width / 8;
2978    }
2979    /* Force power-of-two alignment by extracting only the least significant set bit */
2980    dst_alignment = 1 << (ffs(dst_alignment) - 1);
2981    /*
2982     * Resource base and stride pointers are aligned to 16 bytes, so that's
2983     * the maximum alignment we can guarantee
2984     */
2985    dst_alignment = MIN2(16, dst_alignment);
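   /*
    * Worked example (illustrative): for PIPE_FORMAT_R8G8B8A8_UNORM in the
    * 1d case, block.bits = 32 and block.width = 1, so dst_alignment = 4
    * (per pixel); in the 2d case with dst_type = 16 x i8 it comes out as 16.
    * A value such as 6 would be rounded down to 2 by the ffs() trick above.
    */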
2986 
2987    struct lp_type ls_type = dst_type;
2988 
2989    if (dst_count > src_count) {
2990       if ((dst_type.width == 8 || dst_type.width == 16) &&
2991           util_is_power_of_two_or_zero(dst_type.length) &&
2992           dst_type.length * dst_type.width < 128) {
2993          /*
2994           * Never try to load values as 4xi8 which we will then
2995           * concatenate to larger vectors. This gives llvm a real
2996           * headache (the problem is that the type legalizer (?) will
2997           * try to load that as 4xi8 zext to 4xi32 to fill the vector,
2998           * then the shuffles to concatenate are more or less impossible
2999           * - llvm is easily capable of generating a sequence of 32
3000           * pextrb/pinsrb instructions for that), albeit this appears to
3001           * be fixed in llvm 4.0. So, load and concatenate with 32-bit
3002           * width to avoid the trouble (16-bit seems not as bad; llvm
3003           * probably recognizes the load+shuffle, as only one shuffle
3004           * is necessary, but we can do just the same anyway).
3005           */
3006          ls_type.length = dst_type.length * dst_type.width / 32;
3007          ls_type.width = 32;
3008       }
3009    }
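   /*
    * Example (illustrative): dst_type = 4 x i8 (32 bits total) becomes
    * ls_type = 1 x i32, which is why the ls_type.length == 1 special case
    * below has to bitcast through a single-element vector.
    */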
3010 
3011    if (is_1d) {
3012       load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3013                             dst, ls_type, dst_count / 4, dst_alignment);
3014       for (unsigned i = dst_count / 4; i < dst_count; i++) {
3015          dst[i] = lp_build_undef(gallivm, ls_type);
3016       }
3017    } else {
3018       load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3019                             block_height, dst, ls_type, dst_count,
3020                             dst_alignment);
3021    }
3022 
3023 
3024    /*
3025     * Convert from dst/output format to src/blending format.
3026     *
3027     * This is necessary as we can only read 1 row from memory at a time,
3028     * so the minimum dst_count we will ever have at this point is 4.
3029     *
3030     * With, for example, the R8 format you can have all 16 pixels in a
3031     * 128-bit vector; this will take the 4 dsts and combine them into 1 src
3032     * so we can perform blending on all 16 pixels at once.
3033     */
3034    if (dst_count > src_count) {
3035       if (ls_type.length != dst_type.length && ls_type.length == 1) {
3036          LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
3037          LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
3038          for (unsigned i = 0; i < dst_count; i++) {
3039             dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
3040          }
3041       }
3042 
3043       lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
3044 
3045       if (ls_type.length != dst_type.length) {
3046          struct lp_type tmp_type = dst_type;
3047          tmp_type.length = dst_type.length * 4 / src_count;
3048          for (unsigned i = 0; i < src_count; i++) {
3049             dst[i] = LLVMBuildBitCast(builder, dst[i],
3050                                       lp_build_vec_type(gallivm, tmp_type), "");
3051          }
3052       }
3053    }
3054 
3055    /*
3056     * Blending
3057     */
3058    /* XXX this is broken for RGB8 formats -
3059     * they get expanded from 12 to 16 elements (to include alpha)
3060     * by convert_to_blend_type then reduced to 15 instead of 12
3061     * by convert_from_blend_type (a simple fix though breaks A8...).
3062     * R16G16B16 also crashes, though differently; seemingly something
3063     * goes wrong inside llvm's handling of npot vector sizes.
3064     * It seems some cleanup could be done here (like skipping conversion/blend
3065     * when not needed).
3066     */
3067    convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
3068                          row_type, dst, src_count);
3069 
3070    /*
3071     * FIXME: Really should get logic ops / masks out of generic blend / row
3072     * format. Logic ops will definitely not work on the blend float format
3073     * used for SRGB here, and I think OpenGL expects this to work as expected
3074     * (that is, incoming values converted to srgb, then the logic op applied).
3075     */
3076    for (unsigned i = 0; i < src_count; ++i) {
3077       dst[i] = lp_build_blend_aos(gallivm,
3078                                   &variant->key.blend,
3079                                   out_format,
3080                                   row_type,
3081                                   rt,
3082                                   src[i],
3083                                   has_alpha ? NULL : src_alpha[i],
3084                                   src1[i],
3085                                   has_alpha ? NULL : src1_alpha[i],
3086                                   dst[i],
3087                                   partial_mask ? src_mask[i] : NULL,
3088                                   blend_color,
3089                                   has_alpha ? NULL : blend_alpha,
3090                                   swizzle,
3091                                   pad_inline ? 4 : dst_channels);
3092    }
3093 
3094    convert_from_blend_type(gallivm, block_size, out_format_desc,
3095                            row_type, dst_type, dst, src_count);
3096 
3097    /* Split the blend rows back to memory rows */
3098    if (dst_count > src_count) {
3099       row_type.length = dst_type.length * (dst_count / src_count);
3100 
3101       if (src_count == 1) {
3102          dst[1] = lp_build_extract_range(gallivm, dst[0],
3103                                          row_type.length / 2,
3104                                          row_type.length / 2);
3105          dst[0] = lp_build_extract_range(gallivm, dst[0],
3106                                          0, row_type.length / 2);
3107 
3108          row_type.length /= 2;
3109          src_count *= 2;
3110       }
3111 
3112       dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2,
3113                                       row_type.length / 2);
3114       dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
3115       dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2,
3116                                       row_type.length / 2);
3117       dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
3118 
3119       row_type.length /= 2;
3120       src_count *= 2;
3121    }
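   /*
    * Worked example (illustrative): for R8 with all 16 pixels blended in a
    * single 16 x i8 vector (src_count = 1, dst_count = 4), the first split
    * yields two 8-element rows and the second yields the four 4 x i8 memory
    * rows expected by the store below.
    */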
3122 
3123    /*
3124     * Store blend result to memory
3125     */
3126    if (is_1d) {
3127       store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3128                              dst, dst_type, dst_count / 4, dst_alignment);
3129    } else {
3130       store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3131                              block_height,
3132                              dst, dst_type, dst_count, dst_alignment);
3133    }
3134 
3135    if (do_branch) {
3136       lp_build_mask_end(&mask_ctx);
3137    }
3138 
3139    if (fpstate) {
3140       lp_build_fpstate_set(gallivm, fpstate);
3141    }
3142 }
3143 
3144 
3145 /**
3146  * Generate the runtime callable function for the whole fragment pipeline.
3147  * Note that the function which we generate operates on a block of 16
3148  * pixels at a time.  The block contains 2x2 quads.  Each quad contains
3149  * 2x2 pixels.
3150  */
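/*
 * Block layout (illustrative sketch added by the editor):
 *
 *      +----+----+        each quad:   +---+---+
 *      | Q0 | Q1 |                     | 0 | 1 |
 *      +----+----+                     +---+---+
 *      | Q2 | Q3 |                     | 2 | 3 |
 *      +----+----+                     +---+---+
 *
 * i.e. a 4x4 pixel block made of four 2x2 quads.
 */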
3151 static void
3152 generate_fragment(struct llvmpipe_context *lp,
3153                   struct lp_fragment_shader *shader,
3154                   struct lp_fragment_shader_variant *variant,
3155                   unsigned partial_mask)
3156 {
3157    assert(partial_mask == RAST_WHOLE ||
3158           partial_mask == RAST_EDGE_TEST);
3159 
3160    struct nir_shader *nir = shader->base.ir.nir;
3161    struct gallivm_state *gallivm = variant->gallivm;
3162    struct lp_fragment_shader_variant_key *key = &variant->key;
3163    struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
3164    LLVMTypeRef fs_elem_type;
3165    LLVMTypeRef blend_vec_type;
3166    LLVMTypeRef arg_types[16];
3167    LLVMTypeRef func_type;
3168    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
3169    LLVMTypeRef int32p_type = LLVMPointerType(int32_type, 0);
3170    LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
3171    LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
3172    LLVMValueRef context_ptr;
3173    LLVMValueRef resources_ptr;
3174    LLVMValueRef x;
3175    LLVMValueRef y;
3176    LLVMValueRef a0_ptr;
3177    LLVMValueRef dadx_ptr;
3178    LLVMValueRef dady_ptr;
3179    LLVMValueRef color_ptr_ptr;
3180    LLVMValueRef stride_ptr;
3181    LLVMValueRef color_sample_stride_ptr;
3182    LLVMValueRef depth_ptr;
3183    LLVMValueRef depth_stride;
3184    LLVMValueRef depth_sample_stride;
3185    LLVMValueRef mask_input;
3186    LLVMValueRef thread_data_ptr;
3187    LLVMBasicBlockRef block;
3188    LLVMBuilderRef builder;
3189    struct lp_build_interp_soa_context interp;
3190    LLVMValueRef fs_mask[(16 / 4) * LP_MAX_SAMPLES];
3191    LLVMValueRef fs_out_color[LP_MAX_SAMPLES][PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
3192    LLVMValueRef function;
3193    LLVMValueRef facing;
3194    const bool dual_source_blend = key->blend.rt[0].blend_enable &&
3195                                   util_blend_state_is_dual(&key->blend, 0);
3196 
3197    assert(lp_native_vector_width / 32 >= 4);
3198 
3199    /* Adjust color input interpolation according to flatshade state:
3200     */
3201    nir_foreach_shader_in_variable(var, nir) {
3202       unsigned idx = var->data.driver_location;
3203       unsigned slots = nir_variable_count_slots(var, var->type);
3204       memcpy(&inputs[idx], &shader->inputs[idx], (sizeof inputs[0] * slots));
3205       for (unsigned s = 0; s < slots; s++) {
3206          if (inputs[idx + s].interp == LP_INTERP_COLOR)
3207             inputs[idx + s].interp = key->flatshade ? LP_INTERP_CONSTANT : LP_INTERP_PERSPECTIVE;
3208       }
3209    }
3210 
3211    /* TODO: actually pick these based on the fs and color buffer
3212     * characteristics. */
3213 
3214    struct lp_type fs_type;
3215    memset(&fs_type, 0, sizeof fs_type);
3216    fs_type.floating = true;      /* floating point values */
3217    fs_type.sign = true;          /* values are signed */
3218    fs_type.norm = false;         /* values are not limited to [0,1] or [-1,1] */
3219    fs_type.width = 32;           /* 32-bit float */
3220    fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
3221 
3222    struct lp_type blend_type;
3223    memset(&blend_type, 0, sizeof blend_type);
3224    blend_type.floating = false; /* values are integers */
3225    blend_type.sign = false;     /* values are unsigned */
3226    blend_type.norm = true;      /* values are in [0,1] or [-1,1] */
3227    blend_type.width = 8;        /* 8-bit ubyte values */
3228    blend_type.length = 16;      /* 16 elements per vector */
3229 
3230    /*
3231     * Generate the function prototype. Any change here must be reflected in
3232     * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
3233     */
3234 
3235    fs_elem_type = lp_build_elem_type(gallivm, fs_type);
3236 
3237    blend_vec_type = lp_build_vec_type(gallivm, blend_type);
3238 
3239    char func_name[64];
3240    snprintf(func_name, sizeof(func_name), "fs_variant_%s",
3241             partial_mask ? "partial" : "whole");
3242 
3243    arg_types[0] = variant->jit_context_ptr_type;       /* context */
3244    arg_types[1] = variant->jit_resources_ptr_type;     /* resources */
3245    arg_types[2] = int32_type;                          /* x */
3246    arg_types[3] = int32_type;                          /* y */
3247    arg_types[4] = int32_type;                          /* facing */
3248    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
3249    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
3250    arg_types[7] = LLVMPointerType(fs_elem_type, 0);    /* dady */
3251    arg_types[8] = LLVMPointerType(int8p_type, 0);  /* color */
3252    arg_types[9] = int8p_type;       /* depth */
3253    arg_types[10] = LLVMInt64TypeInContext(gallivm->context);  /* mask_input */
3254    arg_types[11] = variant->jit_thread_data_ptr_type;  /* per thread data */
3255    arg_types[12] = int32p_type;     /* stride */
3256    arg_types[13] = int32_type;                         /* depth_stride */
3257    arg_types[14] = int32p_type;     /* color sample strides */
3258    arg_types[15] = int32_type;                         /* depth sample stride */
3259 
3260    func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
3261                                 arg_types, ARRAY_SIZE(arg_types), 0);
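   /*
    * For reference, a sketch of the C-visible signature this corresponds to
    * (struct names assumed here; lp_jit.h's lp_jit_frag_func is the
    * authoritative definition):
    *
    *    void fs_variant(struct lp_jit_context *context,
    *                    struct lp_jit_resources *resources,
    *                    uint32_t x, uint32_t y, uint32_t facing,
    *                    const float *a0, const float *dadx, const float *dady,
    *                    uint8_t **color, uint8_t *depth, uint64_t mask_input,
    *                    struct lp_jit_thread_data *thread_data,
    *                    uint32_t *stride, uint32_t depth_stride,
    *                    uint32_t *color_sample_stride,
    *                    uint32_t depth_sample_stride);
    */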
3262 
3263    function = LLVMAddFunction(gallivm->module, func_name, func_type);
3264    LLVMSetFunctionCallConv(function, LLVMCCallConv);
3265 
3266    variant->function[partial_mask] = function;
3267    variant->function_name[partial_mask] = MALLOC(strlen(func_name)+1);
3268    strcpy(variant->function_name[partial_mask], func_name);
3269 
3270    /* XXX: need to propagate noalias down into color param now we are
3271     * passing a pointer-to-pointer?
3272     */
3273    for (unsigned i = 0; i < ARRAY_SIZE(arg_types); ++i)
3274       if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
3275          lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3276 
3277    if (variant->gallivm->cache->data_size) {
3278       gallivm_stub_func(gallivm, function);
3279       return;
3280    }
3281 
3282    context_ptr  = LLVMGetParam(function, 0);
3283    resources_ptr  = LLVMGetParam(function, 1);
3284    x            = LLVMGetParam(function, 2);
3285    y            = LLVMGetParam(function, 3);
3286    facing       = LLVMGetParam(function, 4);
3287    a0_ptr       = LLVMGetParam(function, 5);
3288    dadx_ptr     = LLVMGetParam(function, 6);
3289    dady_ptr     = LLVMGetParam(function, 7);
3290    color_ptr_ptr = LLVMGetParam(function, 8);
3291    depth_ptr    = LLVMGetParam(function, 9);
3292    mask_input   = LLVMGetParam(function, 10);
3293    thread_data_ptr  = LLVMGetParam(function, 11);
3294    stride_ptr   = LLVMGetParam(function, 12);
3295    depth_stride = LLVMGetParam(function, 13);
3296    color_sample_stride_ptr = LLVMGetParam(function, 14);
3297    depth_sample_stride = LLVMGetParam(function, 15);
3298 
3299    lp_build_name(context_ptr, "context");
3300    lp_build_name(resources_ptr, "resources");
3301    lp_build_name(x, "x");
3302    lp_build_name(y, "y");
3303    lp_build_name(a0_ptr, "a0");
3304    lp_build_name(dadx_ptr, "dadx");
3305    lp_build_name(dady_ptr, "dady");
3306    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
3307    lp_build_name(depth_ptr, "depth");
3308    lp_build_name(mask_input, "mask_input");
3309    lp_build_name(thread_data_ptr, "thread_data");
3310    lp_build_name(stride_ptr, "stride_ptr");
3311    lp_build_name(depth_stride, "depth_stride");
3312    lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
3313    lp_build_name(depth_sample_stride, "depth_sample_stride");
3314 
3315    /*
3316     * Function body
3317     */
3318 
3319    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3320    builder = gallivm->builder;
3321    assert(builder);
3322    LLVMPositionBuilderAtEnd(builder, block);
3323 
3324    /* code generated texture sampling */
3325    struct lp_build_sampler_soa *sampler =
3326       lp_llvm_sampler_soa_create(lp_fs_variant_key_samplers(key),
3327                                  MAX2(key->nr_samplers,
3328                                       key->nr_sampler_views));
3329    struct lp_build_image_soa *image =
3330       lp_bld_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
3331 
3332    unsigned num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
3333    /* for 1d resources only run "upper half" of stamp */
3334    if (key->resource_1d)
3335       num_fs /= 2;
3336 
3337    {
3338       LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
3339       LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
3340       LLVMValueRef num_loop_samp =
3341          lp_build_const_int32(gallivm, num_fs * key->coverage_samples);
3342       LLVMValueRef mask_store =
3343          lp_build_array_alloca(gallivm, mask_type,
3344                                num_loop_samp, "mask_store");
3345       LLVMTypeRef flt_type = LLVMFloatTypeInContext(gallivm->context);
3346       LLVMValueRef glob_sample_pos =
3347          LLVMAddGlobal(gallivm->module,
3348                        LLVMArrayType(flt_type, key->coverage_samples * 2), "");
3349       LLVMSetLinkage(glob_sample_pos, LLVMInternalLinkage);
3350       LLVMValueRef sample_pos_array;
3351 
3352       if (key->multisample && key->coverage_samples == 4) {
3353          LLVMValueRef sample_pos_arr[8];
3354          for (unsigned i = 0; i < 4; i++) {
3355             sample_pos_arr[i * 2] = LLVMConstReal(flt_type,
3356                                                   lp_sample_pos_4x[i][0]);
3357             sample_pos_arr[i * 2 + 1] = LLVMConstReal(flt_type,
3358                                                       lp_sample_pos_4x[i][1]);
3359          }
3360          sample_pos_array =
3361             LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3362                            sample_pos_arr, 8);
3363       } else {
3364          LLVMValueRef sample_pos_arr[2];
3365          sample_pos_arr[0] = LLVMConstReal(flt_type, 0.5);
3366          sample_pos_arr[1] = LLVMConstReal(flt_type, 0.5);
3367          sample_pos_array =
3368             LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3369                            sample_pos_arr, 2);
3370       }
3371       LLVMSetInitializer(glob_sample_pos, sample_pos_array);
3372 
3373       LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
3374       bool pixel_center_integer = nir->info.fs.pixel_center_integer;
3375 
3376       /*
3377        * The shader input interpolation info is not explicitly baked into the
3378        * shader key, but everything it derives from (TGSI, and flatshade) is
3379        * already included in the shader key.
3380        */
3381       lp_build_interp_soa_init(&interp,
3382                                gallivm,
3383                                nir->num_inputs,
3384                                inputs,
3385                                pixel_center_integer,
3386                                key->coverage_samples,
3387                                LLVMTypeOf(sample_pos_array),
3388                                glob_sample_pos,
3389                                num_loop,
3390                                builder, fs_type,
3391                                a0_ptr, dadx_ptr, dady_ptr,
3392                                x, y);
3393 
3394       for (unsigned i = 0; i < num_fs; i++) {
3395          if (key->multisample) {
3396             LLVMValueRef smask_val =
3397                LLVMBuildLoad2(builder, int32_type,
3398                               lp_jit_context_sample_mask(gallivm, variant->jit_context_type, context_ptr),
3399                               "");
3400 
3401             /*
3402              * For multisampling, extract the per-sample mask from the
3403              * incoming 64-bit mask and store it to the per-sample mask
3404              * storage, then OR them all together to generate the fragment
3405              * shader mask (sample shading TODO). Take the incoming state
3406              * coverage mask into account.
3407              */
3408             for (unsigned s = 0; s < key->coverage_samples; s++) {
3409                LLVMValueRef sindexi =
3410                   lp_build_const_int32(gallivm, i + (s * num_fs));
3411                LLVMValueRef sample_mask_ptr =
3412                   LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1,
3413                                 "sample_mask_ptr");
3414                LLVMValueRef s_mask =
3415                   generate_quad_mask(gallivm, fs_type,
3416                                      i * fs_type.length / 4, s, mask_input);
3417                LLVMValueRef smask_bit =
3418                   LLVMBuildAnd(builder, smask_val,
3419                                lp_build_const_int32(gallivm, (1 << s)), "");
3420                LLVMValueRef cmp =
3421                   LLVMBuildICmp(builder, LLVMIntNE, smask_bit,
3422                                 lp_build_const_int32(gallivm, 0), "");
3423                smask_bit = LLVMBuildSExt(builder, cmp, int32_type, "");
3424                smask_bit = lp_build_broadcast(gallivm, mask_type, smask_bit);
3425 
3426                s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
3427                LLVMBuildStore(builder, s_mask, sample_mask_ptr);
3428             }
3429          } else {
3430             LLVMValueRef mask;
3431             LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
3432             LLVMValueRef mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
3433                                                   &indexi, 1, "mask_ptr");
3434 
3435             if (partial_mask) {
3436                mask = generate_quad_mask(gallivm, fs_type,
3437                                          i * fs_type.length / 4, 0, mask_input);
3438             } else {
3439                mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
3440             }
3441             LLVMBuildStore(builder, mask, mask_ptr);
3442          }
3443       }
3444 
3445       generate_fs_loop(gallivm,
3446                        shader, key,
3447                        builder,
3448                        fs_type,
3449                        variant->jit_context_type,
3450                        context_ptr,
3451                        variant->jit_resources_type,
3452                        resources_ptr,
3453                        LLVMTypeOf(sample_pos_array),
3454                        glob_sample_pos,
3455                        num_loop,
3456                        &interp,
3457                        sampler,
3458                        image,
3459                        mask_type,
3460                        mask_store, /* output */
3461                        color_store,
3462                        depth_ptr,
3463                        depth_stride,
3464                        depth_sample_stride,
3465                        color_ptr_ptr,
3466                        stride_ptr,
3467                        color_sample_stride_ptr,
3468                        facing,
3469                        variant->jit_thread_data_type,
3470                        thread_data_ptr);
3471 
3472       LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
3473       for (unsigned i = 0; i < num_fs; i++) {
3474          LLVMValueRef ptr;
3475          for (unsigned s = 0; s < key->coverage_samples; s++) {
3476             int idx = (i + (s * num_fs));
3477             LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3478             ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1, "");
3479 
3480             fs_mask[idx] = LLVMBuildLoad2(builder, mask_type, ptr, "smask");
3481          }
3482 
3483          for (unsigned s = 0; s < key->min_samples; s++) {
3484             /* This is fucked up; we need to reorganize things */
3485             int idx = s * num_fs + i;
3486             LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3487             for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3488                for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3489                   ptr = LLVMBuildGEP2(builder, fs_vec_type,
3490                                       color_store[cbuf][chan],
3491                                       &sindexi, 1, "");
3492                   fs_out_color[s][cbuf][chan][i] = ptr;
3493                }
3494             }
3495             if (dual_source_blend) {
3496                /* we only support one dual source blend target, hence
3497                 * always use output 1
3498                 */
3499                for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3500                   ptr = LLVMBuildGEP2(builder, fs_vec_type,
3501                                       color_store[1][chan],
3502                                       &sindexi, 1, "");
3503                   fs_out_color[s][1][chan][i] = ptr;
3504                }
3505             }
3506          }
3507       }
3508    }
3509 
3510    lp_bld_llvm_sampler_soa_destroy(sampler);
3511    lp_bld_llvm_image_soa_destroy(image);
3512 
3513    /* Loop over color outputs / color buffers to do blending */
3514    for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3515       if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE &&
3516           (key->blend.rt[cbuf].blend_enable || key->blend.logicop_enable ||
3517            find_output_by_frag_result(nir, FRAG_RESULT_DATA0 + cbuf) != -1)) {
3518          LLVMValueRef color_ptr;
3519          LLVMValueRef stride;
3520          LLVMValueRef sample_stride = NULL;
3521          LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
3522 
3523          bool do_branch = ((key->depth.enabled
3524                             || key->stencil[0].enabled
3525                             || key->alpha.enabled)
3526                            && !nir->info.fs.uses_discard);
3527 
3528          color_ptr = LLVMBuildLoad2(builder, int8p_type,
3529                                     LLVMBuildGEP2(builder, int8p_type, color_ptr_ptr,
3530                                                  &index, 1, ""),
3531                                     "");
3532 
3533          stride = LLVMBuildLoad2(builder, int32_type,
3534                                  LLVMBuildGEP2(builder, int32_type, stride_ptr,
3535                                              &index, 1, ""),
3536                                  "");
3537 
3538          if (key->cbuf_nr_samples[cbuf] > 1)
3539             sample_stride = LLVMBuildLoad2(builder, int32_type,
3540                                            LLVMBuildGEP2(builder,
3541                                                          int32_type,
3542                                                          color_sample_stride_ptr,
3543                                                          &index, 1, ""), "");
3544 
3545          for (unsigned s = 0; s < key->cbuf_nr_samples[cbuf]; s++) {
3546             unsigned mask_idx = num_fs * (key->multisample ? s : 0);
3547             unsigned out_idx = key->min_samples == 1 ? 0 : s;
3548             LLVMValueRef out_ptr = color_ptr;
3549 
3550             if (sample_stride) {
3551                LLVMValueRef sample_offset =
3552                   LLVMBuildMul(builder, sample_stride,
3553                                lp_build_const_int32(gallivm, s), "");
3554                out_ptr = LLVMBuildGEP2(builder, int8_type, out_ptr, &sample_offset, 1, "");
3555             }
3556             out_ptr = LLVMBuildBitCast(builder, out_ptr,
3557                                        LLVMPointerType(blend_vec_type, 0), "");
3558 
3559             lp_build_name(out_ptr, "color_ptr%d", cbuf);
3560 
3561             generate_unswizzled_blend(gallivm, cbuf, variant,
3562                                       key->cbuf_format[cbuf],
3563                                       num_fs, fs_type, &fs_mask[mask_idx],
3564                                       fs_out_color[out_idx],
3565                                       variant->jit_context_type,
3566                                       context_ptr, blend_vec_type, out_ptr, stride,
3567                                       partial_mask, do_branch);
3568          }
3569       }
3570    }
3571 
3572    LLVMBuildRetVoid(builder);
3573 
3574    gallivm_verify_function(gallivm, function);
3575 }
3576 
3577 
3578 static void
3579 dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
3580 {
3581    debug_printf("fs variant %p:\n", (void *) key);
3582 
3583    if (key->flatshade) {
3584       debug_printf("flatshade = 1\n");
3585    }
3586    if (key->depth_clamp)
3587       debug_printf("depth_clamp = 1\n");
3588 
3589    if (key->restrict_depth_values)
3590       debug_printf("restrict_depth_values = 1\n");
3591 
3592    if (key->multisample) {
3593       debug_printf("multisample = 1\n");
3594       debug_printf("coverage samples = %d\n", key->coverage_samples);
3595       debug_printf("min samples = %d\n", key->min_samples);
3596    }
3597    for (unsigned i = 0; i < key->nr_cbufs; ++i) {
3598       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
3599       debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
3600    }
3601    if (key->depth.enabled || key->stencil[0].enabled) {
3602       debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
3603       debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
3604    }
3605    if (key->depth.enabled) {
3606       debug_printf("depth.func = %s\n", util_str_func(key->depth.func, true));
3607       debug_printf("depth.writemask = %u\n", key->depth.writemask);
3608    }
3609 
3610    for (unsigned i = 0; i < 2; ++i) {
3611       if (key->stencil[i].enabled) {
3612          debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, true));
3613          debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, true));
3614          debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, true));
3615          debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, true));
3616          debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
3617          debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
3618       }
3619    }
3620 
3621    if (key->alpha.enabled) {
3622       debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, true));
3623    }
3624 
3625    if (key->occlusion_count) {
3626       debug_printf("occlusion_count = 1\n");
3627    }
3628 
3629    if (key->blend.logicop_enable) {
3630       debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, true));
3631    } else if (key->blend.rt[0].blend_enable) {
3632       debug_printf("blend.rgb_func = %s\n",   util_str_blend_func  (key->blend.rt[0].rgb_func, true));
3633       debug_printf("blend.rgb_src_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_src_factor, true));
3634       debug_printf("blend.rgb_dst_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, true));
3635       debug_printf("blend.alpha_func = %s\n",       util_str_blend_func  (key->blend.rt[0].alpha_func, true));
3636       debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, true));
3637       debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, true));
3638    }
3639    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
3640    if (key->blend.alpha_to_coverage) {
3641       debug_printf("blend.alpha_to_coverage is enabled\n");
3642    }
3643    for (unsigned i = 0; i < key->nr_samplers; ++i) {
3644       const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3645       const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state;
3646       debug_printf("sampler[%u] = \n", i);
3647       debug_printf("  .wrap = %s %s %s\n",
3648                    util_str_tex_wrap(sampler->wrap_s, true),
3649                    util_str_tex_wrap(sampler->wrap_t, true),
3650                    util_str_tex_wrap(sampler->wrap_r, true));
3651       debug_printf("  .min_img_filter = %s\n",
3652                    util_str_tex_filter(sampler->min_img_filter, true));
3653       debug_printf("  .min_mip_filter = %s\n",
3654                    util_str_tex_mipfilter(sampler->min_mip_filter, true));
3655       debug_printf("  .mag_img_filter = %s\n",
3656                    util_str_tex_filter(sampler->mag_img_filter, true));
3657       if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
3658          debug_printf("  .compare_func = %s\n", util_str_func(sampler->compare_func, true));
3659       debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
3660       debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
3661       debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
3662       debug_printf("  .apply_min_lod = %u\n", sampler->apply_min_lod);
3663       debug_printf("  .apply_max_lod = %u\n", sampler->apply_max_lod);
3664       debug_printf("  .reduction_mode = %u\n", sampler->reduction_mode);
3665       debug_printf("  .aniso = %u\n", sampler->aniso);
3666    }
3667    for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
3668       const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3669       const struct lp_static_texture_state *texture = &samplers[i].texture_state;
3670       debug_printf("texture[%u] = \n", i);
3671       debug_printf("  .format = %s\n",
3672                    util_format_name(texture->format));
3673       debug_printf("  .target = %s\n",
3674                    util_str_tex_target(texture->target, true));
3675       debug_printf("  .level_zero_only = %u\n",
3676                    texture->level_zero_only);
3677       debug_printf("  .pot = %u %u %u\n",
3678                    texture->pot_width,
3679                    texture->pot_height,
3680                    texture->pot_depth);
3681    }
3682    struct lp_image_static_state *images = lp_fs_variant_key_images(key);
3683    for (unsigned i = 0; i < key->nr_images; ++i) {
3684       const struct lp_static_texture_state *image = &images[i].image_state;
3685       debug_printf("image[%u] = \n", i);
3686       debug_printf("  .format = %s\n",
3687                    util_format_name(image->format));
3688       debug_printf("  .target = %s\n",
3689                    util_str_tex_target(image->target, true));
3690       debug_printf("  .level_zero_only = %u\n",
3691                    image->level_zero_only);
3692       debug_printf("  .pot = %u %u %u\n",
3693                    image->pot_width,
3694                    image->pot_height,
3695                    image->pot_depth);
3696    }
3697 }
3698 
3699 
3700 const char *
3701 lp_debug_fs_kind(enum lp_fs_kind kind)
3702 {
3703    switch (kind) {
3704    case LP_FS_KIND_GENERAL:
3705       return "GENERAL";
3706    case LP_FS_KIND_BLIT_RGBA:
3707       return "BLIT_RGBA";
3708    case LP_FS_KIND_BLIT_RGB1:
3709       return "BLIT_RGB1";
3710    case LP_FS_KIND_AERO_MINIFICATION:
3711       return "AERO_MINIFICATION";
3712    case LP_FS_KIND_LLVM_LINEAR:
3713       return "LLVM_LINEAR";
3714    default:
3715       return "unknown";
3716    }
3717 }
3718 
3719 
3720 void
3721 lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
3722 {
3723    debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
3724                 variant->shader->no, variant->no);
3725    nir_print_shader(variant->shader->base.ir.nir, stderr);
3726    dump_fs_variant_key(&variant->key);
3727    debug_printf("variant->opaque = %u\n", variant->opaque);
3728    debug_printf("variant->potentially_opaque = %u\n", variant->potentially_opaque);
3729    debug_printf("variant->blit = %u\n", variant->blit);
3730    debug_printf("shader->kind = %s\n", lp_debug_fs_kind(variant->shader->kind));
3731    debug_printf("\n");
3732 }
3733 
3734 
3735 static void
3736 lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
3737                        unsigned char ir_sha1_cache_key[20])
3738 {
3739    struct blob blob = { 0 };
3740    unsigned ir_size;
3741    void *ir_binary;
3742 
3743    blob_init(&blob);
3744    nir_serialize(&blob, variant->shader->base.ir.nir, true);
3745    ir_binary = blob.data;
3746    ir_size = blob.size;
3747 
3748    struct mesa_sha1 ctx;
3749    _mesa_sha1_init(&ctx);
3750    _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size);
3751    _mesa_sha1_update(&ctx, ir_binary, ir_size);
3752    _mesa_sha1_final(&ctx, ir_sha1_cache_key);
3753 
3754    blob_finish(&blob);
3755 }
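/*
 * The resulting 20-byte SHA-1 identifies the (variant key, NIR) pair and
 * keys the on-disk shader cache.  Its use in generate_variant() below is
 * roughly:
 *
 *    lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
 *    lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
 *    ...
 *    if (needs_caching)
 *       lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
 */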
3756 
3757 
3758 /**
3759  * Generate a new fragment shader variant from the shader code and
3760  * other state indicated by the key.
3761  */
3762 static struct lp_fragment_shader_variant *
3763 generate_variant(struct llvmpipe_context *lp,
3764                  struct lp_fragment_shader *shader,
3765                  const struct lp_fragment_shader_variant_key *key)
3766 {
3767    struct nir_shader *nir = shader->base.ir.nir;
3768    struct lp_fragment_shader_variant *variant =
3769       MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
3770    if (!variant)
3771       return NULL;
3772 
3773    memset(variant, 0, sizeof(*variant));
3774 
3775    pipe_reference_init(&variant->reference, 1);
3776    lp_fs_reference(lp, &variant->shader, shader);
3777 
3778    memcpy(&variant->key, key, shader->variant_key_size);
3779 
3780    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
3781    struct lp_cached_code cached = { 0 };
3782    unsigned char ir_sha1_cache_key[20];
3783    bool needs_caching = false;
3784    if (shader->base.ir.nir) {
3785       lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
3786 
3787       lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
3788       if (!cached.data_size)
3789          needs_caching = true;
3790    }
3791 
3792    char module_name[64];
3793    snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
3794             shader->no, shader->variants_created);
3795    variant->gallivm = gallivm_create(module_name, &lp->context, &cached);
3796    if (!variant->gallivm) {
3797       FREE(variant);
3798       return NULL;
3799    }
3800 
3801    variant->list_item_global.base = variant;
3802    variant->list_item_local.base = variant;
3803    variant->no = shader->variants_created++;
3804 
3805    /*
3806     * Determine whether we are touching all channels in the color buffer.
3807     */
3808    const struct util_format_description *cbuf0_format_desc = NULL;
3809    bool fullcolormask = false;
3810    if (key->nr_cbufs == 1) {
3811       cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
3812       fullcolormask = util_format_colormask_full(cbuf0_format_desc,
3813                                                  key->blend.rt[0].colormask);
3814    }
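   /*
    * Note that fullcolormask is only computed for the single-cbuf case;
    * with any other cbuf count it stays false, which in turn disables
    * the opaque and blit shortcuts below.
    */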
3815 
3816    /* The scissor is ignored here as only tiles inside the scissor
3817     * rectangle will ever refer to this variant.
3818     */
3819    const bool no_kill =
3820          fullcolormask &&
3821          !key->stencil[0].enabled &&
3822          !key->alpha.enabled &&
3823          !key->multisample &&
3824          !key->blend.alpha_to_coverage &&
3825          !key->depth.enabled &&
3826          !nir->info.fs.uses_discard &&
3827          !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) &&
3828          !nir->info.fs.uses_fbfetch_output;
3829 
3830    variant->opaque =
3831          no_kill &&
3832          !key->blend.logicop_enable &&
3833          !key->blend.rt[0].blend_enable
3834          ? true : false;
3835 
3836    variant->potentially_opaque =
3837          no_kill &&
3838          !key->blend.logicop_enable &&
3839          key->blend.rt[0].blend_enable &&
3840          key->blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
3841          key->blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
3842          key->blend.rt[0].alpha_func == key->blend.rt[0].rgb_func &&
3843          key->blend.rt[0].alpha_dst_factor == key->blend.rt[0].rgb_dst_factor &&
3844          shader->base.type == PIPE_SHADER_IR_TGSI &&
3845          /*
3846           * FIXME: for NIR, all of the fields of info.xxx (except info.base)
3847           * are zeros, hence shader analysis (here and elsewhere) using these
3848           * bits cannot work and will silently fail (cbuf is the only pointer
3849           * field, hence causing a crash).
3850           */
3851          shader->info.cbuf[0][3].file != TGSI_FILE_NULL
3852          ? true : false;
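   /*
    * In other words: "opaque" means no blending at all, so together with
    * no_kill the variant is guaranteed to overwrite every pixel it
    * touches.  "potentially_opaque" is the common src-alpha-over case
    * (ADD with an INV_SRC_ALPHA dst factor), which degenerates to opaque
    * whenever the shader outputs alpha == 1.0; the cbuf[0][3] output
    * info required above is what lets later analysis establish that.
    */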
3853 
3854    /* We only care about opaque blits for now */
3855    if (variant->opaque &&
3856        (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3857         shader->kind == LP_FS_KIND_BLIT_RGB1)) {
3858       const struct lp_sampler_static_state *samp0 =
3859          lp_fs_variant_key_sampler_idx(key, 0);
3860       assert(samp0);
3861 
3862       const enum pipe_format texture_format = samp0->texture_state.format;
3863       const enum pipe_texture_target target = samp0->texture_state.target;
3864       const unsigned min_img_filter = samp0->sampler_state.min_img_filter;
3865       const unsigned mag_img_filter = samp0->sampler_state.mag_img_filter;
3866 
3867       unsigned min_mip_filter;
3868       if (samp0->texture_state.level_zero_only) {
3869          min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3870       } else {
3871          min_mip_filter = samp0->sampler_state.min_mip_filter;
3872       }
3873 
3874       if (target == PIPE_TEXTURE_2D &&
3875           min_img_filter == PIPE_TEX_FILTER_NEAREST &&
3876           mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
3877           min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
3878           ((texture_format &&
3879             util_is_format_compatible(util_format_description(texture_format),
3880                                       cbuf0_format_desc)) ||
3881            (shader->kind == LP_FS_KIND_BLIT_RGB1 &&
3882             (texture_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
3883              texture_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
3884             (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3885              key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM)))) {
3886          variant->blit = 1;
3887       }
3888    }
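   /*
    * Net effect: variant->blit is only set for opaque, unmipped,
    * nearest-filtered 2D sampling where texels can be forwarded to the
    * color buffer unmodified: either the texture and cbuf formats are
    * directly compatible, or we have the RGB1 BGRA/BGRX pairs whose only
    * difference is an alpha channel that is forced to one anyway.
    */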
3889 
3890    /* Determine whether this shader + pipeline state is a candidate for
3891     * the linear path.
3892     */
3893    const bool linear_pipeline =
3894          !key->stencil[0].enabled &&
3895          !key->depth.enabled &&
3896          !nir->info.fs.uses_discard &&
3897          !key->blend.logicop_enable &&
3898          (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3899           key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM ||
3900           key->cbuf_format[0] == PIPE_FORMAT_R8G8B8A8_UNORM ||
3901           key->cbuf_format[0] == PIPE_FORMAT_R8G8B8X8_UNORM);
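   /*
    * The linear path is restricted to plain 8-bit RGBA/BGRA (and X
    * variant) color buffers with no depth/stencil, discard or logicops;
    * everything else goes exclusively through the regular tiled path.
    */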
3902 
3903    memcpy(&variant->key, key, sizeof *key);
3904 
3905    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
3906       lp_debug_fs_variant(variant);
3907    }
3908 
3909    llvmpipe_fs_variant_fastpath(variant);
3910 
3911    lp_jit_init_types(variant);
3912 
3913    if (variant->jit_function[RAST_EDGE_TEST] == NULL)
3914       generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
3915 
3916    if (variant->jit_function[RAST_WHOLE] == NULL) {
3917       if (variant->opaque) {
3918          /* Specialized shader, which doesn't need to read the color buffer. */
3919          generate_fragment(lp, shader, variant, RAST_WHOLE);
3920       }
3921    }
3922 
3923    if (linear_pipeline) {
3924       /* Currently keeping both the old fastpaths and new linear path
3925        * active.  The older code is still somewhat faster for the cases
3926        * it covers.
3927        *
3928        * XXX: consider restricting this to aero-mode only.
3929        */
3930       if (fullcolormask &&
3931           !key->alpha.enabled &&
3932           !key->blend.alpha_to_coverage) {
3933          llvmpipe_fs_variant_linear_fastpath(variant);
3934       }
3935 
3936       /* If the original fastpath doesn't cover this variant, try the new
3937        * code:
3938        */
3939       if (variant->jit_linear == NULL) {
3940          if (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3941              shader->kind == LP_FS_KIND_BLIT_RGB1 ||
3942              shader->kind == LP_FS_KIND_LLVM_LINEAR) {
3943             llvmpipe_fs_variant_linear_llvm(lp, shader, variant);
3944          }
3945       }
3946    } else {
3947       if (LP_DEBUG & DEBUG_LINEAR) {
3948          lp_debug_fs_variant(variant);
3949          debug_printf("    ----> no linear path for this variant\n");
3950       }
3951    }
3952 
3953    /*
3954     * Compile everything
3955     */
3956 
3957 #if GALLIVM_USE_ORCJIT
3958    /* module has been moved into ORCJIT after gallivm_compile_module */
3959    variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3960 
3961    gallivm_compile_module(variant->gallivm);
3962 #else
3963    gallivm_compile_module(variant->gallivm);
3964 
3965    variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3966 #endif
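   /*
    * Both branches above do the same work; only the ordering differs.
    * ORCJIT takes ownership of the module inside gallivm_compile_module(),
    * so the IR must be counted beforehand, whereas the non-ORCJIT engine
    * leaves the module readable after compilation.
    */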
3967 
3968    if (variant->function[RAST_EDGE_TEST]) {
3969       variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
3970             gallivm_jit_function(variant->gallivm,
3971                                  variant->function[RAST_EDGE_TEST],
3972                                  variant->function_name[RAST_EDGE_TEST]);
3973    }
3974 
3975    if (variant->function[RAST_WHOLE]) {
3976       variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3977          gallivm_jit_function(variant->gallivm,
3978                               variant->function[RAST_WHOLE],
3979                               variant->function_name[RAST_WHOLE]);
3980    } else if (!variant->jit_function[RAST_WHOLE]) {
3981       variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3982          variant->jit_function[RAST_EDGE_TEST];
3983    }
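   /*
    * If no specialized whole-tile function was emitted (the variant is
    * not opaque), whole tiles simply reuse the edge-test function, which
    * is always correct, just does redundant coverage work.
    */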
3984 
3985    if (linear_pipeline) {
3986       if (variant->linear_function) {
3987          variant->jit_linear_llvm = (lp_jit_linear_llvm_func)
3988             gallivm_jit_function(variant->gallivm, variant->linear_function,
3989                                  variant->linear_function_name);
3990       }
3991 
3992       /*
3993        * This must be done after LLVM compilation, as it will call the JIT'ed
3994        * code to determine active inputs.
3995        */
3996       lp_linear_check_variant(variant);
3997    }
3998 
3999    if (needs_caching) {
4000       lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
4001    }
4002 
4003    gallivm_free_ir(variant->gallivm);
4004 
4005    return variant;
4006 }
4007 
4008 
4009 static void *
4010 llvmpipe_create_fs_state(struct pipe_context *pipe,
4011                          const struct pipe_shader_state *templ)
4012 {
4013    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4014 
4015    struct lp_fragment_shader *shader = CALLOC_STRUCT(lp_fragment_shader);
4016    if (!shader)
4017       return NULL;
4018 
4019    pipe_reference_init(&shader->reference, 1);
4020    shader->no = fs_no++;
4021    list_inithead(&shader->variants.list);
4022 
4023    shader->base.type = PIPE_SHADER_IR_NIR;
4024 
4025    if (templ->type == PIPE_SHADER_IR_TGSI) {
4026       shader->base.ir.nir = tgsi_to_nir(templ->tokens, pipe->screen, false);
4027    } else {
4028       shader->base.ir.nir = templ->ir.nir;
4029    }
4030 
4031    /* lower FRAG_RESULT_COLOR -> DATA[0-7] to correctly handle unused attachments */
4032    nir_shader *nir = shader->base.ir.nir;
4033    NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
4034 
4035    nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
4036    nir_tgsi_scan_shader(nir, &shader->info.base, true);
4037    shader->info.num_texs = shader->info.base.opcode_count[TGSI_OPCODE_TEX];
4038 
4039    llvmpipe_register_shader(pipe, &shader->base);
4040 
4041    shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
4042    if (shader->draw_data == NULL) {
4043       FREE(shader);
4044       return NULL;
4045    }
4046 
4047    const int nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4048    const int nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4049    const int nr_images = BITSET_LAST_BIT(nir->info.images_used);
4050 
4051    shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers,
4052                                                           nr_sampler_views),
4053                                                      nr_images);
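   /*
    * The variant key is variable-length: a fixed header followed by
    * MAX2(nr_samplers, nr_sampler_views) lp_sampler_static_state entries
    * and nr_images lp_image_static_state entries.  This is why key
    * copies and comparisons elsewhere use shader->variant_key_size
    * rather than sizeof(struct lp_fragment_shader_variant_key).
    */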
4054 
4055    nir_foreach_shader_in_variable(var, nir) {
4056       unsigned idx = var->data.driver_location;
4057       unsigned slots = nir_variable_count_slots(var, var->type);
4058 
4059       if (var->data.centroid)
4060          shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_CENTROID;
4061       if (var->data.sample)
4062          shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_SAMPLE;
4063 
4064       enum glsl_base_type base_type =
4065          glsl_get_base_type(glsl_without_array(var->type));
4066       switch (var->data.interpolation) {
4067       case INTERP_MODE_NONE:
4068          if (glsl_base_type_is_integer(base_type) || var->data.per_primitive) {
4069             shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4070             break;
4071          }
4072          if (var->data.location == VARYING_SLOT_COL0 ||
4073              var->data.location == VARYING_SLOT_COL1) {
4074             shader->inputs[idx].interp = LP_INTERP_COLOR;
4075             break;
4076          }
4077          FALLTHROUGH;
4078       case INTERP_MODE_SMOOTH:
4079          shader->inputs[idx].interp = LP_INTERP_PERSPECTIVE;
4080          break;
4081       case INTERP_MODE_NOPERSPECTIVE:
4082          shader->inputs[idx].interp = LP_INTERP_LINEAR;
4083          break;
4084       case INTERP_MODE_FLAT:
4085          shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4086          break;
4087       }
4088 
4089       /* XXX this is a completely pointless index map... */
4090       shader->inputs[idx].src_index = idx + 1;
4091       if (var->data.location == VARYING_SLOT_FACE)
4092          shader->inputs[idx].interp = LP_INTERP_FACING;
4093       else if (var->data.location == VARYING_SLOT_POS) {
4094          shader->inputs[idx].src_index = 0;
4095          shader->inputs[idx].interp = LP_INTERP_POSITION;
4096       }
4097 
4098       shader->inputs[idx].usage_mask = shader->info.base.input_usage_mask[idx];
4099       for (unsigned s = 1; s < slots; s++) {
4100          shader->inputs[idx + s] = shader->inputs[idx];
4101          shader->inputs[idx + s].src_index = idx + s + 1;
4102          shader->inputs[idx + s].usage_mask = shader->info.base.input_usage_mask[idx + s];
4103       }
4104    }
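   /*
    * Convention used above: src_index 0 is reserved for the position
    * input, every other interpolated input is shifted up by one (the
    * idx + 1), and multi-slot inputs occupy consecutive indices.
    */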
4105 
4106    llvmpipe_fs_analyse_nir(shader);
4107 
4108    return shader;
4109 }
4110 
4111 
4112 static void
4113 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
4114 {
4115    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4116    struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
4117    if (llvmpipe->fs == lp_fs)
4118       return;
4119 
4120    draw_bind_fragment_shader(llvmpipe->draw,
4121                              (lp_fs ? lp_fs->draw_data : NULL));
4122 
4123    lp_fs_reference(llvmpipe, &llvmpipe->fs, lp_fs);
4124 
4125    /* invalidate the setup link, NEW_FS will make it update */
4126    lp_setup_set_fs_variant(llvmpipe->setup, NULL);
4127    llvmpipe->dirty |= LP_NEW_FS;
4128 }
4129 
4130 
4131 /**
4132  * Remove shader variant from two lists: the shader's variant list
4133  * and the context's variant list.
4134  */
4135 static void
4136 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
4137                                struct lp_fragment_shader_variant *variant)
4138 {
4139    if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
4140       debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
4141                    "v total cached %u inst %u total inst %u\n",
4142                    variant->shader->no, variant->no,
4143                    variant->shader->variants_created,
4144                    variant->shader->variants_cached,
4145                    lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
4146    }
4147 
4148    /* remove from shader's list */
4149    list_del(&variant->list_item_local.list);
4150    variant->shader->variants_cached--;
4151 
4152    /* remove from context's list */
4153    list_del(&variant->list_item_global.list);
4154    lp->nr_fs_variants--;
4155    lp->nr_fs_instrs -= variant->nr_instrs;
4156 }
4157 
4158 
4159 void
4160 llvmpipe_destroy_shader_variant(struct llvmpipe_context *lp,
4161                                 struct lp_fragment_shader_variant *variant)
4162 {
4163    gallivm_destroy(variant->gallivm);
4164    lp_fs_reference(lp, &variant->shader, NULL);
4165    if (variant->function_name[RAST_EDGE_TEST])
4166       FREE(variant->function_name[RAST_EDGE_TEST]);
4167    if (variant->function_name[RAST_WHOLE])
4168       FREE(variant->function_name[RAST_WHOLE]);
4169    if (variant->linear_function_name)
4170       FREE(variant->linear_function_name);
4171    FREE(variant);
4172 }
4173 
4174 
4175 void
4176 llvmpipe_destroy_fs(struct llvmpipe_context *llvmpipe,
4177                     struct lp_fragment_shader *shader)
4178 {
4179    /* Delete draw module's data */
4180    draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
4181 
4182    ralloc_free(shader->base.ir.nir);
4183    assert(shader->variants_cached == 0);
4184    FREE(shader);
4185 }
4186 
4187 
4188 static void
4189 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
4190 {
4191    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4192    struct lp_fragment_shader *shader = fs;
4193    struct lp_fs_variant_list_item *li, *next;
4194 
4195    /* Delete all the variants */
4196    LIST_FOR_EACH_ENTRY_SAFE(li, next, &shader->variants.list, list) {
4197       struct lp_fragment_shader_variant *variant;
4198       variant = li->base;
4199       llvmpipe_remove_shader_variant(llvmpipe, li->base);
4200       lp_fs_variant_reference(llvmpipe, &variant, NULL);
4201    }
4202 
4203    lp_fs_reference(llvmpipe, &shader, NULL);
4204 }
4205 
4206 
4207 static void
4208 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
4209                              enum pipe_shader_type shader, uint index,
4210                              bool take_ownership,
4211                              const struct pipe_constant_buffer *cb)
4212 {
4213    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4214    struct pipe_constant_buffer *constants = &llvmpipe->constants[shader][index];
4215 
4216    assert(shader < PIPE_SHADER_MESH_TYPES);
4217    assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
4218 
4219    /* note: reference counting */
4220    util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb,
4221                              take_ownership);
4222 
4223    /* user_buffer is only valid until the next set_constant_buffer (at most,
4224     * possibly until shader deletion), so we need to upload it now to make
4225     * sure it doesn't get updated/freed out from under us.
4226     */
4227    if (constants->user_buffer) {
4228       u_upload_data(llvmpipe->pipe.const_uploader, 0, constants->buffer_size,
4229                     16, constants->user_buffer, &constants->buffer_offset,
4230                     &constants->buffer);
4231    }
4232    if (constants->buffer) {
4233       if (!(constants->buffer->bind & PIPE_BIND_CONSTANT_BUFFER)) {
4234          debug_printf("Illegal set constant without bind flag\n");
4235          constants->buffer->bind |= PIPE_BIND_CONSTANT_BUFFER;
4236       }
4237       llvmpipe_flush_resource(pipe, constants->buffer, 0, true, true, false, "set_constant_buffer");
4238    }
4239 
4240    switch (shader) {
4241    case PIPE_SHADER_VERTEX:
4242    case PIPE_SHADER_GEOMETRY:
4243    case PIPE_SHADER_TESS_CTRL:
4244    case PIPE_SHADER_TESS_EVAL: {
4245       const unsigned size = cb ? cb->buffer_size : 0;
4246 
4247       const uint8_t *data = NULL;
4248       if (constants->buffer) {
4249          data = (uint8_t *) llvmpipe_resource_data(constants->buffer)
4250             + constants->buffer_offset;
4251       }
4252 
4253       draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
4254                                       index, data, size);
4255       break;
4256    }
4257    case PIPE_SHADER_COMPUTE:
4258       llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
4259       break;
4260    case PIPE_SHADER_FRAGMENT:
4261       llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
4262       break;
4263    case PIPE_SHADER_TASK:
4264       llvmpipe->dirty |= LP_NEW_TASK_CONSTANTS;
4265       break;
4266    case PIPE_SHADER_MESH:
4267       llvmpipe->dirty |= LP_NEW_MESH_CONSTANTS;
4268       break;
4269    default:
4270       unreachable("Illegal shader type");
4271       break;
4272    }
4273 }
4274 
4275 
4276 static void
4277 llvmpipe_set_shader_buffers(struct pipe_context *pipe,
4278                             enum pipe_shader_type shader, unsigned start_slot,
4279                             unsigned count,
4280                             const struct pipe_shader_buffer *buffers,
4281                             unsigned writable_bitmask)
4282 {
4283    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4284 
4285    unsigned i, idx;
4286    for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4287       const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
4288 
4289       util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
4290 
4291       if (buffer && buffer->buffer) {
4292          bool read_only = !(writable_bitmask & (1 << idx));
4293          llvmpipe_flush_resource(pipe, buffer->buffer, 0, read_only, false,
4294                                  false, "buffer");
4295       }
4296 
4297       switch (shader) {
4298       case PIPE_SHADER_VERTEX:
4299       case PIPE_SHADER_GEOMETRY:
4300       case PIPE_SHADER_TESS_CTRL:
4301       case PIPE_SHADER_TESS_EVAL: {
4302          const unsigned size = buffer ? buffer->buffer_size : 0;
4303          const uint8_t *data = NULL;
4304          if (buffer && buffer->buffer)
4305             data = (uint8_t *) llvmpipe_resource_data(buffer->buffer);
4306          if (data)
4307             data += buffer->buffer_offset;
4308          draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
4309                                        i, data, size);
4310          break;
4311       }
4312       case PIPE_SHADER_COMPUTE:
4313          llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
4314          break;
4315       case PIPE_SHADER_TASK:
4316          llvmpipe->dirty |= LP_NEW_TASK_SSBOS;
4317          break;
4318       case PIPE_SHADER_MESH:
4319          llvmpipe->dirty |= LP_NEW_MESH_SSBOS;
4320          break;
4321       case PIPE_SHADER_FRAGMENT:
4322          llvmpipe->fs_ssbo_write_mask &= ~(((1 << count) - 1) << start_slot);
4323          llvmpipe->fs_ssbo_write_mask |= writable_bitmask << start_slot;
4324          llvmpipe->dirty |= LP_NEW_FS_SSBOS;
4325          break;
4326       default:
4327          unreachable("Illegal shader type");
4328          break;
4329       }
4330    }
4331 }
4332 
4333 
4334 static void
4335 llvmpipe_set_shader_images(struct pipe_context *pipe,
4336                            enum pipe_shader_type shader, unsigned start_slot,
4337                            unsigned count, unsigned unbind_num_trailing_slots,
4338                            const struct pipe_image_view *images)
4339 {
4340    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4341    unsigned i, idx;
4342 
4343    draw_flush(llvmpipe->draw);
4344    for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4345       const struct pipe_image_view *image = images ? &images[idx] : NULL;
4346 
4347       util_copy_image_view(&llvmpipe->images[shader][i], image);
4348 
4349       if (image && image->resource) {
4350          bool read_only = !(image->access & PIPE_IMAGE_ACCESS_WRITE);
4351          llvmpipe_flush_resource(pipe, image->resource, 0, read_only, false,
4352                                  false, "image");
4353       }
4354    }
4355 
4356    llvmpipe->num_images[shader] = start_slot + count;
4357    switch (shader) {
4358    case PIPE_SHADER_VERTEX:
4359    case PIPE_SHADER_GEOMETRY:
4360    case PIPE_SHADER_TESS_CTRL:
4361    case PIPE_SHADER_TESS_EVAL:
4362       draw_set_images(llvmpipe->draw, shader, llvmpipe->images[shader],
4363                       start_slot + count);
4364       break;
4365    case PIPE_SHADER_COMPUTE:
4366       llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
4367       break;
4368    case PIPE_SHADER_FRAGMENT:
4369       llvmpipe->dirty |= LP_NEW_FS_IMAGES;
4370       break;
4371    case PIPE_SHADER_TASK:
4372       llvmpipe->dirty |= LP_NEW_TASK_IMAGES;
4373       break;
4374    case PIPE_SHADER_MESH:
4375       llvmpipe->dirty |= LP_NEW_MESH_IMAGES;
4376       break;
4377    default:
4378       unreachable("Illegal shader type");
4379       break;
4380    }
4381 
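   /*
    * Trailing slots are unbound by recursing with images == NULL below,
    * which takes the util_copy_image_view(..., NULL) path above and
    * releases the previously held references.
    */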
4382    if (unbind_num_trailing_slots) {
4383       llvmpipe_set_shader_images(pipe, shader, start_slot + count,
4384                                  unbind_num_trailing_slots, 0, NULL);
4385    }
4386 }
4387 
4388 
4389 /**
4390  * Return the blend factor equivalent to a destination alpha of one.
4391  */
4392 static inline enum pipe_blendfactor
4393 force_dst_alpha_one(enum pipe_blendfactor factor, bool clamped_zero)
4394 {
4395    switch (factor) {
4396    case PIPE_BLENDFACTOR_DST_ALPHA:
4397       return PIPE_BLENDFACTOR_ONE;
4398    case PIPE_BLENDFACTOR_INV_DST_ALPHA:
4399       return PIPE_BLENDFACTOR_ZERO;
4400    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
4401       if (clamped_zero)
4402          return PIPE_BLENDFACTOR_ZERO;
4403       else
4404          return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
4405    default:
4406       return factor;
4407    }
4408 }
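/*
 * Illustrative effect (mirrors its use in make_variant_key() below): for
 * a B8G8R8X8_UNORM target, dst alpha always reads as 1.0, so
 *
 *    force_dst_alpha_one(PIPE_BLENDFACTOR_DST_ALPHA, ...)     -> ONE
 *    force_dst_alpha_one(PIPE_BLENDFACTOR_INV_DST_ALPHA, ...) -> ZERO
 *
 * SRC_ALPHA_SATURATE is MIN(As, 1 - Ad); with Ad == 1 it collapses to
 * zero only for formats that clamp alpha to [0, 1], hence clamped_zero.
 */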
4409 
4410 
4411 /**
4412  * We need to generate several variants of the fragment pipeline to match
4413  * all the combinations of the contributing state atoms.
4414  *
4415  * TODO: there is actually no reason to tie this to context state -- the
4416  * generated code could be cached globally in the screen.
4417  */
4418 static struct lp_fragment_shader_variant_key *
4419 make_variant_key(struct llvmpipe_context *lp,
4420                  struct lp_fragment_shader *shader,
4421                  char *store)
4422 {
4423    struct lp_fragment_shader_variant_key *key =
4424       (struct lp_fragment_shader_variant_key *)store;
4425    struct nir_shader *nir = shader->base.ir.nir;
4426 
4427    memset(key, 0, sizeof(*key));
4428 
4429    if (lp->framebuffer.zsbuf) {
4430       const enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
4431       const struct util_format_description *zsbuf_desc =
4432          util_format_description(zsbuf_format);
4433 
4434       if (lp->depth_stencil->depth_enabled &&
4435           util_format_has_depth(zsbuf_desc)) {
4436          key->zsbuf_format = zsbuf_format;
4437          key->depth.enabled = lp->depth_stencil->depth_enabled;
4438          key->depth.writemask = lp->depth_stencil->depth_writemask;
4439          key->depth.func = lp->depth_stencil->depth_func;
4440       }
4441       if (lp->depth_stencil->stencil[0].enabled &&
4442           util_format_has_stencil(zsbuf_desc)) {
4443          key->zsbuf_format = zsbuf_format;
4444          memcpy(&key->stencil, &lp->depth_stencil->stencil,
4445                 sizeof key->stencil);
4446       }
4447       if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
4448          key->resource_1d = true;
4449       }
4450       key->zsbuf_nr_samples =
4451          util_res_sample_count(lp->framebuffer.zsbuf->texture);
4452 
4453       /*
4454        * Restrict depth values when the API mandates clamped depth
4455        * (GL, or VK with the extension) and the Z buffer is not a
4456        * float format.
4456        */
4457       key->restrict_depth_values =
4458          !(lp->rasterizer->unclamped_fragment_depth_values &&
4459            util_format_get_depth_only(zsbuf_format) == PIPE_FORMAT_Z32_FLOAT);
4460    }
4461 
4462    /*
4463     * Propagate the depth clamp setting from the rasterizer state.
4464     */
4465    key->depth_clamp = lp->rasterizer->depth_clamp;
4466 
4467    /* alpha test only applies if render buffer 0 is non-integer
4468     * (or does not exist)
4469     */
4470    if (!lp->framebuffer.nr_cbufs ||
4471        !lp->framebuffer.cbufs[0] ||
4472        !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
4473       key->alpha.enabled = lp->depth_stencil->alpha_enabled;
4474    }
4475    if (key->alpha.enabled) {
4476       key->alpha.func = lp->depth_stencil->alpha_func;
4477       /* alpha.ref_value is passed in jit_context */
4478    }
4479 
4480    key->flatshade = lp->rasterizer->flatshade;
4481    key->multisample = lp->rasterizer->multisample;
4482    key->no_ms_sample_mask_out = lp->rasterizer->no_ms_sample_mask_out;
4483    if (lp->active_occlusion_queries && !lp->queries_disabled) {
4484       key->occlusion_count = true;
4485    }
4486 
4487    memcpy(&key->blend, lp->blend, sizeof key->blend);
4488 
4489    key->coverage_samples = 1;
4490    key->min_samples = 1;
4491    if (key->multisample) {
4492       key->coverage_samples =
4493          util_framebuffer_get_num_samples(&lp->framebuffer);
4494       /* Per EXT_shader_framebuffer_fetch spec:
4495        *
4496        *   "1. How is framebuffer data treated during multisample rendering?
4497        *
4498        *    RESOLVED: Reading the value of gl_LastFragData produces a
4499        *    different result for each sample. This implies that all or part
4500        *    of the shader be run once for each sample, but has no additional
4501        *    implications on fragment shader input variables which may still
4502        *    be interpolated per pixel by the implementation."
4503        *
4504        * ARM_shader_framebuffer_fetch_depth_stencil spec further says:
4505        *
4506        *   "(1) When multisampling is enabled, does the shader run per sample?
4507        *
4508        *    RESOLVED.
4509        *
4510        *    This behavior is inherited from either
4511        *    EXT_shader_framebuffer_fetch or ARM_shader_framebuffer_fetch as
4512        *    described in the interactions section.  If neither extension is
4513        *    supported, the shader runs once per fragment."
4514        *
4515        * Therefore we should always enable per-sample shading when FB fetch is
4516        * used.
4517        */
4518       if (lp->min_samples > 1 || nir->info.fs.uses_fbfetch_output)
4519          key->min_samples = key->coverage_samples;
4520    }
4521    key->nr_cbufs = lp->framebuffer.nr_cbufs;
4522 
4523    if (!key->blend.independent_blend_enable) {
4524       // we always need independent blend; otherwise the fixups below won't work
4525       for (unsigned i = 1; i < key->nr_cbufs; i++) {
4526          memcpy(&key->blend.rt[i], &key->blend.rt[0],
4527                 sizeof(key->blend.rt[0]));
4528       }
4529       key->blend.independent_blend_enable = 1;
4530    }
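   /*
    * Replicating rt[0] into every rt[i] lets the per-cbuf fixups below
    * adjust each render target independently even when the application
    * supplied shared blend state.
    */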
4531 
4532    for (unsigned i = 0; i < lp->framebuffer.nr_cbufs; i++) {
4533       struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
4534 
4535       if (lp->framebuffer.cbufs[i]) {
4536          const enum pipe_format format = lp->framebuffer.cbufs[i]->format;
4537 
4538          key->cbuf_format[i] = format;
4539          key->cbuf_nr_samples[i] =
4540             util_res_sample_count(lp->framebuffer.cbufs[i]->texture);
4541 
4542          /*
4543           * Figure out if this is a 1d resource. Note that OpenGL allows crazy
4544           * mixing of 2d textures with height 1 and 1d textures, so make sure
4545           * we pick 1d if any cbuf or zsbuf is 1d.
4546           */
4547          if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
4548             key->resource_1d = true;
4549          }
4550 
4551          const struct util_format_description *format_desc =
4552             util_format_description(format);
4553          assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
4554                 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
4555 
4556          /*
4557           * Mask out color channels not present in the color buffer.
4558           */
4559          blend_rt->colormask &= util_format_colormask(format_desc);
4560 
4561          /*
4562           * Disable blend for integer formats.
4563           */
4564          if (util_format_is_pure_integer(format)) {
4565             blend_rt->blend_enable = 0;
4566          }
4567 
4568          /*
4569           * Our swizzled render tiles always have an alpha channel, but the
4570           * linear render target format often does not, so force here the dst
4571           * alpha to be one.
4572           *
4573           * This is not a mere optimization. Wrong results will be produced if
4574           * the dst alpha is used, the dst format does not have alpha, and the
4575           * previous rendering was not flushed from the swizzled to linear
4576           * buffer. For example, NonPowTwo DCT.
4577           *
4578           * TODO: This should be generalized to all channels for better
4579           * performance, but only alpha causes correctness issues.
4580           *
4581           * Also, force rgb/alpha func/factors match, to make AoS blending
4582           * easier.
4583           */
4584          if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
4585              format_desc->swizzle[3] == format_desc->swizzle[0]) {
4586             // Doesn't cover mixed snorm/unorm but can't render to them anyway
4587             bool clamped_zero = !util_format_is_float(format) &&
4588                                 !util_format_is_snorm(format);
4589             blend_rt->rgb_src_factor =
4590                force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
4591             blend_rt->rgb_dst_factor =
4592                force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
4593             blend_rt->alpha_func       = blend_rt->rgb_func;
4594             blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
4595             blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
4596          }
4597       } else {
4598          /* no color buffer for this fragment output */
4599          key->cbuf_format[i] = PIPE_FORMAT_NONE;
4600          key->cbuf_nr_samples[i] = 0;
4601          blend_rt->colormask = 0x0;
4602          blend_rt->blend_enable = 0;
4603       }
4604    }
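   /*
    * Example of the dst-alpha fixup above: PIPE_FORMAT_B8G8R8X8_UNORM
    * has format_desc->swizzle[3] == PIPE_SWIZZLE_1 (> PIPE_SWIZZLE_W),
    * so any blend factor reading dst alpha is rewritten by
    * force_dst_alpha_one() before code generation.
    */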
4605 
4606    /* This value will be the same for all the variants of a given shader:
4607     */
4608    key->nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4609    key->nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4610 
4611    struct lp_sampler_static_state *fs_sampler =
4612       lp_fs_variant_key_samplers(key);
4613 
4614    memset(fs_sampler, 0,
4615           MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);
4616 
4617    for (unsigned i = 0; i < key->nr_samplers; ++i) {
4618       if (BITSET_TEST(nir->info.samplers_used, i)) {
4619          lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
4620                                          lp->samplers[PIPE_SHADER_FRAGMENT][i]);
4621       }
4622    }
4623 
4624    /*
4625     * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
4626     * are dx10-style? Can't really have mixed opcodes, at least not
4627     * if we want to skip the holes here (without rescanning tgsi).
4628     */
4629    if (key->nr_sampler_views) {
4630       for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4631          /*
4632           * Note sview may exceed what's representable by file_mask.
4633           * This will still work, the only downside is that not actually
4634           * used views may be included in the shader key.
4635           */
4636          if (BITSET_TEST(nir->info.textures_used, i)) {
4637             lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4638                                   lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4639          }
4640       }
4641    } else {
4642       key->nr_sampler_views = key->nr_samplers;
4643       for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4644          if (BITSET_TEST(nir->info.samplers_used, i)) {
4645             lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4646                                  lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4647          }
4648       }
4649    }
4650 
4651    struct lp_image_static_state *lp_image = lp_fs_variant_key_images(key);
4652    key->nr_images = BITSET_LAST_BIT(nir->info.images_used);
4653    if (key->nr_images)
4654       memset(lp_image, 0,
4655              key->nr_images * sizeof *lp_image);
4656    for (unsigned i = 0; i < key->nr_images; ++i) {
4657       if (BITSET_TEST(nir->info.images_used, i)) {
4658          lp_sampler_static_texture_state_image(&lp_image[i].image_state,
4659                                       &lp->images[PIPE_SHADER_FRAGMENT][i]);
4660       }
4661    }
4662 
4663    if (shader->kind == LP_FS_KIND_AERO_MINIFICATION) {
4664       struct lp_sampler_static_state *samp0 =
4665          lp_fs_variant_key_sampler_idx(key, 0);
4666       assert(samp0);
4667       samp0->sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST;
4668       samp0->sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
4669    }
4670 
4671    return key;
4672 }
4673 
4674 
4675 /**
4676  * Update fragment shader state.  This is called just prior to drawing
4677  * something when some fragment-related state has changed.
4678  */
4679 void
4680 llvmpipe_update_fs(struct llvmpipe_context *lp)
4681 {
4682    struct lp_fragment_shader *shader = lp->fs;
4683 
4684    char store[LP_FS_MAX_VARIANT_KEY_SIZE];
4685    const struct lp_fragment_shader_variant_key *key =
4686       make_variant_key(lp, shader, store);
4687 
4688    struct lp_fragment_shader_variant *variant = NULL;
4689    struct lp_fs_variant_list_item *li;
4690    /* Search the variants for one which matches the key */
4691    LIST_FOR_EACH_ENTRY(li, &shader->variants.list, list) {
4692       if (memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
4693          variant = li->base;
4694          break;
4695       }
4696    }
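   /*
    * The lookup is a linear walk with a raw memcmp over the variable
    * sized key; this relies on make_variant_key() zero-filling the key
    * first so that padding and unused slots compare equal.
    */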
4697 
4698    if (variant) {
4699       /* Move this variant to the head of the list to implement LRU
4700        * deletion of shaders when we have too many.
4701        */
4702       list_move_to(&variant->list_item_global.list, &lp->fs_variants_list.list);
4703    } else {
4704       /* variant not found, create it now */
4705 
4706       if (LP_DEBUG & DEBUG_FS) {
4707          debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
4708                       lp->nr_fs_variants,
4709                       lp->nr_fs_instrs,
4710                       lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
4711       }
4712 
4713       /* First, check if we've exceeded the max number of shader variants.
4714        * If so, free 6.25% of them (the least recently used ones).
4715        */
4716       const unsigned variants_to_cull =
4717          lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS
4718          ? LP_MAX_SHADER_VARIANTS / 16 : 0;
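      /*
       * LP_MAX_SHADER_VARIANTS / 16 is the 6.25% mentioned above;
       * evicting a batch at a time avoids re-running the eviction logic
       * for every subsequent new variant once the limit is hit.
       */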
4719 
4720       if (variants_to_cull ||
4721           lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
4722          if (gallivm_debug & GALLIVM_DEBUG_PERF) {
4723             debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
4724                          "\t%u instrs,\t%u instrs/variant\n",
4725                          shader->variants_cached,
4726                          lp->nr_fs_variants, lp->nr_fs_instrs,
4727                          lp->nr_fs_instrs / lp->nr_fs_variants);
4728          }
4729 
4730          /*
4731           * We need to re-check lp->nr_fs_variants because an arbitrarily
4732           * large number of shader variants (potentially all of them) could
4733           * be pending for destruction on flush.
4734           */
4735 
4736          for (unsigned i = 0;
4737               i < variants_to_cull ||
4738                  lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS;
4739               i++) {
4740             struct lp_fs_variant_list_item *item;
4741             if (list_is_empty(&lp->fs_variants_list.list)) {
4742                break;
4743             }
4744             item = list_last_entry(&lp->fs_variants_list.list,
4745                                    struct lp_fs_variant_list_item, list);
4746             assert(item);
4747             assert(item->base);
4748             llvmpipe_remove_shader_variant(lp, item->base);
4749             struct lp_fragment_shader_variant *variant = item->base;
4750             lp_fs_variant_reference(lp, &variant, NULL);
4751          }
4752       }
4753 
4754       /*
4755        * Generate the new variant.
4756        */
4757       int64_t t0 = os_time_get();
4758       variant = generate_variant(lp, shader, key);
4759       int64_t t1 = os_time_get();
4760       int64_t dt = t1 - t0;
4761       LP_COUNT_ADD(llvm_compile_time, dt);
4762       LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */
4763 
4764       /* Put the new variant into the list */
4765       if (variant) {
4766          list_add(&variant->list_item_local.list, &shader->variants.list);
4767          list_add(&variant->list_item_global.list, &lp->fs_variants_list.list);
4768          lp->nr_fs_variants++;
4769          lp->nr_fs_instrs += variant->nr_instrs;
4770          shader->variants_cached++;
4771       }
4772    }
4773 
4774    /* Bind this variant */
4775    lp_setup_set_fs_variant(lp->setup, variant);
4776 }
4777 
4778 
4779 void
4780 llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
4781 {
4782    llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
4783    llvmpipe->pipe.bind_fs_state   = llvmpipe_bind_fs_state;
4784    llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
4785    llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
4786    llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
4787    llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
4788 }
4789