/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * Copyright 2007 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/**
 * @file
 * Code generation for the whole fragment pipeline.
 *
 * The fragment pipeline consists of the following stages:
 * - early depth test
 * - fragment shader
 * - alpha test
 * - depth/stencil test
 * - blending
 *
 * This file has only the glue to assemble the fragment pipeline.  The actual
 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
 * lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we
 * muster the LLVM JIT execution engine to create a function that follows an
 * established binary interface and that can be called from C directly.
 *
 * A big source of complexity here is that we often want to run different
 * stages with different precisions and data types. For example, the fragment
 * shader typically needs to be done in floats, but the depth/stencil test and
 * blending are better done in the types that most closely match the
 * depth/stencil and color buffers, respectively.
 *
 * Since the width of a SIMD vector register stays the same regardless of the
 * element type, different types imply a different number of elements, so we
 * must code generate more instances of the stages with larger types to be
 * able to feed/consume the stages with smaller types.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */
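
/*
 * Illustrative example (not part of the original comments): with 128-bit
 * SIMD registers, the fragment shader runs on 4 x float32 vectors, while
 * blending into an RGBA8 color buffer is best done on 16 x unorm8 vectors.
 * Roughly four shader-stage iterations are therefore generated to fill one
 * blend-stage vector of the smaller, wider type.
 */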

#include <limits.h>
#include "pipe/p_defines.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_pointer.h"
#include "util/format/u_format.h"
#include "util/u_dump.h"
#include "util/u_string.h"
#include "util/u_dual_blend.h"
#include "util/u_upload_mgr.h"
#include "util/os_time.h"
#include "pipe/p_shader_tokens.h"
#include "draw/draw_context.h"
#include "nir/tgsi_to_nir.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_init.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_tgsi.h"
#include "gallivm/lp_bld_nir.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_pack.h"
#include "gallivm/lp_bld_format.h"
#include "gallivm/lp_bld_quad.h"
#include "gallivm/lp_bld_gather.h"
#include "gallivm/lp_bld_jit_sample.h"

#include "lp_bld_alpha.h"
#include "lp_bld_blend.h"
#include "lp_bld_depth.h"
#include "lp_bld_interp.h"
#include "lp_context.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_setup.h"
#include "lp_state.h"
#include "lp_tex_sample.h"
#include "lp_flush.h"
#include "lp_state_fs.h"
#include "lp_rast.h"
#include "nir/nir_to_tgsi_info.h"

#include "lp_screen.h"
#include "compiler/nir/nir_serialize.h"
#include "util/mesa-sha1.h"


/** Fragment shader number (for debugging) */
static unsigned fs_no = 0;


static void
load_unswizzled_block(struct gallivm_state *gallivm,
                      LLVMTypeRef base_type,
                      LLVMValueRef base_ptr,
                      LLVMValueRef stride,
                      unsigned block_width,
                      unsigned block_height,
                      LLVMValueRef *dst,
                      struct lp_type dst_type,
                      unsigned dst_count,
                      unsigned dst_alignment);

/**
 * Checks if a format description is an arithmetic format
 *
 * An arithmetic format is one with irregular channel sizes, such as
 * R3_G3_B2 or R5_G6_B5.
 */
static inline bool
is_arithmetic_format(const struct util_format_description *format_desc)
{
   bool arith = false;

   for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
      arith |= format_desc->channel[i].size != format_desc->channel[0].size;
      arith |= (format_desc->channel[i].size % 8) != 0;
   }

   return arith;
}


/**
 * Checks if this format requires special handling due to required expansion
 * to floats for blending, and furthermore has "natural" packed AoS ->
 * unpacked SoA conversion.
 */
static inline bool
format_expands_to_float_soa(const struct util_format_description *format_desc)
{
   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
      return true;
   }
   return false;
}

/**
 * Retrieves the type representing the memory layout for a format
 *
 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
 */
static inline void
lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
                             struct lp_type *type)
{
   if (format_expands_to_float_soa(format_desc)) {
      /* just make this a uint with width of block */
      type->floating = false;
      type->fixed = false;
      type->sign = false;
      type->norm = false;
      type->width = format_desc->block.bits;
      type->length = 1;
      return;
   }

   int chan = util_format_get_first_non_void_channel(format_desc->format);

   memset(type, 0, sizeof(struct lp_type));
   type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
   type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
   type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
   type->norm     = format_desc->channel[chan].normalized;

   if (is_arithmetic_format(format_desc)) {
      type->width = 0;
      type->length = 1;

      for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
         type->width += format_desc->channel[i].size;
      }
   } else {
      type->width = format_desc->channel[chan].size;
      type->length = format_desc->nr_channels;
   }
}
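
/*
 * Example results (derived from the function above, for illustration):
 * - PIPE_FORMAT_R16G16B16A16_FLOAT: floating, width = 16, length = 4
 *   (4 x half-float).
 * - PIPE_FORMAT_R3G3B2_UNORM: arithmetic, hence norm, width = 3+3+2 = 8,
 *   length = 1 (a single byte).
 */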


/**
 * Expand the relevant bits of mask_input to an n*4-dword mask for the
 * n*4 pixels in n 2x2 quads.  This will set the n*4 elements of the
 * quad mask vector to 0 or ~0.
 * Quads are grouped as 01, 23 in two-quad mode, hence only 0 and 2 are
 * valid first_quad arguments when the fs length is 8.
 *
 * \param first_quad  which quad(s) of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(struct gallivm_state *gallivm,
                   struct lp_type fs_type,
                   unsigned first_quad,
                   unsigned sample,
                   LLVMValueRef mask_input) /* int64 */
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef bits[16];
   LLVMValueRef mask, bits_vec;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length <= ARRAY_SIZE(bits));
   struct lp_type mask_type = lp_int_type(fs_type);

   /*
    * mask_input >>= (quad * 4)
    */
   int shift;
   switch (first_quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      assert(fs_type.length == 4);
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      assert(fs_type.length == 4);
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder, mask_input,
                              lp_build_const_int64(gallivm, 16 * sample), "");
   mask_input = LLVMBuildTrunc(builder, mask_input, i32t, "");
   mask_input = LLVMBuildAnd(builder, mask_input,
                             lp_build_const_int32(gallivm, 0xffff), "");
   mask_input = LLVMBuildLShr(builder, mask_input,
                              LLVMConstInt(i32t, shift, 0), "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
   mask = lp_build_broadcast(gallivm,
                             lp_build_vec_type(gallivm, mask_type),
                             mask_input);

   for (int i = 0; i < fs_type.length / 4; i++) {
      unsigned j = 2 * (i % 2) + (i / 2) * 8;
      bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
      bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
      bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
      bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
   }
   bits_vec = LLVMConstVector(bits, fs_type.length);
   mask = LLVMBuildAnd(builder, mask, bits_vec, "");

   /*
    * mask = mask == bits ? ~0 : 0
    */
   mask = lp_build_compare(gallivm,
                           mask_type, PIPE_FUNC_EQUAL,
                           mask, bits_vec);

   return mask;
}
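
/*
 * Worked example (derived from generate_quad_mask() above): with
 * fs_type.length == 4, first_quad == 2 and sample == 0, shift is 8, so
 * bits 0, 1, 4 and 5 of (mask_input >> 8) are tested.  With the 4x4 stamp
 * laid out as bit = y*4 + x, those are stamp bits 8, 9, 12 and 13, i.e.
 * the 2x2 quad covering x = 0..1, y = 2..3.
 */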


#define EARLY_DEPTH_TEST  0x1
#define LATE_DEPTH_TEST   0x2
#define EARLY_DEPTH_WRITE 0x4
#define LATE_DEPTH_WRITE  0x8
#define EARLY_DEPTH_TEST_INFERRED  0x10 // only used with EARLY_DEPTH_TEST
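
/*
 * Example combination (mirroring the inference logic in generate_fs_loop()
 * below): a shader that uses discard, with depth writes enabled but no
 * stencil writemask, gets EARLY_DEPTH_TEST | LATE_DEPTH_WRITE |
 * EARLY_DEPTH_TEST_INFERRED: the test can run early to eliminate quads,
 * but the depth write must be deferred until the final kill mask is known.
 */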

static unsigned
get_cbuf_location(nir_variable *var, unsigned slot)
{
   return (var->data.location - FRAG_RESULT_DATA0) + var->data.index + slot;
}

static int
find_output_by_frag_result(struct nir_shader *shader,
                           gl_frag_result frag_result)
{
   nir_foreach_shader_out_variable(var, shader) {
      int slots = nir_variable_count_slots(var, var->type);
      for (unsigned s = 0; s < slots; s++) {
         if (var->data.location + var->data.index + s == frag_result)
            return var->data.driver_location + s;
      }
   }

   return -1;
}

/**
 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
 */
static LLVMValueRef
lp_llvm_viewport(LLVMTypeRef context_type,
                 LLVMValueRef context_ptr,
                 struct gallivm_state *gallivm,
                 LLVMValueRef viewport_index)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef ptr;
   LLVMValueRef res;
   struct lp_type viewport_type =
      lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);
   LLVMTypeRef vtype = lp_build_vec_type(gallivm, viewport_type);

   ptr = lp_jit_context_viewports(gallivm, context_type, context_ptr);
   ptr = LLVMBuildPointerCast(builder, ptr,
            LLVMPointerType(vtype, 0), "");

   res = lp_build_pointer_get2(builder, vtype, ptr, viewport_index);

   return res;
}


static LLVMValueRef
lp_build_depth_clamp(struct gallivm_state *gallivm,
                     LLVMBuilderRef builder,
                     bool depth_clamp,
                     bool restrict_depth,
                     struct lp_type type,
                     LLVMTypeRef context_type,
                     LLVMValueRef context_ptr,
                     LLVMTypeRef thread_data_type,
                     LLVMValueRef thread_data_ptr,
                     LLVMValueRef z)
{
   LLVMValueRef viewport, min_depth, max_depth;
   LLVMValueRef viewport_index;
   struct lp_build_context f32_bld;

   assert(type.floating);
   lp_build_context_init(&f32_bld, gallivm, type);

   if (restrict_depth)
      z = lp_build_clamp(&f32_bld, z, f32_bld.zero, f32_bld.one);

   if (!depth_clamp)
      return z;

   /*
    * Assumes clamping of the viewport index will occur in setup/gs. Value
    * is passed through the rasterization stage via lp_rast_shader_inputs.
    *
    * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
    *      semantics.
    */
   viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
                                                                   thread_data_type,
                                                                   thread_data_ptr);

   /*
    * Load the min and max depth from the lp_jit_context.viewports
    * array of lp_jit_viewport structures.
    */
   viewport = lp_llvm_viewport(context_type, context_ptr, gallivm, viewport_index);

   /* viewports[viewport_index].min_depth */
   min_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
   min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);

   /* viewports[viewport_index].max_depth */
   max_depth = LLVMBuildExtractElement(builder, viewport,
                  lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
   max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);

   /*
    * Clamp to the min and max depth values for the given viewport.
    */
   return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
}


static void
lp_build_sample_alpha_to_coverage(struct gallivm_state *gallivm,
                                  struct lp_type type,
                                  unsigned coverage_samples,
                                  LLVMValueRef num_loop,
                                  LLVMValueRef loop_counter,
                                  LLVMTypeRef coverage_mask_type,
                                  LLVMValueRef coverage_mask_store,
                                  LLVMValueRef alpha)
{
   struct lp_build_context bld;
   LLVMBuilderRef builder = gallivm->builder;
   float step = 1.0 / coverage_samples;

   lp_build_context_init(&bld, gallivm, type);
   for (unsigned s = 0; s < coverage_samples; s++) {
      LLVMValueRef alpha_ref_value = lp_build_const_vec(gallivm, type, step * s);
      LLVMValueRef test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value);

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, lp_build_const_int32(gallivm, s), num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_counter, "");
      LLVMValueRef s_mask_ptr = LLVMBuildGEP2(builder, coverage_mask_type,
                                              coverage_mask_store, &s_mask_idx, 1, "");
      LLVMValueRef s_mask = LLVMBuildLoad2(builder, coverage_mask_type, s_mask_ptr, "");
      s_mask = LLVMBuildAnd(builder, s_mask, test, "");
      LLVMBuildStore(builder, s_mask, s_mask_ptr);
   }
}
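
/*
 * For instance (derived from the loop above), with coverage_samples == 4
 * the per-sample alpha thresholds are 0.0, 0.25, 0.5 and 0.75, so a
 * fragment with alpha == 0.6 keeps samples 0, 1 and 2 covered and drops
 * sample 3.
 */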


struct lp_build_fs_llvm_iface {
   struct lp_build_fs_iface base;
   struct lp_build_interp_soa_context *interp;
   struct lp_build_for_loop_state *loop_state;
   LLVMTypeRef mask_type;
   LLVMValueRef mask_store;
   LLVMValueRef sample_id;
   LLVMValueRef color_ptr_ptr;
   LLVMValueRef color_stride_ptr;
   LLVMValueRef color_sample_stride_ptr;
   LLVMValueRef zs_base_ptr;
   LLVMValueRef zs_stride;
   LLVMValueRef zs_sample_stride;
   const struct lp_fragment_shader_variant_key *key;
};


static LLVMValueRef
fs_interp(const struct lp_build_fs_iface *iface,
          struct lp_build_context *bld,
          unsigned attrib, unsigned chan,
          bool centroid, bool sample,
          LLVMValueRef attrib_indir,
          LLVMValueRef offsets[2])
{
   struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
   struct lp_build_interp_soa_context *interp = fs_iface->interp;
   unsigned loc = TGSI_INTERPOLATE_LOC_CENTER;
   if (centroid)
      loc = TGSI_INTERPOLATE_LOC_CENTROID;
   if (sample)
      loc = TGSI_INTERPOLATE_LOC_SAMPLE;

   return lp_build_interp_soa(interp, bld->gallivm, fs_iface->loop_state->counter,
                              fs_iface->mask_type, fs_iface->mask_store,
                              attrib, chan, loc, attrib_indir, offsets);
}


/**
 * Convert depth-stencil format to a single component one, returning
 * PIPE_FORMAT_NONE if it doesn't contain the required component.
 */
static enum pipe_format
select_zs_component_format(enum pipe_format format,
                           bool fetch_stencil)
{
   const struct util_format_description *desc = util_format_description(format);
   if (fetch_stencil && !util_format_has_stencil(desc))
      return PIPE_FORMAT_NONE;
   if (!fetch_stencil && !util_format_has_depth(desc))
      return PIPE_FORMAT_NONE;

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      return fetch_stencil ? PIPE_FORMAT_X24S8_UINT : PIPE_FORMAT_Z24X8_UNORM;
   case PIPE_FORMAT_S8_UINT_Z24_UNORM:
      return fetch_stencil ? PIPE_FORMAT_S8X24_UINT : PIPE_FORMAT_X8Z24_UNORM;
   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
      return fetch_stencil ? PIPE_FORMAT_X32_S8X24_UINT : format;
   default:
      return format;
   }
}

static void
fs_fb_fetch(const struct lp_build_fs_iface *iface,
            struct lp_build_context *bld,
            int location,
            LLVMValueRef result[4])
{
   struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
   struct gallivm_state *gallivm = bld->gallivm;
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
   LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
   LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
   const struct lp_fragment_shader_variant_key *key = fs_iface->key;

   LLVMValueRef buf_ptr;
   LLVMValueRef stride;
   enum pipe_format buf_format;

   const bool fetch_stencil = location == FRAG_RESULT_STENCIL;
   const bool fetch_zs = fetch_stencil || location == FRAG_RESULT_DEPTH;
   if (fetch_zs) {
      buf_ptr = fs_iface->zs_base_ptr;
      stride = fs_iface->zs_stride;
      buf_format = select_zs_component_format(key->zsbuf_format, fetch_stencil);
   } else {
      assert(location >= FRAG_RESULT_DATA0 && location <= FRAG_RESULT_DATA7);
      const int cbuf = location - FRAG_RESULT_DATA0;
      LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);

      buf_ptr = LLVMBuildLoad2(builder, int8p_type,
                               LLVMBuildGEP2(builder, int8p_type,
                                             fs_iface->color_ptr_ptr, &index, 1, ""), "");
      stride = LLVMBuildLoad2(builder, int32_type,
                              LLVMBuildGEP2(builder, int32_type,
                                            fs_iface->color_stride_ptr, &index, 1, ""), "");
      buf_format = key->cbuf_format[cbuf];
   }

   const struct util_format_description *out_format_desc = util_format_description(buf_format);
   if (out_format_desc->format == PIPE_FORMAT_NONE) {
      result[0] = result[1] = result[2] = result[3] = bld->undef;
      return;
   }

   unsigned block_size = bld->type.length;
   unsigned block_height = key->resource_1d ? 1 : 2;
   unsigned block_width = block_size / block_height;

   if (key->multisample) {
      LLVMValueRef sample_stride;

      if (fetch_zs) {
         sample_stride = fs_iface->zs_sample_stride;
      } else {
         LLVMValueRef index = lp_build_const_int32(gallivm, location - FRAG_RESULT_DATA0);
         sample_stride = LLVMBuildLoad2(builder, int32_type,
                                       LLVMBuildGEP2(builder,
                                                     int32_type,
                                                     fs_iface->color_sample_stride_ptr,
                                                     &index, 1, ""), "");
      }

      LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, fs_iface->sample_id, "");
      buf_ptr = LLVMBuildGEP2(builder, int8_type,
                              buf_ptr, &sample_offset, 1, "");
   }

   /* The fragment shader executes on 4x4 blocks; depending on the vector
    * width this takes 2 or 4 iterations per block.  Only move to the next
    * row once the top row has completed: one iteration at 8 wide, two at
    * 4 wide.
    */
   LLVMValueRef x_offset = NULL, y_offset = NULL;
   if (!key->resource_1d) {
      LLVMValueRef counter = fs_iface->loop_state->counter;

      if (block_size == 4) {
         x_offset = LLVMBuildShl(builder,
                                 LLVMBuildAnd(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""),
                                 lp_build_const_int32(gallivm, 1), "");
         counter = LLVMBuildLShr(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), "");
      }
      y_offset = LLVMBuildMul(builder, counter, lp_build_const_int32(gallivm, 2), "");
   }

   LLVMValueRef offsets[4 * 4];
   for (unsigned i = 0; i < block_size; i++) {
      unsigned x = i % block_width;
      unsigned y = i / block_width;

      if (block_size == 8) {
         /* Remap the raw slots into the fragment shader execution order. */
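         /*
          * For illustration, the resulting mapping is:
          *   i = 0..7 -> (x,y) = (0,0) (1,0) (0,1) (1,1)
          *                       (2,0) (3,0) (2,1) (3,1),
          * i.e. two 2x2 quads side by side in a 4x2 block.
          */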
         x = (i & 1) + ((i >> 2) << 1);
         if (!key->resource_1d)
            y = (i & 2) >> 1;
      }

      LLVMValueRef x_val;
      if (x_offset) {
         x_val = LLVMBuildAdd(builder, lp_build_const_int32(gallivm, x), x_offset, "");
         x_val = LLVMBuildMul(builder, x_val, lp_build_const_int32(gallivm, out_format_desc->block.bits / 8), "");
      } else {
         x_val = lp_build_const_int32(gallivm, x * (out_format_desc->block.bits / 8));
      }

      LLVMValueRef y_val = lp_build_const_int32(gallivm, y);
      if (y_offset)
         y_val = LLVMBuildAdd(builder, y_val, y_offset, "");
      y_val = LLVMBuildMul(builder, y_val, stride, "");

      offsets[i] = LLVMBuildAdd(builder, x_val, y_val, "");
   }
   LLVMValueRef offset = lp_build_gather_values(gallivm, offsets, block_size);

   struct lp_type texel_type = bld->type;
   if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
       out_format_desc->channel[0].pure_integer) {
      if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
         texel_type = lp_type_int_vec(bld->type.width, bld->type.width * bld->type.length);
      } else if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
         texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
      }
   } else if (fetch_stencil) {
      texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
   }

   lp_build_fetch_rgba_soa(gallivm, out_format_desc, texel_type,
                           true, buf_ptr, offset,
                           NULL, NULL, NULL, result);
}

/**
 * Generate the fragment shader, depth/stencil test, and alpha tests.
 */
static void
generate_fs_loop(struct gallivm_state *gallivm,
                 struct lp_fragment_shader *shader,
                 const struct lp_fragment_shader_variant_key *key,
                 LLVMBuilderRef builder,
                 struct lp_type type,
                 LLVMTypeRef context_type,
                 LLVMValueRef context_ptr,
                 LLVMTypeRef resources_type,
                 LLVMValueRef resources_ptr,
                 LLVMTypeRef sample_pos_type,
                 LLVMValueRef sample_pos_array,
                 LLVMValueRef num_loop,
                 struct lp_build_interp_soa_context *interp,
                 const struct lp_build_sampler_soa *sampler,
                 const struct lp_build_image_soa *image,
                 LLVMTypeRef mask_type,
                 LLVMValueRef mask_store,
                 LLVMValueRef (*out_color)[4],
                 LLVMValueRef depth_base_ptr,
                 LLVMValueRef depth_stride,
                 LLVMValueRef depth_sample_stride,
                 LLVMValueRef color_ptr_ptr,
                 LLVMValueRef color_stride_ptr,
                 LLVMValueRef color_sample_stride_ptr,
                 LLVMValueRef facing,
                 LLVMTypeRef thread_data_type,
                 LLVMValueRef thread_data_ptr)
{
   struct lp_type int_type = lp_int_type(type);
   LLVMValueRef mask_ptr = NULL, mask_val = NULL;
   LLVMValueRef z;
   LLVMValueRef z_value, s_value;
   LLVMValueRef z_fb, s_fb;
   LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples);
   LLVMValueRef z_out = NULL, s_out = NULL;
   struct lp_build_for_loop_state loop_state, sample_loop_state = {0};
   struct lp_build_mask_context mask;
   struct nir_shader *nir = shader->base.ir.nir;
   const bool dual_source_blend = key->blend.rt[0].blend_enable &&
                                  util_blend_state_is_dual(&key->blend, 0);
   const bool post_depth_coverage = nir->info.fs.post_depth_coverage;

   struct lp_bld_tgsi_system_values system_values;

   memset(&system_values, 0, sizeof(system_values));

   /* truncate then sign extend. */
   system_values.front_facing =
      LLVMBuildTrunc(gallivm->builder, facing,
                     LLVMInt1TypeInContext(gallivm->context), "");
   system_values.front_facing =
      LLVMBuildSExt(gallivm->builder, system_values.front_facing,
                    LLVMInt32TypeInContext(gallivm->context), "");
   system_values.view_index =
      lp_jit_thread_data_raster_state_view_index(gallivm,
                                                 thread_data_type,
                                                 thread_data_ptr);

   unsigned depth_mode;
   const struct util_format_description *zs_format_desc = NULL;
   if (key->depth.enabled ||
       key->stencil[0].enabled) {
      zs_format_desc = util_format_description(key->zsbuf_format);

      if (nir->info.fs.early_fragment_tests || nir->info.fs.post_depth_coverage) {
         depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
      } else if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
                 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) &&
                 !nir->info.fs.uses_fbfetch_output && !nir->info.writes_memory) {
         if (key->alpha.enabled ||
             key->blend.alpha_to_coverage ||
             nir->info.fs.uses_discard ||
             nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
            /* With alpha test and kill, we can do the depth test early
             * and hopefully eliminate some quads.  But we need to do a
             * special deferred depth write once the final mask value
             * is known. This only works, though, if there's either no
             * stencil test or the stencil value isn't written.
             */
            if (key->stencil[0].enabled && (key->stencil[0].writemask ||
                                            (key->stencil[1].enabled &&
                                             key->stencil[1].writemask)))
               depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
            else
               depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
         } else {
            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
         }
      } else {
         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
      }

      if (!(key->depth.enabled && key->depth.writemask) &&
          !(key->stencil[0].enabled && (key->stencil[0].writemask ||
                                        (key->stencil[1].enabled &&
                                         key->stencil[1].writemask))))
         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
   } else {
      depth_mode = 0;
   }

   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, int_type);

   LLVMValueRef stencil_refs[2];
   stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_type, context_ptr);
   stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_type, context_ptr);
   /* convert scalar stencil refs into vectors */
   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);

   LLVMValueRef consts_ptr = lp_jit_resources_constants(gallivm, resources_type, resources_ptr);

   LLVMValueRef ssbo_ptr = lp_jit_resources_ssbos(gallivm, resources_type, resources_ptr);

   LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
   memset(outputs, 0, sizeof outputs);

   /* Allocate color storage for each fragment sample */
   LLVMValueRef color_store_size = num_loop;
   if (key->min_samples > 1)
      color_store_size = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, key->min_samples), "");

   for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
      for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
                                                       lp_build_vec_type(gallivm,
                                                                         type),
                                                       color_store_size, "color");
      }
   }
   if (dual_source_blend) {
      assert(key->nr_cbufs <= 1);
      for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
         out_color[1][chan] = lp_build_array_alloca(gallivm,
                                                    lp_build_vec_type(gallivm,
                                                                      type),
                                                    color_store_size, "color1");
      }
   }
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      z_out = lp_build_array_alloca(gallivm,
                                    lp_build_vec_type(gallivm, type),
                                    color_store_size, "depth");
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      s_out = lp_build_array_alloca(gallivm,
                                    lp_build_vec_type(gallivm, type),
                                    color_store_size, "stencil");
   }

   lp_build_for_loop_begin(&loop_state, gallivm,
                           lp_build_const_int32(gallivm, 0),
                           LLVMIntULT,
                           num_loop,
                           lp_build_const_int32(gallivm, 1));

   LLVMValueRef sample_mask_in;
   if (key->multisample) {
      sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
      /* create shader execution mask by combining all sample masks. */
      for (unsigned s = 0; s < key->coverage_samples; s++) {
         LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
         s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
         LLVMValueRef s_mask = lp_build_pointer_get2(builder, mask_type, mask_store, s_mask_idx);
         if (s == 0)
            mask_val = s_mask;
         else
            mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");

         LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1ll << s)), "");
         sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
      }
   } else {
      sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
      mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
                              &loop_state.counter, 1, "mask_ptr");
      mask_val = LLVMBuildLoad2(builder, mask_type, mask_ptr, "");

      LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
      sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
   }

   /* 'mask' will control execution based on quad's pixel alive/killed state */
   lp_build_mask_begin(&mask, gallivm, type, mask_val);

   if (!(depth_mode & EARLY_DEPTH_TEST))
      lp_build_mask_check(&mask);

   /* Create storage for recombining sample masks after early Z pass. */
   LLVMValueRef s_mask_or = lp_build_alloca(gallivm, int_vec_type, "cov_mask_early_depth");
   LLVMBuildStore(builder, LLVMConstNull(int_vec_type), s_mask_or);

   /* Create storage for post depth sample mask */
   LLVMValueRef post_depth_sample_mask_in = NULL;
   if (post_depth_coverage)
      post_depth_sample_mask_in = lp_build_alloca(gallivm, int_vec_type, "post_depth_sample_mask_in");

   LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
   LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
   LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
   LLVMTypeRef z_type = NULL, z_fb_type = NULL;

   /* Run early depth once per sample */
   if (key->multisample) {

      if (zs_format_desc) {
         struct lp_type zs_type = lp_depth_type(zs_format_desc, type.length);
         struct lp_type z_type = zs_type;
         struct lp_type s_type = zs_type;
         if (zs_format_desc->block.bits < type.width)
            z_type.width = type.width;
         if (zs_format_desc->block.bits == 8) {
            s_type.width = type.width;
         } else if (zs_format_desc->block.bits > 32) {
            z_type.width = z_type.width / 2;
            s_type.width = s_type.width / 2;
            s_type.floating = 0;
         }
         z_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
                                                      zs_samples, "z_sample_store");
         s_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
                                                      zs_samples, "s_sample_store");
         z_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, z_type),
                                            zs_samples, "z_fb_store");
         s_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, s_type),
                                            zs_samples, "s_fb_store");
      }
      lp_build_for_loop_begin(&sample_loop_state, gallivm,
                              lp_build_const_int32(gallivm, 0),
                              LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
                              lp_build_const_int32(gallivm, 1));

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
      s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");

      s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
      s_mask = LLVMBuildAnd(builder, s_mask, mask_val, "");
   }


   /* For multisample, Z needs to be interpolated at sample points for testing. */
   lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter,
                                      key->multisample
                                      ? sample_loop_state.counter : NULL);
   z = interp->pos[2];

   LLVMValueRef depth_ptr = depth_base_ptr;
   if (key->multisample) {
      LLVMValueRef sample_offset =
         LLVMBuildMul(builder, sample_loop_state.counter,
                      depth_sample_stride, "");
      depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
                                depth_ptr, &sample_offset, 1, "");
   }

   if (depth_mode & EARLY_DEPTH_TEST) {
      z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
                               key->restrict_depth_values, type,
                               context_type, context_ptr,
                               thread_data_type, thread_data_ptr, z);

      lp_build_depth_stencil_load_swizzled(gallivm, type,
                                           zs_format_desc, key->resource_1d,
                                           depth_ptr, depth_stride,
                                           &z_fb, &s_fb, loop_state.counter);
      lp_build_depth_stencil_test(gallivm,
                                  &key->depth,
                                  key->stencil,
                                  type,
                                  zs_format_desc,
                                  key->multisample ? NULL : &mask,
                                  &s_mask,
                                  stencil_refs,
                                  z, z_fb, s_fb,
                                  facing,
                                  &z_value, &s_value,
                                  !key->multisample,
                                  key->restrict_depth_values);

      if (depth_mode & EARLY_DEPTH_WRITE) {
         lp_build_depth_stencil_write_swizzled(gallivm, type,
                                               zs_format_desc, key->resource_1d,
                                               NULL, NULL, NULL, loop_state.counter,
                                               depth_ptr, depth_stride,
                                               z_value, s_value);
      }
      /*
       * Note: if stencil is enabled, the mask check must come after the ds
       * write, not after the stencil test; otherwise new stencil values may
       * not get written if all fragments got killed by the depth/stencil
       * test.
       */
      if (key->stencil[0].enabled && !key->multisample)
         lp_build_mask_check(&mask);

      if (key->multisample) {
         z_fb_type = LLVMTypeOf(z_fb);
         z_type = LLVMTypeOf(z_value);
         lp_build_pointer_set(builder, z_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, z_value, lp_build_int_vec_type(gallivm, type), ""));
         lp_build_pointer_set(builder, s_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, s_value, lp_build_int_vec_type(gallivm, type), ""));
         lp_build_pointer_set(builder, z_fb_store, sample_loop_state.counter, z_fb);
         lp_build_pointer_set(builder, s_fb_store, sample_loop_state.counter, s_fb);
      }
      if (key->occlusion_count && !(depth_mode & EARLY_DEPTH_TEST_INFERRED)) {
         LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
         lp_build_name(counter, "counter");
         lp_build_occlusion_count(gallivm, type,
                                 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
      }
   }

   if (key->multisample) {
      /*
       * Store the post-early Z coverage mask.
       * Recombine the resulting coverage masks post early Z into the fragment
       * shader execution mask.
       */
      LLVMValueRef tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
      tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
      LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);

      if (post_depth_coverage) {
         LLVMValueRef mask_bit_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
         LLVMValueRef post_depth_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
         mask_bit_idx = LLVMBuildAnd(builder, s_mask, lp_build_broadcast(gallivm, int_vec_type, mask_bit_idx), "");
         post_depth_mask_in = LLVMBuildOr(builder, post_depth_mask_in, mask_bit_idx, "");
         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
      }

      LLVMBuildStore(builder, s_mask, s_mask_ptr);

      lp_build_for_loop_end(&sample_loop_state);

      /* Recombine all the coverage masks into the shader exec mask. */
      tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
      lp_build_mask_update(&mask, tmp_s_mask_or);

      if (key->min_samples == 1) {
         /* For multisample, Z needs to be re-interpolated at the pixel center. */
         lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
         z = interp->pos[2];
         lp_build_mask_update(&mask, tmp_s_mask_or);
      }
   } else {
      if (post_depth_coverage) {
         LLVMValueRef post_depth_mask_in = LLVMBuildAnd(builder, lp_build_mask_value(&mask), lp_build_const_int_vec(gallivm, type, 1), "");
         LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
      }
   }

   LLVMValueRef out_sample_mask_storage = NULL;
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
      out_sample_mask_storage = lp_build_alloca(gallivm, int_vec_type, "write_mask");
      if (key->min_samples > 1)
         LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
   }

   if (post_depth_coverage) {
      system_values.sample_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
   } else {
      system_values.sample_mask_in = sample_mask_in;
   }
   if (key->multisample && key->min_samples > 1) {
      lp_build_for_loop_begin(&sample_loop_state, gallivm,
                              lp_build_const_int32(gallivm, 0),
                              LLVMIntULT,
                              lp_build_const_int32(gallivm, key->min_samples),
                              lp_build_const_int32(gallivm, 1));

      LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
      s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
      s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
      s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
      lp_build_mask_force(&mask, s_mask);
      lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
      system_values.sample_id = sample_loop_state.counter;
      system_values.sample_mask_in = LLVMBuildAnd(builder, system_values.sample_mask_in,
                                                  lp_build_broadcast(gallivm, int_vec_type,
                                                                     LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "")), "");
   } else {
      system_values.sample_id = lp_build_const_int32(gallivm, 0);
   }
   system_values.sample_pos = sample_pos_array;
   system_values.sample_pos_type = sample_pos_type;

   lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter,
                                         mask_type, mask_store, sample_loop_state.counter);

   struct lp_build_fs_llvm_iface fs_iface = {
     .base.interp_fn = fs_interp,
     .base.fb_fetch = fs_fb_fetch,
     .interp = interp,
     .loop_state = &loop_state,
     .sample_id = system_values.sample_id,
     .mask_type = mask_type,
     .mask_store = mask_store,
     .color_ptr_ptr = color_ptr_ptr,
     .color_stride_ptr = color_stride_ptr,
     .color_sample_stride_ptr = color_sample_stride_ptr,
     .zs_base_ptr = depth_base_ptr,
     .zs_stride = depth_stride,
     .zs_sample_stride = depth_sample_stride,
     .key = key,
   };

   struct lp_build_tgsi_params params;
   memset(&params, 0, sizeof(params));

   params.type = type;
   params.mask = &mask;
   params.fs_iface = &fs_iface.base;
   params.consts_ptr = consts_ptr;
   params.system_values = &system_values;
   params.inputs = interp->inputs;
   params.num_inputs = interp->num_attribs - 1;
   params.context_type = context_type;
   params.context_ptr = context_ptr;
   params.resources_type = resources_type;
   params.resources_ptr = resources_ptr;
   params.thread_data_type = thread_data_type;
   params.thread_data_ptr = thread_data_ptr;
   params.sampler = sampler;
   params.info = &shader->info.base;
   params.ssbo_ptr = ssbo_ptr;
   params.image = image;
   params.aniso_filter_table = lp_jit_resources_aniso_filter_table(gallivm, resources_type, resources_ptr);

   /* Build the actual shader */
   lp_build_nir_soa(gallivm, nir, &params, outputs);

   /* Alpha test */
   if (key->alpha.enabled) {
      int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);

      if (color0 != -1 && outputs[color0][3]) {
         const struct util_format_description *cbuf_format_desc;
         LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
         LLVMValueRef alpha_ref_value;

         alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_type, context_ptr);
         alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);

         cbuf_format_desc = util_format_description(key->cbuf_format[0]);

         lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
                             &mask, alpha, alpha_ref_value,
                             ((depth_mode & LATE_DEPTH_TEST) != 0) && !key->multisample);
      }
   }

   /* Emulate Alpha to Coverage with Alpha test */
   if (key->blend.alpha_to_coverage) {
      int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);

      if (color0 != -1 && outputs[color0][3]) {
         LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");

         if (!key->multisample) {
            lp_build_alpha_to_coverage(gallivm, type,
                                       &mask, alpha,
                                       (depth_mode & LATE_DEPTH_TEST) != 0);
         } else {
            lp_build_sample_alpha_to_coverage(gallivm, type, key->coverage_samples, num_loop,
                                              loop_state.counter,
                                              mask_type, mask_store, alpha);
         }
      }
   }

   if (key->blend.alpha_to_one) {
      nir_foreach_shader_out_variable(var, nir) {
         if (var->data.location < FRAG_RESULT_DATA0)
            continue;
         int slots = nir_variable_count_slots(var, var->type);
         for (unsigned s = 0; s < slots; s++) {
            unsigned cbuf = get_cbuf_location(var, s);
            if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend))
               if (outputs[cbuf][3]) {
                  LLVMBuildStore(builder, lp_build_const_vec(gallivm, type, 1.0),
                                 outputs[cbuf][3]);
               }
         }
      }
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
      LLVMValueRef output_smask = NULL;
      int smaski = find_output_by_frag_result(nir, FRAG_RESULT_SAMPLE_MASK);

      struct lp_build_context smask_bld;
      lp_build_context_init(&smask_bld, gallivm, int_type);

      assert(smaski >= 0);
      output_smask = LLVMBuildLoad2(builder, vec_type, outputs[smaski][0], "smask");
      output_smask = LLVMBuildBitCast(builder, output_smask, smask_bld.vec_type, "");
      if (!key->multisample && key->no_ms_sample_mask_out) {
         output_smask = lp_build_and(&smask_bld, output_smask, smask_bld.one);
         output_smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, output_smask, smask_bld.zero);
         lp_build_mask_update(&mask, output_smask);
      }

      if (key->min_samples > 1) {
         /* only the bit corresponding to this sample is to be used. */
         LLVMValueRef tmp_mask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "tmp_mask");
         LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
         LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, lp_build_broadcast(gallivm, int_vec_type, out_smask_idx), "");
         output_smask = LLVMBuildOr(builder, tmp_mask, smask_bit, "");
      }

      LLVMBuildStore(builder, output_smask, out_sample_mask_storage);
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      int pos0 = find_output_by_frag_result(nir, FRAG_RESULT_DEPTH);

      LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[pos0][2], "");
      LLVMValueRef idx = loop_state.counter;
      if (key->min_samples > 1)
         idx = LLVMBuildAdd(builder, idx,
                            LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
      LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
      LLVMBuildStore(builder, out, ptr);
   }

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
      int sten_out = find_output_by_frag_result(nir, FRAG_RESULT_STENCIL);

      LLVMValueRef out = LLVMBuildLoad2(builder, vec_type,
                                        outputs[sten_out][1], "output.s");
      LLVMValueRef idx = loop_state.counter;
      if (key->min_samples > 1)
         idx = LLVMBuildAdd(builder, idx,
                            LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
      LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
      LLVMBuildStore(builder, out, ptr);
   }

   bool has_cbuf0_write = false;
   /* Color write - per fragment sample */
   nir_foreach_shader_out_variable(var, nir) {
      if (var->data.location < FRAG_RESULT_DATA0)
         continue;
      int slots = nir_variable_count_slots(var, var->type);

      for (unsigned s = 0; s < slots; s++) {
         unsigned cbuf = get_cbuf_location(var, s);
         unsigned attrib = var->data.driver_location + s;
         if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)) {
            if (cbuf == 0) {
               /* XXX: there is an edge case with FB fetch where gl_FragColor and
                * gl_LastFragData[0] are used together. This creates both
                * FRAG_RESULT_COLOR and FRAG_RESULT_DATA* output variables. This
                * loop then writes to cbuf 0 twice, overwriting the correct value
                * from gl_FragColor with some garbage. This case is exercised in
                * one of the dEQP tests.  A similar bug can happen if
                * gl_SecondaryFragColorEXT and gl_LastFragData[1] are mixed in
                * the same fashion...  This workaround will break if
                * gl_LastFragData[0] goes in the outputs list before
                * gl_FragColor. This doesn't seem to happen though.
                */
               if (has_cbuf0_write)
                  continue;
               has_cbuf0_write = true;
            }

            for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
               if (outputs[attrib][chan]) {
                  /* XXX: just initialize outputs to point at colors[] and
                   * skip this.
                   */
                  LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[attrib][chan], "");
                  LLVMValueRef color_ptr;
                  LLVMValueRef color_idx = loop_state.counter;
                  if (key->min_samples > 1)
                     color_idx = LLVMBuildAdd(builder, color_idx,
                                              LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
                  color_ptr = LLVMBuildGEP2(builder, vec_type, out_color[cbuf][chan],
                                            &color_idx, 1, "");
                  lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
                  LLVMBuildStore(builder, out, color_ptr);
               }
            }
         }
      }
   }
1225 
1226    if (key->multisample && key->min_samples > 1) {
1227       LLVMBuildStore(builder, lp_build_mask_value(&mask), s_mask_ptr);
1228       lp_build_for_loop_end(&sample_loop_state);
1229    }
1230 
1231    if (key->multisample) {
1232       /* execute depth test for each sample */
1233       lp_build_for_loop_begin(&sample_loop_state, gallivm,
1234                               lp_build_const_int32(gallivm, 0),
1235                               LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
1236                               lp_build_const_int32(gallivm, 1));
1237 
1238       /* load the per-sample coverage mask */
1239       LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1240       s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1241       s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1242 
1243       /* combine the post-fragment-shader execution mask with the coverage mask. */
1244       s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1245       if (key->min_samples == 1)
1246          s_mask = LLVMBuildAnd(builder, s_mask, lp_build_mask_value(&mask), "");
1247 
1248       /* if the shader writes a sample mask, use it, but only if this
1249        * isn't genuine early-depth, to avoid breaking occlusion queries */
1250       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1251           (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & (EARLY_DEPTH_TEST_INFERRED)))) {
1252          LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1253          out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1254          LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1255          LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1256          LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1257          smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1258 
1259          s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1260       }
1261    }
1262 
1263    depth_ptr = depth_base_ptr;
1264    if (key->multisample) {
1265       LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
1266       depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
1267                                 depth_ptr, &sample_offset, 1, "");
1268    }
1269 
1270    /* Late Z test */
1271    if (depth_mode & LATE_DEPTH_TEST) {
1272       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1273          LLVMValueRef idx = loop_state.counter;
1274          if (key->min_samples > 1)
1275             idx = LLVMBuildAdd(builder, idx,
1276                                LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1277          LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1278          z = LLVMBuildLoad2(builder, vec_type, ptr, "output.z");
1279       } else {
1280          if (key->multisample) {
1281             lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL);
1282             z = interp->pos[2];
1283          }
1284       }
1285 
1286       /*
1287        * Clamp according to ARB_depth_clamp semantics.
1288        */
1289       z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
1290                                key->restrict_depth_values, type,
1291                                context_type, context_ptr,
1292                                thread_data_type, thread_data_ptr, z);
1293 
1294       if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1295          LLVMValueRef idx = loop_state.counter;
1296          if (key->min_samples > 1)
1297             idx = LLVMBuildAdd(builder, idx,
1298                                LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1299          LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1300          stencil_refs[0] = LLVMBuildLoad2(builder, vec_type, ptr, "output.s");
1301          /* there's only one value, and the spec says to discard additional bits */
1302          LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
1303          stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
1304          stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
1305          stencil_refs[1] = stencil_refs[0];
1306       }
1307 
1308       lp_build_depth_stencil_load_swizzled(gallivm, type,
1309                                            zs_format_desc, key->resource_1d,
1310                                            depth_ptr, depth_stride,
1311                                            &z_fb, &s_fb, loop_state.counter);
1312 
1313       lp_build_depth_stencil_test(gallivm,
1314                                   &key->depth,
1315                                   key->stencil,
1316                                   type,
1317                                   zs_format_desc,
1318                                   key->multisample ? NULL : &mask,
1319                                   &s_mask,
1320                                   stencil_refs,
1321                                   z, z_fb, s_fb,
1322                                   facing,
1323                                   &z_value, &s_value,
1324                                   false,
1325                                   key->restrict_depth_values);
1326       /* Late Z write */
1327       if (depth_mode & LATE_DEPTH_WRITE) {
1328          lp_build_depth_stencil_write_swizzled(gallivm, type,
1329                                                zs_format_desc, key->resource_1d,
1330                                                NULL, NULL, NULL, loop_state.counter,
1331                                                depth_ptr, depth_stride,
1332                                                z_value, s_value);
1333       }
1334    } else if ((depth_mode & EARLY_DEPTH_TEST) &&
1335               (depth_mode & LATE_DEPTH_WRITE)) {
1336       /* Need to apply a reduced mask to the depth write.  Reload the
1337        * depth value, update it from z_value/s_value with the new mask
1338        * value and write that out.
1339        */
1340       if (key->multisample) {
1341          z_value = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_sample_value_store, sample_loop_state.counter), z_type, "");
1342          s_value = lp_build_pointer_get2(builder, int_vec_type, s_sample_value_store, sample_loop_state.counter);
1343          z_fb = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_fb_store, sample_loop_state.counter), z_fb_type, "");
1344          s_fb = lp_build_pointer_get2(builder, int_vec_type, s_fb_store, sample_loop_state.counter);
1345       }
1346       lp_build_depth_stencil_write_swizzled(gallivm, type,
1347                                             zs_format_desc, key->resource_1d,
1348                                             key->multisample ? s_mask : lp_build_mask_value(&mask), z_fb, s_fb, loop_state.counter,
1349                                             depth_ptr, depth_stride,
1350                                             z_value, s_value);
1351    }
1352 
1353    if (key->occlusion_count && (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & EARLY_DEPTH_TEST_INFERRED))) {
1354       LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
1355       lp_build_name(counter, "counter");
1356 
1357       lp_build_occlusion_count(gallivm, type,
1358                                key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
1359    }
1360 
1361    /* if this is genuine early-depth in the shader, write the sample mask
1362     * now, after the occlusion count has been updated
1363     */
1364    if (key->multisample &&
1365        nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1366        (depth_mode & (EARLY_DEPTH_TEST_INFERRED | EARLY_DEPTH_TEST)) == EARLY_DEPTH_TEST) {
1367       /* the shader writes a sample mask; apply it */
1368       LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1369       out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1370       LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1371       LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1372       LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1373       smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1374 
1375       s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1376    }
1377 
1378 
1379    if (key->multisample) {
1380       /* store the sample mask for this loop */
1381       LLVMBuildStore(builder, s_mask, s_mask_ptr);
1382       lp_build_for_loop_end(&sample_loop_state);
1383    }
1384 
1385    mask_val = lp_build_mask_end(&mask);
1386    if (!key->multisample)
1387       LLVMBuildStore(builder, mask_val, mask_ptr);
1388    lp_build_for_loop_end(&loop_state);
1389 }
1390 
1391 
1392 /**
1393  * This function reorders pixels from the fragment shader's SoA layout to
1394  * the AoS memory layout.
1395  *
1396  * Fragment Shader outputs pixels in small 2x2 blocks
1397  *  e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
1398  *
1399  * However in memory pixels are stored in rows
1400  *  e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
1401  *
1402  * @param type            fragment shader type (4x or 8x float)
1403  * @param num_fs          number of entries in fs_src
1405  * @param dst_channels    number of output channels
1406  * @param fs_src          output from fragment shader
1407  * @param dst             pointer to store result
1408  * @param pad_inline      is channel padding inline or at end of row
1409  * @return                the number of dst vectors written
1410  */
1411 static int
1412 generate_fs_twiddle(struct gallivm_state *gallivm,
1413                     struct lp_type type,
1414                     unsigned num_fs,
1415                     unsigned dst_channels,
1416                     LLVMValueRef fs_src[][4],
1417                     LLVMValueRef* dst,
1418                     bool pad_inline)
1419 {
1420    LLVMValueRef src[16];
1421    unsigned pixels = type.length / 4;
1422    unsigned src_channels = dst_channels < 3 ? dst_channels : 4;
1423    unsigned src_count = num_fs * src_channels;
1424 
1425    assert(pixels == 2 || pixels == 1);
1426    assert(num_fs * src_channels <= ARRAY_SIZE(src));
1427 
1428    /*
1429     * Transpose from SoA -> AoS
1430     */
1431    for (unsigned i = 0; i < num_fs; ++i) {
1432       lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels,
1433                                &src[i * src_channels]);
1434    }
1435 
1436    /*
1437     * Pick transformation options
1438     */
1439    bool swizzle_pad = false;
1440    bool twiddle = false;
1441    bool split = false;
1442    unsigned reorder_group = 0;
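   /*
    * Summary of the cases below (pixels == 1 means 4-wide vectors,
    * pixels == 2 means 8-wide): one channel twiddles (8-wide vectors are
    * split in half first); two channels with 4-wide vectors reorder
    * individual vectors; three or more channels reorder vectors in pairs
    * when 4-wide and twiddle when 8-wide, with 3-channel end-of-row
    * padding additionally requiring the pad swizzle.  Anything else is
    * already in memory order.
    */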
1443 
1444    if (dst_channels == 1) {
1445       twiddle = true;
1446       if (pixels == 2) {
1447          split = true;
1448       }
1449    } else if (dst_channels == 2) {
1450       if (pixels == 1) {
1451          reorder_group = 1;
1452       }
1453    } else if (dst_channels > 2) {
1454       if (pixels == 1) {
1455          reorder_group = 2;
1456       } else {
1457          twiddle = true;
1458       }
1459 
1460       if (!pad_inline && dst_channels == 3 && pixels > 1) {
1461          swizzle_pad = true;
1462       }
1463    }
1464 
1465    /*
1466     * Split the src in half
1467     */
1468    if (split) {
1469       for (unsigned i = num_fs; i > 0; --i) {
1470          src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
1471          src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
1472       }
1473 
1474       src_count *= 2;
1475       type.length = 4;
1476    }
1477 
1478    /*
1479     * Ensure pixels are in memory order
1480     */
1481    if (reorder_group) {
1482       /* Twiddle pixels by reordering the array, e.g.:
1483        *
1484        * src_count =  8 -> 0 2 1 3 4 6 5 7
1485        * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
1486        */
1487       const unsigned reorder_sw[] = { 0, 2, 1, 3 };
1488 
1489       for (unsigned i = 0; i < src_count; ++i) {
1490          unsigned group = i / reorder_group;
1491          unsigned block = (group / 4) * 4 * reorder_group;
1492          unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
1493          dst[i] = src[j];
1494       }
1495    } else if (twiddle) {
1496       /* Twiddle pixels across elements of array */
1497       /*
1498        * XXX: we should avoid this in some cases, but would need to tell
1499        * lp_build_conv to reorder (or deal with it ourselves).
1500        */
1501       lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
1502    } else {
1503       /* Do nothing */
1504       memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
1505    }
1506 
1507    /*
1508     * Moves any padding between pixels to the end
1509     * e.g. RGBXRGBX -> RGBRGBXX
1510     */
1511    if (swizzle_pad) {
1512       unsigned char swizzles[16];
1513       unsigned elems = pixels * dst_channels;
1514 
1515       for (unsigned i = 0; i < type.length; ++i) {
1516          if (i < elems)
1517             swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
1518          else
1519             swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
1520       }
1521 
1522       for (unsigned i = 0; i < src_count; ++i) {
1523          dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles,
1524                                          type.length, type.length);
1525       }
1526    }
1527 
1528    return src_count;
1529 }
1530 
1531 
1532 /*
1533  * Untwiddle and transpose, much like the above.
1534  * However, this is after conversion, so we get packed vectors.
1535  * At this time we only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
1536  * the vectors will look like:
1537  * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
1538  * be swizzled here). Extending to 16bit should be trivial.
1539  * Should also be extended to handle twice wide vectors with AVX2...
1540  */
1541 static void
1542 fs_twiddle_transpose(struct gallivm_state *gallivm,
1543                      struct lp_type type,
1544                      LLVMValueRef *src,
1545                      unsigned src_count,
1546                      LLVMValueRef *dst)
1547 {
1548    struct lp_type type64, type16, type32;
1549    LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
1550    LLVMBuilderRef builder = gallivm->builder;
1551    LLVMValueRef tmp[4], shuf[8];
1552    for (unsigned j = 0; j < 2; j++) {
1553       shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
1554       shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
1555       shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
1556       shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
1557    }
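   /* shuf now selects { 0, 2, 1, 3, 4, 6, 5, 7 }: each group of four
    * elements gets its middle pair swapped, which is the untwiddle step
    * used below at 16-bit or 32-bit granularity.
    */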
1558 
1559    assert(src_count == 4 || src_count == 2 || src_count == 1);
1560    assert(type.width == 8);
1561    assert(type.length == 16);
1562 
1563    type8_t = lp_build_vec_type(gallivm, type);
1564 
1565    type64 = type;
1566    type64.length /= 8;
1567    type64.width *= 8;
1568    type64_t = lp_build_vec_type(gallivm, type64);
1569 
1570    type16 = type;
1571    type16.length /= 2;
1572    type16.width *= 2;
1573    type16_t = lp_build_vec_type(gallivm, type16);
1574 
1575    type32 = type;
1576    type32.length /= 4;
1577    type32.width *= 4;
1578    type32_t = lp_build_vec_type(gallivm, type32);
1579 
1580    lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
1581 
1582    if (src_count == 1) {
1583       /* transpose was no-op, just untwiddle */
1584       LLVMValueRef shuf_vec;
1585       shuf_vec = LLVMConstVector(shuf, 8);
1586       tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
1587       tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
1588       dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
1589    } else if (src_count == 2) {
1590       LLVMValueRef shuf_vec;
1591       shuf_vec = LLVMConstVector(shuf, 4);
1592 
1593       for (unsigned i = 0; i < 2; i++) {
1594          tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
1595          tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
1596          dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
1597       }
1598    } else {
1599       for (unsigned j = 0; j < 2; j++) {
1600          LLVMValueRef lo, hi, lo2, hi2;
1601          /*
1602           * Note that if we only really have 3 valid channels (rgb)
1603           * and we don't need alpha we could substitute an undef here
1604           * for the respective channel (causing llvm to drop the conversion
1605           * for alpha).
1606           */
1607          /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
1608          lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
1609          hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
1610          lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
1611          hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
1612          dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
1613          dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
1614       }
1615    }
1616 }
1617 
1618 
1619 /**
1620  * Load an unswizzled block of pixels from memory
1621  */
1622 static void
1623 load_unswizzled_block(struct gallivm_state *gallivm,
1624                       LLVMTypeRef base_type,
1625                       LLVMValueRef base_ptr,
1626                       LLVMValueRef stride,
1627                       unsigned block_width,
1628                       unsigned block_height,
1629                       LLVMValueRef* dst,
1630                       struct lp_type dst_type,
1631                       unsigned dst_count,
1632                       unsigned dst_alignment)
1633 {
1634    LLVMBuilderRef builder = gallivm->builder;
1635    const unsigned row_size = dst_count / block_height;
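   /* Each iteration loads one dst_type vector; row_size of them cover one
    * row of the block, so index i decomposes into an (x, y) position below.
    */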
1636 
1637    /* Ensure block exactly fits into dst */
1638    assert((block_width * block_height) % dst_count == 0);
1639 
1640    for (unsigned i = 0; i < dst_count; ++i) {
1641       unsigned x = i % row_size;
1642       unsigned y = i / row_size;
1643 
1644       LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
1645       LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1646 
1647       LLVMValueRef gep[2];
1648       LLVMValueRef dst_ptr;
1649 
1650       gep[0] = lp_build_const_int32(gallivm, 0);
1651       gep[1] = LLVMBuildAdd(builder, bx, by, "");
1652 
1653       dst_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1654       dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
1655                                  LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
1656 
1657       dst[i] = LLVMBuildLoad2(builder,
1658                               lp_build_vec_type(gallivm, dst_type),
1659                               dst_ptr, "");
1660 
1661       LLVMSetAlignment(dst[i], dst_alignment);
1662    }
1663 }
1664 
1665 
1666 /**
1667  * Store an unswizzled block of pixels to memory
1668  */
1669 static void
1670 store_unswizzled_block(struct gallivm_state *gallivm,
1671                        LLVMTypeRef base_type,
1672                        LLVMValueRef base_ptr,
1673                        LLVMValueRef stride,
1674                        unsigned block_width,
1675                        unsigned block_height,
1676                        LLVMValueRef src[],   // [src_count]
1677                        struct lp_type src_type,
1678                        unsigned src_count,
1679                        unsigned src_alignment)
1680 {
1681    LLVMBuilderRef builder = gallivm->builder;
1682    const unsigned row_size = src_count / block_height;
1683 
1684    /* Ensure src exactly fits into block */
1685    assert((block_width * block_height) % src_count == 0);
1686 
1687    for (unsigned i = 0; i < src_count; ++i) {
1688       unsigned x = i % row_size;
1689       unsigned y = i / row_size;
1690 
1691       LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
1692       LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1693 
1694       LLVMValueRef gep[2];
1695       LLVMValueRef src_ptr;
1696 
1697       gep[0] = lp_build_const_int32(gallivm, 0);
1698       gep[1] = LLVMBuildAdd(builder, bx, by, "");
1699 
1700       src_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1701       src_ptr = LLVMBuildBitCast(builder, src_ptr,
1702                                  LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
1703 
1704       src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
1705 
1706       LLVMSetAlignment(src_ptr, src_alignment);
1707    }
1708 }
1709 
1710 
1711 
1712 /**
1713  * Retrieves the type for a format which is usable in the blending code.
1714  *
1715  * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
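 * (B5G6R5, say, has a widest channel of 6 bits, rounded up to 8 below;
 * being an arithmetic format, its 3 channels are also padded to 4,
 * giving 4x byte as well.)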
1716  */
1717 static inline void
1718 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
1719                                struct lp_type* type)
1720 {
1721    if (format_expands_to_float_soa(format_desc)) {
1722       /* always use ordinary floats for blending */
1723       type->floating = true;
1724       type->fixed = false;
1725       type->sign = true;
1726       type->norm = false;
1727       type->width = 32;
1728       type->length = 4;
1729       return;
1730    }
1731 
1732    const int chan = util_format_get_first_non_void_channel(format_desc->format);
1733 
1734    memset(type, 0, sizeof(struct lp_type));
1735    type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
1736    type->fixed    = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
1737    type->sign     = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
1738    type->norm     = format_desc->channel[chan].normalized;
1739    type->width    = format_desc->channel[chan].size;
1740    type->length   = format_desc->nr_channels;
1741 
1742    for (unsigned i = 1; i < format_desc->nr_channels; ++i) {
1743       if (format_desc->channel[i].size > type->width)
1744          type->width = format_desc->channel[i].size;
1745    }
1746 
1747    if (type->floating) {
1748       type->width = 32;
1749    } else {
1750       if (type->width <= 8) {
1751          type->width = 8;
1752       } else if (type->width <= 16) {
1753          type->width = 16;
1754       } else {
1755          type->width = 32;
1756       }
1757    }
1758 
1759    if (is_arithmetic_format(format_desc) && type->length == 3) {
1760       type->length = 4;
1761    }
1762 }
1763 
1764 
1765 /**
1766  * Scale a normalized value from src_bits to dst_bits.
1767  *
1768  * The exact calculation is
1769  *
1770  *    dst = iround(src * dst_mask / src_mask)
1771  *
1772  *  or with integer rounding
1773  *
1774  *    dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
1775  *
1776  *  where
1777  *
1778  *    src_mask = (1 << src_bits) - 1
1779  *    dst_mask = (1 << dst_bits) - 1
1780  *
1781  * but we try to avoid the division and multiplication by using shifts.
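 *
 * For example, scaling 5 bits up to 8 becomes
 *
 *    dst = (src << 3) | (src >> 2)
 *
 * (the high source bits are replicated into the low destination bits,
 * which matches iround(src * 255 / 31) for every 5-bit input), while
 * scaling 8 bits down to 5 is approximated by a plain dst = src >> 3.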
1782  */
1783 static inline LLVMValueRef
1784 scale_bits(struct gallivm_state *gallivm,
1785            int src_bits,
1786            int dst_bits,
1787            LLVMValueRef src,
1788            struct lp_type src_type)
1789 {
1790    LLVMBuilderRef builder = gallivm->builder;
1791    LLVMValueRef result = src;
1792 
1793    if (dst_bits < src_bits) {
1794       int delta_bits = src_bits - dst_bits;
1795 
1796       if (delta_bits <= dst_bits) {
1797 
1798          if (dst_bits == 4) {
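            /* A plain shift would give the wrong rounding (see below),
             * which is too coarse at only 4 destination bits, so
             * round-trip through float for a correctly rounded result.
             */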
1799             struct lp_type flt_type =
1800                lp_type_float_vec(32, src_type.length * 32);
1801 
1802             result = lp_build_unsigned_norm_to_float(gallivm, src_bits,
1803                                                      flt_type, src);
1804             result = lp_build_clamped_float_to_unsigned_norm(gallivm, flt_type,
1805                                                              dst_bits, result);
1806             result = LLVMBuildTrunc(gallivm->builder, result,
1807                                     lp_build_int_vec_type(gallivm, src_type),
1808                                     "");
1809             return result;
1810          }
1811 
1812          /*
1813           * Approximate the rescaling with a single shift.
1814           *
1815           * This gives the wrong rounding.
1816           */
1817 
1818          result = LLVMBuildLShr(builder, src,
1819                                 lp_build_const_int_vec(gallivm, src_type,
1820                                                        delta_bits),
1821                                 "");
1822       } else {
1823          /*
1824           * Try more accurate rescaling.
1825           */
1826 
1827          /*
1828           * Drop the least significant bits to make space for the
1829           * multiplication.
1830           *
1831           * XXX: A better approach would be to use a wider integer type as
1832           * intermediate.  But this is enough to convert alpha from 16bits ->
1833           * 2 when rendering to PIPE_FORMAT_R10G10B10A2_UNORM.
1834           */
1835          result = LLVMBuildLShr(builder, src,
1836                                 lp_build_const_int_vec(gallivm, src_type,
1837                                                        dst_bits),
1838                                 "");
1839 
1840 
1841          result = LLVMBuildMul(builder, result,
1842                                lp_build_const_int_vec(gallivm, src_type,
1843                                                       (1LL << dst_bits) - 1),
1844                                "");
1845 
1846          /*
1847           * Add a rounding term before the division.
1848           *
1849           * TODO: Handle signed integers too.
1850           */
1851          if (!src_type.sign) {
1852             result = LLVMBuildAdd(builder, result,
1853                                   lp_build_const_int_vec(gallivm, src_type,
1854                                                     (1LL << (delta_bits - 1))),
1855                                   "");
1856          }
1857 
1858          /*
1859           * Approximate the division by src_mask with a src_bits shift.
1860           *
1861           * Given the src has already been shifted by dst_bits, all we need
1862           * to do is to shift by the difference.
1863           */
1864 
1865          result = LLVMBuildLShr(builder,
1866                                 result,
1867                                 lp_build_const_int_vec(gallivm, src_type, delta_bits),
1868                                 "");
1869       }
1870 
1871    } else if (dst_bits > src_bits) {
1872       /* Scale up bits */
1873       int db = dst_bits - src_bits;
1874 
1875       /* Shift left by difference in bits */
1876       result = LLVMBuildShl(builder,
1877                             src,
1878                             lp_build_const_int_vec(gallivm, src_type, db),
1879                             "");
1880 
1881       if (db <= src_bits) {
1882          /* Enough bits in src to fill the remainder */
1883          LLVMValueRef lower = LLVMBuildLShr(builder,
1884                                             src,
1885                                             lp_build_const_int_vec(gallivm, src_type, src_bits - db),
1886                                             "");
1887 
1888          result = LLVMBuildOr(builder, result, lower, "");
1889       } else { /* db > src_bits */
1890          /* Need to repeatedly copy src bits to fill remainder in dst */
1891          unsigned n;
1892 
1893          for (n = src_bits; n < dst_bits; n *= 2) {
1894             LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
1895 
1896             result = LLVMBuildOr(builder,
1897                                  result,
1898                                  LLVMBuildLShr(builder, result, shuv, ""),
1899                                  "");
1900          }
1901       }
1902    }
1903 
1904    return result;
1905 }
1906 
1907 /**
1908  * Return whether the render target is a small float format (needing denorms).
1909  */
1910 static inline int
1911 have_smallfloat_format(struct lp_type dst_type,
1912                        enum pipe_format format)
1913 {
1914    return ((dst_type.floating && dst_type.width != 32) ||
1915     /* due to format handling hacks this format doesn't have floating set
1916      * here (and actually has width set to 32 too) so special case this.
1917      */
1918     (format == PIPE_FORMAT_R11G11B10_FLOAT));
1919 }
1920 
1921 
1922 /**
1923  * Convert from memory format to blending format
1924  *
1925  * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
1926  */
1927 static void
1928 convert_to_blend_type(struct gallivm_state *gallivm,
1929                       unsigned block_size,
1930                       const struct util_format_description *src_fmt,
1931                       struct lp_type src_type,
1932                       struct lp_type dst_type,
1933                       LLVMValueRef* src, // and dst
1934                       unsigned num_srcs)
1935 {
1936    LLVMValueRef *dst = src;
1937    LLVMBuilderRef builder = gallivm->builder;
1938    struct lp_type blend_type;
1939    struct lp_type mem_type;
1940    unsigned i, j;
1941    unsigned pixels = block_size / num_srcs;
1942    bool is_arith;
1943 
1944    /*
1945     * full custom path for packed floats and srgb formats - none of the later
1946     * functions would do anything useful, and given the lp_type representation
1947     * they can't be fixed. Should really have some SoA blend path for these
1948     * kinds of formats rather than hacking them in here.
1949     */
1950    if (format_expands_to_float_soa(src_fmt)) {
1951       LLVMValueRef tmpsrc[4];
1952       /*
1953        * This is pretty suboptimal for this case; blending in SoA would be much
1954        * better, since conversion gets us SoA values so we need to convert back.
1955        */
1956       assert(src_type.width == 32 || src_type.width == 16);
1957       assert(dst_type.floating);
1958       assert(dst_type.width == 32);
1959       assert(dst_type.length % 4 == 0);
1960       assert(num_srcs % 4 == 0);
1961 
1962       if (src_type.width == 16) {
1963          /* expand 4x16bit values to 4x32bit */
1964          struct lp_type type32x4 = src_type;
1965          LLVMTypeRef ltype32x4;
1966          unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
1967          type32x4.width = 32;
1968          ltype32x4 = lp_build_vec_type(gallivm, type32x4);
1969          for (i = 0; i < num_fetch; i++) {
1970             src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
1971          }
1972          src_type.width = 32;
1973       }
1974       for (i = 0; i < 4; i++) {
1975          tmpsrc[i] = src[i];
1976       }
1977       for (i = 0; i < num_srcs / 4; i++) {
1978          LLVMValueRef tmpsoa[4];
1979          LLVMValueRef tmps = tmpsrc[i];
1980          if (dst_type.length == 8) {
1981             LLVMValueRef shuffles[8];
1982             unsigned j;
1983             /* fetch was 4 values but need 8-wide output values */
1984             tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
1985             /*
1986              * for 8-wide aos the transpose would give us the wrong order,
1987              * not matching the incoming converted fs values and mask. ARGH.
1988              */
1989             for (j = 0; j < 4; j++) {
1990                shuffles[j] = lp_build_const_int32(gallivm, j * 2);
1991                shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
1992             }
1993             tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
1994                                           LLVMConstVector(shuffles, 8), "");
1995          }
1996          if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
1997             lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
1998          } else {
1999             lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
2000          }
2001          lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
2002       }
2003       return;
2004    }
2005 
2006    lp_mem_type_from_format_desc(src_fmt, &mem_type);
2007    lp_blend_type_from_format_desc(src_fmt, &blend_type);
2008 
2009    /* Is the format arithmetic */
2010    is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
2011    is_arith &= !(mem_type.width == 16 && mem_type.floating);
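   /* e.g. for a packed format such as B5G6R5 the memory type is one 16-bit
    * word while the blend type is 4x8 bits, so the bit-shuffling path below
    * is required; packed half-floats are excluded here because the special
    * case right after handles them via lp_build_conv_auto.
    */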
2012 
2013    /* Pad if necessary */
2014    if (!is_arith && src_type.length < dst_type.length) {
2015       for (i = 0; i < num_srcs; ++i) {
2016          dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
2017       }
2018 
2019       src_type.length = dst_type.length;
2020    }
2021 
2022    /* Special case for half-floats */
2023    if (mem_type.width == 16 && mem_type.floating) {
2024       assert(blend_type.width == 32 && blend_type.floating);
2025       lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2026       is_arith = false;
2027    }
2028 
2029    if (!is_arith) {
2030       return;
2031    }
2032 
2033    src_type.width = blend_type.width * blend_type.length;
2034    blend_type.length *= pixels;
2035    src_type.length *= pixels / (src_type.length / mem_type.length);
2036 
2037    for (i = 0; i < num_srcs; ++i) {
2038       LLVMValueRef chans;
2039       LLVMValueRef res = NULL;
2040 
2041       dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2042 
2043       for (j = 0; j < src_fmt->nr_channels; ++j) {
2044          unsigned mask = 0;
2045          unsigned sa = src_fmt->channel[j].shift;
2046 #if UTIL_ARCH_LITTLE_ENDIAN
2047          unsigned from_lsb = j;
2048 #else
2049          unsigned from_lsb = (blend_type.length / pixels) - j - 1;
2050 #endif
2051 
2052          mask = (1 << src_fmt->channel[j].size) - 1;
2053 
2054          /* Extract bits from source */
2055          chans = LLVMBuildLShr(builder,
2056                                dst[i],
2057                                lp_build_const_int_vec(gallivm, src_type, sa),
2058                                "");
2059 
2060          chans = LLVMBuildAnd(builder,
2061                               chans,
2062                               lp_build_const_int_vec(gallivm, src_type, mask),
2063                               "");
2064 
2065          /* Scale bits */
2066          if (src_type.norm) {
2067             chans = scale_bits(gallivm, src_fmt->channel[j].size,
2068                                blend_type.width, chans, src_type);
2069          }
2070 
2071          /* Insert bits into correct position */
2072          chans = LLVMBuildShl(builder,
2073                               chans,
2074                               lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
2075                               "");
2076 
2077          if (j == 0) {
2078             res = chans;
2079          } else {
2080             res = LLVMBuildOr(builder, res, chans, "");
2081          }
2082       }
2083 
2084       dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
2085    }
2086 }
2087 
2088 
2089 /**
2090  * Convert from blending format to memory format
2091  *
2092  * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
2093  */
2094 static void
2095 convert_from_blend_type(struct gallivm_state *gallivm,
2096                         unsigned block_size,
2097                         const struct util_format_description *src_fmt,
2098                         struct lp_type src_type,
2099                         struct lp_type dst_type,
2100                         LLVMValueRef* src, // and dst
2101                         unsigned num_srcs)
2102 {
2103    LLVMValueRef* dst = src;
2104    unsigned i, j, k;
2105    struct lp_type mem_type;
2106    struct lp_type blend_type;
2107    LLVMBuilderRef builder = gallivm->builder;
2108    unsigned pixels = block_size / num_srcs;
2109    bool is_arith;
2110 
2111    /*
2112     * full custom path for packed floats and srgb formats - none of the later
2113     * functions would do anything useful, and given the lp_type representation
2114     * they can't be fixed. Should really have some SoA blend path for these
2115     * kinds of formats rather than hacking them in here.
2116     */
2117    if (format_expands_to_float_soa(src_fmt)) {
2118       /*
2119        * This is pretty suboptimal for this case; blending in SoA would be much
2120        * better: we need to transpose the AoS values back to SoA values for
2121        * conversion/packing.
2122        */
2123       assert(src_type.floating);
2124       assert(src_type.width == 32);
2125       assert(src_type.length % 4 == 0);
2126       assert(dst_type.width == 32 || dst_type.width == 16);
2127 
2128       for (i = 0; i < num_srcs / 4; i++) {
2129          LLVMValueRef tmpsoa[4], tmpdst;
2130          lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
2131          /* really really need SoA here */
2132 
2133          if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2134             tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
2135          } else {
2136             tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
2137                                                    src_type, tmpsoa);
2138          }
2139 
2140          if (src_type.length == 8) {
2141             LLVMValueRef tmpaos, shuffles[8];
2142             unsigned j;
2143             /*
2144              * for 8-wide aos the transpose has given us the wrong order, not
2145              * matching the output order. HMPF. We also need to split the
2146              * output values manually.
2147              */
2148             for (j = 0; j < 4; j++) {
2149                shuffles[j * 2] = lp_build_const_int32(gallivm, j);
2150                shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
2151             }
2152             tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
2153                                             LLVMConstVector(shuffles, 8), "");
2154             src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
2155             src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
2156          } else {
2157             src[i] = tmpdst;
2158          }
2159       }
2160       if (dst_type.width == 16) {
2161          struct lp_type type16x8 = dst_type;
2162          struct lp_type type32x4 = dst_type;
2163          LLVMTypeRef ltype16x4, ltypei64, ltypei128;
2164          unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
2165          type16x8.length = 8;
2166          type32x4.width = 32;
2167          ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
2168          ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
2169          ltype16x4 = lp_build_vec_type(gallivm, dst_type);
2170          /* We could do vector truncation but it doesn't generate very good code */
2171          for (i = 0; i < num_fetch; i++) {
2172             src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
2173                                     src[i], lp_build_zero(gallivm, type32x4));
2174             src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
2175             src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
2176             src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
2177          }
2178       }
2179       return;
2180    }
2181 
2182    lp_mem_type_from_format_desc(src_fmt, &mem_type);
2183    lp_blend_type_from_format_desc(src_fmt, &blend_type);
2184 
2185    is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
2186 
2187    /* Special case for half-floats */
2188    if (mem_type.width == 16 && mem_type.floating) {
2189       int length = dst_type.length;
2190       assert(blend_type.width == 32 && blend_type.floating);
2191 
2192       dst_type.length = src_type.length;
2193 
2194       lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2195 
2196       dst_type.length = length;
2197       is_arith = false;
2198    }
2199 
2200    /* Remove any padding */
2201    if (!is_arith && (src_type.length % mem_type.length)) {
2202       src_type.length -= (src_type.length % mem_type.length);
2203 
2204       for (i = 0; i < num_srcs; ++i) {
2205          dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
2206       }
2207    }
2208 
2209    /* No bit arithmetic to do */
2210    if (!is_arith) {
2211       return;
2212    }
2213 
2214    src_type.length = pixels;
2215    src_type.width = blend_type.length * blend_type.width;
2216    dst_type.length = pixels;
2217 
2218    for (i = 0; i < num_srcs; ++i) {
2219       LLVMValueRef chans;
2220       LLVMValueRef res = NULL;
2221 
2222       dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2223 
2224       for (j = 0; j < src_fmt->nr_channels; ++j) {
2225          unsigned mask = 0;
2226          unsigned sa = src_fmt->channel[j].shift;
2227          unsigned sz_a = src_fmt->channel[j].size;
2228 #if UTIL_ARCH_LITTLE_ENDIAN
2229          unsigned from_lsb = j;
2230 #else
2231          unsigned from_lsb = blend_type.length - j - 1;
2232 #endif
2233 
2234          assert(blend_type.width > src_fmt->channel[j].size);
2235 
2236          for (k = 0; k < blend_type.width; ++k) {
2237             mask |= 1 << k;
2238          }
2239 
2240          /* Extract bits */
2241          chans = LLVMBuildLShr(builder,
2242                                dst[i],
2243                                lp_build_const_int_vec(gallivm, src_type,
2244                                                       from_lsb * blend_type.width),
2245                                "");
2246 
2247          chans = LLVMBuildAnd(builder,
2248                               chans,
2249                               lp_build_const_int_vec(gallivm, src_type, mask),
2250                               "");
2251 
2252          /* Scale down bits */
2253          if (src_type.norm) {
2254             chans = scale_bits(gallivm, blend_type.width,
2255                                src_fmt->channel[j].size, chans, src_type);
2256          } else if (!src_type.floating && sz_a < blend_type.width) {
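            /* non-normalized integers get clamped to the channel's range
             * rather than rescaled */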
2257             LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
2258             LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans, mask_val, "");
2259             chans = LLVMBuildSelect(builder, mask, mask_val, chans, "");
2260          }
2261 
2262          /* Insert bits */
2263          chans = LLVMBuildShl(builder,
2264                               chans,
2265                               lp_build_const_int_vec(gallivm, src_type, sa),
2266                               "");
2267 
2268          sa += src_fmt->channel[j].size;
2269 
2270          if (j == 0) {
2271             res = chans;
2272          } else {
2273             res = LLVMBuildOr(builder, res, chans, "");
2274          }
2275       }
2276 
2277       assert (dst_type.width != 24);
2278 
2279       dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
2280    }
2281 }
2282 
2283 
2284 /**
2285  * Convert alpha to the same blend type as src
2286  */
2287 static void
2288 convert_alpha(struct gallivm_state *gallivm,
2289               struct lp_type row_type,
2290               struct lp_type alpha_type,
2291               const unsigned block_size,
2292               const unsigned block_height,
2293               const unsigned src_count,
2294               const unsigned dst_channels,
2295               const bool pad_inline,
2296               LLVMValueRef* src_alpha)
2297 {
2298    LLVMBuilderRef builder = gallivm->builder;
2299    const unsigned length = row_type.length;
2300    row_type.length = alpha_type.length;
2301 
2302    /* Twiddle the alpha to match pixels */
2303    lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);
2304 
2305    /*
2306     * TODO this should use a single lp_build_conv call for the
2307     * src_count == 1 && dst_channels == 1 case (dropping the concat below)
2308     */
2309    for (unsigned i = 0; i < block_height; ++i) {
2310       lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1,
2311                     &src_alpha[i], 1);
2312    }
2313 
2314    alpha_type = row_type;
2315    row_type.length = length;
2316 
2317    /* If there is only one channel we only need a single alpha value per pixel */
2318    if (src_count == 1 && dst_channels == 1) {
2319       lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height,
2320                         src_alpha, src_count);
2321    } else {
2322       /* If there are more srcs than rows then we need to split alpha up */
2323       if (src_count > block_height) {
2324          for (unsigned i = src_count; i > 0; --i) {
2325             unsigned pixels = block_size / src_count;
2326             unsigned idx = i - 1;
2327 
2328             src_alpha[idx] =
2329                lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
2330                                       (idx * pixels) % 4, pixels);
2331          }
2332       }
2333 
2334       /* If there is a src for each pixel, broadcast the alpha across the
2335        * whole row
2336        */
2337       if (src_count == block_size) {
2338          for (unsigned i = 0; i < src_count; ++i) {
2339             src_alpha[i] = lp_build_broadcast(gallivm,
2340                               lp_build_vec_type(gallivm, row_type), src_alpha[i]);
2341          }
2342       } else {
2343          unsigned pixels = block_size / src_count;
2344          unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
2345          unsigned alpha_span = 1;
2346          LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
2347 
2348          /* Check if we need 2 src_alphas for our shuffles */
2349          if (pixels > alpha_type.length) {
2350             alpha_span = 2;
2351          }
2352 
2353          /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
2354          for (unsigned j = 0; j < row_type.length; ++j) {
2355             if (j < pixels * channels) {
2356                shuffles[j] = lp_build_const_int32(gallivm, j / channels);
2357             } else {
2358                shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2359             }
2360          }
2361 
2362          for (unsigned i = 0; i < src_count; ++i) {
2363             unsigned idx1 = i, idx2 = i;
2364 
2365             if (alpha_span > 1) {
2366                idx1 *= alpha_span;
2367                idx2 = idx1 + 1;
2368             }
2369 
2370             src_alpha[i] = LLVMBuildShuffleVector(builder,
2371                                                   src_alpha[idx1],
2372                                                   src_alpha[idx2],
2373                                                   LLVMConstVector(shuffles, row_type.length),
2374                                                   "");
2375          }
2376       }
2377    }
2378 }
2379 
2380 
2381 /**
2382  * Generates the blend function for unswizzled colour buffers.
2383  * Also generates the read & write from the colour buffer.
2384  */
2385 static void
2386 generate_unswizzled_blend(struct gallivm_state *gallivm,
2387                           unsigned rt,
2388                           struct lp_fragment_shader_variant *variant,
2389                           enum pipe_format out_format,
2390                           unsigned int num_fs,
2391                           struct lp_type fs_type,
2392                           LLVMValueRef* fs_mask,
2393                           LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
2394                           LLVMTypeRef context_type,
2395                           LLVMValueRef context_ptr,
2396                           LLVMTypeRef color_type,
2397                           LLVMValueRef color_ptr,
2398                           LLVMValueRef stride,
2399                           unsigned partial_mask,
2400                           bool do_branch)
2401 {
2402    const unsigned alpha_channel = 3;
2403    const unsigned block_width = LP_RASTER_BLOCK_SIZE;
2404    const unsigned block_height = LP_RASTER_BLOCK_SIZE;
2405    const unsigned block_size = block_width * block_height;
2406    const unsigned lp_integer_vector_width = 128;
2407 
2408    LLVMBuilderRef builder = gallivm->builder;
2409    LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
2410    LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
2411    LLVMValueRef src_alpha[4 * 4];
2412    LLVMValueRef src1_alpha[4 * 4] = { NULL };
2413    LLVMValueRef src_mask[4 * 4];
2414    LLVMValueRef src[4 * 4];
2415    LLVMValueRef src1[4 * 4];
2416    LLVMValueRef dst[4 * 4];
2417 
2418    struct lp_build_mask_context mask_ctx;
2419 
2420    unsigned char swizzle[TGSI_NUM_CHANNELS];
2421    unsigned src_channels = TGSI_NUM_CHANNELS;
2422 
2423    const struct util_format_description *out_format_desc =
2424       util_format_description(out_format);
2425 
2426    bool pad_inline = is_arithmetic_format(out_format_desc);
2427    const bool dual_source_blend =
2428       variant->key.blend.rt[0].blend_enable &&
2429       util_blend_state_is_dual(&variant->key.blend, 0);
2430 
2431    const bool is_1d = variant->key.resource_1d;
2432    const unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
2433    LLVMValueRef fpstate = NULL;
2434 
2435    LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2436 
2437    /* Get type from output format */
2438    struct lp_type row_type, dst_type;
2439    lp_blend_type_from_format_desc(out_format_desc, &row_type);
2440    lp_mem_type_from_format_desc(out_format_desc, &dst_type);
2441 
2442    /*
2443     * Technically this code should go into lp_build_smallfloat_to_float
2444     * and lp_build_float_to_smallfloat but due to the
2445     * http://llvm.org/bugs/show_bug.cgi?id=6393
2446     * llvm reorders the mxcsr intrinsics in a way that breaks the code.
2447     * So the ordering is important here and there shouldn't be any
2448     * llvm ir instructions in this function before
2449     * this, otherwise half-float format conversions won't work
2450     * (again due to llvm bug #6393).
2451     */
2452    if (have_smallfloat_format(dst_type, out_format)) {
2453       /* We need to make sure that denorms are ok for half float
2454          conversions */
2455       fpstate = lp_build_fpstate_get(gallivm);
2456       lp_build_fpstate_set_denorms_zero(gallivm, false);
2457    }
2458 
2459    struct lp_type mask_type = lp_int32_vec4_type();
2460    mask_type.length = fs_type.length;
2461 
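   /* For 1d resources only half a block is valid; zero the masks for the
    * missing half so the rest of the function can assume a full block.
    */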
2462    for (unsigned i = num_fs; i < num_fullblock_fs; i++) {
2463       fs_mask[i] = lp_build_zero(gallivm, mask_type);
2464    }
2465 
2466    /* Do not bother executing code when the mask is empty. */
2467    if (do_branch) {
2468       LLVMValueRef check_mask =
2469          LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
2470 
2471       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2472          check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
2473       }
2474 
2475       lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
2476       lp_build_mask_check(&mask_ctx);
2477    }
2478 
2479    partial_mask |= !variant->opaque;
2480    LLVMValueRef i32_zero = lp_build_const_int32(gallivm, 0);
2481 
2482    LLVMValueRef undef_src_val = lp_build_undef(gallivm, fs_type);
2483 
2484    row_type.length = fs_type.length;
2485    unsigned vector_width =
2486       dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
2487 
2488    /* Compute correct swizzle and count channels */
2489    memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
2490    unsigned dst_channels = 0;
2491 
2492    bool has_alpha = false;
2493    for (unsigned i = 0; i < TGSI_NUM_CHANNELS; ++i) {
2494       /* Ensure channel is used */
2495       if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
2496          continue;
2497       }
2498 
2499       /* Ensure it is not already written to (happens e.g. with GL_ALPHA) */
2500       if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
2501          continue;
2502       }
2503 
2504       /* Ensure we haven't already found all channels */
2505       if (dst_channels >= out_format_desc->nr_channels) {
2506          continue;
2507       }
2508 
2509       swizzle[out_format_desc->swizzle[i]] = i;
2510       ++dst_channels;
2511 
2512       if (i == alpha_channel) {
2513          has_alpha = true;
2514       }
2515    }
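   /*
    * Worked example (illustrative, assuming a PIPE_FORMAT_B8G8R8A8_UNORM
    * target): out_format_desc->swizzle is {2,1,0,3}, so the loop above
    * yields
    *
    *    swizzle      = {2, 1, 0, 3}   (memory channel k reads shader
    *                                   output channel swizzle[k])
    *    dst_channels = 4
    *    has_alpha    = true           (channel 3 maps the alpha output)
    */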
2516 
2517    if (format_expands_to_float_soa(out_format_desc)) {
2518       /*
2519        * The code above can't work for layout_other.
2520        * For srgb it would sort of work, but we short-circuit swizzles, etc.,
2521        * as that is done as part of unpack / pack.
2522        */
2523       dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */
2524       has_alpha = true;
2525       swizzle[0] = 0;
2526       swizzle[1] = 1;
2527       swizzle[2] = 2;
2528       swizzle[3] = 3;
2529       pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
2530    }
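   /*
    * (The expands-to-float-SoA path above covers the formats that unpack
    * straight to float SoA, i.e. r11g11b10_float and the srgb colorspace
    * formats.)
    */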
2531 
2532    /* If 3 channels then pad to include alpha for 4 element transpose */
2533    if (dst_channels == 3) {
2534       assert(!has_alpha);
2535       for (unsigned i = 0; i < TGSI_NUM_CHANNELS; i++) {
2536          if (swizzle[i] > TGSI_NUM_CHANNELS)
2537             swizzle[i] = 3;
2538       }
2539       if (out_format_desc->nr_channels == 4) {
2540          dst_channels = 4;
2541          /*
2542           * We use alpha from the color conversion, not separate one.
2543           * We had to include it for transpose, hence it will get converted
2544           * too (albeit when doing transpose after conversion, that would
2545           * no longer be the case necessarily).
2546           * (It works only with 4 channel dsts, e.g. rgbx formats, because
2547           * otherwise we really have padding, not alpha, included.)
2548           */
2549          has_alpha = true;
2550       }
2551    }
2552 
2553    /*
2554     * Load shader output
2555     */
2556    for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2557       /* Always load alpha for use in blending */
2558       LLVMValueRef alpha;
2559       if (i < num_fs) {
2560          alpha = LLVMBuildLoad2(builder, fs_vec_type,
2561                                 fs_out_color[rt][alpha_channel][i], "");
2562       } else {
2563          alpha = undef_src_val;
2564       }
2565 
2566       /* Load each channel */
2567       for (unsigned j = 0; j < dst_channels; ++j) {
2568          assert(swizzle[j] < 4);
2569          if (i < num_fs) {
2570             fs_src[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2571                                           fs_out_color[rt][swizzle[j]][i], "");
2572          } else {
2573             fs_src[i][j] = undef_src_val;
2574          }
2575       }
2576 
2577       /* If 3 channels then pad to include alpha for 4 element transpose */
2578       /*
2579        * XXX If we include that here, maybe we could actually use it instead
2580        * of the separate alpha for blending?
2581        * (Difficult though, as we actually convert pad channels, not alpha.)
2582        */
2583       if (dst_channels == 3 && !has_alpha) {
2584          fs_src[i][3] = alpha;
2585       }
2586 
2587       /* We split the row_mask and row_alpha as we want 128bit interleave */
2588       if (fs_type.length == 8) {
2589          src_mask[i*2 + 0]  = lp_build_extract_range(gallivm, fs_mask[i],
2590                                                      0, src_channels);
2591          src_mask[i*2 + 1]  = lp_build_extract_range(gallivm, fs_mask[i],
2592                                                      src_channels,
2593                                                      src_channels);
2594 
2595          src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha,
2596                                                      0, src_channels);
2597          src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2598                                                      src_channels,
2599                                                      src_channels);
2600       } else {
2601          src_mask[i] = fs_mask[i];
2602          src_alpha[i] = alpha;
2603       }
2604    }
2605    if (dual_source_blend) {
2606       /* same as above except different src/dst, skip masks and comments... */
2607       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2608          LLVMValueRef alpha;
2609          if (i < num_fs) {
2610             alpha = LLVMBuildLoad2(builder, fs_vec_type,
2611                                    fs_out_color[1][alpha_channel][i], "");
2612          } else {
2613             alpha = undef_src_val;
2614          }
2615 
2616          for (unsigned j = 0; j < dst_channels; ++j) {
2617             assert(swizzle[j] < 4);
2618             if (i < num_fs) {
2619                fs_src1[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2620                                               fs_out_color[1][swizzle[j]][i], "");
2621             } else {
2622                fs_src1[i][j] = undef_src_val;
2623             }
2624          }
2625          if (dst_channels == 3 && !has_alpha) {
2626             fs_src1[i][3] = alpha;
2627          }
2628          if (fs_type.length == 8) {
2629             src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2630             src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2631                                                          src_channels, src_channels);
2632          } else {
2633             src1_alpha[i] = alpha;
2634          }
2635       }
2636    }
2637 
2638    if (util_format_is_pure_integer(out_format)) {
2639       /*
2640        * In this case fs_type was really ints or uints disguised as floats,
2641        * fix that up now.
2642        */
2643       fs_type.floating = 0;
2644       fs_type.sign = dst_type.sign;
2645       fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2646       for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2647          for (unsigned j = 0; j < dst_channels; ++j) {
2648             fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
2649                                             fs_vec_type, "");
2650          }
2651          if (dst_channels == 3 && !has_alpha) {
2652             fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
2653                                             fs_vec_type, "");
2654          }
2655       }
2656    }
2657 
2658    /*
2659     * We should actually generally do conversion first (for non-1d cases)
2660     * when the blend format is 8 or 16 bits. The reason is obvious:
2661     * there are 2 or 4 times fewer vectors to deal with for the interleave...
2662     * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
2663     * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
2664     * unpack only with 128bit vectors).
2665     * Note: for 16bit sizes we'd really need matching pack conversion code.
2666     */
2667    bool twiddle_after_convert = false;
2668    if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
2669       twiddle_after_convert = true;
2670    }
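   /*
    * Example (illustrative): a non-1d R8G8B8A8_UNORM target hits this
    * path, converting the rows down to 8 bits first and only then
    * transposing, so the later transpose touches 4x fewer vectors.
    */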
2671 
2672    /*
2673     * Pixel twiddle from fragment shader order to memory order
2674     */
2675    unsigned src_count;
2676    if (!twiddle_after_convert) {
2677       src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
2678                                       dst_channels, fs_src, src, pad_inline);
2679       if (dual_source_blend) {
2680          generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
2681                              fs_src1, src1, pad_inline);
2682       }
2683    } else {
2684       src_count = num_fullblock_fs * dst_channels;
2685       /*
2686        * We reorder things a bit here, so the cases for 4-wide and 8-wide
2687        * (AVX) turn out the same later when untwiddling/transposing (albeit
2688        * for the true AVX2 path the untwiddle needs to be different).
2689        * For now just order by colors first (so we can use unpack later).
2690        */
2691       for (unsigned j = 0; j < num_fullblock_fs; j++) {
2692          for (unsigned i = 0; i < dst_channels; i++) {
2693             src[i*num_fullblock_fs + j] = fs_src[j][i];
2694             if (dual_source_blend) {
2695                src1[i*num_fullblock_fs + j] = fs_src1[j][i];
2696             }
2697          }
2698       }
2699    }
2700 
2701    src_channels = dst_channels < 3 ? dst_channels : 4;
2702    if (src_count != num_fullblock_fs * src_channels) {
2703       unsigned ds = src_count / (num_fullblock_fs * src_channels);
2704       row_type.length /= ds;
2705       fs_type.length = row_type.length;
2706       fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2707    }
2708 
2709    struct lp_type blend_type = row_type;
2710    mask_type.length = 4;
2711 
2712    /* Convert src to row_type */
2713    if (dual_source_blend) {
2714       struct lp_type old_row_type = row_type;
2715       lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2716       src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type,
2717                                      src1, src_count, src1);
2718    } else {
2719       src_count = lp_build_conv_auto(gallivm, fs_type, &row_type,
2720                                      src, src_count, src);
2721    }
2722 
2723    /* If the rows are not an SSE vector, combine them to become SSE size! */
2724    if ((row_type.width * row_type.length) % 128) {
2725       unsigned bits = row_type.width * row_type.length;
2726       unsigned combined;
2727 
2728       assert(src_count >= (vector_width / bits));
2729 
2730       const unsigned dst_count = src_count / (vector_width / bits);
2731 
2732       combined = lp_build_concat_n(gallivm, row_type, src, src_count,
2733                                    src, dst_count);
2734       if (dual_source_blend) {
2735          lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
2736       }
2737 
2738       row_type.length *= combined;
2739       src_count /= combined;
2740 
2741       bits = row_type.width * row_type.length;
2742       assert(bits == 128 || bits == 256);
2743    }
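   /*
    * Worked example (illustrative): with 4 x i8 rows (32 bits) and a
    * 128-bit vector width, four rows are concatenated into each SSE-sized
    * vector, i.e. combined == 4, row_type.length grows to 16 and
    * src_count shrinks by the same factor of 4.
    */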
2744 
2745    if (twiddle_after_convert) {
2746       fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
2747       if (dual_source_blend) {
2748          fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
2749       }
2750    }
2751 
2752    /*
2753     * Blend Colour conversion
2754     */
2755    LLVMValueRef blend_color =
2756       lp_jit_context_f_blend_color(gallivm, context_type, context_ptr);
2757    blend_color = LLVMBuildPointerCast(builder, blend_color,
2758                                       LLVMPointerType(fs_vec_type, 0),
2759                                       "");
2760    blend_color = LLVMBuildLoad2(builder, fs_vec_type,
2761                                 LLVMBuildGEP2(builder, fs_vec_type,
2762                                               blend_color,
2763                                               &i32_zero, 1, ""), "");
2764 
2765    /* Convert */
2766    lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1,
2767                  &blend_color, 1);
2768 
2769    if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
2770       /*
2771        * Since blending is done with floats, there was no conversion.
2772        * However, the rules for fixed point renderbuffers still
2773        * apply, that is we must clamp inputs to 0.0/1.0.
2774        * (This would apply to separate alpha conversion too, but we currently
2775        * force has_alpha to be true.)
2776        * TODO: should skip this with "fake" blend, since post-blend conversion
2777        * will clamp anyway.
2778        * TODO: could also skip this if fragment color clamping is enabled.
2779        * We don't support that natively, however, so it gets baked into the
2780        * shader and we can't really tell here.
2781        */
2782       struct lp_build_context f32_bld;
2783       assert(row_type.floating);
2784       lp_build_context_init(&f32_bld, gallivm, row_type);
2785       for (unsigned i = 0; i < src_count; i++) {
2786          src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
2787       }
2788       if (dual_source_blend) {
2789          for (unsigned i = 0; i < src_count; i++) {
2790             src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
2791          }
2792       }
2793       /* probably can't be different from row_type, but better safe than sorry... */
2794       lp_build_context_init(&f32_bld, gallivm, blend_type);
2795       blend_color = lp_build_clamp(&f32_bld, blend_color,
2796                                    f32_bld.zero, f32_bld.one);
2797    }
2798 
2799    /* Extract alpha */
2800    LLVMValueRef blend_alpha =
2801       lp_build_extract_broadcast(gallivm, blend_type, row_type,
2802                                  blend_color,
2803                                  lp_build_const_int32(gallivm, 3));
2804 
2805    /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
2806    pad_inline &= (dst_channels * (block_size / src_count) * row_type.width)
2807       != vector_width;
2808    if (pad_inline) {
2809       /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
2810       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2811                                            TGSI_NUM_CHANNELS, row_type.length);
2812    } else {
2813       /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
2814       blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2815                                            dst_channels, row_type.length);
2816    }
2817 
2818    /*
2819     * Mask conversion
2820     */
2821    lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0],
2822                        block_height, &src_mask[0]);
2823 
2824    if (src_count < block_height) {
2825       lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
2826    } else if (src_count > block_height) {
2827       for (unsigned i = src_count; i > 0; --i) {
2828          unsigned pixels = block_size / src_count;
2829          unsigned idx = i - 1;
2830 
2831          src_mask[idx] = lp_build_extract_range(gallivm,
2832                                                 src_mask[(idx * pixels) / 4],
2833                                                 (idx * pixels) % 4, pixels);
2834       }
2835    }
2836 
2837    assert(mask_type.width == 32);
2838 
2839    for (unsigned i = 0; i < src_count; ++i) {
2840       unsigned pixels = block_size / src_count;
2841       unsigned pixel_width = row_type.width * dst_channels;
2842 
2843       if (pixel_width == 24) {
2844          mask_type.width = 8;
2845          mask_type.length = vector_width / mask_type.width;
2846       } else {
2847          mask_type.length = pixels;
2848          mask_type.width = row_type.width * dst_channels;
2849 
2850          /*
2851           * If mask_type width is smaller than 32bit, this doesn't quite
2852           * generate the most efficient code (could use some pack).
2853           */
2854          src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
2855                                         lp_build_int_vec_type(gallivm,
2856                                                               mask_type), "");
2857 
2858          mask_type.length *= dst_channels;
2859          mask_type.width /= dst_channels;
2860       }
2861 
2862       src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
2863                                      lp_build_int_vec_type(gallivm, mask_type),
2864                                      "");
2865       src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
2866    }
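   /*
    * Worked example (illustrative): for an RGBA8 target with 128-bit
    * vectors, each row covers 4 pixels, so pixel_width is 32 and the
    * 4 x i32 quad mask is simply bitcast to 16 x i8, replicating each
    * pixel's all-ones/all-zeros mask across its four 8-bit channels.
    */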
2867 
2868    /*
2869     * Alpha conversion
2870     */
2871    if (!has_alpha) {
2872       struct lp_type alpha_type = fs_type;
2873       alpha_type.length = 4;
2874       convert_alpha(gallivm, row_type, alpha_type,
2875                     block_size, block_height,
2876                     src_count, dst_channels,
2877                     pad_inline, src_alpha);
2878       if (dual_source_blend) {
2879          convert_alpha(gallivm, row_type, alpha_type,
2880                        block_size, block_height,
2881                        src_count, dst_channels,
2882                        pad_inline, src1_alpha);
2883       }
2884    }
2885 
2886 
2887    /*
2888     * Load dst from memory
2889     */
2890    unsigned dst_count;
2891    if (src_count < block_height) {
2892       dst_count = block_height;
2893    } else {
2894       dst_count = src_count;
2895    }
2896 
2897    dst_type.length *= block_size / dst_count;
2898 
2899    if (format_expands_to_float_soa(out_format_desc)) {
2900       /*
2901        * We need multiple values at once for the conversion, so we may as
2902        * well load them vectorized here too instead of concatenating later.
2903        * (We still need concatenation later for 8-wide vectors.)
2904        */
2905       dst_count = block_height;
2906       dst_type.length = block_width;
2907    }
2908 
2909    /*
2910     * Compute the alignment of the destination pointer in bytes.
2911     * We fetch 1-4 pixels; if the format has pot alignment then those fetches
2912     * are always aligned by MIN2(16, fetch_width), except for buffers (not
2913     * 1d textures, but we can't distinguish them here), so we need to stick
2914     * with per-pixel alignment in that case.
2915     */
2916    unsigned dst_alignment;
2917    if (is_1d) {
2918       dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
2919    } else {
2920       dst_alignment = dst_type.length * dst_type.width / 8;
2921    }
2922    /* Force power-of-two alignment by extracting only the least significant set bit */
2923    dst_alignment = 1 << (ffs(dst_alignment) - 1);
2924    /*
2925     * Resource base and stride pointers are aligned to 16 bytes, so that's
2926     * the maximum alignment we can guarantee
2927     */
2928    dst_alignment = MIN2(16, dst_alignment);
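   /*
    * Worked example (illustrative): a row of 4 RGBA8 pixels loaded as
    * 16 x i8 gives dst_alignment = 16 * 8 / 8 = 16 bytes; a non-pot
    * per-pixel value such as 3 (a 24-bit 1d format: (24 + 7) / 8 == 3)
    * rounds down to its lowest set bit, 1 << (ffs(3) - 1) == 1.
    */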
2929 
2930    struct lp_type ls_type = dst_type;
2931 
2932    if (dst_count > src_count) {
2933       if ((dst_type.width == 8 || dst_type.width == 16) &&
2934           util_is_power_of_two_or_zero(dst_type.length) &&
2935           dst_type.length * dst_type.width < 128) {
2936          /*
2937           * Never try to load values as 4xi8 which we will then
2938           * concatenate to larger vectors. This gives llvm a real
2939           * headache (the problem is the type legalizer (?) will
2940           * try to load that as 4xi8 zext to 4xi32 to fill the vector,
2941           * then the shuffles to concatenate are more or less impossible
2942           * - llvm is easily capable of generating a sequence of 32
2943           * pextrb/pinsrb instructions for that. Albeit it appears to
2944           * be fixed in llvm 4.0. So, load and concatenate with 32bit
2945           * width to avoid the trouble (16bit seems not as bad, llvm
2946           * probably recognizes the load+shuffle as only one shuffle
2947           * is necessary, but we can do just the same anyway).
2948           */
2949          ls_type.length = dst_type.length * dst_type.width / 32;
2950          ls_type.width = 32;
2951       }
2952    }
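   /*
    * Example (illustrative): an R8 target read row-by-row as 4 x i8
    * (32 bits total) makes ls_type a single i32 (length 1, width 32),
    * so the loads below are plain 32-bit loads that llvm legalizes
    * without the pextrb/pinsrb storm described above.
    */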
2953 
2954    if (is_1d) {
2955       load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
2956                             dst, ls_type, dst_count / 4, dst_alignment);
2957       for (unsigned i = dst_count / 4; i < dst_count; i++) {
2958          dst[i] = lp_build_undef(gallivm, ls_type);
2959       }
2960    } else {
2961       load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
2962                             block_height, dst, ls_type, dst_count,
2963                             dst_alignment);
2964    }
2965 
2966 
2967    /*
2968     * Convert from dst/output format to src/blending format.
2969     *
2970     * This is necessary as we can only read 1 row from memory at a time,
2971     * so the minimum dst_count we will ever have at this point is 4.
2972     *
2973     * With, for example, the R8 format you can have all 16 pixels in a 128 bit
2974     * vector; this will take the 4 dsts and combine them into 1 src so we can
2975     * perform blending on all 16 pixels in that single vector at once.
2976     */
2977    if (dst_count > src_count) {
2978       if (ls_type.length != dst_type.length && ls_type.length == 1) {
2979          LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
2980          LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
2981          for (unsigned i = 0; i < dst_count; i++) {
2982             dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
2983          }
2984       }
2985 
2986       lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
2987 
2988       if (ls_type.length != dst_type.length) {
2989          struct lp_type tmp_type = dst_type;
2990          tmp_type.length = dst_type.length * 4 / src_count;
2991          for (unsigned i = 0; i < src_count; i++) {
2992             dst[i] = LLVMBuildBitCast(builder, dst[i],
2993                                       lp_build_vec_type(gallivm, tmp_type), "");
2994          }
2995       }
2996    }
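   /*
    * Continuing the R8 example (illustrative): the four single-i32 rows
    * are concatenated into one 4 x i32 vector and bitcast back to
    * 16 x i8, putting all 16 pixels of the block into a single 128-bit
    * source for blending.
    */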
2997 
2998    /*
2999     * Blending
3000     */
3001    /* XXX this is broken for RGB8 formats -
3002     * they get expanded from 12 to 16 elements (to include alpha)
3003     * by convert_to_blend_type, then reduced to 15 instead of 12
3004     * by convert_from_blend_type (a simple fix, though it breaks A8...).
3005     * R16G16B16 also crashes, though differently; seemingly something
3006     * goes wrong inside llvm's handling of npot vector sizes.
3007     * It seems some cleanup could be done here (like skipping conversion/blend
3008     * when not needed).
3009     */
3010    convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
3011                          row_type, dst, src_count);
3012 
3013    /*
3014     * FIXME: We really should get logic ops / masks out of the generic blend /
3015     * row format. Logic ops will definitely not work on the blend float format
3016     * used for SRGB here, and I think OpenGL expects this to work correctly
3017     * (that is, incoming values converted to srgb, then the logic op applied).
3018     */
3019    for (unsigned i = 0; i < src_count; ++i) {
3020       dst[i] = lp_build_blend_aos(gallivm,
3021                                   &variant->key.blend,
3022                                   out_format,
3023                                   row_type,
3024                                   rt,
3025                                   src[i],
3026                                   has_alpha ? NULL : src_alpha[i],
3027                                   src1[i],
3028                                   has_alpha ? NULL : src1_alpha[i],
3029                                   dst[i],
3030                                   partial_mask ? src_mask[i] : NULL,
3031                                   blend_color,
3032                                   has_alpha ? NULL : blend_alpha,
3033                                   swizzle,
3034                                   pad_inline ? 4 : dst_channels);
3035    }
3036 
3037    convert_from_blend_type(gallivm, block_size, out_format_desc,
3038                            row_type, dst_type, dst, src_count);
3039 
3040    /* Split the blend rows back to memory rows */
3041    if (dst_count > src_count) {
3042       row_type.length = dst_type.length * (dst_count / src_count);
3043 
3044       if (src_count == 1) {
3045          dst[1] = lp_build_extract_range(gallivm, dst[0],
3046                                          row_type.length / 2,
3047                                          row_type.length / 2);
3048          dst[0] = lp_build_extract_range(gallivm, dst[0],
3049                                          0, row_type.length / 2);
3050 
3051          row_type.length /= 2;
3052          src_count *= 2;
3053       }
3054 
3055       dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2,
3056                                       row_type.length / 2);
3057       dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
3058       dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2,
3059                                       row_type.length / 2);
3060       dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
3061 
3062       row_type.length /= 2;
3063       src_count *= 2;
3064    }
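   /*
    * Example (illustrative): starting from src_count == 1 above, the
    * single vector is first halved, then each half is split again,
    * leaving the four memory rows in dst[0..3] in order.
    */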
3065 
3066    /*
3067     * Store blend result to memory
3068     */
3069    if (is_1d) {
3070       store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3071                              dst, dst_type, dst_count / 4, dst_alignment);
3072    } else {
3073       store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3074                              block_height,
3075                              dst, dst_type, dst_count, dst_alignment);
3076    }
3077 
3078    if (do_branch) {
3079       lp_build_mask_end(&mask_ctx);
3080    }
3081 
3082    if (fpstate) {
3083       lp_build_fpstate_set(gallivm, fpstate);
3084    }
3085 }
3086 
3087 
3088 /**
3089  * Generate the runtime callable function for the whole fragment pipeline.
3090  * Note that the function which we generate operates on a block of 16
3091  * pixels at a time.  The block contains 2x2 quads.  Each quad contains
3092  * 2x2 pixels.
3093  */
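/*
 * Block layout sketch (illustrative): pixel coordinates (x, y) within
 * the 4x4 block, grouped into 2x2-pixel quads:
 *
 *    quad 0: (0,0) (1,0)    quad 1: (2,0) (3,0)
 *            (0,1) (1,1)            (2,1) (3,1)
 *    quad 2: (0,2) (1,2)    quad 3: (2,2) (3,2)
 *            (0,3) (1,3)            (2,3) (3,3)
 */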
3094 static void
3095 generate_fragment(struct llvmpipe_context *lp,
3096                   struct lp_fragment_shader *shader,
3097                   struct lp_fragment_shader_variant *variant,
3098                   unsigned partial_mask)
3099 {
3100    assert(partial_mask == RAST_WHOLE ||
3101           partial_mask == RAST_EDGE_TEST);
3102 
3103    struct nir_shader *nir = shader->base.ir.nir;
3104    struct gallivm_state *gallivm = variant->gallivm;
3105    struct lp_fragment_shader_variant_key *key = &variant->key;
3106    struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
3107    LLVMTypeRef fs_elem_type;
3108    LLVMTypeRef blend_vec_type;
3109    LLVMTypeRef arg_types[16];
3110    LLVMTypeRef func_type;
3111    LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
3112    LLVMTypeRef int32p_type = LLVMPointerType(int32_type, 0);
3113    LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
3114    LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
3115    LLVMValueRef context_ptr;
3116    LLVMValueRef resources_ptr;
3117    LLVMValueRef x;
3118    LLVMValueRef y;
3119    LLVMValueRef a0_ptr;
3120    LLVMValueRef dadx_ptr;
3121    LLVMValueRef dady_ptr;
3122    LLVMValueRef color_ptr_ptr;
3123    LLVMValueRef stride_ptr;
3124    LLVMValueRef color_sample_stride_ptr;
3125    LLVMValueRef depth_ptr;
3126    LLVMValueRef depth_stride;
3127    LLVMValueRef depth_sample_stride;
3128    LLVMValueRef mask_input;
3129    LLVMValueRef thread_data_ptr;
3130    LLVMBasicBlockRef block;
3131    LLVMBuilderRef builder;
3132    struct lp_build_interp_soa_context interp;
3133    LLVMValueRef fs_mask[(16 / 4) * LP_MAX_SAMPLES];
3134    LLVMValueRef fs_out_color[LP_MAX_SAMPLES][PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
3135    LLVMValueRef function;
3136    LLVMValueRef facing;
3137    const bool dual_source_blend = key->blend.rt[0].blend_enable &&
3138                                   util_blend_state_is_dual(&key->blend, 0);
3139 
3140    assert(lp_native_vector_width / 32 >= 4);
3141 
3142    /* Adjust color input interpolation according to flatshade state:
3143     */
3144    nir_foreach_shader_in_variable(var, nir) {
3145       unsigned idx = var->data.driver_location;
3146       unsigned slots = nir_variable_count_slots(var, var->type);
3147       memcpy(&inputs[idx], &shader->inputs[idx], (sizeof inputs[0] * slots));
3148       for (unsigned s = 0; s < slots; s++) {
3149          if (inputs[idx + s].interp == LP_INTERP_COLOR)
3150             inputs[idx + s].interp = key->flatshade ? LP_INTERP_CONSTANT : LP_INTERP_PERSPECTIVE;
3151       }
3152    }
3153 
3154    /* TODO: actually pick these based on the fs and color buffer
3155     * characteristics. */
3156 
3157    struct lp_type fs_type;
3158    memset(&fs_type, 0, sizeof fs_type);
3159    fs_type.floating = true;      /* floating point values */
3160    fs_type.sign = true;          /* values are signed */
3161    fs_type.norm = false;         /* values are not limited to [0,1] or [-1,1] */
3162    fs_type.width = 32;           /* 32-bit float */
3163    fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
3164 
3165    struct lp_type blend_type;
3166    memset(&blend_type, 0, sizeof blend_type);
3167    blend_type.floating = false; /* values are integers */
3168    blend_type.sign = false;     /* values are unsigned */
3169    blend_type.norm = true;      /* values are in [0,1] or [-1,1] */
3170    blend_type.width = 8;        /* 8-bit ubyte values */
3171    blend_type.length = 16;      /* 16 elements per vector */
3172 
3173    /*
3174     * Generate the function prototype. Any change here must be reflected in
3175     * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
3176     */
3177 
3178    fs_elem_type = lp_build_elem_type(gallivm, fs_type);
3179 
3180    blend_vec_type = lp_build_vec_type(gallivm, blend_type);
3181 
3182    char func_name[64];
3183    snprintf(func_name, sizeof(func_name), "fs_variant_%s",
3184             partial_mask ? "partial" : "whole");
3185 
3186    arg_types[0] = variant->jit_context_ptr_type;       /* context */
3187    arg_types[1] = variant->jit_resources_ptr_type;     /* resources */
3188    arg_types[2] = int32_type;                          /* x */
3189    arg_types[3] = int32_type;                          /* y */
3190    arg_types[4] = int32_type;                          /* facing */
3191    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
3192    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
3193    arg_types[7] = LLVMPointerType(fs_elem_type, 0);    /* dady */
3194    arg_types[8] = LLVMPointerType(int8p_type, 0);  /* color */
3195    arg_types[9] = int8p_type;       /* depth */
3196    arg_types[10] = LLVMInt64TypeInContext(gallivm->context);  /* mask_input */
3197    arg_types[11] = variant->jit_thread_data_ptr_type;  /* per thread data */
3198    arg_types[12] = int32p_type;     /* stride */
3199    arg_types[13] = int32_type;                         /* depth_stride */
3200    arg_types[14] = int32p_type;     /* color sample strides */
3201    arg_types[15] = int32_type;                         /* depth sample stride */
3202 
3203    func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
3204                                 arg_types, ARRAY_SIZE(arg_types), 0);
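   /*
    * For reference, the arg_types above correspond to a C function
    * pointer roughly of this shape (a sketch; see lp_jit.h's
    * lp_jit_frag_func for the authoritative definition):
    *
    *    void (*lp_jit_frag_func)(struct lp_jit_context *context,
    *                             struct lp_jit_resources *resources,
    *                             uint32_t x, uint32_t y, uint32_t facing,
    *                             const float *a0, const float *dadx,
    *                             const float *dady,
    *                             uint8_t **color, uint8_t *depth,
    *                             uint64_t mask_input,
    *                             struct lp_jit_thread_data *thread_data,
    *                             int32_t *stride, int32_t depth_stride,
    *                             int32_t *color_sample_stride,
    *                             int32_t depth_sample_stride);
    */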
3205 
3206    function = LLVMAddFunction(gallivm->module, func_name, func_type);
3207    LLVMSetFunctionCallConv(function, LLVMCCallConv);
3208 
3209    variant->function[partial_mask] = function;
3210 
3211    /* XXX: need to propagate noalias down into color param now we are
3212     * passing a pointer-to-pointer?
3213     */
3214    for (unsigned i = 0; i < ARRAY_SIZE(arg_types); ++i)
3215       if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
3216          lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3217 
3218    if (variant->gallivm->cache->data_size)
3219       return;
3220 
3221    context_ptr  = LLVMGetParam(function, 0);
3222    resources_ptr  = LLVMGetParam(function, 1);
3223    x            = LLVMGetParam(function, 2);
3224    y            = LLVMGetParam(function, 3);
3225    facing       = LLVMGetParam(function, 4);
3226    a0_ptr       = LLVMGetParam(function, 5);
3227    dadx_ptr     = LLVMGetParam(function, 6);
3228    dady_ptr     = LLVMGetParam(function, 7);
3229    color_ptr_ptr = LLVMGetParam(function, 8);
3230    depth_ptr    = LLVMGetParam(function, 9);
3231    mask_input   = LLVMGetParam(function, 10);
3232    thread_data_ptr  = LLVMGetParam(function, 11);
3233    stride_ptr   = LLVMGetParam(function, 12);
3234    depth_stride = LLVMGetParam(function, 13);
3235    color_sample_stride_ptr = LLVMGetParam(function, 14);
3236    depth_sample_stride = LLVMGetParam(function, 15);
3237 
3238    lp_build_name(context_ptr, "context");
3239    lp_build_name(resources_ptr, "resources");
3240    lp_build_name(x, "x");
3241    lp_build_name(y, "y");
3242    lp_build_name(a0_ptr, "a0");
3243    lp_build_name(dadx_ptr, "dadx");
3244    lp_build_name(dady_ptr, "dady");
3245    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
3246    lp_build_name(depth_ptr, "depth");
3247    lp_build_name(mask_input, "mask_input");
3248    lp_build_name(thread_data_ptr, "thread_data");
3249    lp_build_name(stride_ptr, "stride_ptr");
3250    lp_build_name(depth_stride, "depth_stride");
3251    lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
3252    lp_build_name(depth_sample_stride, "depth_sample_stride");
3253 
3254    /*
3255     * Function body
3256     */
3257 
3258    block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3259    builder = gallivm->builder;
3260    assert(builder);
3261    LLVMPositionBuilderAtEnd(builder, block);
3262 
3263    /*
3264     * Must not count ps invocations if there's a null shader.
3265     * (It would be ok to count with null shader if there's d/s tests,
3266     * but only if there's d/s buffers too, which is different
3267     * to implicit rasterization disable which must not depend
3268     * on the d/s buffers.)
3269     * Could use popcount on mask, but pixel accuracy is not required.
3270     * Could disable if there's no stats query, but maybe not worth it.
3271     */
3272    if (shader->info.base.num_instructions > 1) {
3273       LLVMValueRef invocs, val;
3274       LLVMTypeRef invocs_type = LLVMInt64TypeInContext(gallivm->context);
3275       invocs = lp_jit_thread_data_ps_invocations(gallivm, variant->jit_thread_data_type, thread_data_ptr);
3276       val = LLVMBuildLoad2(builder, invocs_type, invocs, "");
3277       val = LLVMBuildAdd(builder, val,
3278                          LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
3279                                       1, 0),
3280                          "invoc_count");
3281       LLVMBuildStore(builder, val, invocs);
3282    }
3283 
3284    /* code generated texture sampling */
3285    struct lp_build_sampler_soa *sampler =
3286       lp_llvm_sampler_soa_create(lp_fs_variant_key_samplers(key),
3287                                  MAX2(key->nr_samplers,
3288                                       key->nr_sampler_views));
3289    struct lp_build_image_soa *image =
3290       lp_bld_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
3291 
3292    unsigned num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
3293    /* for 1d resources only run "upper half" of stamp */
3294    if (key->resource_1d)
3295       num_fs /= 2;
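   /*
    * Example (illustrative): with 256-bit vectors fs_type.length is 8,
    * so num_fs = 16 / 8 = 2 loop iterations cover the 4x4 stamp
    * (halved to 1 for 1d resources, which only run the upper half).
    */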
3296 
3297    {
3298       LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
3299       LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
3300       LLVMValueRef num_loop_samp =
3301          lp_build_const_int32(gallivm, num_fs * key->coverage_samples);
3302       LLVMValueRef mask_store =
3303          lp_build_array_alloca(gallivm, mask_type,
3304                                num_loop_samp, "mask_store");
3305       LLVMTypeRef flt_type = LLVMFloatTypeInContext(gallivm->context);
3306       LLVMValueRef glob_sample_pos =
3307          LLVMAddGlobal(gallivm->module,
3308                        LLVMArrayType(flt_type, key->coverage_samples * 2), "");
3309       LLVMValueRef sample_pos_array;
3310 
3311       if (key->multisample && key->coverage_samples == 4) {
3312          LLVMValueRef sample_pos_arr[8];
3313          for (unsigned i = 0; i < 4; i++) {
3314             sample_pos_arr[i * 2] = LLVMConstReal(flt_type,
3315                                                   lp_sample_pos_4x[i][0]);
3316             sample_pos_arr[i * 2 + 1] = LLVMConstReal(flt_type,
3317                                                       lp_sample_pos_4x[i][1]);
3318          }
3319          sample_pos_array =
3320             LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3321                            sample_pos_arr, 8);
3322       } else {
3323          LLVMValueRef sample_pos_arr[2];
3324          sample_pos_arr[0] = LLVMConstReal(flt_type, 0.5);
3325          sample_pos_arr[1] = LLVMConstReal(flt_type, 0.5);
3326          sample_pos_array =
3327             LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3328                            sample_pos_arr, 2);
3329       }
3330       LLVMSetInitializer(glob_sample_pos, sample_pos_array);
3331 
3332       LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
3333       bool pixel_center_integer = nir->info.fs.pixel_center_integer;
3334 
3335       /*
3336        * The shader input interpolation info is not explicitly baked into the
3337        * shader key, but everything it derives from (TGSI, and flatshade) is
3338        * already included in the shader key.
3339        */
3340       lp_build_interp_soa_init(&interp,
3341                                gallivm,
3342                                nir->num_inputs,
3343                                inputs,
3344                                pixel_center_integer,
3345                                key->coverage_samples,
3346                                LLVMTypeOf(sample_pos_array),
3347                                glob_sample_pos,
3348                                num_loop,
3349                                builder, fs_type,
3350                                a0_ptr, dadx_ptr, dady_ptr,
3351                                x, y);
3352 
3353       for (unsigned i = 0; i < num_fs; i++) {
3354          if (key->multisample) {
3355             LLVMValueRef smask_val =
3356                LLVMBuildLoad2(builder, int32_type,
3357                               lp_jit_context_sample_mask(gallivm, variant->jit_context_type, context_ptr),
3358                               "");
3359 
3360             /*
3361              * For multisampling, extract the per-sample mask from the
3362              * incoming 64-bit mask and store it to the per-sample mask
3363              * storage. OR all of them together to generate the fragment
3364              * shader mask (sample shading TODO).  Take the incoming state
3365              * coverage mask into account.
3366              */
3367             for (unsigned s = 0; s < key->coverage_samples; s++) {
3368                LLVMValueRef sindexi =
3369                   lp_build_const_int32(gallivm, i + (s * num_fs));
3370                LLVMValueRef sample_mask_ptr =
3371                   LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1,
3372                                 "sample_mask_ptr");
3373                LLVMValueRef s_mask =
3374                   generate_quad_mask(gallivm, fs_type,
3375                                      i * fs_type.length / 4, s, mask_input);
3376                LLVMValueRef smask_bit =
3377                   LLVMBuildAnd(builder, smask_val,
3378                                lp_build_const_int32(gallivm, (1 << s)), "");
3379                LLVMValueRef cmp =
3380                   LLVMBuildICmp(builder, LLVMIntNE, smask_bit,
3381                                 lp_build_const_int32(gallivm, 0), "");
3382                smask_bit = LLVMBuildSExt(builder, cmp, int32_type, "");
3383                smask_bit = lp_build_broadcast(gallivm, mask_type, smask_bit);
3384 
3385                s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
3386                LLVMBuildStore(builder, s_mask, sample_mask_ptr);
3387             }
3388          } else {
3389             LLVMValueRef mask;
3390             LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
3391             LLVMValueRef mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
3392                                                   &indexi, 1, "mask_ptr");
3393 
3394             if (partial_mask) {
3395                mask = generate_quad_mask(gallivm, fs_type,
3396                                          i * fs_type.length / 4, 0, mask_input);
3397             } else {
3398                mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
3399             }
3400             LLVMBuildStore(builder, mask, mask_ptr);
3401          }
3402       }
3403 
3404       generate_fs_loop(gallivm,
3405                        shader, key,
3406                        builder,
3407                        fs_type,
3408                        variant->jit_context_type,
3409                        context_ptr,
3410                        variant->jit_resources_type,
3411                        resources_ptr,
3412                        LLVMTypeOf(sample_pos_array),
3413                        glob_sample_pos,
3414                        num_loop,
3415                        &interp,
3416                        sampler,
3417                        image,
3418                        mask_type,
3419                        mask_store, /* output */
3420                        color_store,
3421                        depth_ptr,
3422                        depth_stride,
3423                        depth_sample_stride,
3424                        color_ptr_ptr,
3425                        stride_ptr,
3426                        color_sample_stride_ptr,
3427                        facing,
3428                        variant->jit_thread_data_type,
3429                        thread_data_ptr);
3430 
3431       LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
3432       for (unsigned i = 0; i < num_fs; i++) {
3433          LLVMValueRef ptr;
3434          for (unsigned s = 0; s < key->coverage_samples; s++) {
3435             int idx = (i + (s * num_fs));
3436             LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3437             ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1, "");
3438 
3439             fs_mask[idx] = LLVMBuildLoad2(builder, mask_type, ptr, "smask");
3440          }
3441 
3442          for (unsigned s = 0; s < key->min_samples; s++) {
3443             /* This layout is messed up; we need to reorganize things */
3444             int idx = s * num_fs + i;
3445             LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3446             for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3447                for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3448                   ptr = LLVMBuildGEP2(builder, fs_vec_type,
3449                                       color_store[cbuf][chan],
3450                                       &sindexi, 1, "");
3451                   fs_out_color[s][cbuf][chan][i] = ptr;
3452                }
3453             }
3454             if (dual_source_blend) {
3455                /* only support one dual source blend target hence always use
3456                 * output 1
3457                 */
3458                for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3459                   ptr = LLVMBuildGEP2(builder, fs_vec_type,
3460                                       color_store[1][chan],
3461                                       &sindexi, 1, "");
3462                   fs_out_color[s][1][chan][i] = ptr;
3463                }
3464             }
3465          }
3466       }
3467    }
3468 
3469    lp_bld_llvm_sampler_soa_destroy(sampler);
3470    lp_bld_llvm_image_soa_destroy(image);
3471 
3472    /* Loop over color outputs / color buffers to do blending */
3473    for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3474       if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE &&
3475           (key->blend.rt[cbuf].blend_enable || key->blend.logicop_enable ||
3476            find_output_by_frag_result(nir, FRAG_RESULT_DATA0 + cbuf) != -1)) {
3477          LLVMValueRef color_ptr;
3478          LLVMValueRef stride;
3479          LLVMValueRef sample_stride = NULL;
3480          LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
3481 
3482          bool do_branch = ((key->depth.enabled
3483                             || key->stencil[0].enabled
3484                             || key->alpha.enabled)
3485                            && !nir->info.fs.uses_discard);
3486 
3487          color_ptr = LLVMBuildLoad2(builder, int8p_type,
3488                                     LLVMBuildGEP2(builder, int8p_type, color_ptr_ptr,
3489                                                  &index, 1, ""),
3490                                     "");
3491 
3492          stride = LLVMBuildLoad2(builder, int32_type,
3493                                  LLVMBuildGEP2(builder, int32_type, stride_ptr,
3494                                              &index, 1, ""),
3495                                  "");
3496 
3497          if (key->cbuf_nr_samples[cbuf] > 1)
3498             sample_stride = LLVMBuildLoad2(builder, int32_type,
3499                                            LLVMBuildGEP2(builder,
3500                                                          int32_type,
3501                                                          color_sample_stride_ptr,
3502                                                          &index, 1, ""), "");
3503 
3504          for (unsigned s = 0; s < key->cbuf_nr_samples[cbuf]; s++) {
3505             unsigned mask_idx = num_fs * (key->multisample ? s : 0);
3506             unsigned out_idx = key->min_samples == 1 ? 0 : s;
3507             LLVMValueRef out_ptr = color_ptr;
3508 
3509             if (sample_stride) {
3510                LLVMValueRef sample_offset =
3511                   LLVMBuildMul(builder, sample_stride,
3512                                lp_build_const_int32(gallivm, s), "");
3513                out_ptr = LLVMBuildGEP2(builder, int8_type, out_ptr, &sample_offset, 1, "");
3514             }
3515             out_ptr = LLVMBuildBitCast(builder, out_ptr,
3516                                        LLVMPointerType(blend_vec_type, 0), "");
3517 
3518             lp_build_name(out_ptr, "color_ptr%d", cbuf);
3519 
3520             generate_unswizzled_blend(gallivm, cbuf, variant,
3521                                       key->cbuf_format[cbuf],
3522                                       num_fs, fs_type, &fs_mask[mask_idx],
3523                                       fs_out_color[out_idx],
3524                                       variant->jit_context_type,
3525                                       context_ptr, blend_vec_type, out_ptr, stride,
3526                                       partial_mask, do_branch);
3527          }
3528       }
3529    }
3530 
3531    LLVMBuildRetVoid(builder);
3532 
3533    gallivm_verify_function(gallivm, function);
3534 }
3535 
3536 
3537 static void
3538 dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
3539 {
3540    debug_printf("fs variant %p:\n", (void *) key);
3541 
3542    if (key->flatshade) {
3543       debug_printf("flatshade = 1\n");
3544    }
3545    if (key->depth_clamp)
3546       debug_printf("depth_clamp = 1\n");
3547 
3548    if (key->restrict_depth_values)
3549       debug_printf("restrict_depth_values = 1\n");
3550 
3551    if (key->multisample) {
3552       debug_printf("multisample = 1\n");
3553       debug_printf("coverage samples = %d\n", key->coverage_samples);
3554       debug_printf("min samples = %d\n", key->min_samples);
3555    }
3556    for (unsigned i = 0; i < key->nr_cbufs; ++i) {
3557       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
3558       debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
3559    }
3560    if (key->depth.enabled || key->stencil[0].enabled) {
3561       debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
3562       debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
3563    }
3564    if (key->depth.enabled) {
3565       debug_printf("depth.func = %s\n", util_str_func(key->depth.func, true));
3566       debug_printf("depth.writemask = %u\n", key->depth.writemask);
3567    }
3568 
3569    for (unsigned i = 0; i < 2; ++i) {
3570       if (key->stencil[i].enabled) {
3571          debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, true));
3572          debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, true));
3573          debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, true));
3574          debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, true));
3575          debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
3576          debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
3577       }
3578    }
3579 
3580    if (key->alpha.enabled) {
3581       debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, true));
3582    }
3583 
3584    if (key->occlusion_count) {
3585       debug_printf("occlusion_count = 1\n");
3586    }
3587 
3588    if (key->blend.logicop_enable) {
3589       debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, true));
3590    } else if (key->blend.rt[0].blend_enable) {
3591       debug_printf("blend.rgb_func = %s\n",   util_str_blend_func  (key->blend.rt[0].rgb_func, true));
3592       debug_printf("blend.rgb_src_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_src_factor, true));
3593       debug_printf("blend.rgb_dst_factor = %s\n",   util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, true));
3594       debug_printf("blend.alpha_func = %s\n",       util_str_blend_func  (key->blend.rt[0].alpha_func, true));
3595       debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, true));
3596       debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, true));
3597    }
3598    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
3599    if (key->blend.alpha_to_coverage) {
3600       debug_printf("blend.alpha_to_coverage is enabled\n");
3601    }
3602    for (unsigned i = 0; i < key->nr_samplers; ++i) {
3603       const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3604       const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state;
3605       debug_printf("sampler[%u] = \n", i);
3606       debug_printf("  .wrap = %s %s %s\n",
3607                    util_str_tex_wrap(sampler->wrap_s, true),
3608                    util_str_tex_wrap(sampler->wrap_t, true),
3609                    util_str_tex_wrap(sampler->wrap_r, true));
3610       debug_printf("  .min_img_filter = %s\n",
3611                    util_str_tex_filter(sampler->min_img_filter, true));
3612       debug_printf("  .min_mip_filter = %s\n",
3613                    util_str_tex_mipfilter(sampler->min_mip_filter, true));
3614       debug_printf("  .mag_img_filter = %s\n",
3615                    util_str_tex_filter(sampler->mag_img_filter, true));
3616       if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
3617          debug_printf("  .compare_func = %s\n", util_str_func(sampler->compare_func, true));
3618       debug_printf("  .normalized_coords = %u\n", sampler->normalized_coords);
3619       debug_printf("  .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
3620       debug_printf("  .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
3621       debug_printf("  .apply_min_lod = %u\n", sampler->apply_min_lod);
3622       debug_printf("  .apply_max_lod = %u\n", sampler->apply_max_lod);
3623       debug_printf("  .reduction_mode = %u\n", sampler->reduction_mode);
3624       debug_printf("  .aniso = %u\n", sampler->aniso);
3625    }
3626    for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
3627       const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3628       const struct lp_static_texture_state *texture = &samplers[i].texture_state;
3629       debug_printf("texture[%u] = \n", i);
3630       debug_printf("  .format = %s\n",
3631                    util_format_name(texture->format));
3632       debug_printf("  .target = %s\n",
3633                    util_str_tex_target(texture->target, true));
3634       debug_printf("  .level_zero_only = %u\n",
3635                    texture->level_zero_only);
3636       debug_printf("  .pot = %u %u %u\n",
3637                    texture->pot_width,
3638                    texture->pot_height,
3639                    texture->pot_depth);
3640    }
3641    struct lp_image_static_state *images = lp_fs_variant_key_images(key);
3642    for (unsigned i = 0; i < key->nr_images; ++i) {
3643       const struct lp_static_texture_state *image = &images[i].image_state;
3644       debug_printf("image[%u] = \n", i);
3645       debug_printf("  .format = %s\n",
3646                    util_format_name(image->format));
3647       debug_printf("  .target = %s\n",
3648                    util_str_tex_target(image->target, true));
3649       debug_printf("  .level_zero_only = %u\n",
3650                    image->level_zero_only);
3651       debug_printf("  .pot = %u %u %u\n",
3652                    image->pot_width,
3653                    image->pot_height,
3654                    image->pot_depth);
3655    }
3656 }
3657 
3658 
3659 const char *
3660 lp_debug_fs_kind(enum lp_fs_kind kind)
3661 {
3662    switch (kind) {
3663    case LP_FS_KIND_GENERAL:
3664       return "GENERAL";
3665    case LP_FS_KIND_BLIT_RGBA:
3666       return "BLIT_RGBA";
3667    case LP_FS_KIND_BLIT_RGB1:
3668       return "BLIT_RGB1";
3669    case LP_FS_KIND_AERO_MINIFICATION:
3670       return "AERO_MINIFICATION";
3671    case LP_FS_KIND_LLVM_LINEAR:
3672       return "LLVM_LINEAR";
3673    default:
3674       return "unknown";
3675    }
3676 }
3677 
3678 
3679 void
3680 lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
3681 {
3682    debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
3683                 variant->shader->no, variant->no);
3684    nir_print_shader(variant->shader->base.ir.nir, stderr);
3685    dump_fs_variant_key(&variant->key);
3686    debug_printf("variant->opaque = %u\n", variant->opaque);
3687    debug_printf("variant->potentially_opaque = %u\n", variant->potentially_opaque);
3688    debug_printf("variant->blit = %u\n", variant->blit);
3689    debug_printf("shader->kind = %s\n", lp_debug_fs_kind(variant->shader->kind));
3690    debug_printf("\n");
3691 }
3692 
3693 
3694 static void
3695 lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
3696                        unsigned char ir_sha1_cache_key[20])
3697 {
3698    struct blob blob = { 0 };
3699    unsigned ir_size;
3700    void *ir_binary;
3701 
3702    blob_init(&blob);
3703    nir_serialize(&blob, variant->shader->base.ir.nir, true);
3704    ir_binary = blob.data;
3705    ir_size = blob.size;
3706 
3707    struct mesa_sha1 ctx;
3708    _mesa_sha1_init(&ctx);
3709    _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size);
3710    _mesa_sha1_update(&ctx, ir_binary, ir_size);
3711    _mesa_sha1_final(&ctx, ir_sha1_cache_key);
3712 
3713    blob_finish(&blob);
3714 }
3715 
3716 
3717 /**
3718  * Generate a new fragment shader variant from the shader code and
3719  * other state indicated by the key.
3720  */
3721 static struct lp_fragment_shader_variant *
3722 generate_variant(struct llvmpipe_context *lp,
3723                  struct lp_fragment_shader *shader,
3724                  const struct lp_fragment_shader_variant_key *key)
3725 {
3726    struct nir_shader *nir = shader->base.ir.nir;
3727    struct lp_fragment_shader_variant *variant =
3728       MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
3729    if (!variant)
3730       return NULL;
3731 
3732    memset(variant, 0, sizeof(*variant));
3733 
3734    pipe_reference_init(&variant->reference, 1);
3735    lp_fs_reference(lp, &variant->shader, shader);
3736 
3737    memcpy(&variant->key, key, shader->variant_key_size);
3738 
3739    struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
3740    struct lp_cached_code cached = { 0 };
3741    unsigned char ir_sha1_cache_key[20];
3742    bool needs_caching = false;
3743    if (shader->base.ir.nir) {
3744       lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
3745 
3746       lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
3747       if (!cached.data_size)
3748          needs_caching = true;
3749    }
3750 
3751    char module_name[64];
3752    snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
3753             shader->no, shader->variants_created);
3754    variant->gallivm = gallivm_create(module_name, lp->context, &cached);
3755    if (!variant->gallivm) {
3756       FREE(variant);
3757       return NULL;
3758    }
3759 
3760    variant->list_item_global.base = variant;
3761    variant->list_item_local.base = variant;
3762    variant->no = shader->variants_created++;
3763 
3764    /*
3765     * Determine whether we are touching all channels in the color buffer.
3766     */
3767    const struct util_format_description *cbuf0_format_desc = NULL;
3768    bool fullcolormask = false;
3769    if (key->nr_cbufs == 1) {
3770       cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
3771       fullcolormask = util_format_colormask_full(cbuf0_format_desc,
3772                                                  key->blend.rt[0].colormask);
3773    }
3774 
   /* The scissor is ignored here because only tiles inside the scissor
    * rectangle will ever refer to this variant.
    */
   const bool no_kill =
         fullcolormask &&
         !key->stencil[0].enabled &&
         !key->alpha.enabled &&
         !key->multisample &&
         !key->blend.alpha_to_coverage &&
         !key->depth.enabled &&
         !nir->info.fs.uses_discard &&
         !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) &&
         !nir->info.fs.uses_fbfetch_output;

   variant->opaque =
         no_kill &&
         !key->blend.logicop_enable &&
         !key->blend.rt[0].blend_enable
         ? true : false;

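   /*
    * "Potentially opaque" sketch: the conditions below match
    *
    *    result = src * src_factor + dst * (1 - src_alpha)
    *
    * (func ADD with dst factor INV_SRC_ALPHA, alpha agreeing with rgb).
    * If the shader is later found to always write src_alpha == 1.0, the
    * dst term vanishes, the result no longer depends on the framebuffer
    * contents, and the variant can be promoted to opaque.  The TGSI cbuf
    * check at the end appears to be a conservative guard that the alpha
    * output channel is actually tracked.
    */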
   variant->potentially_opaque =
         no_kill &&
         !key->blend.logicop_enable &&
         key->blend.rt[0].blend_enable &&
         key->blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
         key->blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
         key->blend.rt[0].alpha_func == key->blend.rt[0].rgb_func &&
         key->blend.rt[0].alpha_dst_factor == key->blend.rt[0].rgb_dst_factor &&
         shader->base.type == PIPE_SHADER_IR_TGSI &&
         /*
          * FIXME: for NIR, all of the fields of info.xxx (except info.base)
          * are zeros, hence shader analysis (here and elsewhere) using these
          * bits cannot work and will silently fail (cbuf is the only pointer
          * field, hence causing a crash).
          */
         shader->info.cbuf[0][3].file != TGSI_FILE_NULL
         ? true : false;

   /* We only care about opaque blits for now */
   if (variant->opaque &&
       (shader->kind == LP_FS_KIND_BLIT_RGBA ||
        shader->kind == LP_FS_KIND_BLIT_RGB1)) {
      const struct lp_sampler_static_state *samp0 =
         lp_fs_variant_key_sampler_idx(key, 0);
      assert(samp0);

      const enum pipe_format texture_format = samp0->texture_state.format;
      const enum pipe_texture_target target = samp0->texture_state.target;
      const unsigned min_img_filter = samp0->sampler_state.min_img_filter;
      const unsigned mag_img_filter = samp0->sampler_state.mag_img_filter;

      unsigned min_mip_filter;
      if (samp0->texture_state.level_zero_only) {
         min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
      } else {
         min_mip_filter = samp0->sampler_state.min_mip_filter;
      }

      if (target == PIPE_TEXTURE_2D &&
          min_img_filter == PIPE_TEX_FILTER_NEAREST &&
          mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
          min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
          ((texture_format &&
            util_is_format_compatible(util_format_description(texture_format),
                                      cbuf0_format_desc)) ||
           (shader->kind == LP_FS_KIND_BLIT_RGB1 &&
            (texture_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
             texture_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
            (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
             key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM)))) {
         variant->blit = 1;
      }
   }

   /* Determine whether this shader + pipeline state is a candidate for
    * the linear path.
    */
   const bool linear_pipeline =
         !key->stencil[0].enabled &&
         !key->depth.enabled &&
         !nir->info.fs.uses_discard &&
         !key->blend.logicop_enable &&
         (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
          key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM ||
          key->cbuf_format[0] == PIPE_FORMAT_R8G8B8A8_UNORM ||
          key->cbuf_format[0] == PIPE_FORMAT_R8G8B8X8_UNORM);

   memcpy(&variant->key, key, sizeof *key);

   if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
      lp_debug_fs_variant(variant);
   }

   llvmpipe_fs_variant_fastpath(variant);

   lp_jit_init_types(variant);

   if (variant->jit_function[RAST_EDGE_TEST] == NULL)
      generate_fragment(lp, shader, variant, RAST_EDGE_TEST);

   if (variant->jit_function[RAST_WHOLE] == NULL) {
      if (variant->opaque) {
         /* Specialized shader, which doesn't need to read the color buffer. */
         generate_fragment(lp, shader, variant, RAST_WHOLE);
      }
   }

   if (linear_pipeline) {
      /* Currently keeping both the old fastpaths and new linear path
       * active.  The older code is still somewhat faster for the cases
       * it covers.
       *
       * XXX: consider restricting this to aero-mode only.
       */
      if (fullcolormask &&
          !key->alpha.enabled &&
          !key->blend.alpha_to_coverage) {
         llvmpipe_fs_variant_linear_fastpath(variant);
      }

      /* If the original fastpath doesn't cover this variant, try the new
       * code:
       */
      if (variant->jit_linear == NULL) {
         if (shader->kind == LP_FS_KIND_BLIT_RGBA ||
             shader->kind == LP_FS_KIND_BLIT_RGB1 ||
             shader->kind == LP_FS_KIND_LLVM_LINEAR) {
            llvmpipe_fs_variant_linear_llvm(lp, shader, variant);
         }
      }
   } else {
      if (LP_DEBUG & DEBUG_LINEAR) {
         lp_debug_fs_variant(variant);
         debug_printf("    ----> no linear path for this variant\n");
      }
   }

   /*
    * Compile everything
    */

   gallivm_compile_module(variant->gallivm);

   variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);

   if (variant->function[RAST_EDGE_TEST]) {
      variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
            gallivm_jit_function(variant->gallivm,
                                 variant->function[RAST_EDGE_TEST]);
   }

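   /*
    * Assumed rationale for the fallback below: when no specialized
    * whole-tile function was generated (non-opaque variants), RAST_WHOLE
    * reuses the edge-test function; with a fully-covered tile it computes
    * the same result, merely without the opportunity to skip the
    * per-pixel coverage work.
    */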
   if (variant->function[RAST_WHOLE]) {
      variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
         gallivm_jit_function(variant->gallivm,
                              variant->function[RAST_WHOLE]);
   } else if (!variant->jit_function[RAST_WHOLE]) {
      variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
         variant->jit_function[RAST_EDGE_TEST];
   }

   if (linear_pipeline) {
      if (variant->linear_function) {
         variant->jit_linear_llvm = (lp_jit_linear_llvm_func)
            gallivm_jit_function(variant->gallivm, variant->linear_function);
      }

      /*
       * This must be done after LLVM compilation, as it will call the JIT'ed
       * code to determine active inputs.
       */
      lp_linear_check_variant(variant);
   }

   if (needs_caching) {
      lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
   }

   gallivm_free_ir(variant->gallivm);

   return variant;
}


static void *
llvmpipe_create_fs_state(struct pipe_context *pipe,
                         const struct pipe_shader_state *templ)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);

   struct lp_fragment_shader *shader = CALLOC_STRUCT(lp_fragment_shader);
   if (!shader)
      return NULL;

   pipe_reference_init(&shader->reference, 1);
   shader->no = fs_no++;
   list_inithead(&shader->variants.list);

   shader->base.type = PIPE_SHADER_IR_NIR;

   if (templ->type == PIPE_SHADER_IR_TGSI) {
      shader->base.ir.nir = tgsi_to_nir(templ->tokens, pipe->screen, false);
   } else {
      shader->base.ir.nir = templ->ir.nir;
   }

   /* lower FRAG_RESULT_COLOR -> DATA[0-7] to correctly handle unused attachments */
   nir_shader *nir = shader->base.ir.nir;
   NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);

   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
   nir_tgsi_scan_shader(nir, &shader->info.base, true);
   shader->info.num_texs = shader->info.base.opcode_count[TGSI_OPCODE_TEX];

   llvmpipe_register_shader(pipe, &shader->base);

   shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
   if (shader->draw_data == NULL) {
      FREE(shader);
      return NULL;
   }

   const int nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
   const int nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
   const int nr_images = BITSET_LAST_BIT(nir->info.images_used);

   shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers,
                                                          nr_sampler_views),
                                                     nr_images);
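
   /*
    * The variant key is variable-length: one lp_sampler_static_state per
    * sampler/view pair and one lp_image_static_state per image are
    * appended after the fixed-size struct (see
    * lp_fs_variant_key_samplers()/lp_fs_variant_key_images()), so the
    * total size is computed once here and reused for key comparisons.
    */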

   nir_foreach_shader_in_variable(var, nir) {
      unsigned idx = var->data.driver_location;
      unsigned slots = nir_variable_count_slots(var, var->type);

      if (var->data.centroid)
         shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_CENTROID;
      if (var->data.sample)
         shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_SAMPLE;

      enum glsl_base_type base_type =
         glsl_get_base_type(glsl_without_array(var->type));
      switch (var->data.interpolation) {
      case INTERP_MODE_NONE:
         if (glsl_base_type_is_integer(base_type) || var->data.per_primitive) {
            shader->inputs[idx].interp = LP_INTERP_CONSTANT;
            break;
         }
         if (var->data.location == VARYING_SLOT_COL0 ||
             var->data.location == VARYING_SLOT_COL1) {
            shader->inputs[idx].interp = LP_INTERP_COLOR;
            break;
         }
         FALLTHROUGH;
      case INTERP_MODE_SMOOTH:
         shader->inputs[idx].interp = LP_INTERP_PERSPECTIVE;
         break;
      case INTERP_MODE_NOPERSPECTIVE:
         shader->inputs[idx].interp = LP_INTERP_LINEAR;
         break;
      case INTERP_MODE_FLAT:
         shader->inputs[idx].interp = LP_INTERP_CONSTANT;
         break;
      }

      /* XXX this is a completely pointless index map... */
      shader->inputs[idx].src_index = idx + 1;
      if (var->data.location == VARYING_SLOT_FACE)
         shader->inputs[idx].interp = LP_INTERP_FACING;
      else if (var->data.location == VARYING_SLOT_POS) {
         shader->inputs[idx].src_index = 0;
         shader->inputs[idx].interp = LP_INTERP_POSITION;
      }

      shader->inputs[idx].usage_mask = shader->info.base.input_usage_mask[idx];
      for (unsigned s = 1; s < slots; s++) {
         shader->inputs[idx + s] = shader->inputs[idx];
         shader->inputs[idx + s].src_index = idx + s + 1;
         shader->inputs[idx + s].usage_mask = shader->info.base.input_usage_mask[idx + s];
      }
   }

   llvmpipe_fs_analyse_nir(shader);

   return shader;
}


static void
llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
   if (llvmpipe->fs == lp_fs)
      return;

   draw_bind_fragment_shader(llvmpipe->draw,
                             (lp_fs ? lp_fs->draw_data : NULL));

   lp_fs_reference(llvmpipe, &llvmpipe->fs, lp_fs);

   /* invalidate the setup link, NEW_FS will make it update */
   lp_setup_set_fs_variant(llvmpipe->setup, NULL);
   llvmpipe->dirty |= LP_NEW_FS;
}


/**
 * Remove shader variant from two lists: the shader's variant list
 * and the context's variant list.
 */
static void
llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
                               struct lp_fragment_shader_variant *variant)
{
   if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
      debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
                   "v total cached %u inst %u total inst %u\n",
                   variant->shader->no, variant->no,
                   variant->shader->variants_created,
                   variant->shader->variants_cached,
                   lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
   }

   /* remove from shader's list */
   list_del(&variant->list_item_local.list);
   variant->shader->variants_cached--;

   /* remove from context's list */
   list_del(&variant->list_item_global.list);
   lp->nr_fs_variants--;
   lp->nr_fs_instrs -= variant->nr_instrs;
}


void
llvmpipe_destroy_shader_variant(struct llvmpipe_context *lp,
                                struct lp_fragment_shader_variant *variant)
{
   gallivm_destroy(variant->gallivm);
   lp_fs_reference(lp, &variant->shader, NULL);
   FREE(variant);
}


void
llvmpipe_destroy_fs(struct llvmpipe_context *llvmpipe,
                    struct lp_fragment_shader *shader)
{
   /* Delete draw module's data */
   draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);

   ralloc_free(shader->base.ir.nir);
   assert(shader->variants_cached == 0);
   FREE(shader);
}


static void
llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   struct lp_fragment_shader *shader = fs;
   struct lp_fs_variant_list_item *li, *next;

   /* Delete all the variants */
   LIST_FOR_EACH_ENTRY_SAFE(li, next, &shader->variants.list, list) {
      struct lp_fragment_shader_variant *variant;
      variant = li->base;
      llvmpipe_remove_shader_variant(llvmpipe, li->base);
      lp_fs_variant_reference(llvmpipe, &variant, NULL);
   }

   lp_fs_reference(llvmpipe, &shader, NULL);
}


static void
llvmpipe_set_constant_buffer(struct pipe_context *pipe,
                             enum pipe_shader_type shader, uint index,
                             bool take_ownership,
                             const struct pipe_constant_buffer *cb)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   struct pipe_constant_buffer *constants = &llvmpipe->constants[shader][index];

   assert(shader < PIPE_SHADER_MESH_TYPES);
   assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));

   /* note: reference counting */
   util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb,
                             take_ownership);

   /* user_buffer is only valid until the next set_constant_buffer (at most,
    * possibly until shader deletion), so we need to upload it now to make
    * sure it doesn't get updated/freed out from under us.
    */
   if (constants->user_buffer) {
      u_upload_data(llvmpipe->pipe.const_uploader, 0, constants->buffer_size,
                    16, constants->user_buffer, &constants->buffer_offset,
                    &constants->buffer);
   }
   if (constants->buffer) {
      if (!(constants->buffer->bind & PIPE_BIND_CONSTANT_BUFFER)) {
         debug_printf("Illegal set constant without bind flag\n");
         constants->buffer->bind |= PIPE_BIND_CONSTANT_BUFFER;
      }
      llvmpipe_flush_resource(pipe, constants->buffer, 0, true, true, false,
                              "set_constant_buffer");
   }

   switch (shader) {
   case PIPE_SHADER_VERTEX:
   case PIPE_SHADER_GEOMETRY:
   case PIPE_SHADER_TESS_CTRL:
   case PIPE_SHADER_TESS_EVAL: {
      const unsigned size = cb ? cb->buffer_size : 0;

      const uint8_t *data = NULL;
      if (constants->buffer) {
         data = (uint8_t *) llvmpipe_resource_data(constants->buffer)
            + constants->buffer_offset;
      }

      draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
                                      index, data, size);
      break;
   }
   case PIPE_SHADER_COMPUTE:
      llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
      break;
   case PIPE_SHADER_FRAGMENT:
      llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
      break;
   case PIPE_SHADER_TASK:
      llvmpipe->dirty |= LP_NEW_TASK_CONSTANTS;
      break;
   case PIPE_SHADER_MESH:
      llvmpipe->dirty |= LP_NEW_MESH_CONSTANTS;
      break;
   default:
      unreachable("Illegal shader type");
      break;
   }
}


static void
llvmpipe_set_shader_buffers(struct pipe_context *pipe,
                            enum pipe_shader_type shader, unsigned start_slot,
                            unsigned count,
                            const struct pipe_shader_buffer *buffers,
                            unsigned writable_bitmask)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);

   unsigned i, idx;
   for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
      const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;

      util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);

      if (buffer && buffer->buffer) {
         bool read_only = !(writable_bitmask & (1 << idx));
         llvmpipe_flush_resource(pipe, buffer->buffer, 0, read_only, false,
                                 false, "buffer");
      }

      switch (shader) {
      case PIPE_SHADER_VERTEX:
      case PIPE_SHADER_GEOMETRY:
      case PIPE_SHADER_TESS_CTRL:
      case PIPE_SHADER_TESS_EVAL: {
         const unsigned size = buffer ? buffer->buffer_size : 0;
         const uint8_t *data = NULL;
         if (buffer && buffer->buffer)
            data = (uint8_t *) llvmpipe_resource_data(buffer->buffer);
         if (data)
            data += buffer->buffer_offset;
         draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
                                       i, data, size);
         break;
      }
      case PIPE_SHADER_COMPUTE:
         llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
         break;
      case PIPE_SHADER_TASK:
         llvmpipe->dirty |= LP_NEW_TASK_SSBOS;
         break;
      case PIPE_SHADER_MESH:
         llvmpipe->dirty |= LP_NEW_MESH_SSBOS;
         break;
      case PIPE_SHADER_FRAGMENT:
         llvmpipe->fs_ssbo_write_mask &= ~(((1 << count) - 1) << start_slot);
         llvmpipe->fs_ssbo_write_mask |= writable_bitmask << start_slot;
         llvmpipe->dirty |= LP_NEW_FS_SSBOS;
         break;
      default:
         unreachable("Illegal shader type");
         break;
      }
   }
}


static void
llvmpipe_set_shader_images(struct pipe_context *pipe,
                           enum pipe_shader_type shader, unsigned start_slot,
                           unsigned count, unsigned unbind_num_trailing_slots,
                           const struct pipe_image_view *images)
{
   struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
   unsigned i, idx;

   draw_flush(llvmpipe->draw);
   for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
      const struct pipe_image_view *image = images ? &images[idx] : NULL;

      util_copy_image_view(&llvmpipe->images[shader][i], image);

      if (image && image->resource) {
         bool read_only = !(image->access & PIPE_IMAGE_ACCESS_WRITE);
         llvmpipe_flush_resource(pipe, image->resource, 0, read_only, false,
                                 false, "image");
      }
   }

   llvmpipe->num_images[shader] = start_slot + count;
   switch (shader) {
   case PIPE_SHADER_VERTEX:
   case PIPE_SHADER_GEOMETRY:
   case PIPE_SHADER_TESS_CTRL:
   case PIPE_SHADER_TESS_EVAL:
      draw_set_images(llvmpipe->draw, shader, llvmpipe->images[shader],
                      start_slot + count);
      break;
   case PIPE_SHADER_COMPUTE:
      llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
      break;
   case PIPE_SHADER_FRAGMENT:
      llvmpipe->dirty |= LP_NEW_FS_IMAGES;
      break;
   case PIPE_SHADER_TASK:
      llvmpipe->dirty |= LP_NEW_TASK_IMAGES;
      break;
   case PIPE_SHADER_MESH:
      llvmpipe->dirty |= LP_NEW_MESH_IMAGES;
      break;
   default:
      unreachable("Illegal shader type");
      break;
   }

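   /* Unbind any trailing slots by recursing once with a NULL image array. */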
   if (unbind_num_trailing_slots) {
      llvmpipe_set_shader_images(pipe, shader, start_slot + count,
                                 unbind_num_trailing_slots, 0, NULL);
   }
}


/**
 * Return the blend factor equivalent to a destination alpha of one.
 */
static inline enum pipe_blendfactor
force_dst_alpha_one(enum pipe_blendfactor factor, bool clamped_zero)
{
   switch (factor) {
   case PIPE_BLENDFACTOR_DST_ALPHA:
      return PIPE_BLENDFACTOR_ONE;
   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
      return PIPE_BLENDFACTOR_ZERO;
   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
      if (clamped_zero)
         return PIPE_BLENDFACTOR_ZERO;
      else
         return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
   default:
      return factor;
   }
}
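
/*
 * Informal check of the SRC_ALPHA_SATURATE case above: the factor is
 * min(src_alpha, 1 - dst_alpha), so with dst alpha forced to one it
 * becomes min(src_alpha, 0).  For destination formats clamped at zero
 * that is always ZERO; for float/snorm destinations a negative
 * src_alpha could still win the min(), so the factor must be kept --
 * hence the clamped_zero parameter.
 */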


/**
 * We need to generate several variants of the fragment pipeline to match
 * all the combinations of the contributing state atoms.
 *
 * TODO: there is actually no reason to tie this to context state -- the
 * generated code could be cached globally in the screen.
 */
static struct lp_fragment_shader_variant_key *
make_variant_key(struct llvmpipe_context *lp,
                 struct lp_fragment_shader *shader,
                 char *store)
{
   struct lp_fragment_shader_variant_key *key =
      (struct lp_fragment_shader_variant_key *)store;
   struct nir_shader *nir = shader->base.ir.nir;

   memset(key, 0, sizeof(*key));

   if (lp->framebuffer.zsbuf) {
      const enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
      const struct util_format_description *zsbuf_desc =
         util_format_description(zsbuf_format);

      if (lp->depth_stencil->depth_enabled &&
          util_format_has_depth(zsbuf_desc)) {
         key->zsbuf_format = zsbuf_format;
         key->depth.enabled = lp->depth_stencil->depth_enabled;
         key->depth.writemask = lp->depth_stencil->depth_writemask;
         key->depth.func = lp->depth_stencil->depth_func;
      }
      if (lp->depth_stencil->stencil[0].enabled &&
          util_format_has_stencil(zsbuf_desc)) {
         key->zsbuf_format = zsbuf_format;
         memcpy(&key->stencil, &lp->depth_stencil->stencil,
                sizeof key->stencil);
      }
      if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
         key->resource_1d = true;
      }
      key->zsbuf_nr_samples =
         util_res_sample_count(lp->framebuffer.zsbuf->texture);

      /*
       * Restrict depth values if the API requires clamped depth values
       * (GL, or VK with the extension) and the Z buffer format is not
       * floating-point.
       */
      key->restrict_depth_values =
         !(lp->rasterizer->unclamped_fragment_depth_values &&
           util_format_get_depth_only(zsbuf_format) == PIPE_FORMAT_Z32_FLOAT);
   }

   /*
    * Propagate the depth clamp setting from the rasterizer state.
    */
   key->depth_clamp = lp->rasterizer->depth_clamp;

   /* alpha test only applies if render buffer 0 is non-integer
    * (or does not exist)
    */
   if (!lp->framebuffer.nr_cbufs ||
       !lp->framebuffer.cbufs[0] ||
       !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
      key->alpha.enabled = lp->depth_stencil->alpha_enabled;
   }
   if (key->alpha.enabled) {
      key->alpha.func = lp->depth_stencil->alpha_func;
      /* alpha.ref_value is passed in jit_context */
   }

   key->flatshade = lp->rasterizer->flatshade;
   key->multisample = lp->rasterizer->multisample;
   key->no_ms_sample_mask_out = lp->rasterizer->no_ms_sample_mask_out;
   if (lp->active_occlusion_queries && !lp->queries_disabled) {
      key->occlusion_count = true;
   }

   memcpy(&key->blend, lp->blend, sizeof key->blend);

   key->coverage_samples = 1;
   key->min_samples = 1;
   if (key->multisample) {
      key->coverage_samples =
         util_framebuffer_get_num_samples(&lp->framebuffer);
      /* Per EXT_shader_framebuffer_fetch spec:
       *
       *   "1. How is framebuffer data treated during multisample rendering?
       *
       *    RESOLVED: Reading the value of gl_LastFragData produces a
       *    different result for each sample. This implies that all or part
       *    of the shader be run once for each sample, but has no additional
       *    implications on fragment shader input variables which may still
       *    be interpolated per pixel by the implementation."
       *
       * ARM_shader_framebuffer_fetch_depth_stencil spec further says:
       *
       *   "(1) When multisampling is enabled, does the shader run per sample?
       *
       *    RESOLVED.
       *
       *    This behavior is inherited from either
       *    EXT_shader_framebuffer_fetch or ARM_shader_framebuffer_fetch as
       *    described in the interactions section.  If neither extension is
       *    supported, the shader runs once per fragment."
       *
       * Therefore we should always enable per-sample shading when FB fetch is
       * used.
       */
      if (lp->min_samples > 1 || nir->info.fs.uses_fbfetch_output)
         key->min_samples = key->coverage_samples;
   }
   key->nr_cbufs = lp->framebuffer.nr_cbufs;

   if (!key->blend.independent_blend_enable) {
      // we always need independent blend otherwise the fixups below won't work
      for (unsigned i = 1; i < key->nr_cbufs; i++) {
         memcpy(&key->blend.rt[i], &key->blend.rt[0],
                sizeof(key->blend.rt[0]));
      }
      key->blend.independent_blend_enable = 1;
   }

   for (unsigned i = 0; i < lp->framebuffer.nr_cbufs; i++) {
      struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];

      if (lp->framebuffer.cbufs[i]) {
         const enum pipe_format format = lp->framebuffer.cbufs[i]->format;

         key->cbuf_format[i] = format;
         key->cbuf_nr_samples[i] =
            util_res_sample_count(lp->framebuffer.cbufs[i]->texture);

         /*
          * Figure out if this is a 1d resource. Note that OpenGL allows crazy
          * mixing of 2d textures with height 1 and 1d textures, so make sure
          * we pick 1d if any cbuf or zsbuf is 1d.
          */
         if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
            key->resource_1d = true;
         }

         const struct util_format_description *format_desc =
            util_format_description(format);
         assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
                format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);

         /*
          * Mask out color channels not present in the color buffer.
          */
         blend_rt->colormask &= util_format_colormask(format_desc);

         /*
          * Disable blend for integer formats.
          */
         if (util_format_is_pure_integer(format)) {
            blend_rt->blend_enable = 0;
         }

         /*
          * Our swizzled render tiles always have an alpha channel, but the
          * linear render target format often does not, so force here the dst
          * alpha to be one.
          *
          * This is not a mere optimization. Wrong results will be produced if
          * the dst alpha is used, the dst format does not have alpha, and the
          * previous rendering was not flushed from the swizzled to linear
          * buffer. For example, NonPowTwo DCT.
          *
          * TODO: This should be generalized to all channels for better
          * performance, but only alpha causes correctness issues.
          *
          * Also, force rgb/alpha func/factors match, to make AoS blending
          * easier.
          */
         if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
             format_desc->swizzle[3] == format_desc->swizzle[0]) {
            // Doesn't cover mixed snorm/unorm but can't render to them anyway
            bool clamped_zero = !util_format_is_float(format) &&
                                !util_format_is_snorm(format);
            blend_rt->rgb_src_factor =
               force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
            blend_rt->rgb_dst_factor =
               force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
            blend_rt->alpha_func       = blend_rt->rgb_func;
            blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
            blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
         }
      } else {
         /* no color buffer for this fragment output */
         key->cbuf_format[i] = PIPE_FORMAT_NONE;
         key->cbuf_nr_samples[i] = 0;
         blend_rt->colormask = 0x0;
         blend_rt->blend_enable = 0;
      }
   }

   /* These values will be the same for all the variants of a given shader.
    */
   key->nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
   key->nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);

   struct lp_sampler_static_state *fs_sampler =
      lp_fs_variant_key_samplers(key);

   memset(fs_sampler, 0,
          MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);

   for (unsigned i = 0; i < key->nr_samplers; ++i) {
      if (BITSET_TEST(nir->info.samplers_used, i)) {
         lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
                                         lp->samplers[PIPE_SHADER_FRAGMENT][i]);
      }
   }

   /*
    * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
    * are dx10-style? Can't really have mixed opcodes, at least not
    * if we want to skip the holes here (without rescanning tgsi).
    */
   if (key->nr_sampler_views) {
      for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
         /*
          * Note sview may exceed what's representable by file_mask.
          * This will still work, the only downside is that not actually
          * used views may be included in the shader key.
          */
         if (BITSET_TEST(nir->info.textures_used, i)) {
            lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
                                  lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   } else {
      key->nr_sampler_views = key->nr_samplers;
      for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
         if (BITSET_TEST(nir->info.samplers_used, i)) {
            lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
                                 lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   }

   struct lp_image_static_state *lp_image = lp_fs_variant_key_images(key);
   key->nr_images = BITSET_LAST_BIT(nir->info.images_used);
   if (key->nr_images)
      memset(lp_image, 0,
             key->nr_images * sizeof *lp_image);
   for (unsigned i = 0; i < key->nr_images; ++i) {
      if (BITSET_TEST(nir->info.images_used, i)) {
         lp_sampler_static_texture_state_image(&lp_image[i].image_state,
                                      &lp->images[PIPE_SHADER_FRAGMENT][i]);
      }
   }

   if (shader->kind == LP_FS_KIND_AERO_MINIFICATION) {
      struct lp_sampler_static_state *samp0 =
         lp_fs_variant_key_sampler_idx(key, 0);
      assert(samp0);
      samp0->sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST;
      samp0->sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
   }

   return key;
}


/**
 * Update fragment shader state.  This is called just prior to drawing
 * something when some fragment-related state has changed.
 */
void
llvmpipe_update_fs(struct llvmpipe_context *lp)
{
   struct lp_fragment_shader *shader = lp->fs;

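   /*
    * The key is built in a stack buffer because its size varies per
    * shader (sampler/view and image state follow the fixed part);
    * LP_FS_MAX_VARIANT_KEY_SIZE is the worst case, and the memcmp below
    * only compares shader->variant_key_size bytes of it.
    */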
   char store[LP_FS_MAX_VARIANT_KEY_SIZE];
   const struct lp_fragment_shader_variant_key *key =
      make_variant_key(lp, shader, store);

   struct lp_fragment_shader_variant *variant = NULL;
   struct lp_fs_variant_list_item *li;
   /* Search the variants for one which matches the key */
   LIST_FOR_EACH_ENTRY(li, &shader->variants.list, list) {
      if (memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
         variant = li->base;
         break;
      }
   }

   if (variant) {
      /* Move this variant to the head of the list to implement LRU
       * deletion of shaders when we have too many.
       */
      list_move_to(&variant->list_item_global.list, &lp->fs_variants_list.list);
   } else {
      /* variant not found, create it now */

      if (LP_DEBUG & DEBUG_FS) {
         debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
                      lp->nr_fs_variants,
                      lp->nr_fs_instrs,
                      lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
      }

      /* First, check if we've exceeded the max number of shader variants.
       * If so, free 6.25% of them (the least recently used ones).
       */
      const unsigned variants_to_cull =
         lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS
         ? LP_MAX_SHADER_VARIANTS / 16 : 0;

      if (variants_to_cull ||
          lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
                         "\t%u instrs,\t%u instrs/variant\n",
                         shader->variants_cached,
                         lp->nr_fs_variants, lp->nr_fs_instrs,
                         lp->nr_fs_instrs / lp->nr_fs_variants);
         }

         /*
          * We need to re-check lp->nr_fs_variants because an arbitrarily
          * large number of shader variants (potentially all of them) could
          * be pending for destruction on flush.
          */

         for (unsigned i = 0;
              i < variants_to_cull ||
                 lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS;
              i++) {
            struct lp_fs_variant_list_item *item;
            if (list_is_empty(&lp->fs_variants_list.list)) {
               break;
            }
            item = list_last_entry(&lp->fs_variants_list.list,
                                   struct lp_fs_variant_list_item, list);
            assert(item);
            assert(item->base);
            llvmpipe_remove_shader_variant(lp, item->base);
            struct lp_fragment_shader_variant *variant = item->base;
            lp_fs_variant_reference(lp, &variant, NULL);
         }
      }

      /*
       * Generate the new variant.
       */
      int64_t t0 = os_time_get();
      variant = generate_variant(lp, shader, key);
      int64_t t1 = os_time_get();
      int64_t dt = t1 - t0;
      LP_COUNT_ADD(llvm_compile_time, dt);
      LP_COUNT_ADD(nr_llvm_compiles, 2);  /* emit vs. omit in/out test */

      /* Put the new variant into the list */
      if (variant) {
         list_add(&variant->list_item_local.list, &shader->variants.list);
         list_add(&variant->list_item_global.list, &lp->fs_variants_list.list);
         lp->nr_fs_variants++;
         lp->nr_fs_instrs += variant->nr_instrs;
         shader->variants_cached++;
      }
   }

   /* Bind this variant */
   lp_setup_set_fs_variant(lp->setup, variant);
}


void
llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
{
   llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
   llvmpipe->pipe.bind_fs_state   = llvmpipe_bind_fs_state;
   llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
   llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
   llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
   llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
}