1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007 VMware, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * @file
31 * Code generation for the whole fragment pipeline.
32 *
33 * The fragment pipeline consists of the following stages:
34 * - early depth test
35 * - fragment shader
36 * - alpha test
37 * - depth/stencil test
38 * - blending
39 *
40 * This file has only the glue to assemble the fragment pipeline. The actual
41 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
42 * lp_bld_*.[ch] files, and in a completely generic and reusable way. Here we
43 * muster the LLVM JIT execution engine to create a function that follows an
44 * established binary interface and that can be called from C directly.
45 *
46 * A big source of complexity here is that we often want to run different
47 * stages with different precisions and data types. For example, the
48 * fragment shader typically needs to be done in floats, but the
49 * depth/stencil test and blending are better done in the type that most
50 * closely matches the depth/stencil and color buffer respectively.
51 *
52 * Since the width of a SIMD vector register stays the same regardless of the
53 * element type, different types imply a different number of elements per
54 * vector, so we must generate multiple instances of the stages that use
55 * larger types in order to feed/consume the stages that use smaller types.
56 *
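 * For example (illustrative numbers only, assuming 128-bit SIMD vectors):
 * the fragment shader may operate on 4 x f32 vectors per channel, while
 * blending into an 8-bit RGBA buffer is best done on 16 x u8 vectors, so
 * several shader-side vectors must be packed to feed one blend-side vector.
 *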
57 * @author Jose Fonseca <jfonseca@vmware.com>
58 */
59
60 #include <limits.h>
61 #include "pipe/p_defines.h"
62 #include "util/u_inlines.h"
63 #include "util/u_memory.h"
64 #include "util/u_pointer.h"
65 #include "util/format/u_format.h"
66 #include "util/u_dump.h"
67 #include "util/u_string.h"
68 #include "util/u_dual_blend.h"
69 #include "util/u_upload_mgr.h"
70 #include "util/os_time.h"
71 #include "pipe/p_shader_tokens.h"
72 #include "draw/draw_context.h"
73 #include "nir/tgsi_to_nir.h"
74 #include "gallivm/lp_bld_type.h"
75 #include "gallivm/lp_bld_const.h"
76 #include "gallivm/lp_bld_conv.h"
77 #include "gallivm/lp_bld_init.h"
78 #include "gallivm/lp_bld_intr.h"
79 #include "gallivm/lp_bld_logic.h"
80 #include "gallivm/lp_bld_tgsi.h"
81 #include "gallivm/lp_bld_nir.h"
82 #include "gallivm/lp_bld_swizzle.h"
83 #include "gallivm/lp_bld_flow.h"
84 #include "gallivm/lp_bld_debug.h"
85 #include "gallivm/lp_bld_arit.h"
86 #include "gallivm/lp_bld_bitarit.h"
87 #include "gallivm/lp_bld_pack.h"
88 #include "gallivm/lp_bld_format.h"
89 #include "gallivm/lp_bld_quad.h"
90 #include "gallivm/lp_bld_gather.h"
91 #include "gallivm/lp_bld_jit_sample.h"
92
93 #include "lp_bld_alpha.h"
94 #include "lp_bld_blend.h"
95 #include "lp_bld_depth.h"
96 #include "lp_bld_interp.h"
97 #include "lp_context.h"
98 #include "lp_debug.h"
99 #include "lp_perf.h"
100 #include "lp_setup.h"
101 #include "lp_state.h"
102 #include "lp_tex_sample.h"
103 #include "lp_flush.h"
104 #include "lp_state_fs.h"
105 #include "lp_rast.h"
106 #include "nir/nir_to_tgsi_info.h"
107
108 #include "lp_screen.h"
109 #include "compiler/nir/nir_serialize.h"
110 #include "util/mesa-sha1.h"
111
112
113 /** Fragment shader number (for debugging) */
114 static unsigned fs_no = 0;
115
116
117 static void
118 load_unswizzled_block(struct gallivm_state *gallivm,
119 LLVMTypeRef base_type,
120 LLVMValueRef base_ptr,
121 LLVMValueRef stride,
122 unsigned block_width,
123 unsigned block_height,
124 LLVMValueRef* dst,
125 struct lp_type dst_type,
126 unsigned dst_count,
127 unsigned dst_alignment);
128 /**
129 * Checks if a format description is an arithmetic format
130 *
131 * An arithmetic format has irregular channel sizes, e.g. R3_G3_B2 or R5_G6_B5.
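 *
 * (Illustration of the check below: R5G6B5 qualifies because its 5/6/5-bit
 * channels are neither all equal in size nor byte multiples, whereas
 * R8G8B8A8 does not, since every channel is exactly 8 bits.)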
132 */
133 static inline bool
134 is_arithmetic_format(const struct util_format_description *format_desc)
135 {
136 bool arith = false;
137
138 for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
139 arith |= format_desc->channel[i].size != format_desc->channel[0].size;
140 arith |= (format_desc->channel[i].size % 8) != 0;
141 }
142
143 return arith;
144 }
145
146
147 /**
148 * Checks if this format requires special handling due to required expansion
149 * to floats for blending, and furthermore has "natural" packed AoS ->
150 * unpacked SoA conversion.
151 */
152 static inline bool
153 format_expands_to_float_soa(const struct util_format_description *format_desc)
154 {
155 if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
156 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
157 return true;
158 }
159 return false;
160 }
161
162
163 /**
164 * Retrieves the type representing the memory layout for a format
165 *
166 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
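 *
 * (Note, derived from the code below: formats that expand to float SoA for
 * blending, e.g. R11G11B10_FLOAT or sRGB formats, are instead represented
 * as a single unsigned integer covering the whole block.)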
167 */
168 static inline void
169 lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
170 struct lp_type* type)
171 {
172 if (format_expands_to_float_soa(format_desc)) {
173 /* just make this a uint with width of block */
174 type->floating = false;
175 type->fixed = false;
176 type->sign = false;
177 type->norm = false;
178 type->width = format_desc->block.bits;
179 type->length = 1;
180 return;
181 }
182
183 int chan = util_format_get_first_non_void_channel(format_desc->format);
184
185 memset(type, 0, sizeof(struct lp_type));
186 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
187 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
188 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
189 type->norm = format_desc->channel[chan].normalized;
190
191 if (is_arithmetic_format(format_desc)) {
192 type->width = 0;
193 type->length = 1;
194
195 for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
196 type->width += format_desc->channel[i].size;
197 }
198 } else {
199 type->width = format_desc->channel[chan].size;
200 type->length = format_desc->nr_channels;
201 }
202 }
203
204
205 /**
206 * Expand the relevant bits of mask_input to an n*4-dword mask for the
207 * n*4 pixels in n 2x2 quads. This will set the n*4 elements of the
208 * quad mask vector to 0 or ~0.
209 * Grouping is 01, 23 for 2-quad mode, hence only 0 and 2 are valid
210 * quad arguments with fs length 8.
211 *
212 * \param first_quad which quad(s) of the quad group to test, in [0,3]
213 * \param mask_input bitwise mask for the whole 4x4 stamp
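 *
 * (Worked example, derived from the shifts and bit constants below: the
 * 16-bit stamp mask is laid out row-major, bit y*4+x. Quad 0 therefore
 * reads bits {0,1,4,5}, quad 1 bits {2,3,6,7}, quad 2 bits {8,9,12,13} and
 * quad 3 bits {10,11,14,15}; with an 8-wide fs vector, first_quad 0 and 2
 * each cover two adjacent quads at once.)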
214 */
215 static LLVMValueRef
216 generate_quad_mask(struct gallivm_state *gallivm,
217 struct lp_type fs_type,
218 unsigned first_quad,
219 unsigned sample,
220 LLVMValueRef mask_input) /* int64 */
221 {
222 LLVMBuilderRef builder = gallivm->builder;
223 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
224 LLVMValueRef bits[16];
225 LLVMValueRef mask, bits_vec;
226
227 /*
228 * XXX: We'll need a different path for 16 x u8
229 */
230 assert(fs_type.width == 32);
231 assert(fs_type.length <= ARRAY_SIZE(bits));
232 struct lp_type mask_type = lp_int_type(fs_type);
233
234 /*
235 * mask_input >>= (quad * 4)
236 */
237 int shift;
238 switch (first_quad) {
239 case 0:
240 shift = 0;
241 break;
242 case 1:
243 assert(fs_type.length == 4);
244 shift = 2;
245 break;
246 case 2:
247 shift = 8;
248 break;
249 case 3:
250 assert(fs_type.length == 4);
251 shift = 10;
252 break;
253 default:
254 assert(0);
255 shift = 0;
256 }
257
258 mask_input = LLVMBuildLShr(builder, mask_input,
259 lp_build_const_int64(gallivm, 16 * sample), "");
260 mask_input = LLVMBuildTrunc(builder, mask_input, i32t, "");
261 mask_input = LLVMBuildAnd(builder, mask_input,
262 lp_build_const_int32(gallivm, 0xffff), "");
263 mask_input = LLVMBuildLShr(builder, mask_input,
264 LLVMConstInt(i32t, shift, 0), "");
265
266 /*
267 * mask = { mask_input & (1 << i), for i in [0,3] }
268 */
269 mask = lp_build_broadcast(gallivm,
270 lp_build_vec_type(gallivm, mask_type),
271 mask_input);
272
273 for (int i = 0; i < fs_type.length / 4; i++) {
274 unsigned j = 2 * (i % 2) + (i / 2) * 8;
275 bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
276 bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
277 bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
278 bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
279 }
280 bits_vec = LLVMConstVector(bits, fs_type.length);
281 mask = LLVMBuildAnd(builder, mask, bits_vec, "");
282
283 /*
284 * mask = mask == bits ? ~0 : 0
285 */
286 mask = lp_build_compare(gallivm,
287 mask_type, PIPE_FUNC_EQUAL,
288 mask, bits_vec);
289
290 return mask;
291 }
292
293
294 #define EARLY_DEPTH_TEST 0x1
295 #define LATE_DEPTH_TEST 0x2
296 #define EARLY_DEPTH_WRITE 0x4
297 #define LATE_DEPTH_WRITE 0x8
298 #define EARLY_DEPTH_TEST_INFERRED 0x10 //only with EARLY_DEPTH_TEST
299
300 static unsigned
301 get_cbuf_location(nir_variable *var, unsigned slot)
302 {
303 return (var->data.location - FRAG_RESULT_DATA0) + var->data.index + slot;
304 }
305
306 static int
307 find_output_by_frag_result(struct nir_shader *shader,
308 gl_frag_result frag_result)
309 {
310 nir_foreach_shader_out_variable(var, shader) {
311 int slots = nir_variable_count_slots(var, var->type);
312 for (unsigned s = 0; s < slots; s++) {
313 if (var->data.location + var->data.index + s == frag_result)
314 return var->data.driver_location + s;
315 }
316 }
317
318 return -1;
319 }
320
321 /**
322 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
323 */
324 static LLVMValueRef
325 lp_llvm_viewport(LLVMTypeRef context_type,
326 LLVMValueRef context_ptr,
327 struct gallivm_state *gallivm,
328 LLVMValueRef viewport_index)
329 {
330 LLVMBuilderRef builder = gallivm->builder;
331 LLVMValueRef ptr;
332 LLVMValueRef res;
333 struct lp_type viewport_type =
334 lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);
335 LLVMTypeRef vtype = lp_build_vec_type(gallivm, viewport_type);
336
337 ptr = lp_jit_context_viewports(gallivm, context_type, context_ptr);
338 ptr = LLVMBuildPointerCast(builder, ptr,
339 LLVMPointerType(vtype, 0), "");
340
341 res = lp_build_pointer_get2(builder, vtype, ptr, viewport_index);
342
343 return res;
344 }
345
346
347 static LLVMValueRef
348 lp_build_depth_clamp(struct gallivm_state *gallivm,
349 LLVMBuilderRef builder,
350 bool depth_clamp,
351 bool restrict_depth,
352 struct lp_type type,
353 LLVMTypeRef context_type,
354 LLVMValueRef context_ptr,
355 LLVMTypeRef thread_data_type,
356 LLVMValueRef thread_data_ptr,
357 LLVMValueRef z)
358 {
359 LLVMValueRef viewport, min_depth, max_depth;
360 LLVMValueRef viewport_index;
361 struct lp_build_context f32_bld;
362
363 assert(type.floating);
364 lp_build_context_init(&f32_bld, gallivm, type);
365
366 if (restrict_depth)
367 z = lp_build_clamp(&f32_bld, z, f32_bld.zero, f32_bld.one);
368
369 if (!depth_clamp)
370 return z;
371
372 /*
373 * Assumes clamping of the viewport index will occur in setup/gs. Value
374 * is passed through the rasterization stage via lp_rast_shader_inputs.
375 *
376 * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
377 * semantics.
378 */
379 viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
380 thread_data_type,
381 thread_data_ptr);
382
383 /*
384 * Load the min and max depth from the lp_jit_context.viewports
385 * array of lp_jit_viewport structures.
386 */
387 viewport = lp_llvm_viewport(context_type, context_ptr, gallivm, viewport_index);
388
389 /* viewports[viewport_index].min_depth */
390 min_depth = LLVMBuildExtractElement(builder, viewport,
391 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
392 min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
393
394 /* viewports[viewport_index].max_depth */
395 max_depth = LLVMBuildExtractElement(builder, viewport,
396 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
397 max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
398
399 /*
400 * Clamp to the min and max depth values for the given viewport.
401 */
402 return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
403 }
404
405
406 static LLVMValueRef
407 lp_build_alpha_to_coverage_dither(struct gallivm_state *gallivm,
408 struct lp_type type,
409 unsigned coverage_samples,
410 const LLVMValueRef* pos,
411 LLVMValueRef alpha)
412 {
413 LLVMBuilderRef builder = gallivm->builder;
414 /* Standard ordered dithering 2x2 threshold matrix. */
415 LLVMValueRef elems[] = {
416 lp_build_const_elem(gallivm, type, 0.125 / coverage_samples),
417 lp_build_const_elem(gallivm, type, 0.625 / coverage_samples),
418 lp_build_const_elem(gallivm, type, 0.875 / coverage_samples),
419 lp_build_const_elem(gallivm, type, 0.375 / coverage_samples),
420 };
421 LLVMValueRef dither_thresholds = LLVMConstVector(elems, ARRAY_SIZE(elems));
422 /* Get a two-bit index, where bit 0 is the X parity and bit 1 is the Y parity. */
423 LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
424 LLVMValueRef frag_int_pos_x = LLVMBuildFPToSI(builder, pos[0], int_vec_type, "frag_int_pos_x");
425 LLVMValueRef frag_int_pos_y = LLVMBuildFPToSI(builder, pos[1], int_vec_type, "frag_int_pos_y");
426 LLVMValueRef odd_bitmask = lp_build_const_int_vec(gallivm, type, 1);
427 LLVMValueRef dither_index = LLVMBuildOr(builder, LLVMBuildAnd(builder, frag_int_pos_x, odd_bitmask, ""),
428 LLVMBuildShl(builder, LLVMBuildAnd(builder, frag_int_pos_y, odd_bitmask, ""),
429 lp_build_const_int_vec(gallivm, type, 1), ""), "dither_index");
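   /* (Illustration, derived from the index math above: dither_index equals
    * (x & 1) | ((y & 1) << 1), so per pixel the thresholds form the 2x2
    * matrix { {0.125, 0.625}, {0.875, 0.375} } indexed by (y & 1, x & 1),
    * each entry scaled by 1/coverage_samples.) */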
430 /* Use the two-bit index to look up the threshold matrix and subtract the threshold from the alpha value. */
431 LLVMValueRef offsets = LLVMGetUndef(lp_build_vec_type(gallivm, type));
432 for (unsigned i = 0; i < type.length; i++) {
433 LLVMValueRef index = lp_build_const_int32(gallivm, i);
434 offsets = LLVMBuildInsertElement(builder, offsets,
435 LLVMBuildExtractElement(builder, dither_thresholds,
436 LLVMBuildExtractElement(builder, dither_index,
437 index, "threshold"),
438 ""), index, "");
439 }
440 /* Alpha value is only used in a comparison, no need to clamp to [0, 1]. */
441 return LLVMBuildFSub(builder, alpha, offsets, "");
442 }
443
444
445 static void
446 lp_build_sample_alpha_to_coverage(struct gallivm_state *gallivm,
447 struct lp_type type,
448 unsigned coverage_samples,
449 LLVMValueRef num_loop,
450 LLVMValueRef loop_counter,
451 LLVMTypeRef coverage_mask_type,
452 LLVMValueRef coverage_mask_store,
453 LLVMValueRef alpha)
454 {
455 struct lp_build_context bld;
456 LLVMBuilderRef builder = gallivm->builder;
457 float step = 1.0 / coverage_samples;
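   /* (Example, following from the comparison below: sample s stays covered
    * when alpha > s/coverage_samples, so with 4 samples an alpha of 0.6
    * keeps samples 0..2 (thresholds 0.0, 0.25, 0.5) and drops sample 3.) */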
458
459 lp_build_context_init(&bld, gallivm, type);
460 for (unsigned s = 0; s < coverage_samples; s++) {
461 LLVMValueRef alpha_ref_value = lp_build_const_vec(gallivm, type, step * s);
462 LLVMValueRef test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value);
463
464 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, lp_build_const_int32(gallivm, s), num_loop, "");
465 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_counter, "");
466 LLVMValueRef s_mask_ptr = LLVMBuildGEP2(builder, coverage_mask_type,
467 coverage_mask_store, &s_mask_idx, 1, "");
468 LLVMValueRef s_mask = LLVMBuildLoad2(builder, coverage_mask_type, s_mask_ptr, "");
469 s_mask = LLVMBuildAnd(builder, s_mask, test, "");
470 LLVMBuildStore(builder, s_mask, s_mask_ptr);
471 }
472 }
473
474
475 struct lp_build_fs_llvm_iface {
476 struct lp_build_fs_iface base;
477 struct lp_build_interp_soa_context *interp;
478 struct lp_build_for_loop_state *loop_state;
479 LLVMTypeRef mask_type;
480 LLVMValueRef mask_store;
481 LLVMValueRef sample_id;
482 LLVMValueRef color_ptr_ptr;
483 LLVMValueRef color_stride_ptr;
484 LLVMValueRef color_sample_stride_ptr;
485 LLVMValueRef zs_base_ptr;
486 LLVMValueRef zs_stride;
487 LLVMValueRef zs_sample_stride;
488 const struct lp_fragment_shader_variant_key *key;
489 };
490
491
492 static LLVMValueRef
493 fs_interp(const struct lp_build_fs_iface *iface,
494 struct lp_build_context *bld,
495 unsigned attrib, unsigned chan,
496 bool centroid, bool sample,
497 LLVMValueRef attrib_indir,
498 LLVMValueRef offsets[2])
499 {
500 struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
501 struct lp_build_interp_soa_context *interp = fs_iface->interp;
502 unsigned loc = TGSI_INTERPOLATE_LOC_CENTER;
503 if (centroid)
504 loc = TGSI_INTERPOLATE_LOC_CENTROID;
505 if (sample)
506 loc = TGSI_INTERPOLATE_LOC_SAMPLE;
507
508 return lp_build_interp_soa(interp, bld->gallivm, fs_iface->loop_state->counter,
509 fs_iface->mask_type, fs_iface->mask_store,
510 attrib, chan, loc, attrib_indir, offsets);
511 }
512
513
514 /**
515 * Convert depth-stencil format to a single component one, returning
516 * PIPE_FORMAT_NONE if it doesn't contain the required component.
517 */
518 static enum pipe_format
519 select_zs_component_format(enum pipe_format format,
520 bool fetch_stencil)
521 {
522 const struct util_format_description* desc = util_format_description(format);
523 if (fetch_stencil && !util_format_has_stencil(desc))
524 return PIPE_FORMAT_NONE;
525 if (!fetch_stencil && !util_format_has_depth(desc))
526 return PIPE_FORMAT_NONE;
527
528 switch (format) {
529 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
530 return fetch_stencil ? PIPE_FORMAT_X24S8_UINT : PIPE_FORMAT_Z24X8_UNORM;
531 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
532 return fetch_stencil ? PIPE_FORMAT_S8X24_UINT : PIPE_FORMAT_X8Z24_UNORM;
533 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
534 return fetch_stencil ? PIPE_FORMAT_X32_S8X24_UINT : format;
535 default:
536 return format;
537 }
538 }
539
540 static void
541 fs_fb_fetch(const struct lp_build_fs_iface *iface,
542 struct lp_build_context *bld,
543 int location,
544 LLVMValueRef result[4])
545 {
546 struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
547 struct gallivm_state *gallivm = bld->gallivm;
548 LLVMBuilderRef builder = gallivm->builder;
549 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
550 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
551 LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
552 const struct lp_fragment_shader_variant_key *key = fs_iface->key;
553
554 LLVMValueRef buf_ptr;
555 LLVMValueRef stride;
556 enum pipe_format buf_format;
557
558 const bool fetch_stencil = location == FRAG_RESULT_STENCIL;
559 const bool fetch_zs = fetch_stencil || location == FRAG_RESULT_DEPTH;
560 if (fetch_zs) {
561 buf_ptr = fs_iface->zs_base_ptr;
562 stride = fs_iface->zs_stride;
563 buf_format = select_zs_component_format(key->zsbuf_format, fetch_stencil);
564 } else {
565 assert(location >= FRAG_RESULT_DATA0 && location <= FRAG_RESULT_DATA7);
566 const int cbuf = location - FRAG_RESULT_DATA0;
567 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
568
569 buf_ptr = LLVMBuildLoad2(builder, int8p_type,
570 LLVMBuildGEP2(builder, int8p_type,
571 fs_iface->color_ptr_ptr, &index, 1, ""), "");
572 stride = LLVMBuildLoad2(builder, int32_type,
573 LLVMBuildGEP2(builder, int32_type,
574 fs_iface->color_stride_ptr, &index, 1, ""), "");
575 buf_format = key->cbuf_format[cbuf];
576 }
577
578 const struct util_format_description* out_format_desc = util_format_description(buf_format);
579 if (out_format_desc->format == PIPE_FORMAT_NONE) {
580 result[0] = result[1] = result[2] = result[3] = bld->undef;
581 return;
582 }
583
584 unsigned block_size = bld->type.length;
585 unsigned block_height = key->resource_1d ? 1 : 2;
586 unsigned block_width = block_size / block_height;
587
588 if (key->multisample) {
589 LLVMValueRef sample_stride;
590
591 if (fetch_zs) {
592 sample_stride = fs_iface->zs_sample_stride;
593 } else {
594 LLVMValueRef index = lp_build_const_int32(gallivm, location - FRAG_RESULT_DATA0);
595 sample_stride = LLVMBuildLoad2(builder, int32_type,
596 LLVMBuildGEP2(builder,
597 int32_type,
598 fs_iface->color_sample_stride_ptr,
599 &index, 1, ""), "");
600 }
601
602 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, fs_iface->sample_id, "");
603 buf_ptr = LLVMBuildGEP2(builder, int8_type,
604 buf_ptr, &sample_offset, 1, "");
605 }
606
607 /* The fragment shader executes on 4x4 blocks. Depending on the vector width
608 * it takes 2 or 4 iterations per block. Only move to the next row once the
609 * top row has completed: 1 iteration when 8 wide, 2 iterations when 4 wide. */
610 LLVMValueRef x_offset = NULL, y_offset = NULL;
611 if (!key->resource_1d) {
612 LLVMValueRef counter = fs_iface->loop_state->counter;
613
614 if (block_size == 4) {
615 x_offset = LLVMBuildShl(builder,
616 LLVMBuildAnd(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""),
617 lp_build_const_int32(gallivm, 1), "");
618 counter = LLVMBuildLShr(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), "");
619 }
620 y_offset = LLVMBuildMul(builder, counter, lp_build_const_int32(gallivm, 2), "");
621 }
622
623 LLVMValueRef offsets[4 * 4];
624 for (unsigned i = 0; i < block_size; i++) {
625 unsigned x = i % block_width;
626 unsigned y = i / block_width;
627
628 if (block_size == 8) {
629 /* remap the raw slots into the fragment shader execution mode. */
630 /* this math took me way too long to work out, I'm sure it's
631 * overkill.
632 */
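         /* (Worked out from the two expressions below: for an 8-wide vector
          * the lanes map to (x,y) = (0,0) (1,0) (0,1) (1,1) (2,0) (3,0)
          * (2,1) (3,1), i.e. two 2x2 quads side by side; for resource_1d
          * only x is remapped and y stays 0.) */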
633 x = (i & 1) + ((i >> 2) << 1);
634 if (!key->resource_1d)
635 y = (i & 2) >> 1;
636 }
637
638 LLVMValueRef x_val;
639 if (x_offset) {
640 x_val = LLVMBuildAdd(builder, lp_build_const_int32(gallivm, x), x_offset, "");
641 x_val = LLVMBuildMul(builder, x_val, lp_build_const_int32(gallivm, out_format_desc->block.bits / 8), "");
642 } else {
643 x_val = lp_build_const_int32(gallivm, x * (out_format_desc->block.bits / 8));
644 }
645
646 LLVMValueRef y_val = lp_build_const_int32(gallivm, y);
647 if (y_offset)
648 y_val = LLVMBuildAdd(builder, y_val, y_offset, "");
649 y_val = LLVMBuildMul(builder, y_val, stride, "");
650
651 offsets[i] = LLVMBuildAdd(builder, x_val, y_val, "");
652 }
653 LLVMValueRef offset = lp_build_gather_values(gallivm, offsets, block_size);
654
655 struct lp_type texel_type = bld->type;
656 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
657 out_format_desc->channel[0].pure_integer) {
658 if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
659 texel_type = lp_type_int_vec(bld->type.width, bld->type.width * bld->type.length);
660 } else if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
661 texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
662 }
663 } else if (fetch_stencil) {
664 texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
665 }
666
667 lp_build_fetch_rgba_soa(gallivm, out_format_desc, texel_type,
668 true, buf_ptr, offset,
669 NULL, NULL, NULL, result);
670 }
671
672 /**
673 * Generate the fragment shader, depth/stencil test, and alpha tests.
674 */
675 static void
676 generate_fs_loop(struct gallivm_state *gallivm,
677 struct lp_fragment_shader *shader,
678 const struct lp_fragment_shader_variant_key *key,
679 LLVMBuilderRef builder,
680 struct lp_type type,
681 LLVMTypeRef context_type,
682 LLVMValueRef context_ptr,
683 LLVMTypeRef resources_type,
684 LLVMValueRef resources_ptr,
685 LLVMTypeRef sample_pos_type,
686 LLVMValueRef sample_pos_array,
687 LLVMValueRef num_loop,
688 struct lp_build_interp_soa_context *interp,
689 const struct lp_build_sampler_soa *sampler,
690 const struct lp_build_image_soa *image,
691 LLVMTypeRef mask_type,
692 LLVMValueRef mask_store,
693 LLVMValueRef (*out_color)[4],
694 LLVMValueRef depth_base_ptr,
695 LLVMValueRef depth_stride,
696 LLVMValueRef depth_sample_stride,
697 LLVMValueRef color_ptr_ptr,
698 LLVMValueRef color_stride_ptr,
699 LLVMValueRef color_sample_stride_ptr,
700 LLVMValueRef facing,
701 LLVMTypeRef thread_data_type,
702 LLVMValueRef thread_data_ptr)
703 {
704 struct lp_type int_type = lp_int_type(type);
705 LLVMValueRef mask_ptr = NULL, mask_val = NULL;
706 LLVMValueRef z;
707 LLVMValueRef z_value, s_value;
708 LLVMValueRef z_fb, s_fb;
709 LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples);
710 LLVMValueRef z_out = NULL, s_out = NULL;
711 struct lp_build_for_loop_state loop_state, sample_loop_state = {0};
712 struct lp_build_mask_context mask;
713 struct nir_shader *nir = shader->base.ir.nir;
714 const bool dual_source_blend = key->blend.rt[0].blend_enable &&
715 util_blend_state_is_dual(&key->blend, 0);
716 const bool post_depth_coverage = nir->info.fs.post_depth_coverage;
717
718 struct lp_bld_tgsi_system_values system_values;
719
720 memset(&system_values, 0, sizeof(system_values));
721
722 /* truncate then sign extend. */
723 system_values.front_facing =
724 LLVMBuildTrunc(gallivm->builder, facing,
725 LLVMInt1TypeInContext(gallivm->context), "");
726 system_values.front_facing =
727 LLVMBuildSExt(gallivm->builder, system_values.front_facing,
728 LLVMInt32TypeInContext(gallivm->context), "");
729 system_values.view_index =
730 lp_jit_thread_data_raster_state_view_index(gallivm,
731 thread_data_type,
732 thread_data_ptr);
733
734 unsigned depth_mode;
735 const struct util_format_description *zs_format_desc = NULL;
736 if (key->depth.enabled ||
737 key->stencil[0].enabled) {
738 zs_format_desc = util_format_description(key->zsbuf_format);
739
740 if (nir->info.fs.early_fragment_tests || nir->info.fs.post_depth_coverage) {
741 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
742 } else if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
743 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) &&
744 !nir->info.fs.uses_fbfetch_output && !nir->info.writes_memory) {
745 if (key->alpha.enabled ||
746 key->blend.alpha_to_coverage ||
747 nir->info.fs.uses_discard ||
748 nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
749 /* With alpha test and kill, can do the depth test early
750 * and hopefully eliminate some quads. But need to do a
751 * special deferred depth write once the final mask value
752 * is known. This only works though if there's either no
753 * stencil test or the stencil value isn't written.
754 */
755 if (key->stencil[0].enabled && (key->stencil[0].writemask ||
756 (key->stencil[1].enabled &&
757 key->stencil[1].writemask)))
758 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
759 else
760 depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
761 } else {
762 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
763 }
764 } else {
765 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
766 }
767
768 if (!(key->depth.enabled && key->depth.writemask) &&
769 !(key->stencil[0].enabled && (key->stencil[0].writemask ||
770 (key->stencil[1].enabled &&
771 key->stencil[1].writemask))))
772 depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
773 } else {
774 depth_mode = 0;
775 }
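   /* (Rough summary of the selection above: with no depth/stencil/memory
    * writes and no FB fetch in the shader, the depth test runs early; alpha
    * test, alpha-to-coverage, discard or a shader-written sample mask defer
    * the depth *write* and mark the early test as inferred, so occlusion
    * counting and sample-mask handling still behave like a late test;
    * anything else falls back to the late depth test. Write bits are
    * cleared entirely if neither depth nor stencil is actually written.) */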
776
777 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
778 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, int_type);
779
780 LLVMValueRef stencil_refs[2];
781 stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_type, context_ptr);
782 stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_type, context_ptr);
783 /* convert scalar stencil refs into vectors */
784 stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
785 stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
786
787 LLVMValueRef consts_ptr = lp_jit_resources_constants(gallivm, resources_type, resources_ptr);
788
789 LLVMValueRef ssbo_ptr = lp_jit_resources_ssbos(gallivm, resources_type, resources_ptr);
790
791 LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
792 memset(outputs, 0, sizeof outputs);
793
794 /* Allocate color storage for each fragment sample */
795 LLVMValueRef color_store_size = num_loop;
796 if (key->min_samples > 1)
797 color_store_size = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, key->min_samples), "");
798
799 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
800 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
801 out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
802 lp_build_vec_type(gallivm,
803 type),
804 color_store_size, "color");
805 }
806 }
807 if (dual_source_blend) {
808 assert(key->nr_cbufs <= 1);
809 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
810 out_color[1][chan] = lp_build_array_alloca(gallivm,
811 lp_build_vec_type(gallivm,
812 type),
813 color_store_size, "color1");
814 }
815 }
816 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
817 z_out = lp_build_array_alloca(gallivm,
818 lp_build_vec_type(gallivm, type),
819 color_store_size, "depth");
820 }
821
822 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
823 s_out = lp_build_array_alloca(gallivm,
824 lp_build_vec_type(gallivm, type),
825 color_store_size, "stencil");
826 }
827
828 lp_build_for_loop_begin(&loop_state, gallivm,
829 lp_build_const_int32(gallivm, 0),
830 LLVMIntULT,
831 num_loop,
832 lp_build_const_int32(gallivm, 1));
833
834 LLVMValueRef sample_mask_in;
835 if (key->multisample) {
836 sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
837 /* create shader execution mask by combining all sample masks. */
838 for (unsigned s = 0; s < key->coverage_samples; s++) {
839 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
840 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
841 LLVMValueRef s_mask = lp_build_pointer_get2(builder, mask_type, mask_store, s_mask_idx);
842 if (s == 0)
843 mask_val = s_mask;
844 else
845 mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");
846
847 LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1ll << s)), "");
848 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
849 }
850 } else {
851 sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
852 mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
853 &loop_state.counter, 1, "mask_ptr");
854 mask_val = LLVMBuildLoad2(builder, mask_type, mask_ptr, "");
855
856 LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
857 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
858 }
859
860 /* 'mask' will control execution based on quad's pixel alive/killed state */
861 lp_build_mask_begin(&mask, gallivm, type, mask_val);
862
863 if (!(depth_mode & EARLY_DEPTH_TEST))
864 lp_build_mask_check(&mask);
865
866 /* Create storage for recombining sample masks after early Z pass. */
867 LLVMValueRef s_mask_or = lp_build_alloca(gallivm, int_vec_type, "cov_mask_early_depth");
868 LLVMBuildStore(builder, LLVMConstNull(int_vec_type), s_mask_or);
869
870 /* Create storage for post depth sample mask */
871 LLVMValueRef post_depth_sample_mask_in = NULL;
872 if (post_depth_coverage)
873 post_depth_sample_mask_in = lp_build_alloca(gallivm, int_vec_type, "post_depth_sample_mask_in");
874
875 LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
876 LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
877 LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
878 LLVMTypeRef z_type = NULL, z_fb_type = NULL;
879
880 /* Run early depth once per sample */
881 if (key->multisample) {
882
883 if (zs_format_desc) {
884 struct lp_type zs_type = lp_depth_type(zs_format_desc, type.length);
885 struct lp_type z_type = zs_type;
886 struct lp_type s_type = zs_type;
887 if (zs_format_desc->block.bits < type.width)
888 z_type.width = type.width;
889 if (zs_format_desc->block.bits == 8) {
890 s_type.width = type.width;
891 } else if (zs_format_desc->block.bits > 32) {
892 z_type.width = z_type.width / 2;
893 s_type.width = s_type.width / 2;
894 s_type.floating = 0;
895 }
896 z_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
897 zs_samples, "z_sample_store");
898 s_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
899 zs_samples, "s_sample_store");
900 z_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, z_type),
901 zs_samples, "z_fb_store");
902 s_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, s_type),
903 zs_samples, "s_fb_store");
904 }
905 lp_build_for_loop_begin(&sample_loop_state, gallivm,
906 lp_build_const_int32(gallivm, 0),
907 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
908 lp_build_const_int32(gallivm, 1));
909
910 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
911 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
912 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
913
914 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
915 s_mask = LLVMBuildAnd(builder, s_mask, mask_val, "");
916 }
917
918
919 /* For multisample, Z needs to be interpolated at sample points for testing. */
920 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter,
921 key->multisample
922 ? sample_loop_state.counter : NULL);
923 z = interp->pos[2];
924
925 LLVMValueRef depth_ptr = depth_base_ptr;
926 if (key->multisample) {
927 LLVMValueRef sample_offset =
928 LLVMBuildMul(builder, sample_loop_state.counter,
929 depth_sample_stride, "");
930 depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
931 depth_ptr, &sample_offset, 1, "");
932 }
933
934 if (depth_mode & EARLY_DEPTH_TEST) {
935 z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
936 key->restrict_depth_values, type,
937 context_type, context_ptr,
938 thread_data_type, thread_data_ptr, z);
939
940 lp_build_depth_stencil_load_swizzled(gallivm, type,
941 zs_format_desc, key->resource_1d,
942 depth_ptr, depth_stride,
943 &z_fb, &s_fb, loop_state.counter);
944 lp_build_depth_stencil_test(gallivm,
945 &key->depth,
946 key->stencil,
947 type,
948 zs_format_desc,
949 key->multisample ? NULL : &mask,
950 &s_mask,
951 stencil_refs,
952 z, z_fb, s_fb,
953 facing,
954 &z_value, &s_value,
955 !key->multisample,
956 key->restrict_depth_values);
957
958 if (depth_mode & EARLY_DEPTH_WRITE) {
959 lp_build_depth_stencil_write_swizzled(gallivm, type,
960 zs_format_desc, key->resource_1d,
961 NULL, NULL, NULL, loop_state.counter,
962 depth_ptr, depth_stride,
963 z_value, s_value);
964 }
965 /*
966 * Note: if stencil is enabled, the mask check must come after the ds write,
967 * not after the stencil test, otherwise new stencil values may not get
968 * written if all fragments got killed by the depth/stencil test.
969 */
970 if (key->stencil[0].enabled && !key->multisample)
971 lp_build_mask_check(&mask);
972
973 if (key->multisample) {
974 z_fb_type = LLVMTypeOf(z_fb);
975 z_type = LLVMTypeOf(z_value);
976 lp_build_pointer_set(builder, z_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, z_value, lp_build_int_vec_type(gallivm, type), ""));
977 lp_build_pointer_set(builder, s_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, s_value, lp_build_int_vec_type(gallivm, type), ""));
978 lp_build_pointer_set(builder, z_fb_store, sample_loop_state.counter, z_fb);
979 lp_build_pointer_set(builder, s_fb_store, sample_loop_state.counter, s_fb);
980 }
981 if (key->occlusion_count && !(depth_mode & EARLY_DEPTH_TEST_INFERRED)) {
982 LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
983 lp_build_name(counter, "counter");
984 lp_build_occlusion_count(gallivm, type,
985 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
986 }
987 }
988
989 if (key->multisample) {
990 /*
991 * Store the post-early Z coverage mask.
992 * Recombine the resulting coverage masks post early Z into the fragment
993 * shader execution mask.
994 */
995 LLVMValueRef tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
996 tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
997 LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);
998
999 if (post_depth_coverage) {
1000 LLVMValueRef mask_bit_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1001 LLVMValueRef post_depth_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
1002 mask_bit_idx = LLVMBuildAnd(builder, s_mask, lp_build_broadcast(gallivm, int_vec_type, mask_bit_idx), "");
1003 post_depth_mask_in = LLVMBuildOr(builder, post_depth_mask_in, mask_bit_idx, "");
1004 LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
1005 }
1006
1007 LLVMBuildStore(builder, s_mask, s_mask_ptr);
1008
1009 lp_build_for_loop_end(&sample_loop_state);
1010
1011 /* Recombine all the coverage masks into the shader exec mask. */
1012 tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
1013 lp_build_mask_update(&mask, tmp_s_mask_or);
1014
1015 if (key->min_samples == 1) {
1016 /* For multisample, Z needs to be re-interpolated at the pixel center. */
1017 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
1018 z = interp->pos[2];
1019 lp_build_mask_update(&mask, tmp_s_mask_or);
1020 }
1021 } else {
1022 if (post_depth_coverage) {
1023 LLVMValueRef post_depth_mask_in = LLVMBuildAnd(builder, lp_build_mask_value(&mask), lp_build_const_int_vec(gallivm, type, 1), "");
1024 LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
1025 }
1026 }
1027
1028 LLVMValueRef out_sample_mask_storage = NULL;
1029 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
1030 out_sample_mask_storage = lp_build_alloca(gallivm, int_vec_type, "write_mask");
1031 if (key->min_samples > 1)
1032 LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
1033 }
1034
1035 if (post_depth_coverage) {
1036 system_values.sample_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
1037 } else {
1038 system_values.sample_mask_in = sample_mask_in;
1039 }
1040 if (key->multisample && key->min_samples > 1) {
1041 lp_build_for_loop_begin(&sample_loop_state, gallivm,
1042 lp_build_const_int32(gallivm, 0),
1043 LLVMIntULT,
1044 lp_build_const_int32(gallivm, key->min_samples),
1045 lp_build_const_int32(gallivm, 1));
1046
1047 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1048 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1049 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1050 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1051 lp_build_mask_force(&mask, s_mask);
1052 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
1053 system_values.sample_id = sample_loop_state.counter;
1054 system_values.sample_mask_in = LLVMBuildAnd(builder, system_values.sample_mask_in,
1055 lp_build_broadcast(gallivm, int_vec_type,
1056 LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "")), "");
1057 } else {
1058 system_values.sample_id = lp_build_const_int32(gallivm, 0);
1059
1060 }
1061 system_values.sample_pos = sample_pos_array;
1062 system_values.sample_pos_type = sample_pos_type;
1063
1064 lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter,
1065 mask_type, mask_store, sample_loop_state.counter);
1066
1067 struct lp_build_fs_llvm_iface fs_iface = {
1068 .base.interp_fn = fs_interp,
1069 .base.fb_fetch = fs_fb_fetch,
1070 .interp = interp,
1071 .loop_state = &loop_state,
1072 .sample_id = system_values.sample_id,
1073 .mask_type = mask_type,
1074 .mask_store = mask_store,
1075 .color_ptr_ptr = color_ptr_ptr,
1076 .color_stride_ptr = color_stride_ptr,
1077 .color_sample_stride_ptr = color_sample_stride_ptr,
1078 .zs_base_ptr = depth_base_ptr,
1079 .zs_stride = depth_stride,
1080 .zs_sample_stride = depth_sample_stride,
1081 .key = key,
1082 };
1083
1084 struct lp_build_tgsi_params params;
1085 memset(¶ms, 0, sizeof(params));
1086
1087 params.type = type;
1088 params.mask = &mask;
1089 params.fs_iface = &fs_iface.base;
1090 params.consts_ptr = consts_ptr;
1091 params.system_values = &system_values;
1092 params.inputs = interp->inputs;
1093 params.num_inputs = interp->num_attribs - 1;
1094 params.context_type = context_type;
1095 params.context_ptr = context_ptr;
1096 params.resources_type = resources_type;
1097 params.resources_ptr = resources_ptr;
1098 params.thread_data_type = thread_data_type;
1099 params.thread_data_ptr = thread_data_ptr;
1100 params.sampler = sampler;
1101 params.info = &shader->info.base;
1102 params.ssbo_ptr = ssbo_ptr;
1103 params.image = image;
1104
1105 /* Build the actual shader */
1106 lp_build_nir_soa(gallivm, nir, ¶ms, outputs);
1107
1108 /*
1109 * Must not count ps invocations if there's a null shader.
1110 * (It would be ok to count with null shader if there's d/s tests,
1111 * but only if there's d/s buffers too, which is different
1112 * to implicit rasterization disable which must not depend
1113 * on the d/s buffers.)
1114 * Could disable if there's no stats query, but maybe not worth it.
1115 */
1116 if (shader->info.base.num_instructions > 1) {
1117 LLVMValueRef invocs = lp_jit_thread_data_ps_invocations(gallivm, thread_data_type, thread_data_ptr);
1118 lp_build_occlusion_count(gallivm, type, lp_build_mask_value(&mask), invocs);
1119 }
1120
1121 /* Alpha test */
1122 if (key->alpha.enabled) {
1123 int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1124
1125 if (color0 != -1 && outputs[color0][3]) {
1126 const struct util_format_description *cbuf_format_desc;
1127 LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1128 LLVMValueRef alpha_ref_value;
1129
1130 alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_type, context_ptr);
1131 alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);
1132
1133 cbuf_format_desc = util_format_description(key->cbuf_format[0]);
1134
1135 lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
1136 &mask, alpha, alpha_ref_value,
1137 ((depth_mode & LATE_DEPTH_TEST) != 0) && !key->multisample);
1138 }
1139 }
1140
1141 /* Alpha to coverage */
1142 if (key->blend.alpha_to_coverage) {
1143 int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1144
1145 if (color0 != -1 && outputs[color0][3]) {
1146 LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1147
1148 if (key->blend.alpha_to_coverage_dither) {
1149 alpha = lp_build_alpha_to_coverage_dither(gallivm, type, key->coverage_samples,
1150 interp->pos, alpha);
1151 }
1152
1153 if (!key->multisample) {
1154 lp_build_alpha_to_coverage(gallivm, type,
1155 &mask, alpha,
1156 key->blend.alpha_to_coverage_dither,
1157 (depth_mode & LATE_DEPTH_TEST) != 0);
1158 } else {
1159 lp_build_sample_alpha_to_coverage(gallivm, type, key->coverage_samples, num_loop,
1160 loop_state.counter,
1161 mask_type, mask_store, alpha);
1162 }
1163 }
1164 }
1165
1166 if (key->blend.alpha_to_one) {
1167 nir_foreach_shader_out_variable(var, nir) {
1168 if (var->data.location < FRAG_RESULT_DATA0)
1169 continue;
1170 int slots = nir_variable_count_slots(var, var->type);
1171 for (unsigned s = 0; s < slots; s++) {
1172 unsigned cbuf = get_cbuf_location(var, s);
1173 if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend))
1174 if (outputs[cbuf][3]) {
1175 LLVMBuildStore(builder, lp_build_const_vec(gallivm, type, 1.0),
1176 outputs[cbuf][3]);
1177 }
1178 }
1179 }
1180 }
1181
1182 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
1183 LLVMValueRef output_smask = NULL;
1184 int smaski = find_output_by_frag_result(nir, FRAG_RESULT_SAMPLE_MASK);
1185
1186 struct lp_build_context smask_bld;
1187 lp_build_context_init(&smask_bld, gallivm, int_type);
1188
1189 assert(smaski >= 0);
1190 output_smask = LLVMBuildLoad2(builder, vec_type, outputs[smaski][0], "smask");
1191 output_smask = LLVMBuildBitCast(builder, output_smask, smask_bld.vec_type, "");
1192 if (!key->multisample && key->no_ms_sample_mask_out) {
1193 output_smask = lp_build_and(&smask_bld, output_smask, smask_bld.one);
1194 output_smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, output_smask, smask_bld.zero);
1195 lp_build_mask_update(&mask, output_smask);
1196 }
1197
1198 if (key->min_samples > 1) {
1199 /* only the bit corresponding to this sample is to be used. */
1200 LLVMValueRef tmp_mask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "tmp_mask");
1201 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1202 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, lp_build_broadcast(gallivm, int_vec_type, out_smask_idx), "");
1203 output_smask = LLVMBuildOr(builder, tmp_mask, smask_bit, "");
1204 }
1205
1206 LLVMBuildStore(builder, output_smask, out_sample_mask_storage);
1207 }
1208
1209 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1210 int pos0 = find_output_by_frag_result(nir, FRAG_RESULT_DEPTH);
1211
1212 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[pos0][2], "");
1213 LLVMValueRef idx = loop_state.counter;
1214 if (key->min_samples > 1)
1215 idx = LLVMBuildAdd(builder, idx,
1216 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1217 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1218 LLVMBuildStore(builder, out, ptr);
1219 }
1220
1221 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1222 int sten_out = find_output_by_frag_result(nir, FRAG_RESULT_STENCIL);
1223
1224 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type,
1225 outputs[sten_out][1], "output.s");
1226 LLVMValueRef idx = loop_state.counter;
1227 if (key->min_samples > 1)
1228 idx = LLVMBuildAdd(builder, idx,
1229 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1230 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1231 LLVMBuildStore(builder, out, ptr);
1232 }
1233
1234 bool has_cbuf0_write = false;
1235 /* Color write - per fragment sample */
1236 nir_foreach_shader_out_variable(var, nir) {
1237 if (var->data.location < FRAG_RESULT_DATA0)
1238 continue;
1239 int slots = nir_variable_count_slots(var, var->type);
1240
1241 for (unsigned s = 0; s < slots; s++) {
1242 unsigned cbuf = get_cbuf_location(var, s);
1243 unsigned attrib = var->data.driver_location + s;
1244 if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)) {
1245 if (cbuf == 0) {
1246 /* XXX: there is an edge case with FB fetch where gl_FragColor and
1247 * gl_LastFragData[0] are used together. This creates both
1248 * FRAG_RESULT_COLOR and FRAG_RESULT_DATA* output variables. This
1249 * loop then writes to cbuf 0 twice, overwriting the correct value
1250 * from gl_FragColor with some garbage. This case is exercised in
1251 * one of the dEQP tests. A similar bug can happen if
1252 * gl_SecondaryFragColorEXT and gl_LastFragData[1] are mixed in
1253 * the same fashion... This workaround will break if
1254 * gl_LastFragData[0] goes in outputs list before
1255 * gl_FragColor. This doesn't seem to happen though.
1256 */
1257 if (has_cbuf0_write)
1258 continue;
1259 has_cbuf0_write = true;
1260 }
1261
1262 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
1263 if (outputs[attrib][chan]) {
1264 /* XXX: just initialize outputs to point at colors[] and
1265 * skip this.
1266 */
1267 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[attrib][chan], "");
1268 LLVMValueRef color_ptr;
1269 LLVMValueRef color_idx = loop_state.counter;
1270 if (key->min_samples > 1)
1271 color_idx = LLVMBuildAdd(builder, color_idx,
1272 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1273 color_ptr = LLVMBuildGEP2(builder, vec_type, out_color[cbuf][chan],
1274 &color_idx, 1, "");
1275 lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
1276 LLVMBuildStore(builder, out, color_ptr);
1277 }
1278 }
1279 }
1280 }
1281 }
1282
1283 if (key->multisample && key->min_samples > 1) {
1284 LLVMBuildStore(builder, lp_build_mask_value(&mask), s_mask_ptr);
1285 lp_build_for_loop_end(&sample_loop_state);
1286 }
1287
1288 if (key->multisample) {
1289 /* execute depth test for each sample */
1290 lp_build_for_loop_begin(&sample_loop_state, gallivm,
1291 lp_build_const_int32(gallivm, 0),
1292 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
1293 lp_build_const_int32(gallivm, 1));
1294
1295 /* load the per-sample coverage mask */
1296 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1297 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1298 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1299
1300 /* combine the execution mask post fragment shader with the coverage mask. */
1301 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1302 if (key->min_samples == 1)
1303 s_mask = LLVMBuildAnd(builder, s_mask, lp_build_mask_value(&mask), "");
1304
1305 /* if the shader writes sample mask use that,
1306 * but only if this isn't genuine early-depth to avoid breaking occlusion query */
1307 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1308 (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & (EARLY_DEPTH_TEST_INFERRED)))) {
1309 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1310 out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1311 LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1312 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1313 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1314 smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1315
1316 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1317 }
1318 }
1319
1320 depth_ptr = depth_base_ptr;
1321 if (key->multisample) {
1322 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
1323 depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
1324 depth_ptr, &sample_offset, 1, "");
1325 }
1326
1327 /* Late Z test */
1328 if (depth_mode & LATE_DEPTH_TEST) {
1329 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1330 LLVMValueRef idx = loop_state.counter;
1331 if (key->min_samples > 1)
1332 idx = LLVMBuildAdd(builder, idx,
1333 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1334 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1335 z = LLVMBuildLoad2(builder, vec_type, ptr, "output.z");
1336 } else {
1337 if (key->multisample) {
1338 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL);
1339 z = interp->pos[2];
1340 }
1341 }
1342
1343 /*
1344 * Clamp according to ARB_depth_clamp semantics.
1345 */
1346 z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
1347 key->restrict_depth_values, type,
1348 context_type, context_ptr,
1349 thread_data_type, thread_data_ptr, z);
1350
1351 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1352 LLVMValueRef idx = loop_state.counter;
1353 if (key->min_samples > 1)
1354 idx = LLVMBuildAdd(builder, idx,
1355 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1356 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1357 stencil_refs[0] = LLVMBuildLoad2(builder, vec_type, ptr, "output.s");
1358 /* there's only one value, and spec says to discard additional bits */
1359 LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
1360 stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
1361 stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
1362 stencil_refs[1] = stencil_refs[0];
1363 }
1364
1365 lp_build_depth_stencil_load_swizzled(gallivm, type,
1366 zs_format_desc, key->resource_1d,
1367 depth_ptr, depth_stride,
1368 &z_fb, &s_fb, loop_state.counter);
1369
1370 lp_build_depth_stencil_test(gallivm,
1371 &key->depth,
1372 key->stencil,
1373 type,
1374 zs_format_desc,
1375 key->multisample ? NULL : &mask,
1376 &s_mask,
1377 stencil_refs,
1378 z, z_fb, s_fb,
1379 facing,
1380 &z_value, &s_value,
1381 false,
1382 key->restrict_depth_values);
1383 /* Late Z write */
1384 if (depth_mode & LATE_DEPTH_WRITE) {
1385 lp_build_depth_stencil_write_swizzled(gallivm, type,
1386 zs_format_desc, key->resource_1d,
1387 NULL, NULL, NULL, loop_state.counter,
1388 depth_ptr, depth_stride,
1389 z_value, s_value);
1390 }
1391 } else if ((depth_mode & EARLY_DEPTH_TEST) &&
1392 (depth_mode & LATE_DEPTH_WRITE)) {
1393 /* Need to apply a reduced mask to the depth write. Reload the
1394 * depth value, update from zs_value with the new mask value and
1395 * write that out.
1396 */
1397 if (key->multisample) {
1398 z_value = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_sample_value_store, sample_loop_state.counter), z_type, "");
1399 s_value = lp_build_pointer_get2(builder, int_vec_type, s_sample_value_store, sample_loop_state.counter);
1400 z_fb = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_fb_store, sample_loop_state.counter), z_fb_type, "");
1401 s_fb = lp_build_pointer_get2(builder, int_vec_type, s_fb_store, sample_loop_state.counter);
1402 }
1403 lp_build_depth_stencil_write_swizzled(gallivm, type,
1404 zs_format_desc, key->resource_1d,
1405 key->multisample ? s_mask : lp_build_mask_value(&mask), z_fb, s_fb, loop_state.counter,
1406 depth_ptr, depth_stride,
1407 z_value, s_value);
1408 }
1409
1410 if (key->occlusion_count && (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & EARLY_DEPTH_TEST_INFERRED))) {
1411 LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
1412 lp_build_name(counter, "counter");
1413
1414 lp_build_occlusion_count(gallivm, type,
1415 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
1416 }
1417
1418 /* if this is genuine early-depth in the shader, write the sample mask now,
1419 * after the occlusion count has been updated
1420 */
1421 if (key->multisample &&
1422 nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1423 (depth_mode & (EARLY_DEPTH_TEST_INFERRED | EARLY_DEPTH_TEST)) == EARLY_DEPTH_TEST) {
1424 /* if the shader writes sample mask use that */
1425 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1426 out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1427 LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1428 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1429 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1430 smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1431
1432 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1433 }
1434
1435
1436 if (key->multisample) {
1437 /* store the sample mask for this loop */
1438 LLVMBuildStore(builder, s_mask, s_mask_ptr);
1439 lp_build_for_loop_end(&sample_loop_state);
1440 }
1441
1442 mask_val = lp_build_mask_end(&mask);
1443 if (!key->multisample)
1444 LLVMBuildStore(builder, mask_val, mask_ptr);
1445 lp_build_for_loop_end(&loop_state);
1446 }
1447
1448
1449 /**
1450 * This function reorders pixels from the fragment shader's SoA layout to
1451 * the AoS memory layout.
1452 *
1453 * Fragment Shader outputs pixels in small 2x2 blocks
1454 * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
1455 *
1456 * However in memory pixels are stored in rows
1457 * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
1458 *
1459 * @param type fragment shader type (4x or 8x float)
1460 * @param num_fs number of fs_src
1461 * @param is_1d whether we're outputting to a 1d resource
1462 * @param dst_channels number of output channels
1463 * @param fs_src output from fragment shader
1464 * @param dst pointer to store result
1465 * @param pad_inline is channel padding inline or at end of row
1466 * @return the number of dsts
1467 */
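/*
 * Illustrative example (the parameter values are hypothetical, only meant to
 * show how pixels, src_channels and src_count relate): with 8-wide float
 * vectors (type.length == 8, so pixels == 2), dst_channels == 4 and
 * num_fs == 2, the function first transposes 2 * 4 = 8 AoS vectors of two
 * pixels each, then quad-twiddles them (twiddle is true since pixels != 1),
 * and returns src_count == 8.
 */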
1468 static int
1469 generate_fs_twiddle(struct gallivm_state *gallivm,
1470 struct lp_type type,
1471 unsigned num_fs,
1472 unsigned dst_channels,
1473 LLVMValueRef fs_src[][4],
1474 LLVMValueRef* dst,
1475 bool pad_inline)
1476 {
1477 LLVMValueRef src[16];
1478 unsigned pixels = type.length / 4;
1479 unsigned src_channels = dst_channels < 3 ? dst_channels : 4;
1480 unsigned src_count = num_fs * src_channels;
1481
1482 assert(pixels == 2 || pixels == 1);
1483 assert(num_fs * src_channels <= ARRAY_SIZE(src));
1484
1485 /*
1486 * Transpose from SoA -> AoS
1487 */
1488 for (unsigned i = 0; i < num_fs; ++i) {
1489 lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels,
1490 &src[i * src_channels]);
1491 }
1492
1493 /*
1494 * Pick transformation options
1495 */
1496 bool swizzle_pad = false;
1497 bool twiddle = false;
1498 bool split = false;
1499 unsigned reorder_group = 0;
1500
1501 if (dst_channels == 1) {
1502 twiddle = true;
1503 if (pixels == 2) {
1504 split = true;
1505 }
1506 } else if (dst_channels == 2) {
1507 if (pixels == 1) {
1508 reorder_group = 1;
1509 }
1510 } else if (dst_channels > 2) {
1511 if (pixels == 1) {
1512 reorder_group = 2;
1513 } else {
1514 twiddle = true;
1515 }
1516
1517 if (!pad_inline && dst_channels == 3 && pixels > 1) {
1518 swizzle_pad = true;
1519 }
1520 }
1521
1522 /*
1523 * Split the src in half
1524 */
1525 if (split) {
1526 for (unsigned i = num_fs; i > 0; --i) {
1527 src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
1528 src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
1529 }
1530
1531 src_count *= 2;
1532 type.length = 4;
1533 }
1534
1535 /*
1536 * Ensure pixels are in memory order
1537 */
1538 if (reorder_group) {
1539 /* Twiddle pixels by reordering the array, e.g.:
1540 *
1541 * src_count = 8 -> 0 2 1 3 4 6 5 7
1542 * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
1543 */
1544 const unsigned reorder_sw[] = { 0, 2, 1, 3 };
1545
1546 for (unsigned i = 0; i < src_count; ++i) {
1547 unsigned group = i / reorder_group;
1548 unsigned block = (group / 4) * 4 * reorder_group;
1549 unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
1550 dst[i] = src[j];
1551 }
1552 } else if (twiddle) {
1553 /* Twiddle pixels across elements of array */
1554 /*
1555 * XXX: we should avoid this in some cases, but would need to tell
1556 * lp_build_conv to reorder (or deal with it ourselves).
1557 */
1558 lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
1559 } else {
1560 /* Do nothing */
1561 memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
1562 }
1563
1564 /*
1565 * Moves any padding between pixels to the end
1566 * e.g. RGBXRGBX -> RGBRGBXX
1567 */
1568 if (swizzle_pad) {
1569 unsigned char swizzles[16];
1570 unsigned elems = pixels * dst_channels;
1571
1572 for (unsigned i = 0; i < type.length; ++i) {
1573 if (i < elems)
1574 swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
1575 else
1576 swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
1577 }
1578
1579 for (unsigned i = 0; i < src_count; ++i) {
1580 dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles,
1581 type.length, type.length);
1582 }
1583 }
1584
1585 return src_count;
1586 }
1587
1588
1589 /*
1590 * Untwiddle and transpose, much like the above.
1591 * However, this is after conversion, so we get packed vectors.
1592 * At this time this only handles 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data,
1593 * the vectors will look like:
1594 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
1595 * be swizzled here). Extending to 16bit should be trivial.
1596 * This should also be extended to handle twice-wide vectors with AVX2...
1597 */
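/*
 * Worked example (illustrative only, assuming the layout described above):
 * for src_count == 1 and a single channel r, the 16 x i8 input holds
 * r0r1r4r5 r2r3r6r7 r8r9r12r13 r10r11r14r15.  Viewed as 8 x i16 pixel pairs
 * and shuffled with the 0 2 1 3 4 6 5 7 pattern built below, this becomes
 * r0r1 r2r3 r4r5 r6r7 r8r9 r10r11 r12r13 r14r15, i.e. plain memory order.
 */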
1598 static void
1599 fs_twiddle_transpose(struct gallivm_state *gallivm,
1600 struct lp_type type,
1601 LLVMValueRef *src,
1602 unsigned src_count,
1603 LLVMValueRef *dst)
1604 {
1605 struct lp_type type64, type16, type32;
1606 LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
1607 LLVMBuilderRef builder = gallivm->builder;
1608 LLVMValueRef tmp[4], shuf[8];
1609 for (unsigned j = 0; j < 2; j++) {
1610 shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
1611 shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
1612 shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
1613 shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
1614 }
1615
1616 assert(src_count == 4 || src_count == 2 || src_count == 1);
1617 assert(type.width == 8);
1618 assert(type.length == 16);
1619
1620 type8_t = lp_build_vec_type(gallivm, type);
1621
1622 type64 = type;
1623 type64.length /= 8;
1624 type64.width *= 8;
1625 type64_t = lp_build_vec_type(gallivm, type64);
1626
1627 type16 = type;
1628 type16.length /= 2;
1629 type16.width *= 2;
1630 type16_t = lp_build_vec_type(gallivm, type16);
1631
1632 type32 = type;
1633 type32.length /= 4;
1634 type32.width *= 4;
1635 type32_t = lp_build_vec_type(gallivm, type32);
1636
1637 lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
1638
1639 if (src_count == 1) {
1640 /* transpose was no-op, just untwiddle */
1641 LLVMValueRef shuf_vec;
1642 shuf_vec = LLVMConstVector(shuf, 8);
1643 tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
1644 tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
1645 dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
1646 } else if (src_count == 2) {
1647 LLVMValueRef shuf_vec;
1648 shuf_vec = LLVMConstVector(shuf, 4);
1649
1650 for (unsigned i = 0; i < 2; i++) {
1651 tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
1652 tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
1653 dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
1654 }
1655 } else {
1656 for (unsigned j = 0; j < 2; j++) {
1657 LLVMValueRef lo, hi, lo2, hi2;
1658 /*
1659 * Note that if we only really have 3 valid channels (rgb)
1660 * and we don't need alpha we could substitute an undef here
1661 * for the respective channel (causing llvm to drop conversion
1662 * for alpha).
1663 */
1664 /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
1665 lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
1666 hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
1667 lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
1668 hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
1669 dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
1670 dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
1671 }
1672 }
1673 }
1674
1675
1676 /**
1677 * Load an unswizzled block of pixels from memory
1678 */
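/*
 * Illustrative addressing sketch (an assumption about typical usage, not part
 * of the original comment): element i is loaded from row y = i / row_size and
 * column x = i % row_size, i.e. from byte offset
 * y * stride + x * (dst_type.width / 8) * dst_type.length.  For example, a
 * 4x4 block read as four 4 x 32-bit vectors (dst_count == 4,
 * block_height == 4) has row_size == 1, so dst[i] is simply the 16-byte
 * vector at base_ptr + i * stride.
 */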
1679 static void
1680 load_unswizzled_block(struct gallivm_state *gallivm,
1681 LLVMTypeRef base_type,
1682 LLVMValueRef base_ptr,
1683 LLVMValueRef stride,
1684 unsigned block_width,
1685 unsigned block_height,
1686 LLVMValueRef* dst,
1687 struct lp_type dst_type,
1688 unsigned dst_count,
1689 unsigned dst_alignment)
1690 {
1691 LLVMBuilderRef builder = gallivm->builder;
1692 const unsigned row_size = dst_count / block_height;
1693
1694 /* Ensure block exactly fits into dst */
1695 assert((block_width * block_height) % dst_count == 0);
1696
1697 for (unsigned i = 0; i < dst_count; ++i) {
1698 unsigned x = i % row_size;
1699 unsigned y = i / row_size;
1700
1701 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
1702 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1703
1704 LLVMValueRef gep[2];
1705 LLVMValueRef dst_ptr;
1706
1707 gep[0] = lp_build_const_int32(gallivm, 0);
1708 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1709
1710 dst_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1711 dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
1712 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
1713
1714 dst[i] = LLVMBuildLoad2(builder,
1715 lp_build_vec_type(gallivm, dst_type),
1716 dst_ptr, "");
1717
1718 LLVMSetAlignment(dst[i], dst_alignment);
1719 }
1720 }
1721
1722
1723 /**
1724 * Store an unswizzled block of pixels to memory
1725 */
1726 static void
1727 store_unswizzled_block(struct gallivm_state *gallivm,
1728 LLVMTypeRef base_type,
1729 LLVMValueRef base_ptr,
1730 LLVMValueRef stride,
1731 unsigned block_width,
1732 unsigned block_height,
1733 LLVMValueRef src[], // [src_count]
1734 struct lp_type src_type,
1735 unsigned src_count,
1736 unsigned src_alignment)
1737 {
1738 LLVMBuilderRef builder = gallivm->builder;
1739 const unsigned row_size = src_count / block_height;
1740
1741 /* Ensure src exactly fits into block */
1742 assert((block_width * block_height) % src_count == 0);
1743
1744 for (unsigned i = 0; i < src_count; ++i) {
1745 unsigned x = i % row_size;
1746 unsigned y = i / row_size;
1747
1748 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
1749 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1750
1751 LLVMValueRef gep[2];
1752 LLVMValueRef src_ptr;
1753
1754 gep[0] = lp_build_const_int32(gallivm, 0);
1755 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1756
1757 src_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1758 src_ptr = LLVMBuildBitCast(builder, src_ptr,
1759 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
1760
1761 src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
1762
1763 LLVMSetAlignment(src_ptr, src_alignment);
1764 }
1765 }
1766
1767
1768
1769 /**
1770 * Retrieves the type for a format which is usable in the blending code.
1771 *
1772 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
1773 */
1774 static inline void
1775 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
1776 struct lp_type* type)
1777 {
1778 if (format_expands_to_float_soa(format_desc)) {
1779 /* always use ordinary floats for blending */
1780 type->floating = true;
1781 type->fixed = false;
1782 type->sign = true;
1783 type->norm = false;
1784 type->width = 32;
1785 type->length = 4;
1786 return;
1787 }
1788
1789 const int chan = util_format_get_first_non_void_channel(format_desc->format);
1790
1791 memset(type, 0, sizeof(struct lp_type));
1792 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
1793 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
1794 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
1795 type->norm = format_desc->channel[chan].normalized;
1796 type->width = format_desc->channel[chan].size;
1797 type->length = format_desc->nr_channels;
1798
1799 for (unsigned i = 1; i < format_desc->nr_channels; ++i) {
1800 if (format_desc->channel[i].size > type->width)
1801 type->width = format_desc->channel[i].size;
1802 }
1803
1804 if (type->floating) {
1805 type->width = 32;
1806 } else {
1807 if (type->width <= 8) {
1808 type->width = 8;
1809 } else if (type->width <= 16) {
1810 type->width = 16;
1811 } else {
1812 type->width = 32;
1813 }
1814 }
1815
1816 if (is_arithmetic_format(format_desc) && type->length == 3) {
1817 type->length = 4;
1818 }
1819 }
1820
1821
1822 /**
1823 * Scale a normalized value from src_bits to dst_bits.
1824 *
1825 * The exact calculation is
1826 *
1827 * dst = iround(src * dst_mask / src_mask)
1828 *
1829 * or with integer rounding
1830 *
1831 * dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
1832 *
1833 * where
1834 *
1835 * src_mask = (1 << src_bits) - 1
1836 * dst_mask = (1 << dst_bits) - 1
1837 *
1838 * but we try to avoid division and multiplication through shifts.
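 *
 * Worked example (illustrative): scaling src = 21 (10101b) up from 5 bits to
 * 8 bits gives iround(21 * 255 / 31) = 173; the shift-based path below
 * computes (21 << 3) | (21 >> 2) = 168 | 5 = 173, i.e. the usual
 * bit-replication trick matches the exact result in this case.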
1839 */
1840 static inline LLVMValueRef
1841 scale_bits(struct gallivm_state *gallivm,
1842 int src_bits,
1843 int dst_bits,
1844 LLVMValueRef src,
1845 struct lp_type src_type)
1846 {
1847 LLVMBuilderRef builder = gallivm->builder;
1848 LLVMValueRef result = src;
1849
1850 if (dst_bits < src_bits) {
1851 int delta_bits = src_bits - dst_bits;
1852
1853 if (delta_bits <= dst_bits) {
1854
1855 if (dst_bits == 4) {
1856 struct lp_type flt_type =
1857 lp_type_float_vec(32, src_type.length * 32);
1858
1859 result = lp_build_unsigned_norm_to_float(gallivm, src_bits,
1860 flt_type, src);
1861 result = lp_build_clamped_float_to_unsigned_norm(gallivm, flt_type,
1862 dst_bits, result);
1863 result = LLVMBuildTrunc(gallivm->builder, result,
1864 lp_build_int_vec_type(gallivm, src_type),
1865 "");
1866 return result;
1867 }
1868
1869 /*
1870 * Approximate the rescaling with a single shift.
1871 *
1872 * This gives the wrong rounding.
1873 */
1874
1875 result = LLVMBuildLShr(builder, src,
1876 lp_build_const_int_vec(gallivm, src_type,
1877 delta_bits),
1878 "");
1879 } else {
1880 /*
1881 * Try more accurate rescaling.
1882 */
1883
1884 /*
1885 * Drop the least significant bits to make space for the
1886 * multiplication.
1887 *
1888 * XXX: A better approach would be to use a wider integer type as
1889 * intermediate. But this is enough to convert alpha from 16bits ->
1890 * 2 when rendering to PIPE_FORMAT_R10G10B10A2_UNORM.
1891 */
1892 result = LLVMBuildLShr(builder, src,
1893 lp_build_const_int_vec(gallivm, src_type,
1894 dst_bits),
1895 "");
1896
1897
1898 result = LLVMBuildMul(builder, result,
1899 lp_build_const_int_vec(gallivm, src_type,
1900 (1LL << dst_bits) - 1),
1901 "");
1902
1903 /*
1904 * Add a rounding term before the division.
1905 *
1906 * TODO: Handle signed integers too.
1907 */
1908 if (!src_type.sign) {
1909 result = LLVMBuildAdd(builder, result,
1910 lp_build_const_int_vec(gallivm, src_type,
1911 (1LL << (delta_bits - 1))),
1912 "");
1913 }
1914
1915 /*
1916 * Approximate the division by src_mask with a src_bits shift.
1917 *
1918 * Given the src has already been shifted by dst_bits, all we need
1919 * to do is to shift by the difference.
1920 */
1921
1922 result = LLVMBuildLShr(builder,
1923 result,
1924 lp_build_const_int_vec(gallivm, src_type, delta_bits),
1925 "");
1926 }
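/*
 * Worked example for the more accurate path above (illustrative): scaling a
 * 16-bit alpha value of 0xffff down to 2 bits (as for R10G10B10A2) computes
 * ((0xffff >> 2) * 3 + (1 << 13)) >> 14 = 3, which matches the exact
 * result iround(0xffff * 3 / 0xffff) = 3.
 */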
1927
1928 } else if (dst_bits > src_bits) {
1929 /* Scale up bits */
1930 int db = dst_bits - src_bits;
1931
1932 /* Shift left by difference in bits */
1933 result = LLVMBuildShl(builder,
1934 src,
1935 lp_build_const_int_vec(gallivm, src_type, db),
1936 "");
1937
1938 if (db <= src_bits) {
1939 /* Enough bits in src to fill the remainder */
1940 LLVMValueRef lower = LLVMBuildLShr(builder,
1941 src,
1942 lp_build_const_int_vec(gallivm, src_type, src_bits - db),
1943 "");
1944
1945 result = LLVMBuildOr(builder, result, lower, "");
1946 } else if (db > src_bits) {
1947 /* Need to repeatedly copy src bits to fill remainder in dst */
1948 unsigned n;
1949
1950 for (n = src_bits; n < dst_bits; n *= 2) {
1951 LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
1952
1953 result = LLVMBuildOr(builder,
1954 result,
1955 LLVMBuildLShr(builder, result, shuv, ""),
1956 "");
1957 }
1958 }
1959 }
1960
1961 return result;
1962 }
1963
1964 /**
1965 * Return whether the RT is a smallfloat (denorm-needing) format.
1966 */
1967 static inline int
1968 have_smallfloat_format(struct lp_type dst_type,
1969 enum pipe_format format)
1970 {
1971 return ((dst_type.floating && dst_type.width != 32) ||
1972 /* due to format handling hacks this format doesn't have floating set
1973 * here (and actually has width set to 32 too), so we special-case this.
1974 */
1975 (format == PIPE_FORMAT_R11G11B10_FLOAT));
1976 }
1977
1978
1979 /**
1980 * Convert from memory format to blending format
1981 *
1982 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
1983 */
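/*
 * For the arithmetic path below, each channel is handled roughly like the
 * following scalar sketch (illustrative only, little-endian case):
 *
 *    chan   = (mem_value >> channel[j].shift) & ((1 << channel[j].size) - 1);
 *    if (normalized)
 *       chan = scale_bits(channel[j].size, blend_type.width, chan);
 *    blend |= chan << (j * blend_type.width);
 *
 * i.e. extract the packed bits, scale them up to the blend channel width,
 * and insert them at the channel's position in the blend-type vector.
 */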
1984 static void
1985 convert_to_blend_type(struct gallivm_state *gallivm,
1986 unsigned block_size,
1987 const struct util_format_description *src_fmt,
1988 struct lp_type src_type,
1989 struct lp_type dst_type,
1990 LLVMValueRef* src, // and dst
1991 unsigned num_srcs)
1992 {
1993 LLVMValueRef *dst = src;
1994 LLVMBuilderRef builder = gallivm->builder;
1995 struct lp_type blend_type;
1996 struct lp_type mem_type;
1997 unsigned i, j;
1998 unsigned pixels = block_size / num_srcs;
1999 bool is_arith;
2000
2001 /*
2002 * full custom path for packed floats and srgb formats - none of the later
2003 * functions would do anything useful, and given the lp_type representation
2004 * they can't be fixed. Should really have some SoA blend path for these
2005 * kind of formats rather than hacking them in here.
2006 */
2007 if (format_expands_to_float_soa(src_fmt)) {
2008 LLVMValueRef tmpsrc[4];
2009 /*
2010 * This is pretty suboptimal for this case; blending in SoA would be much
2011 * better, since conversion gets us SoA values which we then need to convert back.
2012 */
2013 assert(src_type.width == 32 || src_type.width == 16);
2014 assert(dst_type.floating);
2015 assert(dst_type.width == 32);
2016 assert(dst_type.length % 4 == 0);
2017 assert(num_srcs % 4 == 0);
2018
2019 if (src_type.width == 16) {
2020 /* expand 4x16bit values to 4x32bit */
2021 struct lp_type type32x4 = src_type;
2022 LLVMTypeRef ltype32x4;
2023 unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
2024 type32x4.width = 32;
2025 ltype32x4 = lp_build_vec_type(gallivm, type32x4);
2026 for (i = 0; i < num_fetch; i++) {
2027 src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
2028 }
2029 src_type.width = 32;
2030 }
2031 for (i = 0; i < 4; i++) {
2032 tmpsrc[i] = src[i];
2033 }
2034 for (i = 0; i < num_srcs / 4; i++) {
2035 LLVMValueRef tmpsoa[4];
2036 LLVMValueRef tmps = tmpsrc[i];
2037 if (dst_type.length == 8) {
2038 LLVMValueRef shuffles[8];
2039 unsigned j;
2040 /* fetch was 4 values but need 8-wide output values */
2041 tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
2042 /*
2043 * for 8-wide aos the transpose would give us the wrong order, not matching
2044 * the incoming converted fs values and mask. ARGH.
2045 */
2046 for (j = 0; j < 4; j++) {
2047 shuffles[j] = lp_build_const_int32(gallivm, j * 2);
2048 shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
2049 }
2050 tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
2051 LLVMConstVector(shuffles, 8), "");
2052 }
2053 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2054 lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
2055 } else {
2056 lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
2057 }
2058 lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
2059 }
2060 return;
2061 }
2062
2063 lp_mem_type_from_format_desc(src_fmt, &mem_type);
2064 lp_blend_type_from_format_desc(src_fmt, &blend_type);
2065
2066 /* Is the format arithmetic */
2067 is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
2068 is_arith &= !(mem_type.width == 16 && mem_type.floating);
2069
2070 /* Pad if necessary */
2071 if (!is_arith && src_type.length < dst_type.length) {
2072 for (i = 0; i < num_srcs; ++i) {
2073 dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
2074 }
2075
2076 src_type.length = dst_type.length;
2077 }
2078
2079 /* Special case for half-floats */
2080 if (mem_type.width == 16 && mem_type.floating) {
2081 assert(blend_type.width == 32 && blend_type.floating);
2082 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2083 is_arith = false;
2084 }
2085
2086 if (!is_arith) {
2087 return;
2088 }
2089
2090 src_type.width = blend_type.width * blend_type.length;
2091 blend_type.length *= pixels;
2092 src_type.length *= pixels / (src_type.length / mem_type.length);
2093
2094 for (i = 0; i < num_srcs; ++i) {
2095 LLVMValueRef chans;
2096 LLVMValueRef res = NULL;
2097
2098 dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2099
2100 for (j = 0; j < src_fmt->nr_channels; ++j) {
2101 unsigned mask = 0;
2102 unsigned sa = src_fmt->channel[j].shift;
2103 #if UTIL_ARCH_LITTLE_ENDIAN
2104 unsigned from_lsb = j;
2105 #else
2106 unsigned from_lsb = (blend_type.length / pixels) - j - 1;
2107 #endif
2108
2109 mask = (1 << src_fmt->channel[j].size) - 1;
2110
2111 /* Extract bits from source */
2112 chans = LLVMBuildLShr(builder,
2113 dst[i],
2114 lp_build_const_int_vec(gallivm, src_type, sa),
2115 "");
2116
2117 chans = LLVMBuildAnd(builder,
2118 chans,
2119 lp_build_const_int_vec(gallivm, src_type, mask),
2120 "");
2121
2122 /* Scale bits */
2123 if (src_type.norm) {
2124 chans = scale_bits(gallivm, src_fmt->channel[j].size,
2125 blend_type.width, chans, src_type);
2126 }
2127
2128 /* Insert bits into correct position */
2129 chans = LLVMBuildShl(builder,
2130 chans,
2131 lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
2132 "");
2133
2134 if (j == 0) {
2135 res = chans;
2136 } else {
2137 res = LLVMBuildOr(builder, res, chans, "");
2138 }
2139 }
2140
2141 dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
2142 }
2143 }
2144
2145
2146 /**
2147 * Convert from blending format to memory format
2148 *
2149 * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
2150 */
2151 static void
2152 convert_from_blend_type(struct gallivm_state *gallivm,
2153 unsigned block_size,
2154 const struct util_format_description *src_fmt,
2155 struct lp_type src_type,
2156 struct lp_type dst_type,
2157 LLVMValueRef* src, // and dst
2158 unsigned num_srcs)
2159 {
2160 LLVMValueRef* dst = src;
2161 unsigned i, j, k;
2162 struct lp_type mem_type;
2163 struct lp_type blend_type;
2164 LLVMBuilderRef builder = gallivm->builder;
2165 unsigned pixels = block_size / num_srcs;
2166 bool is_arith;
2167
2168 /*
2169 * full custom path for packed floats and srgb formats - none of the later
2170 * functions would do anything useful, and given the lp_type representation
2171 * they can't be fixed. Should really have some SoA blend path for these
2172 * kind of formats rather than hacking them in here.
2173 */
2174 if (format_expands_to_float_soa(src_fmt)) {
2175 /*
2176 * This is pretty suboptimal for this case; blending in SoA would be much
2177 * better - we need to transpose the AoS values back to SoA values for
2178 * conversion/packing.
2179 */
2180 assert(src_type.floating);
2181 assert(src_type.width == 32);
2182 assert(src_type.length % 4 == 0);
2183 assert(dst_type.width == 32 || dst_type.width == 16);
2184
2185 for (i = 0; i < num_srcs / 4; i++) {
2186 LLVMValueRef tmpsoa[4], tmpdst;
2187 lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
2188 /* really really need SoA here */
2189
2190 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2191 tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
2192 } else {
2193 tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
2194 src_type, tmpsoa);
2195 }
2196
2197 if (src_type.length == 8) {
2198 LLVMValueRef tmpaos, shuffles[8];
2199 unsigned j;
2200 /*
2201 * for 8-wide aos the transpose has given us the wrong order, not matching
2202 * the output order. HMPF. We also need to split the output values
2203 * manually.
2204 */
2205 for (j = 0; j < 4; j++) {
2206 shuffles[j * 2] = lp_build_const_int32(gallivm, j);
2207 shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
2208 }
2209 tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
2210 LLVMConstVector(shuffles, 8), "");
2211 src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
2212 src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
2213 } else {
2214 src[i] = tmpdst;
2215 }
2216 }
2217 if (dst_type.width == 16) {
2218 struct lp_type type16x8 = dst_type;
2219 struct lp_type type32x4 = dst_type;
2220 LLVMTypeRef ltype16x4, ltypei64, ltypei128;
2221 unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
2222 type16x8.length = 8;
2223 type32x4.width = 32;
2224 ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
2225 ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
2226 ltype16x4 = lp_build_vec_type(gallivm, dst_type);
2227 /* We could do vector truncation but it doesn't generate very good code */
2228 for (i = 0; i < num_fetch; i++) {
2229 src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
2230 src[i], lp_build_zero(gallivm, type32x4));
2231 src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
2232 src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
2233 src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
2234 }
2235 }
2236 return;
2237 }
2238
2239 lp_mem_type_from_format_desc(src_fmt, &mem_type);
2240 lp_blend_type_from_format_desc(src_fmt, &blend_type);
2241
2242 is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
2243
2244 /* Special case for half-floats */
2245 if (mem_type.width == 16 && mem_type.floating) {
2246 int length = dst_type.length;
2247 assert(blend_type.width == 32 && blend_type.floating);
2248
2249 dst_type.length = src_type.length;
2250
2251 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2252
2253 dst_type.length = length;
2254 is_arith = false;
2255 }
2256
2257 /* Remove any padding */
2258 if (!is_arith && (src_type.length % mem_type.length)) {
2259 src_type.length -= (src_type.length % mem_type.length);
2260
2261 for (i = 0; i < num_srcs; ++i) {
2262 dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
2263 }
2264 }
2265
2266 /* No bit arithmetic to do */
2267 if (!is_arith) {
2268 return;
2269 }
2270
2271 src_type.length = pixels;
2272 src_type.width = blend_type.length * blend_type.width;
2273 dst_type.length = pixels;
2274
2275 for (i = 0; i < num_srcs; ++i) {
2276 LLVMValueRef chans;
2277 LLVMValueRef res = NULL;
2278
2279 dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2280
2281 for (j = 0; j < src_fmt->nr_channels; ++j) {
2282 unsigned mask = 0;
2283 unsigned sa = src_fmt->channel[j].shift;
2284 unsigned sz_a = src_fmt->channel[j].size;
2285 #if UTIL_ARCH_LITTLE_ENDIAN
2286 unsigned from_lsb = j;
2287 #else
2288 unsigned from_lsb = blend_type.length - j - 1;
2289 #endif
2290
2291 assert(blend_type.width > src_fmt->channel[j].size);
2292
2293 for (k = 0; k < blend_type.width; ++k) {
2294 mask |= 1 << k;
2295 }
2296
2297 /* Extract bits */
2298 chans = LLVMBuildLShr(builder,
2299 dst[i],
2300 lp_build_const_int_vec(gallivm, src_type,
2301 from_lsb * blend_type.width),
2302 "");
2303
2304 chans = LLVMBuildAnd(builder,
2305 chans,
2306 lp_build_const_int_vec(gallivm, src_type, mask),
2307 "");
2308
2309 /* Scale down bits */
2310 if (src_type.norm) {
2311 chans = scale_bits(gallivm, blend_type.width,
2312 src_fmt->channel[j].size, chans, src_type);
2313 } else if (!src_type.floating && sz_a < blend_type.width) {
2314 LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
2315 LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans, mask_val, "");
2316 chans = LLVMBuildSelect(builder, mask, mask_val, chans, "");
2317 }
2318
2319 /* Insert bits */
2320 chans = LLVMBuildShl(builder,
2321 chans,
2322 lp_build_const_int_vec(gallivm, src_type, sa),
2323 "");
2324
2325 sa += src_fmt->channel[j].size;
2326
2327 if (j == 0) {
2328 res = chans;
2329 } else {
2330 res = LLVMBuildOr(builder, res, chans, "");
2331 }
2332 }
2333
2334 assert (dst_type.width != 24);
2335
2336 dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
2337 }
2338 }
2339
2340
2341 /**
2342 * Convert alpha to same blend type as src
2343 */
2344 static void
2345 convert_alpha(struct gallivm_state *gallivm,
2346 struct lp_type row_type,
2347 struct lp_type alpha_type,
2348 const unsigned block_size,
2349 const unsigned block_height,
2350 const unsigned src_count,
2351 const unsigned dst_channels,
2352 const bool pad_inline,
2353 LLVMValueRef* src_alpha)
2354 {
2355 LLVMBuilderRef builder = gallivm->builder;
2356 const unsigned length = row_type.length;
2357 row_type.length = alpha_type.length;
2358
2359 /* Twiddle the alpha to match pixels */
2360 lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);
2361
2362 /*
2363 * TODO this should use single lp_build_conv call for
2364 * src_count == 1 && dst_channels == 1 case (dropping the concat below)
2365 */
2366 for (unsigned i = 0; i < block_height; ++i) {
2367 lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1,
2368 &src_alpha[i], 1);
2369 }
2370
2371 alpha_type = row_type;
2372 row_type.length = length;
2373
2374 /* If there is only one channel we only need the single alpha value per pixel */
2375 if (src_count == 1 && dst_channels == 1) {
2376 lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height,
2377 src_alpha, src_count);
2378 } else {
2379 /* If there are more srcs than rows then we need to split alpha up */
2380 if (src_count > block_height) {
2381 for (unsigned i = src_count; i > 0; --i) {
2382 unsigned pixels = block_size / src_count;
2383 unsigned idx = i - 1;
2384
2385 src_alpha[idx] =
2386 lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
2387 (idx * pixels) % 4, pixels);
2388 }
2389 }
2390
2391 /* If there is a src for each pixel broadcast the alpha across whole
2392 * row
2393 */
2394 if (src_count == block_size) {
2395 for (unsigned i = 0; i < src_count; ++i) {
2396 src_alpha[i] = lp_build_broadcast(gallivm,
2397 lp_build_vec_type(gallivm, row_type), src_alpha[i]);
2398 }
2399 } else {
2400 unsigned pixels = block_size / src_count;
2401 unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
2402 unsigned alpha_span = 1;
2403 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
2404
2405 /* Check if we need 2 src_alphas for our shuffles */
2406 if (pixels > alpha_type.length) {
2407 alpha_span = 2;
2408 }
2409
2410 /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
2411 for (unsigned j = 0; j < row_type.length; ++j) {
2412 if (j < pixels * channels) {
2413 shuffles[j] = lp_build_const_int32(gallivm, j / channels);
2414 } else {
2415 shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2416 }
2417 }
2418
2419 for (unsigned i = 0; i < src_count; ++i) {
2420 unsigned idx1 = i, idx2 = i;
2421
2422 if (alpha_span > 1){
2423 idx1 *= alpha_span;
2424 idx2 = idx1 + 1;
2425 }
2426
2427 src_alpha[i] = LLVMBuildShuffleVector(builder,
2428 src_alpha[idx1],
2429 src_alpha[idx2],
2430 LLVMConstVector(shuffles, row_type.length),
2431 "");
2432 }
2433 }
2434 }
2435 }
2436
2437
2438 /**
2439 * Generates the blend function for unswizzled colour buffers
2440 * Also generates the read & write from colour buffer
2441 */
2442 static void
2443 generate_unswizzled_blend(struct gallivm_state *gallivm,
2444 unsigned rt,
2445 struct lp_fragment_shader_variant *variant,
2446 enum pipe_format out_format,
2447 unsigned int num_fs,
2448 struct lp_type fs_type,
2449 LLVMValueRef* fs_mask,
2450 LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
2451 LLVMTypeRef context_type,
2452 LLVMValueRef context_ptr,
2453 LLVMTypeRef color_type,
2454 LLVMValueRef color_ptr,
2455 LLVMValueRef stride,
2456 unsigned partial_mask,
2457 bool do_branch)
2458 {
2459 const unsigned alpha_channel = 3;
2460 const unsigned block_width = LP_RASTER_BLOCK_SIZE;
2461 const unsigned block_height = LP_RASTER_BLOCK_SIZE;
2462 const unsigned block_size = block_width * block_height;
2463 const unsigned lp_integer_vector_width = 128;
2464
2465 LLVMBuilderRef builder = gallivm->builder;
2466 LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
2467 LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
2468 LLVMValueRef src_alpha[4 * 4];
2469 LLVMValueRef src1_alpha[4 * 4] = { NULL };
2470 LLVMValueRef src_mask[4 * 4];
2471 LLVMValueRef src[4 * 4];
2472 LLVMValueRef src1[4 * 4];
2473 LLVMValueRef dst[4 * 4];
2474
2475 struct lp_build_mask_context mask_ctx;
2476
2477 unsigned char swizzle[TGSI_NUM_CHANNELS];
2478 unsigned src_channels = TGSI_NUM_CHANNELS;
2479
2480 const struct util_format_description *out_format_desc =
2481 util_format_description(out_format);
2482
2483 bool pad_inline = is_arithmetic_format(out_format_desc);
2484 const bool dual_source_blend =
2485 variant->key.blend.rt[0].blend_enable &&
2486 util_blend_state_is_dual(&variant->key.blend, 0);
2487
2488 const bool is_1d = variant->key.resource_1d;
2489 const unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
2490 LLVMValueRef fpstate = NULL;
2491
2492 LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2493
2494 /* Get type from output format */
2495 struct lp_type row_type, dst_type;
2496 lp_blend_type_from_format_desc(out_format_desc, &row_type);
2497 lp_mem_type_from_format_desc(out_format_desc, &dst_type);
2498
2499 /*
2500 * Technically this code should go into lp_build_smallfloat_to_float
2501 * and lp_build_float_to_smallfloat, but due to
2502 * http://llvm.org/bugs/show_bug.cgi?id=6393
2503 * llvm reorders the mxcsr intrinsics in a way that breaks the code.
2504 * So the ordering is important here, and there shouldn't be any
2505 * llvm ir instructions in this function before
2506 * this, otherwise half-float format conversions won't work
2507 * (again due to llvm bug #6393).
2508 */
2509 if (have_smallfloat_format(dst_type, out_format)) {
2510 /* We need to make sure that denorms are ok for half float
2511 conversions */
2512 fpstate = lp_build_fpstate_get(gallivm);
2513 lp_build_fpstate_set_denorms_zero(gallivm, false);
2514 }
2515
2516 struct lp_type mask_type = lp_int32_vec4_type();
2517 mask_type.length = fs_type.length;
2518
2519 for (unsigned i = num_fs; i < num_fullblock_fs; i++) {
2520 fs_mask[i] = lp_build_zero(gallivm, mask_type);
2521 }
2522
2523 /* Do not bother executing code when the mask is empty. */
2524 if (do_branch) {
2525 LLVMValueRef check_mask =
2526 LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
2527
2528 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2529 check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
2530 }
2531
2532 lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
2533 lp_build_mask_check(&mask_ctx);
2534 }
2535
2536 partial_mask |= !variant->opaque;
2537 LLVMValueRef i32_zero = lp_build_const_int32(gallivm, 0);
2538
2539 LLVMValueRef undef_src_val = lp_build_undef(gallivm, fs_type);
2540
2541 row_type.length = fs_type.length;
2542 unsigned vector_width =
2543 dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
2544
2545 /* Compute correct swizzle and count channels */
2546 memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
2547 unsigned dst_channels = 0;
2548
2549 bool has_alpha = false;
2550 for (unsigned i = 0; i < TGSI_NUM_CHANNELS; ++i) {
2551 /* Ensure channel is used */
2552 if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
2553 continue;
2554 }
2555
2556 /* Ensure not already written to (happens in case with GL_ALPHA) */
2557 if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
2558 continue;
2559 }
2560
2561 /* Ensure we haven't already found all channels */
2562 if (dst_channels >= out_format_desc->nr_channels) {
2563 continue;
2564 }
2565
2566 swizzle[out_format_desc->swizzle[i]] = i;
2567 ++dst_channels;
2568
2569 if (i == alpha_channel) {
2570 has_alpha = true;
2571 }
2572 }
2573
2574 if (format_expands_to_float_soa(out_format_desc)) {
2575 /*
2576 * the code above can't work for layout_other
2577 * for srgb it would sort of work but we short-circuit swizzles, etc.
2578 * as that is done as part of unpack / pack.
2579 */
2580 dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */
2581 has_alpha = true;
2582 swizzle[0] = 0;
2583 swizzle[1] = 1;
2584 swizzle[2] = 2;
2585 swizzle[3] = 3;
2586 pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
2587 }
2588
2589 /* If 3 channels then pad to include alpha for 4 element transpose */
2590 if (dst_channels == 3) {
2591 assert (!has_alpha);
2592 for (unsigned i = 0; i < TGSI_NUM_CHANNELS; i++) {
2593 if (swizzle[i] > TGSI_NUM_CHANNELS)
2594 swizzle[i] = 3;
2595 }
2596 if (out_format_desc->nr_channels == 4) {
2597 dst_channels = 4;
2598 /*
2599 * We use alpha from the color conversion, not a separate one.
2600 * We had to include it for transpose, hence it will get converted
2601 * too (albeit when doing transpose after conversion, that would
2602 * no longer be the case necessarily).
2603 * (It works only with 4 channel dsts, e.g. rgbx formats, because
2604 * otherwise we really have padding, not alpha, included.)
2605 */
2606 has_alpha = true;
2607 }
2608 }
2609
2610 /*
2611 * Load shader output
2612 */
2613 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2614 /* Always load alpha for use in blending */
2615 LLVMValueRef alpha;
2616 if (i < num_fs) {
2617 alpha = LLVMBuildLoad2(builder, fs_vec_type,
2618 fs_out_color[rt][alpha_channel][i], "");
2619 } else {
2620 alpha = undef_src_val;
2621 }
2622
2623 /* Load each channel */
2624 for (unsigned j = 0; j < dst_channels; ++j) {
2625 assert(swizzle[j] < 4);
2626 if (i < num_fs) {
2627 fs_src[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2628 fs_out_color[rt][swizzle[j]][i], "");
2629 } else {
2630 fs_src[i][j] = undef_src_val;
2631 }
2632 }
2633
2634 /* If 3 channels then pad to include alpha for 4 element transpose */
2635 /*
2636 * XXX If we include that here maybe we could actually use it instead of
2637 * separate alpha for blending?
2638 * (Difficult though, as we actually convert pad channels, not alpha.)
2639 */
2640 if (dst_channels == 3 && !has_alpha) {
2641 fs_src[i][3] = alpha;
2642 }
2643
2644 /* We split the row_mask and row_alpha as we want 128bit interleave */
2645 if (fs_type.length == 8) {
2646 src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i],
2647 0, src_channels);
2648 src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i],
2649 src_channels,
2650 src_channels);
2651
2652 src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha,
2653 0, src_channels);
2654 src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2655 src_channels,
2656 src_channels);
2657 } else {
2658 src_mask[i] = fs_mask[i];
2659 src_alpha[i] = alpha;
2660 }
2661 }
2662 if (dual_source_blend) {
2663 /* same as above except different src/dst, skip masks and comments... */
2664 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2665 LLVMValueRef alpha;
2666 if (i < num_fs) {
2667 alpha = LLVMBuildLoad2(builder, fs_vec_type,
2668 fs_out_color[1][alpha_channel][i], "");
2669 } else {
2670 alpha = undef_src_val;
2671 }
2672
2673 for (unsigned j = 0; j < dst_channels; ++j) {
2674 assert(swizzle[j] < 4);
2675 if (i < num_fs) {
2676 fs_src1[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2677 fs_out_color[1][swizzle[j]][i], "");
2678 } else {
2679 fs_src1[i][j] = undef_src_val;
2680 }
2681 }
2682 if (dst_channels == 3 && !has_alpha) {
2683 fs_src1[i][3] = alpha;
2684 }
2685 if (fs_type.length == 8) {
2686 src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2687 src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2688 src_channels, src_channels);
2689 } else {
2690 src1_alpha[i] = alpha;
2691 }
2692 }
2693 }
2694
2695 if (util_format_is_pure_integer(out_format)) {
2696 /*
2697 * In this case fs_type was really ints or uints disguised as floats,
2698 * fix that up now.
2699 */
2700 fs_type.floating = 0;
2701 fs_type.sign = dst_type.sign;
2702 fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2703 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2704 for (unsigned j = 0; j < dst_channels; ++j) {
2705 fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
2706 fs_vec_type, "");
2707 }
2708 if (dst_channels == 3 && !has_alpha) {
2709 fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
2710 fs_vec_type, "");
2711 }
2712 }
2713 }
2714
2715 /*
2716 * We actually should generally do conversion first (for non-1d cases)
2717 * when the blend format is 8 or 16 bits. The reason is obvious:
2718 * there are 2 or 4 times fewer vectors to deal with for the interleave...
2719 * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
2720 * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
2721 * unpack only with 128bit vectors).
2722 * Note: for 16bit sizes really need matching pack conversion code
2723 */
2724 bool twiddle_after_convert = false;
2725 if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
2726 twiddle_after_convert = true;
2727 }
2728
2729 /*
2730 * Pixel twiddle from fragment shader order to memory order
2731 */
2732 unsigned src_count;
2733 if (!twiddle_after_convert) {
2734 src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
2735 dst_channels, fs_src, src, pad_inline);
2736 if (dual_source_blend) {
2737 generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
2738 fs_src1, src1, pad_inline);
2739 }
2740 } else {
2741 src_count = num_fullblock_fs * dst_channels;
2742 /*
2743 * We reorder things a bit here, so the cases for 4-wide and 8-wide
2744 * (AVX) turn out the same later when untwiddling/transpose (albeit
2745 * for true AVX2 path untwiddle needs to be different).
2746 * For now just order by colors first (so we can use unpack later).
2747 */
2748 for (unsigned j = 0; j < num_fullblock_fs; j++) {
2749 for (unsigned i = 0; i < dst_channels; i++) {
2750 src[i*num_fullblock_fs + j] = fs_src[j][i];
2751 if (dual_source_blend) {
2752 src1[i*num_fullblock_fs + j] = fs_src1[j][i];
2753 }
2754 }
2755 }
2756 }
2757
2758 src_channels = dst_channels < 3 ? dst_channels : 4;
2759 if (src_count != num_fullblock_fs * src_channels) {
2760 unsigned ds = src_count / (num_fullblock_fs * src_channels);
2761 row_type.length /= ds;
2762 fs_type.length = row_type.length;
2763 fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2764 }
2765
2766 struct lp_type blend_type = row_type;
2767 mask_type.length = 4;
2768
2769 /* Convert src to row_type */
2770 if (dual_source_blend) {
2771 struct lp_type old_row_type = row_type;
2772 lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2773 src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type,
2774 src1, src_count, src1);
2775 } else {
2776 src_count = lp_build_conv_auto(gallivm, fs_type, &row_type,
2777 src, src_count, src);
2778 }
2779
2780 /* If the rows are not an SSE vector, combine them to become SSE size! */
2781 if ((row_type.width * row_type.length) % 128) {
2782 unsigned bits = row_type.width * row_type.length;
2783 unsigned combined;
2784
2785 assert(src_count >= (vector_width / bits));
2786
2787 const unsigned dst_count = src_count / (vector_width / bits);
2788
2789 combined = lp_build_concat_n(gallivm, row_type, src, src_count,
2790 src, dst_count);
2791 if (dual_source_blend) {
2792 lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
2793 }
2794
2795 row_type.length *= combined;
2796 src_count /= combined;
2797
2798 bits = row_type.width * row_type.length;
2799 assert(bits == 128 || bits == 256);
2800 }
2801
2802 if (twiddle_after_convert) {
2803 fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
2804 if (dual_source_blend) {
2805 fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
2806 }
2807 }
2808
2809 /*
2810 * Blend Colour conversion
2811 */
2812 LLVMValueRef blend_color =
2813 lp_jit_context_f_blend_color(gallivm, context_type, context_ptr);
2814 blend_color = LLVMBuildPointerCast(builder, blend_color,
2815 LLVMPointerType(fs_vec_type, 0),
2816 "");
2817 blend_color = LLVMBuildLoad2(builder, fs_vec_type,
2818 LLVMBuildGEP2(builder, fs_vec_type,
2819 blend_color,
2820 &i32_zero, 1, ""), "");
2821
2822 /* Convert */
2823 lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1,
2824 &blend_color, 1);
2825
2826 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
2827 /*
2828 * since blending is done with floats, there was no conversion.
2829 * However, the rules according to fixed point renderbuffers still
2830 * apply, that is we must clamp inputs to 0.0/1.0.
2831 * (This would apply to separate alpha conversion too but we currently
2832 * force has_alpha to be true.)
2833 * TODO: should skip this with "fake" blend, since post-blend conversion
2834 * will clamp anyway.
2835 * TODO: could also skip this if fragment color clamping is enabled.
2836 * We don't support it natively so it gets baked into the shader
2837 * however, so can't really tell here.
2838 */
2839 struct lp_build_context f32_bld;
2840 assert(row_type.floating);
2841 lp_build_context_init(&f32_bld, gallivm, row_type);
2842 for (unsigned i = 0; i < src_count; i++) {
2843 src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
2844 }
2845 if (dual_source_blend) {
2846 for (unsigned i = 0; i < src_count; i++) {
2847 src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
2848 }
2849 }
2850 /* probably can't be different from row_type but better safe than sorry... */
2851 lp_build_context_init(&f32_bld, gallivm, blend_type);
2852 blend_color = lp_build_clamp(&f32_bld, blend_color,
2853 f32_bld.zero, f32_bld.one);
2854 }
2855
2856 /* Extract alpha */
2857 LLVMValueRef blend_alpha =
2858 lp_build_extract_broadcast(gallivm, blend_type, row_type,
2859 blend_color,
2860 lp_build_const_int32(gallivm, 3));
2861
2862 /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
2863 pad_inline &= (dst_channels * (block_size / src_count) * row_type.width)
2864 != vector_width;
2865 if (pad_inline) {
2866 /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
2867 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2868 TGSI_NUM_CHANNELS, row_type.length);
2869 } else {
2870 /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
2871 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2872 dst_channels, row_type.length);
2873 }
2874
2875 /*
2876 * Mask conversion
2877 */
2878 lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0],
2879 block_height, &src_mask[0]);
2880
2881 if (src_count < block_height) {
2882 lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
2883 } else if (src_count > block_height) {
2884 for (unsigned i = src_count; i > 0; --i) {
2885 unsigned pixels = block_size / src_count;
2886 unsigned idx = i - 1;
2887
2888 src_mask[idx] = lp_build_extract_range(gallivm,
2889 src_mask[(idx * pixels) / 4],
2890 (idx * pixels) % 4, pixels);
2891 }
2892 }
2893
2894 assert(mask_type.width == 32);
2895
2896 for (unsigned i = 0; i < src_count; ++i) {
2897 unsigned pixels = block_size / src_count;
2898 unsigned pixel_width = row_type.width * dst_channels;
2899
2900 if (pixel_width == 24) {
2901 mask_type.width = 8;
2902 mask_type.length = vector_width / mask_type.width;
2903 } else {
2904 mask_type.length = pixels;
2905 mask_type.width = row_type.width * dst_channels;
2906
2907 /*
2908 * If mask_type width is smaller than 32bit, this doesn't quite
2909 * generate the most efficient code (could use some pack).
2910 */
2911 src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
2912 lp_build_int_vec_type(gallivm,
2913 mask_type), "");
2914
2915 mask_type.length *= dst_channels;
2916 mask_type.width /= dst_channels;
2917 }
2918
2919 src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
2920 lp_build_int_vec_type(gallivm, mask_type),
2921 "");
2922 src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
2923 }
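/*
 * Example of the mask conversion above (illustrative values): with 4 pixels
 * per row vector and an 8-bit 4-channel dst (pixel_width == 32), each 32-bit
 * per-pixel mask is bitcast into four per-channel byte masks, so one 4 x i32
 * mask becomes a 16 x i8 mask matching the interleaved row layout.
 */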
2924
2925 /*
2926 * Alpha conversion
2927 */
2928 if (!has_alpha) {
2929 struct lp_type alpha_type = fs_type;
2930 alpha_type.length = 4;
2931 convert_alpha(gallivm, row_type, alpha_type,
2932 block_size, block_height,
2933 src_count, dst_channels,
2934 pad_inline, src_alpha);
2935 if (dual_source_blend) {
2936 convert_alpha(gallivm, row_type, alpha_type,
2937 block_size, block_height,
2938 src_count, dst_channels,
2939 pad_inline, src1_alpha);
2940 }
2941 }
2942
2943
2944 /*
2945 * Load dst from memory
2946 */
2947 unsigned dst_count;
2948 if (src_count < block_height) {
2949 dst_count = block_height;
2950 } else {
2951 dst_count = src_count;
2952 }
2953
2954 dst_type.length *= block_size / dst_count;
2955
2956 if (format_expands_to_float_soa(out_format_desc)) {
2957 /*
2958 * we need multiple values at once for the conversion, so we may as well
2959 * load them vectorized here too instead of concatenating later.
2960 * (We still need concatenation later for 8-wide vectors.)
2961 */
2962 dst_count = block_height;
2963 dst_type.length = block_width;
2964 }
2965
2966 /*
2967 * Compute the alignment of the destination pointer in bytes
2968 * We fetch 1-4 pixels; if the format has pot alignment then those fetches
2969 * are always aligned by MIN2(16, fetch_width), except for buffers (not
2970 * 1d textures, but we can't distinguish them here), so we need to stick with
2971 * per-pixel alignment in this case.
2972 */
2973 unsigned dst_alignment;
2974 if (is_1d) {
2975 dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
2976 } else {
2977 dst_alignment = dst_type.length * dst_type.width / 8;
2978 }
2979 /* Force power-of-two alignment by extracting only the least-significant-bit */
2980 dst_alignment = 1 << (ffs(dst_alignment) - 1);
2981 /*
2982 * Resource base and stride pointers are aligned to 16 bytes, so that's
2983 * the maximum alignment we can guarantee
2984 */
2985 dst_alignment = MIN2(16, dst_alignment);
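/*
 * For example (illustrative): a non-1d rendertarget read as 4 x 32-bit
 * vectors gets dst_alignment = MIN2(16, 16) = 16, while a 1d R8 target
 * falls back to 1-byte per-pixel alignment.
 */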
2986
2987 struct lp_type ls_type = dst_type;
2988
2989 if (dst_count > src_count) {
2990 if ((dst_type.width == 8 || dst_type.width == 16) &&
2991 util_is_power_of_two_or_zero(dst_type.length) &&
2992 dst_type.length * dst_type.width < 128) {
2993 /*
2994 * Never try to load values as 4xi8 which we will then
2995 * concatenate to larger vectors. This gives llvm a real
2996 * headache (the problem is the type legalizer (?) will
2997 * try to load that as 4xi8 zext to 4xi32 to fill the vector,
2998 * then the shuffles to concatenate are more or less impossible
2999 * - llvm is easily capable of generating a sequence of 32
3000 * pextrb/pinsrb instructions for that. Albeit it appears to
3001 * be fixed in llvm 4.0. So, load and concatenate with 32bit
3002 * width to avoid the trouble (16bit seems not as bad, llvm
3003 * probably recognizes the load+shuffle as only one shuffle
3004 * is necessary, but we can do just the same anyway).
3005 */
3006 ls_type.length = dst_type.length * dst_type.width / 32;
3007 ls_type.width = 32;
3008 }
3009 }
3010
3011 if (is_1d) {
3012 load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3013 dst, ls_type, dst_count / 4, dst_alignment);
3014 for (unsigned i = dst_count / 4; i < dst_count; i++) {
3015 dst[i] = lp_build_undef(gallivm, ls_type);
3016 }
3017 } else {
3018 load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3019 block_height, dst, ls_type, dst_count,
3020 dst_alignment);
3021 }
3022
3023
3024 /*
3025 * Convert from dst/output format to src/blending format.
3026 *
3027 * This is necessary as we can only read 1 row from memory at a time,
3028 * so the minimum dst_count we will ever have at this point is 4.
3029 *
3030 * With, for example, the R8 format you can have all 16 pixels in a 128 bit
3031 * vector; this will take the 4 dsts and combine them into 1 src so we can
3032 * perform blending on all 16 pixels in that single vector at once.
3033 */
3034 if (dst_count > src_count) {
3035 if (ls_type.length != dst_type.length && ls_type.length == 1) {
3036 LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
3037 LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
3038 for (unsigned i = 0; i < dst_count; i++) {
3039 dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
3040 }
3041 }
3042
3043 lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
3044
3045 if (ls_type.length != dst_type.length) {
3046 struct lp_type tmp_type = dst_type;
3047 tmp_type.length = dst_type.length * 4 / src_count;
3048 for (unsigned i = 0; i < src_count; i++) {
3049 dst[i] = LLVMBuildBitCast(builder, dst[i],
3050 lp_build_vec_type(gallivm, tmp_type), "");
3051 }
3052 }
3053 }
3054
3055 /*
3056 * Blending
3057 */
3058 /* XXX this is broken for RGB8 formats -
3059 * they get expanded from 12 to 16 elements (to include alpha)
3060 * by convert_to_blend_type then reduced to 15 instead of 12
3061 * by convert_from_blend_type (a simple fix though breaks A8...).
3062 * R16G16B16 also crashes, though differently; something seems to go wrong
3063 * inside llvm when handling npot vector sizes.
3064 * It seems some cleanup could be done here (like skipping conversion/blend
3065 * when not needed).
3066 */
3067 convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
3068 row_type, dst, src_count);
3069
3070 /*
3071 * FIXME: Really should get logic ops / masks out of generic blend / row
3072 * format. Logic ops will definitely not work on the blend float format
3073 * used for SRGB here, and I think OpenGL expects this to work correctly
3074 * (that is, incoming values converted to srgb, then the logic op applied).
3075 */
3076 for (unsigned i = 0; i < src_count; ++i) {
3077 dst[i] = lp_build_blend_aos(gallivm,
3078 &variant->key.blend,
3079 out_format,
3080 row_type,
3081 rt,
3082 src[i],
3083 has_alpha ? NULL : src_alpha[i],
3084 src1[i],
3085 has_alpha ? NULL : src1_alpha[i],
3086 dst[i],
3087 partial_mask ? src_mask[i] : NULL,
3088 blend_color,
3089 has_alpha ? NULL : blend_alpha,
3090 swizzle,
3091 pad_inline ? 4 : dst_channels);
3092 }
3093
3094 convert_from_blend_type(gallivm, block_size, out_format_desc,
3095 row_type, dst_type, dst, src_count);
3096
3097 /* Split the blend rows back to memory rows */
3098 if (dst_count > src_count) {
3099 row_type.length = dst_type.length * (dst_count / src_count);
3100
3101 if (src_count == 1) {
3102 dst[1] = lp_build_extract_range(gallivm, dst[0],
3103 row_type.length / 2,
3104 row_type.length / 2);
3105 dst[0] = lp_build_extract_range(gallivm, dst[0],
3106 0, row_type.length / 2);
3107
3108 row_type.length /= 2;
3109 src_count *= 2;
3110 }
3111
3112 dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2,
3113 row_type.length / 2);
3114 dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
3115 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2,
3116 row_type.length / 2);
3117 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
3118
3119 row_type.length /= 2;
3120 src_count *= 2;
3121 }
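   /*
    * Illustrative example: starting from src_count == 1 and dst_count == 4,
    * the first split above halves the single blend row into dst[0]/dst[1] and
    * doubles src_count to 2; the second split halves each of those again,
    * leaving the 4 memory rows dst[0..3] expected by the store below.
    */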
3122
3123 /*
3124 * Store blend result to memory
3125 */
3126 if (is_1d) {
3127 store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3128 dst, dst_type, dst_count / 4, dst_alignment);
3129 } else {
3130 store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3131 block_height,
3132 dst, dst_type, dst_count, dst_alignment);
3133 }
3134
3135 if (do_branch) {
3136 lp_build_mask_end(&mask_ctx);
3137 }
3138
3139 if (fpstate) {
3140 lp_build_fpstate_set(gallivm, fpstate);
3141 }
3142 }
3143
3144
3145 /**
3146 * Generate the runtime callable function for the whole fragment pipeline.
3147 * Note that the function which we generate operates on a block of 16
3148 * pixels at a time. The block contains 2x2 quads. Each quad contains
3149 * 2x2 pixels.
3150 */
3151 static void
3152 generate_fragment(struct llvmpipe_context *lp,
3153 struct lp_fragment_shader *shader,
3154 struct lp_fragment_shader_variant *variant,
3155 unsigned partial_mask)
3156 {
3157 assert(partial_mask == RAST_WHOLE ||
3158 partial_mask == RAST_EDGE_TEST);
3159
3160 struct nir_shader *nir = shader->base.ir.nir;
3161 struct gallivm_state *gallivm = variant->gallivm;
3162 struct lp_fragment_shader_variant_key *key = &variant->key;
3163 struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
3164 LLVMTypeRef fs_elem_type;
3165 LLVMTypeRef blend_vec_type;
3166 LLVMTypeRef arg_types[16];
3167 LLVMTypeRef func_type;
3168 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
3169 LLVMTypeRef int32p_type = LLVMPointerType(int32_type, 0);
3170 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
3171 LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
3172 LLVMValueRef context_ptr;
3173 LLVMValueRef resources_ptr;
3174 LLVMValueRef x;
3175 LLVMValueRef y;
3176 LLVMValueRef a0_ptr;
3177 LLVMValueRef dadx_ptr;
3178 LLVMValueRef dady_ptr;
3179 LLVMValueRef color_ptr_ptr;
3180 LLVMValueRef stride_ptr;
3181 LLVMValueRef color_sample_stride_ptr;
3182 LLVMValueRef depth_ptr;
3183 LLVMValueRef depth_stride;
3184 LLVMValueRef depth_sample_stride;
3185 LLVMValueRef mask_input;
3186 LLVMValueRef thread_data_ptr;
3187 LLVMBasicBlockRef block;
3188 LLVMBuilderRef builder;
3189 struct lp_build_interp_soa_context interp;
3190 LLVMValueRef fs_mask[(16 / 4) * LP_MAX_SAMPLES];
3191 LLVMValueRef fs_out_color[LP_MAX_SAMPLES][PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
3192 LLVMValueRef function;
3193 LLVMValueRef facing;
3194 const bool dual_source_blend = key->blend.rt[0].blend_enable &&
3195 util_blend_state_is_dual(&key->blend, 0);
3196
3197 assert(lp_native_vector_width / 32 >= 4);
3198
3199 /* Adjust color input interpolation according to flatshade state:
3200 */
3201 nir_foreach_shader_in_variable(var, nir) {
3202 unsigned idx = var->data.driver_location;
3203 unsigned slots = nir_variable_count_slots(var, var->type);
3204 memcpy(&inputs[idx], &shader->inputs[idx], (sizeof inputs[0] * slots));
3205 for (unsigned s = 0; s < slots; s++) {
3206 if (inputs[idx + s].interp == LP_INTERP_COLOR)
3207 inputs[idx + s].interp = key->flatshade ? LP_INTERP_CONSTANT : LP_INTERP_PERSPECTIVE;
3208 }
3209 }
3210
3211 /* TODO: actually pick these based on the fs and color buffer
3212 * characteristics. */
3213
3214 struct lp_type fs_type;
3215 memset(&fs_type, 0, sizeof fs_type);
3216 fs_type.floating = true; /* floating point values */
3217 fs_type.sign = true; /* values are signed */
3218 fs_type.norm = false; /* values are not limited to [0,1] or [-1,1] */
3219 fs_type.width = 32; /* 32-bit float */
3220 fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
3221
3222 struct lp_type blend_type;
3223 memset(&blend_type, 0, sizeof blend_type);
3224 blend_type.floating = false; /* values are integers */
3225 blend_type.sign = false; /* values are unsigned */
3226 blend_type.norm = true; /* values are in [0,1] or [-1,1] */
3227 blend_type.width = 8; /* 8-bit ubyte values */
3228 blend_type.length = 16; /* 16 elements per vector */
3229
3230 /*
3231 * Generate the function prototype. Any change here must be reflected in
3232 * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
3233 */
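   /*
    * A rough sketch (an assumption here; lp_jit.h has the authoritative
    * definition) of the lp_jit_frag_func signature that the arg_types below
    * must stay in sync with:
    *
    *    typedef void (*lp_jit_frag_func)(context, resources, x, y, facing,
    *                                     a0, dadx, dady, color, depth,
    *                                     mask_input, thread_data, stride,
    *                                     depth_stride, color_sample_stride,
    *                                     depth_sample_stride);
    */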
3234
3235 fs_elem_type = lp_build_elem_type(gallivm, fs_type);
3236
3237 blend_vec_type = lp_build_vec_type(gallivm, blend_type);
3238
3239 char func_name[64];
3240 snprintf(func_name, sizeof(func_name), "fs_variant_%s",
3241 partial_mask ? "partial" : "whole");
3242
3243 arg_types[0] = variant->jit_context_ptr_type; /* context */
3244 arg_types[1] = variant->jit_resources_ptr_type; /* resources */
3245 arg_types[2] = int32_type; /* x */
3246 arg_types[3] = int32_type; /* y */
3247 arg_types[4] = int32_type; /* facing */
3248 arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* a0 */
3249 arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dadx */
3250 arg_types[7] = LLVMPointerType(fs_elem_type, 0); /* dady */
3251 arg_types[8] = LLVMPointerType(int8p_type, 0); /* color */
3252 arg_types[9] = int8p_type; /* depth */
3253 arg_types[10] = LLVMInt64TypeInContext(gallivm->context); /* mask_input */
3254 arg_types[11] = variant->jit_thread_data_ptr_type; /* per thread data */
3255 arg_types[12] = int32p_type; /* stride */
3256 arg_types[13] = int32_type; /* depth_stride */
3257 arg_types[14] = int32p_type; /* color sample strides */
3258 arg_types[15] = int32_type; /* depth sample stride */
3259
3260 func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
3261 arg_types, ARRAY_SIZE(arg_types), 0);
3262
3263 function = LLVMAddFunction(gallivm->module, func_name, func_type);
3264 LLVMSetFunctionCallConv(function, LLVMCCallConv);
3265
3266 variant->function[partial_mask] = function;
3267 variant->function_name[partial_mask] = MALLOC(strlen(func_name)+1);
3268 strcpy(variant->function_name[partial_mask], func_name);
3269
3270 /* XXX: need to propagate noalias down into color param now we are
3271 * passing a pointer-to-pointer?
3272 */
3273 for (unsigned i = 0; i < ARRAY_SIZE(arg_types); ++i)
3274 if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
3275 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3276
3277 if (variant->gallivm->cache->data_size) {
3278 gallivm_stub_func(gallivm, function);
3279 return;
3280 }
3281
3282 context_ptr = LLVMGetParam(function, 0);
3283 resources_ptr = LLVMGetParam(function, 1);
3284 x = LLVMGetParam(function, 2);
3285 y = LLVMGetParam(function, 3);
3286 facing = LLVMGetParam(function, 4);
3287 a0_ptr = LLVMGetParam(function, 5);
3288 dadx_ptr = LLVMGetParam(function, 6);
3289 dady_ptr = LLVMGetParam(function, 7);
3290 color_ptr_ptr = LLVMGetParam(function, 8);
3291 depth_ptr = LLVMGetParam(function, 9);
3292 mask_input = LLVMGetParam(function, 10);
3293 thread_data_ptr = LLVMGetParam(function, 11);
3294 stride_ptr = LLVMGetParam(function, 12);
3295 depth_stride = LLVMGetParam(function, 13);
3296 color_sample_stride_ptr = LLVMGetParam(function, 14);
3297 depth_sample_stride = LLVMGetParam(function, 15);
3298
3299 lp_build_name(context_ptr, "context");
3300 lp_build_name(resources_ptr, "resources");
3301 lp_build_name(x, "x");
3302 lp_build_name(y, "y");
3303 lp_build_name(a0_ptr, "a0");
3304 lp_build_name(dadx_ptr, "dadx");
3305 lp_build_name(dady_ptr, "dady");
3306 lp_build_name(color_ptr_ptr, "color_ptr_ptr");
3307 lp_build_name(depth_ptr, "depth");
3308 lp_build_name(mask_input, "mask_input");
3309 lp_build_name(thread_data_ptr, "thread_data");
3310 lp_build_name(stride_ptr, "stride_ptr");
3311 lp_build_name(depth_stride, "depth_stride");
3312 lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
3313 lp_build_name(depth_sample_stride, "depth_sample_stride");
3314
3315 /*
3316 * Function body
3317 */
3318
3319 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3320 builder = gallivm->builder;
3321 assert(builder);
3322 LLVMPositionBuilderAtEnd(builder, block);
3323
3324 /* code generated texture sampling */
3325 struct lp_build_sampler_soa *sampler =
3326 lp_llvm_sampler_soa_create(lp_fs_variant_key_samplers(key),
3327 MAX2(key->nr_samplers,
3328 key->nr_sampler_views));
3329 struct lp_build_image_soa *image =
3330 lp_bld_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
3331
3332 unsigned num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
3333 /* for 1d resources only run "upper half" of stamp */
3334 if (key->resource_1d)
3335 num_fs /= 2;
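   /*
    * For example (illustrative): with 8-wide SIMD (fs_type.length == 8, e.g.
    * AVX) num_fs == 16 / 8 == 2 loop iterations per 4x4 stamp, halved to 1
    * for 1d resources since only the upper half of the stamp is run.
    */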
3336
3337 {
3338 LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
3339 LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
3340 LLVMValueRef num_loop_samp =
3341 lp_build_const_int32(gallivm, num_fs * key->coverage_samples);
3342 LLVMValueRef mask_store =
3343 lp_build_array_alloca(gallivm, mask_type,
3344 num_loop_samp, "mask_store");
3345 LLVMTypeRef flt_type = LLVMFloatTypeInContext(gallivm->context);
3346 LLVMValueRef glob_sample_pos =
3347 LLVMAddGlobal(gallivm->module,
3348 LLVMArrayType(flt_type, key->coverage_samples * 2), "");
3349 LLVMSetLinkage(glob_sample_pos, LLVMInternalLinkage);
3350 LLVMValueRef sample_pos_array;
3351
3352 if (key->multisample && key->coverage_samples == 4) {
3353 LLVMValueRef sample_pos_arr[8];
3354 for (unsigned i = 0; i < 4; i++) {
3355 sample_pos_arr[i * 2] = LLVMConstReal(flt_type,
3356 lp_sample_pos_4x[i][0]);
3357 sample_pos_arr[i * 2 + 1] = LLVMConstReal(flt_type,
3358 lp_sample_pos_4x[i][1]);
3359 }
3360 sample_pos_array =
3361 LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3362 sample_pos_arr, 8);
3363 } else {
3364 LLVMValueRef sample_pos_arr[2];
3365 sample_pos_arr[0] = LLVMConstReal(flt_type, 0.5);
3366 sample_pos_arr[1] = LLVMConstReal(flt_type, 0.5);
3367 sample_pos_array =
3368 LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3369 sample_pos_arr, 2);
3370 }
3371 LLVMSetInitializer(glob_sample_pos, sample_pos_array);
3372
3373 LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
3374 bool pixel_center_integer = nir->info.fs.pixel_center_integer;
3375
3376 /*
3377 * The shader input interpolation info is not explicitly baked into the
3378 * shader key, but everything it is derived from (TGSI, and flatshade) is
3379 * already included in the shader key.
3380 */
3381 lp_build_interp_soa_init(&interp,
3382 gallivm,
3383 nir->num_inputs,
3384 inputs,
3385 pixel_center_integer,
3386 key->coverage_samples,
3387 LLVMTypeOf(sample_pos_array),
3388 glob_sample_pos,
3389 num_loop,
3390 builder, fs_type,
3391 a0_ptr, dadx_ptr, dady_ptr,
3392 x, y);
3393
3394 for (unsigned i = 0; i < num_fs; i++) {
3395 if (key->multisample) {
3396 LLVMValueRef smask_val =
3397 LLVMBuildLoad2(builder, int32_type,
3398 lp_jit_context_sample_mask(gallivm, variant->jit_context_type, context_ptr),
3399 "");
3400
3401 /*
3402 * For multisampling, extract the per-sample mask from the
3403 * incoming 64-bit mask, and store it to the per-sample mask
3404 * storage. OR all of them together to generate the fragment
3405 * shader mask (sample shading TODO). Take the incoming state coverage
3406 * mask into account.
3407 */
3408 for (unsigned s = 0; s < key->coverage_samples; s++) {
3409 LLVMValueRef sindexi =
3410 lp_build_const_int32(gallivm, i + (s * num_fs));
3411 LLVMValueRef sample_mask_ptr =
3412 LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1,
3413 "sample_mask_ptr");
3414 LLVMValueRef s_mask =
3415 generate_quad_mask(gallivm, fs_type,
3416 i * fs_type.length / 4, s, mask_input);
3417 LLVMValueRef smask_bit =
3418 LLVMBuildAnd(builder, smask_val,
3419 lp_build_const_int32(gallivm, (1 << s)), "");
3420 LLVMValueRef cmp =
3421 LLVMBuildICmp(builder, LLVMIntNE, smask_bit,
3422 lp_build_const_int32(gallivm, 0), "");
3423 smask_bit = LLVMBuildSExt(builder, cmp, int32_type, "");
3424 smask_bit = lp_build_broadcast(gallivm, mask_type, smask_bit);
3425
3426 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
3427 LLVMBuildStore(builder, s_mask, sample_mask_ptr);
3428 }
3429 } else {
3430 LLVMValueRef mask;
3431 LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
3432 LLVMValueRef mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
3433 &indexi, 1, "mask_ptr");
3434
3435 if (partial_mask) {
3436 mask = generate_quad_mask(gallivm, fs_type,
3437 i * fs_type.length / 4, 0, mask_input);
3438 } else {
3439 mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
3440 }
3441 LLVMBuildStore(builder, mask, mask_ptr);
3442 }
3443 }
3444
3445 generate_fs_loop(gallivm,
3446 shader, key,
3447 builder,
3448 fs_type,
3449 variant->jit_context_type,
3450 context_ptr,
3451 variant->jit_resources_type,
3452 resources_ptr,
3453 LLVMTypeOf(sample_pos_array),
3454 glob_sample_pos,
3455 num_loop,
3456 &interp,
3457 sampler,
3458 image,
3459 mask_type,
3460 mask_store, /* output */
3461 color_store,
3462 depth_ptr,
3463 depth_stride,
3464 depth_sample_stride,
3465 color_ptr_ptr,
3466 stride_ptr,
3467 color_sample_stride_ptr,
3468 facing,
3469 variant->jit_thread_data_type,
3470 thread_data_ptr);
3471
3472 LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
3473 for (unsigned i = 0; i < num_fs; i++) {
3474 LLVMValueRef ptr;
3475 for (unsigned s = 0; s < key->coverage_samples; s++) {
3476 int idx = (i + (s * num_fs));
3477 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3478 ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1, "");
3479
3480 fs_mask[idx] = LLVMBuildLoad2(builder, mask_type, ptr, "smask");
3481 }
3482
3483 for (unsigned s = 0; s < key->min_samples; s++) {
3484 /* This layout is awkward; we need to reorganize things */
3485 int idx = s * num_fs + i;
3486 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3487 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3488 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3489 ptr = LLVMBuildGEP2(builder, fs_vec_type,
3490 color_store[cbuf][chan],
3491 &sindexi, 1, "");
3492 fs_out_color[s][cbuf][chan][i] = ptr;
3493 }
3494 }
3495 if (dual_source_blend) {
3496 /* we only support one dual source blend target, hence always
3497 * use output 1
3498 */
3499 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3500 ptr = LLVMBuildGEP2(builder, fs_vec_type,
3501 color_store[1][chan],
3502 &sindexi, 1, "");
3503 fs_out_color[s][1][chan][i] = ptr;
3504 }
3505 }
3506 }
3507 }
3508 }
3509
3510 lp_bld_llvm_sampler_soa_destroy(sampler);
3511 lp_bld_llvm_image_soa_destroy(image);
3512
3513 /* Loop over color outputs / color buffers to do blending */
3514 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3515 if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE &&
3516 (key->blend.rt[cbuf].blend_enable || key->blend.logicop_enable ||
3517 find_output_by_frag_result(nir, FRAG_RESULT_DATA0 + cbuf) != -1)) {
3518 LLVMValueRef color_ptr;
3519 LLVMValueRef stride;
3520 LLVMValueRef sample_stride = NULL;
3521 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
3522
3523 bool do_branch = ((key->depth.enabled
3524 || key->stencil[0].enabled
3525 || key->alpha.enabled)
3526 && !nir->info.fs.uses_discard);
3527
3528 color_ptr = LLVMBuildLoad2(builder, int8p_type,
3529 LLVMBuildGEP2(builder, int8p_type, color_ptr_ptr,
3530 &index, 1, ""),
3531 "");
3532
3533 stride = LLVMBuildLoad2(builder, int32_type,
3534 LLVMBuildGEP2(builder, int32_type, stride_ptr,
3535 &index, 1, ""),
3536 "");
3537
3538 if (key->cbuf_nr_samples[cbuf] > 1)
3539 sample_stride = LLVMBuildLoad2(builder, int32_type,
3540 LLVMBuildGEP2(builder,
3541 int32_type,
3542 color_sample_stride_ptr,
3543 &index, 1, ""), "");
3544
3545 for (unsigned s = 0; s < key->cbuf_nr_samples[cbuf]; s++) {
3546 unsigned mask_idx = num_fs * (key->multisample ? s : 0);
3547 unsigned out_idx = key->min_samples == 1 ? 0 : s;
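            /*
             * Descriptive note: without multisampling every sample reuses the
             * single mask at index 0, and with per-pixel (rather than
             * per-sample) shading every sample blends the same color outputs
             * at index 0.
             */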
3548 LLVMValueRef out_ptr = color_ptr;
3549
3550 if (sample_stride) {
3551 LLVMValueRef sample_offset =
3552 LLVMBuildMul(builder, sample_stride,
3553 lp_build_const_int32(gallivm, s), "");
3554 out_ptr = LLVMBuildGEP2(builder, int8_type, out_ptr, &sample_offset, 1, "");
3555 }
3556 out_ptr = LLVMBuildBitCast(builder, out_ptr,
3557 LLVMPointerType(blend_vec_type, 0), "");
3558
3559 lp_build_name(out_ptr, "color_ptr%d", cbuf);
3560
3561 generate_unswizzled_blend(gallivm, cbuf, variant,
3562 key->cbuf_format[cbuf],
3563 num_fs, fs_type, &fs_mask[mask_idx],
3564 fs_out_color[out_idx],
3565 variant->jit_context_type,
3566 context_ptr, blend_vec_type, out_ptr, stride,
3567 partial_mask, do_branch);
3568 }
3569 }
3570 }
3571
3572 LLVMBuildRetVoid(builder);
3573
3574 gallivm_verify_function(gallivm, function);
3575 }
3576
3577
3578 static void
3579 dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
3580 {
3581 debug_printf("fs variant %p:\n", (void *) key);
3582
3583 if (key->flatshade) {
3584 debug_printf("flatshade = 1\n");
3585 }
3586 if (key->depth_clamp)
3587 debug_printf("depth_clamp = 1\n");
3588
3589 if (key->restrict_depth_values)
3590 debug_printf("restrict_depth_values = 1\n");
3591
3592 if (key->multisample) {
3593 debug_printf("multisample = 1\n");
3594 debug_printf("coverage samples = %d\n", key->coverage_samples);
3595 debug_printf("min samples = %d\n", key->min_samples);
3596 }
3597 for (unsigned i = 0; i < key->nr_cbufs; ++i) {
3598 debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
3599 debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
3600 }
3601 if (key->depth.enabled || key->stencil[0].enabled) {
3602 debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
3603 debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
3604 }
3605 if (key->depth.enabled) {
3606 debug_printf("depth.func = %s\n", util_str_func(key->depth.func, true));
3607 debug_printf("depth.writemask = %u\n", key->depth.writemask);
3608 }
3609
3610 for (unsigned i = 0; i < 2; ++i) {
3611 if (key->stencil[i].enabled) {
3612 debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, true));
3613 debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, true));
3614 debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, true));
3615 debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, true));
3616 debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
3617 debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
3618 }
3619 }
3620
3621 if (key->alpha.enabled) {
3622 debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, true));
3623 }
3624
3625 if (key->occlusion_count) {
3626 debug_printf("occlusion_count = 1\n");
3627 }
3628
3629 if (key->blend.logicop_enable) {
3630 debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, true));
3631 } else if (key->blend.rt[0].blend_enable) {
3632 debug_printf("blend.rgb_func = %s\n", util_str_blend_func (key->blend.rt[0].rgb_func, true));
3633 debug_printf("blend.rgb_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_src_factor, true));
3634 debug_printf("blend.rgb_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, true));
3635 debug_printf("blend.alpha_func = %s\n", util_str_blend_func (key->blend.rt[0].alpha_func, true));
3636 debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, true));
3637 debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, true));
3638 }
3639 debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
3640 if (key->blend.alpha_to_coverage) {
3641 debug_printf("blend.alpha_to_coverage is enabled\n");
3642 }
3643 for (unsigned i = 0; i < key->nr_samplers; ++i) {
3644 const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3645 const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state;
3646 debug_printf("sampler[%u] = \n", i);
3647 debug_printf(" .wrap = %s %s %s\n",
3648 util_str_tex_wrap(sampler->wrap_s, true),
3649 util_str_tex_wrap(sampler->wrap_t, true),
3650 util_str_tex_wrap(sampler->wrap_r, true));
3651 debug_printf(" .min_img_filter = %s\n",
3652 util_str_tex_filter(sampler->min_img_filter, true));
3653 debug_printf(" .min_mip_filter = %s\n",
3654 util_str_tex_mipfilter(sampler->min_mip_filter, true));
3655 debug_printf(" .mag_img_filter = %s\n",
3656 util_str_tex_filter(sampler->mag_img_filter, true));
3657 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
3658 debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, true));
3659 debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords);
3660 debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
3661 debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
3662 debug_printf(" .apply_min_lod = %u\n", sampler->apply_min_lod);
3663 debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod);
3664 debug_printf(" .reduction_mode = %u\n", sampler->reduction_mode);
3665 debug_printf(" .aniso = %u\n", sampler->aniso);
3666 }
3667 for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
3668 const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3669 const struct lp_static_texture_state *texture = &samplers[i].texture_state;
3670 debug_printf("texture[%u] = \n", i);
3671 debug_printf(" .format = %s\n",
3672 util_format_name(texture->format));
3673 debug_printf(" .target = %s\n",
3674 util_str_tex_target(texture->target, true));
3675 debug_printf(" .level_zero_only = %u\n",
3676 texture->level_zero_only);
3677 debug_printf(" .pot = %u %u %u\n",
3678 texture->pot_width,
3679 texture->pot_height,
3680 texture->pot_depth);
3681 }
3682 struct lp_image_static_state *images = lp_fs_variant_key_images(key);
3683 for (unsigned i = 0; i < key->nr_images; ++i) {
3684 const struct lp_static_texture_state *image = &images[i].image_state;
3685 debug_printf("image[%u] = \n", i);
3686 debug_printf(" .format = %s\n",
3687 util_format_name(image->format));
3688 debug_printf(" .target = %s\n",
3689 util_str_tex_target(image->target, true));
3690 debug_printf(" .level_zero_only = %u\n",
3691 image->level_zero_only);
3692 debug_printf(" .pot = %u %u %u\n",
3693 image->pot_width,
3694 image->pot_height,
3695 image->pot_depth);
3696 }
3697 }
3698
3699
3700 const char *
3701 lp_debug_fs_kind(enum lp_fs_kind kind)
3702 {
3703 switch (kind) {
3704 case LP_FS_KIND_GENERAL:
3705 return "GENERAL";
3706 case LP_FS_KIND_BLIT_RGBA:
3707 return "BLIT_RGBA";
3708 case LP_FS_KIND_BLIT_RGB1:
3709 return "BLIT_RGB1";
3710 case LP_FS_KIND_AERO_MINIFICATION:
3711 return "AERO_MINIFICATION";
3712 case LP_FS_KIND_LLVM_LINEAR:
3713 return "LLVM_LINEAR";
3714 default:
3715 return "unknown";
3716 }
3717 }
3718
3719
3720 void
3721 lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
3722 {
3723 debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
3724 variant->shader->no, variant->no);
3725 nir_print_shader(variant->shader->base.ir.nir, stderr);
3726 dump_fs_variant_key(&variant->key);
3727 debug_printf("variant->opaque = %u\n", variant->opaque);
3728 debug_printf("variant->potentially_opaque = %u\n", variant->potentially_opaque);
3729 debug_printf("variant->blit = %u\n", variant->blit);
3730 debug_printf("shader->kind = %s\n", lp_debug_fs_kind(variant->shader->kind));
3731 debug_printf("\n");
3732 }
3733
3734
3735 static void
3736 lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
3737 unsigned char ir_sha1_cache_key[20])
3738 {
3739 struct blob blob = { 0 };
3740 unsigned ir_size;
3741 void *ir_binary;
3742
3743 blob_init(&blob);
3744 nir_serialize(&blob, variant->shader->base.ir.nir, true);
3745 ir_binary = blob.data;
3746 ir_size = blob.size;
3747
3748 struct mesa_sha1 ctx;
3749 _mesa_sha1_init(&ctx);
3750 _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size);
3751 _mesa_sha1_update(&ctx, ir_binary, ir_size);
3752 _mesa_sha1_final(&ctx, ir_sha1_cache_key);
3753
3754 blob_finish(&blob);
3755 }
3756
3757
3758 /**
3759 * Generate a new fragment shader variant from the shader code and
3760 * other state indicated by the key.
3761 */
3762 static struct lp_fragment_shader_variant *
3763 generate_variant(struct llvmpipe_context *lp,
3764 struct lp_fragment_shader *shader,
3765 const struct lp_fragment_shader_variant_key *key)
3766 {
3767 struct nir_shader *nir = shader->base.ir.nir;
3768 struct lp_fragment_shader_variant *variant =
3769 MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
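   /*
    * Descriptive note: the allocation is sized for the variable-length key
    * (the sampler/image state appended after the fixed struct), which is why
    * shader->variant_key_size replaces sizeof variant->key here.
    */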
3770 if (!variant)
3771 return NULL;
3772
3773 memset(variant, 0, sizeof(*variant));
3774
3775 pipe_reference_init(&variant->reference, 1);
3776 lp_fs_reference(lp, &variant->shader, shader);
3777
3778 memcpy(&variant->key, key, shader->variant_key_size);
3779
3780 struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
3781 struct lp_cached_code cached = { 0 };
3782 unsigned char ir_sha1_cache_key[20];
3783 bool needs_caching = false;
3784 if (shader->base.ir.nir) {
3785 lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
3786
3787 lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
3788 if (!cached.data_size)
3789 needs_caching = true;
3790 }
3791
3792 char module_name[64];
3793 snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
3794 shader->no, shader->variants_created);
3795 variant->gallivm = gallivm_create(module_name, &lp->context, &cached);
3796 if (!variant->gallivm) {
3797 FREE(variant);
3798 return NULL;
3799 }
3800
3801 variant->list_item_global.base = variant;
3802 variant->list_item_local.base = variant;
3803 variant->no = shader->variants_created++;
3804
3805 /*
3806 * Determine whether we are touching all channels in the color buffer.
3807 */
3808 const struct util_format_description *cbuf0_format_desc = NULL;
3809 bool fullcolormask = false;
3810 if (key->nr_cbufs == 1) {
3811 cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
3812 fullcolormask = util_format_colormask_full(cbuf0_format_desc,
3813 key->blend.rt[0].colormask);
3814 }
3815
3816 /* The scissor is ignored here as only tiles inside the scissoring
3817 * rectangle will refer to this.
3818 */
3819 const bool no_kill =
3820 fullcolormask &&
3821 !key->stencil[0].enabled &&
3822 !key->alpha.enabled &&
3823 !key->multisample &&
3824 !key->blend.alpha_to_coverage &&
3825 !key->depth.enabled &&
3826 !nir->info.fs.uses_discard &&
3827 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) &&
3828 !nir->info.fs.uses_fbfetch_output;
3829
3830 variant->opaque =
3831 no_kill &&
3832 !key->blend.logicop_enable &&
3833 !key->blend.rt[0].blend_enable
3834 ? true : false;
3835
3836 variant->potentially_opaque =
3837 no_kill &&
3838 !key->blend.logicop_enable &&
3839 key->blend.rt[0].blend_enable &&
3840 key->blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
3841 key->blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
3842 key->blend.rt[0].alpha_func == key->blend.rt[0].rgb_func &&
3843 key->blend.rt[0].alpha_dst_factor == key->blend.rt[0].rgb_dst_factor &&
3844 shader->base.type == PIPE_SHADER_IR_TGSI &&
3845 /*
3846 * FIXME: for NIR, all of the fields of info.xxx (except info.base)
3847 * are zeros, hence shader analysis (here and elsewhere) using these
3848 * bits cannot work and will silently fail (cbuf is the only pointer
3849 * field, hence causing a crash).
3850 */
3851 shader->info.cbuf[0][3].file != TGSI_FILE_NULL
3852 ? true : false;
3853
3854 /* We only care about opaque blits for now */
3855 if (variant->opaque &&
3856 (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3857 shader->kind == LP_FS_KIND_BLIT_RGB1)) {
3858 const struct lp_sampler_static_state *samp0 =
3859 lp_fs_variant_key_sampler_idx(key, 0);
3860 assert(samp0);
3861
3862 const enum pipe_format texture_format = samp0->texture_state.format;
3863 const enum pipe_texture_target target = samp0->texture_state.target;
3864 const unsigned min_img_filter = samp0->sampler_state.min_img_filter;
3865 const unsigned mag_img_filter = samp0->sampler_state.mag_img_filter;
3866
3867 unsigned min_mip_filter;
3868 if (samp0->texture_state.level_zero_only) {
3869 min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3870 } else {
3871 min_mip_filter = samp0->sampler_state.min_mip_filter;
3872 }
3873
3874 if (target == PIPE_TEXTURE_2D &&
3875 min_img_filter == PIPE_TEX_FILTER_NEAREST &&
3876 mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
3877 min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
3878 ((texture_format &&
3879 util_is_format_compatible(util_format_description(texture_format),
3880 cbuf0_format_desc)) ||
3881 (shader->kind == LP_FS_KIND_BLIT_RGB1 &&
3882 (texture_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
3883 texture_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
3884 (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3885 key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM)))) {
3886 variant->blit = 1;
3887 }
3888 }
3889
3890 /* Determine whether this shader + pipeline state is a candidate for
3891 * the linear path.
3892 */
3893 const bool linear_pipeline =
3894 !key->stencil[0].enabled &&
3895 !key->depth.enabled &&
3896 !nir->info.fs.uses_discard &&
3897 !key->blend.logicop_enable &&
3898 (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3899 key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM ||
3900 key->cbuf_format[0] == PIPE_FORMAT_R8G8B8A8_UNORM ||
3901 key->cbuf_format[0] == PIPE_FORMAT_R8G8B8X8_UNORM);
3902
3903 memcpy(&variant->key, key, sizeof *key);
3904
3905 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
3906 lp_debug_fs_variant(variant);
3907 }
3908
3909 llvmpipe_fs_variant_fastpath(variant);
3910
3911 lp_jit_init_types(variant);
3912
3913 if (variant->jit_function[RAST_EDGE_TEST] == NULL)
3914 generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
3915
3916 if (variant->jit_function[RAST_WHOLE] == NULL) {
3917 if (variant->opaque) {
3918 /* Specialized shader, which doesn't need to read the color buffer. */
3919 generate_fragment(lp, shader, variant, RAST_WHOLE);
3920 }
3921 }
3922
3923 if (linear_pipeline) {
3924 /* Currently keeping both the old fastpaths and new linear path
3925 * active. The older code is still somewhat faster for the cases
3926 * it covers.
3927 *
3928 * XXX: consider restricting this to aero-mode only.
3929 */
3930 if (fullcolormask &&
3931 !key->alpha.enabled &&
3932 !key->blend.alpha_to_coverage) {
3933 llvmpipe_fs_variant_linear_fastpath(variant);
3934 }
3935
3936 /* If the original fastpath doesn't cover this variant, try the new
3937 * code:
3938 */
3939 if (variant->jit_linear == NULL) {
3940 if (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3941 shader->kind == LP_FS_KIND_BLIT_RGB1 ||
3942 shader->kind == LP_FS_KIND_LLVM_LINEAR) {
3943 llvmpipe_fs_variant_linear_llvm(lp, shader, variant);
3944 }
3945 }
3946 } else {
3947 if (LP_DEBUG & DEBUG_LINEAR) {
3948 lp_debug_fs_variant(variant);
3949 debug_printf(" ----> no linear path for this variant\n");
3950 }
3951 }
3952
3953 /*
3954 * Compile everything
3955 */
3956
3957 #if GALLIVM_USE_ORCJIT
3958 /* module has been moved into ORCJIT after gallivm_compile_module */
3959 variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3960
3961 gallivm_compile_module(variant->gallivm);
3962 #else
3963 gallivm_compile_module(variant->gallivm);
3964
3965 variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3966 #endif
3967
3968 if (variant->function[RAST_EDGE_TEST]) {
3969 variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
3970 gallivm_jit_function(variant->gallivm,
3971 variant->function[RAST_EDGE_TEST],
3972 variant->function_name[RAST_EDGE_TEST]);
3973 }
3974
3975 if (variant->function[RAST_WHOLE]) {
3976 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3977 gallivm_jit_function(variant->gallivm,
3978 variant->function[RAST_WHOLE],
3979 variant->function_name[RAST_WHOLE]);
3980 } else if (!variant->jit_function[RAST_WHOLE]) {
3981 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3982 variant->jit_function[RAST_EDGE_TEST];
3983 }
3984
3985 if (linear_pipeline) {
3986 if (variant->linear_function) {
3987 variant->jit_linear_llvm = (lp_jit_linear_llvm_func)
3988 gallivm_jit_function(variant->gallivm, variant->linear_function,
3989 variant->linear_function_name);
3990 }
3991
3992 /*
3993 * This must be done after LLVM compilation, as it will call the JIT'ed
3994 * code to determine active inputs.
3995 */
3996 lp_linear_check_variant(variant);
3997 }
3998
3999 if (needs_caching) {
4000 lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
4001 }
4002
4003 gallivm_free_ir(variant->gallivm);
4004
4005 return variant;
4006 }
4007
4008
4009 static void *
4010 llvmpipe_create_fs_state(struct pipe_context *pipe,
4011 const struct pipe_shader_state *templ)
4012 {
4013 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4014
4015 struct lp_fragment_shader *shader = CALLOC_STRUCT(lp_fragment_shader);
4016 if (!shader)
4017 return NULL;
4018
4019 pipe_reference_init(&shader->reference, 1);
4020 shader->no = fs_no++;
4021 list_inithead(&shader->variants.list);
4022
4023 shader->base.type = PIPE_SHADER_IR_NIR;
4024
4025 if (templ->type == PIPE_SHADER_IR_TGSI) {
4026 shader->base.ir.nir = tgsi_to_nir(templ->tokens, pipe->screen, false);
4027 } else {
4028 shader->base.ir.nir = templ->ir.nir;
4029 }
4030
4031 /* lower FRAG_RESULT_COLOR -> DATA[0-7] to correctly handle unused attachments */
4032 nir_shader *nir = shader->base.ir.nir;
4033 NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
4034
4035 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
4036 nir_tgsi_scan_shader(nir, &shader->info.base, true);
4037 shader->info.num_texs = shader->info.base.opcode_count[TGSI_OPCODE_TEX];
4038
4039 llvmpipe_register_shader(pipe, &shader->base);
4040
4041 shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
4042 if (shader->draw_data == NULL) {
4043 FREE(shader);
4044 return NULL;
4045 }
4046
4047 const int nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4048 const int nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4049 const int nr_images = BITSET_LAST_BIT(nir->info.images_used);
4050
4051 shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers,
4052 nr_sampler_views),
4053 nr_images);
4054
4055 nir_foreach_shader_in_variable(var, nir) {
4056 unsigned idx = var->data.driver_location;
4057 unsigned slots = nir_variable_count_slots(var, var->type);
4058
4059 if (var->data.centroid)
4060 shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_CENTROID;
4061 if (var->data.sample)
4062 shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_SAMPLE;
4063
4064 enum glsl_base_type base_type =
4065 glsl_get_base_type(glsl_without_array(var->type));
4066 switch (var->data.interpolation) {
4067 case INTERP_MODE_NONE:
4068 if (glsl_base_type_is_integer(base_type) || var->data.per_primitive) {
4069 shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4070 break;
4071 }
4072 if (var->data.location == VARYING_SLOT_COL0 ||
4073 var->data.location == VARYING_SLOT_COL1) {
4074 shader->inputs[idx].interp = LP_INTERP_COLOR;
4075 break;
4076 }
4077 FALLTHROUGH;
4078 case INTERP_MODE_SMOOTH:
4079 shader->inputs[idx].interp = LP_INTERP_PERSPECTIVE;
4080 break;
4081 case INTERP_MODE_NOPERSPECTIVE:
4082 shader->inputs[idx].interp = LP_INTERP_LINEAR;
4083 break;
4084 case INTERP_MODE_FLAT:
4085 shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4086 break;
4087 }
4088
4089 /* XXX this is a completely pointless index map... */
4090 shader->inputs[idx].src_index = idx + 1;
4091 if (var->data.location == VARYING_SLOT_FACE)
4092 shader->inputs[idx].interp = LP_INTERP_FACING;
4093 else if (var->data.location == VARYING_SLOT_POS) {
4094 shader->inputs[idx].src_index = 0;
4095 shader->inputs[idx].interp = LP_INTERP_POSITION;
4096 }
4097
4098 shader->inputs[idx].usage_mask = shader->info.base.input_usage_mask[idx];
4099 for (unsigned s = 1; s < slots; s++) {
4100 shader->inputs[idx + s] = shader->inputs[idx];
4101 shader->inputs[idx + s].src_index = idx + s + 1;
4102 shader->inputs[idx + s].usage_mask = shader->info.base.input_usage_mask[idx + s];
4103 }
4104 }
4105
4106 llvmpipe_fs_analyse_nir(shader);
4107
4108 return shader;
4109 }
4110
4111
4112 static void
4113 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
4114 {
4115 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4116 struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
4117 if (llvmpipe->fs == lp_fs)
4118 return;
4119
4120 draw_bind_fragment_shader(llvmpipe->draw,
4121 (lp_fs ? lp_fs->draw_data : NULL));
4122
4123 lp_fs_reference(llvmpipe, &llvmpipe->fs, lp_fs);
4124
4125 /* invalidate the setup link, NEW_FS will make it update */
4126 lp_setup_set_fs_variant(llvmpipe->setup, NULL);
4127 llvmpipe->dirty |= LP_NEW_FS;
4128 }
4129
4130
4131 /**
4132 * Remove shader variant from two lists: the shader's variant list
4133 * and the context's variant list.
4134 */
4135 static void
4136 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
4137 struct lp_fragment_shader_variant *variant)
4138 {
4139 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
4140 debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
4141 "v total cached %u inst %u total inst %u\n",
4142 variant->shader->no, variant->no,
4143 variant->shader->variants_created,
4144 variant->shader->variants_cached,
4145 lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
4146 }
4147
4148 /* remove from shader's list */
4149 list_del(&variant->list_item_local.list);
4150 variant->shader->variants_cached--;
4151
4152 /* remove from context's list */
4153 list_del(&variant->list_item_global.list);
4154 lp->nr_fs_variants--;
4155 lp->nr_fs_instrs -= variant->nr_instrs;
4156 }
4157
4158
4159 void
4160 llvmpipe_destroy_shader_variant(struct llvmpipe_context *lp,
4161 struct lp_fragment_shader_variant *variant)
4162 {
4163 gallivm_destroy(variant->gallivm);
4164 lp_fs_reference(lp, &variant->shader, NULL);
4165 if (variant->function_name[RAST_EDGE_TEST])
4166 FREE(variant->function_name[RAST_EDGE_TEST]);
4167 if (variant->function_name[RAST_WHOLE])
4168 FREE(variant->function_name[RAST_WHOLE]);
4169 if (variant->linear_function_name)
4170 FREE(variant->linear_function_name);
4171 FREE(variant);
4172 }
4173
4174
4175 void
4176 llvmpipe_destroy_fs(struct llvmpipe_context *llvmpipe,
4177 struct lp_fragment_shader *shader)
4178 {
4179 /* Delete draw module's data */
4180 draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
4181
4182 ralloc_free(shader->base.ir.nir);
4183 assert(shader->variants_cached == 0);
4184 FREE(shader);
4185 }
4186
4187
4188 static void
4189 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
4190 {
4191 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4192 struct lp_fragment_shader *shader = fs;
4193 struct lp_fs_variant_list_item *li, *next;
4194
4195 /* Delete all the variants */
4196 LIST_FOR_EACH_ENTRY_SAFE(li, next, &shader->variants.list, list) {
4197 struct lp_fragment_shader_variant *variant;
4198 variant = li->base;
4199 llvmpipe_remove_shader_variant(llvmpipe, li->base);
4200 lp_fs_variant_reference(llvmpipe, &variant, NULL);
4201 }
4202
4203 lp_fs_reference(llvmpipe, &shader, NULL);
4204 }
4205
4206
4207 static void
4208 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
4209 enum pipe_shader_type shader, uint index,
4210 bool take_ownership,
4211 const struct pipe_constant_buffer *cb)
4212 {
4213 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4214 struct pipe_constant_buffer *constants = &llvmpipe->constants[shader][index];
4215
4216 assert(shader < PIPE_SHADER_MESH_TYPES);
4217 assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
4218
4219 /* note: reference counting */
4220 util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb,
4221 take_ownership);
4222
4223 /* user_buffer is only valid until the next set_constant_buffer (at most,
4224 * possibly until shader deletion), so we need to upload it now to make
4225 * sure it doesn't get updated/freed out from under us.
4226 */
4227 if (constants->user_buffer) {
4228 u_upload_data(llvmpipe->pipe.const_uploader, 0, constants->buffer_size,
4229 16, constants->user_buffer, &constants->buffer_offset,
4230 &constants->buffer);
4231 }
4232 if (constants->buffer) {
4233 if (!(constants->buffer->bind & PIPE_BIND_CONSTANT_BUFFER)) {
4234 debug_printf("Illegal set constant without bind flag\n");
4235 constants->buffer->bind |= PIPE_BIND_CONSTANT_BUFFER;
4236 }
4237 llvmpipe_flush_resource(pipe, constants->buffer, 0, true, true, false, "set_constant_buffer");
4238 }
4239
4240 switch (shader) {
4241 case PIPE_SHADER_VERTEX:
4242 case PIPE_SHADER_GEOMETRY:
4243 case PIPE_SHADER_TESS_CTRL:
4244 case PIPE_SHADER_TESS_EVAL: {
4245 const unsigned size = cb ? cb->buffer_size : 0;
4246
4247 const uint8_t *data = NULL;
4248 if (constants->buffer) {
4249 data = (uint8_t *) llvmpipe_resource_data(constants->buffer)
4250 + constants->buffer_offset;
4251 }
4252
4253 draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
4254 index, data, size);
4255 break;
4256 }
4257 case PIPE_SHADER_COMPUTE:
4258 llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
4259 break;
4260 case PIPE_SHADER_FRAGMENT:
4261 llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
4262 break;
4263 case PIPE_SHADER_TASK:
4264 llvmpipe->dirty |= LP_NEW_TASK_CONSTANTS;
4265 break;
4266 case PIPE_SHADER_MESH:
4267 llvmpipe->dirty |= LP_NEW_MESH_CONSTANTS;
4268 break;
4269 default:
4270 unreachable("Illegal shader type");
4271 break;
4272 }
4273 }
4274
4275
4276 static void
4277 llvmpipe_set_shader_buffers(struct pipe_context *pipe,
4278 enum pipe_shader_type shader, unsigned start_slot,
4279 unsigned count,
4280 const struct pipe_shader_buffer *buffers,
4281 unsigned writable_bitmask)
4282 {
4283 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4284
4285 unsigned i, idx;
4286 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4287 const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
4288
4289 util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
4290
4291 if (buffer && buffer->buffer) {
4292 bool read_only = !(writable_bitmask & (1 << idx));
4293 llvmpipe_flush_resource(pipe, buffer->buffer, 0, read_only, false,
4294 false, "buffer");
4295 }
4296
4297 switch (shader) {
4298 case PIPE_SHADER_VERTEX:
4299 case PIPE_SHADER_GEOMETRY:
4300 case PIPE_SHADER_TESS_CTRL:
4301 case PIPE_SHADER_TESS_EVAL: {
4302 const unsigned size = buffer ? buffer->buffer_size : 0;
4303 const uint8_t *data = NULL;
4304 if (buffer && buffer->buffer)
4305 data = (uint8_t *) llvmpipe_resource_data(buffer->buffer);
4306 if (data)
4307 data += buffer->buffer_offset;
4308 draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
4309 i, data, size);
4310 break;
4311 }
4312 case PIPE_SHADER_COMPUTE:
4313 llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
4314 break;
4315 case PIPE_SHADER_TASK:
4316 llvmpipe->dirty |= LP_NEW_TASK_SSBOS;
4317 break;
4318 case PIPE_SHADER_MESH:
4319 llvmpipe->dirty |= LP_NEW_MESH_SSBOS;
4320 break;
4321 case PIPE_SHADER_FRAGMENT:
4322 llvmpipe->fs_ssbo_write_mask &= ~(((1 << count) - 1) << start_slot);
4323 llvmpipe->fs_ssbo_write_mask |= writable_bitmask << start_slot;
4324 llvmpipe->dirty |= LP_NEW_FS_SSBOS;
4325 break;
4326 default:
4327 unreachable("Illegal shader type");
4328 break;
4329 }
4330 }
4331 }
4332
4333
4334 static void
4335 llvmpipe_set_shader_images(struct pipe_context *pipe,
4336 enum pipe_shader_type shader, unsigned start_slot,
4337 unsigned count, unsigned unbind_num_trailing_slots,
4338 const struct pipe_image_view *images)
4339 {
4340 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4341 unsigned i, idx;
4342
4343 draw_flush(llvmpipe->draw);
4344 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4345 const struct pipe_image_view *image = images ? &images[idx] : NULL;
4346
4347 util_copy_image_view(&llvmpipe->images[shader][i], image);
4348
4349 if (image && image->resource) {
4350 bool read_only = !(image->access & PIPE_IMAGE_ACCESS_WRITE);
4351 llvmpipe_flush_resource(pipe, image->resource, 0, read_only, false,
4352 false, "image");
4353 }
4354 }
4355
4356 llvmpipe->num_images[shader] = start_slot + count;
4357 switch (shader) {
4358 case PIPE_SHADER_VERTEX:
4359 case PIPE_SHADER_GEOMETRY:
4360 case PIPE_SHADER_TESS_CTRL:
4361 case PIPE_SHADER_TESS_EVAL:
4362 draw_set_images(llvmpipe->draw, shader, llvmpipe->images[shader],
4363 start_slot + count);
4364 break;
4365 case PIPE_SHADER_COMPUTE:
4366 llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
4367 break;
4368 case PIPE_SHADER_FRAGMENT:
4369 llvmpipe->dirty |= LP_NEW_FS_IMAGES;
4370 break;
4371 case PIPE_SHADER_TASK:
4372 llvmpipe->dirty |= LP_NEW_TASK_IMAGES;
4373 break;
4374 case PIPE_SHADER_MESH:
4375 llvmpipe->dirty |= LP_NEW_MESH_IMAGES;
4376 break;
4377 default:
4378 unreachable("Illegal shader type");
4379 break;
4380 }
4381
4382 if (unbind_num_trailing_slots) {
4383 llvmpipe_set_shader_images(pipe, shader, start_slot + count,
4384 unbind_num_trailing_slots, 0, NULL);
4385 }
4386 }
4387
4388
4389 /**
4390 * Return the blend factor equivalent to a destination alpha of one.
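 *
 * For example, PIPE_BLENDFACTOR_DST_ALPHA maps to PIPE_BLENDFACTOR_ONE and
 * PIPE_BLENDFACTOR_INV_DST_ALPHA to PIPE_BLENDFACTOR_ZERO, since a render
 * target without stored alpha behaves as if its alpha were 1.0.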
4391 */
4392 static inline enum pipe_blendfactor
4393 force_dst_alpha_one(enum pipe_blendfactor factor, bool clamped_zero)
4394 {
4395 switch (factor) {
4396 case PIPE_BLENDFACTOR_DST_ALPHA:
4397 return PIPE_BLENDFACTOR_ONE;
4398 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
4399 return PIPE_BLENDFACTOR_ZERO;
4400 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
4401 if (clamped_zero)
4402 return PIPE_BLENDFACTOR_ZERO;
4403 else
4404 return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
4405 default:
4406 return factor;
4407 }
4408 }
4409
4410
4411 /**
4412 * We need to generate several variants of the fragment pipeline to match
4413 * all the combinations of the contributing state atoms.
4414 *
4415 * TODO: there is actually no reason to tie this to context state -- the
4416 * generated code could be cached globally in the screen.
4417 */
4418 static struct lp_fragment_shader_variant_key *
4419 make_variant_key(struct llvmpipe_context *lp,
4420 struct lp_fragment_shader *shader,
4421 char *store)
4422 {
4423 struct lp_fragment_shader_variant_key *key =
4424 (struct lp_fragment_shader_variant_key *)store;
4425 struct nir_shader *nir = shader->base.ir.nir;
4426
4427 memset(key, 0, sizeof(*key));
4428
4429 if (lp->framebuffer.zsbuf) {
4430 const enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
4431 const struct util_format_description *zsbuf_desc =
4432 util_format_description(zsbuf_format);
4433
4434 if (lp->depth_stencil->depth_enabled &&
4435 util_format_has_depth(zsbuf_desc)) {
4436 key->zsbuf_format = zsbuf_format;
4437 key->depth.enabled = lp->depth_stencil->depth_enabled;
4438 key->depth.writemask = lp->depth_stencil->depth_writemask;
4439 key->depth.func = lp->depth_stencil->depth_func;
4440 }
4441 if (lp->depth_stencil->stencil[0].enabled &&
4442 util_format_has_stencil(zsbuf_desc)) {
4443 key->zsbuf_format = zsbuf_format;
4444 memcpy(&key->stencil, &lp->depth_stencil->stencil,
4445 sizeof key->stencil);
4446 }
4447 if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
4448 key->resource_1d = true;
4449 }
4450 key->zsbuf_nr_samples =
4451 util_res_sample_count(lp->framebuffer.zsbuf->texture);
4452
4453 /*
4454 * Restrict depth values if the API is clamped (GL, VK with ext)
4455 * for non-float Z buffers.
4456 */
4457 key->restrict_depth_values =
4458 !(lp->rasterizer->unclamped_fragment_depth_values &&
4459 util_format_get_depth_only(zsbuf_format) == PIPE_FORMAT_Z32_FLOAT);
4460 }
4461
4462 /*
4463 * Propagate the depth clamp setting from the rasterizer state.
4464 */
4465 key->depth_clamp = lp->rasterizer->depth_clamp;
4466
4467 /* alpha test only applies if render buffer 0 is non-integer
4468 * (or does not exist)
4469 */
4470 if (!lp->framebuffer.nr_cbufs ||
4471 !lp->framebuffer.cbufs[0] ||
4472 !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
4473 key->alpha.enabled = lp->depth_stencil->alpha_enabled;
4474 }
4475 if (key->alpha.enabled) {
4476 key->alpha.func = lp->depth_stencil->alpha_func;
4477 /* alpha.ref_value is passed in jit_context */
4478 }
4479
4480 key->flatshade = lp->rasterizer->flatshade;
4481 key->multisample = lp->rasterizer->multisample;
4482 key->no_ms_sample_mask_out = lp->rasterizer->no_ms_sample_mask_out;
4483 if (lp->active_occlusion_queries && !lp->queries_disabled) {
4484 key->occlusion_count = true;
4485 }
4486
4487 memcpy(&key->blend, lp->blend, sizeof key->blend);
4488
4489 key->coverage_samples = 1;
4490 key->min_samples = 1;
4491 if (key->multisample) {
4492 key->coverage_samples =
4493 util_framebuffer_get_num_samples(&lp->framebuffer);
4494 /* Per EXT_shader_framebuffer_fetch spec:
4495 *
4496 * "1. How is framebuffer data treated during multisample rendering?
4497 *
4498 * RESOLVED: Reading the value of gl_LastFragData produces a
4499 * different result for each sample. This implies that all or part
4500 * of the shader be run once for each sample, but has no additional
4501 * implications on fragment shader input variables which may still
4502 * be interpolated per pixel by the implementation."
4503 *
4504 * ARM_shader_framebuffer_fetch_depth_stencil spec further says:
4505 *
4506 * "(1) When multisampling is enabled, does the shader run per sample?
4507 *
4508 * RESOLVED.
4509 *
4510 * This behavior is inherited from either
4511 * EXT_shader_framebuffer_fetch or ARM_shader_framebuffer_fetch as
4512 * described in the interactions section. If neither extension is
4513 * supported, the shader runs once per fragment."
4514 *
4515 * Therefore we should always enable per-sample shading when FB fetch is
4516 * used.
4517 */
4518 if (lp->min_samples > 1 || nir->info.fs.uses_fbfetch_output)
4519 key->min_samples = key->coverage_samples;
4520 }
4521 key->nr_cbufs = lp->framebuffer.nr_cbufs;
4522
4523 if (!key->blend.independent_blend_enable) {
4524 // we always need independent blend, otherwise the fixups below won't work
4525 for (unsigned i = 1; i < key->nr_cbufs; i++) {
4526 memcpy(&key->blend.rt[i], &key->blend.rt[0],
4527 sizeof(key->blend.rt[0]));
4528 }
4529 key->blend.independent_blend_enable = 1;
4530 }
4531
4532 for (unsigned i = 0; i < lp->framebuffer.nr_cbufs; i++) {
4533 struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
4534
4535 if (lp->framebuffer.cbufs[i]) {
4536 const enum pipe_format format = lp->framebuffer.cbufs[i]->format;
4537
4538 key->cbuf_format[i] = format;
4539 key->cbuf_nr_samples[i] =
4540 util_res_sample_count(lp->framebuffer.cbufs[i]->texture);
4541
4542 /*
4543 * Figure out if this is a 1d resource. Note that OpenGL allows crazy
4544 * mixing of 2d textures with height 1 and 1d textures, so make sure
4545 * we pick 1d if any cbuf or zsbuf is 1d.
4546 */
4547 if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
4548 key->resource_1d = true;
4549 }
4550
4551 const struct util_format_description *format_desc =
4552 util_format_description(format);
4553 assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
4554 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
4555
4556 /*
4557 * Mask out color channels not present in the color buffer.
4558 */
4559 blend_rt->colormask &= util_format_colormask(format_desc);
4560
4561 /*
4562 * Disable blend for integer formats.
4563 */
4564 if (util_format_is_pure_integer(format)) {
4565 blend_rt->blend_enable = 0;
4566 }
4567
4568 /*
4569 * Our swizzled render tiles always have an alpha channel, but the
4570 * linear render target format often does not, so force here the dst
4571 * alpha to be one.
4572 *
4573 * This is not a mere optimization. Wrong results will be produced if
4574 * the dst alpha is used, the dst format does not have alpha, and the
4575 * previous rendering was not flushed from the swizzled to linear
4576 * buffer. For example, NonPowTwo DCT.
4577 *
4578 * TODO: This should be generalized to all channels for better
4579 * performance, but only alpha causes correctness issues.
4580 *
4581 * Also, force rgb/alpha func/factors match, to make AoS blending
4582 * easier.
4583 */
4584 if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
4585 format_desc->swizzle[3] == format_desc->swizzle[0]) {
4586 // Doesn't cover mixed snorm/unorm but can't render to them anyway
4587 bool clamped_zero = !util_format_is_float(format) &&
4588 !util_format_is_snorm(format);
4589 blend_rt->rgb_src_factor =
4590 force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
4591 blend_rt->rgb_dst_factor =
4592 force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
4593 blend_rt->alpha_func = blend_rt->rgb_func;
4594 blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
4595 blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
4596 }
4597 } else {
4598 /* no color buffer for this fragment output */
4599 key->cbuf_format[i] = PIPE_FORMAT_NONE;
4600 key->cbuf_nr_samples[i] = 0;
4601 blend_rt->colormask = 0x0;
4602 blend_rt->blend_enable = 0;
4603 }
4604 }
4605
4606 /* This value will be the same for all the variants of a given shader:
4607 */
4608 key->nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4609 key->nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);

   struct lp_sampler_static_state *fs_sampler =
      lp_fs_variant_key_samplers(key);

   memset(fs_sampler, 0,
          MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);

   for (unsigned i = 0; i < key->nr_samplers; ++i) {
      if (BITSET_TEST(nir->info.samplers_used, i)) {
         lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
                                         lp->samplers[PIPE_SHADER_FRAGMENT][i]);
      }
   }

   /*
    * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
    * are dx10-style? Can't really have mixed opcodes, at least not
    * if we want to skip the holes here (without rescanning tgsi).
    */
   if (key->nr_sampler_views) {
      for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
         /*
          * Note sview may exceed what's representable by file_mask.
          * This will still work; the only downside is that views which are
          * not actually used may be included in the shader key.
          */
         if (BITSET_TEST(nir->info.textures_used, i)) {
            lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
                                            lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   } else {
      key->nr_sampler_views = key->nr_samplers;
      for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
         if (BITSET_TEST(nir->info.samplers_used, i)) {
            lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
                                            lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
         }
      }
   }
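   /*
    * The else path above handles shaders without separate sampler views
    * (classic, non-dx10-style sampling), where the sampler unit index is
    * presumably also used as the texture unit index, hence nr_sampler_views
    * falls back to nr_samplers.
    */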

   struct lp_image_static_state *lp_image = lp_fs_variant_key_images(key);
   key->nr_images = BITSET_LAST_BIT(nir->info.images_used);
   if (key->nr_images)
      memset(lp_image, 0,
             key->nr_images * sizeof *lp_image);
   for (unsigned i = 0; i < key->nr_images; ++i) {
      if (BITSET_TEST(nir->info.images_used, i)) {
         lp_sampler_static_texture_state_image(&lp_image[i].image_state,
                                               &lp->images[PIPE_SHADER_FRAGMENT][i]);
      }
   }

   if (shader->kind == LP_FS_KIND_AERO_MINIFICATION) {
      struct lp_sampler_static_state *samp0 =
         lp_fs_variant_key_sampler_idx(key, 0);
      assert(samp0);
      samp0->sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST;
      samp0->sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
   }
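   /*
    * shader->kind is presumably determined by shader analysis at create
    * time; for the recognized Aero minification shader this overrides
    * whatever filters the bound sampler state requested, forcing nearest
    * filtering on sampler 0 as a fast path.
    */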

   return key;
}


/**
 * Update fragment shader state. This is called just prior to drawing
 * something when some fragment-related state has changed.
 */
void
llvmpipe_update_fs(struct llvmpipe_context *lp)
{
   struct lp_fragment_shader *shader = lp->fs;

   char store[LP_FS_MAX_VARIANT_KEY_SIZE];
   const struct lp_fragment_shader_variant_key *key =
      make_variant_key(lp, shader, store);

   struct lp_fragment_shader_variant *variant = NULL;
   struct lp_fs_variant_list_item *li;
   /* Search the variants for one which matches the key */
   LIST_FOR_EACH_ENTRY(li, &shader->variants.list, list) {
      if (memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
         variant = li->base;
         break;
      }
   }

   if (variant) {
      /* Move this variant to the head of the list to implement LRU
       * deletion of variants when we have too many.
       */
      list_move_to(&variant->list_item_global.list, &lp->fs_variants_list.list);
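      /*
       * list_item_global keeps all variants, across every shader, in
       * most-recently-used order; the eviction loop below picks victims
       * from the tail of this same list via list_last_entry().
       */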
   } else {
      /* variant not found, create it now */

      if (LP_DEBUG & DEBUG_FS) {
         debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
                      lp->nr_fs_variants,
                      lp->nr_fs_instrs,
                      lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
      }

      /* First, check if we've exceeded the max number of shader variants.
       * If so, free 6.25% of them (the least recently used ones).
       */
      const unsigned variants_to_cull =
         lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS
         ? LP_MAX_SHADER_VARIANTS / 16 : 0;
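      /* LP_MAX_SHADER_VARIANTS / 16 is the 6.25% mentioned above. */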

      if (variants_to_cull ||
          lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
                         "\t%u instrs,\t%u instrs/variant\n",
                         shader->variants_cached,
                         lp->nr_fs_variants, lp->nr_fs_instrs,
                         lp->nr_fs_instrs / lp->nr_fs_variants);
         }

         /*
          * We need to re-check lp->nr_fs_variants because an arbitrarily
          * large number of shader variants (potentially all of them) could
          * be pending for destruction on flush.
          */

         for (unsigned i = 0;
              i < variants_to_cull ||
              lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS;
              i++) {
            struct lp_fs_variant_list_item *item;
            if (list_is_empty(&lp->fs_variants_list.list)) {
               break;
            }
            item = list_last_entry(&lp->fs_variants_list.list,
                                   struct lp_fs_variant_list_item, list);
            assert(item);
            assert(item->base);
            llvmpipe_remove_shader_variant(lp, item->base);
            struct lp_fragment_shader_variant *evicted = item->base;
            lp_fs_variant_reference(lp, &evicted, NULL);
         }
      }

      /*
       * Generate the new variant.
       */
      int64_t t0 = os_time_get();
      variant = generate_variant(lp, shader, key);
      int64_t t1 = os_time_get();
      int64_t dt = t1 - t0;
      LP_COUNT_ADD(llvm_compile_time, dt);
      LP_COUNT_ADD(nr_llvm_compiles, 2); /* emit vs. omit in/out test */
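      /*
       * The count of 2 reflects that each variant is presumably compiled
       * twice: once with the early in/out coverage test emitted (for
       * partially covered blocks) and once with it omitted (for fully
       * covered blocks).
       */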

      /* Put the new variant into the list */
      if (variant) {
         list_add(&variant->list_item_local.list, &shader->variants.list);
         list_add(&variant->list_item_global.list, &lp->fs_variants_list.list);
         lp->nr_fs_variants++;
         lp->nr_fs_instrs += variant->nr_instrs;
         shader->variants_cached++;
      }
   }

   /* Bind this variant */
   lp_setup_set_fs_variant(lp->setup, variant);
}


void
llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
{
   llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
   llvmpipe->pipe.bind_fs_state = llvmpipe_bind_fs_state;
   llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
   llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
   llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
   llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
}