1 /**************************************************************************
2 *
3 * Copyright 2009 VMware, Inc.
4 * Copyright 2007 VMware, Inc.
5 * All Rights Reserved.
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sub license, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
14 *
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial portions
17 * of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
22 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 *
27 **************************************************************************/
28
29 /**
30 * @file
31 * Code generation for the whole fragment pipeline.
32 *
33 * The fragment pipeline consists of the following stages:
34 * - early depth test
35 * - fragment shader
36 * - alpha test
37 * - depth/stencil test
38 * - blending
39 *
40 * This file has only the glue to assemble the fragment pipeline. The actual
41 * plumbing of converting Gallium state into LLVM IR is done elsewhere, in the
42 * lp_bld_*.[ch] files, in a completely generic and reusable way. Here we
43 * muster the LLVM JIT execution engine to create a function that follows an
44 * established binary interface and that can be called from C directly.
45 *
46 * A big source of complexity here is that we often want to run different
47 * stages with different data types and precisions. For example, the
48 * fragment shader typically needs to be done in floats, but the
49 * depth/stencil test and blending are better done in the types that most
50 * closely match the depth/stencil and color buffers, respectively.
51 *
52 * Since the width of a SIMD vector register stays the same regardless of the
53 * element type, different types imply a different number of elements, so we
54 * must generate more instances of the stages that use larger types in order
55 * to feed/consume the stages that use smaller types.
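 *
 * As a purely illustrative example (not tied to any particular target): with
 * 128-bit registers, a 32-bit float stage works on 4 pixels per iteration
 * while an 8-bit unorm blend stage could handle 16, so four instances of the
 * float-typed code would be generated to feed a single 8-bit blend step.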
56 *
57 * @author Jose Fonseca <jfonseca@vmware.com>
58 */
59
60 #include <limits.h>
61 #include "pipe/p_defines.h"
62 #include "util/u_inlines.h"
63 #include "util/u_memory.h"
64 #include "util/u_pointer.h"
65 #include "util/format/u_format.h"
66 #include "util/u_dump.h"
67 #include "util/u_string.h"
68 #include "util/u_dual_blend.h"
69 #include "util/u_upload_mgr.h"
70 #include "util/os_time.h"
71 #include "pipe/p_shader_tokens.h"
72 #include "draw/draw_context.h"
73 #include "nir/tgsi_to_nir.h"
74 #include "gallivm/lp_bld_type.h"
75 #include "gallivm/lp_bld_const.h"
76 #include "gallivm/lp_bld_conv.h"
77 #include "gallivm/lp_bld_init.h"
78 #include "gallivm/lp_bld_intr.h"
79 #include "gallivm/lp_bld_logic.h"
80 #include "gallivm/lp_bld_tgsi.h"
81 #include "gallivm/lp_bld_nir.h"
82 #include "gallivm/lp_bld_swizzle.h"
83 #include "gallivm/lp_bld_flow.h"
84 #include "gallivm/lp_bld_debug.h"
85 #include "gallivm/lp_bld_arit.h"
86 #include "gallivm/lp_bld_bitarit.h"
87 #include "gallivm/lp_bld_pack.h"
88 #include "gallivm/lp_bld_format.h"
89 #include "gallivm/lp_bld_quad.h"
90 #include "gallivm/lp_bld_gather.h"
91 #include "gallivm/lp_bld_jit_sample.h"
92
93 #include "lp_bld_alpha.h"
94 #include "lp_bld_blend.h"
95 #include "lp_bld_depth.h"
96 #include "lp_bld_interp.h"
97 #include "lp_context.h"
98 #include "lp_debug.h"
99 #include "lp_perf.h"
100 #include "lp_setup.h"
101 #include "lp_state.h"
102 #include "lp_tex_sample.h"
103 #include "lp_flush.h"
104 #include "lp_state_fs.h"
105 #include "lp_rast.h"
106 #include "nir/nir_to_tgsi_info.h"
107
108 #include "lp_screen.h"
109 #include "compiler/nir/nir_serialize.h"
110 #include "util/mesa-sha1.h"
111
112
113 /** Fragment shader number (for debugging) */
114 static unsigned fs_no = 0;
115
116
117 static void
118 load_unswizzled_block(struct gallivm_state *gallivm,
119 LLVMTypeRef base_type,
120 LLVMValueRef base_ptr,
121 LLVMValueRef stride,
122 unsigned block_width,
123 unsigned block_height,
124 LLVMValueRef* dst,
125 struct lp_type dst_type,
126 unsigned dst_count,
127 unsigned dst_alignment);
128 /**
129 * Checks if a format description is an arithmetic format
130 *
131 * That is, a format with irregular channel sizes, such as R3_G3_B2 or R5_G6_B5.
132 */
133 static inline bool
134 is_arithmetic_format(const struct util_format_description *format_desc)
135 {
136 bool arith = false;
137
138 for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
139 arith |= format_desc->channel[i].size != format_desc->channel[0].size;
140 arith |= (format_desc->channel[i].size % 8) != 0;
141 }
142
143 return arith;
144 }
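/*
 * Illustrative examples, derived from the checks above (not exhaustive):
 * PIPE_FORMAT_R5G6B5_UNORM is "arithmetic" because its 5/6/5-bit channels are
 * not byte-sized, whereas PIPE_FORMAT_R8G8B8A8_UNORM is not, since every
 * channel is exactly 8 bits wide.
 */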
145
146
147 /**
148 * Checks if this format requires special handling due to required expansion
149 * to floats for blending, and furthermore has "natural" packed AoS ->
150 * unpacked SoA conversion.
151 */
152 static inline bool
153 format_expands_to_float_soa(const struct util_format_description *format_desc)
154 {
155 if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
156 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
157 return true;
158 }
159 return false;
160 }
161
162
163 /**
164 * Retrieves the type representing the memory layout for a format
165 *
166 * e.g. RGBA16F = 4x half-float and R3G3B2 = 1x byte
167 */
168 static inline void
169 lp_mem_type_from_format_desc(const struct util_format_description *format_desc,
170 struct lp_type* type)
171 {
172 if (format_expands_to_float_soa(format_desc)) {
173 /* just make this a uint with width of block */
174 type->floating = false;
175 type->fixed = false;
176 type->sign = false;
177 type->norm = false;
178 type->width = format_desc->block.bits;
179 type->length = 1;
180 return;
181 }
182
183 int chan = util_format_get_first_non_void_channel(format_desc->format);
184
185 memset(type, 0, sizeof(struct lp_type));
186 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
187 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
188 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
189 type->norm = format_desc->channel[chan].normalized;
190
191 if (is_arithmetic_format(format_desc)) {
192 type->width = 0;
193 type->length = 1;
194
195 for (unsigned i = 0; i < format_desc->nr_channels; ++i) {
196 type->width += format_desc->channel[i].size;
197 }
198 } else {
199 type->width = format_desc->channel[chan].size;
200 type->length = format_desc->nr_channels;
201 }
202 }
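/*
 * A rough sketch of what the above produces (assuming the usual channel
 * layouts): PIPE_FORMAT_R32G32B32A32_FLOAT becomes a floating type with
 * width 32 and length 4, while the arithmetic PIPE_FORMAT_R5G6B5_UNORM
 * collapses to a single normalized 16-bit element (width 5+6+5, length 1).
 */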
203
204
205 /**
206 * Expand the relevant bits of mask_input to an n*4-dword mask for the
207 * n*4 pixels in n 2x2 quads. This will set the n*4 elements of the
208 * quad mask vector to 0 or ~0.
209 * Grouping is 01, 23 for 2-quad mode, hence only 0 and 2 are valid
210 * quad arguments with fs length 8.
211 *
212 * \param first_quad which quad(s) of the quad group to test, in [0,3]
213 * \param mask_input bitwise mask for the whole 4x4 stamp
214 */
215 static LLVMValueRef
216 generate_quad_mask(struct gallivm_state *gallivm,
217 struct lp_type fs_type,
218 unsigned first_quad,
219 unsigned sample,
220 LLVMValueRef mask_input) /* int64 */
221 {
222 LLVMBuilderRef builder = gallivm->builder;
223 LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
224 LLVMValueRef bits[16];
225 LLVMValueRef mask, bits_vec;
226
227 /*
228 * XXX: We'll need a different path for 16 x u8
229 */
230 assert(fs_type.width == 32);
231 assert(fs_type.length <= ARRAY_SIZE(bits));
232 struct lp_type mask_type = lp_int_type(fs_type);
233
234 /*
235 * mask_input >>= (quad * 4)
236 */
237 int shift;
238 switch (first_quad) {
239 case 0:
240 shift = 0;
241 break;
242 case 1:
243 assert(fs_type.length == 4);
244 shift = 2;
245 break;
246 case 2:
247 shift = 8;
248 break;
249 case 3:
250 assert(fs_type.length == 4);
251 shift = 10;
252 break;
253 default:
254 assert(0);
255 shift = 0;
256 }
257
258 mask_input = LLVMBuildLShr(builder, mask_input,
259 lp_build_const_int64(gallivm, 16 * sample), "");
260 mask_input = LLVMBuildTrunc(builder, mask_input, i32t, "");
261 mask_input = LLVMBuildAnd(builder, mask_input,
262 lp_build_const_int32(gallivm, 0xffff), "");
263 mask_input = LLVMBuildLShr(builder, mask_input,
264 LLVMConstInt(i32t, shift, 0), "");
265
266 /*
267 * mask = { mask_input & (1 << i), for i in [0,3] }
268 */
269 mask = lp_build_broadcast(gallivm,
270 lp_build_vec_type(gallivm, mask_type),
271 mask_input);
272
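/*
 * As a concrete illustration, derived from the index math below: with fs
 * length 8 and first_quad 0, lanes 0..7 test mask_input bits
 * {0, 1, 4, 5, 2, 3, 6, 7}, i.e. the two 2x2 quads of the stamp's top half.
 */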
273 for (int i = 0; i < fs_type.length / 4; i++) {
274 unsigned j = 2 * (i % 2) + (i / 2) * 8;
275 bits[4*i + 0] = LLVMConstInt(i32t, 1ULL << (j + 0), 0);
276 bits[4*i + 1] = LLVMConstInt(i32t, 1ULL << (j + 1), 0);
277 bits[4*i + 2] = LLVMConstInt(i32t, 1ULL << (j + 4), 0);
278 bits[4*i + 3] = LLVMConstInt(i32t, 1ULL << (j + 5), 0);
279 }
280 bits_vec = LLVMConstVector(bits, fs_type.length);
281 mask = LLVMBuildAnd(builder, mask, bits_vec, "");
282
283 /*
284 * mask = mask == bits ? ~0 : 0
285 */
286 mask = lp_build_compare(gallivm,
287 mask_type, PIPE_FUNC_EQUAL,
288 mask, bits_vec);
289
290 return mask;
291 }
292
293
294 #define EARLY_DEPTH_TEST 0x1
295 #define LATE_DEPTH_TEST 0x2
296 #define EARLY_DEPTH_WRITE 0x4
297 #define LATE_DEPTH_WRITE 0x8
298 #define EARLY_DEPTH_TEST_INFERRED 0x10 //only with EARLY_DEPTH_TEST
299
300 static unsigned
301 get_cbuf_location(nir_variable *var, unsigned slot)
302 {
303 return (var->data.location - FRAG_RESULT_DATA0) + var->data.index + slot;
304 }
305
306 static int
307 find_output_by_frag_result(struct nir_shader *shader,
308 gl_frag_result frag_result)
309 {
310 nir_foreach_shader_out_variable(var, shader) {
311 int slots = nir_variable_count_slots(var, var->type);
312 for (unsigned s = 0; s < slots; s++) {
313 if (var->data.location + var->data.index + s == frag_result)
314 return var->data.driver_location + s;
315 }
316 }
317
318 return -1;
319 }
320
321 /**
322 * Fetch the specified lp_jit_viewport structure for a given viewport_index.
323 */
324 static LLVMValueRef
325 lp_llvm_viewport(LLVMTypeRef context_type,
326 LLVMValueRef context_ptr,
327 struct gallivm_state *gallivm,
328 LLVMValueRef viewport_index)
329 {
330 LLVMBuilderRef builder = gallivm->builder;
331 LLVMValueRef ptr;
332 LLVMValueRef res;
333 struct lp_type viewport_type =
334 lp_type_float_vec(32, 32 * LP_JIT_VIEWPORT_NUM_FIELDS);
335 LLVMTypeRef vtype = lp_build_vec_type(gallivm, viewport_type);
336
337 ptr = lp_jit_context_viewports(gallivm, context_type, context_ptr);
338 ptr = LLVMBuildPointerCast(builder, ptr,
339 LLVMPointerType(vtype, 0), "");
340
341 res = lp_build_pointer_get2(builder, vtype, ptr, viewport_index);
342
343 return res;
344 }
345
346
347 static LLVMValueRef
348 lp_build_depth_clamp(struct gallivm_state *gallivm,
349 LLVMBuilderRef builder,
350 bool depth_clamp,
351 bool restrict_depth,
352 struct lp_type type,
353 LLVMTypeRef context_type,
354 LLVMValueRef context_ptr,
355 LLVMTypeRef thread_data_type,
356 LLVMValueRef thread_data_ptr,
357 LLVMValueRef z)
358 {
359 LLVMValueRef viewport, min_depth, max_depth;
360 LLVMValueRef viewport_index;
361 struct lp_build_context f32_bld;
362
363 assert(type.floating);
364 lp_build_context_init(&f32_bld, gallivm, type);
365
366 if (restrict_depth)
367 z = lp_build_clamp(&f32_bld, z, f32_bld.zero, f32_bld.one);
368
369 if (!depth_clamp)
370 return z;
371
372 /*
373 * Assumes clamping of the viewport index will occur in setup/gs. Value
374 * is passed through the rasterization stage via lp_rast_shader_inputs.
375 *
376 * See: draw_clamp_viewport_idx and lp_clamp_viewport_idx for clamping
377 * semantics.
378 */
379 viewport_index = lp_jit_thread_data_raster_state_viewport_index(gallivm,
380 thread_data_type,
381 thread_data_ptr);
382
383 /*
384 * Load the min and max depth from the lp_jit_context.viewports
385 * array of lp_jit_viewport structures.
386 */
387 viewport = lp_llvm_viewport(context_type, context_ptr, gallivm, viewport_index);
388
389 /* viewports[viewport_index].min_depth */
390 min_depth = LLVMBuildExtractElement(builder, viewport,
391 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MIN_DEPTH), "");
392 min_depth = lp_build_broadcast_scalar(&f32_bld, min_depth);
393
394 /* viewports[viewport_index].max_depth */
395 max_depth = LLVMBuildExtractElement(builder, viewport,
396 lp_build_const_int32(gallivm, LP_JIT_VIEWPORT_MAX_DEPTH), "");
397 max_depth = lp_build_broadcast_scalar(&f32_bld, max_depth);
398
399 /*
400 * Clamp to the min and max depth values for the given viewport.
401 */
402 return lp_build_clamp(&f32_bld, z, min_depth, max_depth);
403 }
404
405
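/**
 * Emulate alpha-to-coverage across the per-sample coverage masks.
 *
 * A rough illustration of the thresholding below: with 4 coverage samples the
 * reference values are 0, 0.25, 0.5 and 0.75, so e.g. alpha = 0.6 keeps
 * samples 0..2 covered and clears sample 3's coverage.
 */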
406 static void
407 lp_build_sample_alpha_to_coverage(struct gallivm_state *gallivm,
408 struct lp_type type,
409 unsigned coverage_samples,
410 LLVMValueRef num_loop,
411 LLVMValueRef loop_counter,
412 LLVMTypeRef coverage_mask_type,
413 LLVMValueRef coverage_mask_store,
414 LLVMValueRef alpha)
415 {
416 struct lp_build_context bld;
417 LLVMBuilderRef builder = gallivm->builder;
418 float step = 1.0 / coverage_samples;
419
420 lp_build_context_init(&bld, gallivm, type);
421 for (unsigned s = 0; s < coverage_samples; s++) {
422 LLVMValueRef alpha_ref_value = lp_build_const_vec(gallivm, type, step * s);
423 LLVMValueRef test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value);
424
425 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, lp_build_const_int32(gallivm, s), num_loop, "");
426 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_counter, "");
427 LLVMValueRef s_mask_ptr = LLVMBuildGEP2(builder, coverage_mask_type,
428 coverage_mask_store, &s_mask_idx, 1, "");
429 LLVMValueRef s_mask = LLVMBuildLoad2(builder, coverage_mask_type, s_mask_ptr, "");
430 s_mask = LLVMBuildAnd(builder, s_mask, test, "");
431 LLVMBuildStore(builder, s_mask, s_mask_ptr);
432 }
433 }
434
435
436 struct lp_build_fs_llvm_iface {
437 struct lp_build_fs_iface base;
438 struct lp_build_interp_soa_context *interp;
439 struct lp_build_for_loop_state *loop_state;
440 LLVMTypeRef mask_type;
441 LLVMValueRef mask_store;
442 LLVMValueRef sample_id;
443 LLVMValueRef color_ptr_ptr;
444 LLVMValueRef color_stride_ptr;
445 LLVMValueRef color_sample_stride_ptr;
446 LLVMValueRef zs_base_ptr;
447 LLVMValueRef zs_stride;
448 LLVMValueRef zs_sample_stride;
449 const struct lp_fragment_shader_variant_key *key;
450 };
451
452
453 static LLVMValueRef
454 fs_interp(const struct lp_build_fs_iface *iface,
455 struct lp_build_context *bld,
456 unsigned attrib, unsigned chan,
457 bool centroid, bool sample,
458 LLVMValueRef attrib_indir,
459 LLVMValueRef offsets[2])
460 {
461 struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
462 struct lp_build_interp_soa_context *interp = fs_iface->interp;
463 unsigned loc = TGSI_INTERPOLATE_LOC_CENTER;
464 if (centroid)
465 loc = TGSI_INTERPOLATE_LOC_CENTROID;
466 if (sample)
467 loc = TGSI_INTERPOLATE_LOC_SAMPLE;
468
469 return lp_build_interp_soa(interp, bld->gallivm, fs_iface->loop_state->counter,
470 fs_iface->mask_type, fs_iface->mask_store,
471 attrib, chan, loc, attrib_indir, offsets);
472 }
473
474
475 /**
476 * Convert depth-stencil format to a single component one, returning
477 * PIPE_FORMAT_NONE if it doesn't contain the required component.
478 */
479 static enum pipe_format
480 select_zs_component_format(enum pipe_format format,
481 bool fetch_stencil)
482 {
483 const struct util_format_description* desc = util_format_description(format);
484 if (fetch_stencil && !util_format_has_stencil(desc))
485 return PIPE_FORMAT_NONE;
486 if (!fetch_stencil && !util_format_has_depth(desc))
487 return PIPE_FORMAT_NONE;
488
489 switch (format) {
490 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
491 return fetch_stencil ? PIPE_FORMAT_X24S8_UINT : PIPE_FORMAT_Z24X8_UNORM;
492 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
493 return fetch_stencil ? PIPE_FORMAT_S8X24_UINT : PIPE_FORMAT_X8Z24_UNORM;
494 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
495 return fetch_stencil ? PIPE_FORMAT_X32_S8X24_UINT : format;
496 default:
497 return format;
498 }
499 }
500
501 static void
502 fs_fb_fetch(const struct lp_build_fs_iface *iface,
503 struct lp_build_context *bld,
504 int location,
505 LLVMValueRef result[4])
506 {
507 struct lp_build_fs_llvm_iface *fs_iface = (struct lp_build_fs_llvm_iface *)iface;
508 struct gallivm_state *gallivm = bld->gallivm;
509 LLVMBuilderRef builder = gallivm->builder;
510 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
511 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
512 LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
513 const struct lp_fragment_shader_variant_key *key = fs_iface->key;
514
515 LLVMValueRef buf_ptr;
516 LLVMValueRef stride;
517 enum pipe_format buf_format;
518
519 const bool fetch_stencil = location == FRAG_RESULT_STENCIL;
520 const bool fetch_zs = fetch_stencil || location == FRAG_RESULT_DEPTH;
521 if (fetch_zs) {
522 buf_ptr = fs_iface->zs_base_ptr;
523 stride = fs_iface->zs_stride;
524 buf_format = select_zs_component_format(key->zsbuf_format, fetch_stencil);
525 } else {
526 assert(location >= FRAG_RESULT_DATA0 && location <= FRAG_RESULT_DATA7);
527 const int cbuf = location - FRAG_RESULT_DATA0;
528 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
529
530 buf_ptr = LLVMBuildLoad2(builder, int8p_type,
531 LLVMBuildGEP2(builder, int8p_type,
532 fs_iface->color_ptr_ptr, &index, 1, ""), "");
533 stride = LLVMBuildLoad2(builder, int32_type,
534 LLVMBuildGEP2(builder, int32_type,
535 fs_iface->color_stride_ptr, &index, 1, ""), "");
536 buf_format = key->cbuf_format[cbuf];
537 }
538
539 const struct util_format_description* out_format_desc = util_format_description(buf_format);
540 if (out_format_desc->format == PIPE_FORMAT_NONE) {
541 result[0] = result[1] = result[2] = result[3] = bld->undef;
542 return;
543 }
544
545 unsigned block_size = bld->type.length;
546 unsigned block_height = key->resource_1d ? 1 : 2;
547 unsigned block_width = block_size / block_height;
548
549 if (key->multisample) {
550 LLVMValueRef sample_stride;
551
552 if (fetch_zs) {
553 sample_stride = fs_iface->zs_sample_stride;
554 } else {
555 LLVMValueRef index = lp_build_const_int32(gallivm, location - FRAG_RESULT_DATA0);
556 sample_stride = LLVMBuildLoad2(builder, int32_type,
557 LLVMBuildGEP2(builder,
558 int32_type,
559 fs_iface->color_sample_stride_ptr,
560 &index, 1, ""), "");
561 }
562
563 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_stride, fs_iface->sample_id, "");
564 buf_ptr = LLVMBuildGEP2(builder, int8_type,
565 buf_ptr, &sample_offset, 1, "");
566 }
567
568 /* The fragment shader executes on 4x4 blocks. Depending on the vector width
569 * it takes 2 or 4 iterations per block. Only move to the next row once the
570 * top row has completed: 8-wide takes 1 iteration per row, 4-wide takes 2. */
571 LLVMValueRef x_offset = NULL, y_offset = NULL;
572 if (!key->resource_1d) {
573 LLVMValueRef counter = fs_iface->loop_state->counter;
574
575 if (block_size == 4) {
576 x_offset = LLVMBuildShl(builder,
577 LLVMBuildAnd(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), ""),
578 lp_build_const_int32(gallivm, 1), "");
579 counter = LLVMBuildLShr(builder, fs_iface->loop_state->counter, lp_build_const_int32(gallivm, 1), "");
580 }
581 y_offset = LLVMBuildMul(builder, counter, lp_build_const_int32(gallivm, 2), "");
582 }
583
584 LLVMValueRef offsets[4 * 4];
585 for (unsigned i = 0; i < block_size; i++) {
586 unsigned x = i % block_width;
587 unsigned y = i / block_width;
588
589 if (block_size == 8) {
590 /* remap the raw slots into the fragment shader execution mode. */
591 /* this math took me way too long to work out, I'm sure it's
592 * overkill.
593 */
594 x = (i & 1) + ((i >> 2) << 1);
595 if (!key->resource_1d)
596 y = (i & 2) >> 1;
597 }
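/*
 * For reference (worked out from the remap above), an 8-wide,
 * non-resource_1d block maps i = 0..7 to (x, y) =
 * (0,0) (1,0) (0,1) (1,1) (2,0) (3,0) (2,1) (3,1).
 */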
598
599 LLVMValueRef x_val;
600 if (x_offset) {
601 x_val = LLVMBuildAdd(builder, lp_build_const_int32(gallivm, x), x_offset, "");
602 x_val = LLVMBuildMul(builder, x_val, lp_build_const_int32(gallivm, out_format_desc->block.bits / 8), "");
603 } else {
604 x_val = lp_build_const_int32(gallivm, x * (out_format_desc->block.bits / 8));
605 }
606
607 LLVMValueRef y_val = lp_build_const_int32(gallivm, y);
608 if (y_offset)
609 y_val = LLVMBuildAdd(builder, y_val, y_offset, "");
610 y_val = LLVMBuildMul(builder, y_val, stride, "");
611
612 offsets[i] = LLVMBuildAdd(builder, x_val, y_val, "");
613 }
614 LLVMValueRef offset = lp_build_gather_values(gallivm, offsets, block_size);
615
616 struct lp_type texel_type = bld->type;
617 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
618 out_format_desc->channel[0].pure_integer) {
619 if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
620 texel_type = lp_type_int_vec(bld->type.width, bld->type.width * bld->type.length);
621 } else if (out_format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
622 texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
623 }
624 } else if (fetch_stencil) {
625 texel_type = lp_type_uint_vec(bld->type.width, bld->type.width * bld->type.length);
626 }
627
628 lp_build_fetch_rgba_soa(gallivm, out_format_desc, texel_type,
629 true, buf_ptr, offset,
630 NULL, NULL, NULL, result);
631 }
632
633 /**
634 * Generate the fragment shader, depth/stencil test, and alpha tests.
635 */
636 static void
637 generate_fs_loop(struct gallivm_state *gallivm,
638 struct lp_fragment_shader *shader,
639 const struct lp_fragment_shader_variant_key *key,
640 LLVMBuilderRef builder,
641 struct lp_type type,
642 LLVMTypeRef context_type,
643 LLVMValueRef context_ptr,
644 LLVMTypeRef resources_type,
645 LLVMValueRef resources_ptr,
646 LLVMTypeRef sample_pos_type,
647 LLVMValueRef sample_pos_array,
648 LLVMValueRef num_loop,
649 struct lp_build_interp_soa_context *interp,
650 const struct lp_build_sampler_soa *sampler,
651 const struct lp_build_image_soa *image,
652 LLVMTypeRef mask_type,
653 LLVMValueRef mask_store,
654 LLVMValueRef (*out_color)[4],
655 LLVMValueRef depth_base_ptr,
656 LLVMValueRef depth_stride,
657 LLVMValueRef depth_sample_stride,
658 LLVMValueRef color_ptr_ptr,
659 LLVMValueRef color_stride_ptr,
660 LLVMValueRef color_sample_stride_ptr,
661 LLVMValueRef facing,
662 LLVMTypeRef thread_data_type,
663 LLVMValueRef thread_data_ptr)
664 {
665 struct lp_type int_type = lp_int_type(type);
666 LLVMValueRef mask_ptr = NULL, mask_val = NULL;
667 LLVMValueRef z;
668 LLVMValueRef z_value, s_value;
669 LLVMValueRef z_fb, s_fb;
670 LLVMValueRef zs_samples = lp_build_const_int32(gallivm, key->zsbuf_nr_samples);
671 LLVMValueRef z_out = NULL, s_out = NULL;
672 struct lp_build_for_loop_state loop_state, sample_loop_state = {0};
673 struct lp_build_mask_context mask;
674 struct nir_shader *nir = shader->base.ir.nir;
675 const bool dual_source_blend = key->blend.rt[0].blend_enable &&
676 util_blend_state_is_dual(&key->blend, 0);
677 const bool post_depth_coverage = nir->info.fs.post_depth_coverage;
678
679 struct lp_bld_tgsi_system_values system_values;
680
681 memset(&system_values, 0, sizeof(system_values));
682
683 /* truncate then sign extend. */
684 system_values.front_facing =
685 LLVMBuildTrunc(gallivm->builder, facing,
686 LLVMInt1TypeInContext(gallivm->context), "");
687 system_values.front_facing =
688 LLVMBuildSExt(gallivm->builder, system_values.front_facing,
689 LLVMInt32TypeInContext(gallivm->context), "");
690 system_values.view_index =
691 lp_jit_thread_data_raster_state_view_index(gallivm,
692 thread_data_type,
693 thread_data_ptr);
694
695 unsigned depth_mode;
696 const struct util_format_description *zs_format_desc = NULL;
697 if (key->depth.enabled ||
698 key->stencil[0].enabled) {
699 zs_format_desc = util_format_description(key->zsbuf_format);
700
701 if (nir->info.fs.early_fragment_tests || nir->info.fs.post_depth_coverage) {
702 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
703 } else if (!(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) &&
704 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) &&
705 !nir->info.fs.uses_fbfetch_output && !nir->info.writes_memory) {
706 if (key->alpha.enabled ||
707 key->blend.alpha_to_coverage ||
708 nir->info.fs.uses_discard ||
709 nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
710 /* With alpha test and kill, can do the depth test early
711 * and hopefully eliminate some quads. But need to do a
712 * special deferred depth write once the final mask value
713 * is known. This only works though if there's either no
714 * stencil test or the stencil value isn't written.
715 */
716 if (key->stencil[0].enabled && (key->stencil[0].writemask ||
717 (key->stencil[1].enabled &&
718 key->stencil[1].writemask)))
719 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
720 else
721 depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
722 } else {
723 depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE | EARLY_DEPTH_TEST_INFERRED;
724 }
725 } else {
726 depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
727 }
728
729 if (!(key->depth.enabled && key->depth.writemask) &&
730 !(key->stencil[0].enabled && (key->stencil[0].writemask ||
731 (key->stencil[1].enabled &&
732 key->stencil[1].writemask))))
733 depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
734 } else {
735 depth_mode = 0;
736 }
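/*
 * For instance, in the inferred case above: with the alpha test (or discard)
 * enabled, no depth/stencil outputs written by the shader and no stencil
 * writemask, the depth test still runs early
 * (EARLY_DEPTH_TEST | EARLY_DEPTH_TEST_INFERRED) but the depth write is
 * deferred (LATE_DEPTH_WRITE) until the final kill mask is known.
 */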
737
738 LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
739 LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, int_type);
740
741 LLVMValueRef stencil_refs[2];
742 stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_type, context_ptr);
743 stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_type, context_ptr);
744 /* convert scalar stencil refs into vectors */
745 stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
746 stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
747
748 LLVMValueRef consts_ptr = lp_jit_resources_constants(gallivm, resources_type, resources_ptr);
749
750 LLVMValueRef ssbo_ptr = lp_jit_resources_ssbos(gallivm, resources_type, resources_ptr);
751
752 LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS];
753 memset(outputs, 0, sizeof outputs);
754
755 /* Allocate color storage for each fragment sample */
756 LLVMValueRef color_store_size = num_loop;
757 if (key->min_samples > 1)
758 color_store_size = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, key->min_samples), "");
759
760 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
761 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
762 out_color[cbuf][chan] = lp_build_array_alloca(gallivm,
763 lp_build_vec_type(gallivm,
764 type),
765 color_store_size, "color");
766 }
767 }
768 if (dual_source_blend) {
769 assert(key->nr_cbufs <= 1);
770 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
771 out_color[1][chan] = lp_build_array_alloca(gallivm,
772 lp_build_vec_type(gallivm,
773 type),
774 color_store_size, "color1");
775 }
776 }
777 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
778 z_out = lp_build_array_alloca(gallivm,
779 lp_build_vec_type(gallivm, type),
780 color_store_size, "depth");
781 }
782
783 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
784 s_out = lp_build_array_alloca(gallivm,
785 lp_build_vec_type(gallivm, type),
786 color_store_size, "stencil");
787 }
788
789 lp_build_for_loop_begin(&loop_state, gallivm,
790 lp_build_const_int32(gallivm, 0),
791 LLVMIntULT,
792 num_loop,
793 lp_build_const_int32(gallivm, 1));
794
795 LLVMValueRef sample_mask_in;
796 if (key->multisample) {
797 sample_mask_in = lp_build_const_int_vec(gallivm, type, 0);
798 /* create shader execution mask by combining all sample masks. */
799 for (unsigned s = 0; s < key->coverage_samples; s++) {
800 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, num_loop, lp_build_const_int32(gallivm, s), "");
801 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
802 LLVMValueRef s_mask = lp_build_pointer_get2(builder, mask_type, mask_store, s_mask_idx);
803 if (s == 0)
804 mask_val = s_mask;
805 else
806 mask_val = LLVMBuildOr(builder, s_mask, mask_val, "");
807
808 LLVMValueRef mask_in = LLVMBuildAnd(builder, s_mask, lp_build_const_int_vec(gallivm, type, (1ll << s)), "");
809 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
810 }
811 } else {
812 sample_mask_in = lp_build_const_int_vec(gallivm, type, 1);
813 mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
814 &loop_state.counter, 1, "mask_ptr");
815 mask_val = LLVMBuildLoad2(builder, mask_type, mask_ptr, "");
816
817 LLVMValueRef mask_in = LLVMBuildAnd(builder, mask_val, lp_build_const_int_vec(gallivm, type, 1), "");
818 sample_mask_in = LLVMBuildOr(builder, sample_mask_in, mask_in, "");
819 }
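/*
 * A rough reading of the code above: in the multisample path each lane of
 * sample_mask_in ends up holding a bitmask of the samples covering that
 * pixel, e.g. with 4 coverage samples a pixel covered by samples 0 and 2
 * gets 0x5, while the single-sample path simply uses bit 0.
 */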
820
821 /* 'mask' will control execution based on quad's pixel alive/killed state */
822 lp_build_mask_begin(&mask, gallivm, type, mask_val);
823
824 if (!(depth_mode & EARLY_DEPTH_TEST))
825 lp_build_mask_check(&mask);
826
827 /* Create storage for recombining sample masks after early Z pass. */
828 LLVMValueRef s_mask_or = lp_build_alloca(gallivm, int_vec_type, "cov_mask_early_depth");
829 LLVMBuildStore(builder, LLVMConstNull(int_vec_type), s_mask_or);
830
831 /* Create storage for post depth sample mask */
832 LLVMValueRef post_depth_sample_mask_in = NULL;
833 if (post_depth_coverage)
834 post_depth_sample_mask_in = lp_build_alloca(gallivm, int_vec_type, "post_depth_sample_mask_in");
835
836 LLVMValueRef s_mask = NULL, s_mask_ptr = NULL;
837 LLVMValueRef z_sample_value_store = NULL, s_sample_value_store = NULL;
838 LLVMValueRef z_fb_store = NULL, s_fb_store = NULL;
839 LLVMTypeRef z_type = NULL, z_fb_type = NULL;
840
841 /* Run early depth once per sample */
842 if (key->multisample) {
843
844 if (zs_format_desc) {
845 struct lp_type zs_type = lp_depth_type(zs_format_desc, type.length);
846 struct lp_type z_type = zs_type;
847 struct lp_type s_type = zs_type;
848 if (zs_format_desc->block.bits < type.width)
849 z_type.width = type.width;
850 if (zs_format_desc->block.bits == 8) {
851 s_type.width = type.width;
852 } else if (zs_format_desc->block.bits > 32) {
853 z_type.width = z_type.width / 2;
854 s_type.width = s_type.width / 2;
855 s_type.floating = 0;
856 }
857 z_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
858 zs_samples, "z_sample_store");
859 s_sample_value_store = lp_build_array_alloca(gallivm, lp_build_int_vec_type(gallivm, type),
860 zs_samples, "s_sample_store");
861 z_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, z_type),
862 zs_samples, "z_fb_store");
863 s_fb_store = lp_build_array_alloca(gallivm, lp_build_vec_type(gallivm, s_type),
864 zs_samples, "s_fb_store");
865 }
866 lp_build_for_loop_begin(&sample_loop_state, gallivm,
867 lp_build_const_int32(gallivm, 0),
868 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
869 lp_build_const_int32(gallivm, 1));
870
871 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
872 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
873 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
874
875 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
876 s_mask = LLVMBuildAnd(builder, s_mask, mask_val, "");
877 }
878
879
880 /* For multisample, Z needs to be interpolated at the sample points for testing. */
881 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter,
882 key->multisample
883 ? sample_loop_state.counter : NULL);
884 z = interp->pos[2];
885
886 LLVMValueRef depth_ptr = depth_base_ptr;
887 if (key->multisample) {
888 LLVMValueRef sample_offset =
889 LLVMBuildMul(builder, sample_loop_state.counter,
890 depth_sample_stride, "");
891 depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
892 depth_ptr, &sample_offset, 1, "");
893 }
894
895 if (depth_mode & EARLY_DEPTH_TEST) {
896 z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
897 key->restrict_depth_values, type,
898 context_type, context_ptr,
899 thread_data_type, thread_data_ptr, z);
900
901 lp_build_depth_stencil_load_swizzled(gallivm, type,
902 zs_format_desc, key->resource_1d,
903 depth_ptr, depth_stride,
904 &z_fb, &s_fb, loop_state.counter);
905 lp_build_depth_stencil_test(gallivm,
906 &key->depth,
907 key->stencil,
908 type,
909 zs_format_desc,
910 key->multisample ? NULL : &mask,
911 &s_mask,
912 stencil_refs,
913 z, z_fb, s_fb,
914 facing,
915 &z_value, &s_value,
916 !key->multisample,
917 key->restrict_depth_values);
918
919 if (depth_mode & EARLY_DEPTH_WRITE) {
920 lp_build_depth_stencil_write_swizzled(gallivm, type,
921 zs_format_desc, key->resource_1d,
922 NULL, NULL, NULL, loop_state.counter,
923 depth_ptr, depth_stride,
924 z_value, s_value);
925 }
926 /*
927 * Note: if stencil is enabled, the mask check must come after the ds write,
928 * not after the stencil test; otherwise new stencil values may not get
929 * written if all fragments got killed by the depth/stencil test.
930 */
931 if (key->stencil[0].enabled && !key->multisample)
932 lp_build_mask_check(&mask);
933
934 if (key->multisample) {
935 z_fb_type = LLVMTypeOf(z_fb);
936 z_type = LLVMTypeOf(z_value);
937 lp_build_pointer_set(builder, z_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, z_value, lp_build_int_vec_type(gallivm, type), ""));
938 lp_build_pointer_set(builder, s_sample_value_store, sample_loop_state.counter, LLVMBuildBitCast(builder, s_value, lp_build_int_vec_type(gallivm, type), ""));
939 lp_build_pointer_set(builder, z_fb_store, sample_loop_state.counter, z_fb);
940 lp_build_pointer_set(builder, s_fb_store, sample_loop_state.counter, s_fb);
941 }
942 if (key->occlusion_count && !(depth_mode & EARLY_DEPTH_TEST_INFERRED)) {
943 LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
944 lp_build_name(counter, "counter");
945 lp_build_occlusion_count(gallivm, type,
946 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
947 }
948 }
949
950 if (key->multisample) {
951 /*
952 * Store the post-early Z coverage mask.
953 * Recombine the resulting coverage masks post early Z into the fragment
954 * shader execution mask.
955 */
956 LLVMValueRef tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
957 tmp_s_mask_or = LLVMBuildOr(builder, tmp_s_mask_or, s_mask, "");
958 LLVMBuildStore(builder, tmp_s_mask_or, s_mask_or);
959
960 if (post_depth_coverage) {
961 LLVMValueRef mask_bit_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
962 LLVMValueRef post_depth_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
963 mask_bit_idx = LLVMBuildAnd(builder, s_mask, lp_build_broadcast(gallivm, int_vec_type, mask_bit_idx), "");
964 post_depth_mask_in = LLVMBuildOr(builder, post_depth_mask_in, mask_bit_idx, "");
965 LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
966 }
967
968 LLVMBuildStore(builder, s_mask, s_mask_ptr);
969
970 lp_build_for_loop_end(&sample_loop_state);
971
972 /* recombine all the coverage masks into the shader exec mask. */
973 tmp_s_mask_or = LLVMBuildLoad2(builder, int_vec_type, s_mask_or, "");
974 lp_build_mask_update(&mask, tmp_s_mask_or);
975
976 if (key->min_samples == 1) {
977 /* for multisample, Z needs to be re-interpolated at the pixel center */
978 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, NULL);
979 z = interp->pos[2];
980 lp_build_mask_update(&mask, tmp_s_mask_or);
981 }
982 } else {
983 if (post_depth_coverage) {
984 LLVMValueRef post_depth_mask_in = LLVMBuildAnd(builder, lp_build_mask_value(&mask), lp_build_const_int_vec(gallivm, type, 1), "");
985 LLVMBuildStore(builder, post_depth_mask_in, post_depth_sample_mask_in);
986 }
987 }
988
989 LLVMValueRef out_sample_mask_storage = NULL;
990 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
991 out_sample_mask_storage = lp_build_alloca(gallivm, int_vec_type, "write_mask");
992 if (key->min_samples > 1)
993 LLVMBuildStore(builder, LLVMConstNull(int_vec_type), out_sample_mask_storage);
994 }
995
996 if (post_depth_coverage) {
997 system_values.sample_mask_in = LLVMBuildLoad2(builder, int_vec_type, post_depth_sample_mask_in, "");
998 } else {
999 system_values.sample_mask_in = sample_mask_in;
1000 }
1001 if (key->multisample && key->min_samples > 1) {
1002 lp_build_for_loop_begin(&sample_loop_state, gallivm,
1003 lp_build_const_int32(gallivm, 0),
1004 LLVMIntULT,
1005 lp_build_const_int32(gallivm, key->min_samples),
1006 lp_build_const_int32(gallivm, 1));
1007
1008 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1009 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1010 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1011 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1012 lp_build_mask_force(&mask, s_mask);
1013 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, sample_loop_state.counter);
1014 system_values.sample_id = sample_loop_state.counter;
1015 system_values.sample_mask_in = LLVMBuildAnd(builder, system_values.sample_mask_in,
1016 lp_build_broadcast(gallivm, int_vec_type,
1017 LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "")), "");
1018 } else {
1019 system_values.sample_id = lp_build_const_int32(gallivm, 0);
1020
1021 }
1022 system_values.sample_pos = sample_pos_array;
1023 system_values.sample_pos_type = sample_pos_type;
1024
1025 lp_build_interp_soa_update_inputs_dyn(interp, gallivm, loop_state.counter,
1026 mask_type, mask_store, sample_loop_state.counter);
1027
1028 struct lp_build_fs_llvm_iface fs_iface = {
1029 .base.interp_fn = fs_interp,
1030 .base.fb_fetch = fs_fb_fetch,
1031 .interp = interp,
1032 .loop_state = &loop_state,
1033 .sample_id = system_values.sample_id,
1034 .mask_type = mask_type,
1035 .mask_store = mask_store,
1036 .color_ptr_ptr = color_ptr_ptr,
1037 .color_stride_ptr = color_stride_ptr,
1038 .color_sample_stride_ptr = color_sample_stride_ptr,
1039 .zs_base_ptr = depth_base_ptr,
1040 .zs_stride = depth_stride,
1041 .zs_sample_stride = depth_sample_stride,
1042 .key = key,
1043 };
1044
1045 struct lp_build_tgsi_params params;
1046 memset(&params, 0, sizeof(params));
1047
1048 params.type = type;
1049 params.mask = &mask;
1050 params.fs_iface = &fs_iface.base;
1051 params.consts_ptr = consts_ptr;
1052 params.system_values = &system_values;
1053 params.inputs = interp->inputs;
1054 params.num_inputs = interp->num_attribs - 1;
1055 params.context_type = context_type;
1056 params.context_ptr = context_ptr;
1057 params.resources_type = resources_type;
1058 params.resources_ptr = resources_ptr;
1059 params.thread_data_type = thread_data_type;
1060 params.thread_data_ptr = thread_data_ptr;
1061 params.sampler = sampler;
1062 params.info = &shader->info.base;
1063 params.ssbo_ptr = ssbo_ptr;
1064 params.image = image;
1065 params.aniso_filter_table = lp_jit_resources_aniso_filter_table(gallivm, resources_type, resources_ptr);
1066
1067 /* Build the actual shader */
1068 lp_build_nir_soa(gallivm, nir, ¶ms, outputs);
1069
1070 /* Alpha test */
1071 if (key->alpha.enabled) {
1072 int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1073
1074 if (color0 != -1 && outputs[color0][3]) {
1075 const struct util_format_description *cbuf_format_desc;
1076 LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1077 LLVMValueRef alpha_ref_value;
1078
1079 alpha_ref_value = lp_jit_context_alpha_ref_value(gallivm, context_type, context_ptr);
1080 alpha_ref_value = lp_build_broadcast(gallivm, vec_type, alpha_ref_value);
1081
1082 cbuf_format_desc = util_format_description(key->cbuf_format[0]);
1083
1084 lp_build_alpha_test(gallivm, key->alpha.func, type, cbuf_format_desc,
1085 &mask, alpha, alpha_ref_value,
1086 ((depth_mode & LATE_DEPTH_TEST) != 0) && !key->multisample);
1087 }
1088 }
1089
1090 /* Emulate Alpha to Coverage with Alpha test */
1091 if (key->blend.alpha_to_coverage) {
1092 int color0 = find_output_by_frag_result(nir, FRAG_RESULT_DATA0);
1093
1094 if (color0 != -1 && outputs[color0][3]) {
1095 LLVMValueRef alpha = LLVMBuildLoad2(builder, vec_type, outputs[color0][3], "alpha");
1096
1097 if (!key->multisample) {
1098 lp_build_alpha_to_coverage(gallivm, type,
1099 &mask, alpha,
1100 (depth_mode & LATE_DEPTH_TEST) != 0);
1101 } else {
1102 lp_build_sample_alpha_to_coverage(gallivm, type, key->coverage_samples, num_loop,
1103 loop_state.counter,
1104 mask_type, mask_store, alpha);
1105 }
1106 }
1107 }
1108
1109 if (key->blend.alpha_to_one) {
1110 nir_foreach_shader_out_variable(var, nir) {
1111 if (var->data.location < FRAG_RESULT_DATA0)
1112 continue;
1113 int slots = nir_variable_count_slots(var, var->type);
1114 for (unsigned s = 0; s < slots; s++) {
1115 unsigned cbuf = get_cbuf_location(var, s);
1116 if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend))
1117 if (outputs[cbuf][3]) {
1118 LLVMBuildStore(builder, lp_build_const_vec(gallivm, type, 1.0),
1119 outputs[cbuf][3]);
1120 }
1121 }
1122 }
1123 }
1124
1125 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) {
1126 LLVMValueRef output_smask = NULL;
1127 int smaski = find_output_by_frag_result(nir, FRAG_RESULT_SAMPLE_MASK);
1128
1129 struct lp_build_context smask_bld;
1130 lp_build_context_init(&smask_bld, gallivm, int_type);
1131
1132 assert(smaski >= 0);
1133 output_smask = LLVMBuildLoad2(builder, vec_type, outputs[smaski][0], "smask");
1134 output_smask = LLVMBuildBitCast(builder, output_smask, smask_bld.vec_type, "");
1135 if (!key->multisample && key->no_ms_sample_mask_out) {
1136 output_smask = lp_build_and(&smask_bld, output_smask, smask_bld.one);
1137 output_smask = lp_build_cmp(&smask_bld, PIPE_FUNC_NOTEQUAL, output_smask, smask_bld.zero);
1138 lp_build_mask_update(&mask, output_smask);
1139 }
1140
1141 if (key->min_samples > 1) {
1142 /* only the bit corresponding to this sample is to be used. */
1143 LLVMValueRef tmp_mask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "tmp_mask");
1144 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1145 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, lp_build_broadcast(gallivm, int_vec_type, out_smask_idx), "");
1146 output_smask = LLVMBuildOr(builder, tmp_mask, smask_bit, "");
1147 }
1148
1149 LLVMBuildStore(builder, output_smask, out_sample_mask_storage);
1150 }
1151
1152 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1153 int pos0 = find_output_by_frag_result(nir, FRAG_RESULT_DEPTH);
1154
1155 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[pos0][2], "");
1156 LLVMValueRef idx = loop_state.counter;
1157 if (key->min_samples > 1)
1158 idx = LLVMBuildAdd(builder, idx,
1159 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1160 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1161 LLVMBuildStore(builder, out, ptr);
1162 }
1163
1164 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1165 int sten_out = find_output_by_frag_result(nir, FRAG_RESULT_STENCIL);
1166
1167 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type,
1168 outputs[sten_out][1], "output.s");
1169 LLVMValueRef idx = loop_state.counter;
1170 if (key->min_samples > 1)
1171 idx = LLVMBuildAdd(builder, idx,
1172 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1173 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1174 LLVMBuildStore(builder, out, ptr);
1175 }
1176
1177 bool has_cbuf0_write = false;
1178 /* Color write - per fragment sample */
1179 nir_foreach_shader_out_variable(var, nir) {
1180 if (var->data.location < FRAG_RESULT_DATA0)
1181 continue;
1182 int slots = nir_variable_count_slots(var, var->type);
1183
1184 for (unsigned s = 0; s < slots; s++) {
1185 unsigned cbuf = get_cbuf_location(var, s);
1186 unsigned attrib = var->data.driver_location + s;
1187 if ((cbuf < key->nr_cbufs) || (cbuf == 1 && dual_source_blend)) {
1188 if (cbuf == 0) {
1189 /* XXX: there is an edge case with FB fetch where gl_FragColor and
1190 * gl_LastFragData[0] are used together. This creates both
1191 * FRAG_RESULT_COLOR and FRAG_RESULT_DATA* output variables. This
1192 * loop then writes to cbuf 0 twice, overwriting the correct value
1193 * from gl_FragColor with some garbage. This case is exercised in
1194 * one of deqp tests. A similar bug can happen if
1195 * gl_SecondaryFragColorEXT and gl_LastFragData[1] are mixed in
1196 * the same fashion... This workaround will break if
1197 * gl_LastFragData[0] goes in outputs list before
1198 * gl_FragColor. This doesn't seem to happen though.
1199 */
1200 if (has_cbuf0_write)
1201 continue;
1202 has_cbuf0_write = true;
1203 }
1204
1205 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
1206 if (outputs[attrib][chan]) {
1207 /* XXX: just initialize outputs to point at colors[] and
1208 * skip this.
1209 */
1210 LLVMValueRef out = LLVMBuildLoad2(builder, vec_type, outputs[attrib][chan], "");
1211 LLVMValueRef color_ptr;
1212 LLVMValueRef color_idx = loop_state.counter;
1213 if (key->min_samples > 1)
1214 color_idx = LLVMBuildAdd(builder, color_idx,
1215 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1216 color_ptr = LLVMBuildGEP2(builder, vec_type, out_color[cbuf][chan],
1217 &color_idx, 1, "");
1218 lp_build_name(out, "color%u.%c", attrib, "rgba"[chan]);
1219 LLVMBuildStore(builder, out, color_ptr);
1220 }
1221 }
1222 }
1223 }
1224 }
1225
1226 if (key->multisample && key->min_samples > 1) {
1227 LLVMBuildStore(builder, lp_build_mask_value(&mask), s_mask_ptr);
1228 lp_build_for_loop_end(&sample_loop_state);
1229 }
1230
1231 if (key->multisample) {
1232 /* execute depth test for each sample */
1233 lp_build_for_loop_begin(&sample_loop_state, gallivm,
1234 lp_build_const_int32(gallivm, 0),
1235 LLVMIntULT, lp_build_const_int32(gallivm, key->coverage_samples),
1236 lp_build_const_int32(gallivm, 1));
1237
1238 /* load the per-sample coverage mask */
1239 LLVMValueRef s_mask_idx = LLVMBuildMul(builder, sample_loop_state.counter, num_loop, "");
1240 s_mask_idx = LLVMBuildAdd(builder, s_mask_idx, loop_state.counter, "");
1241 s_mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &s_mask_idx, 1, "");
1242
1243 /* combine the execution mask post fragment shader with the coverage mask. */
1244 s_mask = LLVMBuildLoad2(builder, mask_type, s_mask_ptr, "");
1245 if (key->min_samples == 1)
1246 s_mask = LLVMBuildAnd(builder, s_mask, lp_build_mask_value(&mask), "");
1247
1248 /* if the shader writes the sample mask use that,
1249 * but only if this isn't genuine early-depth, to avoid breaking occlusion queries */
1250 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1251 (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & (EARLY_DEPTH_TEST_INFERRED)))) {
1252 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1253 out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1254 LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1255 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1256 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1257 smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1258
1259 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1260 }
1261 }
1262
1263 depth_ptr = depth_base_ptr;
1264 if (key->multisample) {
1265 LLVMValueRef sample_offset = LLVMBuildMul(builder, sample_loop_state.counter, depth_sample_stride, "");
1266 depth_ptr = LLVMBuildGEP2(builder, LLVMInt8TypeInContext(gallivm->context),
1267 depth_ptr, &sample_offset, 1, "");
1268 }
1269
1270 /* Late Z test */
1271 if (depth_mode & LATE_DEPTH_TEST) {
1272 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
1273 LLVMValueRef idx = loop_state.counter;
1274 if (key->min_samples > 1)
1275 idx = LLVMBuildAdd(builder, idx,
1276 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1277 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, z_out, &idx, 1, "");
1278 z = LLVMBuildLoad2(builder, vec_type, ptr, "output.z");
1279 } else {
1280 if (key->multisample) {
1281 lp_build_interp_soa_update_pos_dyn(interp, gallivm, loop_state.counter, key->multisample ? sample_loop_state.counter : NULL);
1282 z = interp->pos[2];
1283 }
1284 }
1285
1286 /*
1287 * Clamp according to ARB_depth_clamp semantics.
1288 */
1289 z = lp_build_depth_clamp(gallivm, builder, key->depth_clamp,
1290 key->restrict_depth_values, type,
1291 context_type, context_ptr,
1292 thread_data_type, thread_data_ptr, z);
1293
1294 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
1295 LLVMValueRef idx = loop_state.counter;
1296 if (key->min_samples > 1)
1297 idx = LLVMBuildAdd(builder, idx,
1298 LLVMBuildMul(builder, sample_loop_state.counter, num_loop, ""), "");
1299 LLVMValueRef ptr = LLVMBuildGEP2(builder, vec_type, s_out, &idx, 1, "");
1300 stencil_refs[0] = LLVMBuildLoad2(builder, vec_type, ptr, "output.s");
1301 /* there's only one value, and spec says to discard additional bits */
1302 LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
1303 stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
1304 stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
1305 stencil_refs[1] = stencil_refs[0];
1306 }
1307
1308 lp_build_depth_stencil_load_swizzled(gallivm, type,
1309 zs_format_desc, key->resource_1d,
1310 depth_ptr, depth_stride,
1311 &z_fb, &s_fb, loop_state.counter);
1312
1313 lp_build_depth_stencil_test(gallivm,
1314 &key->depth,
1315 key->stencil,
1316 type,
1317 zs_format_desc,
1318 key->multisample ? NULL : &mask,
1319 &s_mask,
1320 stencil_refs,
1321 z, z_fb, s_fb,
1322 facing,
1323 &z_value, &s_value,
1324 false,
1325 key->restrict_depth_values);
1326 /* Late Z write */
1327 if (depth_mode & LATE_DEPTH_WRITE) {
1328 lp_build_depth_stencil_write_swizzled(gallivm, type,
1329 zs_format_desc, key->resource_1d,
1330 NULL, NULL, NULL, loop_state.counter,
1331 depth_ptr, depth_stride,
1332 z_value, s_value);
1333 }
1334 } else if ((depth_mode & EARLY_DEPTH_TEST) &&
1335 (depth_mode & LATE_DEPTH_WRITE)) {
1336 /* Need to apply a reduced mask to the depth write. Reload the
1337 * depth value, update from zs_value with the new mask value and
1338 * write that out.
1339 */
1340 if (key->multisample) {
1341 z_value = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_sample_value_store, sample_loop_state.counter), z_type, "");
1342 s_value = lp_build_pointer_get2(builder, int_vec_type, s_sample_value_store, sample_loop_state.counter);
1343 z_fb = LLVMBuildBitCast(builder, lp_build_pointer_get2(builder, int_vec_type, z_fb_store, sample_loop_state.counter), z_fb_type, "");
1344 s_fb = lp_build_pointer_get2(builder, int_vec_type, s_fb_store, sample_loop_state.counter);
1345 }
1346 lp_build_depth_stencil_write_swizzled(gallivm, type,
1347 zs_format_desc, key->resource_1d,
1348 key->multisample ? s_mask : lp_build_mask_value(&mask), z_fb, s_fb, loop_state.counter,
1349 depth_ptr, depth_stride,
1350 z_value, s_value);
1351 }
1352
1353 if (key->occlusion_count && (!(depth_mode & EARLY_DEPTH_TEST) || (depth_mode & EARLY_DEPTH_TEST_INFERRED))) {
1354 LLVMValueRef counter = lp_jit_thread_data_vis_counter(gallivm, thread_data_type, thread_data_ptr);
1355 lp_build_name(counter, "counter");
1356
1357 lp_build_occlusion_count(gallivm, type,
1358 key->multisample ? s_mask : lp_build_mask_value(&mask), counter);
1359 }
1360
1361 /* if this is genuine early-depth in the shader, write the sample mask now,
1362 * after the occlusion count has been updated
1363 */
1364 if (key->multisample &&
1365 nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK) &&
1366 (depth_mode & (EARLY_DEPTH_TEST_INFERRED | EARLY_DEPTH_TEST)) == EARLY_DEPTH_TEST) {
1367 /* if the shader writes sample mask use that */
1368 LLVMValueRef out_smask_idx = LLVMBuildShl(builder, lp_build_const_int32(gallivm, 1), sample_loop_state.counter, "");
1369 out_smask_idx = lp_build_broadcast(gallivm, int_vec_type, out_smask_idx);
1370 LLVMValueRef output_smask = LLVMBuildLoad2(builder, int_vec_type, out_sample_mask_storage, "");
1371 LLVMValueRef smask_bit = LLVMBuildAnd(builder, output_smask, out_smask_idx, "");
1372 LLVMValueRef cmp = LLVMBuildICmp(builder, LLVMIntNE, smask_bit, lp_build_const_int_vec(gallivm, int_type, 0), "");
1373 smask_bit = LLVMBuildSExt(builder, cmp, int_vec_type, "");
1374
1375 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
1376 }
1377
1378
1379 if (key->multisample) {
1380 /* store the sample mask for this loop */
1381 LLVMBuildStore(builder, s_mask, s_mask_ptr);
1382 lp_build_for_loop_end(&sample_loop_state);
1383 }
1384
1385 mask_val = lp_build_mask_end(&mask);
1386 if (!key->multisample)
1387 LLVMBuildStore(builder, mask_val, mask_ptr);
1388 lp_build_for_loop_end(&loop_state);
1389 }
1390
1391
1392 /**
1393 * This function reorders pixels from the fragment shader's SoA layout to the
1394 * memory AoS layout.
1395 *
1396 * Fragment Shader outputs pixels in small 2x2 blocks
1397 * e.g. (0, 0), (1, 0), (0, 1), (1, 1) ; (2, 0) ...
1398 *
1399 * However in memory pixels are stored in rows
1400 * e.g. (0, 0), (1, 0), (2, 0), (3, 0) ; (0, 1) ...
1401 *
1402 * @param type fragment shader type (4x or 8x float)
1403 * @param num_fs number of fs_src
1405 * @param dst_channels number of output channels
1406 * @param fs_src output from fragment shader
1407 * @param dst pointer to store result
1408 * @param pad_inline is channel padding inline or at end of row
1409 * @return the number of dsts
1410 */
1411 static int
1412 generate_fs_twiddle(struct gallivm_state *gallivm,
1413 struct lp_type type,
1414 unsigned num_fs,
1415 unsigned dst_channels,
1416 LLVMValueRef fs_src[][4],
1417 LLVMValueRef* dst,
1418 bool pad_inline)
1419 {
1420 LLVMValueRef src[16];
1421 unsigned pixels = type.length / 4;
1422 unsigned src_channels = dst_channels < 3 ? dst_channels : 4;
1423 unsigned src_count = num_fs * src_channels;
1424
1425 assert(pixels == 2 || pixels == 1);
1426 assert(num_fs * src_channels <= ARRAY_SIZE(src));
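/*
 * pixels == 2 corresponds to an 8-wide FS vector, i.e. two 2x2 quads per
 * source vector; pixels == 1 is the plain 4-wide case.
 */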
1427
1428 /*
1429 * Transpose from SoA -> AoS
1430 */
1431 for (unsigned i = 0; i < num_fs; ++i) {
1432 lp_build_transpose_aos_n(gallivm, type, &fs_src[i][0], src_channels,
1433 &src[i * src_channels]);
1434 }
1435
1436 /*
1437 * Pick transformation options
1438 */
1439 bool swizzle_pad = false;
1440 bool twiddle = false;
1441 bool split = false;
1442 unsigned reorder_group = 0;
1443
1444 if (dst_channels == 1) {
1445 twiddle = true;
1446 if (pixels == 2) {
1447 split = true;
1448 }
1449 } else if (dst_channels == 2) {
1450 if (pixels == 1) {
1451 reorder_group = 1;
1452 }
1453 } else if (dst_channels > 2) {
1454 if (pixels == 1) {
1455 reorder_group = 2;
1456 } else {
1457 twiddle = true;
1458 }
1459
1460 if (!pad_inline && dst_channels == 3 && pixels > 1) {
1461 swizzle_pad = true;
1462 }
1463 }
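/*
 * As an illustration (not exhaustive): an 8-wide FS vector (pixels == 2)
 * writing a 4-channel format takes the twiddle path with no reorder_group,
 * while a 4-wide FS vector (pixels == 1) writing the same format only needs
 * reorder_group = 2.
 */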
1464
1465 /*
1466 * Split the src in half
1467 */
1468 if (split) {
1469 for (unsigned i = num_fs; i > 0; --i) {
1470 src[(i - 1)*2 + 1] = lp_build_extract_range(gallivm, src[i - 1], 4, 4);
1471 src[(i - 1)*2 + 0] = lp_build_extract_range(gallivm, src[i - 1], 0, 4);
1472 }
1473
1474 src_count *= 2;
1475 type.length = 4;
1476 }
1477
1478 /*
1479 * Ensure pixels are in memory order
1480 */
1481 if (reorder_group) {
1482 /* Twiddle pixels by reordering the array, e.g.:
1483 *
1484 * src_count = 8 -> 0 2 1 3 4 6 5 7
1485 * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
1486 */
1487 const unsigned reorder_sw[] = { 0, 2, 1, 3 };
1488
1489 for (unsigned i = 0; i < src_count; ++i) {
1490 unsigned group = i / reorder_group;
1491 unsigned block = (group / 4) * 4 * reorder_group;
1492 unsigned j = block + (reorder_sw[group % 4] * reorder_group) + (i % reorder_group);
1493 dst[i] = src[j];
1494 }
1495 } else if (twiddle) {
1496 /* Twiddle pixels across elements of array */
1497 /*
1498 * XXX: we should avoid this in some cases, but would need to tell
1499 * lp_build_conv to reorder (or deal with it ourselves).
1500 */
1501 lp_bld_quad_twiddle(gallivm, type, src, src_count, dst);
1502 } else {
1503 /* Do nothing */
1504 memcpy(dst, src, sizeof(LLVMValueRef) * src_count);
1505 }
1506
1507 /*
1508 * Moves any padding between pixels to the end
1509 * e.g. RGBXRGBX -> RGBRGBXX
1510 */
1511 if (swizzle_pad) {
1512 unsigned char swizzles[16];
1513 unsigned elems = pixels * dst_channels;
1514
1515 for (unsigned i = 0; i < type.length; ++i) {
1516 if (i < elems)
1517 swizzles[i] = i % dst_channels + (i / dst_channels) * 4;
1518 else
1519 swizzles[i] = LP_BLD_SWIZZLE_DONTCARE;
1520 }
1521
1522 for (unsigned i = 0; i < src_count; ++i) {
1523 dst[i] = lp_build_swizzle_aos_n(gallivm, dst[i], swizzles,
1524 type.length, type.length);
1525 }
1526 }
1527
1528 return src_count;
1529 }
1530
1531
1532 /*
1533 * Untwiddle and transpose, much like the above.
1534 * However, this is after conversion, so we get packed vectors.
1535 * At this time we only handle 4x16i8 rgba / 2x16i8 rg / 1x16i8 r data;
1536 * the vectors will look like:
1537 * r0r1r4r5r2r3r6r7r8r9r12... (albeit color channels may
1538 * be swizzled here). Extending to 16bit should be trivial.
1539 * Should also be extended to handle twice wide vectors with AVX2...
1540 */
1541 static void
1542 fs_twiddle_transpose(struct gallivm_state *gallivm,
1543 struct lp_type type,
1544 LLVMValueRef *src,
1545 unsigned src_count,
1546 LLVMValueRef *dst)
1547 {
1548 struct lp_type type64, type16, type32;
1549 LLVMTypeRef type64_t, type8_t, type16_t, type32_t;
1550 LLVMBuilderRef builder = gallivm->builder;
1551 LLVMValueRef tmp[4], shuf[8];
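/*
 * The loop below builds the constant shuffle pattern {0, 2, 1, 3, 4, 6, 5, 7}:
 * once the bytes are viewed as wider elements, swapping the two middle lanes
 * of each group of four restores linear pixel order
 * (e.g. r0r1 r4r5 r2r3 r6r7 -> r0r1 r2r3 r4r5 r6r7).
 */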
1552 for (unsigned j = 0; j < 2; j++) {
1553 shuf[j*4 + 0] = lp_build_const_int32(gallivm, j*4 + 0);
1554 shuf[j*4 + 1] = lp_build_const_int32(gallivm, j*4 + 2);
1555 shuf[j*4 + 2] = lp_build_const_int32(gallivm, j*4 + 1);
1556 shuf[j*4 + 3] = lp_build_const_int32(gallivm, j*4 + 3);
1557 }
1558
1559 assert(src_count == 4 || src_count == 2 || src_count == 1);
1560 assert(type.width == 8);
1561 assert(type.length == 16);
1562
1563 type8_t = lp_build_vec_type(gallivm, type);
1564
1565 type64 = type;
1566 type64.length /= 8;
1567 type64.width *= 8;
1568 type64_t = lp_build_vec_type(gallivm, type64);
1569
1570 type16 = type;
1571 type16.length /= 2;
1572 type16.width *= 2;
1573 type16_t = lp_build_vec_type(gallivm, type16);
1574
1575 type32 = type;
1576 type32.length /= 4;
1577 type32.width *= 4;
1578 type32_t = lp_build_vec_type(gallivm, type32);
1579
1580 lp_build_transpose_aos_n(gallivm, type, src, src_count, tmp);
1581
1582 if (src_count == 1) {
1583 /* transpose was no-op, just untwiddle */
1584 LLVMValueRef shuf_vec;
1585 shuf_vec = LLVMConstVector(shuf, 8);
1586 tmp[0] = LLVMBuildBitCast(builder, src[0], type16_t, "");
1587 tmp[0] = LLVMBuildShuffleVector(builder, tmp[0], tmp[0], shuf_vec, "");
1588 dst[0] = LLVMBuildBitCast(builder, tmp[0], type8_t, "");
1589 } else if (src_count == 2) {
1590 LLVMValueRef shuf_vec;
1591 shuf_vec = LLVMConstVector(shuf, 4);
1592
1593 for (unsigned i = 0; i < 2; i++) {
1594 tmp[i] = LLVMBuildBitCast(builder, tmp[i], type32_t, "");
1595 tmp[i] = LLVMBuildShuffleVector(builder, tmp[i], tmp[i], shuf_vec, "");
1596 dst[i] = LLVMBuildBitCast(builder, tmp[i], type8_t, "");
1597 }
1598 } else {
1599 for (unsigned j = 0; j < 2; j++) {
1600 LLVMValueRef lo, hi, lo2, hi2;
1601 /*
1602 * Note that if we only really have 3 valid channels (rgb)
1603 * and we don't need alpha we could substitute an undef here
1604 * for the respective channel (causing llvm to drop conversion
1605 * for alpha).
1606 */
1607 /* we now have rgba0rgba1rgba4rgba5 etc, untwiddle */
1608 lo2 = LLVMBuildBitCast(builder, tmp[j*2], type64_t, "");
1609 hi2 = LLVMBuildBitCast(builder, tmp[j*2 + 1], type64_t, "");
1610 lo = lp_build_interleave2(gallivm, type64, lo2, hi2, 0);
1611 hi = lp_build_interleave2(gallivm, type64, lo2, hi2, 1);
1612 dst[j*2] = LLVMBuildBitCast(builder, lo, type8_t, "");
1613 dst[j*2 + 1] = LLVMBuildBitCast(builder, hi, type8_t, "");
1614 }
1615 }
1616 }
1617
1618
1619 /**
1620 * Load an unswizzled block of pixels from memory
1621 */
1622 static void
1623 load_unswizzled_block(struct gallivm_state *gallivm,
1624 LLVMTypeRef base_type,
1625 LLVMValueRef base_ptr,
1626 LLVMValueRef stride,
1627 unsigned block_width,
1628 unsigned block_height,
1629 LLVMValueRef* dst,
1630 struct lp_type dst_type,
1631 unsigned dst_count,
1632 unsigned dst_alignment)
1633 {
1634 LLVMBuilderRef builder = gallivm->builder;
1635 const unsigned row_size = dst_count / block_height;
1636
1637 /* Ensure block exactly fits into dst */
1638 assert((block_width * block_height) % dst_count == 0);
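/*
 * Illustrative case: loading a 4x4 block as four row-sized vectors gives
 * row_size == 1, so each iteration simply reads one row at
 * base_ptr + y * stride.
 */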
1639
1640 for (unsigned i = 0; i < dst_count; ++i) {
1641 unsigned x = i % row_size;
1642 unsigned y = i / row_size;
1643
1644 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (dst_type.width / 8) * dst_type.length);
1645 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1646
1647 LLVMValueRef gep[2];
1648 LLVMValueRef dst_ptr;
1649
1650 gep[0] = lp_build_const_int32(gallivm, 0);
1651 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1652
1653 dst_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1654 dst_ptr = LLVMBuildBitCast(builder, dst_ptr,
1655 LLVMPointerType(lp_build_vec_type(gallivm, dst_type), 0), "");
1656
1657 dst[i] = LLVMBuildLoad2(builder,
1658 lp_build_vec_type(gallivm, dst_type),
1659 dst_ptr, "");
1660
1661 LLVMSetAlignment(dst[i], dst_alignment);
1662 }
1663 }
1664
1665
1666 /**
1667 * Store an unswizzled block of pixels to memory
1668 */
1669 static void
1670 store_unswizzled_block(struct gallivm_state *gallivm,
1671 LLVMTypeRef base_type,
1672 LLVMValueRef base_ptr,
1673 LLVMValueRef stride,
1674 unsigned block_width,
1675 unsigned block_height,
1676 LLVMValueRef src[], // [src_count]
1677 struct lp_type src_type,
1678 unsigned src_count,
1679 unsigned src_alignment)
1680 {
1681 LLVMBuilderRef builder = gallivm->builder;
1682 const unsigned row_size = src_count / block_height;
1683
1684 /* Ensure src exactly fits into block */
1685 assert((block_width * block_height) % src_count == 0);
1686
1687 for (unsigned i = 0; i < src_count; ++i) {
1688 unsigned x = i % row_size;
1689 unsigned y = i / row_size;
1690
1691 LLVMValueRef bx = lp_build_const_int32(gallivm, x * (src_type.width / 8) * src_type.length);
1692 LLVMValueRef by = LLVMBuildMul(builder, lp_build_const_int32(gallivm, y), stride, "");
1693
1694 LLVMValueRef gep[2];
1695 LLVMValueRef src_ptr;
1696
1697 gep[0] = lp_build_const_int32(gallivm, 0);
1698 gep[1] = LLVMBuildAdd(builder, bx, by, "");
1699
1700 src_ptr = LLVMBuildGEP2(builder, base_type, base_ptr, gep, 2, "");
1701 src_ptr = LLVMBuildBitCast(builder, src_ptr,
1702 LLVMPointerType(lp_build_vec_type(gallivm, src_type), 0), "");
1703
1704 src_ptr = LLVMBuildStore(builder, src[i], src_ptr);
1705
1706 LLVMSetAlignment(src_ptr, src_alignment);
1707 }
1708 }
1709
1710
1711
1712 /**
1713 * Retrieves the type for a format which is usable in the blending code.
1714 *
1715 * e.g. RGBA16F = 4x float, R3G3B2 = 3x byte
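*
* For example, a 4x8-bit unorm format such as PIPE_FORMAT_B8G8R8A8_UNORM
* (chosen purely for illustration) maps to an unsigned, normalized,
* 8-bit x 4 type.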
1716 */
1717 static inline void
1718 lp_blend_type_from_format_desc(const struct util_format_description *format_desc,
1719 struct lp_type* type)
1720 {
1721 if (format_expands_to_float_soa(format_desc)) {
1722 /* always use ordinary floats for blending */
1723 type->floating = true;
1724 type->fixed = false;
1725 type->sign = true;
1726 type->norm = false;
1727 type->width = 32;
1728 type->length = 4;
1729 return;
1730 }
1731
1732 const int chan = util_format_get_first_non_void_channel(format_desc->format);
1733
1734 memset(type, 0, sizeof(struct lp_type));
1735 type->floating = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FLOAT;
1736 type->fixed = format_desc->channel[chan].type == UTIL_FORMAT_TYPE_FIXED;
1737 type->sign = format_desc->channel[chan].type != UTIL_FORMAT_TYPE_UNSIGNED;
1738 type->norm = format_desc->channel[chan].normalized;
1739 type->width = format_desc->channel[chan].size;
1740 type->length = format_desc->nr_channels;
1741
1742 for (unsigned i = 1; i < format_desc->nr_channels; ++i) {
1743 if (format_desc->channel[i].size > type->width)
1744 type->width = format_desc->channel[i].size;
1745 }
1746
1747 if (type->floating) {
1748 type->width = 32;
1749 } else {
1750 if (type->width <= 8) {
1751 type->width = 8;
1752 } else if (type->width <= 16) {
1753 type->width = 16;
1754 } else {
1755 type->width = 32;
1756 }
1757 }
1758
1759 if (is_arithmetic_format(format_desc) && type->length == 3) {
1760 type->length = 4;
1761 }
1762 }
1763
1764
1765 /**
1766 * Scale a normalized value from src_bits to dst_bits.
1767 *
1768 * The exact calculation is
1769 *
1770 * dst = iround(src * dst_mask / src_mask)
1771 *
1772 * or with integer rounding
1773 *
1774 * dst = src * (2*dst_mask + sign(src)*src_mask) / (2*src_mask)
1775 *
1776 * where
1777 *
1778 * src_mask = (1 << src_bits) - 1
1779 * dst_mask = (1 << dst_bits) - 1
1780 *
1781 * but we try to avoid division and multiplication through shifts.
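*
* For illustration (assuming src_bits = 8 and dst_bits = 4): the exact result
* for src = 30 is iround(30 * 15 / 255) = 2, whereas the single-shift
* approximation used below yields 30 >> 4 = 1, which is why that path is
* flagged as having the wrong rounding.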
1782 */
1783 static inline LLVMValueRef
1784 scale_bits(struct gallivm_state *gallivm,
1785 int src_bits,
1786 int dst_bits,
1787 LLVMValueRef src,
1788 struct lp_type src_type)
1789 {
1790 LLVMBuilderRef builder = gallivm->builder;
1791 LLVMValueRef result = src;
1792
1793 if (dst_bits < src_bits) {
1794 int delta_bits = src_bits - dst_bits;
1795
1796 if (delta_bits <= dst_bits) {
1797
1798 if (dst_bits == 4) {
1799 struct lp_type flt_type =
1800 lp_type_float_vec(32, src_type.length * 32);
1801
1802 result = lp_build_unsigned_norm_to_float(gallivm, src_bits,
1803 flt_type, src);
1804 result = lp_build_clamped_float_to_unsigned_norm(gallivm, flt_type,
1805 dst_bits, result);
1806 result = LLVMBuildTrunc(gallivm->builder, result,
1807 lp_build_int_vec_type(gallivm, src_type),
1808 "");
1809 return result;
1810 }
1811
1812 /*
1813 * Approximate the rescaling with a single shift.
1814 *
1815 * This gives the wrong rounding.
1816 */
1817
1818 result = LLVMBuildLShr(builder, src,
1819 lp_build_const_int_vec(gallivm, src_type,
1820 delta_bits),
1821 "");
1822 } else {
1823 /*
1824 * Try more accurate rescaling.
1825 */
1826
1827 /*
1828 * Drop the least significant bits to make space for the
1829 * multiplication.
1830 *
1831 * XXX: A better approach would be to use a wider integer type as
1832 * intermediate. But this is enough to convert alpha from 16bits ->
1833 * 2 when rendering to PIPE_FORMAT_R10G10B10A2_UNORM.
1834 */
1835 result = LLVMBuildLShr(builder, src,
1836 lp_build_const_int_vec(gallivm, src_type,
1837 dst_bits),
1838 "");
1839
1840
1841 result = LLVMBuildMul(builder, result,
1842 lp_build_const_int_vec(gallivm, src_type,
1843 (1LL << dst_bits) - 1),
1844 "");
1845
1846 /*
1847 * Add a rounding term before the division.
1848 *
1849 * TODO: Handle signed integers too.
1850 */
1851 if (!src_type.sign) {
1852 result = LLVMBuildAdd(builder, result,
1853 lp_build_const_int_vec(gallivm, src_type,
1854 (1LL << (delta_bits - 1))),
1855 "");
1856 }
1857
1858 /*
1859 * Approximate the division by src_mask with a src_bits shift.
1860 *
1861 * Given the src has already been shifted by dst_bits, all we need
1862 * to do is to shift by the difference.
1863 */
1864
1865 result = LLVMBuildLShr(builder,
1866 result,
1867 lp_build_const_int_vec(gallivm, src_type, delta_bits),
1868 "");
1869 }
1870
1871 } else if (dst_bits > src_bits) {
1872 /* Scale up bits */
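/*
 * This works by bit replication, e.g. widening the 4-bit value 0xA to 8 bits
 * gives (0xA << 4) | 0xA = 0xAA, matching the exact rescaling
 * round(10 * 255 / 15) = 170.
 */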
1873 int db = dst_bits - src_bits;
1874
1875 /* Shift left by difference in bits */
1876 result = LLVMBuildShl(builder,
1877 src,
1878 lp_build_const_int_vec(gallivm, src_type, db),
1879 "");
1880
1881 if (db <= src_bits) {
1882 /* Enough bits in src to fill the remainder */
1883 LLVMValueRef lower = LLVMBuildLShr(builder,
1884 src,
1885 lp_build_const_int_vec(gallivm, src_type, src_bits - db),
1886 "");
1887
1888 result = LLVMBuildOr(builder, result, lower, "");
1889 } else if (db > src_bits) {
1890 /* Need to repeatedly copy src bits to fill remainder in dst */
1891 unsigned n;
1892
1893 for (n = src_bits; n < dst_bits; n *= 2) {
1894 LLVMValueRef shuv = lp_build_const_int_vec(gallivm, src_type, n);
1895
1896 result = LLVMBuildOr(builder,
1897 result,
1898 LLVMBuildLShr(builder, result, shuv, ""),
1899 "");
1900 }
1901 }
1902 }
1903
1904 return result;
1905 }
1906
1907 /**
1908 * If RT is a smallfloat (needing denorms) format
1909 */
1910 static inline int
1911 have_smallfloat_format(struct lp_type dst_type,
1912 enum pipe_format format)
1913 {
1914 return ((dst_type.floating && dst_type.width != 32) ||
1915 /* due to format handling hacks this format doesn't have floating set
1916 * here (and actually has width set to 32 too) so special case this.
1917 */
1918 (format == PIPE_FORMAT_R11G11B10_FLOAT));
1919 }
1920
1921
1922 /**
1923 * Convert from memory format to blending format
1924 *
1925 * e.g. GL_R3G3B2 is 1 byte in memory but 3 bytes for blending
1926 */
1927 static void
1928 convert_to_blend_type(struct gallivm_state *gallivm,
1929 unsigned block_size,
1930 const struct util_format_description *src_fmt,
1931 struct lp_type src_type,
1932 struct lp_type dst_type,
1933 LLVMValueRef* src, // and dst
1934 unsigned num_srcs)
1935 {
1936 LLVMValueRef *dst = src;
1937 LLVMBuilderRef builder = gallivm->builder;
1938 struct lp_type blend_type;
1939 struct lp_type mem_type;
1940 unsigned i, j;
1941 unsigned pixels = block_size / num_srcs;
1942 bool is_arith;
1943
1944 /*
1945 * full custom path for packed floats and srgb formats - none of the later
1946 * functions would do anything useful, and given the lp_type representation
1947 * they can't be fixed. Should really have some SoA blend path for these
1948 * kinds of formats rather than hacking them in here.
1949 */
1950 if (format_expands_to_float_soa(src_fmt)) {
1951 LLVMValueRef tmpsrc[4];
1952 /*
1953 * This is pretty suboptimal for this case; blending in SoA would be much
1954 * better, since conversion gets us SoA values which we then need to convert back.
1955 */
1956 assert(src_type.width == 32 || src_type.width == 16);
1957 assert(dst_type.floating);
1958 assert(dst_type.width == 32);
1959 assert(dst_type.length % 4 == 0);
1960 assert(num_srcs % 4 == 0);
1961
1962 if (src_type.width == 16) {
1963 /* expand 4x16bit values to 4x32bit */
1964 struct lp_type type32x4 = src_type;
1965 LLVMTypeRef ltype32x4;
1966 unsigned num_fetch = dst_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
1967 type32x4.width = 32;
1968 ltype32x4 = lp_build_vec_type(gallivm, type32x4);
1969 for (i = 0; i < num_fetch; i++) {
1970 src[i] = LLVMBuildZExt(builder, src[i], ltype32x4, "");
1971 }
1972 src_type.width = 32;
1973 }
1974 for (i = 0; i < 4; i++) {
1975 tmpsrc[i] = src[i];
1976 }
1977 for (i = 0; i < num_srcs / 4; i++) {
1978 LLVMValueRef tmpsoa[4];
1979 LLVMValueRef tmps = tmpsrc[i];
1980 if (dst_type.length == 8) {
1981 LLVMValueRef shuffles[8];
1982 unsigned j;
1983 /* fetch was 4 values but need 8-wide output values */
1984 tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
1985 /*
1986 * for 8-wide aos the transpose would give us the wrong order, not matching
1987 * the incoming converted fs values and mask. ARGH.
1988 */
1989 for (j = 0; j < 4; j++) {
1990 shuffles[j] = lp_build_const_int32(gallivm, j * 2);
1991 shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
1992 }
1993 tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
1994 LLVMConstVector(shuffles, 8), "");
1995 }
1996 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
1997 lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
1998 } else {
1999 lp_build_unpack_rgba_soa(gallivm, src_fmt, dst_type, tmps, tmpsoa);
2000 }
2001 lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
2002 }
2003 return;
2004 }
2005
2006 lp_mem_type_from_format_desc(src_fmt, &mem_type);
2007 lp_blend_type_from_format_desc(src_fmt, &blend_type);
2008
2009 /* Is the format arithmetic */
2010 is_arith = blend_type.length * blend_type.width != mem_type.width * mem_type.length;
2011 is_arith &= !(mem_type.width == 16 && mem_type.floating);
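/*
 * For instance PIPE_FORMAT_R10G10B10A2_UNORM is arithmetic here: its
 * 10/10/10/2 bit channels have to be widened to 16 bits each for blending,
 * so the bit extraction/scaling loop below is needed.
 */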
2012
2013 /* Pad if necessary */
2014 if (!is_arith && src_type.length < dst_type.length) {
2015 for (i = 0; i < num_srcs; ++i) {
2016 dst[i] = lp_build_pad_vector(gallivm, src[i], dst_type.length);
2017 }
2018
2019 src_type.length = dst_type.length;
2020 }
2021
2022 /* Special case for half-floats */
2023 if (mem_type.width == 16 && mem_type.floating) {
2024 assert(blend_type.width == 32 && blend_type.floating);
2025 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2026 is_arith = false;
2027 }
2028
2029 if (!is_arith) {
2030 return;
2031 }
2032
2033 src_type.width = blend_type.width * blend_type.length;
2034 blend_type.length *= pixels;
2035 src_type.length *= pixels / (src_type.length / mem_type.length);
2036
2037 for (i = 0; i < num_srcs; ++i) {
2038 LLVMValueRef chans;
2039 LLVMValueRef res = NULL;
2040
2041 dst[i] = LLVMBuildZExt(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2042
2043 for (j = 0; j < src_fmt->nr_channels; ++j) {
2044 unsigned mask = 0;
2045 unsigned sa = src_fmt->channel[j].shift;
2046 #if UTIL_ARCH_LITTLE_ENDIAN
2047 unsigned from_lsb = j;
2048 #else
2049 unsigned from_lsb = (blend_type.length / pixels) - j - 1;
2050 #endif
2051
2052 mask = (1 << src_fmt->channel[j].size) - 1;
2053
2054 /* Extract bits from source */
2055 chans = LLVMBuildLShr(builder,
2056 dst[i],
2057 lp_build_const_int_vec(gallivm, src_type, sa),
2058 "");
2059
2060 chans = LLVMBuildAnd(builder,
2061 chans,
2062 lp_build_const_int_vec(gallivm, src_type, mask),
2063 "");
2064
2065 /* Scale bits */
2066 if (src_type.norm) {
2067 chans = scale_bits(gallivm, src_fmt->channel[j].size,
2068 blend_type.width, chans, src_type);
2069 }
2070
2071 /* Insert bits into correct position */
2072 chans = LLVMBuildShl(builder,
2073 chans,
2074 lp_build_const_int_vec(gallivm, src_type, from_lsb * blend_type.width),
2075 "");
2076
2077 if (j == 0) {
2078 res = chans;
2079 } else {
2080 res = LLVMBuildOr(builder, res, chans, "");
2081 }
2082 }
2083
2084 dst[i] = LLVMBuildBitCast(builder, res, lp_build_vec_type(gallivm, blend_type), "");
2085 }
2086 }
2087
2088
2089 /**
2090 * Convert from blending format to memory format
2091 *
2092 * e.g. GL_R3G3B2 is 3 bytes for blending but 1 byte in memory
2093 */
2094 static void
2095 convert_from_blend_type(struct gallivm_state *gallivm,
2096 unsigned block_size,
2097 const struct util_format_description *src_fmt,
2098 struct lp_type src_type,
2099 struct lp_type dst_type,
2100 LLVMValueRef* src, // and dst
2101 unsigned num_srcs)
2102 {
2103 LLVMValueRef* dst = src;
2104 unsigned i, j, k;
2105 struct lp_type mem_type;
2106 struct lp_type blend_type;
2107 LLVMBuilderRef builder = gallivm->builder;
2108 unsigned pixels = block_size / num_srcs;
2109 bool is_arith;
2110
2111 /*
2112 * full custom path for packed floats and srgb formats - none of the later
2113 * functions would do anything useful, and given the lp_type representation
2114 * they can't be fixed. Should really have some SoA blend path for these
2115 * kinds of formats rather than hacking them in here.
2116 */
2117 if (format_expands_to_float_soa(src_fmt)) {
2118 /*
2119 * This is pretty suboptimal for this case; blending in SoA would be much
2120 * better - we need to transpose the AoS values back to SoA values for
2121 * conversion/packing.
2122 */
2123 assert(src_type.floating);
2124 assert(src_type.width == 32);
2125 assert(src_type.length % 4 == 0);
2126 assert(dst_type.width == 32 || dst_type.width == 16);
2127
2128 for (i = 0; i < num_srcs / 4; i++) {
2129 LLVMValueRef tmpsoa[4], tmpdst;
2130 lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
2131 /* really really need SoA here */
2132
2133 if (src_fmt->format == PIPE_FORMAT_R11G11B10_FLOAT) {
2134 tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
2135 } else {
2136 tmpdst = lp_build_float_to_srgb_packed(gallivm, src_fmt,
2137 src_type, tmpsoa);
2138 }
2139
2140 if (src_type.length == 8) {
2141 LLVMValueRef tmpaos, shuffles[8];
2142 unsigned j;
2143 /*
2144 * for 8-wide aos the transpose has given us the wrong order, not matching
2145 * the output order. HMPF. Also need to split the output values
2146 * manually.
2147 */
2148 for (j = 0; j < 4; j++) {
2149 shuffles[j * 2] = lp_build_const_int32(gallivm, j);
2150 shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
2151 }
2152 tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
2153 LLVMConstVector(shuffles, 8), "");
2154 src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
2155 src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
2156 } else {
2157 src[i] = tmpdst;
2158 }
2159 }
2160 if (dst_type.width == 16) {
2161 struct lp_type type16x8 = dst_type;
2162 struct lp_type type32x4 = dst_type;
2163 LLVMTypeRef ltype16x4, ltypei64, ltypei128;
2164 unsigned num_fetch = src_type.length == 8 ? num_srcs / 2 : num_srcs / 4;
2165 type16x8.length = 8;
2166 type32x4.width = 32;
2167 ltypei128 = LLVMIntTypeInContext(gallivm->context, 128);
2168 ltypei64 = LLVMIntTypeInContext(gallivm->context, 64);
2169 ltype16x4 = lp_build_vec_type(gallivm, dst_type);
2170 /* We could do vector truncation but it doesn't generate very good code */
2171 for (i = 0; i < num_fetch; i++) {
2172 src[i] = lp_build_pack2(gallivm, type32x4, type16x8,
2173 src[i], lp_build_zero(gallivm, type32x4));
2174 src[i] = LLVMBuildBitCast(builder, src[i], ltypei128, "");
2175 src[i] = LLVMBuildTrunc(builder, src[i], ltypei64, "");
2176 src[i] = LLVMBuildBitCast(builder, src[i], ltype16x4, "");
2177 }
2178 }
2179 return;
2180 }
2181
2182 lp_mem_type_from_format_desc(src_fmt, &mem_type);
2183 lp_blend_type_from_format_desc(src_fmt, &blend_type);
2184
2185 is_arith = (blend_type.length * blend_type.width != mem_type.width * mem_type.length);
2186
2187 /* Special case for half-floats */
2188 if (mem_type.width == 16 && mem_type.floating) {
2189 int length = dst_type.length;
2190 assert(blend_type.width == 32 && blend_type.floating);
2191
2192 dst_type.length = src_type.length;
2193
2194 lp_build_conv_auto(gallivm, src_type, &dst_type, dst, num_srcs, dst);
2195
2196 dst_type.length = length;
2197 is_arith = false;
2198 }
2199
2200 /* Remove any padding */
2201 if (!is_arith && (src_type.length % mem_type.length)) {
2202 src_type.length -= (src_type.length % mem_type.length);
2203
2204 for (i = 0; i < num_srcs; ++i) {
2205 dst[i] = lp_build_extract_range(gallivm, dst[i], 0, src_type.length);
2206 }
2207 }
2208
2209 /* No bit arithmetic to do */
2210 if (!is_arith) {
2211 return;
2212 }
2213
2214 src_type.length = pixels;
2215 src_type.width = blend_type.length * blend_type.width;
2216 dst_type.length = pixels;
2217
2218 for (i = 0; i < num_srcs; ++i) {
2219 LLVMValueRef chans;
2220 LLVMValueRef res = NULL;
2221
2222 dst[i] = LLVMBuildBitCast(builder, src[i], lp_build_vec_type(gallivm, src_type), "");
2223
2224 for (j = 0; j < src_fmt->nr_channels; ++j) {
2225 unsigned mask = 0;
2226 unsigned sa = src_fmt->channel[j].shift;
2227 unsigned sz_a = src_fmt->channel[j].size;
2228 #if UTIL_ARCH_LITTLE_ENDIAN
2229 unsigned from_lsb = j;
2230 #else
2231 unsigned from_lsb = blend_type.length - j - 1;
2232 #endif
2233
2234 assert(blend_type.width > src_fmt->channel[j].size);
2235
2236 for (k = 0; k < blend_type.width; ++k) {
2237 mask |= 1 << k;
2238 }
2239
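/*
 * The mask deliberately covers the full blend_type.width bits; the value is
 * brought back into the real channel size just below, either by scale_bits()
 * for normalized formats or by the saturating select for plain integers.
 */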
2240 /* Extract bits */
2241 chans = LLVMBuildLShr(builder,
2242 dst[i],
2243 lp_build_const_int_vec(gallivm, src_type,
2244 from_lsb * blend_type.width),
2245 "");
2246
2247 chans = LLVMBuildAnd(builder,
2248 chans,
2249 lp_build_const_int_vec(gallivm, src_type, mask),
2250 "");
2251
2252 /* Scale down bits */
2253 if (src_type.norm) {
2254 chans = scale_bits(gallivm, blend_type.width,
2255 src_fmt->channel[j].size, chans, src_type);
2256 } else if (!src_type.floating && sz_a < blend_type.width) {
2257 LLVMValueRef mask_val = lp_build_const_int_vec(gallivm, src_type, (1UL << sz_a) - 1);
2258 LLVMValueRef mask = LLVMBuildICmp(builder, LLVMIntUGT, chans, mask_val, "");
2259 chans = LLVMBuildSelect(builder, mask, mask_val, chans, "");
2260 }
2261
2262 /* Insert bits */
2263 chans = LLVMBuildShl(builder,
2264 chans,
2265 lp_build_const_int_vec(gallivm, src_type, sa),
2266 "");
2267
2268 sa += src_fmt->channel[j].size;
2269
2270 if (j == 0) {
2271 res = chans;
2272 } else {
2273 res = LLVMBuildOr(builder, res, chans, "");
2274 }
2275 }
2276
2277 assert (dst_type.width != 24);
2278
2279 dst[i] = LLVMBuildTrunc(builder, res, lp_build_vec_type(gallivm, dst_type), "");
2280 }
2281 }
2282
2283
2284 /**
2285 * Convert alpha to same blend type as src
2286 */
2287 static void
2288 convert_alpha(struct gallivm_state *gallivm,
2289 struct lp_type row_type,
2290 struct lp_type alpha_type,
2291 const unsigned block_size,
2292 const unsigned block_height,
2293 const unsigned src_count,
2294 const unsigned dst_channels,
2295 const bool pad_inline,
2296 LLVMValueRef* src_alpha)
2297 {
2298 LLVMBuilderRef builder = gallivm->builder;
2299 const unsigned length = row_type.length;
2300 row_type.length = alpha_type.length;
2301
2302 /* Twiddle the alpha to match pixels */
2303 lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, block_height, src_alpha);
2304
2305 /*
2306 * TODO this should use single lp_build_conv call for
2307 * src_count == 1 && dst_channels == 1 case (dropping the concat below)
2308 */
2309 for (unsigned i = 0; i < block_height; ++i) {
2310 lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1,
2311 &src_alpha[i], 1);
2312 }
2313
2314 alpha_type = row_type;
2315 row_type.length = length;
2316
2317 /* If there is only one channel we only need the single alpha value per pixel */
2318 if (src_count == 1 && dst_channels == 1) {
2319 lp_build_concat_n(gallivm, alpha_type, src_alpha, block_height,
2320 src_alpha, src_count);
2321 } else {
2322 /* If there are more srcs than rows then we need to split alpha up */
2323 if (src_count > block_height) {
2324 for (unsigned i = src_count; i > 0; --i) {
2325 unsigned pixels = block_size / src_count;
2326 unsigned idx = i - 1;
2327
2328 src_alpha[idx] =
2329 lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
2330 (idx * pixels) % 4, pixels);
2331 }
2332 }
2333
2334 /* If there is a src for each pixel, broadcast the alpha across the whole
2335 * row
2336 */
2337 if (src_count == block_size) {
2338 for (unsigned i = 0; i < src_count; ++i) {
2339 src_alpha[i] = lp_build_broadcast(gallivm,
2340 lp_build_vec_type(gallivm, row_type), src_alpha[i]);
2341 }
2342 } else {
2343 unsigned pixels = block_size / src_count;
2344 unsigned channels = pad_inline ? TGSI_NUM_CHANNELS : dst_channels;
2345 unsigned alpha_span = 1;
2346 LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
2347
2348 /* Check if we need 2 src_alphas for our shuffles */
2349 if (pixels > alpha_type.length) {
2350 alpha_span = 2;
2351 }
2352
2353 /* Broadcast alpha across all channels, e.g. a1a2 to a1a1a1a1a2a2a2a2 */
2354 for (unsigned j = 0; j < row_type.length; ++j) {
2355 if (j < pixels * channels) {
2356 shuffles[j] = lp_build_const_int32(gallivm, j / channels);
2357 } else {
2358 shuffles[j] = LLVMGetUndef(LLVMInt32TypeInContext(gallivm->context));
2359 }
2360 }
2361
2362 for (unsigned i = 0; i < src_count; ++i) {
2363 unsigned idx1 = i, idx2 = i;
2364
2365 if (alpha_span > 1){
2366 idx1 *= alpha_span;
2367 idx2 = idx1 + 1;
2368 }
2369
2370 src_alpha[i] = LLVMBuildShuffleVector(builder,
2371 src_alpha[idx1],
2372 src_alpha[idx2],
2373 LLVMConstVector(shuffles, row_type.length),
2374 "");
2375 }
2376 }
2377 }
2378 }
2379
2380
2381 /**
2382 * Generates the blend function for unswizzled colour buffers.
2383 * Also generates the read & write from the colour buffer.
2384 */
2385 static void
2386 generate_unswizzled_blend(struct gallivm_state *gallivm,
2387 unsigned rt,
2388 struct lp_fragment_shader_variant *variant,
2389 enum pipe_format out_format,
2390 unsigned int num_fs,
2391 struct lp_type fs_type,
2392 LLVMValueRef* fs_mask,
2393 LLVMValueRef fs_out_color[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][4],
2394 LLVMTypeRef context_type,
2395 LLVMValueRef context_ptr,
2396 LLVMTypeRef color_type,
2397 LLVMValueRef color_ptr,
2398 LLVMValueRef stride,
2399 unsigned partial_mask,
2400 bool do_branch)
2401 {
2402 const unsigned alpha_channel = 3;
2403 const unsigned block_width = LP_RASTER_BLOCK_SIZE;
2404 const unsigned block_height = LP_RASTER_BLOCK_SIZE;
2405 const unsigned block_size = block_width * block_height;
2406 const unsigned lp_integer_vector_width = 128;
2407
2408 LLVMBuilderRef builder = gallivm->builder;
2409 LLVMValueRef fs_src[4][TGSI_NUM_CHANNELS];
2410 LLVMValueRef fs_src1[4][TGSI_NUM_CHANNELS];
2411 LLVMValueRef src_alpha[4 * 4];
2412 LLVMValueRef src1_alpha[4 * 4] = { NULL };
2413 LLVMValueRef src_mask[4 * 4];
2414 LLVMValueRef src[4 * 4];
2415 LLVMValueRef src1[4 * 4];
2416 LLVMValueRef dst[4 * 4];
2417
2418 struct lp_build_mask_context mask_ctx;
2419
2420 unsigned char swizzle[TGSI_NUM_CHANNELS];
2421 unsigned src_channels = TGSI_NUM_CHANNELS;
2422
2423 const struct util_format_description *out_format_desc =
2424 util_format_description(out_format);
2425
2426 bool pad_inline = is_arithmetic_format(out_format_desc);
2427 const bool dual_source_blend =
2428 variant->key.blend.rt[0].blend_enable &&
2429 util_blend_state_is_dual(&variant->key.blend, 0);
2430
2431 const bool is_1d = variant->key.resource_1d;
2432 const unsigned num_fullblock_fs = is_1d ? 2 * num_fs : num_fs;
2433 LLVMValueRef fpstate = NULL;
2434
2435 LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2436
2437 /* Get type from output format */
2438 struct lp_type row_type, dst_type;
2439 lp_blend_type_from_format_desc(out_format_desc, &row_type);
2440 lp_mem_type_from_format_desc(out_format_desc, &dst_type);
2441
2442 /*
2443 * Technically this code should go into lp_build_smallfloat_to_float
2444 * and lp_build_float_to_smallfloat but due to the
2445 * http://llvm.org/bugs/show_bug.cgi?id=6393
2446 * llvm reorders the mxcsr intrinsics in a way that breaks the code.
2447 * So the ordering is important here and there shouldn't be any
2448 * llvm ir instructions in this function before
2449 * this, otherwise half-float format conversions won't work
2450 * (again due to llvm bug #6393).
2451 */
2452 if (have_smallfloat_format(dst_type, out_format)) {
2453 /* We need to make sure that denorms are ok for half float
2454 conversions */
2455 fpstate = lp_build_fpstate_get(gallivm);
2456 lp_build_fpstate_set_denorms_zero(gallivm, false);
2457 }
2458
2459 struct lp_type mask_type = lp_int32_vec4_type();
2460 mask_type.length = fs_type.length;
2461
2462 for (unsigned i = num_fs; i < num_fullblock_fs; i++) {
2463 fs_mask[i] = lp_build_zero(gallivm, mask_type);
2464 }
2465
2466 /* Do not bother executing code when the mask is empty. */
2467 if (do_branch) {
2468 LLVMValueRef check_mask =
2469 LLVMConstNull(lp_build_int_vec_type(gallivm, mask_type));
2470
2471 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2472 check_mask = LLVMBuildOr(builder, check_mask, fs_mask[i], "");
2473 }
2474
2475 lp_build_mask_begin(&mask_ctx, gallivm, mask_type, check_mask);
2476 lp_build_mask_check(&mask_ctx);
2477 }
2478
2479 partial_mask |= !variant->opaque;
2480 LLVMValueRef i32_zero = lp_build_const_int32(gallivm, 0);
2481
2482 LLVMValueRef undef_src_val = lp_build_undef(gallivm, fs_type);
2483
2484 row_type.length = fs_type.length;
2485 unsigned vector_width =
2486 dst_type.floating ? lp_native_vector_width : lp_integer_vector_width;
2487
2488 /* Compute correct swizzle and count channels */
2489 memset(swizzle, LP_BLD_SWIZZLE_DONTCARE, TGSI_NUM_CHANNELS);
2490 unsigned dst_channels = 0;
2491
2492 bool has_alpha = false;
2493 for (unsigned i = 0; i < TGSI_NUM_CHANNELS; ++i) {
2494 /* Ensure channel is used */
2495 if (out_format_desc->swizzle[i] >= TGSI_NUM_CHANNELS) {
2496 continue;
2497 }
2498
2499 /* Ensure not already written to (happens in case with GL_ALPHA) */
2500 if (swizzle[out_format_desc->swizzle[i]] < TGSI_NUM_CHANNELS) {
2501 continue;
2502 }
2503
2504 /* Ensure we haven't already found all channels */
2505 if (dst_channels >= out_format_desc->nr_channels) {
2506 continue;
2507 }
2508
2509 swizzle[out_format_desc->swizzle[i]] = i;
2510 ++dst_channels;
2511
2512 if (i == alpha_channel) {
2513 has_alpha = true;
2514 }
2515 }
2516
2517 if (format_expands_to_float_soa(out_format_desc)) {
2518 /*
2519 * the code above can't work for layout_other.
2520 * For srgb it would sort of work, but we short-circuit swizzles, etc.,
2521 * as that is done as part of unpack / pack.
2522 */
2523 dst_channels = 4; /* HACK: this is fake 4 really but need it due to transpose stuff later */
2524 has_alpha = true;
2525 swizzle[0] = 0;
2526 swizzle[1] = 1;
2527 swizzle[2] = 2;
2528 swizzle[3] = 3;
2529 pad_inline = true; /* HACK: prevent rgbxrgbx->rgbrgbxx conversion later */
2530 }
2531
2532 /* If 3 channels then pad to include alpha for 4 element transpose */
2533 if (dst_channels == 3) {
2534 assert (!has_alpha);
2535 for (unsigned i = 0; i < TGSI_NUM_CHANNELS; i++) {
2536 if (swizzle[i] > TGSI_NUM_CHANNELS)
2537 swizzle[i] = 3;
2538 }
2539 if (out_format_desc->nr_channels == 4) {
2540 dst_channels = 4;
2541 /*
2542 * We use alpha from the color conversion, not separate one.
2543 * We had to include it for transpose, hence it will get converted
2544 * too (albeit when doing transpose after conversion, that would
2545 * no longer be the case necessarily).
2546 * (It works only with 4 channel dsts, e.g. rgbx formats, because
2547 * otherwise we really have padding, not alpha, included.)
2548 */
2549 has_alpha = true;
2550 }
2551 }
2552
2553 /*
2554 * Load shader output
2555 */
2556 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2557 /* Always load alpha for use in blending */
2558 LLVMValueRef alpha;
2559 if (i < num_fs) {
2560 alpha = LLVMBuildLoad2(builder, fs_vec_type,
2561 fs_out_color[rt][alpha_channel][i], "");
2562 } else {
2563 alpha = undef_src_val;
2564 }
2565
2566 /* Load each channel */
2567 for (unsigned j = 0; j < dst_channels; ++j) {
2568 assert(swizzle[j] < 4);
2569 if (i < num_fs) {
2570 fs_src[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2571 fs_out_color[rt][swizzle[j]][i], "");
2572 } else {
2573 fs_src[i][j] = undef_src_val;
2574 }
2575 }
2576
2577 /* If 3 channels then pad to include alpha for 4 element transpose */
2578 /*
2579 * XXX If we include that here maybe could actually use it instead of
2580 * separate alpha for blending?
2581 * (Difficult though we actually convert pad channels, not alpha.)
2582 */
2583 if (dst_channels == 3 && !has_alpha) {
2584 fs_src[i][3] = alpha;
2585 }
2586
2587 /* We split the row_mask and row_alpha as we want 128bit interleave */
2588 if (fs_type.length == 8) {
2589 src_mask[i*2 + 0] = lp_build_extract_range(gallivm, fs_mask[i],
2590 0, src_channels);
2591 src_mask[i*2 + 1] = lp_build_extract_range(gallivm, fs_mask[i],
2592 src_channels,
2593 src_channels);
2594
2595 src_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha,
2596 0, src_channels);
2597 src_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2598 src_channels,
2599 src_channels);
2600 } else {
2601 src_mask[i] = fs_mask[i];
2602 src_alpha[i] = alpha;
2603 }
2604 }
2605 if (dual_source_blend) {
2606 /* same as above except different src/dst, skip masks and comments... */
2607 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2608 LLVMValueRef alpha;
2609 if (i < num_fs) {
2610 alpha = LLVMBuildLoad2(builder, fs_vec_type,
2611 fs_out_color[1][alpha_channel][i], "");
2612 } else {
2613 alpha = undef_src_val;
2614 }
2615
2616 for (unsigned j = 0; j < dst_channels; ++j) {
2617 assert(swizzle[j] < 4);
2618 if (i < num_fs) {
2619 fs_src1[i][j] = LLVMBuildLoad2(builder, fs_vec_type,
2620 fs_out_color[1][swizzle[j]][i], "");
2621 } else {
2622 fs_src1[i][j] = undef_src_val;
2623 }
2624 }
2625 if (dst_channels == 3 && !has_alpha) {
2626 fs_src1[i][3] = alpha;
2627 }
2628 if (fs_type.length == 8) {
2629 src1_alpha[i*2 + 0] = lp_build_extract_range(gallivm, alpha, 0, src_channels);
2630 src1_alpha[i*2 + 1] = lp_build_extract_range(gallivm, alpha,
2631 src_channels, src_channels);
2632 } else {
2633 src1_alpha[i] = alpha;
2634 }
2635 }
2636 }
2637
2638 if (util_format_is_pure_integer(out_format)) {
2639 /*
2640 * In this case fs_type was really ints or uints disguised as floats,
2641 * fix that up now.
2642 */
2643 fs_type.floating = 0;
2644 fs_type.sign = dst_type.sign;
2645 fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2646 for (unsigned i = 0; i < num_fullblock_fs; ++i) {
2647 for (unsigned j = 0; j < dst_channels; ++j) {
2648 fs_src[i][j] = LLVMBuildBitCast(builder, fs_src[i][j],
2649 fs_vec_type, "");
2650 }
2651 if (dst_channels == 3 && !has_alpha) {
2652 fs_src[i][3] = LLVMBuildBitCast(builder, fs_src[i][3],
2653 fs_vec_type, "");
2654 }
2655 }
2656 }
2657
2658 /*
2659 * We actually should generally do conversion first (for non-1d cases)
2660 * when the blend format is 8 or 16 bits. The reason is obvious,
2661 * there are 2 or 4 times fewer vectors to deal with for the interleave...
2662 * Albeit for the AVX (not AVX2) case there's no benefit with 16 bit
2663 * vectors (as it can do 32bit unpack with 256bit vectors, but 8/16bit
2664 * unpack only with 128bit vectors).
2665 * Note: for 16bit sizes really need matching pack conversion code
2666 */
2667 bool twiddle_after_convert = false;
2668 if (!is_1d && dst_channels != 3 && dst_type.width == 8) {
2669 twiddle_after_convert = true;
2670 }
2671
2672 /*
2673 * Pixel twiddle from fragment shader order to memory order
2674 */
2675 unsigned src_count;
2676 if (!twiddle_after_convert) {
2677 src_count = generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs,
2678 dst_channels, fs_src, src, pad_inline);
2679 if (dual_source_blend) {
2680 generate_fs_twiddle(gallivm, fs_type, num_fullblock_fs, dst_channels,
2681 fs_src1, src1, pad_inline);
2682 }
2683 } else {
2684 src_count = num_fullblock_fs * dst_channels;
2685 /*
2686 * We reorder things a bit here, so the cases for 4-wide and 8-wide
2687 * (AVX) turn out the same later when untwiddling/transpose (albeit
2688 * for true AVX2 path untwiddle needs to be different).
2689 * For now just order by colors first (so we can use unpack later).
2690 */
2691 for (unsigned j = 0; j < num_fullblock_fs; j++) {
2692 for (unsigned i = 0; i < dst_channels; i++) {
2693 src[i*num_fullblock_fs + j] = fs_src[j][i];
2694 if (dual_source_blend) {
2695 src1[i*num_fullblock_fs + j] = fs_src1[j][i];
2696 }
2697 }
2698 }
2699 }
2700
2701 src_channels = dst_channels < 3 ? dst_channels : 4;
2702 if (src_count != num_fullblock_fs * src_channels) {
2703 unsigned ds = src_count / (num_fullblock_fs * src_channels);
2704 row_type.length /= ds;
2705 fs_type.length = row_type.length;
2706 fs_vec_type = lp_build_vec_type(gallivm, fs_type);
2707 }
2708
2709 struct lp_type blend_type = row_type;
2710 mask_type.length = 4;
2711
2712 /* Convert src to row_type */
2713 if (dual_source_blend) {
2714 struct lp_type old_row_type = row_type;
2715 lp_build_conv_auto(gallivm, fs_type, &row_type, src, src_count, src);
2716 src_count = lp_build_conv_auto(gallivm, fs_type, &old_row_type,
2717 src1, src_count, src1);
2718 } else {
2719 src_count = lp_build_conv_auto(gallivm, fs_type, &row_type,
2720 src, src_count, src);
2721 }
2722
2723 /* If the rows are not an SSE vector, combine them to become SSE size! */
2724 if ((row_type.width * row_type.length) % 128) {
2725 unsigned bits = row_type.width * row_type.length;
2726 unsigned combined;
2727
2728 assert(src_count >= (vector_width / bits));
2729
2730 const unsigned dst_count = src_count / (vector_width / bits);
2731
2732 combined = lp_build_concat_n(gallivm, row_type, src, src_count,
2733 src, dst_count);
2734 if (dual_source_blend) {
2735 lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
2736 }
2737
2738 row_type.length *= combined;
2739 src_count /= combined;
2740
2741 bits = row_type.width * row_type.length;
2742 assert(bits == 128 || bits == 256);
2743 }
2744
2745 if (twiddle_after_convert) {
2746 fs_twiddle_transpose(gallivm, row_type, src, src_count, src);
2747 if (dual_source_blend) {
2748 fs_twiddle_transpose(gallivm, row_type, src1, src_count, src1);
2749 }
2750 }
2751
2752 /*
2753 * Blend Colour conversion
2754 */
2755 LLVMValueRef blend_color =
2756 lp_jit_context_f_blend_color(gallivm, context_type, context_ptr);
2757 blend_color = LLVMBuildPointerCast(builder, blend_color,
2758 LLVMPointerType(fs_vec_type, 0),
2759 "");
2760 blend_color = LLVMBuildLoad2(builder, fs_vec_type,
2761 LLVMBuildGEP2(builder, fs_vec_type,
2762 blend_color,
2763 &i32_zero, 1, ""), "");
2764
2765 /* Convert */
2766 lp_build_conv(gallivm, fs_type, blend_type, &blend_color, 1,
2767 &blend_color, 1);
2768
2769 if (out_format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
2770 /*
2771 * since blending is done with floats, there was no conversion.
2772 * However, the rules according to fixed point renderbuffers still
2773 * apply, that is we must clamp inputs to 0.0/1.0.
2774 * (This would apply to separate alpha conversion too but we currently
2775 * force has_alpha to be true.)
2776 * TODO: should skip this with "fake" blend, since post-blend conversion
2777 * will clamp anyway.
2778 * TODO: could also skip this if fragment color clamping is enabled.
2779 * We don't support it natively so it gets baked into the shader
2780 * however, so can't really tell here.
2781 */
2782 struct lp_build_context f32_bld;
2783 assert(row_type.floating);
2784 lp_build_context_init(&f32_bld, gallivm, row_type);
2785 for (unsigned i = 0; i < src_count; i++) {
2786 src[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src[i]);
2787 }
2788 if (dual_source_blend) {
2789 for (unsigned i = 0; i < src_count; i++) {
2790 src1[i] = lp_build_clamp_zero_one_nanzero(&f32_bld, src1[i]);
2791 }
2792 }
2793 /* probably can't be different than row_type but better safe than sorry... */
2794 lp_build_context_init(&f32_bld, gallivm, blend_type);
2795 blend_color = lp_build_clamp(&f32_bld, blend_color,
2796 f32_bld.zero, f32_bld.one);
2797 }
2798
2799 /* Extract alpha */
2800 LLVMValueRef blend_alpha =
2801 lp_build_extract_broadcast(gallivm, blend_type, row_type,
2802 blend_color,
2803 lp_build_const_int32(gallivm, 3));
2804
2805 /* Swizzle to appropriate channels, e.g. from RGBA to BGRA BGRA */
2806 pad_inline &= (dst_channels * (block_size / src_count) * row_type.width)
2807 != vector_width;
2808 if (pad_inline) {
2809 /* Use all 4 channels e.g. from RGBA RGBA to RGxx RGxx */
2810 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2811 TGSI_NUM_CHANNELS, row_type.length);
2812 } else {
2813 /* Only use dst_channels e.g. RGBA RGBA to RG RG xxxx */
2814 blend_color = lp_build_swizzle_aos_n(gallivm, blend_color, swizzle,
2815 dst_channels, row_type.length);
2816 }
2817
2818 /*
2819 * Mask conversion
2820 */
2821 lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0],
2822 block_height, &src_mask[0]);
2823
2824 if (src_count < block_height) {
2825 lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
2826 } else if (src_count > block_height) {
2827 for (unsigned i = src_count; i > 0; --i) {
2828 unsigned pixels = block_size / src_count;
2829 unsigned idx = i - 1;
2830
2831 src_mask[idx] = lp_build_extract_range(gallivm,
2832 src_mask[(idx * pixels) / 4],
2833 (idx * pixels) % 4, pixels);
2834 }
2835 }
2836
2837 assert(mask_type.width == 32);
2838
2839 for (unsigned i = 0; i < src_count; ++i) {
2840 unsigned pixels = block_size / src_count;
2841 unsigned pixel_width = row_type.width * dst_channels;
2842
2843 if (pixel_width == 24) {
2844 mask_type.width = 8;
2845 mask_type.length = vector_width / mask_type.width;
2846 } else {
2847 mask_type.length = pixels;
2848 mask_type.width = row_type.width * dst_channels;
2849
2850 /*
2851 * If mask_type width is smaller than 32bit, this doesn't quite
2852 * generate the most efficient code (could use some pack).
2853 */
2854 src_mask[i] = LLVMBuildIntCast(builder, src_mask[i],
2855 lp_build_int_vec_type(gallivm,
2856 mask_type), "");
2857
2858 mask_type.length *= dst_channels;
2859 mask_type.width /= dst_channels;
2860 }
2861
2862 src_mask[i] = LLVMBuildBitCast(builder, src_mask[i],
2863 lp_build_int_vec_type(gallivm, mask_type),
2864 "");
2865 src_mask[i] = lp_build_pad_vector(gallivm, src_mask[i], row_type.length);
2866 }
2867
2868 /*
2869 * Alpha conversion
2870 */
2871 if (!has_alpha) {
2872 struct lp_type alpha_type = fs_type;
2873 alpha_type.length = 4;
2874 convert_alpha(gallivm, row_type, alpha_type,
2875 block_size, block_height,
2876 src_count, dst_channels,
2877 pad_inline, src_alpha);
2878 if (dual_source_blend) {
2879 convert_alpha(gallivm, row_type, alpha_type,
2880 block_size, block_height,
2881 src_count, dst_channels,
2882 pad_inline, src1_alpha);
2883 }
2884 }
2885
2886
2887 /*
2888 * Load dst from memory
2889 */
2890 unsigned dst_count;
2891 if (src_count < block_height) {
2892 dst_count = block_height;
2893 } else {
2894 dst_count = src_count;
2895 }
2896
2897 dst_type.length *= block_size / dst_count;
2898
2899 if (format_expands_to_float_soa(out_format_desc)) {
2900 /*
2901 * we need multiple values at once for the conversion, so we may as well
2902 * load them vectorized here too instead of concatenating later.
2903 * (Still need concatenation later for 8-wide vectors).
2904 */
2905 dst_count = block_height;
2906 dst_type.length = block_width;
2907 }
2908
2909 /*
2910 * Compute the alignment of the destination pointer in bytes.
2911 * We fetch 1-4 pixels; if the format has pot alignment then those fetches
2912 * are always aligned by MIN2(16, fetch_width), except for buffers (not
2913 * 1d tex, but we can't distinguish that here), so we need to stick with
2914 * per-pixel alignment in this case.
2915 */
2916 unsigned dst_alignment;
2917 if (is_1d) {
2918 dst_alignment = (out_format_desc->block.bits + 7)/(out_format_desc->block.width * 8);
2919 } else {
2920 dst_alignment = dst_type.length * dst_type.width / 8;
2921 }
2922 /* Force power-of-two alignment by extracting only the least-significant-bit */
2923 dst_alignment = 1 << (ffs(dst_alignment) - 1);
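/* e.g. a 3-byte per-pixel 1d format would give dst_alignment == 3 above;
 * this reduces it to 1.
 */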
2924 /*
2925 * Resource base and stride pointers are aligned to 16 bytes, so that's
2926 * the maximum alignment we can guarantee
2927 */
2928 dst_alignment = MIN2(16, dst_alignment);
2929
2930 struct lp_type ls_type = dst_type;
2931
2932 if (dst_count > src_count) {
2933 if ((dst_type.width == 8 || dst_type.width == 16) &&
2934 util_is_power_of_two_or_zero(dst_type.length) &&
2935 dst_type.length * dst_type.width < 128) {
2936 /*
2937 * Never try to load values as 4xi8 which we will then
2938 * concatenate to larger vectors. This gives llvm a real
2939 * headache (the problem is the type legalizer (?) will
2940 * try to load that as 4xi8 zext to 4xi32 to fill the vector,
2941 * then the shuffles to concatenate are more or less impossible
2942 * - llvm is easily capable of generating a sequence of 32
2943 * pextrb/pinsrb instructions for that. Albeit it appears to
2944 * be fixed in llvm 4.0. So, load and concatenate with 32bit
2945 * width to avoid the trouble (16bit seems not as bad, llvm
2946 * probably recognizes the load+shuffle as only one shuffle
2947 * is necessary, but we can do just the same anyway).
2948 */
2949 ls_type.length = dst_type.length * dst_type.width / 32;
2950 ls_type.width = 32;
2951 }
2952 }
2953
2954 if (is_1d) {
2955 load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
2956 dst, ls_type, dst_count / 4, dst_alignment);
2957 for (unsigned i = dst_count / 4; i < dst_count; i++) {
2958 dst[i] = lp_build_undef(gallivm, ls_type);
2959 }
2960 } else {
2961 load_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
2962 block_height, dst, ls_type, dst_count,
2963 dst_alignment);
2964 }
2965
2966
2967 /*
2968 * Convert from dst/output format to src/blending format.
2969 *
2970 * This is necessary as we can only read 1 row from memory at a time,
2971 * so the smallest dst_count can ever be at this point is 4.
2972 *
2973 * With, for example, the R8 format you can have all 16 pixels in a 128 bit
2974 * vector; this will take the 4 dsts and combine them into 1 src so we can
2975 * perform blending on all 16 pixels in that single vector at once.
2976 */
2977 if (dst_count > src_count) {
2978 if (ls_type.length != dst_type.length && ls_type.length == 1) {
2979 LLVMTypeRef elem_type = lp_build_elem_type(gallivm, ls_type);
2980 LLVMTypeRef ls_vec_type = LLVMVectorType(elem_type, 1);
2981 for (unsigned i = 0; i < dst_count; i++) {
2982 dst[i] = LLVMBuildBitCast(builder, dst[i], ls_vec_type, "");
2983 }
2984 }
2985
2986 lp_build_concat_n(gallivm, ls_type, dst, 4, dst, src_count);
2987
2988 if (ls_type.length != dst_type.length) {
2989 struct lp_type tmp_type = dst_type;
2990 tmp_type.length = dst_type.length * 4 / src_count;
2991 for (unsigned i = 0; i < src_count; i++) {
2992 dst[i] = LLVMBuildBitCast(builder, dst[i],
2993 lp_build_vec_type(gallivm, tmp_type), "");
2994 }
2995 }
2996 }
2997
2998 /*
2999 * Blending
3000 */
3001 /* XXX this is broken for RGB8 formats -
3002 * they get expanded from 12 to 16 elements (to include alpha)
3003 * by convert_to_blend_type then reduced to 15 instead of 12
3004 * by convert_from_blend_type (a simple fix though breaks A8...).
3005 * R16G16B16 also crashes, though differently; seemingly something is going
3006 * wrong inside llvm's handling of npot vector sizes.
3007 * It seems some cleanup could be done here (like skipping conversion/blend
3008 * when not needed).
3009 */
3010 convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type,
3011 row_type, dst, src_count);
3012
3013 /*
3014 * FIXME: Really should get logic ops / masks out of generic blend / row
3015 * format. Logic ops will definitely not work on the blend float format
3016 * used for SRGB here and I think OpenGL expects this to work as expected
3017 * (that is incoming values converted to srgb then logic op applied).
3018 */
3019 for (unsigned i = 0; i < src_count; ++i) {
3020 dst[i] = lp_build_blend_aos(gallivm,
3021 &variant->key.blend,
3022 out_format,
3023 row_type,
3024 rt,
3025 src[i],
3026 has_alpha ? NULL : src_alpha[i],
3027 src1[i],
3028 has_alpha ? NULL : src1_alpha[i],
3029 dst[i],
3030 partial_mask ? src_mask[i] : NULL,
3031 blend_color,
3032 has_alpha ? NULL : blend_alpha,
3033 swizzle,
3034 pad_inline ? 4 : dst_channels);
3035 }
3036
3037 convert_from_blend_type(gallivm, block_size, out_format_desc,
3038 row_type, dst_type, dst, src_count);
3039
3040 /* Split the blend rows back to memory rows */
3041 if (dst_count > src_count) {
3042 row_type.length = dst_type.length * (dst_count / src_count);
3043
3044 if (src_count == 1) {
3045 dst[1] = lp_build_extract_range(gallivm, dst[0],
3046 row_type.length / 2,
3047 row_type.length / 2);
3048 dst[0] = lp_build_extract_range(gallivm, dst[0],
3049 0, row_type.length / 2);
3050
3051 row_type.length /= 2;
3052 src_count *= 2;
3053 }
3054
3055 dst[3] = lp_build_extract_range(gallivm, dst[1], row_type.length / 2,
3056 row_type.length / 2);
3057 dst[2] = lp_build_extract_range(gallivm, dst[1], 0, row_type.length / 2);
3058 dst[1] = lp_build_extract_range(gallivm, dst[0], row_type.length / 2,
3059 row_type.length / 2);
3060 dst[0] = lp_build_extract_range(gallivm, dst[0], 0, row_type.length / 2);
3061
3062 row_type.length /= 2;
3063 src_count *= 2;
3064 }
3065
3066 /*
3067 * Store blend result to memory
3068 */
3069 if (is_1d) {
3070 store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width, 1,
3071 dst, dst_type, dst_count / 4, dst_alignment);
3072 } else {
3073 store_unswizzled_block(gallivm, color_type, color_ptr, stride, block_width,
3074 block_height,
3075 dst, dst_type, dst_count, dst_alignment);
3076 }
3077
3078 if (do_branch) {
3079 lp_build_mask_end(&mask_ctx);
3080 }
3081
3082 if (fpstate) {
3083 lp_build_fpstate_set(gallivm, fpstate);
3084 }
3085 }
3086
3087
3088 /**
3089 * Generate the runtime callable function for the whole fragment pipeline.
3090 * Note that the function which we generate operates on a block of 16
3091 * pixels at a time. The block contains 2x2 quads. Each quad contains
3092 * 2x2 pixels.
3093 */
3094 static void
3095 generate_fragment(struct llvmpipe_context *lp,
3096 struct lp_fragment_shader *shader,
3097 struct lp_fragment_shader_variant *variant,
3098 unsigned partial_mask)
3099 {
3100 assert(partial_mask == RAST_WHOLE ||
3101 partial_mask == RAST_EDGE_TEST);
3102
3103 struct nir_shader *nir = shader->base.ir.nir;
3104 struct gallivm_state *gallivm = variant->gallivm;
3105 struct lp_fragment_shader_variant_key *key = &variant->key;
3106 struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
3107 LLVMTypeRef fs_elem_type;
3108 LLVMTypeRef blend_vec_type;
3109 LLVMTypeRef arg_types[16];
3110 LLVMTypeRef func_type;
3111 LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context);
3112 LLVMTypeRef int32p_type = LLVMPointerType(int32_type, 0);
3113 LLVMTypeRef int8_type = LLVMInt8TypeInContext(gallivm->context);
3114 LLVMTypeRef int8p_type = LLVMPointerType(int8_type, 0);
3115 LLVMValueRef context_ptr;
3116 LLVMValueRef resources_ptr;
3117 LLVMValueRef x;
3118 LLVMValueRef y;
3119 LLVMValueRef a0_ptr;
3120 LLVMValueRef dadx_ptr;
3121 LLVMValueRef dady_ptr;
3122 LLVMValueRef color_ptr_ptr;
3123 LLVMValueRef stride_ptr;
3124 LLVMValueRef color_sample_stride_ptr;
3125 LLVMValueRef depth_ptr;
3126 LLVMValueRef depth_stride;
3127 LLVMValueRef depth_sample_stride;
3128 LLVMValueRef mask_input;
3129 LLVMValueRef thread_data_ptr;
3130 LLVMBasicBlockRef block;
3131 LLVMBuilderRef builder;
3132 struct lp_build_interp_soa_context interp;
3133 LLVMValueRef fs_mask[(16 / 4) * LP_MAX_SAMPLES];
3134 LLVMValueRef fs_out_color[LP_MAX_SAMPLES][PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS][16 / 4];
3135 LLVMValueRef function;
3136 LLVMValueRef facing;
3137 const bool dual_source_blend = key->blend.rt[0].blend_enable &&
3138 util_blend_state_is_dual(&key->blend, 0);
3139
3140 assert(lp_native_vector_width / 32 >= 4);
3141
3142 /* Adjust color input interpolation according to flatshade state:
3143 */
3144 nir_foreach_shader_in_variable(var, nir) {
3145 unsigned idx = var->data.driver_location;
3146 unsigned slots = nir_variable_count_slots(var, var->type);
3147 memcpy(&inputs[idx], &shader->inputs[idx], (sizeof inputs[0] * slots));
3148 for (unsigned s = 0; s < slots; s++) {
3149 if (inputs[idx + s].interp == LP_INTERP_COLOR)
3150 inputs[idx + s].interp = key->flatshade ? LP_INTERP_CONSTANT : LP_INTERP_PERSPECTIVE;
3151 }
3152 }
3153
3154 /* TODO: actually pick these based on the fs and color buffer
3155 * characteristics. */
3156
3157 struct lp_type fs_type;
3158 memset(&fs_type, 0, sizeof fs_type);
3159 fs_type.floating = true; /* floating point values */
3160 fs_type.sign = true; /* values are signed */
3161 fs_type.norm = false; /* values are not limited to [0,1] or [-1,1] */
3162 fs_type.width = 32; /* 32-bit float */
3163 fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
3164
3165 struct lp_type blend_type;
3166 memset(&blend_type, 0, sizeof blend_type);
3167 blend_type.floating = false; /* values are integers */
3168 blend_type.sign = false; /* values are unsigned */
3169 blend_type.norm = true; /* values are in [0,1] or [-1,1] */
3170 blend_type.width = 8; /* 8-bit ubyte values */
3171 blend_type.length = 16; /* 16 elements per vector */
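/*
 * Illustrative example (assuming 256-bit SIMD): fs_type then holds 8 floats
 * per vector while blend_type holds 16 unorm8 values, so one blend vector
 * covers the output of two fragment-shader vectors.
 */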
3172
3173 /*
3174 * Generate the function prototype. Any change here must be reflected in
3175 * lp_jit.h's lp_jit_frag_func function pointer type, and vice-versa.
3176 */
3177
3178 fs_elem_type = lp_build_elem_type(gallivm, fs_type);
3179
3180 blend_vec_type = lp_build_vec_type(gallivm, blend_type);
3181
3182 char func_name[64];
3183 snprintf(func_name, sizeof(func_name), "fs_variant_%s",
3184 partial_mask ? "partial" : "whole");
3185
3186 arg_types[0] = variant->jit_context_ptr_type; /* context */
3187 arg_types[1] = variant->jit_resources_ptr_type; /* resources */
3188 arg_types[2] = int32_type; /* x */
3189 arg_types[3] = int32_type; /* y */
3190 arg_types[4] = int32_type; /* facing */
3191 arg_types[5] = LLVMPointerType(fs_elem_type, 0); /* a0 */
3192 arg_types[6] = LLVMPointerType(fs_elem_type, 0); /* dadx */
3193 arg_types[7] = LLVMPointerType(fs_elem_type, 0); /* dady */
3194 arg_types[8] = LLVMPointerType(int8p_type, 0); /* color */
3195 arg_types[9] = int8p_type; /* depth */
3196 arg_types[10] = LLVMInt64TypeInContext(gallivm->context); /* mask_input */
3197 arg_types[11] = variant->jit_thread_data_ptr_type; /* per thread data */
3198 arg_types[12] = int32p_type; /* stride */
3199 arg_types[13] = int32_type; /* depth_stride */
3200 arg_types[14] = int32p_type; /* color sample strides */
3201 arg_types[15] = int32_type; /* depth sample stride */
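/*
 * For reference, a rough C sketch of the argument list above (lp_jit.h's
 * lp_jit_frag_func remains the authoritative definition):
 *
 *    void fs(void *context, void *resources,
 *            int32_t x, int32_t y, int32_t facing,
 *            const float *a0, const float *dadx, const float *dady,
 *            uint8_t **color, uint8_t *depth, uint64_t mask_input,
 *            void *thread_data, int32_t *stride, int32_t depth_stride,
 *            int32_t *color_sample_stride, int32_t depth_sample_stride);
 */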
3202
3203 func_type = LLVMFunctionType(LLVMVoidTypeInContext(gallivm->context),
3204 arg_types, ARRAY_SIZE(arg_types), 0);
3205
3206 function = LLVMAddFunction(gallivm->module, func_name, func_type);
3207 LLVMSetFunctionCallConv(function, LLVMCCallConv);
3208
3209 variant->function[partial_mask] = function;
3210
3211 /* XXX: need to propagate noalias down into color param now we are
3212 * passing a pointer-to-pointer?
3213 */
3214 for (unsigned i = 0; i < ARRAY_SIZE(arg_types); ++i)
3215 if (LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
3216 lp_add_function_attr(function, i + 1, LP_FUNC_ATTR_NOALIAS);
3217
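/*
 * If compiled code for this variant was found in the disk cache there is
 * no need to build the IR for the function body; the cached object code
 * will be picked up when the module is compiled.
 */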
3218 if (variant->gallivm->cache->data_size)
3219 return;
3220
3221 context_ptr = LLVMGetParam(function, 0);
3222 resources_ptr = LLVMGetParam(function, 1);
3223 x = LLVMGetParam(function, 2);
3224 y = LLVMGetParam(function, 3);
3225 facing = LLVMGetParam(function, 4);
3226 a0_ptr = LLVMGetParam(function, 5);
3227 dadx_ptr = LLVMGetParam(function, 6);
3228 dady_ptr = LLVMGetParam(function, 7);
3229 color_ptr_ptr = LLVMGetParam(function, 8);
3230 depth_ptr = LLVMGetParam(function, 9);
3231 mask_input = LLVMGetParam(function, 10);
3232 thread_data_ptr = LLVMGetParam(function, 11);
3233 stride_ptr = LLVMGetParam(function, 12);
3234 depth_stride = LLVMGetParam(function, 13);
3235 color_sample_stride_ptr = LLVMGetParam(function, 14);
3236 depth_sample_stride = LLVMGetParam(function, 15);
3237
3238 lp_build_name(context_ptr, "context");
3239 lp_build_name(resources_ptr, "resources");
3240 lp_build_name(x, "x");
3241 lp_build_name(y, "y");
3242 lp_build_name(a0_ptr, "a0");
3243 lp_build_name(dadx_ptr, "dadx");
3244 lp_build_name(dady_ptr, "dady");
3245 lp_build_name(color_ptr_ptr, "color_ptr_ptr");
3246 lp_build_name(depth_ptr, "depth");
3247 lp_build_name(mask_input, "mask_input");
3248 lp_build_name(thread_data_ptr, "thread_data");
3249 lp_build_name(stride_ptr, "stride_ptr");
3250 lp_build_name(depth_stride, "depth_stride");
3251 lp_build_name(color_sample_stride_ptr, "color_sample_stride_ptr");
3252 lp_build_name(depth_sample_stride, "depth_sample_stride");
3253
3254 /*
3255 * Function body
3256 */
3257
3258 block = LLVMAppendBasicBlockInContext(gallivm->context, function, "entry");
3259 builder = gallivm->builder;
3260 assert(builder);
3261 LLVMPositionBuilderAtEnd(builder, block);
3262
3263 /*
3264 * Must not count ps invocations if there's a null shader.
3265 * (It would be ok to count with null shader if there's d/s tests,
3266 * but only if there's d/s buffers too, which is different
3267 * from implicit rasterization disable, which must not depend
3268 * on the d/s buffers.)
3269 * Could use popcount on mask, but pixel accuracy is not required.
3270 * Could disable if there's no stats query, but maybe not worth it.
3271 */
3272 if (shader->info.base.num_instructions > 1) {
3273 LLVMValueRef invocs, val;
3274 LLVMTypeRef invocs_type = LLVMInt64TypeInContext(gallivm->context);
3275 invocs = lp_jit_thread_data_ps_invocations(gallivm, variant->jit_thread_data_type, thread_data_ptr);
3276 val = LLVMBuildLoad2(builder, invocs_type, invocs, "");
3277 val = LLVMBuildAdd(builder, val,
3278 LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
3279 1, 0),
3280 "invoc_count");
3281 LLVMBuildStore(builder, val, invocs);
3282 }
3283
3284 /* code generated texture sampling */
3285 struct lp_build_sampler_soa *sampler =
3286 lp_llvm_sampler_soa_create(lp_fs_variant_key_samplers(key),
3287 MAX2(key->nr_samplers,
3288 key->nr_sampler_views));
3289 struct lp_build_image_soa *image =
3290 lp_bld_llvm_image_soa_create(lp_fs_variant_key_images(key), key->nr_images);
3291
3292 unsigned num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
3293 /* for 1d resources only run "upper half" of stamp */
3294 if (key->resource_1d)
3295 num_fs /= 2;
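/*
 * E.g. for a 2d resource with 8-wide fs vectors this gives num_fs == 2
 * (two 8-pixel iterations per 4x4 stamp); with 4-wide vectors, num_fs == 4.
 */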
3296
3297 {
3298 LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
3299 LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
3300 LLVMValueRef num_loop_samp =
3301 lp_build_const_int32(gallivm, num_fs * key->coverage_samples);
3302 LLVMValueRef mask_store =
3303 lp_build_array_alloca(gallivm, mask_type,
3304 num_loop_samp, "mask_store");
3305 LLVMTypeRef flt_type = LLVMFloatTypeInContext(gallivm->context);
3306 LLVMValueRef glob_sample_pos =
3307 LLVMAddGlobal(gallivm->module,
3308 LLVMArrayType(flt_type, key->coverage_samples * 2), "");
3309 LLVMValueRef sample_pos_array;
3310
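/*
 * The global below holds the sample positions as interleaved (x, y) pairs,
 * one pair per coverage sample: the standard 4x MSAA pattern when
 * multisampling with 4 coverage samples, otherwise a single pixel-center
 * position of (0.5, 0.5).
 */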
3311 if (key->multisample && key->coverage_samples == 4) {
3312 LLVMValueRef sample_pos_arr[8];
3313 for (unsigned i = 0; i < 4; i++) {
3314 sample_pos_arr[i * 2] = LLVMConstReal(flt_type,
3315 lp_sample_pos_4x[i][0]);
3316 sample_pos_arr[i * 2 + 1] = LLVMConstReal(flt_type,
3317 lp_sample_pos_4x[i][1]);
3318 }
3319 sample_pos_array =
3320 LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3321 sample_pos_arr, 8);
3322 } else {
3323 LLVMValueRef sample_pos_arr[2];
3324 sample_pos_arr[0] = LLVMConstReal(flt_type, 0.5);
3325 sample_pos_arr[1] = LLVMConstReal(flt_type, 0.5);
3326 sample_pos_array =
3327 LLVMConstArray(LLVMFloatTypeInContext(gallivm->context),
3328 sample_pos_arr, 2);
3329 }
3330 LLVMSetInitializer(glob_sample_pos, sample_pos_array);
3331
3332 LLVMValueRef color_store[PIPE_MAX_COLOR_BUFS][TGSI_NUM_CHANNELS];
3333 bool pixel_center_integer = nir->info.fs.pixel_center_integer;
3334
3335 /*
3336 * The shader input interpolation info is not explicitly baked in the
3337 * shader key, but everything it derives from (TGSI, and flatshade) is
3338 * already included in the shader key.
3339 */
3340 lp_build_interp_soa_init(&interp,
3341 gallivm,
3342 nir->num_inputs,
3343 inputs,
3344 pixel_center_integer,
3345 key->coverage_samples,
3346 LLVMTypeOf(sample_pos_array),
3347 glob_sample_pos,
3348 num_loop,
3349 builder, fs_type,
3350 a0_ptr, dadx_ptr, dady_ptr,
3351 x, y);
3352
3353 for (unsigned i = 0; i < num_fs; i++) {
3354 if (key->multisample) {
3355 LLVMValueRef smask_val =
3356 LLVMBuildLoad2(builder, int32_type,
3357 lp_jit_context_sample_mask(gallivm, variant->jit_context_type, context_ptr),
3358 "");
3359
3360 /*
3361 * For multisampling, extract the per-sample mask from the
3362 * incoming 64-bit mask, store to the per-sample mask storage. OR
3363 * all of them together to generate the fragment shader
3364 * mask. (sample shading TODO). Take the incoming state coverage
3365 * mask into account.
3366 */
3367 for (unsigned s = 0; s < key->coverage_samples; s++) {
3368 LLVMValueRef sindexi =
3369 lp_build_const_int32(gallivm, i + (s * num_fs));
3370 LLVMValueRef sample_mask_ptr =
3371 LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1,
3372 "sample_mask_ptr");
3373 LLVMValueRef s_mask =
3374 generate_quad_mask(gallivm, fs_type,
3375 i * fs_type.length / 4, s, mask_input);
3376 LLVMValueRef smask_bit =
3377 LLVMBuildAnd(builder, smask_val,
3378 lp_build_const_int32(gallivm, (1 << s)), "");
3379 LLVMValueRef cmp =
3380 LLVMBuildICmp(builder, LLVMIntNE, smask_bit,
3381 lp_build_const_int32(gallivm, 0), "");
3382 smask_bit = LLVMBuildSExt(builder, cmp, int32_type, "");
3383 smask_bit = lp_build_broadcast(gallivm, mask_type, smask_bit);
3384
3385 s_mask = LLVMBuildAnd(builder, s_mask, smask_bit, "");
3386 LLVMBuildStore(builder, s_mask, sample_mask_ptr);
3387 }
3388 } else {
3389 LLVMValueRef mask;
3390 LLVMValueRef indexi = lp_build_const_int32(gallivm, i);
3391 LLVMValueRef mask_ptr = LLVMBuildGEP2(builder, mask_type, mask_store,
3392 &indexi, 1, "mask_ptr");
3393
3394 if (partial_mask) {
3395 mask = generate_quad_mask(gallivm, fs_type,
3396 i * fs_type.length / 4, 0, mask_input);
3397 } else {
3398 mask = lp_build_const_int_vec(gallivm, fs_type, ~0);
3399 }
3400 LLVMBuildStore(builder, mask, mask_ptr);
3401 }
3402 }
3403
3404 generate_fs_loop(gallivm,
3405 shader, key,
3406 builder,
3407 fs_type,
3408 variant->jit_context_type,
3409 context_ptr,
3410 variant->jit_resources_type,
3411 resources_ptr,
3412 LLVMTypeOf(sample_pos_array),
3413 glob_sample_pos,
3414 num_loop,
3415 &interp,
3416 sampler,
3417 image,
3418 mask_type,
3419 mask_store, /* output */
3420 color_store,
3421 depth_ptr,
3422 depth_stride,
3423 depth_sample_stride,
3424 color_ptr_ptr,
3425 stride_ptr,
3426 color_sample_stride_ptr,
3427 facing,
3428 variant->jit_thread_data_type,
3429 thread_data_ptr);
3430
3431 LLVMTypeRef fs_vec_type = lp_build_vec_type(gallivm, fs_type);
3432 for (unsigned i = 0; i < num_fs; i++) {
3433 LLVMValueRef ptr;
3434 for (unsigned s = 0; s < key->coverage_samples; s++) {
3435 int idx = (i + (s * num_fs));
3436 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3437 ptr = LLVMBuildGEP2(builder, mask_type, mask_store, &sindexi, 1, "");
3438
3439 fs_mask[idx] = LLVMBuildLoad2(builder, mask_type, ptr, "smask");
3440 }
3441
3442 for (unsigned s = 0; s < key->min_samples; s++) {
3443 /* XXX: this indexing is messy; the color store layout needs reorganizing */
3444 int idx = s * num_fs + i;
3445 LLVMValueRef sindexi = lp_build_const_int32(gallivm, idx);
3446 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3447 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3448 ptr = LLVMBuildGEP2(builder, fs_vec_type,
3449 color_store[cbuf][chan],
3450 &sindexi, 1, "");
3451 fs_out_color[s][cbuf][chan][i] = ptr;
3452 }
3453 }
3454 if (dual_source_blend) {
3455 /* only support one dual source blend target hence always use
3456 * output 1
3457 */
3458 for (unsigned chan = 0; chan < TGSI_NUM_CHANNELS; ++chan) {
3459 ptr = LLVMBuildGEP2(builder, fs_vec_type,
3460 color_store[1][chan],
3461 &sindexi, 1, "");
3462 fs_out_color[s][1][chan][i] = ptr;
3463 }
3464 }
3465 }
3466 }
3467 }
3468
3469 lp_bld_llvm_sampler_soa_destroy(sampler);
3470 lp_bld_llvm_image_soa_destroy(image);
3471
3472 /* Loop over color outputs / color buffers to do blending */
3473 for (unsigned cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
3474 if (key->cbuf_format[cbuf] != PIPE_FORMAT_NONE &&
3475 (key->blend.rt[cbuf].blend_enable || key->blend.logicop_enable ||
3476 find_output_by_frag_result(nir, FRAG_RESULT_DATA0 + cbuf) != -1)) {
3477 LLVMValueRef color_ptr;
3478 LLVMValueRef stride;
3479 LLVMValueRef sample_stride = NULL;
3480 LLVMValueRef index = lp_build_const_int32(gallivm, cbuf);
3481
3482 bool do_branch = ((key->depth.enabled
3483 || key->stencil[0].enabled
3484 || key->alpha.enabled)
3485 && !nir->info.fs.uses_discard);
3486
3487 color_ptr = LLVMBuildLoad2(builder, int8p_type,
3488 LLVMBuildGEP2(builder, int8p_type, color_ptr_ptr,
3489 &index, 1, ""),
3490 "");
3491
3492 stride = LLVMBuildLoad2(builder, int32_type,
3493 LLVMBuildGEP2(builder, int32_type, stride_ptr,
3494 &index, 1, ""),
3495 "");
3496
3497 if (key->cbuf_nr_samples[cbuf] > 1)
3498 sample_stride = LLVMBuildLoad2(builder, int32_type,
3499 LLVMBuildGEP2(builder,
3500 int32_type,
3501 color_sample_stride_ptr,
3502 &index, 1, ""), "");
3503
3504 for (unsigned s = 0; s < key->cbuf_nr_samples[cbuf]; s++) {
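/*
 * Without multisampling every color sample shares mask 0; likewise with
 * min_samples == 1 every sample reads the single set of shader outputs.
 */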
3505 unsigned mask_idx = num_fs * (key->multisample ? s : 0);
3506 unsigned out_idx = key->min_samples == 1 ? 0 : s;
3507 LLVMValueRef out_ptr = color_ptr;
3508
3509 if (sample_stride) {
3510 LLVMValueRef sample_offset =
3511 LLVMBuildMul(builder, sample_stride,
3512 lp_build_const_int32(gallivm, s), "");
3513 out_ptr = LLVMBuildGEP2(builder, int8_type, out_ptr, &sample_offset, 1, "");
3514 }
3515 out_ptr = LLVMBuildBitCast(builder, out_ptr,
3516 LLVMPointerType(blend_vec_type, 0), "");
3517
3518 lp_build_name(out_ptr, "color_ptr%d", cbuf);
3519
3520 generate_unswizzled_blend(gallivm, cbuf, variant,
3521 key->cbuf_format[cbuf],
3522 num_fs, fs_type, &fs_mask[mask_idx],
3523 fs_out_color[out_idx],
3524 variant->jit_context_type,
3525 context_ptr, blend_vec_type, out_ptr, stride,
3526 partial_mask, do_branch);
3527 }
3528 }
3529 }
3530
3531 LLVMBuildRetVoid(builder);
3532
3533 gallivm_verify_function(gallivm, function);
3534 }
3535
3536
3537 static void
3538 dump_fs_variant_key(struct lp_fragment_shader_variant_key *key)
3539 {
3540 debug_printf("fs variant %p:\n", (void *) key);
3541
3542 if (key->flatshade) {
3543 debug_printf("flatshade = 1\n");
3544 }
3545 if (key->depth_clamp)
3546 debug_printf("depth_clamp = 1\n");
3547
3548 if (key->restrict_depth_values)
3549 debug_printf("restrict_depth_values = 1\n");
3550
3551 if (key->multisample) {
3552 debug_printf("multisample = 1\n");
3553 debug_printf("coverage samples = %d\n", key->coverage_samples);
3554 debug_printf("min samples = %d\n", key->min_samples);
3555 }
3556 for (unsigned i = 0; i < key->nr_cbufs; ++i) {
3557 debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
3558 debug_printf("cbuf nr_samples[%u] = %d\n", i, key->cbuf_nr_samples[i]);
3559 }
3560 if (key->depth.enabled || key->stencil[0].enabled) {
3561 debug_printf("depth.format = %s\n", util_format_name(key->zsbuf_format));
3562 debug_printf("depth nr_samples = %d\n", key->zsbuf_nr_samples);
3563 }
3564 if (key->depth.enabled) {
3565 debug_printf("depth.func = %s\n", util_str_func(key->depth.func, true));
3566 debug_printf("depth.writemask = %u\n", key->depth.writemask);
3567 }
3568
3569 for (unsigned i = 0; i < 2; ++i) {
3570 if (key->stencil[i].enabled) {
3571 debug_printf("stencil[%u].func = %s\n", i, util_str_func(key->stencil[i].func, true));
3572 debug_printf("stencil[%u].fail_op = %s\n", i, util_str_stencil_op(key->stencil[i].fail_op, true));
3573 debug_printf("stencil[%u].zpass_op = %s\n", i, util_str_stencil_op(key->stencil[i].zpass_op, true));
3574 debug_printf("stencil[%u].zfail_op = %s\n", i, util_str_stencil_op(key->stencil[i].zfail_op, true));
3575 debug_printf("stencil[%u].valuemask = 0x%x\n", i, key->stencil[i].valuemask);
3576 debug_printf("stencil[%u].writemask = 0x%x\n", i, key->stencil[i].writemask);
3577 }
3578 }
3579
3580 if (key->alpha.enabled) {
3581 debug_printf("alpha.func = %s\n", util_str_func(key->alpha.func, true));
3582 }
3583
3584 if (key->occlusion_count) {
3585 debug_printf("occlusion_count = 1\n");
3586 }
3587
3588 if (key->blend.logicop_enable) {
3589 debug_printf("blend.logicop_func = %s\n", util_str_logicop(key->blend.logicop_func, true));
3590 } else if (key->blend.rt[0].blend_enable) {
3591 debug_printf("blend.rgb_func = %s\n", util_str_blend_func (key->blend.rt[0].rgb_func, true));
3592 debug_printf("blend.rgb_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_src_factor, true));
3593 debug_printf("blend.rgb_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].rgb_dst_factor, true));
3594 debug_printf("blend.alpha_func = %s\n", util_str_blend_func (key->blend.rt[0].alpha_func, true));
3595 debug_printf("blend.alpha_src_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_src_factor, true));
3596 debug_printf("blend.alpha_dst_factor = %s\n", util_str_blend_factor(key->blend.rt[0].alpha_dst_factor, true));
3597 }
3598 debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
3599 if (key->blend.alpha_to_coverage) {
3600 debug_printf("blend.alpha_to_coverage is enabled\n");
3601 }
3602 for (unsigned i = 0; i < key->nr_samplers; ++i) {
3603 const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3604 const struct lp_static_sampler_state *sampler = &samplers[i].sampler_state;
3605 debug_printf("sampler[%u] = \n", i);
3606 debug_printf(" .wrap = %s %s %s\n",
3607 util_str_tex_wrap(sampler->wrap_s, true),
3608 util_str_tex_wrap(sampler->wrap_t, true),
3609 util_str_tex_wrap(sampler->wrap_r, true));
3610 debug_printf(" .min_img_filter = %s\n",
3611 util_str_tex_filter(sampler->min_img_filter, true));
3612 debug_printf(" .min_mip_filter = %s\n",
3613 util_str_tex_mipfilter(sampler->min_mip_filter, true));
3614 debug_printf(" .mag_img_filter = %s\n",
3615 util_str_tex_filter(sampler->mag_img_filter, true));
3616 if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE)
3617 debug_printf(" .compare_func = %s\n", util_str_func(sampler->compare_func, true));
3618 debug_printf(" .normalized_coords = %u\n", sampler->normalized_coords);
3619 debug_printf(" .min_max_lod_equal = %u\n", sampler->min_max_lod_equal);
3620 debug_printf(" .lod_bias_non_zero = %u\n", sampler->lod_bias_non_zero);
3621 debug_printf(" .apply_min_lod = %u\n", sampler->apply_min_lod);
3622 debug_printf(" .apply_max_lod = %u\n", sampler->apply_max_lod);
3623 debug_printf(" .reduction_mode = %u\n", sampler->reduction_mode);
3624 debug_printf(" .aniso = %u\n", sampler->aniso);
3625 }
3626 for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
3627 const struct lp_sampler_static_state *samplers = lp_fs_variant_key_samplers(key);
3628 const struct lp_static_texture_state *texture = &samplers[i].texture_state;
3629 debug_printf("texture[%u] = \n", i);
3630 debug_printf(" .format = %s\n",
3631 util_format_name(texture->format));
3632 debug_printf(" .target = %s\n",
3633 util_str_tex_target(texture->target, true));
3634 debug_printf(" .level_zero_only = %u\n",
3635 texture->level_zero_only);
3636 debug_printf(" .pot = %u %u %u\n",
3637 texture->pot_width,
3638 texture->pot_height,
3639 texture->pot_depth);
3640 }
3641 struct lp_image_static_state *images = lp_fs_variant_key_images(key);
3642 for (unsigned i = 0; i < key->nr_images; ++i) {
3643 const struct lp_static_texture_state *image = &images[i].image_state;
3644 debug_printf("image[%u] = \n", i);
3645 debug_printf(" .format = %s\n",
3646 util_format_name(image->format));
3647 debug_printf(" .target = %s\n",
3648 util_str_tex_target(image->target, true));
3649 debug_printf(" .level_zero_only = %u\n",
3650 image->level_zero_only);
3651 debug_printf(" .pot = %u %u %u\n",
3652 image->pot_width,
3653 image->pot_height,
3654 image->pot_depth);
3655 }
3656 }
3657
3658
3659 const char *
3660 lp_debug_fs_kind(enum lp_fs_kind kind)
3661 {
3662 switch (kind) {
3663 case LP_FS_KIND_GENERAL:
3664 return "GENERAL";
3665 case LP_FS_KIND_BLIT_RGBA:
3666 return "BLIT_RGBA";
3667 case LP_FS_KIND_BLIT_RGB1:
3668 return "BLIT_RGB1";
3669 case LP_FS_KIND_AERO_MINIFICATION:
3670 return "AERO_MINIFICATION";
3671 case LP_FS_KIND_LLVM_LINEAR:
3672 return "LLVM_LINEAR";
3673 default:
3674 return "unknown";
3675 }
3676 }
3677
3678
3679 void
3680 lp_debug_fs_variant(struct lp_fragment_shader_variant *variant)
3681 {
3682 debug_printf("llvmpipe: Fragment shader #%u variant #%u:\n",
3683 variant->shader->no, variant->no);
3684 nir_print_shader(variant->shader->base.ir.nir, stderr);
3685 dump_fs_variant_key(&variant->key);
3686 debug_printf("variant->opaque = %u\n", variant->opaque);
3687 debug_printf("variant->potentially_opaque = %u\n", variant->potentially_opaque);
3688 debug_printf("variant->blit = %u\n", variant->blit);
3689 debug_printf("shader->kind = %s\n", lp_debug_fs_kind(variant->shader->kind));
3690 debug_printf("\n");
3691 }
3692
3693
3694 static void
3695 lp_fs_get_ir_cache_key(struct lp_fragment_shader_variant *variant,
3696 unsigned char ir_sha1_cache_key[20])
3697 {
3698 struct blob blob = { 0 };
3699 unsigned ir_size;
3700 void *ir_binary;
3701
3702 blob_init(&blob);
3703 nir_serialize(&blob, variant->shader->base.ir.nir, true);
3704 ir_binary = blob.data;
3705 ir_size = blob.size;
3706
3707 struct mesa_sha1 ctx;
3708 _mesa_sha1_init(&ctx);
3709 _mesa_sha1_update(&ctx, &variant->key, variant->shader->variant_key_size);
3710 _mesa_sha1_update(&ctx, ir_binary, ir_size);
3711 _mesa_sha1_final(&ctx, ir_sha1_cache_key);
3712
3713 blob_finish(&blob);
3714 }
3715
3716
3717 /**
3718 * Generate a new fragment shader variant from the shader code and
3719 * other state indicated by the key.
3720 */
3721 static struct lp_fragment_shader_variant *
3722 generate_variant(struct llvmpipe_context *lp,
3723 struct lp_fragment_shader *shader,
3724 const struct lp_fragment_shader_variant_key *key)
3725 {
3726 struct nir_shader *nir = shader->base.ir.nir;
3727 struct lp_fragment_shader_variant *variant =
3728 MALLOC(sizeof *variant + shader->variant_key_size - sizeof variant->key);
3729 if (!variant)
3730 return NULL;
3731
3732 memset(variant, 0, sizeof(*variant));
3733
3734 pipe_reference_init(&variant->reference, 1);
3735 lp_fs_reference(lp, &variant->shader, shader);
3736
3737 memcpy(&variant->key, key, shader->variant_key_size);
3738
3739 struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
3740 struct lp_cached_code cached = { 0 };
3741 unsigned char ir_sha1_cache_key[20];
3742 bool needs_caching = false;
3743 if (shader->base.ir.nir) {
3744 lp_fs_get_ir_cache_key(variant, ir_sha1_cache_key);
3745
3746 lp_disk_cache_find_shader(screen, &cached, ir_sha1_cache_key);
3747 if (!cached.data_size)
3748 needs_caching = true;
3749 }
3750
3751 char module_name[64];
3752 snprintf(module_name, sizeof(module_name), "fs%u_variant%u",
3753 shader->no, shader->variants_created);
3754 variant->gallivm = gallivm_create(module_name, lp->context, &cached);
3755 if (!variant->gallivm) {
3756 FREE(variant);
3757 return NULL;
3758 }
3759
3760 variant->list_item_global.base = variant;
3761 variant->list_item_local.base = variant;
3762 variant->no = shader->variants_created++;
3763
3764 /*
3765 * Determine whether we are touching all channels in the color buffer.
3766 */
3767 const struct util_format_description *cbuf0_format_desc = NULL;
3768 bool fullcolormask = false;
3769 if (key->nr_cbufs == 1) {
3770 cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
3771 fullcolormask = util_format_colormask_full(cbuf0_format_desc,
3772 key->blend.rt[0].colormask);
3773 }
3774
3775 /* The scissor is ignored here as only tiles inside the scissoring
3776 * rectangle will refer to this.
3777 */
3778 const bool no_kill =
3779 fullcolormask &&
3780 !key->stencil[0].enabled &&
3781 !key->alpha.enabled &&
3782 !key->multisample &&
3783 !key->blend.alpha_to_coverage &&
3784 !key->depth.enabled &&
3785 !nir->info.fs.uses_discard &&
3786 !(nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)) &&
3787 !nir->info.fs.uses_fbfetch_output;
3788
3789 variant->opaque =
3790 no_kill &&
3791 !key->blend.logicop_enable &&
3792 !key->blend.rt[0].blend_enable
3793 ? true : false;
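/*
 * An opaque variant overwrites every covered pixel without reading the
 * destination, which is what lets the RAST_WHOLE specialization further
 * below skip loading the color buffer.
 */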
3794
3795 variant->potentially_opaque =
3796 no_kill &&
3797 !key->blend.logicop_enable &&
3798 key->blend.rt[0].blend_enable &&
3799 key->blend.rt[0].rgb_func == PIPE_BLEND_ADD &&
3800 key->blend.rt[0].rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA &&
3801 key->blend.rt[0].alpha_func == key->blend.rt[0].rgb_func &&
3802 key->blend.rt[0].alpha_dst_factor == key->blend.rt[0].rgb_dst_factor &&
3803 shader->base.type == PIPE_SHADER_IR_TGSI &&
3804 /*
3805 * FIXME: for NIR, all of the fields of info.xxx (except info.base)
3806 * are zeros, hence shader analysis (here and elsewhere) using these
3807 * bits cannot work and will silently fail (cbuf is the only pointer
3808 * field, hence causing a crash).
3809 */
3810 shader->info.cbuf[0][3].file != TGSI_FILE_NULL
3811 ? true : false;
3812
3813 /* We only care about opaque blits for now */
3814 if (variant->opaque &&
3815 (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3816 shader->kind == LP_FS_KIND_BLIT_RGB1)) {
3817 const struct lp_sampler_static_state *samp0 =
3818 lp_fs_variant_key_sampler_idx(key, 0);
3819 assert(samp0);
3820
3821 const enum pipe_format texture_format = samp0->texture_state.format;
3822 const enum pipe_texture_target target = samp0->texture_state.target;
3823 const unsigned min_img_filter = samp0->sampler_state.min_img_filter;
3824 const unsigned mag_img_filter = samp0->sampler_state.mag_img_filter;
3825
3826 unsigned min_mip_filter;
3827 if (samp0->texture_state.level_zero_only) {
3828 min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
3829 } else {
3830 min_mip_filter = samp0->sampler_state.min_mip_filter;
3831 }
3832
3833 if (target == PIPE_TEXTURE_2D &&
3834 min_img_filter == PIPE_TEX_FILTER_NEAREST &&
3835 mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
3836 min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
3837 ((texture_format &&
3838 util_is_format_compatible(util_format_description(texture_format),
3839 cbuf0_format_desc)) ||
3840 (shader->kind == LP_FS_KIND_BLIT_RGB1 &&
3841 (texture_format == PIPE_FORMAT_B8G8R8A8_UNORM ||
3842 texture_format == PIPE_FORMAT_B8G8R8X8_UNORM) &&
3843 (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3844 key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM)))) {
3845 variant->blit = 1;
3846 }
3847 }
3848
3849 /* Determine whether this shader + pipeline state is a candidate for
3850 * the linear path.
3851 */
3852 const bool linear_pipeline =
3853 !key->stencil[0].enabled &&
3854 !key->depth.enabled &&
3855 !nir->info.fs.uses_discard &&
3856 !key->blend.logicop_enable &&
3857 (key->cbuf_format[0] == PIPE_FORMAT_B8G8R8A8_UNORM ||
3858 key->cbuf_format[0] == PIPE_FORMAT_B8G8R8X8_UNORM ||
3859 key->cbuf_format[0] == PIPE_FORMAT_R8G8B8A8_UNORM ||
3860 key->cbuf_format[0] == PIPE_FORMAT_R8G8B8X8_UNORM);
3861
3862 memcpy(&variant->key, key, sizeof *key);
3863
3864 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
3865 lp_debug_fs_variant(variant);
3866 }
3867
3868 llvmpipe_fs_variant_fastpath(variant);
3869
3870 lp_jit_init_types(variant);
3871
3872 if (variant->jit_function[RAST_EDGE_TEST] == NULL)
3873 generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
3874
3875 if (variant->jit_function[RAST_WHOLE] == NULL) {
3876 if (variant->opaque) {
3877 /* Specialized shader, which doesn't need to read the color buffer. */
3878 generate_fragment(lp, shader, variant, RAST_WHOLE);
3879 }
3880 }
3881
3882 if (linear_pipeline) {
3883 /* Currently keeping both the old fastpaths and new linear path
3884 * active. The older code is still somewhat faster for the cases
3885 * it covers.
3886 *
3887 * XXX: consider restricting this to aero-mode only.
3888 */
3889 if (fullcolormask &&
3890 !key->alpha.enabled &&
3891 !key->blend.alpha_to_coverage) {
3892 llvmpipe_fs_variant_linear_fastpath(variant);
3893 }
3894
3895 /* If the original fastpath doesn't cover this variant, try the new
3896 * code:
3897 */
3898 if (variant->jit_linear == NULL) {
3899 if (shader->kind == LP_FS_KIND_BLIT_RGBA ||
3900 shader->kind == LP_FS_KIND_BLIT_RGB1 ||
3901 shader->kind == LP_FS_KIND_LLVM_LINEAR) {
3902 llvmpipe_fs_variant_linear_llvm(lp, shader, variant);
3903 }
3904 }
3905 } else {
3906 if (LP_DEBUG & DEBUG_LINEAR) {
3907 lp_debug_fs_variant(variant);
3908 debug_printf(" ----> no linear path for this variant\n");
3909 }
3910 }
3911
3912 /*
3913 * Compile everything
3914 */
3915
3916 gallivm_compile_module(variant->gallivm);
3917
3918 variant->nr_instrs += lp_build_count_ir_module(variant->gallivm->module);
3919
3920 if (variant->function[RAST_EDGE_TEST]) {
3921 variant->jit_function[RAST_EDGE_TEST] = (lp_jit_frag_func)
3922 gallivm_jit_function(variant->gallivm,
3923 variant->function[RAST_EDGE_TEST]);
3924 }
3925
3926 if (variant->function[RAST_WHOLE]) {
3927 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3928 gallivm_jit_function(variant->gallivm,
3929 variant->function[RAST_WHOLE]);
3930 } else if (!variant->jit_function[RAST_WHOLE]) {
3931 variant->jit_function[RAST_WHOLE] = (lp_jit_frag_func)
3932 variant->jit_function[RAST_EDGE_TEST];
3933 }
3934
3935 if (linear_pipeline) {
3936 if (variant->linear_function) {
3937 variant->jit_linear_llvm = (lp_jit_linear_llvm_func)
3938 gallivm_jit_function(variant->gallivm, variant->linear_function);
3939 }
3940
3941 /*
3942 * This must be done after LLVM compilation, as it will call the JIT'ed
3943 * code to determine active inputs.
3944 */
3945 lp_linear_check_variant(variant);
3946 }
3947
3948 if (needs_caching) {
3949 lp_disk_cache_insert_shader(screen, &cached, ir_sha1_cache_key);
3950 }
3951
3952 gallivm_free_ir(variant->gallivm);
3953
3954 return variant;
3955 }
3956
3957
3958 static void *
3959 llvmpipe_create_fs_state(struct pipe_context *pipe,
3960 const struct pipe_shader_state *templ)
3961 {
3962 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
3963
3964 struct lp_fragment_shader *shader = CALLOC_STRUCT(lp_fragment_shader);
3965 if (!shader)
3966 return NULL;
3967
3968 pipe_reference_init(&shader->reference, 1);
3969 shader->no = fs_no++;
3970 list_inithead(&shader->variants.list);
3971
3972 shader->base.type = PIPE_SHADER_IR_NIR;
3973
3974 if (templ->type == PIPE_SHADER_IR_TGSI) {
3975 shader->base.ir.nir = tgsi_to_nir(templ->tokens, pipe->screen, false);
3976 } else {
3977 shader->base.ir.nir = templ->ir.nir;
3978 }
3979
3980 /* lower FRAG_RESULT_COLOR -> DATA[0-7] to correctly handle unused attachments */
3981 nir_shader *nir = shader->base.ir.nir;
3982 NIR_PASS_V(nir, nir_lower_fragcolor, nir->info.fs.color_is_dual_source ? 1 : 8);
3983
3984 nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
3985 nir_tgsi_scan_shader(nir, &shader->info.base, true);
3986 shader->info.num_texs = shader->info.base.opcode_count[TGSI_OPCODE_TEX];
3987
3988 llvmpipe_register_shader(pipe, &shader->base);
3989
3990 shader->draw_data = draw_create_fragment_shader(llvmpipe->draw, templ);
3991 if (shader->draw_data == NULL) {
3992 FREE(shader);
3993 return NULL;
3994 }
3995
3996 const int nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
3997 const int nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
3998 const int nr_images = BITSET_LAST_BIT(nir->info.images_used);
3999
4000 shader->variant_key_size = lp_fs_variant_key_size(MAX2(nr_samplers,
4001 nr_sampler_views),
4002 nr_images);
4003
4004 nir_foreach_shader_in_variable(var, nir) {
4005 unsigned idx = var->data.driver_location;
4006 unsigned slots = nir_variable_count_slots(var, var->type);
4007
4008 if (var->data.centroid)
4009 shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_CENTROID;
4010 if (var->data.sample)
4011 shader->inputs[idx].location = TGSI_INTERPOLATE_LOC_SAMPLE;
4012
4013 enum glsl_base_type base_type =
4014 glsl_get_base_type(glsl_without_array(var->type));
4015 switch (var->data.interpolation) {
4016 case INTERP_MODE_NONE:
4017 if (glsl_base_type_is_integer(base_type) || var->data.per_primitive) {
4018 shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4019 break;
4020 }
4021 if (var->data.location == VARYING_SLOT_COL0 ||
4022 var->data.location == VARYING_SLOT_COL1) {
4023 shader->inputs[idx].interp = LP_INTERP_COLOR;
4024 break;
4025 }
4026 FALLTHROUGH;
4027 case INTERP_MODE_SMOOTH:
4028 shader->inputs[idx].interp = LP_INTERP_PERSPECTIVE;
4029 break;
4030 case INTERP_MODE_NOPERSPECTIVE:
4031 shader->inputs[idx].interp = LP_INTERP_LINEAR;
4032 break;
4033 case INTERP_MODE_FLAT:
4034 shader->inputs[idx].interp = LP_INTERP_CONSTANT;
4035 break;
4036 }
4037
4038 /* XXX this is a completely pointless index map... */
4039 shader->inputs[idx].src_index = idx + 1;
4040 if (var->data.location == VARYING_SLOT_FACE)
4041 shader->inputs[idx].interp = LP_INTERP_FACING;
4042 else if (var->data.location == VARYING_SLOT_POS) {
4043 shader->inputs[idx].src_index = 0;
4044 shader->inputs[idx].interp = LP_INTERP_POSITION;
4045 }
4046
4047 shader->inputs[idx].usage_mask = shader->info.base.input_usage_mask[idx];
4048 for (unsigned s = 1; s < slots; s++) {
4049 shader->inputs[idx + s] = shader->inputs[idx];
4050 shader->inputs[idx + s].src_index = idx + s + 1;
4051 shader->inputs[idx + s].usage_mask = shader->info.base.input_usage_mask[idx + s];
4052 }
4053 }
4054
4055 llvmpipe_fs_analyse_nir(shader);
4056
4057 return shader;
4058 }
4059
4060
4061 static void
4062 llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
4063 {
4064 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4065 struct lp_fragment_shader *lp_fs = (struct lp_fragment_shader *)fs;
4066 if (llvmpipe->fs == lp_fs)
4067 return;
4068
4069 draw_bind_fragment_shader(llvmpipe->draw,
4070 (lp_fs ? lp_fs->draw_data : NULL));
4071
4072 lp_fs_reference(llvmpipe, &llvmpipe->fs, lp_fs);
4073
4074 /* invalidate the setup link, NEW_FS will make it update */
4075 lp_setup_set_fs_variant(llvmpipe->setup, NULL);
4076 llvmpipe->dirty |= LP_NEW_FS;
4077 }
4078
4079
4080 /**
4081 * Remove shader variant from two lists: the shader's variant list
4082 * and the context's variant list.
4083 */
4084 static void
4085 llvmpipe_remove_shader_variant(struct llvmpipe_context *lp,
4086 struct lp_fragment_shader_variant *variant)
4087 {
4088 if ((LP_DEBUG & DEBUG_FS) || (gallivm_debug & GALLIVM_DEBUG_IR)) {
4089 debug_printf("llvmpipe: del fs #%u var %u v created %u v cached %u "
4090 "v total cached %u inst %u total inst %u\n",
4091 variant->shader->no, variant->no,
4092 variant->shader->variants_created,
4093 variant->shader->variants_cached,
4094 lp->nr_fs_variants, variant->nr_instrs, lp->nr_fs_instrs);
4095 }
4096
4097 /* remove from shader's list */
4098 list_del(&variant->list_item_local.list);
4099 variant->shader->variants_cached--;
4100
4101 /* remove from context's list */
4102 list_del(&variant->list_item_global.list);
4103 lp->nr_fs_variants--;
4104 lp->nr_fs_instrs -= variant->nr_instrs;
4105 }
4106
4107
4108 void
4109 llvmpipe_destroy_shader_variant(struct llvmpipe_context *lp,
4110 struct lp_fragment_shader_variant *variant)
4111 {
4112 gallivm_destroy(variant->gallivm);
4113 lp_fs_reference(lp, &variant->shader, NULL);
4114 FREE(variant);
4115 }
4116
4117
4118 void
4119 llvmpipe_destroy_fs(struct llvmpipe_context *llvmpipe,
4120 struct lp_fragment_shader *shader)
4121 {
4122 /* Delete draw module's data */
4123 draw_delete_fragment_shader(llvmpipe->draw, shader->draw_data);
4124
4125 ralloc_free(shader->base.ir.nir);
4126 assert(shader->variants_cached == 0);
4127 FREE(shader);
4128 }
4129
4130
4131 static void
4132 llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
4133 {
4134 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4135 struct lp_fragment_shader *shader = fs;
4136 struct lp_fs_variant_list_item *li, *next;
4137
4138 /* Delete all the variants */
4139 LIST_FOR_EACH_ENTRY_SAFE(li, next, &shader->variants.list, list) {
4140 struct lp_fragment_shader_variant *variant;
4141 variant = li->base;
4142 llvmpipe_remove_shader_variant(llvmpipe, li->base);
4143 lp_fs_variant_reference(llvmpipe, &variant, NULL);
4144 }
4145
4146 lp_fs_reference(llvmpipe, &shader, NULL);
4147 }
4148
4149
4150 static void
4151 llvmpipe_set_constant_buffer(struct pipe_context *pipe,
4152 enum pipe_shader_type shader, uint index,
4153 bool take_ownership,
4154 const struct pipe_constant_buffer *cb)
4155 {
4156 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4157 struct pipe_constant_buffer *constants = &llvmpipe->constants[shader][index];
4158
4159 assert(shader < PIPE_SHADER_MESH_TYPES);
4160 assert(index < ARRAY_SIZE(llvmpipe->constants[shader]));
4161
4162 /* note: reference counting */
4163 util_copy_constant_buffer(&llvmpipe->constants[shader][index], cb,
4164 take_ownership);
4165
4166 /* user_buffer is only valid until the next set_constant_buffer (at most,
4167 * possibly until shader deletion), so we need to upload it now to make
4168 * sure it doesn't get updated/freed out from under us.
4169 */
4170 if (constants->user_buffer) {
4171 u_upload_data(llvmpipe->pipe.const_uploader, 0, constants->buffer_size,
4172 16, constants->user_buffer, &constants->buffer_offset,
4173 &constants->buffer);
4174 }
4175 if (constants->buffer) {
4176 if (!(constants->buffer->bind & PIPE_BIND_CONSTANT_BUFFER)) {
4177 debug_printf("Illegal set constant without bind flag\n");
4178 constants->buffer->bind |= PIPE_BIND_CONSTANT_BUFFER;
4179 }
4180 llvmpipe_flush_resource(pipe, constants->buffer, 0, true, true, false, "set_constant_buffer");
4181 }
4182
4183 switch (shader) {
4184 case PIPE_SHADER_VERTEX:
4185 case PIPE_SHADER_GEOMETRY:
4186 case PIPE_SHADER_TESS_CTRL:
4187 case PIPE_SHADER_TESS_EVAL: {
4188 const unsigned size = cb ? cb->buffer_size : 0;
4189
4190 const uint8_t *data = NULL;
4191 if (constants->buffer) {
4192 data = (uint8_t *) llvmpipe_resource_data(constants->buffer)
4193 + constants->buffer_offset;
4194 }
4195
4196 draw_set_mapped_constant_buffer(llvmpipe->draw, shader,
4197 index, data, size);
4198 break;
4199 }
4200 case PIPE_SHADER_COMPUTE:
4201 llvmpipe->cs_dirty |= LP_CSNEW_CONSTANTS;
4202 break;
4203 case PIPE_SHADER_FRAGMENT:
4204 llvmpipe->dirty |= LP_NEW_FS_CONSTANTS;
4205 break;
4206 case PIPE_SHADER_TASK:
4207 llvmpipe->dirty |= LP_NEW_TASK_CONSTANTS;
4208 break;
4209 case PIPE_SHADER_MESH:
4210 llvmpipe->dirty |= LP_NEW_MESH_CONSTANTS;
4211 break;
4212 default:
4213 unreachable("Illegal shader type");
4214 break;
4215 }
4216 }
4217
4218
4219 static void
4220 llvmpipe_set_shader_buffers(struct pipe_context *pipe,
4221 enum pipe_shader_type shader, unsigned start_slot,
4222 unsigned count,
4223 const struct pipe_shader_buffer *buffers,
4224 unsigned writable_bitmask)
4225 {
4226 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4227
4228 unsigned i, idx;
4229 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4230 const struct pipe_shader_buffer *buffer = buffers ? &buffers[idx] : NULL;
4231
4232 util_copy_shader_buffer(&llvmpipe->ssbos[shader][i], buffer);
4233
4234 if (buffer && buffer->buffer) {
4235 bool read_only = !(writable_bitmask & (1 << idx));
4236 llvmpipe_flush_resource(pipe, buffer->buffer, 0, read_only, false,
4237 false, "buffer");
4238 }
4239
4240 switch (shader) {
4241 case PIPE_SHADER_VERTEX:
4242 case PIPE_SHADER_GEOMETRY:
4243 case PIPE_SHADER_TESS_CTRL:
4244 case PIPE_SHADER_TESS_EVAL: {
4245 const unsigned size = buffer ? buffer->buffer_size : 0;
4246 const uint8_t *data = NULL;
4247 if (buffer && buffer->buffer)
4248 data = (uint8_t *) llvmpipe_resource_data(buffer->buffer);
4249 if (data)
4250 data += buffer->buffer_offset;
4251 draw_set_mapped_shader_buffer(llvmpipe->draw, shader,
4252 i, data, size);
4253 break;
4254 }
4255 case PIPE_SHADER_COMPUTE:
4256 llvmpipe->cs_dirty |= LP_CSNEW_SSBOS;
4257 break;
4258 case PIPE_SHADER_TASK:
4259 llvmpipe->dirty |= LP_NEW_TASK_SSBOS;
4260 break;
4261 case PIPE_SHADER_MESH:
4262 llvmpipe->dirty |= LP_NEW_MESH_SSBOS;
4263 break;
4264 case PIPE_SHADER_FRAGMENT:
4265 llvmpipe->fs_ssbo_write_mask &= ~(((1 << count) - 1) << start_slot);
4266 llvmpipe->fs_ssbo_write_mask |= writable_bitmask << start_slot;
4267 llvmpipe->dirty |= LP_NEW_FS_SSBOS;
4268 break;
4269 default:
4270 unreachable("Illegal shader type");
4271 break;
4272 }
4273 }
4274 }
4275
4276
4277 static void
4278 llvmpipe_set_shader_images(struct pipe_context *pipe,
4279 enum pipe_shader_type shader, unsigned start_slot,
4280 unsigned count, unsigned unbind_num_trailing_slots,
4281 const struct pipe_image_view *images)
4282 {
4283 struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
4284 unsigned i, idx;
4285
4286 draw_flush(llvmpipe->draw);
4287 for (i = start_slot, idx = 0; i < start_slot + count; i++, idx++) {
4288 const struct pipe_image_view *image = images ? &images[idx] : NULL;
4289
4290 util_copy_image_view(&llvmpipe->images[shader][i], image);
4291
4292 if (image && image->resource) {
4293 bool read_only = !(image->access & PIPE_IMAGE_ACCESS_WRITE);
4294 llvmpipe_flush_resource(pipe, image->resource, 0, read_only, false,
4295 false, "image");
4296 }
4297 }
4298
4299 llvmpipe->num_images[shader] = start_slot + count;
4300 switch (shader) {
4301 case PIPE_SHADER_VERTEX:
4302 case PIPE_SHADER_GEOMETRY:
4303 case PIPE_SHADER_TESS_CTRL:
4304 case PIPE_SHADER_TESS_EVAL:
4305 draw_set_images(llvmpipe->draw, shader, llvmpipe->images[shader],
4306 start_slot + count);
4307 break;
4308 case PIPE_SHADER_COMPUTE:
4309 llvmpipe->cs_dirty |= LP_CSNEW_IMAGES;
4310 break;
4311 case PIPE_SHADER_FRAGMENT:
4312 llvmpipe->dirty |= LP_NEW_FS_IMAGES;
4313 break;
4314 case PIPE_SHADER_TASK:
4315 llvmpipe->dirty |= LP_NEW_TASK_IMAGES;
4316 break;
4317 case PIPE_SHADER_MESH:
4318 llvmpipe->dirty |= LP_NEW_MESH_IMAGES;
4319 break;
4320 default:
4321 unreachable("Illegal shader type");
4322 break;
4323 }
4324
4325 if (unbind_num_trailing_slots) {
4326 llvmpipe_set_shader_images(pipe, shader, start_slot + count,
4327 unbind_num_trailing_slots, 0, NULL);
4328 }
4329 }
4330
4331
4332 /**
4333 * Return the blend factor equivalent to a destination alpha of one.
4334 */
4335 static inline enum pipe_blendfactor
4336 force_dst_alpha_one(enum pipe_blendfactor factor, bool clamped_zero)
4337 {
4338 switch (factor) {
4339 case PIPE_BLENDFACTOR_DST_ALPHA:
4340 return PIPE_BLENDFACTOR_ONE;
4341 case PIPE_BLENDFACTOR_INV_DST_ALPHA:
4342 return PIPE_BLENDFACTOR_ZERO;
4343 case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
4344 if (clamped_zero)
4345 return PIPE_BLENDFACTOR_ZERO;
4346 else
4347 return PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE;
4348 default:
4349 return factor;
4350 }
4351 }
4352
4353
4354 /**
4355 * We need to generate several variants of the fragment pipeline to match
4356 * all the combinations of the contributing state atoms.
4357 *
4358 * TODO: there is actually no reason to tie this to context state -- the
4359 * generated code could be cached globally in the screen.
4360 */
4361 static struct lp_fragment_shader_variant_key *
4362 make_variant_key(struct llvmpipe_context *lp,
4363 struct lp_fragment_shader *shader,
4364 char *store)
4365 {
4366 struct lp_fragment_shader_variant_key *key =
4367 (struct lp_fragment_shader_variant_key *)store;
4368 struct nir_shader *nir = shader->base.ir.nir;
4369
4370 memset(key, 0, sizeof(*key));
4371
4372 if (lp->framebuffer.zsbuf) {
4373 const enum pipe_format zsbuf_format = lp->framebuffer.zsbuf->format;
4374 const struct util_format_description *zsbuf_desc =
4375 util_format_description(zsbuf_format);
4376
4377 if (lp->depth_stencil->depth_enabled &&
4378 util_format_has_depth(zsbuf_desc)) {
4379 key->zsbuf_format = zsbuf_format;
4380 key->depth.enabled = lp->depth_stencil->depth_enabled;
4381 key->depth.writemask = lp->depth_stencil->depth_writemask;
4382 key->depth.func = lp->depth_stencil->depth_func;
4383 }
4384 if (lp->depth_stencil->stencil[0].enabled &&
4385 util_format_has_stencil(zsbuf_desc)) {
4386 key->zsbuf_format = zsbuf_format;
4387 memcpy(&key->stencil, &lp->depth_stencil->stencil,
4388 sizeof key->stencil);
4389 }
4390 if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
4391 key->resource_1d = true;
4392 }
4393 key->zsbuf_nr_samples =
4394 util_res_sample_count(lp->framebuffer.zsbuf->texture);
4395
4396 /*
4397 * Restrict depth values if the API clamps them (GL, VK with ext)
4398 * and the Z buffer is not a float format.
4399 */
4400 key->restrict_depth_values =
4401 !(lp->rasterizer->unclamped_fragment_depth_values &&
4402 util_format_get_depth_only(zsbuf_format) == PIPE_FORMAT_Z32_FLOAT);
4403 }
4404
4405 /*
4406 * Propagate the depth clamp setting from the rasterizer state.
4407 */
4408 key->depth_clamp = lp->rasterizer->depth_clamp;
4409
4410 /* alpha test only applies if render buffer 0 is non-integer
4411 * (or does not exist)
4412 */
4413 if (!lp->framebuffer.nr_cbufs ||
4414 !lp->framebuffer.cbufs[0] ||
4415 !util_format_is_pure_integer(lp->framebuffer.cbufs[0]->format)) {
4416 key->alpha.enabled = lp->depth_stencil->alpha_enabled;
4417 }
4418 if (key->alpha.enabled) {
4419 key->alpha.func = lp->depth_stencil->alpha_func;
4420 /* alpha.ref_value is passed in jit_context */
4421 }
4422
4423 key->flatshade = lp->rasterizer->flatshade;
4424 key->multisample = lp->rasterizer->multisample;
4425 key->no_ms_sample_mask_out = lp->rasterizer->no_ms_sample_mask_out;
4426 if (lp->active_occlusion_queries && !lp->queries_disabled) {
4427 key->occlusion_count = true;
4428 }
4429
4430 memcpy(&key->blend, lp->blend, sizeof key->blend);
4431
4432 key->coverage_samples = 1;
4433 key->min_samples = 1;
4434 if (key->multisample) {
4435 key->coverage_samples =
4436 util_framebuffer_get_num_samples(&lp->framebuffer);
4437 /* Per EXT_shader_framebuffer_fetch spec:
4438 *
4439 * "1. How is framebuffer data treated during multisample rendering?
4440 *
4441 * RESOLVED: Reading the value of gl_LastFragData produces a
4442 * different result for each sample. This implies that all or part
4443 * of the shader be run once for each sample, but has no additional
4444 * implications on fragment shader input variables which may still
4445 * be interpolated per pixel by the implementation."
4446 *
4447 * ARM_shader_framebuffer_fetch_depth_stencil spec further says:
4448 *
4449 * "(1) When multisampling is enabled, does the shader run per sample?
4450 *
4451 * RESOLVED.
4452 *
4453 * This behavior is inherited from either
4454 * EXT_shader_framebuffer_fetch or ARM_shader_framebuffer_fetch as
4455 * described in the interactions section. If neither extension is
4456 * supported, the shader runs once per fragment."
4457 *
4458 * Therefore we should always enable per-sample shading when FB fetch is
4459 * used.
4460 */
4461 if (lp->min_samples > 1 || nir->info.fs.uses_fbfetch_output)
4462 key->min_samples = key->coverage_samples;
4463 }
4464 key->nr_cbufs = lp->framebuffer.nr_cbufs;
4465
4466 if (!key->blend.independent_blend_enable) {
4467 // we always need independent blend otherwise the fixups below won't work
4468 for (unsigned i = 1; i < key->nr_cbufs; i++) {
4469 memcpy(&key->blend.rt[i], &key->blend.rt[0],
4470 sizeof(key->blend.rt[0]));
4471 }
4472 key->blend.independent_blend_enable = 1;
4473 }
4474
4475 for (unsigned i = 0; i < lp->framebuffer.nr_cbufs; i++) {
4476 struct pipe_rt_blend_state *blend_rt = &key->blend.rt[i];
4477
4478 if (lp->framebuffer.cbufs[i]) {
4479 const enum pipe_format format = lp->framebuffer.cbufs[i]->format;
4480
4481 key->cbuf_format[i] = format;
4482 key->cbuf_nr_samples[i] =
4483 util_res_sample_count(lp->framebuffer.cbufs[i]->texture);
4484
4485 /*
4486 * Figure out if this is a 1d resource. Note that OpenGL allows crazy
4487 * mixing of 2d textures with height 1 and 1d textures, so make sure
4488 * we pick 1d if any cbuf or zsbuf is 1d.
4489 */
4490 if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[i]->texture)) {
4491 key->resource_1d = true;
4492 }
4493
4494 const struct util_format_description *format_desc =
4495 util_format_description(format);
4496 assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
4497 format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
4498
4499 /*
4500 * Mask out color channels not present in the color buffer.
4501 */
4502 blend_rt->colormask &= util_format_colormask(format_desc);
4503
4504 /*
4505 * Disable blend for integer formats.
4506 */
4507 if (util_format_is_pure_integer(format)) {
4508 blend_rt->blend_enable = 0;
4509 }
4510
4511 /*
4512 * Our swizzled render tiles always have an alpha channel, but the
4513 * linear render target format often does not, so force here the dst
4514 * alpha to be one.
4515 *
4516 * This is not a mere optimization. Wrong results will be produced if
4517 * the dst alpha is used, the dst format does not have alpha, and the
4518 * previous rendering was not flushed from the swizzled to linear
4519 * buffer. For example, NonPowTwo DCT.
4520 *
4521 * TODO: This should be generalized to all channels for better
4522 * performance, but only alpha causes correctness issues.
4523 *
4524 * Also, force rgb/alpha func/factors match, to make AoS blending
4525 * easier.
4526 */
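/*
 * Illustrative example: with a PIPE_FORMAT_B8G8R8X8_UNORM render target,
 * PIPE_BLENDFACTOR_DST_ALPHA becomes PIPE_BLENDFACTOR_ONE and
 * PIPE_BLENDFACTOR_INV_DST_ALPHA becomes PIPE_BLENDFACTOR_ZERO below.
 */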
4527 if (format_desc->swizzle[3] > PIPE_SWIZZLE_W ||
4528 format_desc->swizzle[3] == format_desc->swizzle[0]) {
4529 // Doesn't cover mixed snorm/unorm but can't render to them anyway
4530 bool clamped_zero = !util_format_is_float(format) &&
4531 !util_format_is_snorm(format);
4532 blend_rt->rgb_src_factor =
4533 force_dst_alpha_one(blend_rt->rgb_src_factor, clamped_zero);
4534 blend_rt->rgb_dst_factor =
4535 force_dst_alpha_one(blend_rt->rgb_dst_factor, clamped_zero);
4536 blend_rt->alpha_func = blend_rt->rgb_func;
4537 blend_rt->alpha_src_factor = blend_rt->rgb_src_factor;
4538 blend_rt->alpha_dst_factor = blend_rt->rgb_dst_factor;
4539 }
4540 } else {
4541 /* no color buffer for this fragment output */
4542 key->cbuf_format[i] = PIPE_FORMAT_NONE;
4543 key->cbuf_nr_samples[i] = 0;
4544 blend_rt->colormask = 0x0;
4545 blend_rt->blend_enable = 0;
4546 }
4547 }
4548
4549 /* This value will be the same for all the variants of a given shader:
4550 */
4551 key->nr_samplers = BITSET_LAST_BIT(nir->info.samplers_used);
4552 key->nr_sampler_views = BITSET_LAST_BIT(nir->info.textures_used);
4553
4554 struct lp_sampler_static_state *fs_sampler =
4555 lp_fs_variant_key_samplers(key);
4556
4557 memset(fs_sampler, 0,
4558 MAX2(key->nr_samplers, key->nr_sampler_views) * sizeof *fs_sampler);
4559
4560 for (unsigned i = 0; i < key->nr_samplers; ++i) {
4561 if (BITSET_TEST(nir->info.samplers_used, i)) {
4562 lp_sampler_static_sampler_state(&fs_sampler[i].sampler_state,
4563 lp->samplers[PIPE_SHADER_FRAGMENT][i]);
4564 }
4565 }
4566
4567 /*
4568 * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes
4569 * are dx10-style? Can't really have mixed opcodes, at least not
4570 * if we want to skip the holes here (without rescanning tgsi).
4571 */
4572 if (key->nr_sampler_views) {
4573 for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4574 /*
4575 * Note sview may exceed what's representable by file_mask.
4576 * This will still work; the only downside is that views which are
4577 * not actually used may be included in the shader key.
4578 */
4579 if (BITSET_TEST(nir->info.textures_used, i)) {
4580 lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4581 lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4582 }
4583 }
4584 } else {
4585 key->nr_sampler_views = key->nr_samplers;
4586 for (unsigned i = 0; i < key->nr_sampler_views; ++i) {
4587 if (BITSET_TEST(nir->info.samplers_used, i)) {
4588 lp_sampler_static_texture_state(&fs_sampler[i].texture_state,
4589 lp->sampler_views[PIPE_SHADER_FRAGMENT][i]);
4590 }
4591 }
4592 }
4593
4594 struct lp_image_static_state *lp_image = lp_fs_variant_key_images(key);
4595 key->nr_images = BITSET_LAST_BIT(nir->info.images_used);
4596 if (key->nr_images)
4597 memset(lp_image, 0,
4598 key->nr_images * sizeof *lp_image);
4599 for (unsigned i = 0; i < key->nr_images; ++i) {
4600 if (BITSET_TEST(nir->info.images_used, i)) {
4601 lp_sampler_static_texture_state_image(&lp_image[i].image_state,
4602 &lp->images[PIPE_SHADER_FRAGMENT][i]);
4603 }
4604 }
4605
4606 if (shader->kind == LP_FS_KIND_AERO_MINIFICATION) {
4607 struct lp_sampler_static_state *samp0 =
4608 lp_fs_variant_key_sampler_idx(key, 0);
4609 assert(samp0);
4610 samp0->sampler_state.min_img_filter = PIPE_TEX_FILTER_NEAREST;
4611 samp0->sampler_state.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
4612 }
4613
4614 return key;
4615 }
4616
4617
4618 /**
4619 * Update fragment shader state. This is called just prior to drawing
4620 * something when some fragment-related state has changed.
4621 */
4622 void
4623 llvmpipe_update_fs(struct llvmpipe_context *lp)
4624 {
4625 struct lp_fragment_shader *shader = lp->fs;
4626
4627 char store[LP_FS_MAX_VARIANT_KEY_SIZE];
4628 const struct lp_fragment_shader_variant_key *key =
4629 make_variant_key(lp, shader, store);
4630
4631 struct lp_fragment_shader_variant *variant = NULL;
4632 struct lp_fs_variant_list_item *li;
4633 /* Search the variants for one which matches the key */
4634 LIST_FOR_EACH_ENTRY(li, &shader->variants.list, list) {
4635 if (memcmp(&li->base->key, key, shader->variant_key_size) == 0) {
4636 variant = li->base;
4637 break;
4638 }
4639 }
4640
4641 if (variant) {
4642 /* Move this variant to the head of the list to implement LRU
4643 * deletion of shaders when we have too many.
4644 */
4645 list_move_to(&variant->list_item_global.list, &lp->fs_variants_list.list);
4646 } else {
4647 /* variant not found, create it now */
4648
4649 if (LP_DEBUG & DEBUG_FS) {
4650 debug_printf("%u variants,\t%u instrs,\t%u instrs/variant\n",
4651 lp->nr_fs_variants,
4652 lp->nr_fs_instrs,
4653 lp->nr_fs_variants ? lp->nr_fs_instrs / lp->nr_fs_variants : 0);
4654 }
4655
4656 /* First, check if we've exceeded the max number of shader variants.
4657 * If so, free 6.25% of them (the least recently used ones).
4658 */
4659 const unsigned variants_to_cull =
4660 lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS
4661 ? LP_MAX_SHADER_VARIANTS / 16 : 0;
4662
4663 if (variants_to_cull ||
4664 lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS) {
4665 if (gallivm_debug & GALLIVM_DEBUG_PERF) {
4666 debug_printf("Evicting FS: %u fs variants,\t%u total variants,"
4667 "\t%u instrs,\t%u instrs/variant\n",
4668 shader->variants_cached,
4669 lp->nr_fs_variants, lp->nr_fs_instrs,
4670 lp->nr_fs_instrs / lp->nr_fs_variants);
4671 }
4672
4673 /*
4674 * We need to re-check lp->nr_fs_variants because an arbitrarily
4675 * large number of shader variants (potentially all of them) could
4676 * be pending for destruction on flush.
4677 */
4678
4679 for (unsigned i = 0;
4680 i < variants_to_cull ||
4681 lp->nr_fs_instrs >= LP_MAX_SHADER_INSTRUCTIONS;
4682 i++) {
4683 struct lp_fs_variant_list_item *item;
4684 if (list_is_empty(&lp->fs_variants_list.list)) {
4685 break;
4686 }
4687 item = list_last_entry(&lp->fs_variants_list.list,
4688 struct lp_fs_variant_list_item, list);
4689 assert(item);
4690 assert(item->base);
4691 llvmpipe_remove_shader_variant(lp, item->base);
4692 struct lp_fragment_shader_variant *variant = item->base;
4693 lp_fs_variant_reference(lp, &variant, NULL);
4694 }
4695 }
4696
4697 /*
4698 * Generate the new variant.
4699 */
4700 int64_t t0 = os_time_get();
4701 variant = generate_variant(lp, shader, key);
4702 int64_t t1 = os_time_get();
4703 int64_t dt = t1 - t0;
4704 LP_COUNT_ADD(llvm_compile_time, dt);
4705 LP_COUNT_ADD(nr_llvm_compiles, 2); /* emit vs. omit in/out test */
4706
4707 /* Put the new variant into the list */
4708 if (variant) {
4709 list_add(&variant->list_item_local.list, &shader->variants.list);
4710 list_add(&variant->list_item_global.list, &lp->fs_variants_list.list);
4711 lp->nr_fs_variants++;
4712 lp->nr_fs_instrs += variant->nr_instrs;
4713 shader->variants_cached++;
4714 }
4715 }
4716
4717 /* Bind this variant */
4718 lp_setup_set_fs_variant(lp->setup, variant);
4719 }
4720
4721
4722 void
4723 llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
4724 {
4725 llvmpipe->pipe.create_fs_state = llvmpipe_create_fs_state;
4726 llvmpipe->pipe.bind_fs_state = llvmpipe_bind_fs_state;
4727 llvmpipe->pipe.delete_fs_state = llvmpipe_delete_fs_state;
4728 llvmpipe->pipe.set_constant_buffer = llvmpipe_set_constant_buffer;
4729 llvmpipe->pipe.set_shader_buffers = llvmpipe_set_shader_buffers;
4730 llvmpipe->pipe.set_shader_images = llvmpipe_set_shader_images;
4731 }
4732