1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 * SPDX-License-Identifier: MIT
5 *
6 * based in part on anv driver which is:
7 * Copyright © 2015 Intel Corporation
8 */
9
10 #include "tu_pipeline.h"
11
12 #include "common/freedreno_guardband.h"
13
14 #include "ir3/ir3_nir.h"
15 #include "main/menums.h"
16 #include "nir/nir.h"
17 #include "nir/nir_builder.h"
18 #include "spirv/nir_spirv.h"
19 #include "util/debug.h"
20 #include "util/mesa-sha1.h"
21 #include "vk_pipeline.h"
22 #include "vk_render_pass.h"
23 #include "vk_util.h"
24
25 #include "tu_cmd_buffer.h"
26 #include "tu_cs.h"
27 #include "tu_device.h"
28 #include "tu_formats.h"
29 #include "tu_lrz.h"
30 #include "tu_pass.h"
31
32 /* Emit IB that preloads the descriptors that the shader uses */
33
34 static void
35 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
36 enum a6xx_state_block sb, unsigned base, unsigned offset,
37 unsigned count)
38 {
39 /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
40 * clear if emitting more packets will even help anything. Presumably the
41 * descriptor cache is relatively small, and these packets stop doing
42 * anything when there are too many descriptors.
43 */
44 tu_cs_emit_pkt7(cs, opcode, 3);
45 tu_cs_emit(cs,
46 CP_LOAD_STATE6_0_STATE_TYPE(st) |
47 CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
48 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
49 CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
50 tu_cs_emit_qw(cs, offset | (base << 28));
51 }
52
53 static unsigned
54 tu6_load_state_size(struct tu_pipeline *pipeline,
55 struct tu_pipeline_layout *layout, bool compute)
56 {
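 /* Each emit_load_state() call emits a single CP_LOAD_STATE6 packet: one
  * pkt7 header plus 3 payload dwords, hence 4 dwords per packet.
  */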
57 const unsigned load_state_size = 4;
58 unsigned size = 0;
59 for (unsigned i = 0; i < layout->num_sets; i++) {
60 if (!(pipeline->active_desc_sets & (1u << i)))
61 continue;
62
63 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
64 for (unsigned j = 0; j < set_layout->binding_count; j++) {
65 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
66 unsigned count = 0;
67 /* Note: some users, like amber for example, pass in
68 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
69 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
70 */
71 VkShaderStageFlags stages = compute ?
72 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
73 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
74 unsigned stage_count = util_bitcount(stages);
75
76 if (!binding->array_size)
77 continue;
78
79 switch (binding->type) {
80 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
81 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
82 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
83 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
84 /* IBO-backed resources only need one packet for all graphics stages */
85 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
86 count += 1;
87 if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
88 count += 1;
89 break;
90 case VK_DESCRIPTOR_TYPE_SAMPLER:
91 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
92 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
93 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
94 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
95 /* Textures and UBOs need a packet for each stage */
96 count = stage_count;
97 break;
98 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
99 /* Because of how we pack combined images and samplers, we
100 * currently can't use one packet for the whole array.
101 */
102 count = stage_count * binding->array_size * 2;
103 break;
104 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
105 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
106 break;
107 default:
108 unreachable("bad descriptor type");
109 }
110 size += count * load_state_size;
111 }
112 }
113 return size;
114 }
115
116 static void
117 tu6_emit_load_state(struct tu_pipeline *pipeline,
118 struct tu_pipeline_layout *layout, bool compute)
119 {
120 unsigned size = tu6_load_state_size(pipeline, layout, compute);
121 if (size == 0)
122 return;
123
124 struct tu_cs cs;
125 tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
126
127 for (unsigned i = 0; i < layout->num_sets; i++) {
128 /* From 13.2.7. Descriptor Set Binding:
129 *
130 * A compatible descriptor set must be bound for all set numbers that
131 * any shaders in a pipeline access, at the time that a draw or
132 * dispatch command is recorded to execute using that pipeline.
133 * However, if none of the shaders in a pipeline statically use any
134 * bindings with a particular set number, then no descriptor set need
135 * be bound for that set number, even if the pipeline layout includes
136 * a non-trivial descriptor set layout for that set number.
137 *
138 * This means that descriptor sets unused by the pipeline may have a
139 * garbage or 0 BINDLESS_BASE register, which will cause context faults
140 * when prefetching descriptors from these sets. Skip prefetching for
141 * descriptors from them to avoid this. This is also an optimization,
142 * since these prefetches would be useless.
143 */
144 if (!(pipeline->active_desc_sets & (1u << i)))
145 continue;
146
147 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
148 for (unsigned j = 0; j < set_layout->binding_count; j++) {
149 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
150 unsigned base = i;
151 unsigned offset = binding->offset / 4;
152 /* Note: some users, like amber for example, pass in
153 * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
154 * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
155 */
156 VkShaderStageFlags stages = compute ?
157 binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
158 binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
159 unsigned count = binding->array_size;
160
161 /* If this is a variable-count descriptor, then the array_size is an
162 * upper bound on the size, but we don't know how many descriptors
163 * will actually be used. Therefore we can't pre-load them here.
164 */
165 if (j == set_layout->binding_count - 1 &&
166 set_layout->has_variable_descriptors)
167 continue;
168
169 if (count == 0 || stages == 0)
170 continue;
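 /* Note: dynamic UBO/SSBO descriptors live in a separate, driver-internal
  * descriptor set (MAX_SETS), so the *_DYNAMIC cases below redirect
  * base/offset into that set before falling through.
  */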
171 switch (binding->type) {
172 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
173 base = MAX_SETS;
174 offset = (layout->set[i].dynamic_offset_start +
175 binding->dynamic_offset_offset) / 4;
176 FALLTHROUGH;
177 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
178 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
179 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
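 /* binding->size is in bytes; mul is the number of
  * A6XX_TEX_CONST_DWORDS-sized descriptor slots per array element, since
  * storage descriptors may occupy more than one slot.
  */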
180 unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
181 /* IBO-backed resources only need one packet for all graphics stages */
182 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
183 emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
184 base, offset, count * mul);
185 }
186 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
187 emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
188 base, offset, count * mul);
189 }
190 break;
191 }
192 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
193 case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
194 /* nothing - input attachment doesn't use bindless */
195 break;
196 case VK_DESCRIPTOR_TYPE_SAMPLER:
197 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
198 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
199 tu_foreach_stage(stage, stages) {
200 emit_load_state(&cs, tu6_stage2opcode(stage),
201 binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
202 ST6_SHADER : ST6_CONSTANTS,
203 tu6_stage2texsb(stage), base, offset, count);
204 }
205 break;
206 }
207 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
208 base = MAX_SETS;
209 offset = (layout->set[i].dynamic_offset_start +
210 binding->dynamic_offset_offset) / 4;
211 FALLTHROUGH;
212 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
213 tu_foreach_stage(stage, stages) {
214 emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
215 tu6_stage2shadersb(stage), base, offset, count);
216 }
217 break;
218 }
219 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
220 tu_foreach_stage(stage, stages) {
221 /* TODO: We could emit less CP_LOAD_STATE6 if we used
222 * struct-of-arrays instead of array-of-structs.
223 */
224 for (unsigned i = 0; i < count; i++) {
225 unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
226 unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
227 emit_load_state(&cs, tu6_stage2opcode(stage),
228 ST6_CONSTANTS, tu6_stage2texsb(stage),
229 base, tex_offset, 1);
230 emit_load_state(&cs, tu6_stage2opcode(stage),
231 ST6_SHADER, tu6_stage2texsb(stage),
232 base, sam_offset, 1);
233 }
234 }
235 break;
236 }
237 default:
238 unreachable("bad descriptor type");
239 }
240 }
241 }
242
243 pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
244 }
245
246 struct tu_pipeline_builder
247 {
248 struct tu_device *device;
249 void *mem_ctx;
250 struct vk_pipeline_cache *cache;
251 struct tu_pipeline_layout *layout;
252 const VkAllocationCallbacks *alloc;
253 const VkGraphicsPipelineCreateInfo *create_info;
254
255 struct tu_compiled_shaders *shaders;
256 struct ir3_shader_variant *binning_variant;
257 uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1];
258 uint64_t binning_vs_iova;
259
260 uint32_t additional_cs_reserve_size;
261
262 struct tu_pvtmem_config pvtmem;
263
264 bool rasterizer_discard;
265 /* these states are affected by rasterizer_discard */
266 bool emit_msaa_state;
267 bool depth_clip_disable;
268 VkSampleCountFlagBits samples;
269 bool use_color_attachments;
270 bool use_dual_src_blend;
271 bool alpha_to_coverage;
272 uint32_t color_attachment_count;
273 VkFormat color_attachment_formats[MAX_RTS];
274 VkFormat depth_attachment_format;
275 uint32_t render_components;
276 uint32_t multiview_mask;
277
278 bool subpass_raster_order_attachment_access;
279 bool subpass_feedback_loop_color;
280 bool subpass_feedback_loop_ds;
281 };
282
283 static bool
284 tu_logic_op_reads_dst(VkLogicOp op)
285 {
286 switch (op) {
287 case VK_LOGIC_OP_CLEAR:
288 case VK_LOGIC_OP_COPY:
289 case VK_LOGIC_OP_COPY_INVERTED:
290 case VK_LOGIC_OP_SET:
291 return false;
292 default:
293 return true;
294 }
295 }
296
297 static VkBlendFactor
298 tu_blend_factor_no_dst_alpha(VkBlendFactor factor)
299 {
300 /* treat dst alpha as 1.0 and avoid reading it */
301 switch (factor) {
302 case VK_BLEND_FACTOR_DST_ALPHA:
303 return VK_BLEND_FACTOR_ONE;
304 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
305 return VK_BLEND_FACTOR_ZERO;
306 default:
307 return factor;
308 }
309 }
310
311 static bool tu_blend_factor_is_dual_src(VkBlendFactor factor)
312 {
313 switch (factor) {
314 case VK_BLEND_FACTOR_SRC1_COLOR:
315 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
316 case VK_BLEND_FACTOR_SRC1_ALPHA:
317 case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
318 return true;
319 default:
320 return false;
321 }
322 }
323
324 static bool
325 tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
326 {
327 if (!info)
328 return false;
329
330 for (unsigned i = 0; i < info->attachmentCount; i++) {
331 const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i];
332 if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) ||
333 tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) ||
334 tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) ||
335 tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor))
336 return true;
337 }
338
339 return false;
340 }
341
342 static const struct xs_config {
343 uint16_t reg_sp_xs_ctrl;
344 uint16_t reg_sp_xs_config;
345 uint16_t reg_sp_xs_instrlen;
346 uint16_t reg_hlsq_xs_ctrl;
347 uint16_t reg_sp_xs_first_exec_offset;
348 uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
349 } xs_config[] = {
350 [MESA_SHADER_VERTEX] = {
351 REG_A6XX_SP_VS_CTRL_REG0,
352 REG_A6XX_SP_VS_CONFIG,
353 REG_A6XX_SP_VS_INSTRLEN,
354 REG_A6XX_HLSQ_VS_CNTL,
355 REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
356 REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
357 },
358 [MESA_SHADER_TESS_CTRL] = {
359 REG_A6XX_SP_HS_CTRL_REG0,
360 REG_A6XX_SP_HS_CONFIG,
361 REG_A6XX_SP_HS_INSTRLEN,
362 REG_A6XX_HLSQ_HS_CNTL,
363 REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
364 REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
365 },
366 [MESA_SHADER_TESS_EVAL] = {
367 REG_A6XX_SP_DS_CTRL_REG0,
368 REG_A6XX_SP_DS_CONFIG,
369 REG_A6XX_SP_DS_INSTRLEN,
370 REG_A6XX_HLSQ_DS_CNTL,
371 REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
372 REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
373 },
374 [MESA_SHADER_GEOMETRY] = {
375 REG_A6XX_SP_GS_CTRL_REG0,
376 REG_A6XX_SP_GS_CONFIG,
377 REG_A6XX_SP_GS_INSTRLEN,
378 REG_A6XX_HLSQ_GS_CNTL,
379 REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
380 REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
381 },
382 [MESA_SHADER_FRAGMENT] = {
383 REG_A6XX_SP_FS_CTRL_REG0,
384 REG_A6XX_SP_FS_CONFIG,
385 REG_A6XX_SP_FS_INSTRLEN,
386 REG_A6XX_HLSQ_FS_CNTL,
387 REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
388 REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
389 },
390 [MESA_SHADER_COMPUTE] = {
391 REG_A6XX_SP_CS_CTRL_REG0,
392 REG_A6XX_SP_CS_CONFIG,
393 REG_A6XX_SP_CS_INSTRLEN,
394 REG_A6XX_HLSQ_CS_CNTL,
395 REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
396 REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
397 },
398 };
399
400 static uint32_t
401 tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
402 {
403 const struct ir3_const_state *const_state = ir3_const_state(xs);
404 uint32_t base = const_state->offsets.immediate;
405 int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);
406
407 /* truncate size to avoid writing constants that the shader
408 * does not use:
409 */
410 size = MIN2(size + base, xs->constlen) - base;
411
412 return MAX2(size, 0) * 4;
413 }
414
415 /* We allocate fixed-length substreams for shader state, however some
416 * parts of the state may have unbound length. Their additional space
417 * requirements should be calculated here.
418 */
419 static uint32_t
420 tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
421 {
422 const struct ir3_const_state *const_state = ir3_const_state(xs);
423
424 uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);
425
426 /* Variable number of UBO upload ranges. */
427 size += 4 * const_state->ubo_state.num_enabled;
428
429 /* Variable number of dwords for the primitive map */
430 size += xs->input_size;
431
432 size += xs->constant_data_size / 4;
433
434 return size;
435 }
436
437 void
438 tu6_emit_xs_config(struct tu_cs *cs,
439 gl_shader_stage stage, /* xs->type, but xs may be NULL */
440 const struct ir3_shader_variant *xs)
441 {
442 const struct xs_config *cfg = &xs_config[stage];
443
444 if (!xs) {
445 /* shader stage disabled */
446 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
447 tu_cs_emit(cs, 0);
448
449 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
450 tu_cs_emit(cs, 0);
451 return;
452 }
453
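 /* The SP_xS_CONFIG and HLSQ_xS_CNTL bitfields share the same layout for
  * every stage, so the VS variants of the pack macros are reused here with
  * the per-stage register offsets from xs_config[].
  */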
454 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
455 tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
456 COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
457 COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
458 COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
459 COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
460 A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
461 A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
462
463 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
464 tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
465 A6XX_HLSQ_VS_CNTL_ENABLED);
466 }
467
468 void
469 tu6_emit_xs(struct tu_cs *cs,
470 gl_shader_stage stage, /* xs->type, but xs may be NULL */
471 const struct ir3_shader_variant *xs,
472 const struct tu_pvtmem_config *pvtmem,
473 uint64_t binary_iova)
474 {
475 const struct xs_config *cfg = &xs_config[stage];
476
477 if (!xs) {
478 /* shader stage disabled */
479 return;
480 }
481
482 enum a6xx_threadsize thrsz =
483 xs->info.double_threadsize ? THREAD128 : THREAD64;
484 switch (stage) {
485 case MESA_SHADER_VERTEX:
486 tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
487 .fullregfootprint = xs->info.max_reg + 1,
488 .halfregfootprint = xs->info.max_half_reg + 1,
489 .branchstack = ir3_shader_branchstack_hw(xs),
490 .mergedregs = xs->mergedregs,
491 ));
492 break;
493 case MESA_SHADER_TESS_CTRL:
494 tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
495 .fullregfootprint = xs->info.max_reg + 1,
496 .halfregfootprint = xs->info.max_half_reg + 1,
497 .branchstack = ir3_shader_branchstack_hw(xs),
498 ));
499 break;
500 case MESA_SHADER_TESS_EVAL:
501 tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
502 .fullregfootprint = xs->info.max_reg + 1,
503 .halfregfootprint = xs->info.max_half_reg + 1,
504 .branchstack = ir3_shader_branchstack_hw(xs),
505 ));
506 break;
507 case MESA_SHADER_GEOMETRY:
508 tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
509 .fullregfootprint = xs->info.max_reg + 1,
510 .halfregfootprint = xs->info.max_half_reg + 1,
511 .branchstack = ir3_shader_branchstack_hw(xs),
512 ));
513 break;
514 case MESA_SHADER_FRAGMENT:
515 tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
516 .fullregfootprint = xs->info.max_reg + 1,
517 .halfregfootprint = xs->info.max_half_reg + 1,
518 .branchstack = ir3_shader_branchstack_hw(xs),
519 .mergedregs = xs->mergedregs,
520 .threadsize = thrsz,
521 .pixlodenable = xs->need_pixlod,
522 .diff_fine = xs->need_fine_derivatives,
523 .varying = xs->total_in != 0,
524 /* unknown bit, seems unnecessary */
525 .unk24 = true,
526 ));
527 break;
528 case MESA_SHADER_COMPUTE:
529 tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
530 .fullregfootprint = xs->info.max_reg + 1,
531 .halfregfootprint = xs->info.max_half_reg + 1,
532 .branchstack = ir3_shader_branchstack_hw(xs),
533 .mergedregs = xs->mergedregs,
534 .threadsize = thrsz,
535 ));
536 break;
537 default:
538 unreachable("bad shader stage");
539 }
540
541 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
542 tu_cs_emit(cs, xs->instrlen);
543
544 /* emit program binary & private memory layout
545 * binary_iova should be aligned to 1 instrlen unit (128 bytes)
546 */
547
548 assert((binary_iova & 0x7f) == 0);
549 assert((pvtmem->iova & 0x1f) == 0);
550
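 /* This 7-dword write covers SP_xS_OBJ_FIRST_EXEC_OFFSET, the 64-bit
  * program address, the private memory param, the 64-bit private memory
  * address and the private memory size, which are laid out consecutively
  * for each stage.
  */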
551 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
552 tu_cs_emit(cs, 0);
553 tu_cs_emit_qw(cs, binary_iova);
554 tu_cs_emit(cs,
555 A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
556 tu_cs_emit_qw(cs, pvtmem->iova);
557 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
558 COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));
559
560 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
561 tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));
562
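 /* Preload up to one instruction cache's worth of the program binary;
  * presumably this avoids a cold instruction cache when the shader first
  * starts executing.
  */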
563 uint32_t shader_preload_size =
564 MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);
565
566 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
567 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
568 CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
569 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
570 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
571 CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
572 tu_cs_emit_qw(cs, binary_iova);
573
574 /* emit immediates */
575
576 const struct ir3_const_state *const_state = ir3_const_state(xs);
577 uint32_t base = const_state->offsets.immediate;
578 unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
579
580 if (immediate_size > 0) {
581 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
582 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
583 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
584 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
585 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
586 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
587 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
588 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
589
590 tu_cs_emit_array(cs, const_state->immediates, immediate_size);
591 }
592
593 if (const_state->constant_data_ubo != -1) {
594 uint64_t iova = binary_iova + xs->info.constant_data_offset;
595
596 /* Upload UBO state for the constant data. */
597 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
598 tu_cs_emit(cs,
599 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
600 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
601 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
602 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
603 CP_LOAD_STATE6_0_NUM_UNIT(1));
604 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
605 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
606 int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
607 tu_cs_emit_qw(cs,
608 iova |
609 (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);
610
611 /* Upload the constant data to the const file if needed. */
612 const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
613
614 for (int i = 0; i < ubo_state->num_enabled; i++) {
615 if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
616 ubo_state->range[i].ubo.bindless) {
617 continue;
618 }
619
620 uint32_t start = ubo_state->range[i].start;
621 uint32_t end = ubo_state->range[i].end;
622 uint32_t size = MIN2(end - start,
623 (16 * xs->constlen) - ubo_state->range[i].offset);
624
625 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
626 tu_cs_emit(cs,
627 CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
628 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
629 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
630 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
631 CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
632 tu_cs_emit_qw(cs, iova + start);
633 }
634 }
635
636 /* emit FS driver param */
637 if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
638 uint32_t base = const_state->offsets.driver_param;
639 int32_t size = DIV_ROUND_UP(const_state->num_driver_params, 4);
640 size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
641
642 if (size > 0) {
643 tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
644 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
645 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
646 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
647 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
648 CP_LOAD_STATE6_0_NUM_UNIT(size));
649 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
650 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
651
652 assert(size == 1);
653 tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
654 tu_cs_emit(cs, 0);
655 tu_cs_emit(cs, 0);
656 tu_cs_emit(cs, 0);
657 }
658 }
659 }
660
661 static void
662 tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
663 {
664 /* Enable/disable shared constants */
665 tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
666 tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
667 .isammode = ISAMMODE_GL,
668 .shared_consts_enable = enable));
669 }
670
671 static void
672 tu6_emit_cs_config(struct tu_cs *cs,
673 const struct ir3_shader_variant *v,
674 const struct tu_pvtmem_config *pvtmem,
675 uint64_t binary_iova)
676 {
677 bool shared_consts_enable = ir3_const_state(v)->shared_consts_enable;
678 tu6_emit_shared_consts_enable(cs, shared_consts_enable);
679
680 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
681 .cs_state = true,
682 .cs_ibo = true,
683 .cs_shared_const = shared_consts_enable));
684
685 tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
686 tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);
687
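 /* The SHARED_SIZE field appears to be expressed in 1 KiB granules with a
  * minimum encoding of 1; the exact encoding isn't fully understood, hence
  * the UNKNOWN_A9B1 register name.
  */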
688 uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
689 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
690 tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
691 A6XX_SP_CS_UNKNOWN_A9B1_UNK6);
692
693 if (cs->device->physical_device->info->a6xx.has_lpac) {
694 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
695 tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
696 A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
697 }
698
699 uint32_t local_invocation_id =
700 ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
701 uint32_t work_group_id =
702 ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);
703
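 /* regid(63, 0) is the "invalid" register id; passing it for the
  * workgroup size/offset const ids effectively disables those inputs.
  */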
704 enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
705 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
706 tu_cs_emit(cs,
707 A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
708 A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
709 A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
710 A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
711 tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
712 A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));
713
714 if (cs->device->physical_device->info->a6xx.has_lpac) {
715 tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
716 tu_cs_emit(cs,
717 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
718 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
719 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
720 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
721 tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
722 A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
723 }
724 }
725
726 static void
727 tu6_emit_vs_system_values(struct tu_cs *cs,
728 const struct ir3_shader_variant *vs,
729 const struct ir3_shader_variant *hs,
730 const struct ir3_shader_variant *ds,
731 const struct ir3_shader_variant *gs,
732 bool primid_passthru)
733 {
734 const uint32_t vertexid_regid =
735 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
736 const uint32_t instanceid_regid =
737 ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
738 const uint32_t tess_coord_x_regid = hs ?
739 ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
740 regid(63, 0);
741 const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
742 tess_coord_x_regid + 1 :
743 regid(63, 0);
744 const uint32_t hs_rel_patch_regid = hs ?
745 ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
746 regid(63, 0);
747 const uint32_t ds_rel_patch_regid = hs ?
748 ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
749 regid(63, 0);
750 const uint32_t hs_invocation_regid = hs ?
751 ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
752 regid(63, 0);
753 const uint32_t gs_primitiveid_regid = gs ?
754 ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
755 regid(63, 0);
756 const uint32_t vs_primitiveid_regid = hs ?
757 ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
758 gs_primitiveid_regid;
759 const uint32_t ds_primitiveid_regid = ds ?
760 ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
761 regid(63, 0);
762 const uint32_t gsheader_regid = gs ?
763 ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
764 regid(63, 0);
765
766 /* Note: we currently don't support multiview with tess or GS. If we did,
767 * and the HW actually works, then we'd have to somehow share this across
768 * stages. Note that the blob doesn't support this either.
769 */
770 const uint32_t viewid_regid =
771 ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);
772
773 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
774 tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
775 A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
776 A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
777 A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
778 tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
779 A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
780 tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
781 A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
782 A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
783 A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
784 tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
785 tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
786 0xfc00); /* VFD_CONTROL_5 */
787 tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
788 }
789
790 static void
791 tu6_setup_streamout(struct tu_cs *cs,
792 const struct ir3_shader_variant *v,
793 struct ir3_shader_linkage *l)
794 {
795 const struct ir3_stream_output_info *info = &v->stream_output;
796 /* Note: 64 here comes from the HW layout of the program RAM. The program
797 * for stream N is at DWORD 64 * N.
798 */
799 #define A6XX_SO_PROG_DWORDS 64
800 uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
801 BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
802
803 /* TODO: streamout state should be in a non-GMEM draw state */
804
805 /* no streamout: */
806 if (info->num_outputs == 0) {
807 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
808 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
809 tu_cs_emit(cs, 0);
810 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
811 tu_cs_emit(cs, 0);
812 return;
813 }
814
815 for (unsigned i = 0; i < info->num_outputs; i++) {
816 const struct ir3_stream_output *out = &info->output[i];
817 unsigned k = out->register_index;
818 unsigned idx;
819
820 /* Skip it, if it's an output that was never assigned a register. */
821 if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
822 continue;
823
824 /* The linkage map is sorted in the order the fragment shader wants
825 * things, so this lookup is a bit less ideal here..
826 */
827 for (idx = 0; idx < l->cnt; idx++)
828 if (l->var[idx].slot == v->outputs[k].slot)
829 break;
830
831 assert(idx < l->cnt);
832
833 for (unsigned j = 0; j < out->num_components; j++) {
834 unsigned c = j + out->start_component;
835 unsigned loc = l->var[idx].loc + c;
836 unsigned off = j + out->dst_offset; /* in dwords */
837
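 /* Each VPC_SO_PROG dword holds two entries (A in the low half, B in the
  * high half), so two consecutive varying locations share one dword.
  */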
838 assert(loc < A6XX_SO_PROG_DWORDS * 2);
839 unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
840 if (loc & 1) {
841 prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
842 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
843 A6XX_VPC_SO_PROG_B_OFF(off * 4);
844 } else {
845 prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
846 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
847 A6XX_VPC_SO_PROG_A_OFF(off * 4);
848 }
849 BITSET_SET(valid_dwords, dword);
850 }
851 }
852
853 unsigned prog_count = 0;
854 unsigned start, end;
855 BITSET_FOREACH_RANGE(start, end, valid_dwords,
856 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
857 prog_count += end - start + 1;
858 }
859
860 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
861 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
862 tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
863 COND(info->stride[0] > 0,
864 A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
865 COND(info->stride[1] > 0,
866 A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
867 COND(info->stride[2] > 0,
868 A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
869 COND(info->stride[3] > 0,
870 A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
871 for (uint32_t i = 0; i < 4; i++) {
872 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
873 tu_cs_emit(cs, info->stride[i]);
874 }
875 bool first = true;
876 BITSET_FOREACH_RANGE(start, end, valid_dwords,
877 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
878 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
879 tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
880 A6XX_VPC_SO_CNTL_ADDR(start));
881 for (unsigned i = start; i < end; i++) {
882 tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
883 tu_cs_emit(cs, prog[i]);
884 }
885 first = false;
886 }
887 }
888
889 static void
890 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
891 enum a6xx_state_block block, uint32_t offset,
892 uint32_t size, const uint32_t *dwords) {
893 assert(size % 4 == 0);
894
895 tu_cs_emit_pkt7(cs, opcode, 3 + size);
896 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
897 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
898 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
899 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
900 CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
901
902 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
903 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
904 dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
905
906 tu_cs_emit_array(cs, dwords, size);
907 }
908
909 static void
910 tu6_emit_link_map(struct tu_cs *cs,
911 const struct ir3_shader_variant *producer,
912 const struct ir3_shader_variant *consumer,
913 enum a6xx_state_block sb)
914 {
915 const struct ir3_const_state *const_state = ir3_const_state(consumer);
916 uint32_t base = const_state->offsets.primitive_map;
917 int size = DIV_ROUND_UP(consumer->input_size, 4);
918
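 /* constlen is in units of vec4, so clamp the primitive map to the
  * consumer's const space and convert the result to dwords.
  */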
919 size = (MIN2(size + base, consumer->constlen) - base) * 4;
920 if (size <= 0)
921 return;
922
923 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
924 producer->output_loc);
925 }
926
927 static uint16_t
928 primitive_to_tess(enum shader_prim primitive) {
929 switch (primitive) {
930 case SHADER_PRIM_POINTS:
931 return TESS_POINTS;
932 case SHADER_PRIM_LINE_STRIP:
933 return TESS_LINES;
934 case SHADER_PRIM_TRIANGLE_STRIP:
935 return TESS_CW_TRIS;
936 default:
937 unreachable("");
938 }
939 }
940
941 void
942 tu6_emit_vpc(struct tu_cs *cs,
943 const struct ir3_shader_variant *vs,
944 const struct ir3_shader_variant *hs,
945 const struct ir3_shader_variant *ds,
946 const struct ir3_shader_variant *gs,
947 const struct ir3_shader_variant *fs,
948 uint32_t patch_control_points)
949 {
950 /* note: doesn't compile as static because of the array regs.. */
951 const struct reg_config {
952 uint16_t reg_sp_xs_out_reg;
953 uint16_t reg_sp_xs_vpc_dst_reg;
954 uint16_t reg_vpc_xs_pack;
955 uint16_t reg_vpc_xs_clip_cntl;
956 uint16_t reg_gras_xs_cl_cntl;
957 uint16_t reg_pc_xs_out_cntl;
958 uint16_t reg_sp_xs_primitive_cntl;
959 uint16_t reg_vpc_xs_layer_cntl;
960 uint16_t reg_gras_xs_layer_cntl;
961 } reg_config[] = {
962 [MESA_SHADER_VERTEX] = {
963 REG_A6XX_SP_VS_OUT_REG(0),
964 REG_A6XX_SP_VS_VPC_DST_REG(0),
965 REG_A6XX_VPC_VS_PACK,
966 REG_A6XX_VPC_VS_CLIP_CNTL,
967 REG_A6XX_GRAS_VS_CL_CNTL,
968 REG_A6XX_PC_VS_OUT_CNTL,
969 REG_A6XX_SP_VS_PRIMITIVE_CNTL,
970 REG_A6XX_VPC_VS_LAYER_CNTL,
971 REG_A6XX_GRAS_VS_LAYER_CNTL
972 },
973 [MESA_SHADER_TESS_CTRL] = {
974 0,
975 0,
976 0,
977 0,
978 0,
979 REG_A6XX_PC_HS_OUT_CNTL,
980 0,
981 0,
982 0
983 },
984 [MESA_SHADER_TESS_EVAL] = {
985 REG_A6XX_SP_DS_OUT_REG(0),
986 REG_A6XX_SP_DS_VPC_DST_REG(0),
987 REG_A6XX_VPC_DS_PACK,
988 REG_A6XX_VPC_DS_CLIP_CNTL,
989 REG_A6XX_GRAS_DS_CL_CNTL,
990 REG_A6XX_PC_DS_OUT_CNTL,
991 REG_A6XX_SP_DS_PRIMITIVE_CNTL,
992 REG_A6XX_VPC_DS_LAYER_CNTL,
993 REG_A6XX_GRAS_DS_LAYER_CNTL
994 },
995 [MESA_SHADER_GEOMETRY] = {
996 REG_A6XX_SP_GS_OUT_REG(0),
997 REG_A6XX_SP_GS_VPC_DST_REG(0),
998 REG_A6XX_VPC_GS_PACK,
999 REG_A6XX_VPC_GS_CLIP_CNTL,
1000 REG_A6XX_GRAS_GS_CL_CNTL,
1001 REG_A6XX_PC_GS_OUT_CNTL,
1002 REG_A6XX_SP_GS_PRIMITIVE_CNTL,
1003 REG_A6XX_VPC_GS_LAYER_CNTL,
1004 REG_A6XX_GRAS_GS_LAYER_CNTL
1005 },
1006 };
1007
1008 const struct ir3_shader_variant *last_shader;
1009 if (gs) {
1010 last_shader = gs;
1011 } else if (hs) {
1012 last_shader = ds;
1013 } else {
1014 last_shader = vs;
1015 }
1016
1017 const struct reg_config *cfg = &reg_config[last_shader->type];
1018
1019 struct ir3_shader_linkage linkage = {
1020 .primid_loc = 0xff,
1021 .clip0_loc = 0xff,
1022 .clip1_loc = 0xff,
1023 };
1024 if (fs)
1025 ir3_link_shaders(&linkage, last_shader, fs, true);
1026
1027 if (last_shader->stream_output.num_outputs)
1028 ir3_link_stream_out(&linkage, last_shader);
1029
1030 /* We do this after linking shaders in order to know whether PrimID
1031 * passthrough needs to be enabled.
1032 */
1033 bool primid_passthru = linkage.primid_loc != 0xff;
1034 tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);
1035
1036 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
1037 tu_cs_emit(cs, ~linkage.varmask[0]);
1038 tu_cs_emit(cs, ~linkage.varmask[1]);
1039 tu_cs_emit(cs, ~linkage.varmask[2]);
1040 tu_cs_emit(cs, ~linkage.varmask[3]);
1041
1042 /* a6xx finds position/pointsize at the end */
1043 const uint32_t pointsize_regid =
1044 ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
1045 const uint32_t layer_regid =
1046 ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
1047 const uint32_t view_regid =
1048 ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
1049 const uint32_t clip0_regid =
1050 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
1051 const uint32_t clip1_regid =
1052 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
1053 uint32_t flags_regid = gs ?
1054 ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
1055
1056 uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
1057
1058 if (layer_regid != regid(63, 0)) {
1059 layer_loc = linkage.max_loc;
1060 ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
1061 }
1062
1063 if (view_regid != regid(63, 0)) {
1064 view_loc = linkage.max_loc;
1065 ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
1066 }
1067
1068 unsigned extra_pos = 0;
1069
1070 for (unsigned i = 0; i < last_shader->outputs_count; i++) {
1071 if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
1072 continue;
1073
1074 if (position_loc == 0xff)
1075 position_loc = linkage.max_loc;
1076
1077 ir3_link_add(&linkage, last_shader->outputs[i].slot,
1078 last_shader->outputs[i].regid,
1079 0xf, position_loc + 4 * last_shader->outputs[i].view);
1080 extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
1081 }
1082
1083 if (pointsize_regid != regid(63, 0)) {
1084 pointsize_loc = linkage.max_loc;
1085 ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
1086 }
1087
1088 uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
1089
1090 /* Handle the case where clip/cull distances aren't read by the FS */
1091 uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
1092 if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
1093 clip0_loc = linkage.max_loc;
1094 ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
1095 clip_cull_mask & 0xf, linkage.max_loc);
1096 }
1097 if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
1098 clip1_loc = linkage.max_loc;
1099 ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
1100 clip_cull_mask >> 4, linkage.max_loc);
1101 }
1102
1103 tu6_setup_streamout(cs, last_shader, &linkage);
1104
1105 /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
1106 * at least when a DS is the last stage, so add a dummy output to keep it
1107 * happy if there aren't any. We do this late in order to avoid emitting
1108 * any unused code and make sure that optimizations don't remove it.
1109 */
1110 if (linkage.cnt == 0)
1111 ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
1112
1113 /* map outputs of the last shader to VPC */
1114 assert(linkage.cnt <= 32);
1115 const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
1116 const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
1117 uint32_t sp_out[16] = {0};
1118 uint32_t sp_vpc_dst[8] = {0};
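 /* Each SP_xS_OUT_REG packs two outputs (16 bits each) and each
  * SP_xS_VPC_DST_REG packs four output locations (8 bits each), which is
  * what the uint16_t/uint8_t casts below rely on.
  */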
1119 for (uint32_t i = 0; i < linkage.cnt; i++) {
1120 ((uint16_t *) sp_out)[i] =
1121 A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
1122 A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
1123 ((uint8_t *) sp_vpc_dst)[i] =
1124 A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
1125 }
1126
1127 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
1128 tu_cs_emit_array(cs, sp_out, sp_out_count);
1129
1130 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
1131 tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
1132
1133 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
1134 tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
1135 A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
1136 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
1137 A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
1138
1139 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
1140 tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1141 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1142 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1143
1144 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
1145 tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
1146 A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1147
1148 const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1149
1150 for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1151 const struct ir3_shader_variant *shader = geom_shaders[i];
1152 if (!shader)
1153 continue;
1154
1155 bool primid = shader->type != MESA_SHADER_VERTEX &&
1156 VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1157
1158 tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1159 if (shader == last_shader) {
1160 tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1161 CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1162 CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1163 CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1164 COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1165 A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
1166 } else {
1167 tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1168 }
1169 }
1170
1171 /* if vertex_flags somehow gets optimized out, you're gonna have a bad time: */
1172 if (gs)
1173 assert(flags_regid != INVALID_REG);
1174
1175 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1176 tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1177 A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1178
1179 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1180 tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1181 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));
1182
1183 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1184 tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1185 CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1186
1187 tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));
1188
1189 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
1190 tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
1191 COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
1192 A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
1193 A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));
1194
1195 if (hs) {
1196 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
1197 tu_cs_emit(cs, hs->tess.tcs_vertices_out);
1198
1199 /* Total attribute slots in HS incoming patch. */
1200 tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1201 tu_cs_emit(cs, patch_control_points * vs->output_size / 4);
1202
1203 const uint32_t wavesize = 64;
1204 const uint32_t max_wave_input_size = 64;
1205
1206 /* note: if HS is really just the VS extended, then this
1207 * should instead use MAX2(patch_control_points, hs->tess.tcs_vertices_out);
1208 * however that doesn't match the blob, and fails some dEQP tests.
1209 */
1210 uint32_t prims_per_wave = wavesize / hs->tess.tcs_vertices_out;
1211 uint32_t max_prims_per_wave =
1212 max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
1213 prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);
1214
1215 uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
1216 uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);
1217
1218 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1219 tu_cs_emit(cs, wave_input_size);
1220
1221 /* In SPIR-V generated from GLSL, the tessellation primitive params
1222 * are specified in the tess eval shader, but in SPIR-V generated from
1223 * HLSL, they are specified in the tess control shader. */
1224 const struct ir3_shader_variant *tess =
1225 ds->tess.spacing == TESS_SPACING_UNSPECIFIED ? hs : ds;
1226 tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
1227 uint32_t output;
1228 if (tess->tess.point_mode)
1229 output = TESS_POINTS;
1230 else if (tess->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES)
1231 output = TESS_LINES;
1232 else if (tess->tess.ccw)
1233 output = TESS_CCW_TRIS;
1234 else
1235 output = TESS_CW_TRIS;
1236
1237 enum a6xx_tess_spacing spacing;
1238 switch (tess->tess.spacing) {
1239 case TESS_SPACING_EQUAL:
1240 spacing = TESS_EQUAL;
1241 break;
1242 case TESS_SPACING_FRACTIONAL_ODD:
1243 spacing = TESS_FRACTIONAL_ODD;
1244 break;
1245 case TESS_SPACING_FRACTIONAL_EVEN:
1246 spacing = TESS_FRACTIONAL_EVEN;
1247 break;
1248 case TESS_SPACING_UNSPECIFIED:
1249 default:
1250 unreachable("invalid tess spacing");
1251 }
1252 tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
1253 A6XX_PC_TESS_CNTL_OUTPUT(output));
1254
1255 tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1256 tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1257 }
1258
1259
1260 if (gs) {
1261 uint32_t vertices_out, invocations, output, vec4_size;
1262 uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1263
1264 if (hs) {
1265 tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1266 } else {
1267 tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1268 }
1269 vertices_out = gs->gs.vertices_out - 1;
1270 output = primitive_to_tess(gs->gs.output_primitive);
1271 invocations = gs->gs.invocations - 1;
1272 /* Size of per-primitive allocation in ldlw memory in vec4s. */
1273 vec4_size = gs->gs.vertices_in *
1274 DIV_ROUND_UP(prev_stage_output_size, 4);
1275
1276 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
1277 tu_cs_emit(cs,
1278 A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
1279 A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
1280 A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));
1281
1282 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
1283 tu_cs_emit(cs, 0xff);
1284
1285 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1286 tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1287
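 /* SP_GS_PRIM_SIZE apparently can't encode exactly 64: larger sizes are
  * clamped to 64 and 64 itself is dropped to 63.
  */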
1288 uint32_t prim_size = prev_stage_output_size;
1289 if (prim_size > 64)
1290 prim_size = 64;
1291 else if (prim_size == 64)
1292 prim_size = 63;
1293 tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1294 tu_cs_emit(cs, prim_size);
1295 }
1296 }
1297
1298 static int
1299 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
1300 uint32_t index,
1301 uint8_t *interp_mode,
1302 uint8_t *ps_repl_mode)
1303 {
1304 enum
1305 {
1306 INTERP_SMOOTH = 0,
1307 INTERP_FLAT = 1,
1308 INTERP_ZERO = 2,
1309 INTERP_ONE = 3,
1310 };
1311 enum
1312 {
1313 PS_REPL_NONE = 0,
1314 PS_REPL_S = 1,
1315 PS_REPL_T = 2,
1316 PS_REPL_ONE_MINUS_T = 3,
1317 };
1318
1319 const uint32_t compmask = fs->inputs[index].compmask;
1320
1321 /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
1322 * fourth component occupy three consecutive varying slots
1323 */
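 /* Each enabled component consumes two bits of interp/ps_repl mode; the
  * return value is the total number of mode bits used by this input.
  */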
1324 int shift = 0;
1325 *interp_mode = 0;
1326 *ps_repl_mode = 0;
1327 if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
1328 if (compmask & 0x1) {
1329 *ps_repl_mode |= PS_REPL_S << shift;
1330 shift += 2;
1331 }
1332 if (compmask & 0x2) {
1333 *ps_repl_mode |= PS_REPL_T << shift;
1334 shift += 2;
1335 }
1336 if (compmask & 0x4) {
1337 *interp_mode |= INTERP_ZERO << shift;
1338 shift += 2;
1339 }
1340 if (compmask & 0x8) {
1341 *interp_mode |= INTERP_ONE << 6;
1342 shift += 2;
1343 }
1344 } else if (fs->inputs[index].flat) {
1345 for (int i = 0; i < 4; i++) {
1346 if (compmask & (1 << i)) {
1347 *interp_mode |= INTERP_FLAT << shift;
1348 shift += 2;
1349 }
1350 }
1351 }
1352
1353 return shift;
1354 }
1355
1356 static void
1357 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
1358 const struct ir3_shader_variant *fs)
1359 {
1360 uint32_t interp_modes[8] = { 0 };
1361 uint32_t ps_repl_modes[8] = { 0 };
1362
1363 if (fs) {
1364 for (int i = -1;
1365 (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
1366
1367 /* get the mode for input i */
1368 uint8_t interp_mode;
1369 uint8_t ps_repl_mode;
1370 const int bits =
1371 tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);
1372
1373 /* OR the mode into the array */
1374 const uint32_t inloc = fs->inputs[i].inloc * 2;
1375 uint32_t n = inloc / 32;
1376 uint32_t shift = inloc % 32;
1377 interp_modes[n] |= interp_mode << shift;
1378 ps_repl_modes[n] |= ps_repl_mode << shift;
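 /* Modes are packed two bits per varying component across eight 32-bit
  * registers; an input whose modes straddle a register boundary spills its
  * upper bits into the next register, handled below.
  */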
1379 if (shift + bits > 32) {
1380 n++;
1381 shift = 32 - shift;
1382
1383 interp_modes[n] |= interp_mode >> shift;
1384 ps_repl_modes[n] |= ps_repl_mode >> shift;
1385 }
1386 }
1387 }
1388
1389 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
1390 tu_cs_emit_array(cs, interp_modes, 8);
1391
1392 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
1393 tu_cs_emit_array(cs, ps_repl_modes, 8);
1394 }
1395
1396 void
1397 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1398 {
1399 uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1400 uint32_t ij_regid[IJ_COUNT];
1401 uint32_t smask_in_regid;
1402
1403 bool sample_shading = fs->per_samp | fs->key.sample_shading;
1404 bool enable_varyings = fs->total_in > 0;
1405
1406 samp_id_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1407 smask_in_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1408 face_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1409 coord_regid = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
1410 zwcoord_regid = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
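 /* gl_FragCoord takes four consecutive registers, so zw starts two
  * registers after xy. The barycentric i,j sysvals are likewise laid out
  * contiguously, which the ij_regid[] loop below relies on.
  */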
1411 for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1412 ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1413
1414 if (fs->num_sampler_prefetch > 0) {
1415 assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
1416 /* also, it seems like ij_pix is *required* to be r0.x */
1417 assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1418 }
1419
1420 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1421 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1422 A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
1423 0x7000); // XXX
1424 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1425 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1426 tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
1427 A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
1428 A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
1429 A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
1430 A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
1431 COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
1432 A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
1433 }
1434
1435 if (fs->num_sampler_prefetch > 0) {
1436 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1437 for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1438 const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1439 tu_cs_emit(cs,
1440 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1441 A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1442 }
1443 }
1444
1445 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
1446 tu_cs_emit(cs, 0x7);
1447 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
1448 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
1449 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
1450 A6XX_HLSQ_CONTROL_2_REG_CENTERRHW(ij_regid[IJ_PERSP_CENTER_RHW]));
1451 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
1452 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
1453 A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
1454 A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
1455 tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
1456 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
1457 A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
1458 A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
1459 tu_cs_emit(cs, 0xfcfc);
1460
1461 enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1462 tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1463 tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) |
1464 COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS));
1465
1466 bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1467 bool need_size_persamp = false;
1468 if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1469 if (sample_shading)
1470 need_size_persamp = true;
1471 else
1472 need_size = true;
1473 }
1474
1475 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1476 tu_cs_emit(cs,
1477 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1478 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1479 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1480 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1481 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1482 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1483 COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1484 COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1485 COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1486
1487 tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1488 tu_cs_emit(cs,
1489 CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1490 CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1491 CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1492 CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1493 CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1494 CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1495 COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1496 COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1497 COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1498 COND(fs->fragcoord_compmask != 0,
1499 A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1500 tu_cs_emit(cs,
1501 A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1502 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1503 CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1504 CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1505 CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1506 COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1507
1508 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1509 tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1510
1511 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1512 tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1513 A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1514 sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1515
1516 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1517 tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1518 }
1519
1520 static void
1521 tu6_emit_fs_outputs(struct tu_cs *cs,
1522 const struct ir3_shader_variant *fs,
1523 uint32_t mrt_count, bool dual_src_blend,
1524 uint32_t render_components,
1525 bool no_earlyz,
1526 struct tu_pipeline *pipeline)
1527 {
1528 uint32_t smask_regid, posz_regid, stencilref_regid;
1529
1530 posz_regid = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1531 smask_regid = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1532 stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1533
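/* color0_mrt means the shader writes a single gl_FragColor-style output that is replicated to every MRT; otherwise each MRT has its own FRAG_RESULT_DATAn output. */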
1534 uint32_t fragdata_regid[8];
1535 if (fs->color0_mrt) {
1536 fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
1537 for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++)
1538 fragdata_regid[i] = fragdata_regid[0];
1539 } else {
1540 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++)
1541 fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1542 }
1543
1544 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1545 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1546 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1547 A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1548 COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1549 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1550
1551 uint32_t fs_render_components = 0;
1552
1553 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
1554 for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1555 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1556 (COND(fragdata_regid[i] & HALF_REG_ID,
1557 A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1558
1559 if (VALIDREG(fragdata_regid[i])) {
1560 fs_render_components |= 0xf << (i * 4);
1561 }
1562 }
1563
1564 /* dual source blending has an extra fs output in the 2nd slot */
1565 if (dual_src_blend) {
1566 fs_render_components |= 0xf << 4;
1567 }
1568
1569    /* There is no point in enabling a component that isn't written by the
1570     * shader. Per the Vulkan spec this is UB, however a few apps depend on
1571     * the attachment not being changed when the FS has no matching output.
1572     */
1573 fs_render_components &= render_components;
1574
1575 tu_cs_emit_regs(cs,
1576 A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1577
1578 tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
1579 tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1580 COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1581 COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1582 COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1583 tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));
1584
1585 tu_cs_emit_regs(cs,
1586 A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1587
1588 if (pipeline) {
1589 pipeline->lrz.fs_has_kill = fs->has_kill;
1590 pipeline->lrz.early_fragment_tests = fs->fs.early_fragment_tests;
1591
1592 if (!fs->fs.early_fragment_tests &&
1593 (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
1594 pipeline->lrz.force_late_z = true;
1595 }
1596 }
1597 }
1598
1599 static void
1600 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1601 const struct ir3_shader_variant *vs,
1602 const struct ir3_shader_variant *hs,
1603 const struct ir3_shader_variant *ds,
1604 const struct ir3_shader_variant *gs,
1605 uint32_t cps_per_patch)
1606 {
1607 struct tu_device *dev = cs->device;
1608
1609 uint32_t num_vertices =
1610 hs ? cps_per_patch : gs->gs.vertices_in;
1611
1612 uint32_t vs_params[4] = {
1613 vs->output_size * num_vertices * 4, /* vs primitive stride */
1614 vs->output_size * 4, /* vs vertex stride */
1615 0,
1616 0,
1617 };
1618 uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
1619 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
1620 ARRAY_SIZE(vs_params), vs_params);
1621
1622 if (hs) {
1623 assert(ds->type != MESA_SHADER_NONE);
1624
1625 /* Create the shared tess factor BO the first time tess is used on the device. */
1626 mtx_lock(&dev->mutex);
1627 if (!dev->tess_bo)
1628 tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS);
1629 mtx_unlock(&dev->mutex);
1630
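/* Layout of the shared BO: tess factors first, then per-patch params. */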
1631 uint64_t tess_factor_iova = dev->tess_bo->iova;
1632 uint64_t tess_param_iova = tess_factor_iova + TU_TESS_FACTOR_SIZE;
1633
1634 uint32_t hs_params[8] = {
1635 vs->output_size * num_vertices * 4, /* hs primitive stride */
1636 vs->output_size * 4, /* hs vertex stride */
1637 hs->output_size,
1638 cps_per_patch,
1639 tess_param_iova,
1640 tess_param_iova >> 32,
1641 tess_factor_iova,
1642 tess_factor_iova >> 32,
1643 };
1644
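/* Upload only as many param dwords as fit in the (possibly trimmed) constlen. */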
1645 uint32_t hs_base = hs->const_state->offsets.primitive_param;
1646 uint32_t hs_param_dwords = MIN2((hs->constlen - hs_base) * 4, ARRAY_SIZE(hs_params));
1647 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
1648 hs_param_dwords, hs_params);
1649 if (gs)
1650 num_vertices = gs->gs.vertices_in;
1651
1652 uint32_t ds_params[8] = {
1653 ds->output_size * num_vertices * 4, /* ds primitive stride */
1654 ds->output_size * 4, /* ds vertex stride */
1655 hs->output_size, /* hs vertex stride (dwords) */
1656 hs->tess.tcs_vertices_out,
1657 tess_param_iova,
1658 tess_param_iova >> 32,
1659 tess_factor_iova,
1660 tess_factor_iova >> 32,
1661 };
1662
1663 uint32_t ds_base = ds->const_state->offsets.primitive_param;
1664 uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params));
1665 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
1666 ds_param_dwords, ds_params);
1667 }
1668
1669 if (gs) {
1670 const struct ir3_shader_variant *prev = ds ? ds : vs;
1671 uint32_t gs_params[4] = {
1672 prev->output_size * num_vertices * 4, /* gs primitive stride */
1673 prev->output_size * 4, /* gs vertex stride */
1674 0,
1675 0,
1676 };
1677 uint32_t gs_base = gs->const_state->offsets.primitive_param;
1678 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
1679 ARRAY_SIZE(gs_params), gs_params);
1680 }
1681 }
1682
1683 static void
1684 tu6_emit_program_config(struct tu_cs *cs,
1685 struct tu_pipeline_builder *builder)
1686 {
1687 gl_shader_stage stage = MESA_SHADER_VERTEX;
1688
1689 STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1690
1691 bool shared_consts_enable = tu6_shared_constants_enable(builder->layout,
1692 builder->device->compiler);
1693 tu6_emit_shared_consts_enable(cs, shared_consts_enable);
1694
1695 tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
1696 .vs_state = true,
1697 .hs_state = true,
1698 .ds_state = true,
1699 .gs_state = true,
1700 .fs_state = true,
1701 .gfx_ibo = true,
1702 .gfx_shared_const = shared_consts_enable));
1703 for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) {
1704 tu6_emit_xs_config(cs, stage, builder->shaders->variants[stage]);
1705 }
1706 }
1707
1708 static void
1709 tu6_emit_program(struct tu_cs *cs,
1710 struct tu_pipeline_builder *builder,
1711 bool binning_pass,
1712 struct tu_pipeline *pipeline)
1713 {
1714 const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX];
1715 const struct ir3_shader_variant *bs = builder->binning_variant;
1716 const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL];
1717 const struct ir3_shader_variant *ds = builder->shaders->variants[MESA_SHADER_TESS_EVAL];
1718 const struct ir3_shader_variant *gs = builder->shaders->variants[MESA_SHADER_GEOMETRY];
1719 const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT];
1720 gl_shader_stage stage = MESA_SHADER_VERTEX;
1721 uint32_t cps_per_patch = builder->create_info->pTessellationState ?
1722 builder->create_info->pTessellationState->patchControlPoints : 0;
1723 bool multi_pos_output = builder->shaders->multi_pos_output;
1724
1725 /* Don't use the binning pass variant when GS is present because we don't
1726 * support compiling correct binning pass variants with GS.
1727 */
1728 if (binning_pass && !gs) {
1729 vs = bs;
1730 tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
1731 stage++;
1732 }
1733
1734 for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) {
1735 const struct ir3_shader_variant *xs = builder->shaders->variants[stage];
1736
1737 if (stage == MESA_SHADER_FRAGMENT && binning_pass)
1738 fs = xs = NULL;
1739
1740 tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
1741 }
1742
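/* VIEWS is the highest view index in the mask plus one; DISABLEMULTIPOS is set when the VS does not write per-view positions. */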
1743 uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
1744 uint32_t multiview_cntl = builder->multiview_mask ?
1745 A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1746 A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1747 COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1748 : 0;
1749
1750 /* Copy what the blob does here. This will emit an extra 0x3f
1751 * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1752 * this is working around yet.
1753 */
1754 if (builder->device->physical_device->info->a6xx.has_cp_reg_write) {
1755 tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1756 tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1757 tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1758 } else {
1759 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1760 }
1761 tu_cs_emit(cs, multiview_cntl);
1762
1763 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1764 tu_cs_emit(cs, multiview_cntl);
1765
1766 if (multiview_cntl &&
1767 builder->device->physical_device->info->a6xx.supports_multiview_mask) {
1768 tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1769 tu_cs_emit(cs, builder->multiview_mask);
1770 }
1771
1772 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1773 tu_cs_emit(cs, 0);
1774
1775 tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch);
1776 tu6_emit_vpc_varying_modes(cs, fs);
1777
1778 bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT;
1779 uint32_t mrt_count = builder->color_attachment_count;
1780 uint32_t render_components = builder->render_components;
1781
1782 if (builder->alpha_to_coverage) {
1783 /* alpha to coverage can behave like a discard */
1784 no_earlyz = true;
1785 /* alpha value comes from first mrt */
1786 render_components |= 0xf;
1787 if (!mrt_count) {
1788 mrt_count = 1;
1789 /* Disable memory write for dummy mrt because it doesn't get set otherwise */
1790 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0));
1791 }
1792 }
1793
1794 if (fs) {
1795 tu6_emit_fs_inputs(cs, fs);
1796 tu6_emit_fs_outputs(cs, fs, mrt_count,
1797 builder->use_dual_src_blend,
1798 render_components,
1799 no_earlyz,
1800 pipeline);
1801 } else {
1802 /* TODO: check if these can be skipped if fs is disabled */
1803 struct ir3_shader_variant dummy_variant = {};
1804 tu6_emit_fs_inputs(cs, &dummy_variant);
1805 tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count,
1806 builder->use_dual_src_blend,
1807 render_components,
1808 no_earlyz,
1809 NULL);
1810 }
1811
1812 if (gs || hs) {
1813 tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
1814 }
1815 }
1816
1817 #define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 5 + 4)
1818
1819 static void
1820 tu6_emit_vertex_input(struct tu_pipeline *pipeline,
1821 struct tu_draw_state *vi_state,
1822 const struct ir3_shader_variant *vs,
1823 const VkPipelineVertexInputStateCreateInfo *info)
1824 {
1825 uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
1826 uint32_t step_rate[MAX_VBS];
1827
1828 struct tu_cs cs;
1829 tu_cs_begin_sub_stream(&pipeline->cs,
1830 TU6_EMIT_VERTEX_INPUT_MAX_DWORDS, &cs);
1831
1832 for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) {
1833 const VkVertexInputBindingDescription *binding =
1834 &info->pVertexBindingDescriptions[i];
1835
1836 if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
1837 tu_cs_emit_regs(&cs,
1838 A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
1839 }
1840
1841 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1842 binding_instanced |= 1 << binding->binding;
1843
1844 step_rate[binding->binding] = 1;
1845 }
1846
1847 const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
1848 vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
1849 if (div_state) {
1850 for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
1851 const VkVertexInputBindingDivisorDescriptionEXT *desc =
1852 &div_state->pVertexBindingDivisors[i];
1853 step_rate[desc->binding] = desc->divisor;
1854 }
1855 }
1856
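/* Map each API attribute to the VS input that consumes it; attributes without a matching VS input are skipped when emitting VFD state below. */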
1857 int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1858 uint32_t used_attrs_count = 0;
1859
1860 for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
1861 input_for_attr[attr_idx] = -1;
1862 for (uint32_t input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
1863 if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) ==
1864 info->pVertexAttributeDescriptions[attr_idx].location) {
1865 input_for_attr[attr_idx] = input_idx;
1866 used_attrs_count++;
1867 break;
1868 }
1869 }
1870 }
1871
1872 if (used_attrs_count)
1873 tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DECODE_INSTR(0), used_attrs_count * 2);
1874
1875 for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
1876 const VkVertexInputAttributeDescription *attr =
1877 &info->pVertexAttributeDescriptions[attr_idx];
1878
1879 if (input_for_attr[attr_idx] == -1)
1880 continue;
1881
1882 const struct tu_native_format format = tu6_format_vtx(attr->format);
1883 tu_cs_emit(&cs, A6XX_VFD_DECODE_INSTR(0,
1884 .idx = attr->binding,
1885 .offset = attr->offset,
1886 .instanced = binding_instanced & (1 << attr->binding),
1887 .format = format.fmt,
1888 .swap = format.swap,
1889 .unk30 = 1,
1890 ._float = !vk_format_is_int(attr->format)).value);
1891 tu_cs_emit(&cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value);
1892 }
1893
1894 if (used_attrs_count)
1895 tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), used_attrs_count);
1896
1897 for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
1898 int32_t input_idx = input_for_attr[attr_idx];
1899 if (input_idx == -1)
1900 continue;
1901
1902 tu_cs_emit(&cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1903 .writemask = vs->inputs[input_idx].compmask,
1904 .regid = vs->inputs[input_idx].regid).value);
1905 }
1906
1907 tu_cs_emit_regs(&cs,
1908 A6XX_VFD_CONTROL_0(
1909 .fetch_cnt = used_attrs_count, /* decode_cnt for binning pass ? */
1910 .decode_cnt = used_attrs_count));
1911
1912 *vi_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
1913 }
1914
1915 void
1916 tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport,
1917 bool z_negative_one_to_one)
1918 {
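/* Start with the maximum guardband and shrink it to the most restrictive value across all viewports; one shared guardband register is emitted below. */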
1919 VkExtent2D guardband = {511, 511};
1920
1921 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
1922 for (uint32_t i = 0; i < num_viewport; i++) {
1923 const VkViewport *viewport = &viewports[i];
1924 float offsets[3];
1925 float scales[3];
1926 scales[0] = viewport->width / 2.0f;
1927 scales[1] = viewport->height / 2.0f;
1928 if (z_negative_one_to_one) {
1929 scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
1930 } else {
1931 scales[2] = viewport->maxDepth - viewport->minDepth;
1932 }
1933
1934 offsets[0] = viewport->x + scales[0];
1935 offsets[1] = viewport->y + scales[1];
1936 if (z_negative_one_to_one) {
1937 offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
1938 } else {
1939 offsets[2] = viewport->minDepth;
1940 }
1941
1942 for (uint32_t j = 0; j < 3; j++) {
1943 tu_cs_emit(cs, fui(offsets[j]));
1944 tu_cs_emit(cs, fui(scales[j]));
1945 }
1946
1947 guardband.width =
1948 MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
1949 guardband.height =
1950 MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
1951 }
1952
1953 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
1954 for (uint32_t i = 0; i < num_viewport; i++) {
1955 const VkViewport *viewport = &viewports[i];
1956 VkOffset2D min;
1957 VkOffset2D max;
1958 min.x = (int32_t) viewport->x;
1959 max.x = (int32_t) ceilf(viewport->x + viewport->width);
1960 if (viewport->height >= 0.0f) {
1961 min.y = (int32_t) viewport->y;
1962 max.y = (int32_t) ceilf(viewport->y + viewport->height);
1963 } else {
1964 min.y = (int32_t)(viewport->y + viewport->height);
1965 max.y = (int32_t) ceilf(viewport->y);
1966 }
1967 /* the spec allows viewport->height to be 0.0f */
1968 if (min.y == max.y)
1969 max.y++;
1970 /* allow viewport->width = 0.0f for un-initialized viewports: */
1971 if (min.x == max.x)
1972 max.x++;
1973
1974 min.x = MAX2(min.x, 0);
1975 min.y = MAX2(min.y, 0);
1976 max.x = MAX2(max.x, 1);
1977 max.y = MAX2(max.y, 1);
1978
1979 assert(min.x < max.x);
1980 assert(min.y < max.y);
1981
1982 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
1983 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
1984 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
1985 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
1986 }
1987
1988 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2);
1989 for (uint32_t i = 0; i < num_viewport; i++) {
1990 const VkViewport *viewport = &viewports[i];
1991 tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
1992 tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
1993 }
1994 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
1995 tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
1996 A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
1997
1998 /* TODO: what to do about this with multiple viewports? */
1999 float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
2000 float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
2001
2002 tu_cs_emit_regs(cs,
2003 A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
2004 A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
2005 }
2006
2007 void
2008 tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count)
2009 {
2010 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2);
2011
2012 for (uint32_t i = 0; i < scissor_count; i++) {
2013 const VkRect2D *scissor = &scissors[i];
2014
2015 uint32_t min_x = scissor->offset.x;
2016 uint32_t min_y = scissor->offset.y;
2017 uint32_t max_x = min_x + scissor->extent.width - 1;
2018 uint32_t max_y = min_y + scissor->extent.height - 1;
2019
2020 if (!scissor->extent.width || !scissor->extent.height) {
2021 min_x = min_y = 1;
2022 max_x = max_y = 0;
2023 } else {
2024 /* avoid overflow */
2025 uint32_t scissor_max = BITFIELD_MASK(15);
2026 min_x = MIN2(scissor_max, min_x);
2027 min_y = MIN2(scissor_max, min_y);
2028 max_x = MIN2(scissor_max, max_x);
2029 max_y = MIN2(scissor_max, max_y);
2030 }
2031
2032 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
2033 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
2034 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
2035 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
2036 }
2037 }
2038
2039 void
2040 tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc)
2041 {
2042 if (!samp_loc) {
2043 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
2044 tu_cs_emit(cs, 0);
2045
2046 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
2047 tu_cs_emit(cs, 0);
2048
2049 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
2050 tu_cs_emit(cs, 0);
2051 return;
2052 }
2053
2054 assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount);
2055 assert(samp_loc->sampleLocationGridSize.width == 1);
2056 assert(samp_loc->sampleLocationGridSize.height == 1);
2057
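/* Pack all sample positions into a single dword (8 bits per sample) and mirror the same config into the GRAS, RB, and SP_TP copies of the registers. */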
2058 uint32_t sample_config =
2059 A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE;
2060 uint32_t sample_locations = 0;
2061 for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) {
2062 sample_locations |=
2063 (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) |
2064 A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8;
2065 }
2066
2067 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2);
2068 tu_cs_emit(cs, sample_config);
2069 tu_cs_emit(cs, sample_locations);
2070
2071 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2);
2072 tu_cs_emit(cs, sample_config);
2073 tu_cs_emit(cs, sample_locations);
2074
2075 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2);
2076 tu_cs_emit(cs, sample_config);
2077 tu_cs_emit(cs, sample_locations);
2078 }
2079
2080 static uint32_t
2081 tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
2082 enum a5xx_line_mode line_mode,
2083 bool multiview)
2084 {
2085 uint32_t gras_su_cntl = 0;
2086
2087 if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT)
2088 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
2089 if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT)
2090 gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
2091
2092 if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE)
2093 gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
2094
2095 gras_su_cntl |=
2096 A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f);
2097
2098 if (rast_info->depthBiasEnable)
2099 gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
2100
2101 gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode);
2102
2103 if (multiview) {
2104 gras_su_cntl |=
2105 A6XX_GRAS_SU_CNTL_UNK17 |
2106 A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE;
2107 }
2108
2109 return gras_su_cntl;
2110 }
2111
2112 void
2113 tu6_emit_depth_bias(struct tu_cs *cs,
2114 float constant_factor,
2115 float clamp,
2116 float slope_factor)
2117 {
2118 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2119 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value);
2120 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value);
2121 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value);
2122 }
2123
2124 static uint32_t
2125 tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att,
2126 bool has_alpha)
2127 {
2128 const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp);
2129 const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor(
2130 has_alpha ? att->srcColorBlendFactor
2131 : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor));
2132 const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor(
2133 has_alpha ? att->dstColorBlendFactor
2134 : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor));
2135 const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp);
2136 const enum adreno_rb_blend_factor src_alpha_factor =
2137 tu6_blend_factor(att->srcAlphaBlendFactor);
2138 const enum adreno_rb_blend_factor dst_alpha_factor =
2139 tu6_blend_factor(att->dstAlphaBlendFactor);
2140
2141 return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) |
2142 A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) |
2143 A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) |
2144 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) |
2145 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) |
2146 A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor);
2147 }
2148
2149 static uint32_t
2150 tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
2151 uint32_t rb_mrt_control_rop,
2152 bool has_alpha)
2153 {
2154 uint32_t rb_mrt_control =
2155 A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);
2156
2157 rb_mrt_control |= rb_mrt_control_rop;
2158
2159 if (att->blendEnable) {
2160 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND;
2161
2162 if (has_alpha)
2163 rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2;
2164 }
2165
2166 return rb_mrt_control;
2167 }
2168
2169 uint32_t
2170 tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst)
2171 {
2172 *rop_reads_dst = tu_logic_op_reads_dst(op);
2173 return A6XX_RB_MRT_CONTROL_ROP_ENABLE |
2174 A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(op));
2175 }
2176
2177 static void
2178 tu6_emit_rb_mrt_controls(struct tu_pipeline *pipeline,
2179 const VkPipelineColorBlendStateCreateInfo *blend_info,
2180 const VkFormat attachment_formats[MAX_RTS],
2181 bool *rop_reads_dst,
2182 uint32_t *color_bandwidth_per_sample)
2183 {
2184 const VkPipelineColorWriteCreateInfoEXT *color_info =
2185 vk_find_struct_const(blend_info->pNext,
2186 PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
2187
2188    /* The static color write enables are ignored when that state is dynamic.
2189     * In that case assume everything is enabled and the appropriate registers
2190     * will be zeroed dynamically.
2191     */
2192 if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE))
2193 color_info = NULL;
2194
2195 *rop_reads_dst = false;
2196 *color_bandwidth_per_sample = 0;
2197
2198 uint32_t rb_mrt_control_rop = 0;
2199 if (blend_info->logicOpEnable) {
2200 pipeline->logic_op_enabled = true;
2201 rb_mrt_control_rop = tu6_rb_mrt_control_rop(blend_info->logicOp,
2202 rop_reads_dst);
2203 }
2204
2205 uint32_t total_bpp = 0;
2206 pipeline->num_rts = blend_info->attachmentCount;
2207 for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
2208 const VkPipelineColorBlendAttachmentState *att =
2209 &blend_info->pAttachments[i];
2210 const VkFormat format = attachment_formats[i];
2211
2212 uint32_t rb_mrt_control = 0;
2213 uint32_t rb_mrt_blend_control = 0;
2214 if (format != VK_FORMAT_UNDEFINED &&
2215 (!color_info || color_info->pColorWriteEnables[i])) {
2216 const bool has_alpha = vk_format_has_alpha(format);
2217
2218 rb_mrt_control =
2219 tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
2220 rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
2221
2222 /* calculate bpp based on format and write mask */
2223 uint32_t write_bpp = 0;
2224 if (att->colorWriteMask == 0xf) {
2225 write_bpp = vk_format_get_blocksizebits(format);
2226 } else {
2227 const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
2228 for (uint32_t i = 0; i < 4; i++) {
2229 if (att->colorWriteMask & (1 << i)) {
2230 write_bpp += util_format_get_component_bits(pipe_format,
2231 UTIL_FORMAT_COLORSPACE_RGB, i);
2232 }
2233 }
2234 }
2235 total_bpp += write_bpp;
2236
2237 pipeline->color_write_enable |= BIT(i);
2238 if (att->blendEnable)
2239 pipeline->blend_enable |= BIT(i);
2240
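/* Blending and destination-reading logic ops also read the attachment, so count its bandwidth twice. */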
2241 if (att->blendEnable || *rop_reads_dst) {
2242 total_bpp += write_bpp;
2243 }
2244 }
2245
2246 pipeline->rb_mrt_control[i] = rb_mrt_control & pipeline->rb_mrt_control_mask;
2247 pipeline->rb_mrt_blend_control[i] = rb_mrt_blend_control;
2248 }
2249
2250 *color_bandwidth_per_sample = total_bpp / 8;
2251 }
2252
2253 static void
2254 tu6_emit_blend_control(struct tu_pipeline *pipeline,
2255 uint32_t blend_enable_mask,
2256 bool dual_src_blend,
2257 const VkPipelineMultisampleStateCreateInfo *msaa_info)
2258 {
2259 const uint32_t sample_mask =
2260 msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
2261 : ((1 << msaa_info->rasterizationSamples) - 1);
2262
2263
2264 pipeline->sp_blend_cntl =
2265 A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
2266 .dual_color_in_enable = dual_src_blend,
2267 .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2268 .unk8 = true).value & pipeline->sp_blend_cntl_mask;
2269
2270 /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
2271 pipeline->rb_blend_cntl =
2272 A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
2273 .independent_blend = true,
2274 .sample_mask = sample_mask,
2275 .dual_color_in_enable = dual_src_blend,
2276 .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2277 .alpha_to_one = msaa_info->alphaToOneEnable).value &
2278 pipeline->rb_blend_cntl_mask;
2279 }
2280
2281 static void
2282 tu6_emit_blend(struct tu_cs *cs,
2283 struct tu_pipeline *pipeline)
2284 {
2285 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.dword = pipeline->sp_blend_cntl));
2286 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.dword = pipeline->rb_blend_cntl));
2287
2288 for (unsigned i = 0; i < pipeline->num_rts; i++) {
2289 tu_cs_emit_regs(cs,
2290 A6XX_RB_MRT_CONTROL(i, .dword = pipeline->rb_mrt_control[i]),
2291 A6XX_RB_MRT_BLEND_CONTROL(i, .dword = pipeline->rb_mrt_blend_control[i]));
2292 }
2293 }
2294
2295 static uint32_t
2296 calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config,
2297 uint32_t pvtmem_bytes)
2298 {
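/* Per-fiber private memory is aligned to 512 bytes; the per-SP footprint covers all fibers on one SP, aligned to 4 KB, and the total covers every SP core. */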
2299 uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512);
2300 uint32_t per_sp_size =
2301 ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12);
2302
2303 if (config) {
2304 config->per_fiber_size = per_fiber_size;
2305 config->per_sp_size = per_sp_size;
2306 }
2307
2308 return dev->physical_device->info->num_sp_cores * per_sp_size;
2309 }
2310
2311 static VkResult
2312 tu_setup_pvtmem(struct tu_device *dev,
2313 struct tu_pipeline *pipeline,
2314 struct tu_pvtmem_config *config,
2315 uint32_t pvtmem_bytes, bool per_wave)
2316 {
2317 if (!pvtmem_bytes) {
2318 memset(config, 0, sizeof(*config));
2319 return VK_SUCCESS;
2320 }
2321
2322 uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes);
2323 config->per_wave = per_wave;
2324
2325 VkResult result =
2326 tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size,
2327 TU_BO_ALLOC_NO_FLAGS);
2328 if (result != VK_SUCCESS)
2329 return result;
2330
2331 config->iova = pipeline->pvtmem_bo->iova;
2332
2333 return result;
2334 }
2335
2336
2337 static VkResult
2338 tu_pipeline_allocate_cs(struct tu_device *dev,
2339 struct tu_pipeline *pipeline,
2340 struct tu_pipeline_layout *layout,
2341 struct tu_pipeline_builder *builder,
2342 struct ir3_shader_variant *compute)
2343 {
2344 uint32_t size = 1024 + tu6_load_state_size(pipeline, layout, compute);
2345
2346 /* graphics case: */
2347 if (builder) {
2348 size += 2 * TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
2349
2350 for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
2351 if (builder->shaders->variants[i]) {
2352 size += builder->shaders->variants[i]->info.size / 4;
2353 }
2354 }
2355
2356 size += builder->binning_variant->info.size / 4;
2357
2358 builder->additional_cs_reserve_size = 0;
2359 for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
2360 struct ir3_shader_variant *variant = builder->shaders->variants[i];
2361 if (variant) {
2362 builder->additional_cs_reserve_size +=
2363 tu_xs_get_additional_cs_size_dwords(variant);
2364
2365 if (variant->binning) {
2366 builder->additional_cs_reserve_size +=
2367 tu_xs_get_additional_cs_size_dwords(variant->binning);
2368 }
2369 }
2370 }
2371
2372 /* The additional size is used twice, once per tu6_emit_program() call. */
2373 size += builder->additional_cs_reserve_size * 2;
2374 } else {
2375 size += compute->info.size / 4;
2376
2377 size += tu_xs_get_additional_cs_size_dwords(compute);
2378 }
2379
2380 /* Allocate the space for the pipeline out of the device's RO suballocator.
2381 *
2382 * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
2383 * BOs at exec time.
2384 *
2385 * The pipeline cache would seem like a natural place to stick the
2386 * suballocator, except that it is not guaranteed to outlive the pipelines
2387 * created from it, so you can't store any long-lived state there, and you
2388 * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
2389 * pipeline destroy isn't synchronized by the cache.
2390 */
2391 pthread_mutex_lock(&dev->pipeline_mutex);
2392 VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
2393 size * 4, 128);
2394 pthread_mutex_unlock(&dev->pipeline_mutex);
2395 if (result != VK_SUCCESS)
2396 return result;
2397
2398 tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
2399
2400 return VK_SUCCESS;
2401 }
2402
2403 static void
2404 tu_pipeline_shader_key_init(struct ir3_shader_key *key,
2405 const struct tu_pipeline *pipeline,
2406 const VkGraphicsPipelineCreateInfo *pipeline_info)
2407 {
2408 for (uint32_t i = 0; i < pipeline_info->stageCount; i++) {
2409 if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
2410 key->has_gs = true;
2411 break;
2412 }
2413 }
2414
2415 if (pipeline_info->pRasterizationState->rasterizerDiscardEnable &&
2416 !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD)))
2417 return;
2418
2419 const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState;
2420 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
2421 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
2422 if (msaa_info->rasterizationSamples > 1 ||
2423       /* also set the msaa key when the sample locations are not the default,
2424        * since this affects varying interpolation */
2425 (sample_locations && sample_locations->sampleLocationsEnable)) {
2426 key->msaa = true;
2427 }
2428
2429 /* The 1.3.215 spec says:
2430 *
2431 * Sample shading can be used to specify a minimum number of unique
2432 * samples to process for each fragment. If sample shading is enabled,
2433 * an implementation must provide a minimum of
2434 *
2435 * max(ceil(minSampleShadingFactor * totalSamples), 1)
2436 *
2437 * unique associated data for each fragment, where
2438 * minSampleShadingFactor is the minimum fraction of sample shading.
2439 *
2440 * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
2441 * They both require unique associated data.
2442 *
2443 * There are discussions to change the definition, such that
2444 * sampleShadingEnable does not imply unique associated data. Before the
2445 * discussions are settled and before apps (i.e., ANGLE) are fixed to
2446 * follow the new and incompatible definition, we should stick to the
2447 * current definition.
2448 *
2449 * Note that ir3_shader_key::sample_shading is not actually used by ir3,
2450 * just checked in tu6_emit_fs_inputs. We will also copy the value to
2451 * tu_shader_key::force_sample_interp in a bit.
2452 */
2453 if (msaa_info->sampleShadingEnable &&
2454 (msaa_info->minSampleShading * msaa_info->rasterizationSamples) > 1.0f)
2455 key->sample_shading = true;
2456
2457 /* We set this after we compile to NIR because we need the prim mode */
2458 key->tessellation = IR3_TESS_NONE;
2459 }
2460
2461 static uint32_t
2462 tu6_get_tessmode(struct tu_shader* shader)
2463 {
2464 enum tess_primitive_mode primitive_mode = shader->ir3_shader->nir->info.tess._primitive_mode;
2465 switch (primitive_mode) {
2466 case TESS_PRIMITIVE_ISOLINES:
2467 return IR3_TESS_ISOLINES;
2468 case TESS_PRIMITIVE_TRIANGLES:
2469 return IR3_TESS_TRIANGLES;
2470 case TESS_PRIMITIVE_QUADS:
2471 return IR3_TESS_QUADS;
2472 case TESS_PRIMITIVE_UNSPECIFIED:
2473 return IR3_TESS_NONE;
2474 default:
2475 unreachable("bad tessmode");
2476 }
2477 }
2478
2479 static uint64_t
2480 tu_upload_variant(struct tu_pipeline *pipeline,
2481 const struct ir3_shader_variant *variant)
2482 {
2483 struct tu_cs_memory memory;
2484
2485 if (!variant)
2486 return 0;
2487
2488    /* This expects to get enough alignment because shaders are allocated
2489     * first and the total size is always aligned correctly.
2490     * Note: an assert in tu6_emit_xs_config validates the alignment.
2491     */
2492 tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
2493
2494 memcpy(memory.map, variant->bin, variant->info.size);
2495 return memory.iova;
2496 }
2497
2498 static void
2499 tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant,
2500 char *nir_from_spirv)
2501 {
2502 struct tu_pipeline_executable exe = {
2503 .stage = variant->type,
2504 .nir_from_spirv = nir_from_spirv,
2505 .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
2506 .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
2507 .stats = variant->info,
2508 .is_binning = variant->binning_pass,
2509 };
2510
2511 util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
2512 }
2513
2514 static void
2515 tu_link_shaders(struct tu_pipeline_builder *builder,
2516 nir_shader **shaders, unsigned shaders_count)
2517 {
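/* Walk the stages from last to first, linking each present producer with the most recently seen consumer. */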
2518 nir_shader *consumer = NULL;
2519 for (gl_shader_stage stage = shaders_count - 1;
2520 stage >= MESA_SHADER_VERTEX; stage--) {
2521 if (!shaders[stage])
2522 continue;
2523
2524 nir_shader *producer = shaders[stage];
2525 if (!consumer) {
2526 consumer = producer;
2527 continue;
2528 }
2529
2530 if (nir_link_opt_varyings(producer, consumer)) {
2531 NIR_PASS_V(consumer, nir_opt_constant_folding);
2532 NIR_PASS_V(consumer, nir_opt_algebraic);
2533 NIR_PASS_V(consumer, nir_opt_dce);
2534 }
2535
2536 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
2537 NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
2538
2539 bool progress = nir_remove_unused_varyings(producer, consumer);
2540
2541 nir_compact_varyings(producer, consumer, true);
2542 if (progress) {
2543 if (nir_lower_global_vars_to_local(producer)) {
2544 /* Remove dead writes, which can remove input loads */
2545 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
2546 NIR_PASS_V(producer, nir_opt_dce);
2547 }
2548 nir_lower_global_vars_to_local(consumer);
2549 }
2550
2551 consumer = producer;
2552 }
2553 }
2554
2555 static void
2556 tu_shader_key_init(struct tu_shader_key *key,
2557 const VkPipelineShaderStageCreateInfo *stage_info,
2558 struct tu_device *dev)
2559 {
2560 enum ir3_wavesize_option api_wavesize, real_wavesize;
2561
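/* api_wavesize is the subgroup size exposed to the app; real_wavesize is what the compiler may actually pick, and it only has to match the API size when full subgroups are required (or when only the single size is allowed). */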
2562 if (stage_info) {
2563 if (stage_info->flags &
2564 VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
2565 api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2566 } else {
2567 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *size_info =
2568 vk_find_struct_const(stage_info->pNext,
2569 PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
2570
2571 if (size_info) {
2572 if (size_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
2573 api_wavesize = IR3_SINGLE_ONLY;
2574 } else {
2575 assert(size_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
2576 api_wavesize = IR3_DOUBLE_ONLY;
2577 }
2578 } else {
2579 /* Match the exposed subgroupSize. */
2580 api_wavesize = IR3_DOUBLE_ONLY;
2581 }
2582
2583 if (stage_info->flags &
2584 VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT)
2585 real_wavesize = api_wavesize;
2586 else if (api_wavesize == IR3_SINGLE_ONLY)
2587 real_wavesize = IR3_SINGLE_ONLY;
2588 else
2589 real_wavesize = IR3_SINGLE_OR_DOUBLE;
2590 }
2591 } else {
2592 api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2593 }
2594
2595 key->api_wavesize = api_wavesize;
2596 key->real_wavesize = real_wavesize;
2597 }
2598
2599 static void
2600 tu_hash_stage(struct mesa_sha1 *ctx,
2601 const VkPipelineShaderStageCreateInfo *stage,
2602 const struct tu_shader_key *key)
2603 {
2604 unsigned char stage_hash[SHA1_DIGEST_LENGTH];
2605
2606 vk_pipeline_hash_shader_stage(stage, stage_hash);
2607 _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
2608 _mesa_sha1_update(ctx, key, sizeof(*key));
2609 }
2610
2611 /* Hash flags that can affect ir3 shader compilation but that aren't known
2612  * until logical device creation.
2613  */
2614 static void
2615 tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler)
2616 {
2617 _mesa_sha1_update(ctx, &compiler->robust_buffer_access2,
2618 sizeof(compiler->robust_buffer_access2));
2619 _mesa_sha1_update(ctx, &ir3_shader_debug, sizeof(ir3_shader_debug));
2620 }
2621
2622 static void
2623 tu_hash_shaders(unsigned char *hash,
2624 const VkPipelineShaderStageCreateInfo **stages,
2625 const struct tu_pipeline_layout *layout,
2626 const struct tu_shader_key *keys,
2627 const struct ir3_shader_key *ir3_key,
2628 const struct ir3_compiler *compiler)
2629 {
2630 struct mesa_sha1 ctx;
2631
2632 _mesa_sha1_init(&ctx);
2633
2634 if (layout)
2635 _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
2636
2637    _mesa_sha1_update(&ctx, ir3_key, sizeof(*ir3_key)); /* hash the key contents, not the pointer */
2638
2639 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
2640 if (stages[i]) {
2641 tu_hash_stage(&ctx, stages[i], &keys[i]);
2642 }
2643 }
2644 tu_hash_compiler(&ctx, compiler);
2645 _mesa_sha1_final(&ctx, hash);
2646 }
2647
2648 static void
2649 tu_hash_compute(unsigned char *hash,
2650 const VkPipelineShaderStageCreateInfo *stage,
2651 const struct tu_pipeline_layout *layout,
2652 const struct tu_shader_key *key,
2653 const struct ir3_compiler *compiler)
2654 {
2655 struct mesa_sha1 ctx;
2656
2657 _mesa_sha1_init(&ctx);
2658
2659 if (layout)
2660 _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
2661
2662 tu_hash_stage(&ctx, stage, key);
2663
2664 tu_hash_compiler(&ctx, compiler);
2665 _mesa_sha1_final(&ctx, hash);
2666 }
2667
2668 static bool
2669 tu_shaders_serialize(struct vk_pipeline_cache_object *object,
2670 struct blob *blob);
2671
2672 static struct vk_pipeline_cache_object *
2673 tu_shaders_deserialize(struct vk_device *device,
2674 const void *key_data, size_t key_size,
2675 struct blob_reader *blob);
2676
2677 static void
2678 tu_shaders_destroy(struct vk_pipeline_cache_object *object)
2679 {
2680 struct tu_compiled_shaders *shaders =
2681 container_of(object, struct tu_compiled_shaders, base);
2682
2683 for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++)
2684 ralloc_free(shaders->variants[i]);
2685
2686 vk_pipeline_cache_object_finish(&shaders->base);
2687 vk_free(&object->device->alloc, shaders);
2688 }
2689
2690 const struct vk_pipeline_cache_object_ops tu_shaders_ops = {
2691 .serialize = tu_shaders_serialize,
2692 .deserialize = tu_shaders_deserialize,
2693 .destroy = tu_shaders_destroy,
2694 };
2695
2696 static struct tu_compiled_shaders *
2697 tu_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
2698 {
2699 VK_MULTIALLOC(ma);
2700 VK_MULTIALLOC_DECL(&ma, struct tu_compiled_shaders, shaders, 1);
2701 VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size);
2702
2703 if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2704 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2705 return NULL;
2706
2707 memcpy(obj_key_data, key_data, key_size);
2708 vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
2709 &tu_shaders_ops, obj_key_data, key_size);
2710
2711 return shaders;
2712 }
2713
2714 static bool
2715 tu_shaders_serialize(struct vk_pipeline_cache_object *object,
2716 struct blob *blob)
2717 {
2718 struct tu_compiled_shaders *shaders =
2719 container_of(object, struct tu_compiled_shaders, base);
2720
2721 blob_write_bytes(blob, shaders->push_consts, sizeof(shaders->push_consts));
2722 blob_write_uint8(blob, shaders->active_desc_sets);
2723 blob_write_uint8(blob, shaders->multi_pos_output);
2724
2725 for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) {
2726 if (shaders->variants[i]) {
2727 blob_write_uint8(blob, 1);
2728 ir3_store_variant(blob, shaders->variants[i]);
2729 } else {
2730 blob_write_uint8(blob, 0);
2731 }
2732 }
2733
2734 return true;
2735 }
2736
2737 static struct vk_pipeline_cache_object *
2738 tu_shaders_deserialize(struct vk_device *_device,
2739 const void *key_data, size_t key_size,
2740 struct blob_reader *blob)
2741 {
2742 struct tu_device *dev = container_of(_device, struct tu_device, vk);
2743 struct tu_compiled_shaders *shaders =
2744 tu_shaders_init(dev, key_data, key_size);
2745
2746 if (!shaders)
2747 return NULL;
2748
2749 blob_copy_bytes(blob, shaders->push_consts, sizeof(shaders->push_consts));
2750 shaders->active_desc_sets = blob_read_uint8(blob);
2751 shaders->multi_pos_output = blob_read_uint8(blob);
2752
2753 for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) {
2754 bool has_shader = blob_read_uint8(blob);
2755 if (has_shader) {
2756 shaders->variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL);
2757 }
2758 }
2759
2760 return &shaders->base;
2761 }
2762
2763 static struct tu_compiled_shaders *
2764 tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
2765 const void *key_data, size_t key_size,
2766 bool *application_cache_hit)
2767 {
2768 struct vk_pipeline_cache_object *object =
2769 vk_pipeline_cache_lookup_object(cache, key_data, key_size,
2770 &tu_shaders_ops, application_cache_hit);
2771 if (object)
2772 return container_of(object, struct tu_compiled_shaders, base);
2773 else
2774 return NULL;
2775 }
2776
2777 static struct tu_compiled_shaders *
2778 tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
2779 struct tu_compiled_shaders *shaders)
2780 {
2781 struct vk_pipeline_cache_object *object =
2782 vk_pipeline_cache_add_object(cache, &shaders->base);
2783 return container_of(object, struct tu_compiled_shaders, base);
2784 }
2785
2786 static VkResult
2787 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
2788 struct tu_pipeline *pipeline)
2789 {
2790 VkResult result = VK_SUCCESS;
2791 const struct ir3_compiler *compiler = builder->device->compiler;
2792 const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
2793 NULL
2794 };
2795 VkPipelineCreationFeedback pipeline_feedback = {
2796 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
2797 };
2798 VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
2799
2800 int64_t pipeline_start = os_time_get_nano();
2801
2802 const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
2803 vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
2804
2805 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2806 gl_shader_stage stage =
2807 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2808 stage_infos[stage] = &builder->create_info->pStages[i];
2809 }
2810
2811 if (tu6_shared_constants_enable(builder->layout, builder->device->compiler)) {
2812 pipeline->shared_consts = (struct tu_push_constant_range) {
2813 .lo = 0,
2814 .dwords = builder->layout->push_constant_size / 4,
2815 };
2816 }
2817
2818 struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
2819 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2820 stage < ARRAY_SIZE(keys); stage++) {
2821 tu_shader_key_init(&keys[stage], stage_infos[stage], builder->device);
2822 }
2823
2824 struct ir3_shader_key ir3_key = {};
2825 tu_pipeline_shader_key_init(&ir3_key, pipeline, builder->create_info);
2826
2827 keys[MESA_SHADER_VERTEX].multiview_mask = builder->multiview_mask;
2828 keys[MESA_SHADER_FRAGMENT].multiview_mask = builder->multiview_mask;
2829 keys[MESA_SHADER_FRAGMENT].force_sample_interp = ir3_key.sample_shading;
2830
2831 unsigned char pipeline_sha1[20];
2832 tu_hash_shaders(pipeline_sha1, stage_infos, builder->layout, keys, &ir3_key, compiler);
2833
2834 const bool executable_info = builder->create_info->flags &
2835 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
2836
2837 char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
2838
2839 struct tu_compiled_shaders *compiled_shaders;
2840
2841 if (!executable_info) {
2842 bool application_cache_hit = false;
2843
2844 compiled_shaders =
2845 tu_pipeline_cache_lookup(builder->cache, &pipeline_sha1,
2846 sizeof(pipeline_sha1),
2847 &application_cache_hit);
2848
2849 if (application_cache_hit && builder->cache != builder->device->mem_cache) {
2850 pipeline_feedback.flags |=
2851 VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
2852 }
2853
2854 if (compiled_shaders)
2855 goto done;
2856 }
2857
2858 if (builder->create_info->flags &
2859 VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
2860 return VK_PIPELINE_COMPILE_REQUIRED;
2861 }
2862
2863 nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
2864
2865 struct tu_shader *shaders[ARRAY_SIZE(nir)] = { NULL };
2866
2867 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2868 stage < ARRAY_SIZE(nir); stage++) {
2869 const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2870 if (!stage_info)
2871 continue;
2872
2873 int64_t stage_start = os_time_get_nano();
2874
2875 nir[stage] = tu_spirv_to_nir(builder->device, builder->mem_ctx, stage_info, stage);
2876 if (!nir[stage]) {
2877 result = VK_ERROR_OUT_OF_HOST_MEMORY;
2878 goto fail;
2879 }
2880
2881 stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2882 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2883 }
2884
2885 if (!nir[MESA_SHADER_FRAGMENT]) {
2886 const nir_shader_compiler_options *nir_options =
2887 ir3_get_compiler_options(builder->device->compiler);
2888 nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
2889 nir_options,
2890 "noop_fs");
2891 nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
2892 }
2893
2894 if (executable_info) {
2895 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2896 stage < ARRAY_SIZE(nir); stage++) {
2897 if (!nir[stage])
2898 continue;
2899
2900 nir_initial_disasm[stage] =
2901 nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx);
2902 }
2903 }
2904
2905 tu_link_shaders(builder, nir, ARRAY_SIZE(nir));
2906
2907 uint32_t desc_sets = 0;
2908 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2909 stage < ARRAY_SIZE(nir); stage++) {
2910 if (!nir[stage])
2911 continue;
2912
2913 int64_t stage_start = os_time_get_nano();
2914
2915 struct tu_shader *shader =
2916 tu_shader_create(builder->device, nir[stage], &keys[stage],
2917 builder->layout, builder->alloc);
2918 if (!shader) {
2919 result = VK_ERROR_OUT_OF_HOST_MEMORY;
2920 goto fail;
2921 }
2922
2923 /* In SPIR-V generated from GLSL, the primitive mode is specified in the
2924 * tessellation evaluation shader, but in SPIR-V generated from HLSL,
2925 * the mode is specified in the tessellation control shader. */
2926 if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
2927 ir3_key.tessellation == IR3_TESS_NONE) {
2928 ir3_key.tessellation = tu6_get_tessmode(shader);
2929 }
2930
2931 if (stage > MESA_SHADER_TESS_CTRL) {
2932 if (stage == MESA_SHADER_FRAGMENT) {
2933 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2934 (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2935 } else {
2936 ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2937 BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2938 }
2939 }
2940
2941       /* Keep track of each shader's active descriptor sets, which are
2942        * determined in tu_lower_io. */
2943 desc_sets |= shader->active_desc_sets;
2944
2945 shaders[stage] = shader;
2946
2947 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2948 }
2949
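/* Layer and viewport outputs are taken from the last geometry stage present (GS, then TES, then VS). */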
2950 struct tu_shader *last_shader = shaders[MESA_SHADER_GEOMETRY];
2951 if (!last_shader)
2952 last_shader = shaders[MESA_SHADER_TESS_EVAL];
2953 if (!last_shader)
2954 last_shader = shaders[MESA_SHADER_VERTEX];
2955
2956 uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written;
2957
2958 ir3_key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
2959 ir3_key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);
2960
2961 compiled_shaders =
2962 tu_shaders_init(builder->device, &pipeline_sha1, sizeof(pipeline_sha1));
2963
2964 if (!compiled_shaders) {
2965 result = VK_ERROR_OUT_OF_HOST_MEMORY;
2966 goto fail;
2967 }
2968
2969 compiled_shaders->active_desc_sets = desc_sets;
2970 compiled_shaders->multi_pos_output =
2971 shaders[MESA_SHADER_VERTEX]->multi_pos_output;
2972
2973 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2974 stage < ARRAY_SIZE(shaders); stage++) {
2975 if (!shaders[stage])
2976 continue;
2977
2978 int64_t stage_start = os_time_get_nano();
2979
2980 compiled_shaders->variants[stage] =
2981 ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key,
2982 executable_info);
2983       if (!compiled_shaders->variants[stage]) {
2984          result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto fail;
      }
2985
2986 compiled_shaders->push_consts[stage] = shaders[stage]->push_consts;
2987
2988 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2989 }
2990
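/* ir3_trim_constlen returns a mask of stages whose constlen had to be trimmed; recompile those with safe_constlen set. */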
2991 uint32_t safe_constlens = ir3_trim_constlen(compiled_shaders->variants, compiler);
2992
2993 ir3_key.safe_constlen = true;
2994
2995 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2996 stage < ARRAY_SIZE(shaders); stage++) {
2997 if (!shaders[stage])
2998 continue;
2999
3000 if (safe_constlens & (1 << stage)) {
3001 int64_t stage_start = os_time_get_nano();
3002
3003 ralloc_free(compiled_shaders->variants[stage]);
3004 compiled_shaders->variants[stage] =
3005 ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key,
3006 executable_info);
3007 if (!compiled_shaders->variants[stage]) {
3008 result = VK_ERROR_OUT_OF_HOST_MEMORY;
3009 goto fail;
3010 }
3011
3012 stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
3013 }
3014 }
3015
3016 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
3017 stage < ARRAY_SIZE(nir); stage++) {
3018 if (shaders[stage]) {
3019 tu_shader_destroy(builder->device, shaders[stage], builder->alloc);
3020 }
3021 }
3022
3023 compiled_shaders =
3024 tu_pipeline_cache_insert(builder->cache, compiled_shaders);
3025
3026 done:
3027 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
3028 stage < ARRAY_SIZE(nir); stage++) {
3029 if (compiled_shaders->variants[stage]) {
3030 tu_append_executable(pipeline, compiled_shaders->variants[stage],
3031 nir_initial_disasm[stage]);
3032 }
3033 }
3034
3035 struct ir3_shader_variant *vs =
3036 compiled_shaders->variants[MESA_SHADER_VERTEX];
3037
3038 struct ir3_shader_variant *variant;
3039 if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
3040 tu_append_executable(pipeline, vs->binning, NULL);
3041 variant = vs->binning;
3042 } else {
3043 variant = vs;
3044 }
3045
3046 builder->binning_variant = variant;
3047
3048 builder->shaders = compiled_shaders;
3049
3050 pipeline->active_desc_sets = compiled_shaders->active_desc_sets;
3051 if (compiled_shaders->variants[MESA_SHADER_TESS_CTRL]) {
3052 pipeline->tess.patch_type =
3053 compiled_shaders->variants[MESA_SHADER_TESS_CTRL]->key.tessellation;
3054 }
3055
3056 pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
3057 if (creation_feedback) {
3058 *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
3059
3060 assert(builder->create_info->stageCount ==
3061 creation_feedback->pipelineStageCreationFeedbackCount);
3062 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
3063 gl_shader_stage s =
3064 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
3065 creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
3066 }
3067 }
3068
3069 return VK_SUCCESS;
3070
3071 fail:
3072 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
3073 stage < ARRAY_SIZE(nir); stage++) {
3074 if (shaders[stage]) {
3075 tu_shader_destroy(builder->device, shaders[stage], builder->alloc);
3076 }
3077 }
3078
3079 if (compiled_shaders)
3080 vk_pipeline_cache_object_unref(&compiled_shaders->base);
3081
3082 return result;
3083 }
3084
3085 static void
3086 tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
3087 struct tu_pipeline *pipeline)
3088 {
3089 const VkPipelineDynamicStateCreateInfo *dynamic_info =
3090 builder->create_info->pDynamicState;
3091
3092 pipeline->gras_su_cntl_mask = ~0u;
3093 pipeline->rb_depth_cntl_mask = ~0u;
3094 pipeline->rb_stencil_cntl_mask = ~0u;
3095 pipeline->pc_raster_cntl_mask = ~0u;
3096 pipeline->vpc_unknown_9107_mask = ~0u;
3097 pipeline->sp_blend_cntl_mask = ~0u;
3098 pipeline->rb_blend_cntl_mask = ~0u;
3099 pipeline->rb_mrt_control_mask = ~0u;
3100
3101 if (!dynamic_info)
3102 return;
3103
3104 for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
3105 VkDynamicState state = dynamic_info->pDynamicStates[i];
3106 switch (state) {
3107 case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
3108 if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
3109 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
3110 pipeline->dynamic_state_mask |= BIT(state);
3111 break;
3112 case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
3113 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
3114 break;
3115 case VK_DYNAMIC_STATE_CULL_MODE:
3116 pipeline->gras_su_cntl_mask &=
3117 ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
3118 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
3119 break;
3120 case VK_DYNAMIC_STATE_FRONT_FACE:
3121 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
3122 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
3123 break;
3124 case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY:
3125 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
3126 break;
3127 case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE:
3128 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
3129 break;
3130 case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT:
3131 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
3132 break;
3133 case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT:
3134 pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
3135 break;
3136 case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE:
3137 pipeline->rb_depth_cntl_mask &=
3138 ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
3139 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3140 break;
3141 case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE:
3142 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
3143 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3144 break;
3145 case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP:
3146 pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
3147 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3148 break;
3149 case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE:
3150 pipeline->rb_depth_cntl_mask &=
3151 ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
3152 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3153 break;
3154 case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE:
3155 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
3156 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
3157 A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
3158 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
3159 break;
3160 case VK_DYNAMIC_STATE_STENCIL_OP:
3161 pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
3162 A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
3163 A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
3164 A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK |
3165 A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
3166 A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
3167 A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
3168 A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
3169 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
3170 break;
3171 case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE:
3172 pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
3173 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
3174 break;
3175 case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE:
3176 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
3177 break;
3178 case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE:
3179 pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD;
3180 pipeline->vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
3181 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD);
3182 break;
3183 case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
3184 pipeline->sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK;
3185 pipeline->rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK;
3186 pipeline->rb_mrt_control_mask &= ~A6XX_RB_MRT_CONTROL_ROP_CODE__MASK;
3187 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND);
3188 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_LOGIC_OP);
3189 break;
3190 case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
3191 pipeline->sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK;
3192 pipeline->rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK;
3193 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND);
3194
3195          /* Dynamic color write enable doesn't directly change any of the
3196           * registers, but it can force some of the register values to 0, so we
3197           * track it as its own dynamic state instead of making those registers dynamic.
3198 */
3199 pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE);
3200 break;
3201 default:
3202 assert(!"unsupported dynamic state");
3203 break;
3204 }
3205 }
3206 }
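/* How these masks are meant to be consumed: a register bit that is still set
 * in the mask is owned by the pipeline, while cleared bits are owned by the
 * corresponding dynamic state set on the command buffer. A plausible merge at
 * bind time (a sketch; the real logic lives in the command buffer code) is:
 *
 *    uint32_t merged = (pipeline_value & pipeline_mask) |
 *                      (dynamic_value  & ~pipeline_mask);
 *
 * so making e.g. VK_DYNAMIC_STATE_LINE_WIDTH dynamic only hands the
 * LINEHALFWIDTH field of GRAS_SU_CNTL over to vkCmdSetLineWidth() while the
 * rest of the register still comes from the pipeline.
 */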
3207
3208 static void
3209 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
3210 struct tu_push_constant_range *push_consts,
3211 struct ir3_shader_variant *v)
3212 {
3213 link->const_state = *ir3_const_state(v);
3214 link->constlen = v->constlen;
3215 link->push_consts = *push_consts;
3216 }
3217
3218 static void
3219 tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
3220 struct tu_pipeline *pipeline)
3221 {
3222 struct tu_cs prog_cs;
3223
3224 /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
3225 * else that could depend on that state (like push constants)
3226 *
3227 * Note also that this always uses the full VS even in binning pass. The
3228 * binning pass variant has the same const layout as the full VS, and
3229 * the constlen for the VS will be the same or greater than the constlen
3230 * for the binning pass variant. It is required that the constlen state
3231 * matches between binning and draw passes, as some parts of the push
3232 * consts are emitted in state groups that are shared between the binning
3233 * and draw passes.
3234 */
3235 tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
3236 tu6_emit_program_config(&prog_cs, builder);
3237 pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3238
3239 tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
3240 tu6_emit_program(&prog_cs, builder, false, pipeline);
3241 pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3242
3243 tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
3244 tu6_emit_program(&prog_cs, builder, true, pipeline);
3245 pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3246
3247 VkShaderStageFlags stages = 0;
3248 for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
3249 stages |= builder->create_info->pStages[i].stage;
3250 }
3251 pipeline->active_stages = stages;
3252
3253 for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
3254 if (!builder->shaders->variants[i])
3255 continue;
3256
3257 tu_pipeline_set_linkage(&pipeline->program.link[i],
3258 &builder->shaders->push_consts[i],
3259 builder->shaders->variants[i]);
3260 }
3261 }
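/* The constlen-matching requirement described in the comment above can be
 * stated as a hypothetical assertion (not present in this driver, shown only
 * to make the invariant concrete):
 *
 *    assert(builder->binning_variant->constlen <=
 *           builder->shaders->variants[MESA_SHADER_VERTEX]->constlen);
 *
 * i.e. programming the shared push-constant state with the full VS constlen
 * is always safe for the binning variant as well.
 */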
3262
3263 static void
3264 tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
3265 struct tu_pipeline *pipeline)
3266 {
3267 const VkPipelineVertexInputStateCreateInfo *vi_info =
3268 builder->create_info->pVertexInputState;
3269 const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX];
3270 const struct ir3_shader_variant *bs = builder->binning_variant;
3271
3272 /* Bindings may contain holes */
3273 for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
3274 pipeline->num_vbs =
3275 MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
3276 }
3277
3278 tu6_emit_vertex_input(pipeline, &pipeline->vi.state, vs, vi_info);
3279 if (bs)
3280 tu6_emit_vertex_input(pipeline, &pipeline->vi.binning_state, bs, vi_info);
3281 }
3282
3283 static void
3284 tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
3285 struct tu_pipeline *pipeline)
3286 {
3287 const VkPipelineInputAssemblyStateCreateInfo *ia_info =
3288 builder->create_info->pInputAssemblyState;
3289
3290 pipeline->ia.primtype = tu6_primtype(ia_info->topology);
3291 pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
3292 }
3293
3294 static bool
3295 tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
3296 uint32_t id, uint32_t size)
3297 {
3298 assert(id < ARRAY_SIZE(pipeline->dynamic_state));
3299
3300 if (pipeline->dynamic_state_mask & BIT(id))
3301 return false;
3302
3303 pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
3304 return true;
3305 }
3306
3307 static void
3308 tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
3309 struct tu_pipeline *pipeline)
3310 {
3311 if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
3312 !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
3313 return;
3314
3315 const VkPipelineTessellationStateCreateInfo *tess_info =
3316 builder->create_info->pTessellationState;
3317
3318 assert(pipeline->ia.primtype == DI_PT_PATCHES0);
3319 assert(tess_info->patchControlPoints <= 32);
3320 pipeline->ia.primtype += tess_info->patchControlPoints;
3321 const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
3322 vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
3323 pipeline->tess.upper_left_domain_origin = !domain_info ||
3324 domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
3325 const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL];
3326 pipeline->tess.param_stride = hs->output_size * 4;
3327 }
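/* Worked example of the arithmetic above: DI_PT_PATCHES0 starts a contiguous
 * range of patch primitive types, so with
 *
 *    tess_info->patchControlPoints == 3
 *
 * the pipeline ends up with ia.primtype == DI_PT_PATCHES0 + 3 (3-vertex
 * patches), and an HS whose output_size is, say, 16 dwords gets
 * param_stride == 16 * 4 == 64 bytes.
 */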
3328
3329 static void
3330 tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
3331 struct tu_pipeline *pipeline)
3332 {
3333 /* The spec says:
3334 *
3335 * pViewportState is a pointer to an instance of the
3336 * VkPipelineViewportStateCreateInfo structure, and is ignored if the
3337     *    pipeline has rasterization disabled.
3338 *
3339 * We leave the relevant registers stale in that case.
3340 */
3341 if (builder->rasterizer_discard)
3342 return;
3343
3344 const VkPipelineViewportStateCreateInfo *vp_info =
3345 builder->create_info->pViewportState;
3346 const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_info =
3347 vk_find_struct_const(vp_info->pNext, PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT);
3348 pipeline->z_negative_one_to_one = depth_clip_info ? depth_clip_info->negativeOneToOne : false;
3349
3350 struct tu_cs cs;
3351
3352 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
3353 tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount, pipeline->z_negative_one_to_one);
3354
3355 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
3356 tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
3357 }
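/* For reference, an application opts into a [-1, 1] clip-space depth range
 * (VK_EXT_depth_clip_control) by chaining the struct below into
 * pViewportState; a minimal API-side sketch ("viewport" and "scissor" are the
 * application's own objects, not something defined here):
 *
 *    VkPipelineViewportDepthClipControlCreateInfoEXT clip_control = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT,
 *       .negativeOneToOne = VK_TRUE,
 *    };
 *    VkPipelineViewportStateCreateInfo vp_state = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
 *       .pNext = &clip_control,
 *       .viewportCount = 1, .pViewports = &viewport,
 *       .scissorCount = 1, .pScissors = &scissor,
 *    };
 *
 * With that chained, z_negative_one_to_one above becomes true and the
 * viewport z transform is programmed for the [-1, 1] convention.
 */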
3358
3359 static void
3360 tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
3361 struct tu_pipeline *pipeline)
3362 {
3363 const VkPipelineRasterizationStateCreateInfo *rast_info =
3364 builder->create_info->pRasterizationState;
3365
3366 enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);
3367
3368 builder->depth_clip_disable = rast_info->depthClampEnable;
3369
3370 const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
3371 vk_find_struct_const(rast_info, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
3372 if (depth_clip_state)
3373 builder->depth_clip_disable = !depth_clip_state->depthClipEnable;
3374
3375 pipeline->line_mode = RECTANGULAR;
3376
3377 if (tu6_primtype_line(pipeline->ia.primtype) ||
3378 (tu6_primtype_patches(pipeline->ia.primtype) &&
3379 pipeline->tess.patch_type == IR3_TESS_ISOLINES)) {
3380 const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state =
3381 vk_find_struct_const(rast_info->pNext,
3382 PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
3383
3384 if (rast_line_state && rast_line_state->lineRasterizationMode ==
3385 VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
3386 pipeline->line_mode = BRESENHAM;
3387 }
3388 }
3389
3390 struct tu_cs cs;
3391 uint32_t cs_size = 9 +
3392 (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) +
3393 (builder->emit_msaa_state ? 11 : 0);
3394 pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size);
3395
3396 tu_cs_emit_regs(&cs,
3397 A6XX_GRAS_CL_CNTL(
3398 .znear_clip_disable = builder->depth_clip_disable,
3399 .zfar_clip_disable = builder->depth_clip_disable,
3400 /* TODO should this be depth_clip_disable instead? */
3401 .unk5 = rast_info->depthClampEnable,
3402 .zero_gb_scale_z = pipeline->z_negative_one_to_one ? 0 : 1,
3403 .vp_clip_code_ignore = 1));
3404
3405 tu_cs_emit_regs(&cs,
3406 A6XX_VPC_POLYGON_MODE(mode));
3407
3408 tu_cs_emit_regs(&cs,
3409 A6XX_PC_POLYGON_MODE(mode));
3410
3411 /* move to hw ctx init? */
3412 tu_cs_emit_regs(&cs,
3413 A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
3414 A6XX_GRAS_SU_POINT_SIZE(1.0f));
3415
3416 if (builder->device->physical_device->info->a6xx.has_shading_rate) {
3417 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
3418 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
3419 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
3420 tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
3421 }
3422
3423    /* If the sample count couldn't be derived from the subpass, we have to emit
3424     * it here. This happens when the subpass doesn't use any color/depth attachment.
3425     */
3426 if (builder->emit_msaa_state)
3427 tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode);
3428
3429 const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
3430 vk_find_struct_const(rast_info->pNext,
3431 PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
3432 unsigned stream = stream_info ? stream_info->rasterizationStream : 0;
3433
3434 pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
3435 pipeline->vpc_unknown_9107 = 0;
3436 if (rast_info->rasterizerDiscardEnable) {
3437 pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
3438 pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
3439 }
3440
3441 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
3442 tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl));
3443 tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107));
3444 }
3445
3446 pipeline->gras_su_cntl =
3447 tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0);
3448
3449 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
3450 tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));
3451
3452 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
3453 tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
3454 rast_info->depthBiasClamp,
3455 rast_info->depthBiasSlopeFactor);
3456 }
3457
3458 const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
3459 vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
3460 pipeline->provoking_vertex_last = provoking_vtx_state &&
3461 provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
3462 }
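/* Similarly, Bresenham line rasterization is requested by the application
 * through VK_EXT_line_rasterization; a minimal API-side sketch of the struct
 * that vk_find_struct_const() above looks for (the surrounding create-info
 * setup is assumed):
 *
 *    VkPipelineRasterizationLineStateCreateInfoEXT line_state = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT,
 *       .lineRasterizationMode = VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT,
 *    };
 *    rast_create_info.pNext = &line_state;
 *
 * which is what flips line_mode from RECTANGULAR to BRESENHAM for line-type
 * (or isoline-patch) topologies.
 */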
3463
3464 static void
3465 tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
3466 struct tu_pipeline *pipeline)
3467 {
3468 /* The spec says:
3469 *
3470 * pDepthStencilState is a pointer to an instance of the
3471 * VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
3472 * the pipeline has rasterization disabled or if the subpass of the
3473 * render pass the pipeline is created against does not use a
3474 * depth/stencil attachment.
3475 */
3476 const VkPipelineDepthStencilStateCreateInfo *ds_info =
3477 builder->create_info->pDepthStencilState;
3478 const enum pipe_format pipe_format =
3479 vk_format_to_pipe_format(builder->depth_attachment_format);
3480 uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
3481 struct tu_cs cs;
3482
3483 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
3484 builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
3485 if (ds_info->depthTestEnable) {
3486 rb_depth_cntl |=
3487 A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
3488 A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
3489 A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */
3490
3491 if (builder->depth_clip_disable)
3492 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLIP_DISABLE;
3493
3494 if (ds_info->depthWriteEnable)
3495 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
3496 }
3497
3498 if (ds_info->depthBoundsTestEnable)
3499 rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
3500
3501 if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
3502 tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
3503
3504 pipeline->depth_cpp_per_sample = util_format_get_component_bits(
3505 pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
3506 } else {
3507 /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
3508 * to 0 when this pipeline is used, as enabling depth test when there
3509 * is no depth attachment is a problem (at least for the S8_UINT case)
3510 */
3511 if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
3512 pipeline->rb_depth_cntl_disable = true;
3513 }
3514
3515 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
3516 const VkStencilOpState *front = &ds_info->front;
3517 const VkStencilOpState *back = &ds_info->back;
3518
3519 rb_stencil_cntl |=
3520 A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
3521 A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
3522 A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
3523 A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
3524 A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
3525 A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
3526 A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
3527 A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));
3528
3529 if (ds_info->stencilTestEnable) {
3530 rb_stencil_cntl |=
3531 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
3532 A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
3533 A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
3534 }
3535
3536 pipeline->stencil_cpp_per_sample = util_format_get_component_bits(
3537 pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
3538 }
3539
3540 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
3541 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
3542 tu_cs_emit(&cs, rb_depth_cntl);
3543 }
3544 pipeline->rb_depth_cntl = rb_depth_cntl;
3545
3546 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
3547 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
3548 tu_cs_emit(&cs, rb_stencil_cntl);
3549 }
3550 pipeline->rb_stencil_cntl = rb_stencil_cntl;
3551
3552    /* the remaining draw states aren't used if there is no d/s, leave them empty */
3553 if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
3554 return;
3555
3556 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
3557 tu_cs_emit_regs(&cs,
3558 A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
3559 A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
3560 }
3561
3562 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
3563 tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
3564 .bfmask = ds_info->back.compareMask & 0xff));
3565 }
3566
3567 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
3568 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask);
3569 update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask);
3570 tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask));
3571 }
3572
3573 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
3574 tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
3575 .bfref = ds_info->back.reference & 0xff));
3576 }
3577
3578 if (builder->shaders->variants[MESA_SHADER_FRAGMENT]) {
3579 const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT];
3580 if (fs->has_kill || builder->alpha_to_coverage) {
3581 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3582 }
3583 if (fs->no_earlyz || fs->writes_pos) {
3584 pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ;
3585 }
3586 }
3587 }
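/* Rationale for the LRZ policy above, plus the blend handling in the next
 * function (a summary sketch, not extra driver logic):
 *
 *    // Discard/alpha-to-coverage: the fragment may still be killed after the
 *    // depth test, so this draw's depth can't be written into the LRZ
 *    // buffer, but testing against existing LRZ data remains valid.
 *    if (fs->has_kill || alpha_to_coverage)
 *       force_disable |= TU_LRZ_FORCE_DISABLE_WRITE;
 *
 *    // Late Z / gl_FragDepth: depth is only known after shading, so LRZ can
 *    // neither test nor write and is disabled entirely.
 *    if (fs->no_earlyz || fs->writes_pos)
 *       force_disable = TU_LRZ_FORCE_DISABLE_LRZ;
 */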
3588
3589 static void
3590 tu_pipeline_builder_parse_multisample_and_color_blend(
3591 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3592 {
3593 /* The spec says:
3594 *
3595 * pMultisampleState is a pointer to an instance of the
3596 * VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
3597 * has rasterization disabled.
3598 *
3599 * Also,
3600 *
3601 * pColorBlendState is a pointer to an instance of the
3602 * VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
3603 * pipeline has rasterization disabled or if the subpass of the render
3604 * pass the pipeline is created against does not use any color
3605 * attachments.
3606 *
3607 * We leave the relevant registers stale when rasterization is disabled.
3608 */
3609 if (builder->rasterizer_discard)
3610 return;
3611
3612 static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
3613 const VkPipelineMultisampleStateCreateInfo *msaa_info =
3614 builder->create_info->pMultisampleState;
3615 const VkPipelineColorBlendStateCreateInfo *blend_info =
3616 builder->use_color_attachments ? builder->create_info->pColorBlendState
3617 : &dummy_blend_info;
3618
3619 struct tu_cs cs;
3620 tu6_emit_rb_mrt_controls(pipeline, blend_info,
3621 builder->color_attachment_formats,
3622 &pipeline->rop_reads_dst,
3623 &pipeline->color_bandwidth_per_sample);
3624
3625 uint32_t blend_enable_mask =
3626 pipeline->rop_reads_dst ? pipeline->color_write_enable : pipeline->blend_enable;
3627 tu6_emit_blend_control(pipeline, blend_enable_mask,
3628 builder->use_dual_src_blend, msaa_info);
3629
3630 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_BLEND,
3631 blend_info->attachmentCount * 3 + 4)) {
3632 tu6_emit_blend(&cs, pipeline);
3633 assert(cs.cur == cs.end); /* validate draw state size */
3634 }
3635
3636    /* Disable LRZ writes when blending, or a logic op that reads the destination, is
3637 * enabled, since the resulting pixel value from the blend-draw depends on
3638 * an earlier draw, which LRZ in the draw pass could early-reject if the
3639 * previous blend-enabled draw wrote LRZ.
3640 *
3641 * TODO: We need to disable LRZ writes only for the binning pass.
3642 * Therefore, we need to emit it in a separate draw state. We keep
3643 * it disabled for sysmem path as well for the moment.
3644 */
3645 if (blend_enable_mask)
3646 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3647
3648 for (int i = 0; i < blend_info->attachmentCount; i++) {
3649 VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
3650 /* From the PoV of LRZ, having masked color channels is
3651 * the same as having blend enabled, in that the draw will
3652 * care about the fragments from an earlier draw.
3653 */
3654 VkFormat format = builder->color_attachment_formats[i];
3655 unsigned mask = MASK(vk_format_get_nr_components(format));
3656 if (format != VK_FORMAT_UNDEFINED &&
3657 ((blendAttachment.colorWriteMask & mask) != mask ||
3658 !(pipeline->color_write_enable & BIT(i)))) {
3659 pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3660 }
3661 }
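   /* Concrete example of the mask check above: for a VK_FORMAT_R8G8B8A8_UNORM
    * attachment vk_format_get_nr_components() returns 4, so the full mask is
    * MASK(4) == 0xf.  An application writing only RGB (colorWriteMask == 0x7),
    * or one that turned the attachment off via VK_EXT_color_write_enable,
    * leaves the untouched channels holding data from earlier draws, which from
    * LRZ's point of view is equivalent to blending, hence the write disable.
    */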
3662
3663 if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
3664 tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3665 tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
3666 }
3667
3668 const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
3669 vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
3670 const VkSampleLocationsInfoEXT *samp_loc = NULL;
3671
3672 if (sample_locations && sample_locations->sampleLocationsEnable)
3673 samp_loc = &sample_locations->sampleLocationsInfo;
3674
3675 if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3676 samp_loc ? 9 : 6)) {
3677 tu6_emit_sample_locations(&cs, samp_loc);
3678 }
3679 }
3680
3681 static void
3682 tu_pipeline_builder_parse_rasterization_order(
3683 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3684 {
3685 if (builder->rasterizer_discard)
3686 return;
3687
3688 pipeline->subpass_feedback_loop_ds = builder->subpass_feedback_loop_ds;
3689
3690 const VkPipelineColorBlendStateCreateInfo *blend_info =
3691 builder->create_info->pColorBlendState;
3692
3693 const VkPipelineDepthStencilStateCreateInfo *ds_info =
3694 builder->create_info->pDepthStencilState;
3695
3696 if (builder->use_color_attachments) {
3697 pipeline->raster_order_attachment_access =
3698 blend_info->flags &
3699 VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM;
3700 }
3701
3702 if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
3703 pipeline->raster_order_attachment_access |=
3704 ds_info->flags &
3705 (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM |
3706 VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM);
3707 }
3708
3709 if (unlikely(builder->device->physical_device->instance->debug_flags & TU_DEBUG_RAST_ORDER))
3710 pipeline->raster_order_attachment_access = true;
3711
3712 /* VK_EXT_blend_operation_advanced would also require ordered access
3713 * when implemented in the future.
3714 */
3715
3716 uint32_t sysmem_prim_mode = NO_FLUSH;
3717 uint32_t gmem_prim_mode = NO_FLUSH;
3718
3719 if (pipeline->raster_order_attachment_access) {
3720 /* VK_ARM_rasterization_order_attachment_access:
3721 *
3722        * This extension allows access to framebuffer attachments when used as
3723 * both input and color attachments from one fragment to the next,
3724 * in rasterization order, without explicit synchronization.
3725 */
3726 sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
3727 gmem_prim_mode = FLUSH_PER_OVERLAP;
3728 } else {
3729 /* If there is a feedback loop, then the shader can read the previous value
3730 * of a pixel being written out. It can also write some components and then
3731 * read different components without a barrier in between. This is a
3732 * problem in sysmem mode with UBWC, because the main buffer and flags
3733 * buffer can get out-of-sync if only one is flushed. We fix this by
3734 * setting the SINGLE_PRIM_MODE field to the same value that the blob does
3735 * for advanced_blend in sysmem mode if a feedback loop is detected.
3736 */
3737 if (builder->subpass_feedback_loop_color ||
3738 builder->subpass_feedback_loop_ds) {
3739 sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
3740 }
3741 }
3742
3743 struct tu_cs cs;
3744
3745 pipeline->prim_order_state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3746 tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3747 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3748 A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
3749
3750 pipeline->prim_order_state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3751 tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3752 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3753 A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode));
3754 }
3755
3756 static void
3757 tu_pipeline_finish(struct tu_pipeline *pipeline,
3758 struct tu_device *dev,
3759 const VkAllocationCallbacks *alloc)
3760 {
3761 tu_cs_finish(&pipeline->cs);
3762 pthread_mutex_lock(&dev->pipeline_mutex);
3763 tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
3764 pthread_mutex_unlock(&dev->pipeline_mutex);
3765
3766 if (pipeline->pvtmem_bo)
3767 tu_bo_finish(dev, pipeline->pvtmem_bo);
3768
3769 ralloc_free(pipeline->executables_mem_ctx);
3770 }
3771
3772 static VkResult
3773 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
3774 struct tu_pipeline **pipeline)
3775 {
3776 VkResult result;
3777
3778 *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
3779 sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
3780 if (!*pipeline)
3781 return VK_ERROR_OUT_OF_HOST_MEMORY;
3782
3783 (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
3784 util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
3785
3786 /* compile and upload shaders */
3787 result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
3788 if (result != VK_SUCCESS) {
3789 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3790 return result;
3791 }
3792
3793 result = tu_pipeline_allocate_cs(builder->device, *pipeline,
3794 builder->layout, builder, NULL);
3795 if (result != VK_SUCCESS) {
3796 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3797 return result;
3798 }
3799
3800 for (uint32_t i = 0; i < ARRAY_SIZE(builder->shader_iova); i++)
3801 builder->shader_iova[i] =
3802 tu_upload_variant(*pipeline, builder->shaders->variants[i]);
3803
3804 builder->binning_vs_iova =
3805 tu_upload_variant(*pipeline, builder->binning_variant);
3806
3807    /* Set up private memory. Note that because we're sharing the same private
3808 * memory for all stages, all stages must use the same config, or else
3809 * fibers from one stage might overwrite fibers in another.
3810 */
3811
3812 uint32_t pvtmem_size = 0;
3813 bool per_wave = true;
3814 for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
3815 if (builder->shaders->variants[i]) {
3816 pvtmem_size = MAX2(pvtmem_size, builder->shaders->variants[i]->pvtmem_size);
3817 if (!builder->shaders->variants[i]->pvtmem_per_wave)
3818 per_wave = false;
3819 }
3820 }
3821
3822 if (builder->binning_variant) {
3823 pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size);
3824 if (!builder->binning_variant->pvtmem_per_wave)
3825 per_wave = false;
3826 }
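   /* Worked example of the shared-config rule above: if the VS wants 512 bytes
    * of private memory in per-wave mode while the FS wants 1024 bytes in
    * per-fiber mode, the pipeline gets
    *
    *    pvtmem_size = MAX2(512, 1024);    // -> 1024
    *    per_wave    = true && false;      // -> false
    *
    * i.e. the most conservative combination that satisfies every stage.
    */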
3827
3828 result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem,
3829 pvtmem_size, per_wave);
3830 if (result != VK_SUCCESS) {
3831 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3832 return result;
3833 }
3834
3835 tu_pipeline_builder_parse_dynamic(builder, *pipeline);
3836 tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
3837 tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
3838 tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
3839 tu_pipeline_builder_parse_tessellation(builder, *pipeline);
3840 tu_pipeline_builder_parse_viewport(builder, *pipeline);
3841 tu_pipeline_builder_parse_rasterization(builder, *pipeline);
3842 tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
3843 tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
3844 tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
3845 tu6_emit_load_state(*pipeline, builder->layout, false);
3846
3847 return VK_SUCCESS;
3848 }
3849
3850 static void
3851 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
3852 {
3853 if (builder->shaders)
3854 vk_pipeline_cache_object_unref(&builder->shaders->base);
3855 ralloc_free(builder->mem_ctx);
3856 }
3857
3858 static void
3859 tu_pipeline_builder_init_graphics(
3860 struct tu_pipeline_builder *builder,
3861 struct tu_device *dev,
3862 struct vk_pipeline_cache *cache,
3863 const VkGraphicsPipelineCreateInfo *create_info,
3864 const VkAllocationCallbacks *alloc)
3865 {
3866 TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout);
3867
3868 *builder = (struct tu_pipeline_builder) {
3869 .device = dev,
3870 .mem_ctx = ralloc_context(NULL),
3871 .cache = cache,
3872 .create_info = create_info,
3873 .alloc = alloc,
3874 .layout = layout,
3875 };
3876
3877 bool rasterizer_discard_dynamic = false;
3878 if (create_info->pDynamicState) {
3879 for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
3880 if (create_info->pDynamicState->pDynamicStates[i] ==
3881 VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
3882 rasterizer_discard_dynamic = true;
3883 break;
3884 }
3885 }
3886 }
3887
3888 builder->rasterizer_discard =
3889 builder->create_info->pRasterizationState->rasterizerDiscardEnable &&
3890 !rasterizer_discard_dynamic;
3891
3892 const VkPipelineRenderingCreateInfo *rendering_info =
3893 vk_find_struct_const(create_info->pNext, PIPELINE_RENDERING_CREATE_INFO);
3894
3895 if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC) && !rendering_info)
3896 rendering_info = vk_get_pipeline_rendering_create_info(create_info);
3897
3898 if (rendering_info) {
3899 builder->subpass_raster_order_attachment_access = false;
3900 builder->subpass_feedback_loop_ds = false;
3901 builder->subpass_feedback_loop_color = false;
3902
3903 builder->multiview_mask = rendering_info->viewMask;
3904
3905       /* With dynamic rendering we don't know whether the pipeline will be
3906        * used in a render pass with no attachments enabled, so we have to
3907        * emit the MSAA state dynamically.
3908 *
3909 * TODO: Move MSAA state to a separate draw state and emit it
3910 * dynamically only when the sample count is different from the
3911 * subpass's sample count.
3912 */
3913 builder->emit_msaa_state = !builder->rasterizer_discard;
3914
3915 const VkRenderingSelfDependencyInfoMESA *self_dependency =
3916 vk_find_struct_const(rendering_info->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA);
3917
3918 if (self_dependency) {
3919 builder->subpass_feedback_loop_ds =
3920 self_dependency->depthSelfDependency ||
3921 self_dependency->stencilSelfDependency;
3922 builder->subpass_feedback_loop_color =
3923 self_dependency->colorSelfDependencies;
3924 }
3925
3926 if (!builder->rasterizer_discard) {
3927 builder->depth_attachment_format =
3928 rendering_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ?
3929 rendering_info->stencilAttachmentFormat :
3930 rendering_info->depthAttachmentFormat;
3931
3932 builder->color_attachment_count =
3933 rendering_info->colorAttachmentCount;
3934
3935 for (unsigned i = 0; i < rendering_info->colorAttachmentCount; i++) {
3936 builder->color_attachment_formats[i] =
3937 rendering_info->pColorAttachmentFormats[i];
3938 if (builder->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) {
3939 builder->use_color_attachments = true;
3940 builder->render_components |= 0xf << (i * 4);
3941 }
3942 }
3943 }
3944 } else {
3945 const struct tu_render_pass *pass =
3946 tu_render_pass_from_handle(create_info->renderPass);
3947 const struct tu_subpass *subpass =
3948 &pass->subpasses[create_info->subpass];
3949
3950 builder->subpass_raster_order_attachment_access =
3951 subpass->raster_order_attachment_access;
3952 builder->subpass_feedback_loop_color = subpass->feedback_loop_color;
3953 builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds;
3954
3955 builder->multiview_mask = subpass->multiview_mask;
3956
3957 /* variableMultisampleRate support */
3958 builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard;
3959
3960 if (!builder->rasterizer_discard) {
3961 const uint32_t a = subpass->depth_stencil_attachment.attachment;
3962 builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
3963 pass->attachments[a].format : VK_FORMAT_UNDEFINED;
3964
3965 assert(subpass->color_count == 0 ||
3966 !create_info->pColorBlendState ||
3967 subpass->color_count == create_info->pColorBlendState->attachmentCount);
3968 builder->color_attachment_count = subpass->color_count;
3969 for (uint32_t i = 0; i < subpass->color_count; i++) {
3970 const uint32_t a = subpass->color_attachments[i].attachment;
3971 if (a == VK_ATTACHMENT_UNUSED)
3972 continue;
3973
3974 builder->color_attachment_formats[i] = pass->attachments[a].format;
3975 builder->use_color_attachments = true;
3976 builder->render_components |= 0xf << (i * 4);
3977 }
3978 }
3979 }
3980
3981
3982 if (builder->rasterizer_discard) {
3983 builder->samples = VK_SAMPLE_COUNT_1_BIT;
3984 } else {
3985 builder->samples = create_info->pMultisampleState->rasterizationSamples;
3986 builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable;
3987
3988 if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) {
3989 builder->color_attachment_count++;
3990 builder->use_dual_src_blend = true;
3991 /* dual source blending has an extra fs output in the 2nd slot */
3992 if (builder->color_attachment_formats[0] != VK_FORMAT_UNDEFINED)
3993 builder->render_components |= 0xf << 4;
3994 }
3995 }
3996 }
3997
3998 static VkResult
3999 tu_graphics_pipeline_create(VkDevice device,
4000 VkPipelineCache pipelineCache,
4001 const VkGraphicsPipelineCreateInfo *pCreateInfo,
4002 const VkAllocationCallbacks *pAllocator,
4003 VkPipeline *pPipeline)
4004 {
4005 TU_FROM_HANDLE(tu_device, dev, device);
4006 TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4007
4008 cache = cache ? cache : dev->mem_cache;
4009
4010 struct tu_pipeline_builder builder;
4011 tu_pipeline_builder_init_graphics(&builder, dev, cache,
4012 pCreateInfo, pAllocator);
4013
4014 struct tu_pipeline *pipeline = NULL;
4015 VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
4016 tu_pipeline_builder_finish(&builder);
4017
4018 if (result == VK_SUCCESS)
4019 *pPipeline = tu_pipeline_to_handle(pipeline);
4020 else
4021 *pPipeline = VK_NULL_HANDLE;
4022
4023 return result;
4024 }
4025
4026 VKAPI_ATTR VkResult VKAPI_CALL
4027 tu_CreateGraphicsPipelines(VkDevice device,
4028 VkPipelineCache pipelineCache,
4029 uint32_t count,
4030 const VkGraphicsPipelineCreateInfo *pCreateInfos,
4031 const VkAllocationCallbacks *pAllocator,
4032 VkPipeline *pPipelines)
4033 {
4034 VkResult final_result = VK_SUCCESS;
4035 uint32_t i = 0;
4036
4037 for (; i < count; i++) {
4038 VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
4039 &pCreateInfos[i], pAllocator,
4040 &pPipelines[i]);
4041
4042 if (result != VK_SUCCESS) {
4043 final_result = result;
4044 pPipelines[i] = VK_NULL_HANDLE;
4045
4046 if (pCreateInfos[i].flags &
4047 VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
4048 break;
4049 }
4050 }
4051
4052 for (; i < count; i++)
4053 pPipelines[i] = VK_NULL_HANDLE;
4054
4055 return final_result;
4056 }
4057
4058 static VkResult
4059 tu_compute_pipeline_create(VkDevice device,
4060 VkPipelineCache pipelineCache,
4061 const VkComputePipelineCreateInfo *pCreateInfo,
4062 const VkAllocationCallbacks *pAllocator,
4063 VkPipeline *pPipeline)
4064 {
4065 TU_FROM_HANDLE(tu_device, dev, device);
4066 TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4067 TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
4068 const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
4069 VkResult result;
4070
4071 cache = cache ? cache : dev->mem_cache;
4072
4073 struct tu_pipeline *pipeline;
4074
4075 *pPipeline = VK_NULL_HANDLE;
4076
4077 VkPipelineCreationFeedback pipeline_feedback = {
4078 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4079 };
4080
4081 const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
4082 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
4083
4084 int64_t pipeline_start = os_time_get_nano();
4085
4086 pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
4087 VK_OBJECT_TYPE_PIPELINE);
4088 if (!pipeline)
4089 return VK_ERROR_OUT_OF_HOST_MEMORY;
4090
4091 pipeline->executables_mem_ctx = ralloc_context(NULL);
4092 util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);
4093
4094 struct tu_shader_key key = { };
4095 tu_shader_key_init(&key, stage_info, dev);
4096
4097 void *pipeline_mem_ctx = ralloc_context(NULL);
4098
4099 unsigned char pipeline_sha1[20];
4100 tu_hash_compute(pipeline_sha1, stage_info, layout, &key, dev->compiler);
4101
4102 struct tu_compiled_shaders *compiled = NULL;
4103
4104 const bool executable_info = pCreateInfo->flags &
4105 VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
4106
4107 bool application_cache_hit = false;
4108
4109 if (!executable_info) {
4110 compiled =
4111 tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
4112 &application_cache_hit);
4113 }
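   /* The cache key computed above is a SHA-1 over everything that can affect
    * the compiled code.  A sketch of how such a key is typically assembled
    * with the util/mesa-sha1 helpers (the inputs named here mirror the
    * arguments of tu_hash_compute() but are illustrative, not its body):
    *
    *    struct mesa_sha1 ctx;
    *    _mesa_sha1_init(&ctx);
    *    _mesa_sha1_update(&ctx, spirv_words, spirv_size_bytes);   // stage SPIR-V + entrypoint
    *    _mesa_sha1_update(&ctx, &shader_key, sizeof(shader_key)); // tu_shader_key
    *    _mesa_sha1_update(&ctx, layout_digest, sizeof(layout_digest));
    *    _mesa_sha1_update(&ctx, compiler_id, compiler_id_len);    // invalidate on compiler changes
    *    _mesa_sha1_final(&ctx, pipeline_sha1);
    *
    * Missing any of these inputs risks returning stale binaries when the
    * layout, shader key or compiler changes.
    */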
4114
4115 if (application_cache_hit && cache != dev->mem_cache) {
4116 pipeline_feedback.flags |=
4117 VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4118 }
4119
4120 if (tu6_shared_constants_enable(layout, dev->compiler)) {
4121 pipeline->shared_consts = (struct tu_push_constant_range) {
4122 .lo = 0,
4123 .dwords = layout->push_constant_size / 4,
4124 };
4125 }
4126
4127 char *nir_initial_disasm = NULL;
4128
4129 if (!compiled) {
4130 if (pCreateInfo->flags &
4131 VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
4132 result = VK_PIPELINE_COMPILE_REQUIRED;
4133 goto fail;
4134 }
4135
4136 struct ir3_shader_key ir3_key = {};
4137
4138 nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, stage_info,
4139 MESA_SHADER_COMPUTE);
4140
4141 nir_initial_disasm = executable_info ?
4142 nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL;
4143
4144 struct tu_shader *shader =
4145 tu_shader_create(dev, nir, &key, layout, pAllocator);
4146 if (!shader) {
4147 result = VK_ERROR_OUT_OF_HOST_MEMORY;
4148 goto fail;
4149 }
4150
4151 compiled = tu_shaders_init(dev, &pipeline_sha1, sizeof(pipeline_sha1));
4152 if (!compiled) {
4153 tu_shader_destroy(dev, shader, pAllocator);
4154 result = VK_ERROR_OUT_OF_HOST_MEMORY;
4155 goto fail;
4156 }
4157
4158 compiled->active_desc_sets = shader->active_desc_sets;
4159 compiled->push_consts[MESA_SHADER_COMPUTE] = shader->push_consts;
4160
4161 struct ir3_shader_variant *v =
4162 ir3_shader_create_variant(shader->ir3_shader, &ir3_key, executable_info);
4163
4164 tu_shader_destroy(dev, shader, pAllocator);
4165
4166 if (!v) {
4167 result = VK_ERROR_OUT_OF_HOST_MEMORY;
4168 goto fail;
4169 }
4170
4171 compiled->variants[MESA_SHADER_COMPUTE] = v;
4172
4173 compiled = tu_pipeline_cache_insert(cache, compiled);
4174 }
4175
4176 pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4177
4178 if (creation_feedback) {
4179 *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
4180 assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
4181 creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
4182 }
4183
4184 pipeline->active_desc_sets = compiled->active_desc_sets;
4185
4186 struct ir3_shader_variant *v = compiled->variants[MESA_SHADER_COMPUTE];
4187
4188 tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
4189 &compiled->push_consts[MESA_SHADER_COMPUTE], v);
4190
4191 result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v);
4192 if (result != VK_SUCCESS)
4193 goto fail;
4194
4195 uint64_t shader_iova = tu_upload_variant(pipeline, v);
4196
4197 struct tu_pvtmem_config pvtmem;
4198 tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave);
4199
4200 for (int i = 0; i < 3; i++)
4201 pipeline->compute.local_size[i] = v->local_size[i];
4202
4203 pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
4204
4205 struct tu_cs prog_cs;
4206 uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v);
4207 tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs);
4208 tu6_emit_cs_config(&prog_cs, v, &pvtmem, shader_iova);
4209 pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
4210
4211 tu6_emit_load_state(pipeline, layout, true);
4212
4213 tu_append_executable(pipeline, v, nir_initial_disasm);
4214
4215 vk_pipeline_cache_object_unref(&compiled->base);
4216 ralloc_free(pipeline_mem_ctx);
4217
4218 *pPipeline = tu_pipeline_to_handle(pipeline);
4219
4220 return VK_SUCCESS;
4221
4222 fail:
4223 if (compiled)
4224 vk_pipeline_cache_object_unref(&compiled->base);
4225
4226 ralloc_free(pipeline_mem_ctx);
4227
4228 vk_object_free(&dev->vk, pAllocator, pipeline);
4229
4230 return result;
4231 }
4232
4233 VKAPI_ATTR VkResult VKAPI_CALL
4234 tu_CreateComputePipelines(VkDevice device,
4235 VkPipelineCache pipelineCache,
4236 uint32_t count,
4237 const VkComputePipelineCreateInfo *pCreateInfos,
4238 const VkAllocationCallbacks *pAllocator,
4239 VkPipeline *pPipelines)
4240 {
4241 VkResult final_result = VK_SUCCESS;
4242 uint32_t i = 0;
4243
4244 for (; i < count; i++) {
4245 VkResult result = tu_compute_pipeline_create(device, pipelineCache,
4246 &pCreateInfos[i],
4247 pAllocator, &pPipelines[i]);
4248 if (result != VK_SUCCESS) {
4249 final_result = result;
4250 pPipelines[i] = VK_NULL_HANDLE;
4251
4252 if (pCreateInfos[i].flags &
4253 VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
4254 break;
4255 }
4256 }
4257
4258 for (; i < count; i++)
4259 pPipelines[i] = VK_NULL_HANDLE;
4260
4261 return final_result;
4262 }
4263
4264 VKAPI_ATTR void VKAPI_CALL
4265 tu_DestroyPipeline(VkDevice _device,
4266 VkPipeline _pipeline,
4267 const VkAllocationCallbacks *pAllocator)
4268 {
4269 TU_FROM_HANDLE(tu_device, dev, _device);
4270 TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
4271
4272 if (!_pipeline)
4273 return;
4274
4275 tu_pipeline_finish(pipeline, dev, pAllocator);
4276 vk_object_free(&dev->vk, pAllocator, pipeline);
4277 }
4278
4279 #define WRITE_STR(field, ...) ({ \
4280 memset(field, 0, sizeof(field)); \
4281 UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
4282 assert(_i > 0 && _i < sizeof(field)); \
4283 })
4284
4285 static const struct tu_pipeline_executable *
4286 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
4287 {
4288 assert(index < util_dynarray_num_elements(&pipeline->executables,
4289 struct tu_pipeline_executable));
4290 return util_dynarray_element(
4291 &pipeline->executables, struct tu_pipeline_executable, index);
4292 }
4293
4294 VKAPI_ATTR VkResult VKAPI_CALL
4295 tu_GetPipelineExecutablePropertiesKHR(
4296 VkDevice _device,
4297 const VkPipelineInfoKHR* pPipelineInfo,
4298 uint32_t* pExecutableCount,
4299 VkPipelineExecutablePropertiesKHR* pProperties)
4300 {
4301 TU_FROM_HANDLE(tu_device, dev, _device);
4302 TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
4303 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
4304 pProperties, pExecutableCount);
4305
4306 util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
4307 vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
4308 gl_shader_stage stage = exe->stage;
4309 props->stages = mesa_to_vk_shader_stage(stage);
4310
4311 if (!exe->is_binning)
4312 WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
4313 else
4314 WRITE_STR(props->name, "Binning VS");
4315
4316 WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
4317
4318 props->subgroupSize =
4319 dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
4320 }
4321 }
4322
4323 return vk_outarray_status(&out);
4324 }
4325
4326 VKAPI_ATTR VkResult VKAPI_CALL
4327 tu_GetPipelineExecutableStatisticsKHR(
4328 VkDevice _device,
4329 const VkPipelineExecutableInfoKHR* pExecutableInfo,
4330 uint32_t* pStatisticCount,
4331 VkPipelineExecutableStatisticKHR* pStatistics)
4332 {
4333 TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4334 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
4335 pStatistics, pStatisticCount);
4336
4337 const struct tu_pipeline_executable *exe =
4338 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4339
4340 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4341 WRITE_STR(stat->name, "Max Waves Per Core");
4342 WRITE_STR(stat->description,
4343 "Maximum number of simultaneous waves per core.");
4344 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4345 stat->value.u64 = exe->stats.max_waves;
4346 }
4347
4348 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4349 WRITE_STR(stat->name, "Instruction Count");
4350 WRITE_STR(stat->description,
4351 "Total number of IR3 instructions in the final generated "
4352 "shader executable.");
4353 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4354 stat->value.u64 = exe->stats.instrs_count;
4355 }
4356
4357 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4358 WRITE_STR(stat->name, "Code size");
4359 WRITE_STR(stat->description,
4360 "Total number of dwords in the final generated "
4361 "shader executable.");
4362 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4363 stat->value.u64 = exe->stats.sizedwords;
4364 }
4365
4366 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4367 WRITE_STR(stat->name, "NOPs Count");
4368 WRITE_STR(stat->description,
4369 "Number of NOP instructions in the final generated "
4370 "shader executable.");
4371 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4372 stat->value.u64 = exe->stats.nops_count;
4373 }
4374
4375 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4376 WRITE_STR(stat->name, "MOV Count");
4377 WRITE_STR(stat->description,
4378 "Number of MOV instructions in the final generated "
4379 "shader executable.");
4380 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4381 stat->value.u64 = exe->stats.mov_count;
4382 }
4383
4384 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4385 WRITE_STR(stat->name, "COV Count");
4386 WRITE_STR(stat->description,
4387 "Number of COV instructions in the final generated "
4388 "shader executable.");
4389 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4390 stat->value.u64 = exe->stats.cov_count;
4391 }
4392
   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Registers used");
      WRITE_STR(stat->description,
                "Number of registers used in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.max_reg + 1;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Half-registers used");
      WRITE_STR(stat->description,
                "Number of half-registers used in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.max_half_reg + 1;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Instructions with SS sync bit");
      WRITE_STR(stat->description,
                "SS bit is set for instructions which depend on a result "
                "of \"long\" instructions to prevent RAW hazard.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.ss;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Instructions with SY sync bit");
      WRITE_STR(stat->description,
                "SY bit is set for instructions which depend on a result "
                "of loads from global memory to prevent RAW hazard.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.sy;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Estimated cycles stalled on SS");
      WRITE_STR(stat->description,
                "A better metric to estimate the impact of SS syncs.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.sstall;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "Estimated cycles stalled on SY");
      WRITE_STR(stat->description,
                "A better metric to estimate the impact of SY syncs.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.systall;
   }

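   /* Emit one statistic per IR3 instruction category. */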
   for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
      vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
         WRITE_STR(stat->name, "cat%d instructions", i);
         WRITE_STR(stat->description,
                   "Number of cat%d instructions.", i);
         stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
         stat->value.u64 = exe->stats.instrs_per_cat[i];
      }
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "STP Count");
      WRITE_STR(stat->description,
                "Number of STore Private instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.stp_count;
   }

   vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
      WRITE_STR(stat->name, "LDP Count");
      WRITE_STR(stat->description,
                "Number of LoaD Private instructions in the final generated "
                "shader executable.");
      stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
      stat->value.u64 = exe->stats.ldp_count;
   }

   return vk_outarray_status(&out);
}

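/* Copy a NUL-terminated IR string into a
 * VkPipelineExecutableInternalRepresentationKHR, following the usual Vulkan
 * two-call idiom: when pData is NULL only the required dataSize is reported;
 * otherwise up to dataSize bytes are copied and false is returned if the text
 * was truncated, so the caller can report VK_INCOMPLETE.
 */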
static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
              const char *data)
{
   ir->isText = VK_TRUE;

   size_t data_len = strlen(data) + 1;

   if (ir->pData == NULL) {
      ir->dataSize = data_len;
      return true;
   }

   strncpy(ir->pData, data, ir->dataSize);
   if (ir->dataSize < data_len)
      return false;

   ir->dataSize = data_len;
   return true;
}

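/* Expose the textual IRs captured for this executable: the initial NIR
 * translated from SPIR-V, the final NIR handed to the IR3 backend, and the
 * IR3 disassembly of the generated binary. Each entry goes through
 * write_ir_text(), so a NULL pData only queries the size and any truncation
 * turns the overall result into VK_INCOMPLETE.
 */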
VKAPI_ATTR VkResult VKAPI_CALL
tu_GetPipelineExecutableInternalRepresentationsKHR(
   VkDevice _device,
   const VkPipelineExecutableInfoKHR* pExecutableInfo,
   uint32_t* pInternalRepresentationCount,
   VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
{
   TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
   VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
                          pInternalRepresentations, pInternalRepresentationCount);
   bool incomplete_text = false;

   const struct tu_pipeline_executable *exe =
      tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);

   if (exe->nir_from_spirv) {
      vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
         WRITE_STR(ir->name, "NIR from SPIRV");
         WRITE_STR(ir->description,
                   "Initial NIR before any optimizations");

         if (!write_ir_text(ir, exe->nir_from_spirv))
            incomplete_text = true;
      }
   }

   if (exe->nir_final) {
      vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
         WRITE_STR(ir->name, "Final NIR");
         WRITE_STR(ir->description,
                   "Final NIR before going into the back-end compiler");

         if (!write_ir_text(ir, exe->nir_final))
            incomplete_text = true;
      }
   }

   if (exe->disasm) {
      vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
         WRITE_STR(ir->name, "IR3 Assembly");
         WRITE_STR(ir->description,
                   "Final IR3 assembly for the generated shader binary");

         if (!write_ir_text(ir, exe->disasm))
            incomplete_text = true;
      }
   }

   return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
}
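
/* Illustrative application-side usage sketch, not part of the driver: with
 * VK_KHR_pipeline_executable_properties enabled and the pipeline created with
 * VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR, the statistics above are
 * fetched with the standard two-call pattern. The device/pipeline handles are
 * the application's own, and the extension entry points are assumed to have
 * been resolved via vkGetDeviceProcAddr().
 *
 *    VkPipelineExecutableInfoKHR exe_info = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
 *       .pipeline = pipeline,
 *       .executableIndex = 0,
 *    };
 *    uint32_t count = 0;
 *    vkGetPipelineExecutableStatisticsKHR(device, &exe_info, &count, NULL);
 *    VkPipelineExecutableStatisticKHR *stats = calloc(count, sizeof(*stats));
 *    for (uint32_t i = 0; i < count; i++)
 *       stats[i].sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
 *    vkGetPipelineExecutableStatisticsKHR(device, &exe_info, &count, stats);
 *
 * Internal representations follow the same pattern with one extra round trip
 * per entry: leave pData NULL to learn dataSize, then allocate the buffers and
 * call vkGetPipelineExecutableInternalRepresentationsKHR() again to receive
 * the text.
 */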