1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 * SPDX-License-Identifier: MIT
5 *
6 * based in part on anv driver which is:
7 * Copyright © 2015 Intel Corporation
8 */
9
10 #include "tu_pipeline.h"
11
12 #include "common/freedreno_guardband.h"
13
14 #include "ir3/ir3_nir.h"
15 #include "nir/nir.h"
16 #include "nir/nir_builder.h"
17 #include "nir/nir_serialize.h"
18 #include "spirv/nir_spirv.h"
19 #include "util/u_debug.h"
20 #include "util/mesa-sha1.h"
21 #include "vk_nir.h"
22 #include "vk_pipeline.h"
23 #include "vk_render_pass.h"
24 #include "vk_util.h"
25
26 #include "tu_cmd_buffer.h"
27 #include "tu_cs.h"
28 #include "tu_device.h"
29 #include "tu_knl.h"
30 #include "tu_formats.h"
31 #include "tu_lrz.h"
32 #include "tu_pass.h"
33 #include "tu_rmv.h"
34
35 /* Emit IB that preloads the descriptors that the shader uses */
36
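/* A rough sketch of how the helper below gets used (illustrative values
 * only): a binding at set 1 whose descriptors start 64 bytes into the set,
 * with three UBO descriptors visible to the fragment stage, would be
 * prefetched with something like
 *
 *    emit_load_state(cs, CP_LOAD_STATE6_FRAG, ST6_UBO, SB6_FS_SHADER,
 *                    1, 16, 3);
 *
 * i.e. base is the set index, offset is the dword offset within the set,
 * and the packet encodes them as SS6_BINDLESS with the set index packed
 * into the high bits of the address dwords.
 */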
37 static void
38 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
39 enum a6xx_state_block sb, unsigned base, unsigned offset,
40 unsigned count)
41 {
42 /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
43 * clear if emitting more packets will even help anything. Presumably the
44 * descriptor cache is relatively small, and these packets stop doing
45 * anything when there are too many descriptors.
46 */
47 tu_cs_emit_pkt7(cs, opcode, 3);
48 tu_cs_emit(cs,
49 CP_LOAD_STATE6_0_STATE_TYPE(st) |
50 CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
51 CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
52 CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
53 tu_cs_emit_qw(cs, offset | (base << 28));
54 }
55
56 static unsigned
57 tu6_load_state_size(struct tu_pipeline *pipeline,
58 struct tu_pipeline_layout *layout)
59 {
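   /* Each emit_load_state() call emits a fixed-size packet: one
    * CP_LOAD_STATE6 pkt7 header dword plus three payload dwords (control
    * dword and 64-bit address), i.e. 4 dwords total.
    */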
60 const unsigned load_state_size = 4;
61 unsigned size = 0;
62 for (unsigned i = 0; i < layout->num_sets; i++) {
63 if (!(pipeline->active_desc_sets & (1u << i)))
64 continue;
65
66 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
67 for (unsigned j = 0; j < set_layout->binding_count; j++) {
68 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
69 unsigned count = 0;
70 /* See comment in tu6_emit_load_state(). */
71 VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
72 unsigned stage_count = util_bitcount(stages);
73
74 if (!binding->array_size)
75 continue;
76
77 switch (binding->type) {
78 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
79 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
80 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
81 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
82 /* IBO-backed resources only need one packet for all graphics stages */
83 if (stage_count)
84 count += 1;
85 break;
86 case VK_DESCRIPTOR_TYPE_SAMPLER:
87 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
88 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
89 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
90 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
91          /* Textures and UBOs need a packet for each stage */
92 count = stage_count;
93 break;
94 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
95 /* Because of how we pack combined images and samplers, we
96 * currently can't use one packet for the whole array.
97 */
98 count = stage_count * binding->array_size * 2;
99 break;
100 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
101 case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
102 case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
103 break;
104 default:
105 unreachable("bad descriptor type");
106 }
107 size += count * load_state_size;
108 }
109 }
110 return size;
111 }
112
113 static void
114 tu6_emit_load_state(struct tu_device *device,
115 struct tu_pipeline *pipeline,
116 struct tu_pipeline_layout *layout)
117 {
118 unsigned size = tu6_load_state_size(pipeline, layout);
119 if (size == 0)
120 return;
121
122 struct tu_cs cs;
123 tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
124
125 for (unsigned i = 0; i < layout->num_sets; i++) {
126 /* From 13.2.7. Descriptor Set Binding:
127 *
128 * A compatible descriptor set must be bound for all set numbers that
129 * any shaders in a pipeline access, at the time that a draw or
130 * dispatch command is recorded to execute using that pipeline.
131 * However, if none of the shaders in a pipeline statically use any
132 * bindings with a particular set number, then no descriptor set need
133 * be bound for that set number, even if the pipeline layout includes
134 * a non-trivial descriptor set layout for that set number.
135 *
136 * This means that descriptor sets unused by the pipeline may have a
137 * garbage or 0 BINDLESS_BASE register, which will cause context faults
138 * when prefetching descriptors from these sets. Skip prefetching for
139 * descriptors from them to avoid this. This is also an optimization,
140 * since these prefetches would be useless.
141 */
142 if (!(pipeline->active_desc_sets & (1u << i)))
143 continue;
144
145 struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
146 for (unsigned j = 0; j < set_layout->binding_count; j++) {
147 struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
148 unsigned base = i;
149 unsigned offset = binding->offset / 4;
150 /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
151 * zink has descriptors for each stage in the push layout even if some
152 * stages aren't present in a used pipeline. We don't want to emit
153 * loads for unused descriptors.
154 */
155 VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
156 unsigned count = binding->array_size;
157
158 /* If this is a variable-count descriptor, then the array_size is an
159 * upper bound on the size, but we don't know how many descriptors
160 * will actually be used. Therefore we can't pre-load them here.
161 */
162 if (j == set_layout->binding_count - 1 &&
163 set_layout->has_variable_descriptors)
164 continue;
165
166 if (count == 0 || stages == 0)
167 continue;
168 switch (binding->type) {
169 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
170 assert(device->physical_device->reserved_set_idx >= 0);
171 base = device->physical_device->reserved_set_idx;
172 offset = (pipeline->program.dynamic_descriptor_offsets[i] +
173 binding->dynamic_offset_offset) / 4;
174 FALLTHROUGH;
175 case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
176 case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
177 case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
178 unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
179 /* IBO-backed resources only need one packet for all graphics stages */
180 if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
181 emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
182 base, offset, count * mul);
183 }
184 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
185 emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
186 base, offset, count * mul);
187 }
188 break;
189 }
190 case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
191 case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
192 case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
193 /* nothing - input attachments and inline uniforms don't use bindless */
194 break;
195 case VK_DESCRIPTOR_TYPE_SAMPLER:
196 case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
197 case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
198 tu_foreach_stage(stage, stages) {
199 emit_load_state(&cs, tu6_stage2opcode(stage),
200 binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
201 ST6_SHADER : ST6_CONSTANTS,
202 tu6_stage2texsb(stage), base, offset, count);
203 }
204 break;
205 }
206 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
207 assert(device->physical_device->reserved_set_idx >= 0);
208 base = device->physical_device->reserved_set_idx;
209 offset = (pipeline->program.dynamic_descriptor_offsets[i] +
210 binding->dynamic_offset_offset) / 4;
211 FALLTHROUGH;
212 case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
213 tu_foreach_stage(stage, stages) {
214 emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
215 tu6_stage2shadersb(stage), base, offset, count);
216 }
217 break;
218 }
219 case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
220 tu_foreach_stage(stage, stages) {
221 /* TODO: We could emit less CP_LOAD_STATE6 if we used
222 * struct-of-arrays instead of array-of-structs.
223 */
224 for (unsigned i = 0; i < count; i++) {
225 unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
226 unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
227 emit_load_state(&cs, tu6_stage2opcode(stage),
228 ST6_CONSTANTS, tu6_stage2texsb(stage),
229 base, tex_offset, 1);
230 emit_load_state(&cs, tu6_stage2opcode(stage),
231 ST6_SHADER, tu6_stage2texsb(stage),
232 base, sam_offset, 1);
233 }
234 }
235 break;
236 }
237 default:
238 unreachable("bad descriptor type");
239 }
240 }
241 }
242
243 pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
244 }
245
246 struct tu_pipeline_builder
247 {
248 struct tu_device *device;
249 void *mem_ctx;
250 struct vk_pipeline_cache *cache;
251 const VkAllocationCallbacks *alloc;
252 const VkGraphicsPipelineCreateInfo *create_info;
253 VkPipelineCreateFlags2KHR create_flags;
254
255 struct tu_pipeline_layout layout;
256
257 struct tu_pvtmem_config pvtmem;
258
259 bool rasterizer_discard;
260    /* these states are affected by rasterizer_discard */
261 uint8_t unscaled_input_fragcoord;
262
263 /* Each library defines at least one piece of state in
264 * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
265 * there can be at most as many libraries as pieces of state, of which
266 * there are currently 4.
267 */
268 #define MAX_LIBRARIES 4
269
270 unsigned num_libraries;
271 struct tu_graphics_lib_pipeline *libraries[MAX_LIBRARIES];
272
273 /* This is just the state that we are compiling now, whereas the final
274 * pipeline will include the state from the libraries.
275 */
276 VkGraphicsPipelineLibraryFlagsEXT state;
277
278 /* The stages we are compiling now. */
279 VkShaderStageFlags active_stages;
280
281 bool fragment_density_map;
282
283 struct vk_graphics_pipeline_all_state all_state;
284 struct vk_graphics_pipeline_state graphics_state;
285 };
286
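/* Logic ops whose result does not depend on the destination value: CLEAR
 * (all zeros), SET (all ones), COPY (src) and COPY_INVERTED (~src). Every
 * other VkLogicOp reads back the destination.
 */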
287 static bool
288 tu_logic_op_reads_dst(VkLogicOp op)
289 {
290 switch (op) {
291 case VK_LOGIC_OP_CLEAR:
292 case VK_LOGIC_OP_COPY:
293 case VK_LOGIC_OP_COPY_INVERTED:
294 case VK_LOGIC_OP_SET:
295 return false;
296 default:
297 return true;
298 }
299 }
300
301 static bool
302 tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb)
303 {
304 for (unsigned i = 0; i < cb->attachment_count; i++) {
305 if (tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_color_blend_factor) ||
306 tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_color_blend_factor) ||
307 tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_alpha_blend_factor) ||
308 tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_alpha_blend_factor))
309 return true;
310 }
311
312 return false;
313 }
314
315 enum ir3_push_consts_type
316 tu_push_consts_type(const struct tu_pipeline_layout *layout,
317 const struct ir3_compiler *compiler)
318 {
319 if (!layout->push_constant_size)
320 return IR3_PUSH_CONSTS_NONE;
321
322 if (TU_DEBUG(PUSH_CONSTS_PER_STAGE))
323 return IR3_PUSH_CONSTS_PER_STAGE;
324
325 if (tu6_shared_constants_enable(layout, compiler)) {
326 return IR3_PUSH_CONSTS_SHARED;
327 } else {
328 if (compiler->gen >= 7) {
329 return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
330 } else {
331 return IR3_PUSH_CONSTS_PER_STAGE;
332 }
333 }
334 }
335
336 template <chip CHIP>
337 struct xs_config {
338 uint16_t reg_sp_xs_config;
339 uint16_t reg_hlsq_xs_ctrl;
340 };
341
342 template <chip CHIP>
343 static const xs_config<CHIP> xs_configs[] = {
344 [MESA_SHADER_VERTEX] = {
345 REG_A6XX_SP_VS_CONFIG,
346 CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
347 },
348 [MESA_SHADER_TESS_CTRL] = {
349 REG_A6XX_SP_HS_CONFIG,
350 CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
351 },
352 [MESA_SHADER_TESS_EVAL] = {
353 REG_A6XX_SP_DS_CONFIG,
354 CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
355 },
356 [MESA_SHADER_GEOMETRY] = {
357 REG_A6XX_SP_GS_CONFIG,
358 CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
359 },
360 [MESA_SHADER_FRAGMENT] = {
361 REG_A6XX_SP_FS_CONFIG,
362 CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
363 },
364 [MESA_SHADER_COMPUTE] = {
365 REG_A6XX_SP_CS_CONFIG,
366 CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
367 },
368 };
369
370 template <chip CHIP>
371 void
372 tu6_emit_xs_config(struct tu_cs *cs,
373 gl_shader_stage stage, /* xs->type, but xs may be NULL */
374 const struct ir3_shader_variant *xs)
375 {
376 const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[stage];
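   /* The SP_xS_CONFIG and HLSQ_xS_CNTL bitfields have the same layout for
    * every stage, so the A6XX_SP_VS_* and A6XX_HLSQ_VS_* packing helpers
    * below are reused with each stage's register offset from xs_configs[].
    */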
377
378 if (!xs) {
379 /* shader stage disabled */
380 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
381 tu_cs_emit(cs, 0);
382
383 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
384 tu_cs_emit(cs, 0);
385 return;
386 }
387
388 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
389 tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
390 COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
391 COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
392 COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
393 COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
394 A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
395 A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
396
397 tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
398 tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
399 A6XX_HLSQ_VS_CNTL_ENABLED |
400 COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE,
401 A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS));
402 }
403 TU_GENX(tu6_emit_xs_config);
404
405 static void
406 tu6_emit_dynamic_offset(struct tu_cs *cs,
407 const struct ir3_shader_variant *xs,
408 const struct tu_shader *shader,
409 const struct tu_program_state *program)
410 {
411 const struct tu_physical_device *phys_dev = cs->device->physical_device;
412
413 if (!xs)
414 return;
415
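   /* Two paths: on GPUs that load shader consts via the preamble, the
    * per-set dynamic descriptor offsets are passed through a driver-internal
    * UBO (dynamic_offsets_ubo); otherwise they are loaded directly into the
    * const file at dynamic_offset_loc.
    */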
416 if (cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
417 if (shader->const_state.dynamic_offsets_ubo.size == 0)
418 return;
419
420 uint32_t offsets[MAX_SETS];
421 for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
422 unsigned dynamic_offset_start =
423 program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
424 offsets[i] = dynamic_offset_start;
425 }
426
427 /* A7XX TODO: Emit data via sub_cs instead of NOP */
428 uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
429 uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
430
431 tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
432 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
433 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
434 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
435 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
436 CP_LOAD_STATE6_0_NUM_UNIT(1));
437 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
438 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
439 int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
440 tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
441 } else {
442 if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
443 return;
444
445 tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
446 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
447 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
448 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
449 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
450 CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
451 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
452 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
453
454 for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
455 unsigned dynamic_offset_start =
456 program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
457 tu_cs_emit(cs, dynamic_offset_start);
458 }
459 }
460 }
461
462 template <chip CHIP>
463 void
464 tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
465 {
466 if (CHIP == A6XX) {
467 /* Enable/disable shared constants */
468 tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
469 } else {
470 assert(!enable);
471 }
472
473 tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
474 .isammode = ISAMMODE_GL,
475 .shared_consts_enable = enable));
476 }
477 TU_GENX(tu6_emit_shared_consts_enable);
478
479 template <chip CHIP>
480 static void
481 tu6_setup_streamout(struct tu_cs *cs,
482 const struct ir3_shader_variant *v,
483 const struct ir3_shader_linkage *l)
484 {
485 const struct ir3_stream_output_info *info = &v->stream_output;
486 /* Note: 64 here comes from the HW layout of the program RAM. The program
487 * for stream N is at DWORD 64 * N.
488 */
489 #define A6XX_SO_PROG_DWORDS 64
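   /* Each VPC_SO_PROG dword holds two component slots: A for an even
    * location and B for an odd one. For example, a component written at
    * location 5 on stream 0 lands in the B half of prog[2], and the same
    * location on stream 1 lands in prog[64 + 2].
    */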
490 uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
491 BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
492
493 /* TODO: streamout state should be in a non-GMEM draw state */
494
495 /* no streamout: */
496 if (info->num_outputs == 0) {
497 unsigned sizedw = 4;
498 if (cs->device->physical_device->info->a6xx.tess_use_shared)
499 sizedw += 2;
500
501 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, sizedw);
502 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
503 tu_cs_emit(cs, 0);
504 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
505 tu_cs_emit(cs, 0);
506
507 if (cs->device->physical_device->info->a6xx.tess_use_shared) {
508 tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
509 tu_cs_emit(cs, 0);
510 }
511
512 return;
513 }
514
515 for (unsigned i = 0; i < info->num_outputs; i++) {
516 const struct ir3_stream_output *out = &info->output[i];
517 unsigned k = out->register_index;
518 unsigned idx;
519
520 /* Skip it, if it's an output that was never assigned a register. */
521 if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
522 continue;
523
524          /* The linkage map is sorted in the order the fragment shader wants
525           * things, so we have to search for the matching slot here.
526 */
527 for (idx = 0; idx < l->cnt; idx++)
528 if (l->var[idx].slot == v->outputs[k].slot)
529 break;
530
531 assert(idx < l->cnt);
532
533 for (unsigned j = 0; j < out->num_components; j++) {
534 unsigned c = j + out->start_component;
535 unsigned loc = l->var[idx].loc + c;
536 unsigned off = j + out->dst_offset; /* in dwords */
537
538 assert(loc < A6XX_SO_PROG_DWORDS * 2);
539 unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
540 if (loc & 1) {
541 prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
542 A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
543 A6XX_VPC_SO_PROG_B_OFF(off * 4);
544 } else {
545 prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
546 A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
547 A6XX_VPC_SO_PROG_A_OFF(off * 4);
548 }
549 BITSET_SET(valid_dwords, dword);
550 }
551 }
552
553 unsigned prog_count = 0;
554 unsigned start, end;
555 BITSET_FOREACH_RANGE(start, end, valid_dwords,
556 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
557 prog_count += end - start + 1;
558 }
559
560 const bool emit_pc_so_stream_cntl =
561 cs->device->physical_device->info->a6xx.tess_use_shared &&
562 v->type == MESA_SHADER_TESS_EVAL;
563
564 if (emit_pc_so_stream_cntl)
565 prog_count += 1;
566
567 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
568 tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
569 tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
570 COND(info->stride[0] > 0,
571 A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
572 COND(info->stride[1] > 0,
573 A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
574 COND(info->stride[2] > 0,
575 A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
576 COND(info->stride[3] > 0,
577 A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
578 for (uint32_t i = 0; i < 4; i++) {
579 tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
580 tu_cs_emit(cs, info->stride[i]);
581 }
582 bool first = true;
583 BITSET_FOREACH_RANGE(start, end, valid_dwords,
584 A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
585 tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
586 tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
587 A6XX_VPC_SO_CNTL_ADDR(start));
588 for (unsigned i = start; i < end; i++) {
589 tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
590 tu_cs_emit(cs, prog[i]);
591 }
592 first = false;
593 }
594
595 if (emit_pc_so_stream_cntl) {
596 /* Possibly not tess_use_shared related, but the combination of
597 * tess + xfb fails some tests if we don't emit this.
598 */
599 tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
600 tu_cs_emit(cs, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written));
601 }
602 }
603
604 enum tu_geom_consts_type
605 {
606 TU_CONSTS_PRIMITIVE_MAP,
607 TU_CONSTS_PRIMITIVE_PARAM,
608 };
609
610 static void
611 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
612 const struct ir3_const_state *const_state,
613 unsigned constlen, enum a6xx_state_block block,
614 uint32_t offset, uint32_t size, const uint32_t *dwords) {
615 assert(size % 4 == 0);
616 dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
617
618 if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
619 uint32_t base;
620 switch (type) {
621 case TU_CONSTS_PRIMITIVE_MAP:
622 base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
623 break;
624 case TU_CONSTS_PRIMITIVE_PARAM:
625 base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
626 break;
627 default:
628 unreachable("bad consts type");
629 }
630
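      /* Clamp the upload so it never goes past the shader's constlen:
       * constants beyond constlen are never read by the shader, so loading
       * them would only waste CP time (and is presumably not guaranteed to
       * be safe either).
       */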
631 int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
632 if (adjusted_size <= 0)
633 return;
634
635 tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
636 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
637 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
638 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
639 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
640 CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
641
642 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
643 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
644
645 tu_cs_emit_array(cs, dwords, adjusted_size);
646 } else {
647 uint32_t base;
648 switch (type) {
649 case TU_CONSTS_PRIMITIVE_MAP:
650 base = const_state->primitive_map_ubo.idx;
651 break;
652 case TU_CONSTS_PRIMITIVE_PARAM:
653 base = const_state->primitive_param_ubo.idx;
654 break;
655 default:
656 unreachable("bad consts type");
657 }
658 if (base == -1)
659 return;
660
661 /* A7XX TODO: Emit data via sub_cs instead of NOP */
662 uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
663
664 tu_cs_emit_pkt7(cs, opcode, 5);
665 tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
666 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
667 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
668 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
669 CP_LOAD_STATE6_0_NUM_UNIT(1));
670 tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
671 tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
672 int size_vec4s = DIV_ROUND_UP(size, 4);
673 tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
674 }
675 }
676
677 static void
678 tu6_emit_link_map(struct tu_cs *cs,
679 const struct ir3_shader_variant *producer,
680 const struct ir3_shader_variant *consumer,
681 enum a6xx_state_block sb)
682 {
683 const struct ir3_const_state *const_state = ir3_const_state(consumer);
684 uint32_t size = ALIGN(consumer->input_size, 4);
685
686 if (size == 0)
687 return;
688
689 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
690 const_state, consumer->constlen, sb, 0, size, producer->output_loc);
691 }
692
693 static int
694 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
695 const struct ir3_shader_variant *last_shader,
696 uint32_t index,
697 uint8_t *interp_mode,
698 uint8_t *ps_repl_mode)
699 {
700 const uint32_t compmask = fs->inputs[index].compmask;
701
702 /* NOTE: varyings are packed, so if compmask is 0xb then first, second, and
703 * fourth component occupy three consecutive varying slots
704 */
705 int shift = 0;
706 *interp_mode = 0;
707 *ps_repl_mode = 0;
708 if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
709 if (compmask & 0x1) {
710 *ps_repl_mode |= PS_REPL_S << shift;
711 shift += 2;
712 }
713 if (compmask & 0x2) {
714 *ps_repl_mode |= PS_REPL_T << shift;
715 shift += 2;
716 }
717 if (compmask & 0x4) {
718 *interp_mode |= INTERP_ZERO << shift;
719 shift += 2;
720 }
721 if (compmask & 0x8) {
722 *interp_mode |= INTERP_ONE << 6;
723 shift += 2;
724 }
725 } else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
726 fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
727 /* If the last geometry shader doesn't statically write these, they're
728 * implicitly zero and the FS is supposed to read zero.
729 */
730 const gl_varying_slot slot = (gl_varying_slot) fs->inputs[index].slot;
731 if (ir3_find_output(last_shader, slot) < 0 &&
732 (compmask & 0x1)) {
733 *interp_mode |= INTERP_ZERO;
734 } else {
735 *interp_mode |= INTERP_FLAT;
736 }
737 } else if (fs->inputs[index].flat) {
738 for (int i = 0; i < 4; i++) {
739 if (compmask & (1 << i)) {
740 *interp_mode |= INTERP_FLAT << shift;
741 shift += 2;
742 }
743 }
744 }
745
746 return util_bitcount(compmask) * 2;
747 }
748
749 static void
750 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
751 const struct ir3_shader_variant *fs,
752 const struct ir3_shader_variant *last_shader)
753 {
754 uint32_t interp_modes[8] = { 0 };
755 uint32_t ps_repl_modes[8] = { 0 };
756 uint32_t interp_regs = 0;
757
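   /* Each input component gets a 2-bit mode, packed 16 components per
    * 32-bit register, with the bit position given by inloc * 2. A mode that
    * straddles a register boundary (e.g. a 4-component flat input at
    * inloc 14, whose 8 mode bits start at bit 28) has its low bits written
    * to register n and the rest spilled into register n + 1.
    */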
758 if (fs) {
759 for (int i = -1;
760 (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
761
762 /* get the mode for input i */
763 uint8_t interp_mode;
764 uint8_t ps_repl_mode;
765 const int bits =
766 tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
767
768 /* OR the mode into the array */
769 const uint32_t inloc = fs->inputs[i].inloc * 2;
770 uint32_t n = inloc / 32;
771 uint32_t shift = inloc % 32;
772 interp_modes[n] |= interp_mode << shift;
773 ps_repl_modes[n] |= ps_repl_mode << shift;
774 if (shift + bits > 32) {
775 n++;
776 shift = 32 - shift;
777
778 interp_modes[n] |= interp_mode >> shift;
779 ps_repl_modes[n] |= ps_repl_mode >> shift;
780 }
781 interp_regs = MAX2(interp_regs, n + 1);
782 }
783 }
784
785 if (interp_regs) {
786 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), interp_regs);
787 tu_cs_emit_array(cs, interp_modes, interp_regs);
788
789 tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), interp_regs);
790 tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
791 }
792 }
793
794 template <chip CHIP>
795 void
796 tu6_emit_vpc(struct tu_cs *cs,
797 const struct ir3_shader_variant *vs,
798 const struct ir3_shader_variant *hs,
799 const struct ir3_shader_variant *ds,
800 const struct ir3_shader_variant *gs,
801 const struct ir3_shader_variant *fs)
802 {
803 /* note: doesn't compile as static because of the array regs.. */
804 const struct reg_config {
805 uint16_t reg_sp_xs_out_reg;
806 uint16_t reg_sp_xs_vpc_dst_reg;
807 uint16_t reg_vpc_xs_pack;
808 uint16_t reg_vpc_xs_clip_cntl;
809 uint16_t reg_vpc_xs_clip_cntl_v2;
810 uint16_t reg_gras_xs_cl_cntl;
811 uint16_t reg_pc_xs_out_cntl;
812 uint16_t reg_sp_xs_primitive_cntl;
813 uint16_t reg_vpc_xs_layer_cntl;
814 uint16_t reg_vpc_xs_layer_cntl_v2;
815 uint16_t reg_gras_xs_layer_cntl;
816 } reg_config[] = {
817 [MESA_SHADER_VERTEX] = {
818 REG_A6XX_SP_VS_OUT_REG(0),
819 REG_A6XX_SP_VS_VPC_DST_REG(0),
820 REG_A6XX_VPC_VS_PACK,
821 REG_A6XX_VPC_VS_CLIP_CNTL,
822 REG_A6XX_VPC_VS_CLIP_CNTL_V2,
823 REG_A6XX_GRAS_VS_CL_CNTL,
824 REG_A6XX_PC_VS_OUT_CNTL,
825 REG_A6XX_SP_VS_PRIMITIVE_CNTL,
826 REG_A6XX_VPC_VS_LAYER_CNTL,
827 REG_A6XX_VPC_VS_LAYER_CNTL_V2,
828 REG_A6XX_GRAS_VS_LAYER_CNTL
829 },
830 [MESA_SHADER_TESS_CTRL] = {
831 0,
832 0,
833 0,
834 0,
835 0,
836 0,
837 REG_A6XX_PC_HS_OUT_CNTL,
838 0,
839 0,
840 0
841 },
842 [MESA_SHADER_TESS_EVAL] = {
843 REG_A6XX_SP_DS_OUT_REG(0),
844 REG_A6XX_SP_DS_VPC_DST_REG(0),
845 REG_A6XX_VPC_DS_PACK,
846 REG_A6XX_VPC_DS_CLIP_CNTL,
847 REG_A6XX_VPC_DS_CLIP_CNTL_V2,
848 REG_A6XX_GRAS_DS_CL_CNTL,
849 REG_A6XX_PC_DS_OUT_CNTL,
850 REG_A6XX_SP_DS_PRIMITIVE_CNTL,
851 REG_A6XX_VPC_DS_LAYER_CNTL,
852 REG_A6XX_VPC_DS_LAYER_CNTL_V2,
853 REG_A6XX_GRAS_DS_LAYER_CNTL
854 },
855 [MESA_SHADER_GEOMETRY] = {
856 REG_A6XX_SP_GS_OUT_REG(0),
857 REG_A6XX_SP_GS_VPC_DST_REG(0),
858 REG_A6XX_VPC_GS_PACK,
859 REG_A6XX_VPC_GS_CLIP_CNTL,
860 REG_A6XX_VPC_GS_CLIP_CNTL_V2,
861 REG_A6XX_GRAS_GS_CL_CNTL,
862 REG_A6XX_PC_GS_OUT_CNTL,
863 REG_A6XX_SP_GS_PRIMITIVE_CNTL,
864 REG_A6XX_VPC_GS_LAYER_CNTL,
865 REG_A6XX_VPC_GS_LAYER_CNTL_V2,
866 REG_A6XX_GRAS_GS_LAYER_CNTL
867 },
868 };
869
870 const struct ir3_shader_variant *last_shader;
871 if (gs) {
872 last_shader = gs;
873 } else if (hs) {
874 last_shader = ds;
875 } else {
876 last_shader = vs;
877 }
878
879    const struct reg_config *cfg = &reg_config[last_shader->type];
880
881 struct ir3_shader_linkage linkage = {
882 .primid_loc = 0xff,
883 .clip0_loc = 0xff,
884 .clip1_loc = 0xff,
885 };
886 if (fs)
887 ir3_link_shaders(&linkage, last_shader, fs, true);
888
889 if (last_shader->stream_output.num_outputs)
890 ir3_link_stream_out(&linkage, last_shader);
891
892 /* a6xx finds position/pointsize at the end */
893 const uint32_t pointsize_regid =
894 ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
895 const uint32_t layer_regid =
896 ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
897 const uint32_t view_regid =
898 ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
899 const uint32_t clip0_regid =
900 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
901 const uint32_t clip1_regid =
902 ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
903 uint32_t flags_regid = gs ?
904 ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
905 const uint32_t shading_rate_regid =
906 ir3_find_output_regid(last_shader, VARYING_SLOT_PRIMITIVE_SHADING_RATE);
907
908 uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
909 uint32_t shading_rate_loc = 0xff;
910
911 if (layer_regid != regid(63, 0)) {
912 layer_loc = linkage.max_loc;
913 ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
914 }
915
916 if (view_regid != regid(63, 0)) {
917 view_loc = linkage.max_loc;
918 ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
919 }
920
921 if (shading_rate_regid != regid(63, 0)) {
922 shading_rate_loc = linkage.max_loc;
923 ir3_link_add(&linkage, VARYING_SLOT_PRIMITIVE_SHADING_RATE,
924 shading_rate_regid, 0x1, linkage.max_loc);
925 }
926
927 unsigned extra_pos = 0;
928
929 for (unsigned i = 0; i < last_shader->outputs_count; i++) {
930 if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
931 continue;
932
933 if (position_loc == 0xff)
934 position_loc = linkage.max_loc;
935
936 ir3_link_add(&linkage, last_shader->outputs[i].slot,
937 last_shader->outputs[i].regid,
938 0xf, position_loc + 4 * last_shader->outputs[i].view);
939 extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
940 }
941
942 if (pointsize_regid != regid(63, 0)) {
943 pointsize_loc = linkage.max_loc;
944 ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
945 }
946
947 uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
948
949 /* Handle the case where clip/cull distances aren't read by the FS */
950 uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
951 if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
952 clip0_loc = linkage.max_loc;
953 ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
954 clip_cull_mask & 0xf, linkage.max_loc);
955 }
956 if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
957 clip1_loc = linkage.max_loc;
958 ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
959 clip_cull_mask >> 4, linkage.max_loc);
960 }
961
962 tu6_setup_streamout<CHIP>(cs, last_shader, &linkage);
963
964 /* There is a hardware bug on a750 where STRIDE_IN_VPC of 5 to 8 in GS with
965 * an input primitive type with adjacency, an output primitive type of
966 * points, and a high enough vertex count causes a hang.
967 */
968 if (cs->device->physical_device->info->a7xx.gs_vpc_adjacency_quirk &&
969 gs && gs->gs.output_primitive == MESA_PRIM_POINTS &&
970 linkage.max_loc > 4) {
971 linkage.max_loc = MAX2(linkage.max_loc, 9);
972 }
973
974 /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
975 * at least when a DS is the last stage, so add a dummy output to keep it
976 * happy if there aren't any. We do this late in order to avoid emitting
977 * any unused code and make sure that optimizations don't remove it.
978 */
979 if (linkage.cnt == 0)
980 ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
981
982 /* map outputs of the last shader to VPC */
983 assert(linkage.cnt <= 32);
984 const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
985 const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
986 uint32_t sp_out[16] = {0};
987 uint32_t sp_vpc_dst[8] = {0};
988 for (uint32_t i = 0; i < linkage.cnt; i++) {
989 ((uint16_t *) sp_out)[i] =
990 A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
991 A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
992 ((uint8_t *) sp_vpc_dst)[i] =
993 A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
994 }
995
996 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
997 tu_cs_emit_array(cs, sp_out, sp_out_count);
998
999 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
1000 tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
1001
1002 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
1003 tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
1004 A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
1005 A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
1006 A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
1007
1008 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
1009 tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1010 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1011 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1012 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl_v2, 1);
1013 tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1014 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1015 A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1016
1017 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
1018 tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
1019 A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1020
1021 const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1022
1023 for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1024 const struct ir3_shader_variant *shader = geom_shaders[i];
1025 if (!shader)
1026 continue;
1027
1028 bool primid = shader->type != MESA_SHADER_VERTEX &&
1029 VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1030
1031 tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1032 if (shader == last_shader) {
1033 tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1034 CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1035 CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1036 CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1037 COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1038 A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask) |
1039 CONDREG(shading_rate_regid, A6XX_PC_VS_OUT_CNTL_SHADINGRATE));
1040 } else {
1041 tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1042 }
1043 }
1044
1045    /* if vertex_flags somehow gets optimized out, you're going to have a bad time: */
1046 if (gs)
1047 assert(flags_regid != INVALID_REG);
1048
1049 tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1050 tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1051 A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1052
1053 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1054 tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1055 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1056 A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(shading_rate_loc));
1057 tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl_v2, 1);
1058 tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1059 A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1060 A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(shading_rate_loc));
1061
1062 tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1063 tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1064 CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1065
1066 tu6_emit_vpc_varying_modes(cs, fs, last_shader);
1067 }
1068 TU_GENX(tu6_emit_vpc);
1069
1070 static void
1071 tu6_emit_vs_params(struct tu_cs *cs,
1072 const struct ir3_const_state *const_state,
1073 unsigned constlen,
1074 unsigned param_stride,
1075 unsigned num_vertices)
1076 {
1077 uint32_t vs_params[4] = {
1078 param_stride * num_vertices * 4, /* vs primitive stride */
1079 param_stride * 4, /* vs vertex stride */
1080 0,
1081 0,
1082 };
1083 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1084 const_state, constlen, SB6_VS_SHADER, 0,
1085 ARRAY_SIZE(vs_params), vs_params);
1086 }
1087
1088 static void
1089 tu_get_tess_iova(struct tu_device *dev,
1090 uint64_t *tess_factor_iova,
1091 uint64_t *tess_param_iova)
1092 {
1093 /* Create the shared tess factor BO the first time tess is used on the device. */
1094 if (!dev->tess_bo) {
1095 mtx_lock(&dev->mutex);
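      /* Re-check under the lock: another thread may have created the BO
       * between the unlocked check above and taking the mutex.
       */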
1096 if (!dev->tess_bo) {
1097 tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
1098 TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
1099 }
1100 mtx_unlock(&dev->mutex);
1101 }
1102
1103 *tess_factor_iova = dev->tess_bo->iova;
1104 *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
1105 }
1106
1107 static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
1108 MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
1109 };
1110
1111 #define HS_PARAMS_SIZE 8
1112
1113 template <chip CHIP>
1114 static unsigned
1115 tu6_patch_control_points_size(struct tu_device *dev,
1116 const struct tu_shader *vs,
1117 const struct tu_shader *tcs,
1118 const struct tu_shader *tes,
1119 const struct tu_program_state *program,
1120 uint32_t patch_control_points)
1121 {
1122 if (dev->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1123 #define EMIT_CONST_DWORDS(const_dwords) (6 + const_dwords + 4)
1124 return EMIT_CONST_DWORDS(4) +
1125 EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1126 #undef EMIT_CONST_DWORDS
1127 } else {
1128 #define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
1129 return EMIT_CONST_DWORDS(4) +
1130 EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1131 #undef EMIT_CONST_DWORDS
1132 }
1133 }
1134
1135 template <chip CHIP>
1136 void
1137 tu6_emit_patch_control_points(struct tu_cs *cs,
1138 const struct tu_shader *vs,
1139 const struct tu_shader *tcs,
1140 const struct tu_shader *tes,
1141 const struct tu_program_state *program,
1142 uint32_t patch_control_points)
1143 {
1144 if (!tcs->variant)
1145 return;
1146
1147 struct tu_device *dev = cs->device;
1148
1149 tu6_emit_vs_params(cs,
1150 &program->link[MESA_SHADER_VERTEX].const_state,
1151 program->link[MESA_SHADER_VERTEX].constlen,
1152 vs->variant->output_size,
1153 patch_control_points);
1154
1155 uint64_t tess_factor_iova, tess_param_iova;
1156 tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1157
1158 uint32_t hs_params[HS_PARAMS_SIZE] = {
1159 vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
1160 vs->variant->output_size * 4, /* hs vertex stride */
1161 tcs->variant->output_size,
1162 patch_control_points,
1163 tess_param_iova,
1164 tess_param_iova >> 32,
1165 tess_factor_iova,
1166 tess_factor_iova >> 32,
1167 };
1168
1169 const struct ir3_const_state *hs_const =
1170 &program->link[MESA_SHADER_TESS_CTRL].const_state;
1171 unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
1172 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1173 hs_const, hs_constlen, SB6_HS_SHADER, 0,
1174 ARRAY_SIZE(hs_params), hs_params);
1175
1176 uint32_t patch_local_mem_size_16b =
1177 patch_control_points * vs->variant->output_size / 4;
1178
1179 /* Total attribute slots in HS incoming patch. */
1180 tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1181 tu_cs_emit(cs, patch_local_mem_size_16b);
1182
1183 const uint32_t wavesize = 64;
1184 const uint32_t vs_hs_local_mem_size = 16384;
1185
1186 uint32_t max_patches_per_wave;
1187 if (dev->physical_device->info->a6xx.tess_use_shared) {
1188 /* HS invocations for a patch are always within the same wave,
1189 * making barriers less expensive. VS can't have barriers so we
1190 * don't care about VS invocations being in the same wave.
1191 */
1192 max_patches_per_wave = wavesize / tcs->variant->tess.tcs_vertices_out;
1193 } else {
1194 /* VS is also in the same wave */
1195 max_patches_per_wave =
1196 wavesize / MAX2(patch_control_points,
1197 tcs->variant->tess.tcs_vertices_out);
1198 }
1199
1200 uint32_t patches_per_wave =
1201 MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
1202 max_patches_per_wave);
1203
1204 uint32_t wave_input_size = DIV_ROUND_UP(
1205 patches_per_wave * patch_local_mem_size_16b * 16, 256);
1206
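   /* Worked example with illustrative numbers: patch_control_points = 3 and
    * a VS output_size of 12 dwords give patch_local_mem_size_16b = 9. With
    * tess_use_shared and tcs_vertices_out = 3, max_patches_per_wave is
    * 64 / 3 = 21, patches_per_wave = MIN2(16384 / 144, 21) = 21, and
    * wave_input_size = DIV_ROUND_UP(21 * 9 * 16, 256) = 12.
    */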
1207 tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1208 tu_cs_emit(cs, wave_input_size);
1209
1210 /* maximum number of patches that can fit in tess factor/param buffers */
1211 uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
1212 TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
1213 /* convert from # of patches to draw count */
1214 subdraw_size *= patch_control_points;
1215
1216 tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
1217 tu_cs_emit(cs, subdraw_size);
1218 }
1219
1220 static void
1221 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1222 const struct ir3_shader_variant *vs,
1223 const struct ir3_shader_variant *hs,
1224 const struct ir3_shader_variant *ds,
1225 const struct ir3_shader_variant *gs)
1226 {
1227 struct tu_device *dev = cs->device;
1228
1229 if (gs && !hs) {
1230 tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
1231 vs->output_size, gs->gs.vertices_in);
1232 }
1233
1234 if (hs) {
1235 uint64_t tess_factor_iova, tess_param_iova;
1236 tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1237
1238 uint32_t ds_params[8] = {
1239 gs ? ds->output_size * gs->gs.vertices_in * 4 : 0, /* ds primitive stride */
1240 ds->output_size * 4, /* ds vertex stride */
1241 hs->output_size, /* hs vertex stride (dwords) */
1242 hs->tess.tcs_vertices_out,
1243 tess_param_iova,
1244 tess_param_iova >> 32,
1245 tess_factor_iova,
1246 tess_factor_iova >> 32,
1247 };
1248
1249 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1250 ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
1251 ARRAY_SIZE(ds_params), ds_params);
1252 }
1253
1254 if (gs) {
1255 const struct ir3_shader_variant *prev = ds ? ds : vs;
1256 uint32_t gs_params[4] = {
1257 prev->output_size * gs->gs.vertices_in * 4, /* gs primitive stride */
1258 prev->output_size * 4, /* gs vertex stride */
1259 0,
1260 0,
1261 };
1262 tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1263 gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
1264 ARRAY_SIZE(gs_params), gs_params);
1265 }
1266 }
1267
1268 template <chip CHIP>
1269 static void
1270 tu6_emit_program_config(struct tu_cs *cs,
1271 const struct tu_program_state *prog,
1272 struct tu_shader **shaders,
1273 const struct ir3_shader_variant **variants)
1274 {
1275 STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1276
1277 bool shared_consts_enable =
1278 prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
1279 tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1280
1281 tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1282 .vs_state = true,
1283 .hs_state = true,
1284 .ds_state = true,
1285 .gs_state = true,
1286 .fs_state = true,
1287 .gfx_ibo = true,
1288 .gfx_shared_const = shared_consts_enable));
1289 for (size_t stage_idx = MESA_SHADER_VERTEX;
1290 stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1291 gl_shader_stage stage = (gl_shader_stage) stage_idx;
1292 tu6_emit_xs_config<CHIP>(cs, stage, variants[stage]);
1293 }
1294
1295 for (size_t stage_idx = MESA_SHADER_VERTEX;
1296 stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1297 gl_shader_stage stage = (gl_shader_stage) stage_idx;
1298 tu6_emit_dynamic_offset(cs, variants[stage], shaders[stage], prog);
1299 }
1300
1301 const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
1302 const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
1303 const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
1304 const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
1305
1306 if (hs) {
1307 tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1308 tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1309 }
1310
1311 if (gs) {
1312 if (hs) {
1313 tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1314 } else {
1315 tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1316 }
1317
1318 uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1319
1320 if (CHIP == A6XX) {
1321          /* Size of per-primitive allocation in ldlw memory in vec4s. */
1322 uint32_t vec4_size = gs->gs.vertices_in *
1323 DIV_ROUND_UP(prev_stage_output_size, 4);
1324
1325 tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1326 tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1327 }
1328
1329 uint32_t prim_size = prev_stage_output_size;
1330 if (prim_size > 64)
1331 prim_size = 64;
1332 else if (prim_size == 64)
1333 prim_size = 63;
1334 tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1335 tu_cs_emit(cs, prim_size);
1336 }
1337
1338 if (gs || hs) {
1339 tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
1340 }
1341 }
1342
1343 static bool
1344 contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
1345 {
1346 return (state &
1347 (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1348 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
1349 (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1350 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
1351 }
1352
1353 static bool
1354 pipeline_contains_all_shader_state(struct tu_pipeline *pipeline)
1355 {
1356 return pipeline->type == TU_PIPELINE_GRAPHICS ||
1357 pipeline->type == TU_PIPELINE_COMPUTE ||
1358 contains_all_shader_state(tu_pipeline_to_graphics_lib(pipeline)->state);
1359 }
1360
1361 /* Return true if this pipeline contains all of the GPL stages listed but none
1362 * of the libraries it uses do, so this is "the first time" that all of them
1363 * are defined together. This is useful for state that needs to be combined
1364 * from multiple GPL stages.
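 *
 * For example, when a pre-rasterization-shaders library and a
 * fragment-shader library are linked together, the first pipeline that has
 * both flags set is the one that emits the combined descriptor load state.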
1365 */
1366
1367 static bool
1368 set_combined_state(struct tu_pipeline_builder *builder,
1369 struct tu_pipeline *pipeline,
1370 VkGraphicsPipelineLibraryFlagsEXT state)
1371 {
1372 if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB &&
1373 (tu_pipeline_to_graphics_lib(pipeline)->state & state) != state)
1374 return false;
1375
1376 for (unsigned i = 0; i < builder->num_libraries; i++) {
1377 if ((builder->libraries[i]->state & state) == state)
1378 return false;
1379 }
1380
1381 return true;
1382 }
1383
1384 #define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
1385
1386 static VkResult
1387 tu_pipeline_allocate_cs(struct tu_device *dev,
1388 struct tu_pipeline *pipeline,
1389 struct tu_pipeline_layout *layout,
1390 struct tu_pipeline_builder *builder,
1391 const struct ir3_shader_variant *compute)
1392 {
1393 uint32_t size = 1024;
1394
1395 /* graphics case: */
1396 if (builder) {
1397 if (builder->state &
1398 VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
1399 size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
1400 }
1401
1402 if (set_combined_state(builder, pipeline,
1403 VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1404 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
1405 size += tu6_load_state_size(pipeline, layout);
1406 }
1407 } else {
1408 size += tu6_load_state_size(pipeline, layout);
1409 }
1410
1411 /* Allocate the space for the pipeline out of the device's RO suballocator.
1412 *
1413 * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
1414 * BOs at exec time.
1415 *
1416 * The pipeline cache would seem like a natural place to stick the
1417 * suballocator, except that it is not guaranteed to outlive the pipelines
1418 * created from it, so you can't store any long-lived state there, and you
1419 * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
1420 * pipeline destroy isn't synchronized by the cache.
1421 */
1422 mtx_lock(&dev->pipeline_mutex);
1423 VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
1424 size * 4, 128);
1425 mtx_unlock(&dev->pipeline_mutex);
1426 if (result != VK_SUCCESS)
1427 return result;
1428
1429 TU_RMV(cmd_buffer_suballoc_bo_create, dev, &pipeline->bo);
1430 tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
1431
1432 return VK_SUCCESS;
1433 }
1434
1435 static void
1436 tu_append_executable(struct tu_pipeline *pipeline,
1437 const struct ir3_shader_variant *variant,
1438 char *nir_from_spirv)
1439 {
1440 struct tu_pipeline_executable exe = {
1441 .stage = variant->type,
1442 .stats = variant->info,
1443 .is_binning = variant->binning_pass,
1444 .nir_from_spirv = nir_from_spirv,
1445 .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
1446 .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
1447 };
1448
1449 util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
1450 }
1451
1452 static void
1453 tu_hash_stage(struct mesa_sha1 *ctx,
1454 VkPipelineCreateFlags2KHR pipeline_flags,
1455 const VkPipelineShaderStageCreateInfo *stage,
1456 const nir_shader *nir,
1457 const struct tu_shader_key *key)
1458 {
1459
1460 if (nir) {
1461 struct blob blob;
1462 blob_init(&blob);
1463 nir_serialize(&blob, nir, true);
1464 _mesa_sha1_update(ctx, blob.data, blob.size);
1465 blob_finish(&blob);
1466 } else {
1467 unsigned char stage_hash[SHA1_DIGEST_LENGTH];
1468 vk_pipeline_hash_shader_stage(pipeline_flags, stage, NULL, stage_hash);
1469 _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
1470 }
1471 _mesa_sha1_update(ctx, key, sizeof(*key));
1472 }
1473
1474 static void
1475 tu_hash_shaders(unsigned char *hash,
1476 VkPipelineCreateFlags2KHR pipeline_flags,
1477 const VkPipelineShaderStageCreateInfo **stages,
1478 nir_shader *const *nir,
1479 const struct tu_pipeline_layout *layout,
1480 const struct tu_shader_key *keys,
1481 VkGraphicsPipelineLibraryFlagsEXT state)
1482 {
1483 struct mesa_sha1 ctx;
1484
1485 _mesa_sha1_init(&ctx);
1486
1487 if (layout)
1488 _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1489
1490 for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
1491 if (stages[i] || nir[i]) {
1492 tu_hash_stage(&ctx, pipeline_flags, stages[i], nir[i], &keys[i]);
1493 }
1494 }
1495 _mesa_sha1_update(&ctx, &state, sizeof(state));
1496 enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
1497 _mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
1498 _mesa_sha1_final(&ctx, hash);
1499 }
1500
1501 static void
1502 tu_hash_compute(unsigned char *hash,
1503 VkPipelineCreateFlags2KHR pipeline_flags,
1504 const VkPipelineShaderStageCreateInfo *stage,
1505 const struct tu_pipeline_layout *layout,
1506 const struct tu_shader_key *key)
1507 {
1508 struct mesa_sha1 ctx;
1509
1510 _mesa_sha1_init(&ctx);
1511
1512 if (layout)
1513 _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1514
1515 tu_hash_stage(&ctx, pipeline_flags, stage, NULL, key);
1516 enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
1517 _mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
1518
1519 _mesa_sha1_final(&ctx, hash);
1520 }
1521
1522 static struct tu_shader *
1523 tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
1524 const void *key_data, size_t key_size,
1525 bool *application_cache_hit)
1526 {
1527 struct vk_pipeline_cache_object *object =
1528 vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1529 &tu_shader_ops, application_cache_hit);
1530 if (object)
1531 return container_of(object, struct tu_shader, base);
1532 else
1533 return NULL;
1534 }
1535
1536 static struct tu_shader *
1537 tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
1538 struct tu_shader *shader)
1539 {
1540 struct vk_pipeline_cache_object *object =
1541 vk_pipeline_cache_add_object(cache, &shader->base);
1542 return container_of(object, struct tu_shader, base);
1543 }
1544
1545 static bool
1546 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1547 struct blob *blob);
1548
1549 static struct vk_pipeline_cache_object *
1550 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1551 const void *key_data,
1552 size_t key_size,
1553 struct blob_reader *blob);
1554
1555 static void
1556 tu_nir_shaders_destroy(struct vk_device *device,
1557 struct vk_pipeline_cache_object *object)
1558 {
1559 struct tu_nir_shaders *shaders =
1560 container_of(object, struct tu_nir_shaders, base);
1561
1562 for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
1563 ralloc_free(shaders->nir[i]);
1564
1565 vk_pipeline_cache_object_finish(&shaders->base);
1566 vk_free(&device->alloc, shaders);
1567 }
1568
1569 const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
1570 .serialize = tu_nir_shaders_serialize,
1571 .deserialize = tu_nir_shaders_deserialize,
1572 .destroy = tu_nir_shaders_destroy,
1573 };
1574
1575 static struct tu_nir_shaders *
1576 tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
1577 {
1578 VK_MULTIALLOC(ma);
1579 VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
1580 VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
1581
1582 if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
1583 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
1584 return NULL;
1585
1586 memcpy(obj_key_data, key_data, key_size);
1587 vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
1588 &tu_nir_shaders_ops, obj_key_data, key_size);
1589
1590 return shaders;
1591 }
1592
1593 static bool
1594 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1595 struct blob *blob)
1596 {
1597 struct tu_nir_shaders *shaders =
1598 container_of(object, struct tu_nir_shaders, base);
1599
1600 for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1601 if (shaders->nir[i]) {
1602 blob_write_uint8(blob, 1);
1603 nir_serialize(blob, shaders->nir[i], true);
1604 } else {
1605 blob_write_uint8(blob, 0);
1606 }
1607 }
1608
1609 return true;
1610 }
1611
1612 static struct vk_pipeline_cache_object *
1613 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1614 const void *key_data,
1615 size_t key_size,
1616 struct blob_reader *blob)
1617 {
1618 struct tu_device *dev =
1619 container_of(cache->base.device, struct tu_device, vk);
1620 struct tu_nir_shaders *shaders =
1621 tu_nir_shaders_init(dev, key_data, key_size);
1622
1623 if (!shaders)
1624 return NULL;
1625
1626 for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1627 if (blob_read_uint8(blob)) {
1628 shaders->nir[i] =
1629 nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
1630 }
1631 }
1632
1633 return &shaders->base;
1634 }
1635
1636 static struct tu_nir_shaders *
1637 tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
1638 const void *key_data, size_t key_size,
1639 bool *application_cache_hit)
1640 {
1641 struct vk_pipeline_cache_object *object =
1642 vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1643 &tu_nir_shaders_ops, application_cache_hit);
1644 if (object)
1645 return container_of(object, struct tu_nir_shaders, base);
1646 else
1647 return NULL;
1648 }
1649
1650 static struct tu_nir_shaders *
1651 tu_nir_cache_insert(struct vk_pipeline_cache *cache,
1652 struct tu_nir_shaders *shaders)
1653 {
1654 struct vk_pipeline_cache_object *object =
1655 vk_pipeline_cache_add_object(cache, &shaders->base);
1656 return container_of(object, struct tu_nir_shaders, base);
1657 }
1658
1659 static VkResult
1660 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
1661 struct tu_pipeline *pipeline)
1662 {
1663 VkResult result = VK_SUCCESS;
1664 const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
1665 NULL
1666 };
1667 VkPipelineCreationFeedback pipeline_feedback = {
1668 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
1669 };
1670 VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
1671
1672 const bool executable_info =
1673 builder->create_flags &
1674 VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
1675
1676 bool retain_nir =
1677 builder->create_flags &
1678 VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
1679
1680 int64_t pipeline_start = os_time_get_nano();
1681
1682 const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
1683 vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
1684
1685 bool must_compile = false;
1686 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
1687 if (!(builder->active_stages & builder->create_info->pStages[i].stage))
1688 continue;
1689
1690 gl_shader_stage stage =
1691 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
1692 stage_infos[stage] = &builder->create_info->pStages[i];
1693 must_compile = true;
1694 }
1695
1696 /* Forward declare everything due to the goto usage */
1697 nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
1698 struct tu_shader *shaders[ARRAY_SIZE(stage_infos)] = { NULL };
1699 nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
1700 char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
1701 bool cache_hit = false;
1702
1703 struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
1704 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1705 stage < ARRAY_SIZE(keys); stage = (gl_shader_stage) (stage+1)) {
1706 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info = NULL;
1707 if (stage_infos[stage])
1708 subgroup_info = vk_find_struct_const(stage_infos[stage],
1709 PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
1710 bool allow_varying_subgroup_size =
1711 !stage_infos[stage] ||
1712 (stage_infos[stage]->flags &
1713 VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
1714 bool require_full_subgroups =
1715 stage_infos[stage] &&
1716 (stage_infos[stage]->flags &
1717 VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
1718 tu_shader_key_subgroup_size(&keys[stage], allow_varying_subgroup_size,
1719 require_full_subgroups, subgroup_info,
1720 builder->device);
1721
1722 if (stage_infos[stage]) {
1723 struct vk_pipeline_robustness_state rs;
1724 vk_pipeline_robustness_state_fill(&builder->device->vk, &rs,
1725 builder->create_info->pNext,
1726 stage_infos[stage]->pNext);
1727 tu_shader_key_robustness(&keys[stage], &rs);
1728 if (builder->create_flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR)
1729 keys[stage].lower_view_index_to_device_index = true;
1730 }
1731 }
1732
1733 if ((builder->state &
1734 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
1735 builder->graphics_state.ial &&
1736 builder->create_info->renderPass == VK_NULL_HANDLE) {
1737 const struct vk_input_attachment_location_state *ial =
1738 builder->graphics_state.ial;
1739
1740 keys[MESA_SHADER_FRAGMENT].dynamic_renderpass = true;
1741
1742 uint32_t attachments_referenced = 0;
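/* Bit layout used below: bit 0 stands for a depth/stencil input attachment
 * addressed with no index (MESA_VK_ATTACHMENT_NO_INDEX), bit (att + 1) for an
 * explicitly indexed depth/stencil attachment, and color attachments occupy
 * bits starting at TU_DYN_INPUT_ATT_OFFSET. The complement of this mask
 * becomes read_only_input_attachments.
 */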
1743
1744 if (ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN) {
1745 attachments_referenced |=
1746 BITFIELD_MASK(MAX_RTS) << TU_DYN_INPUT_ATT_OFFSET;
1747 } else {
1748 for (unsigned i = 0; i < ial->color_attachment_count; i++) {
1749 if (ial->color_map[i] != MESA_VK_ATTACHMENT_UNUSED) {
1750 attachments_referenced |=
1751 (1u << (ial->color_map[i] + TU_DYN_INPUT_ATT_OFFSET));
1752 }
1753 }
1754 }
1755
1756 if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
1757 if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX)
1758 attachments_referenced |= 1;
1759 else
1760 attachments_referenced |= 1u << (ial->depth_att + 1);
1761 }
1762
1763 if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
1764 if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX)
1765 attachments_referenced |= 1;
1766 else
1767 attachments_referenced |= 1u << (ial->stencil_att + 1);
1768 }
1769
1770 keys[MESA_SHADER_FRAGMENT].read_only_input_attachments =
1771 ~attachments_referenced;
1772 }
1773
1774 if (builder->create_flags &
1775 VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
1776 for (unsigned i = 0; i < builder->num_libraries; i++) {
1777 struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1778
1779 for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
1780 if (library->shaders[j].nir) {
1781 assert(!nir[j]);
1782 nir[j] = nir_shader_clone(builder->mem_ctx,
1783 library->shaders[j].nir);
1784 keys[j] = library->shaders[j].key;
1785 must_compile = true;
1786 }
1787 }
1788 }
1789 }
1790
1791 struct tu_nir_shaders *nir_shaders = NULL;
1792 if (!must_compile)
1793 goto done;
1794
1795 if (builder->state &
1796 VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1797 keys[MESA_SHADER_VERTEX].multiview_mask =
1798 builder->graphics_state.rp->view_mask;
1799 }
1800
1801 if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1802 keys[MESA_SHADER_FRAGMENT].multiview_mask =
1803 builder->graphics_state.rp->view_mask;
1804 keys[MESA_SHADER_FRAGMENT].fragment_density_map =
1805 builder->fragment_density_map;
1806 keys[MESA_SHADER_FRAGMENT].unscaled_input_fragcoord =
1807 builder->unscaled_input_fragcoord;
1808
1809 const VkPipelineMultisampleStateCreateInfo *msaa_info =
1810 builder->create_info->pMultisampleState;
1811
1812 /* The 1.3.215 spec says:
1813 *
1814 * Sample shading can be used to specify a minimum number of unique
1815 * samples to process for each fragment. If sample shading is enabled,
1816 * an implementation must provide a minimum of
1817 *
1818 * max(ceil(minSampleShadingFactor * totalSamples), 1)
1819 *
1820 * unique associated data for each fragment, where
1821 * minSampleShadingFactor is the minimum fraction of sample shading.
1822 *
1823 * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
1824 * They both require unique associated data.
1825 *
1826 * There are discussions to change the definition, such that
1827 * sampleShadingEnable does not imply unique associated data. Before the
1828 * discussions are settled and before apps (i.e., ANGLE) are fixed to
1829 * follow the new and incompatible definition, we should stick to the
1830 * current definition.
1831 *
1832 * Note that ir3_shader_key::sample_shading is not actually used by ir3,
1833 * just checked in tu6_emit_fs_inputs. We will also copy the value to
1834 * tu_shader_key::force_sample_interp in a bit.
1835 */
1836 keys[MESA_SHADER_FRAGMENT].force_sample_interp =
1837 !builder->rasterizer_discard && msaa_info && msaa_info->sampleShadingEnable;
1838 }
1839
1840 unsigned char pipeline_sha1[20];
1841 tu_hash_shaders(pipeline_sha1, builder->create_flags, stage_infos, nir,
1842 &builder->layout, keys, builder->state);
1843
1844 unsigned char nir_sha1[21];
1845 memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1846 nir_sha1[20] = 'N';
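/* The 20-byte pipeline hash is extended with one discriminator byte to form
 * the 21-byte cache keys: per-stage shader objects use the stage index as
 * byte 20 (shader_sha1 below), while the retained-NIR object uses 'N', so all
 * keys derived from the same pipeline hash stay distinct within one
 * vk_pipeline_cache.
 */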
1847
1848 if (!executable_info) {
1849 cache_hit = true;
1850 bool application_cache_hit = false;
1851
1852 unsigned char shader_sha1[21];
1853 memcpy(shader_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1854
1855 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1856 stage = (gl_shader_stage) (stage + 1)) {
1857 if (stage_infos[stage] || nir[stage]) {
1858 bool shader_application_cache_hit;
1859 shader_sha1[20] = (unsigned char) stage;
1860 shaders[stage] =
1861 tu_pipeline_cache_lookup(builder->cache, &shader_sha1,
1862 sizeof(shader_sha1),
1863 &shader_application_cache_hit);
1864 if (!shaders[stage]) {
1865 cache_hit = false;
1866 break;
1867 }
1868 application_cache_hit &= shader_application_cache_hit;
1869 }
1870 }
1871
1872 /* If the user asks us to keep the NIR around, we need to have it for a
1873 * successful cache hit. If we only have a "partial" cache hit, then we
1874 * still need to recompile in order to get the NIR.
1875 */
1876 if (cache_hit &&
1877 (builder->create_flags &
1878 VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
1879 bool nir_application_cache_hit = false;
1880 nir_shaders =
1881 tu_nir_cache_lookup(builder->cache, &nir_sha1,
1882 sizeof(nir_sha1),
1883 &nir_application_cache_hit);
1884
1885 application_cache_hit &= nir_application_cache_hit;
1886 cache_hit &= !!nir_shaders;
1887 }
1888
1889 if (application_cache_hit && builder->cache != builder->device->mem_cache) {
1890 pipeline_feedback.flags |=
1891 VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
1892 }
1893 }
1894
1895 if (!cache_hit) {
1896 if (builder->create_flags &
1897 VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
1898 return VK_PIPELINE_COMPILE_REQUIRED;
1899 }
1900
1901 result = tu_compile_shaders(builder->device,
1902 builder->create_flags,
1903 stage_infos,
1904 nir,
1905 keys,
1906 &builder->layout,
1907 pipeline_sha1,
1908 shaders,
1909 executable_info ? nir_initial_disasm : NULL,
1910 pipeline->executables_mem_ctx,
1911 retain_nir ? post_link_nir : NULL,
1912 stage_feedbacks);
1913
1914 if (result != VK_SUCCESS)
1915 goto fail;
1916
1917 if (retain_nir) {
1918 nir_shaders =
1919 tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
1920 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1921 stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1922 if (!post_link_nir[stage])
1923 continue;
1924
1925 nir_shaders->nir[stage] = post_link_nir[stage];
1926 }
1927
1928 nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
1929 }
1930
1931 for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1932 stage = (gl_shader_stage) (stage + 1)) {
1933 if (!nir[stage])
1934 continue;
1935
1936 shaders[stage] = tu_pipeline_cache_insert(builder->cache, shaders[stage]);
1937 }
1938 }
1939
1940 done:
1941
1942 /* Create empty shaders which contain the draw states to initialize
1943 * registers for unused shader stages.
1944 */
1945 if (builder->state &
1946 VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1947 if (!shaders[MESA_SHADER_TESS_CTRL]) {
1948 shaders[MESA_SHADER_TESS_CTRL] = builder->device->empty_tcs;
1949 vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_CTRL]->base);
1950 }
1951 if (!shaders[MESA_SHADER_TESS_EVAL]) {
1952 shaders[MESA_SHADER_TESS_EVAL] = builder->device->empty_tes;
1953 vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_EVAL]->base);
1954 }
1955 if (!shaders[MESA_SHADER_GEOMETRY]) {
1956 shaders[MESA_SHADER_GEOMETRY] = builder->device->empty_gs;
1957 vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_GEOMETRY]->base);
1958 }
1959 }
1960
1961 if (builder->state &
1962 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1963 if (!shaders[MESA_SHADER_FRAGMENT]) {
1964 shaders[MESA_SHADER_FRAGMENT] =
1965 builder->fragment_density_map ?
1966 builder->device->empty_fs_fdm : builder->device->empty_fs;
1967 vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_FRAGMENT]->base);
1968 }
1969 }
1970
1971 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1972 stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1973 if (shaders[stage] && shaders[stage]->variant) {
1974 tu_append_executable(pipeline, shaders[stage]->variant,
1975 nir_initial_disasm[stage]);
1976 }
1977 }
1978
1979 /* We may have deduplicated a cache entry, in which case our original
1980 * post_link_nir may be gone.
1981 */
1982 if (nir_shaders) {
1983 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1984 stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1985 if (nir_shaders->nir[stage]) {
1986 post_link_nir[stage] = nir_shaders->nir[stage];
1987 }
1988 }
1989 }
1990
1991 /* In the case where we're building a library without link-time
1992 * optimization but with sub-libraries that retain LTO info, we should
1993 * retain it ourselves in case another pipeline includes us with LTO.
1994 */
1995 for (unsigned i = 0; i < builder->num_libraries; i++) {
1996 struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1997 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1998 stage < ARRAY_SIZE(library->shaders);
1999 stage = (gl_shader_stage) (stage + 1)) {
2000 if (!post_link_nir[stage] && library->shaders[stage].nir) {
2001 post_link_nir[stage] = library->shaders[stage].nir;
2002 keys[stage] = library->shaders[stage].key;
2003 }
2004
2005 if (!shaders[stage] && library->base.shaders[stage]) {
2006 shaders[stage] = library->base.shaders[stage];
2007 vk_pipeline_cache_object_ref(&shaders[stage]->base);
2008 }
2009 }
2010 }
2011
2012 if (shaders[MESA_SHADER_VERTEX]) {
2013 const struct ir3_shader_variant *vs =
2014 shaders[MESA_SHADER_VERTEX]->variant;
2015
2016 if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
2017 tu_append_executable(pipeline, vs->binning, NULL);
2018 }
2019 }
2020
2021 if (pipeline_contains_all_shader_state(pipeline)) {
2022 /* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
2023 * when compiling all stages, but make sure we don't leak.
2024 */
2025 if (nir_shaders)
2026 vk_pipeline_cache_object_unref(&builder->device->vk,
2027 &nir_shaders->base);
2028 } else {
2029 struct tu_graphics_lib_pipeline *library =
2030 tu_pipeline_to_graphics_lib(pipeline);
2031 library->nir_shaders = nir_shaders;
2032 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2033 stage < ARRAY_SIZE(library->shaders);
2034 stage = (gl_shader_stage) (stage + 1)) {
2035 library->shaders[stage].nir = post_link_nir[stage];
2036 library->shaders[stage].key = keys[stage];
2037 }
2038 }
2039
2040 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2041 stage < ARRAY_SIZE(shaders); stage = (gl_shader_stage) (stage + 1)) {
2042 pipeline->shaders[stage] = shaders[stage];
2043 if (shaders[stage])
2044 pipeline->active_desc_sets |= shaders[stage]->active_desc_sets;
2045 }
2046
2047 pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
2048 if (creation_feedback) {
2049 *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
2050
2051 for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2052 gl_shader_stage s =
2053 vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2054 creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
2055 }
2056 }
2057
2058 return VK_SUCCESS;
2059
2060 fail:
2061 if (nir_shaders)
2062 vk_pipeline_cache_object_unref(&builder->device->vk,
2063 &nir_shaders->base);
2064
2065 return result;
2066 }
2067
2068 static void
2069 tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
2070 struct tu_pipeline *pipeline)
2071 {
2072 const VkPipelineLibraryCreateInfoKHR *library_info =
2073 vk_find_struct_const(builder->create_info->pNext,
2074 PIPELINE_LIBRARY_CREATE_INFO_KHR);
2075
2076 if (library_info) {
2077 assert(library_info->libraryCount <= MAX_LIBRARIES);
2078 builder->num_libraries = library_info->libraryCount;
2079 for (unsigned i = 0; i < library_info->libraryCount; i++) {
2080 VK_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
2081 builder->libraries[i] = tu_pipeline_to_graphics_lib(library);
2082 }
2083 }
2084
2085 /* Merge in the state from libraries. The program state is a bit special
2086 * and is handled separately.
2087 */
2088 if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2089 tu_pipeline_to_graphics_lib(pipeline)->state = builder->state;
2090 for (unsigned i = 0; i < builder->num_libraries; i++) {
2091 struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2092 if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2093 tu_pipeline_to_graphics_lib(pipeline)->state |= library->state;
2094
2095 if (library->state &
2096 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
2097 pipeline->output = library->base.output;
2098 pipeline->lrz_blend.reads_dest |= library->base.lrz_blend.reads_dest;
2099 pipeline->lrz_blend.valid |= library->base.lrz_blend.valid;
2100 pipeline->prim_order = library->base.prim_order;
2101 }
2102
2103 if ((library->state &
2104 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
2105 (library->state &
2106 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
2107 pipeline->prim_order = library->base.prim_order;
2108 }
2109
2110 pipeline->set_state_mask |= library->base.set_state_mask;
2111
2112 u_foreach_bit (i, library->base.set_state_mask) {
2113 pipeline->dynamic_state[i] = library->base.dynamic_state[i];
2114 }
2115
2116 if (contains_all_shader_state(library->state)) {
2117 pipeline->program = library->base.program;
2118 pipeline->load_state = library->base.load_state;
2119 for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
2120 if (library->base.shaders[i]) {
2121 pipeline->shaders[i] = library->base.shaders[i];
2122 vk_pipeline_cache_object_ref(&pipeline->shaders[i]->base);
2123 }
2124 }
2125 }
2126
2127 BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask,
2128 library->base.static_state_mask);
2129
2130 vk_graphics_pipeline_state_merge(&builder->graphics_state,
2131 &library->graphics_state);
2132 }
2133 }
2134
2135 static void
2136 tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
2137 struct tu_pipeline *pipeline)
2138 {
2139 VK_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
2140
2141 if (layout) {
2142 /* Note: it's still valid to have a layout even if there are libraries.
2143 * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
2144 * a non-INDEPENDENT_SET layout, which may let us take a faster path;
2145 * currently this only affects dynamic offset descriptors.
2146 */
2147 builder->layout = *layout;
2148 } else {
2149 for (unsigned i = 0; i < builder->num_libraries; i++) {
2150 struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2151 builder->layout.num_sets = MAX2(builder->layout.num_sets,
2152 library->num_sets);
2153 assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
2154 for (unsigned j = 0; j < library->num_sets; j++) {
2155 builder->layout.set[j].layout = library->layouts[j];
2156 }
2157
2158 builder->layout.push_constant_size = library->push_constant_size;
2159 }
2160
2161 tu_pipeline_layout_init(&builder->layout);
2162 }
2163
2164 if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
2165 struct tu_graphics_lib_pipeline *library =
2166 tu_pipeline_to_graphics_lib(pipeline);
2167 library->num_sets = builder->layout.num_sets;
2168 for (unsigned i = 0; i < library->num_sets; i++) {
2169 library->layouts[i] = builder->layout.set[i].layout;
2170 if (library->layouts[i])
2171 vk_descriptor_set_layout_ref(&library->layouts[i]->vk);
2172 }
2173 library->push_constant_size = builder->layout.push_constant_size;
2174 }
2175 }
2176
2177 static void
2178 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2179 struct tu_const_state *const_state,
2180 const struct ir3_shader_variant *v)
2181 {
2182 link->const_state = *ir3_const_state(v);
2183 link->tu_const_state = *const_state;
2184 link->constlen = v->constlen;
2185 }
2186
2187 template <chip CHIP>
2188 static void
2189 tu_emit_program_state(struct tu_cs *sub_cs,
2190 struct tu_program_state *prog,
2191 struct tu_shader **shaders)
2192 {
2193 struct tu_device *dev = sub_cs->device;
2194 struct tu_cs prog_cs;
2195
2196 const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
2197 struct tu_draw_state draw_states[MESA_SHADER_STAGES];
2198
2199 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2200 stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2201 variants[stage] = shaders[stage] ? shaders[stage]->variant : NULL;
2202 }
2203
2204 uint32_t safe_variants =
2205 ir3_trim_constlen(variants, dev->compiler);
2206
2207 unsigned dynamic_descriptor_sizes[MAX_SETS] = { };
2208
2209 for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2210 stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2211 if (shaders[stage]) {
2212 if (safe_variants & (1u << stage)) {
2213 variants[stage] = shaders[stage]->safe_const_variant;
2214 draw_states[stage] = shaders[stage]->safe_const_state;
2215 } else {
2216 draw_states[stage] = shaders[stage]->state;
2217 }
2218
2219 for (unsigned i = 0; i < MAX_SETS; i++) {
2220 if (shaders[stage]->dynamic_descriptor_sizes[i] >= 0) {
2221 dynamic_descriptor_sizes[i] =
2222 shaders[stage]->dynamic_descriptor_sizes[i];
2223 }
2224 }
2225 }
2226 }
2227
2228 for (unsigned i = 0; i < ARRAY_SIZE(variants); i++) {
2229 if (!variants[i])
2230 continue;
2231
2232 tu_pipeline_set_linkage(&prog->link[i],
2233 &shaders[i]->const_state,
2234 variants[i]);
2235
2236 struct tu_push_constant_range *push_consts =
2237 &shaders[i]->const_state.push_consts;
2238 if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
2239 push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
2240 prog->shared_consts = *push_consts;
2241 }
2242 }
2243
2244 unsigned dynamic_descriptor_offset = 0;
2245 for (unsigned i = 0; i < MAX_SETS; i++) {
2246 prog->dynamic_descriptor_offsets[i] = dynamic_descriptor_offset;
2247 dynamic_descriptor_offset += dynamic_descriptor_sizes[i];
2248 }
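/* This is a simple prefix sum: with per-set dynamic descriptor sizes of, say,
 * { 32, 0, 64 } the resulting offsets are { 0, 32, 32 }, i.e. each set's
 * dynamic descriptors start right after those of all lower-numbered sets.
 */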
2249
2250 /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
2251 * else that could depend on that state (like push constants)
2252 *
2253 * Note also that this always uses the full VS even in binning pass. The
2254 * binning pass variant has the same const layout as the full VS, and
2255 * the constlen for the VS will be the same or greater than the constlen
2256 * for the binning pass variant. It is required that the constlen state
2257 * matches between binning and draw passes, as some parts of the push
2258 * consts are emitted in state groups that are shared between the binning
2259 * and draw passes.
2260 */
2261 tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2262 tu6_emit_program_config<CHIP>(&prog_cs, prog, shaders, variants);
2263 prog->config_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2264
2265 prog->vs_state = draw_states[MESA_SHADER_VERTEX];
2266
2267 /* Don't use the binning pass variant when GS is present because we don't
2268 * support compiling correct binning pass variants with GS.
2269 */
2270 if (variants[MESA_SHADER_GEOMETRY]) {
2271 prog->vs_binning_state = prog->vs_state;
2272 } else {
2273 prog->vs_binning_state =
2274 shaders[MESA_SHADER_VERTEX]->binning_state;
2275 }
2276
2277 prog->hs_state = draw_states[MESA_SHADER_TESS_CTRL];
2278 prog->ds_state = draw_states[MESA_SHADER_TESS_EVAL];
2279 prog->gs_state = draw_states[MESA_SHADER_GEOMETRY];
2280 prog->gs_binning_state =
2281 shaders[MESA_SHADER_GEOMETRY]->binning_state;
2282 prog->fs_state = draw_states[MESA_SHADER_FRAGMENT];
2283
2284 const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
2285 const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
2286 const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
2287 const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
2288 const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
2289
2290 tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2291 tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
2292 prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2293
2294 const struct ir3_shader_variant *last_shader;
2295 if (gs)
2296 last_shader = gs;
2297 else if (ds)
2298 last_shader = ds;
2299 else
2300 last_shader = vs;
2301
2302 prog->per_view_viewport =
2303 !last_shader->writes_viewport &&
2304 shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm &&
2305 dev->physical_device->info->a6xx.has_per_view_viewport;
2306 prog->writes_shading_rate = last_shader->writes_shading_rate;
2307 prog->reads_shading_rate = fs->reads_shading_rate;
2308 prog->accesses_smask = fs->reads_smask || fs->writes_smask;
2309 }
2310
2311 static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = {
2312 MESA_VK_DYNAMIC_VI,
2313 };
2314
2315 template <chip CHIP>
2316 static unsigned
2317 tu6_vertex_input_size(struct tu_device *dev,
2318 const struct vk_vertex_input_state *vi)
2319 {
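/* Worst-case estimate: one pkt4 header dword plus a VFD_DECODE_INSTR/
 * VFD_DECODE_STEP_RATE pair per attribute slot, matching what
 * tu6_emit_vertex_input() writes below (which may emit nothing at all when no
 * attributes are valid).
 */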
2320 return 1 + 2 * util_last_bit(vi->attributes_valid);
2321 }
2322
2323 template <chip CHIP>
2324 static void
2325 tu6_emit_vertex_input(struct tu_cs *cs,
2326 const struct vk_vertex_input_state *vi)
2327 {
2328 unsigned attr_count = util_last_bit(vi->attributes_valid);
2329 if (attr_count != 0)
2330 tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2);
2331
2332 for (uint32_t loc = 0; loc < attr_count; loc++) {
2333 const struct vk_vertex_attribute_state *attr = &vi->attributes[loc];
2334
2335 if (vi->attributes_valid & (1u << loc)) {
2336 const struct vk_vertex_binding_state *binding =
2337 &vi->bindings[attr->binding];
2338
2339 enum pipe_format pipe_format = vk_format_to_pipe_format(attr->format);
2340 const struct tu_native_format format = tu6_format_vtx(pipe_format);
2341 tu_cs_emit(cs, A6XX_VFD_DECODE_INSTR(0,
2342 .idx = attr->binding,
2343 .offset = attr->offset,
2344 .instanced = binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2345 .format = format.fmt,
2346 .swap = format.swap,
2347 .unk30 = 1,
2348 ._float = !util_format_is_pure_integer(pipe_format)).value);
2349 tu_cs_emit(cs, A6XX_VFD_DECODE_STEP_RATE(0, binding->divisor).value);
2350 } else {
2351 tu_cs_emit(cs, 0);
2352 tu_cs_emit(cs, 0);
2353 }
2354 }
2355 }
2356
2357 static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = {
2358 MESA_VK_DYNAMIC_VI_BINDINGS_VALID,
2359 MESA_VK_DYNAMIC_VI_BINDING_STRIDES,
2360 };
2361
2362 template <chip CHIP>
2363 static unsigned
2364 tu6_vertex_stride_size(struct tu_device *dev,
2365 const struct vk_vertex_input_state *vi)
2366 {
2367 return 1 + 2 * util_last_bit(vi->bindings_valid);
2368 }
2369
2370 template <chip CHIP>
2371 static void
2372 tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi)
2373 {
2374 if (vi->bindings_valid) {
2375 unsigned bindings_count = util_last_bit(vi->bindings_valid);
2376 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2377 for (unsigned i = 0; i < bindings_count; i++) {
2378 tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2379 tu_cs_emit(cs, vi->bindings[i].stride);
2380 }
2381 }
2382 }
2383
2384 template <chip CHIP>
2385 static unsigned
2386 tu6_vertex_stride_size_dyn(struct tu_device *dev,
2387 const uint16_t *vi_binding_stride,
2388 uint32_t bindings_valid)
2389 {
2390 return 1 + 2 * util_last_bit(bindings_valid);
2391 }
2392
2393 template <chip CHIP>
2394 static void
2395 tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride,
2396 uint32_t bindings_valid)
2397 {
2398 if (bindings_valid) {
2399 unsigned bindings_count = util_last_bit(bindings_valid);
2400 tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2401 for (unsigned i = 0; i < bindings_count; i++) {
2402 tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2403 tu_cs_emit(cs, vi_binding_stride[i]);
2404 }
2405 }
2406 }
2407
2408 static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = {
2409 MESA_VK_DYNAMIC_VP_VIEWPORTS,
2410 MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT,
2411 MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
2412 MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
2413 };
2414
2415 template <chip CHIP>
2416 static unsigned
2417 tu6_viewport_size(struct tu_device *dev,
2418 const struct vk_viewport_state *vp,
2419 const struct vk_rasterization_state *rs)
2420 {
2421 return 1 + vp->viewport_count * 6 + 1 + vp->viewport_count * 2 +
2422 1 + vp->viewport_count * 2 + 5;
2423 }
2424
2425 template <chip CHIP>
2426 static void
2427 tu6_emit_viewport(struct tu_cs *cs,
2428 const struct vk_viewport_state *vp,
2429 const struct vk_rasterization_state *rs)
2430 {
2431 VkExtent2D guardband = {511, 511};
2432
2433 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), vp->viewport_count * 6);
2434 for (uint32_t i = 0; i < vp->viewport_count; i++) {
2435 const VkViewport *viewport = &vp->viewports[i];
2436 float offsets[3];
2437 float scales[3];
2438 scales[0] = viewport->width / 2.0f;
2439 scales[1] = viewport->height / 2.0f;
2440 if (vp->depth_clip_negative_one_to_one) {
2441 scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
2442 } else {
2443 scales[2] = viewport->maxDepth - viewport->minDepth;
2444 }
2445
2446 offsets[0] = viewport->x + scales[0];
2447 offsets[1] = viewport->y + scales[1];
2448 if (vp->depth_clip_negative_one_to_one) {
2449 offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
2450 } else {
2451 offsets[2] = viewport->minDepth;
2452 }
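/* For example, a viewport of x=0, y=0, width=1920, height=1080 yields scales
 * of (960, 540) and offsets of (960, 540), mapping NDC x in [-1, 1] to window
 * x in [0, 1920] as expected for the viewport transform.
 */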
2453
2454 for (uint32_t j = 0; j < 3; j++) {
2455 tu_cs_emit(cs, fui(offsets[j]));
2456 tu_cs_emit(cs, fui(scales[j]));
2457 }
2458
2459 guardband.width =
2460 MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
2461 guardband.height =
2462 MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
2463 }
2464
2465 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), vp->viewport_count * 2);
2466 for (uint32_t i = 0; i < vp->viewport_count; i++) {
2467 const VkViewport *viewport = &vp->viewports[i];
2468 VkOffset2D min;
2469 VkOffset2D max;
2470 min.x = (int32_t) viewport->x;
2471 max.x = (int32_t) ceilf(viewport->x + viewport->width);
2472 if (viewport->height >= 0.0f) {
2473 min.y = (int32_t) viewport->y;
2474 max.y = (int32_t) ceilf(viewport->y + viewport->height);
2475 } else {
2476 min.y = (int32_t)(viewport->y + viewport->height);
2477 max.y = (int32_t) ceilf(viewport->y);
2478 }
2479 /* the spec allows viewport->height to be 0.0f */
2480 if (min.y == max.y)
2481 max.y++;
2482 /* allow viewport->width = 0.0f for un-initialized viewports: */
2483 if (min.x == max.x)
2484 max.x++;
2485
2486 min.x = MAX2(min.x, 0);
2487 min.y = MAX2(min.y, 0);
2488 max.x = MAX2(max.x, 1);
2489 max.y = MAX2(max.y, 1);
2490
2491 assert(min.x < max.x);
2492 assert(min.y < max.y);
2493
2494 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
2495 A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
2496 tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
2497 A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
2498 }
2499
2500 /* A7XX+ doesn't clamp to [0,1] with disabled depth clamp, to support
2501 * VK_EXT_depth_clamp_zero_one we have to always enable clamp and manually
2502 * set range to [0,1] when rs->depth_clamp_enable is false.
2503 */
2504 bool zero_one_depth_clamp = CHIP >= A7XX && !rs->depth_clamp_enable;
2505
2506 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), vp->viewport_count * 2);
2507 for (uint32_t i = 0; i < vp->viewport_count; i++) {
2508 const VkViewport *viewport = &vp->viewports[i];
2509 if (zero_one_depth_clamp) {
2510 tu_cs_emit(cs, fui(0.0f));
2511 tu_cs_emit(cs, fui(1.0f));
2512 } else {
2513 tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
2514 tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
2515 }
2516 }
2517 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
2518 tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
2519 A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
2520
2521 /* TODO: what to do about this with multiple viewports? */
2522 float z_clamp_min = vp->viewport_count ? MIN2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2523 float z_clamp_max = vp->viewport_count ? MAX2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2524 if (zero_one_depth_clamp) {
2525 z_clamp_min = 0.0f;
2526 z_clamp_max = 1.0f;
2527 }
2528
2529 tu_cs_emit_regs(cs,
2530 A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
2531 A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
2532 }
2533
2534 struct apply_viewport_state {
2535 struct vk_viewport_state vp;
2536 struct vk_rasterization_state rs;
2537 bool share_scale;
2538 };
2539
2540 /* It's a hardware restriction that the window offset (i.e. bin.offset) must
2541 * be the same for all views. This means that GMEM coordinates cannot be a
2542 * simple scaling of framebuffer coordinates, because this would require us to
2543 * scale the window offset and the scale may be different per view. Instead we
2544 * have to apply a per-bin offset to the GMEM coordinate transform to make
2545 * sure that the window offset maps to itself. Specifically we need an offset
2546 * o to the transform:
2547 *
2548 * x' = s * x + o
2549 *
2550 * so that when we plug in the bin start b_s:
2551 *
2552 * b_s = s * b_s + o
2553 *
2554 * and we get:
2555 *
2556 * o = b_s - s * b_s
2557 *
2558 * We use this form exactly, because we know the bin offset is a multiple of
2559 * the frag area so s * b_s is an integer and we can compute an exact result
2560 * easily.
2561 */
2562
2563 VkOffset2D
2564 tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
2565 {
2566 assert(bin.offset.x % frag_area.width == 0);
2567 assert(bin.offset.y % frag_area.height == 0);
2568
2569 return (VkOffset2D) {
2570 bin.offset.x - bin.offset.x / frag_area.width,
2571 bin.offset.y - bin.offset.y / frag_area.height
2572 };
2573 }
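/* For example, with a 2x2 fragment area the scale s is 1/2 per axis, so a bin
 * starting at (64, 32) gets an offset of (64 - 64/2, 32 - 32/2) = (32, 16).
 * Plugging the bin start back into x' = s * x + o returns (64, 32), i.e. the
 * window offset maps to itself as required by the derivation above.
 */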
2574
2575 static void
2576 fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2577 VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2578 {
2579 const struct apply_viewport_state *state =
2580 (const struct apply_viewport_state *)data;
2581
2582 struct vk_viewport_state vp = state->vp;
2583
2584 for (unsigned i = 0; i < state->vp.viewport_count; i++) {
2585 /* Note: If we're using shared scaling, the scale should already be the
2586 * same across all views, so we can pick any view. However, the number
2587 * of viewports and the number of views are not guaranteed to be the
2588 * same, so to be safe we pick the 0'th view, which always exists.
2589 *
2590 * Conversely, if we're not using shared scaling then the rasterizer in
2591 * the original pipeline is using only the first viewport, so we need to
2592 * replicate it across all viewports.
2593 */
2594 VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2595 VkViewport viewport =
2596 state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
2597 if (frag_area.width == 1 && frag_area.height == 1) {
2598 vp.viewports[i] = viewport;
2599 continue;
2600 }
2601
2602 float scale_x = (float) 1.0f / frag_area.width;
2603 float scale_y = (float) 1.0f / frag_area.height;
2604
2605 vp.viewports[i].minDepth = viewport.minDepth;
2606 vp.viewports[i].maxDepth = viewport.maxDepth;
2607 vp.viewports[i].width = viewport.width * scale_x;
2608 vp.viewports[i].height = viewport.height * scale_y;
2609
2610 VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2611
2612 vp.viewports[i].x = scale_x * viewport.x + offset.x;
2613 vp.viewports[i].y = scale_y * viewport.y + offset.y;
2614 }
2615
2616 TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp, &state->rs);
2617 }
2618
2619 static void
2620 tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2621 const struct vk_viewport_state *vp,
2622 const struct vk_rasterization_state *rs)
2623 {
2624 unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2625 struct apply_viewport_state state = {
2626 .vp = *vp,
2627 .rs = *rs,
2628 .share_scale = !cmd->state.per_view_viewport,
2629 };
2630 if (!state.share_scale)
2631 state.vp.viewport_count = num_views;
2632 unsigned size = TU_CALLX(cmd->device, tu6_viewport_size)(cmd->device, &state.vp, &state.rs);
2633 tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2634 tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_viewports, state);
2635 }
2636
2637 static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = {
2638 MESA_VK_DYNAMIC_VP_SCISSORS,
2639 MESA_VK_DYNAMIC_VP_SCISSOR_COUNT,
2640 };
2641
2642 template <chip CHIP>
2643 static unsigned
2644 tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp)
2645 {
2646 return 1 + vp->scissor_count * 2;
2647 }
2648
2649 template <chip CHIP>
2650 void
2651 tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
2652 {
2653 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), vp->scissor_count * 2);
2654
2655 for (uint32_t i = 0; i < vp->scissor_count; i++) {
2656 const VkRect2D *scissor = &vp->scissors[i];
2657
2658 uint32_t min_x = scissor->offset.x;
2659 uint32_t min_y = scissor->offset.y;
2660 uint32_t max_x = min_x + scissor->extent.width - 1;
2661 uint32_t max_y = min_y + scissor->extent.height - 1;
2662
2663 if (!scissor->extent.width || !scissor->extent.height) {
2664 min_x = min_y = 1;
2665 max_x = max_y = 0;
2666 } else {
2667 /* avoid overflow */
2668 uint32_t scissor_max = BITFIELD_MASK(15);
2669 min_x = MIN2(scissor_max, min_x);
2670 min_y = MIN2(scissor_max, min_y);
2671 max_x = MIN2(scissor_max, max_x);
2672 max_y = MIN2(scissor_max, max_y);
2673 }
2674
2675 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
2676 A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
2677 tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
2678 A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
2679 }
2680 }
2681
2682 static void
2683 fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2684 VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2685 {
2686 const struct apply_viewport_state *state =
2687 (const struct apply_viewport_state *)data;
2688
2689 struct vk_viewport_state vp = state->vp;
2690
2691 for (unsigned i = 0; i < vp.scissor_count; i++) {
2692 VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2693 VkRect2D scissor =
2694 state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
2695 if (frag_area.width == 1 && frag_area.height == 1) {
2696 vp.scissors[i] = scissor;
2697 continue;
2698 }
2699
2700 /* Transform the scissor following the viewport. It's unclear how this
2701 * is supposed to handle cases where the scissor isn't aligned to the
2702 * fragment area, but we round outwards to always render partial
2703 * fragments if the scissor size equals the framebuffer size and it
2704 * isn't aligned to the fragment area.
2705 */
2706 VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2707 VkOffset2D min = {
2708 scissor.offset.x / frag_area.width + offset.x,
2709 scissor.offset.y / frag_area.height + offset.y,
2710 };
2711 VkOffset2D max = {
2712 DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
2713 DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
2714 };
2715
2716 /* Intersect scissor with the scaled bin, this essentially replaces the
2717 * window scissor.
2718 */
2719 uint32_t scaled_width = bin.extent.width / frag_area.width;
2720 uint32_t scaled_height = bin.extent.height / frag_area.height;
2721 vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
2722 vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
2723 vp.scissors[i].extent.width =
2724 MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
2725 vp.scissors[i].extent.height =
2726 MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
2727 }
2728
2729 TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
2730 }
2731
2732 static void
2733 tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2734 const struct vk_viewport_state *vp)
2735 {
2736 unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2737 struct apply_viewport_state state = {
2738 .vp = *vp,
2739 .share_scale = !cmd->state.per_view_viewport,
2740 };
2741 if (!state.share_scale)
2742 state.vp.scissor_count = num_views;
2743 unsigned size = TU_CALLX(cmd->device, tu6_scissor_size)(cmd->device, &state.vp);
2744 tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2745 tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_scissors, state);
2746 }
2747
2748 static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = {
2749 MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE,
2750 MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS,
2751 };
2752
2753 template <chip CHIP>
2754 static unsigned
2755 tu6_sample_locations_size(struct tu_device *dev, bool enable,
2756 const struct vk_sample_locations_state *samp_loc)
2757 {
2758 return 6 + (enable ? 9 : 0);
2759 }
2760
2761 template <chip CHIP>
2762 void
2763 tu6_emit_sample_locations(struct tu_cs *cs, bool enable,
2764 const struct vk_sample_locations_state *samp_loc)
2765 {
2766 uint32_t sample_config =
2767 COND(enable, A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE);
2768
2769 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
2770 tu_cs_emit(cs, sample_config);
2771
2772 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
2773 tu_cs_emit(cs, sample_config);
2774
2775 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
2776 tu_cs_emit(cs, sample_config);
2777
2778 if (!enable)
2779 return;
2780
2781 assert(samp_loc->grid_size.width == 1);
2782 assert(samp_loc->grid_size.height == 1);
2783
2784 uint64_t sample_locations = 0;
2785 for (uint32_t i = 0; i < samp_loc->per_pixel; i++) {
2786 /* From VkSampleLocationEXT:
2787 *
2788 * The values specified in a VkSampleLocationEXT structure are always
2789 * clamped to the implementation-dependent sample location coordinate
2790 * range
2791 * [sampleLocationCoordinateRange[0],sampleLocationCoordinateRange[1]]
2792 */
2793 float x = CLAMP(samp_loc->locations[i].x, SAMPLE_LOCATION_MIN,
2794 SAMPLE_LOCATION_MAX);
2795 float y = CLAMP(samp_loc->locations[i].y, SAMPLE_LOCATION_MIN,
2796 SAMPLE_LOCATION_MAX);
2797
2798 sample_locations |=
2799 ((uint64_t)(A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
2800 A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y))) << i*8;
2801 }
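/* Each sample's X/Y pair occupies one byte of the packed 64-bit value (hence
 * the << i*8), and the same qword is written to the GRAS, RB and SP_TP copies
 * of the SAMPLE_LOCATION registers below, which presumably have to agree.
 */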
2802
2803 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_LOCATION_0, 2);
2804 tu_cs_emit_qw(cs, sample_locations);
2805
2806 tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_LOCATION_0, 2);
2807 tu_cs_emit_qw(cs, sample_locations);
2808
2809 tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_LOCATION_0, 2);
2810 tu_cs_emit_qw(cs, sample_locations);
2811 }
2812
2813 static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = {
2814 MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS,
2815 };
2816
2817 template <chip CHIP>
2818 static unsigned
2819 tu6_depth_bias_size(struct tu_device *dev,
2820 const struct vk_rasterization_state *rs)
2821 {
2822 return 4;
2823 }
2824
2825 template <chip CHIP>
2826 void
2827 tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs)
2828 {
2829 tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2830 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(rs->depth_bias.slope_factor).value);
2831 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(rs->depth_bias.constant_factor).value);
2832 tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(rs->depth_bias.clamp).value);
2833 }
2834
2835 static const enum mesa_vk_dynamic_graphics_state tu_bandwidth_state[] = {
2836 MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2837 MESA_VK_DYNAMIC_CB_LOGIC_OP,
2838 MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2839 MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2840 MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2841 MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2842 };
2843
2844 static void
2845 tu_calc_bandwidth(struct tu_bandwidth *bandwidth,
2846 const struct vk_color_blend_state *cb,
2847 const struct vk_render_pass_state *rp)
2848 {
2849 bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2850
2851 uint32_t total_bpp = 0;
2852 for (unsigned i = 0; i < cb->attachment_count; i++) {
2853 const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2854 if (!(cb->color_write_enables & (1u << i)))
2855 continue;
2856
2857 const VkFormat format = rp->color_attachment_formats[i];
2858
2859 uint32_t write_bpp = 0;
2860 if (format == VK_FORMAT_UNDEFINED) {
2861 /* do nothing */
2862 } else if (att->write_mask == 0xf) {
2863 write_bpp = vk_format_get_blocksizebits(format);
2864 } else {
2865 const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
2866 for (uint32_t i = 0; i < 4; i++) {
2867 if (att->write_mask & (1 << i)) {
2868 write_bpp += util_format_get_component_bits(pipe_format,
2869 UTIL_FORMAT_COLORSPACE_RGB, i);
2870 }
2871 }
2872 }
2873 total_bpp += write_bpp;
2874
2875 if (rop_reads_dst || att->blend_enable) {
2876 total_bpp += write_bpp;
2877 }
2878 }
2879
2880 bandwidth->color_bandwidth_per_sample = total_bpp / 8;
2881
2882 if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
2883 bandwidth->depth_cpp_per_sample = util_format_get_component_bits(
2884 vk_format_to_pipe_format(rp->depth_attachment_format),
2885 UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
2886 }
2887
2888 if (rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
2889 bandwidth->stencil_cpp_per_sample = util_format_get_component_bits(
2890 vk_format_to_pipe_format(rp->stencil_attachment_format),
2891 UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
2892 }
2893 }
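/* For example, a single VK_FORMAT_R8G8B8A8_UNORM attachment with all channels
 * write-enabled contributes 32 bits per sample; if blending (or a dst-reading
 * logic op) is enabled the destination is read as well, doubling that to 64
 * bits, so color_bandwidth_per_sample comes out to 8 bytes.
 */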
2894
2895 /* Return true if the blend state reads the color attachments. */
2896 static bool
2897 tu6_calc_blend_lrz(const struct vk_color_blend_state *cb,
2898 const struct vk_render_pass_state *rp)
2899 {
2900 if (cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op))
2901 return true;
2902
2903 for (unsigned i = 0; i < cb->attachment_count; i++) {
2904 if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
2905 continue;
2906
2907 const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2908 if (att->blend_enable)
2909 return true;
2910 if (!(cb->color_write_enables & (1u << i)))
2911 return true;
2912 unsigned mask =
2913 MASK(vk_format_get_nr_components(rp->color_attachment_formats[i]));
2914 if ((att->write_mask & mask) != mask)
2915 return true;
2916 }
2917
2918 return false;
2919 }
2920
2921 static const enum mesa_vk_dynamic_graphics_state tu_blend_lrz_state[] = {
2922 MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2923 MESA_VK_DYNAMIC_CB_LOGIC_OP,
2924 MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2925 MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2926 MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2927 MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2928 };
2929
2930 static void
2931 tu_emit_blend_lrz(struct tu_lrz_blend *lrz,
2932 const struct vk_color_blend_state *cb,
2933 const struct vk_render_pass_state *rp)
2934 {
2935 lrz->reads_dest = tu6_calc_blend_lrz(cb, rp);
2936 lrz->valid = true;
2937 }
2938
2939 static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = {
2940 MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2941 MESA_VK_DYNAMIC_CB_LOGIC_OP,
2942 MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2943 MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2944 MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2945 MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS,
2946 MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2947 MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
2948 MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE,
2949 MESA_VK_DYNAMIC_MS_SAMPLE_MASK,
2950 MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP,
2951 };
2952
2953 template <chip CHIP>
2954 static unsigned
2955 tu6_blend_size(struct tu_device *dev,
2956 const struct vk_color_blend_state *cb,
2957 const struct vk_color_attachment_location_state *cal,
2958 bool alpha_to_coverage_enable,
2959 bool alpha_to_one_enable,
2960 uint32_t sample_mask)
2961 {
2962 unsigned num_rts = alpha_to_coverage_enable ?
2963 MAX2(cb->attachment_count, 1) : cb->attachment_count;
2964 return 8 + 3 * num_rts;
2965 }
2966
2967 template <chip CHIP>
2968 static void
2969 tu6_emit_blend(struct tu_cs *cs,
2970 const struct vk_color_blend_state *cb,
2971 const struct vk_color_attachment_location_state *cal,
2972 bool alpha_to_coverage_enable,
2973 bool alpha_to_one_enable,
2974 uint32_t sample_mask)
2975 {
2976 bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2977 enum a3xx_rop_code rop = tu6_rop((VkLogicOp)cb->logic_op);
2978
2979 uint32_t blend_enable_mask = 0;
2980 for (unsigned i = 0; i < cb->attachment_count; i++) {
2981 if (!(cb->color_write_enables & (1u << i)) ||
2982 cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
2983 continue;
2984
2985 const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2986
2987 if (rop_reads_dst || att->blend_enable) {
2988 blend_enable_mask |= 1u << cal->color_map[i];
2989 }
2990 }
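/* Note that the enable mask is indexed by the remapped MRT slot
 * (cal->color_map[i]) rather than by the original attachment index, so it
 * lines up with the RB_MRT_* registers emitted further down.
 */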
2991
2992 /* This will emit a dummy RB_MRT_*_CONTROL below if alpha-to-coverage is
2993 * enabled but there are no color attachments, in addition to changing
2994 * *_FS_OUTPUT_CNTL1.
2995 */
2996 unsigned num_rts = alpha_to_coverage_enable ?
2997 MAX2(cb->attachment_count, 1) : cb->attachment_count;
2998
2999 bool dual_src_blend = tu_blend_state_is_dual_src(cb);
3000
3001 tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = num_rts));
3002 tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = num_rts));
3003 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
3004 .unk8 = true,
3005 .dual_color_in_enable =
3006 dual_src_blend,
3007 .alpha_to_coverage =
3008 alpha_to_coverage_enable));
3009 /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
3010 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
3011 .independent_blend = true,
3012 .dual_color_in_enable =
3013 dual_src_blend,
3014 .alpha_to_coverage =
3015 alpha_to_coverage_enable,
3016 .alpha_to_one = alpha_to_one_enable,
3017 .sample_mask = sample_mask));
3018
3019 for (unsigned i = 0; i < num_rts; i++) {
3020 if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
3021 continue;
3022 unsigned remapped_idx = cal->color_map[i];
3023 const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
3024 if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
3025 const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
3026 const enum adreno_rb_blend_factor src_color_factor =
3027 tu6_blend_factor((VkBlendFactor)att->src_color_blend_factor);
3028 const enum adreno_rb_blend_factor dst_color_factor =
3029 tu6_blend_factor((VkBlendFactor)att->dst_color_blend_factor);
3030 const enum a3xx_rb_blend_opcode alpha_op =
3031 tu6_blend_op(att->alpha_blend_op);
3032 const enum adreno_rb_blend_factor src_alpha_factor =
3033 tu6_blend_factor((VkBlendFactor)att->src_alpha_blend_factor);
3034 const enum adreno_rb_blend_factor dst_alpha_factor =
3035 tu6_blend_factor((VkBlendFactor)att->dst_alpha_blend_factor);
3036
3037 tu_cs_emit_regs(cs,
3038 A6XX_RB_MRT_CONTROL(remapped_idx,
3039 .blend = att->blend_enable,
3040 .blend2 = att->blend_enable,
3041 .rop_enable = cb->logic_op_enable,
3042 .rop_code = rop,
3043 .component_enable = att->write_mask),
3044 A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,
3045 .rgb_src_factor = src_color_factor,
3046 .rgb_blend_opcode = color_op,
3047 .rgb_dest_factor = dst_color_factor,
3048 .alpha_src_factor = src_alpha_factor,
3049 .alpha_blend_opcode = alpha_op,
3050 .alpha_dest_factor = dst_alpha_factor));
3051 } else {
3052 tu_cs_emit_regs(cs,
3053 A6XX_RB_MRT_CONTROL(remapped_idx,),
3054 A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
3055 }
3056 }
3057 }
3058
3059 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
3060 MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS,
3061 };
3062
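/* The blend constants state is a single CP packet writing the four RGBA
 * float registers starting at RB_BLEND_RED_F32, so tu6_blend_constants_size()
 * returns 5 dwords: one packet header plus four IEEE-754 float payloads
 * (assuming the GREEN/BLUE/ALPHA registers immediately follow RED).
 */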
3063 template <chip CHIP>
3064 static unsigned
3065 tu6_blend_constants_size(struct tu_device *dev,
3066 const struct vk_color_blend_state *cb)
3067 {
3068 return 5;
3069 }
3070
3071 template <chip CHIP>
3072 static void
3073 tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb)
3074 {
3075 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3076 tu_cs_emit_array(cs, (const uint32_t *) cb->blend_constants, 4);
3077 }
3078
3079 static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = {
3080 MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3081 MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE,
3082 MESA_VK_DYNAMIC_RS_POLYGON_MODE,
3083 MESA_VK_DYNAMIC_RS_CULL_MODE,
3084 MESA_VK_DYNAMIC_RS_FRONT_FACE,
3085 MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE,
3086 MESA_VK_DYNAMIC_RS_LINE_MODE,
3087 MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE,
3088 MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM,
3089 MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
3090 MESA_VK_DYNAMIC_RS_LINE_WIDTH,
3091 };
3092
3093 template <chip CHIP>
3094 uint32_t
3095 tu6_rast_size(struct tu_device *dev,
3096 const struct vk_rasterization_state *rs,
3097 const struct vk_viewport_state *vp,
3098 bool multiview,
3099 bool per_view_viewport)
3100 {
3101 if (CHIP == A6XX) {
3102 return 15 + (dev->physical_device->info->a6xx.has_legacy_pipeline_shading_rate ? 8 : 0);
3103 } else {
3104 return 17;
3105 }
3106 }
3107
3108 template <chip CHIP>
3109 void
3110 tu6_emit_rast(struct tu_cs *cs,
3111 const struct vk_rasterization_state *rs,
3112 const struct vk_viewport_state *vp,
3113 bool multiview,
3114 bool per_view_viewport)
3115 {
3116 enum a5xx_line_mode line_mode =
3117 rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR ?
3118 BRESENHAM : RECTANGULAR;
3119 tu_cs_emit_regs(cs,
3120 A6XX_GRAS_SU_CNTL(
3121 .cull_front = rs->cull_mode & VK_CULL_MODE_FRONT_BIT,
3122 .cull_back = rs->cull_mode & VK_CULL_MODE_BACK_BIT,
3123 .front_cw = rs->front_face == VK_FRONT_FACE_CLOCKWISE,
3124 .linehalfwidth = rs->line.width / 2.0f,
3125 .poly_offset = rs->depth_bias.enable,
3126 .line_mode = line_mode,
3127 .multiview_enable = multiview,
3128 .rendertargetindexincr = multiview,
3129 .viewportindexincr = multiview && per_view_viewport));
3130
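/* vk_rasterization_state_depth_clip_enable() folds in
 * VK_EXT_depth_clip_enable: when the app doesn't provide an explicit
 * depthClipEnable, depth clipping defaults to !depthClampEnable (our
 * reading of the common runtime helper).
 */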
3131 bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
3132
3133 tu_cs_emit_regs(cs,
3134 A6XX_GRAS_CL_CNTL(
3135 .znear_clip_disable = !depth_clip_enable,
3136 .zfar_clip_disable = !depth_clip_enable,
3137 /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3138 .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3139 .zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
3140 .vp_clip_code_ignore = 1));
3141
3142 enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
3143
3144 tu_cs_emit_regs(cs,
3145 A6XX_VPC_POLYGON_MODE(polygon_mode));
3146
3147 tu_cs_emit_regs(cs,
3148 PC_POLYGON_MODE(CHIP, polygon_mode));
3149
3150 if (CHIP == A7XX) {
3151 tu_cs_emit_regs(cs,
3152 A7XX_VPC_POLYGON_MODE2(polygon_mode));
3153 }
3154
3155 tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP,
3156 .stream = rs->rasterization_stream,
3157 .discard = rs->rasterizer_discard_enable));
3158 if (CHIP == A6XX) {
3159 tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107(
3160 .raster_discard = rs->rasterizer_discard_enable));
3161 } else {
3162 tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2(
3163 .stream = rs->rasterization_stream,
3164 .discard = rs->rasterizer_discard_enable));
3165 }
3166
3167 /* move to hw ctx init? */
3168 tu_cs_emit_regs(cs,
3169 A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
3170 A6XX_GRAS_SU_POINT_SIZE(1.0f));
3171
3172 if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_legacy_pipeline_shading_rate) {
3173 tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A00());
3174 tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A10());
3175 tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A20());
3176 tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A30());
3177 }
3178 }
3179
3180 static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = {
3181 MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE,
3182 MESA_VK_DYNAMIC_DS_STENCIL_OP,
3183 MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK,
3184 MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK,
3185 MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE,
3186 MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS,
3187 };
3188
3189 template <chip CHIP>
3190 static unsigned
3191 tu6_ds_size(struct tu_device *dev,
3192 const struct vk_depth_stencil_state *ds,
3193 const struct vk_render_pass_state *rp)
3194 {
3195 return 13;
3196 }
3197
3198 template <chip CHIP>
3199 static void
3200 tu6_emit_ds(struct tu_cs *cs,
3201 const struct vk_depth_stencil_state *ds,
3202 const struct vk_render_pass_state *rp)
3203 {
3204 bool stencil_test_enable =
3205 ds->stencil.test_enable && rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
3206 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3207 .stencil_enable = stencil_test_enable,
3208 .stencil_enable_bf = stencil_test_enable,
3209 .stencil_read = stencil_test_enable,
3210 .func = tu6_compare_func((VkCompareOp)ds->stencil.front.op.compare),
3211 .fail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.fail),
3212 .zpass = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.pass),
3213 .zfail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.depth_fail),
3214 .func_bf = tu6_compare_func((VkCompareOp)ds->stencil.back.op.compare),
3215 .fail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.fail),
3216 .zpass_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.pass),
3217 .zfail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.depth_fail)));
3218 tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(stencil_test_enable));
3219
3220 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(
3221 .mask = ds->stencil.front.compare_mask,
3222 .bfmask = ds->stencil.back.compare_mask));
3223
3224 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(
3225 .wrmask = ds->stencil.front.write_mask,
3226 .bfwrmask = ds->stencil.back.write_mask));
3227
3228 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(
3229 .ref = ds->stencil.front.reference,
3230 .bfref = ds->stencil.back.reference));
3231
3232 tu_cs_emit_regs(cs,
3233 A6XX_RB_Z_BOUNDS_MIN(ds->depth.bounds_test.min),
3234 A6XX_RB_Z_BOUNDS_MAX(ds->depth.bounds_test.max));
3235 }
3236
3237 static const enum mesa_vk_dynamic_graphics_state tu_rb_depth_cntl_state[] = {
3238 MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE,
3239 MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE,
3240 MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP,
3241 MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE,
3242 MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3243 };
3244
3245 template <chip CHIP>
3246 static unsigned
3247 tu6_rb_depth_cntl_size(struct tu_device *dev,
3248 const struct vk_depth_stencil_state *ds,
3249 const struct vk_render_pass_state *rp,
3250 const struct vk_rasterization_state *rs)
3251 {
3252 return 4;
3253 }
3254
3255 template <chip CHIP>
3256 static void
3257 tu6_emit_rb_depth_cntl(struct tu_cs *cs,
3258 const struct vk_depth_stencil_state *ds,
3259 const struct vk_render_pass_state *rp,
3260 const struct vk_rasterization_state *rs)
3261 {
3262 if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
3263 bool depth_test = ds->depth.test_enable;
3264 enum adreno_compare_func zfunc = tu6_compare_func(ds->depth.compare_op);
3265
3266 /* On some GPUs it is necessary to enable z test for depth bounds test
3267 * when UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is
3268 * required to pass z test. Relevant tests:
3269 * dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
3270 * dEQP-VK.dynamic_state.ds_state.depth_bounds_1
3271 */
3272 if (ds->depth.bounds_test.enable &&
3273 !ds->depth.test_enable &&
3274 cs->device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk) {
3275 depth_test = true;
3276 zfunc = FUNC_ALWAYS;
3277 }
3278
3279 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3280 .z_test_enable = depth_test,
3281 .z_write_enable = ds->depth.test_enable && ds->depth.write_enable,
3282 .zfunc = zfunc,
3283 /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3284 .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3285 /* TODO don't set for ALWAYS/NEVER */
3286 .z_read_enable = ds->depth.test_enable || ds->depth.bounds_test.enable,
3287 .z_bounds_enable = ds->depth.bounds_test.enable));
3288 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(depth_test));
3289 } else {
3290 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
3291 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
3292 }
3293 }
3294
3295 static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = {
3296 MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE,
3297 };
3298
3299 template <chip CHIP>
3300 static unsigned
3301 tu6_prim_mode_sysmem_size(struct tu_device *dev,
3302 struct tu_shader *fs,
3303 bool raster_order_attachment_access,
3304 VkImageAspectFlags feedback_loops,
3305 bool *sysmem_single_prim_mode)
3306 {
3307 return 2;
3308 }
3309
3310 template <chip CHIP>
3311 static void
3312 tu6_emit_prim_mode_sysmem(struct tu_cs *cs,
3313 struct tu_shader *fs,
3314 bool raster_order_attachment_access,
3315 VkImageAspectFlags feedback_loops,
3316 bool *sysmem_single_prim_mode)
3317 {
3318 /* VK_EXT_rasterization_order_attachment_access:
3319 *
3320 * This extension allows access to framebuffer attachments when used as both
3321 * input and color attachments from one fragment to the next, in
3322 * rasterization order, without explicit synchronization.
3323 */
3324 raster_order_attachment_access |= TU_DEBUG(RAST_ORDER);
3325
3326 /* If there is a feedback loop, then the shader can read the previous value
3327 * of a pixel being written out. It can also write some components and then
3328 * read different components without a barrier in between. This is a
3329 * problem in sysmem mode with UBWC, because the main buffer and flags
3330 * buffer can get out-of-sync if only one is flushed. We fix this by
3331 * setting the SINGLE_PRIM_MODE field to the same value that the blob does
3332 * for advanced_blend in sysmem mode if a feedback loop is detected.
3333 */
3334 enum a6xx_single_prim_mode sysmem_prim_mode =
3335 (raster_order_attachment_access || feedback_loops ||
3336 fs->fs.dynamic_input_attachments_used) ?
3337 FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH;
3338
3339 if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE)
3340 *sysmem_single_prim_mode = true;
3341
3342 tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
3343 .single_prim_mode = sysmem_prim_mode));
3344 }
3345
3346 static const enum mesa_vk_dynamic_graphics_state tu_fragment_shading_rate_state[] = {
3347 MESA_VK_DYNAMIC_FSR,
3348 };
3349
3350 template <chip CHIP>
3351 static unsigned
3352 tu6_fragment_shading_rate_size(struct tu_device *dev,
3353 const vk_fragment_shading_rate_state *fsr,
3354 bool enable_att_fsr,
3355 bool enable_prim_fsr,
3356 bool fs_reads_fsr,
3357 bool sample_shading)
3358 {
3359 return 6;
3360 }
3361
3362 template <chip CHIP>
3363 static void
3364 tu6_emit_fragment_shading_rate(struct tu_cs *cs,
3365 const vk_fragment_shading_rate_state *fsr,
3366 bool enable_att_fsr,
3367 bool enable_prim_fsr,
3368 bool fs_reads_fsr,
3369 bool accesses_smask)
3370 {
3371 /* gl_ShadingRateEXT doesn't read the 1x1 rate from a null config, so
3372 * if it is read we have to emit the config.
3373 */
3374 if (!fsr || (!fs_reads_fsr && vk_fragment_shading_rate_is_disabled(fsr))) {
3375 tu_cs_emit_regs(cs, A6XX_RB_FSR_CONFIG());
3376 tu_cs_emit_regs(cs, A7XX_SP_FSR_CONFIG());
3377 tu_cs_emit_regs(cs, A7XX_GRAS_FSR_CONFIG());
3378 return;
3379 }
3380
3381 uint32_t frag_width = fsr->fragment_size.width;
3382 uint32_t frag_height = fsr->fragment_size.height;
3383
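/* Per VK_KHR_fragment_shading_rate, combinerOps[0] combines the
 * pipeline (draw) rate with the primitive rate, and combinerOps[1] combines
 * that result with the attachment rate. REPLACE and KEEP therefore let us
 * statically drop sources that can't contribute to the final rate.
 */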
3384 bool enable_draw_fsr = true;
3385 if (enable_att_fsr) {
3386 if (fsr->combiner_ops[1] ==
3387 VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
3388 enable_draw_fsr = false;
3389 enable_prim_fsr = false;
3390 } else if (fsr->combiner_ops[1] ==
3391 VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
3392 enable_att_fsr = false;
3393 }
3394 }
3395 if (enable_prim_fsr) {
3396 if (fsr->combiner_ops[0] ==
3397 VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
3398 enable_draw_fsr = false;
3399 } else if (fsr->combiner_ops[0] ==
3400 VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
3401 enable_prim_fsr = false;
3402 }
3403 }
3404
3405 /* Force 1x1 FSR because we don't support
3406 * fragmentShadingRateWithShaderSampleMask.
3407 */
3408 if (accesses_smask) {
3409 enable_att_fsr = enable_prim_fsr = false;
3410 frag_width = frag_height = 1;
3411 enable_draw_fsr = true;
3412 }
3413
3414 tu_cs_emit_regs(
3415 cs,
3416 A6XX_RB_FSR_CONFIG(.unk2 = true, .pipeline_fsr_enable = enable_draw_fsr,
3417 .attachment_fsr_enable = enable_att_fsr,
3418 .primitive_fsr_enable = enable_prim_fsr));
3419 tu_cs_emit_regs(
3420 cs, A7XX_SP_FSR_CONFIG(.pipeline_fsr_enable = enable_draw_fsr,
3421 .attachment_fsr_enable = enable_att_fsr,
3422 .primitive_fsr_enable = enable_prim_fsr));
3423 tu_cs_emit_regs(
3424 cs, A7XX_GRAS_FSR_CONFIG(
3425 .pipeline_fsr_enable = enable_draw_fsr,
3426 .frag_size_x = util_logbase2(frag_width),
3427 .frag_size_y = util_logbase2(frag_height),
3428 .combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
3429 .combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
3430 .attachment_fsr_enable = enable_att_fsr,
3431 .primitive_fsr_enable = enable_prim_fsr));
3432 }
3433
3434
3435 static inline bool
3436 emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
3437 BITSET_WORD *pipeline_set,
3438 const enum mesa_vk_dynamic_graphics_state *state_array,
3439 unsigned num_states, bool extra_cond,
3440 struct tu_pipeline_builder *builder)
3441 {
3442 BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3443
3444 /* Unrolling this loop should produce a constant value once the function is
3445 * inlined, because state_array and num_states are per-draw-state
3446 * constants, but GCC seems to need a little encouragement. clang does a
3447 * little better but still needs a pragma when there are a large number of
3448 * states.
3449 */
3450 #if defined(__clang__)
3451 #pragma clang loop unroll(full)
3452 #elif defined(__GNUC__) && __GNUC__ >= 8
3453 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3454 #endif
3455 for (unsigned i = 0; i < num_states; i++) {
3456 BITSET_SET(state, state_array[i]);
3457 }
3458
3459 /* If all of the state is set, then after we emit it we can tentatively
3460 * remove it from the states to set for the pipeline by making it dynamic.
3461 * If we can't emit it, though, we need to keep around the partial state so
3462 * that we can emit it later, even if another draw state consumes it. That
3463 * is, we have to cancel any tentative removal.
3464 */
3465 BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3466 memcpy(temp, pipeline_set, sizeof(temp));
3467 BITSET_AND(temp, temp, state);
3468 if (!BITSET_EQUAL(temp, state) || !extra_cond) {
3469 __bitset_or(keep, keep, temp, ARRAY_SIZE(temp));
3470 return false;
3471 }
3472 __bitset_or(remove, remove, state, ARRAY_SIZE(state));
3473 return true;
3474 }
3475
3476 template <chip CHIP>
3477 static void
3478 tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
3479 struct tu_pipeline *pipeline)
3480 {
3481 struct tu_cs cs;
3482 BITSET_DECLARE(keep, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3483 BITSET_DECLARE(remove, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3484 BITSET_DECLARE(pipeline_set, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3485
3486 vk_graphics_pipeline_get_state(&builder->graphics_state, pipeline_set);
3487
3488 #define EMIT_STATE(name, extra_cond) \
3489 emit_pipeline_state(keep, remove, pipeline_set, tu_##name##_state, \
3490 ARRAY_SIZE(tu_##name##_state), extra_cond, builder)
3491
3492 #define DRAW_STATE_COND(name, id, extra_cond, ...) \
3493 if (EMIT_STATE(name, extra_cond)) { \
3494 unsigned size = tu6_##name##_size<CHIP>(builder->device, __VA_ARGS__); \
3495 if (size > 0) { \
3496 tu_cs_begin_sub_stream(&pipeline->cs, size, &cs); \
3497 tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
3498 pipeline->dynamic_state[id] = \
3499 tu_cs_end_draw_state(&pipeline->cs, &cs); \
3500 } \
3501 pipeline->set_state_mask |= (1u << id); \
3502 }
3503 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, true, __VA_ARGS__)
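/* For illustration (a rough sketch, not the literal preprocessor output),
 * DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS, rs) expands to
 * roughly:
 *
 *    if (emit_pipeline_state(keep, remove, pipeline_set,
 *                            tu_depth_bias_state,
 *                            ARRAY_SIZE(tu_depth_bias_state), true,
 *                            builder)) {
 *       unsigned size = tu6_depth_bias_size<CHIP>(builder->device, rs);
 *       if (size > 0) {
 *          tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
 *          tu6_emit_depth_bias<CHIP>(&cs, rs);
 *          pipeline->dynamic_state[TU_DYNAMIC_STATE_DEPTH_BIAS] =
 *             tu_cs_end_draw_state(&pipeline->cs, &cs);
 *       }
 *       pipeline->set_state_mask |= (1u << TU_DYNAMIC_STATE_DEPTH_BIAS);
 *    }
 */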
3504
3505 DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3506 builder->graphics_state.vi);
3507 DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3508 builder->graphics_state.vi);
3509 /* If (a) per-view viewport is used or (b) we don't know yet, then we need
3510 * to set viewport and scissor state dynamically.
3511 */
3512 bool no_per_view_viewport = pipeline_contains_all_shader_state(pipeline) &&
3513 !pipeline->program.per_view_viewport;
3514 DRAW_STATE_COND(viewport, TU_DYNAMIC_STATE_VIEWPORT, no_per_view_viewport,
3515 builder->graphics_state.vp,
3516 builder->graphics_state.rs);
3517 DRAW_STATE_COND(scissor, TU_DYNAMIC_STATE_SCISSOR, no_per_view_viewport,
3518 builder->graphics_state.vp);
3519 DRAW_STATE(sample_locations,
3520 TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3521 builder->graphics_state.ms->sample_locations_enable,
3522 builder->graphics_state.ms->sample_locations);
3523 DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3524 builder->graphics_state.rs);
3525 bool attachments_valid =
3526 builder->graphics_state.rp &&
3527 vk_render_pass_state_has_attachment_info(builder->graphics_state.rp);
3528 struct vk_color_blend_state dummy_cb = {};
3529 const struct vk_color_blend_state *cb = builder->graphics_state.cb;
3530 if (attachments_valid &&
3531 !(builder->graphics_state.rp->attachments &
3532 MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3533 /* If there are no color attachments, then the original blend state may
3534 * be NULL and the common code sanitizes it to always be NULL. In this
3535 * case we want to emit an empty blend/bandwidth/etc. rather than
3536 * letting it be dynamic (and potentially garbage).
3537 */
3538 cb = &dummy_cb;
3539 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3540 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3541 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3542 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3543 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3544 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3545 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3546 BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3547 }
3548 DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND, cb,
3549 builder->graphics_state.cal,
3550 builder->graphics_state.ms->alpha_to_coverage_enable,
3551 builder->graphics_state.ms->alpha_to_one_enable,
3552 builder->graphics_state.ms->sample_mask);
3553 if (EMIT_STATE(blend_lrz, attachments_valid))
3554 tu_emit_blend_lrz(&pipeline->lrz_blend, cb,
3555 builder->graphics_state.rp);
3556 if (EMIT_STATE(bandwidth, attachments_valid))
3557 tu_calc_bandwidth(&pipeline->bandwidth, cb,
3558 builder->graphics_state.rp);
3559 DRAW_STATE(blend_constants, TU_DYNAMIC_STATE_BLEND_CONSTANTS, cb);
3560
3561 if (attachments_valid &&
3562 !(builder->graphics_state.rp->attachments &
3563 MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3564 /* Don't actually make anything dynamic as that may mean a partially-set
3565 * state group where the group is NULL which angers common code.
3566 */
3567 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3568 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3569 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3570 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3571 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3572 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3573 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3574 BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3575 }
3576 DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3577 pipeline_contains_all_shader_state(pipeline),
3578 builder->graphics_state.rs,
3579 builder->graphics_state.vp,
3580 builder->graphics_state.rp->view_mask != 0,
3581 pipeline->program.per_view_viewport);
3582 DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3583 attachments_valid,
3584 builder->graphics_state.ds,
3585 builder->graphics_state.rp);
3586 DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3587 attachments_valid,
3588 builder->graphics_state.ds,
3589 builder->graphics_state.rp,
3590 builder->graphics_state.rs);
3591 DRAW_STATE_COND(patch_control_points,
3592 TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3593 pipeline_contains_all_shader_state(pipeline),
3594 pipeline->shaders[MESA_SHADER_VERTEX],
3595 pipeline->shaders[MESA_SHADER_TESS_CTRL],
3596 pipeline->shaders[MESA_SHADER_TESS_EVAL],
3597 &pipeline->program,
3598 builder->graphics_state.ts->patch_control_points);
3599 bool has_raster_order_state = false;
3600 if (pipeline->type == TU_PIPELINE_GRAPHICS) {
3601 has_raster_order_state = true;
3602 } else {
3603 struct tu_graphics_lib_pipeline *lib =
3604 tu_pipeline_to_graphics_lib(pipeline);
3605 has_raster_order_state =
3606 (lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
3607 (lib->state &
3608 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT);
3609 }
3610 if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3611 DRAW_STATE_COND(prim_mode_sysmem,
3612 TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3613 has_raster_order_state,
3614 pipeline->shaders[MESA_SHADER_FRAGMENT],
3615 pipeline->output.raster_order_attachment_access ||
3616 pipeline->ds.raster_order_attachment_access,
3617 vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags),
3618 &pipeline->prim_order.sysmem_single_prim_mode);
3619 }
3620
3621 if (builder->device->physical_device->info->a6xx.has_attachment_shading_rate) {
3622 bool has_fsr_att =
3623 builder->graphics_state.pipeline_flags &
3624 VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
3625 DRAW_STATE_COND(fragment_shading_rate,
3626 TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
3627 attachments_valid && pipeline_contains_all_shader_state(pipeline),
3628 builder->graphics_state.fsr,
3629 has_fsr_att,
3630 pipeline->program.writes_shading_rate,
3631 pipeline->program.reads_shading_rate,
3632 pipeline->program.accesses_smask);
3633 }
3634 #undef DRAW_STATE
3635 #undef DRAW_STATE_COND
3636 #undef EMIT_STATE
3637
3638 /* LRZ always needs depth/stencil state at draw time */
3639 BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
3640 BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
3641 BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
3642 BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP);
3643 BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
3644 BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_OP);
3645 BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
3646 BITSET_SET(keep, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
3647
3648 /* MSAA needs line mode */
3649 BITSET_SET(keep, MESA_VK_DYNAMIC_RS_LINE_MODE);
3650
3651 /* The patch control points value is part of the draw */
3652 BITSET_SET(keep, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS);
3653
3654 /* Vertex buffer state needs to know the max valid binding */
3655 BITSET_SET(keep, MESA_VK_DYNAMIC_VI_BINDINGS_VALID);
3656
3657 /* Remove state which has been emitted and we no longer need to set when
3658 * binding the pipeline by making it "dynamic".
3659 */
3660 BITSET_ANDNOT(remove, remove, keep);
3661
3662 BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask, remove);
3663
3664 BITSET_OR(builder->graphics_state.dynamic, builder->graphics_state.dynamic,
3665 remove);
3666 }
3667
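/* Returns true if any of the dynamic states in state_array are dirty in the
 * command buffer, i.e. the corresponding draw state has to be re-emitted.
 */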
3668 static inline bool
3669 emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state,
3670 const enum mesa_vk_dynamic_graphics_state *state_array,
3671 unsigned num_states)
3672 {
3673 BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3674
3675 /* Unrolling this loop should produce a constant value once the function is
3676 * inlined, because state_array and num_states are per-draw-state
3677 * constants, but GCC seems to need a little encouragement. clang does a
3678 * little better but still needs a pragma when there are a large number of
3679 * states.
3680 */
3681 #if defined(__clang__)
3682 #pragma clang loop unroll(full)
3683 #elif defined(__GNUC__) && __GNUC__ >= 8
3684 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3685 #endif
3686 for (unsigned i = 0; i < num_states; i++) {
3687 BITSET_SET(state, state_array[i]);
3688 }
3689
3690 BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3691 BITSET_AND(temp, state, dynamic_state->dirty);
3692 return !BITSET_IS_EMPTY(temp);
3693 }
3694
3695 template <chip CHIP>
3696 uint32_t
3697 tu_emit_draw_state(struct tu_cmd_buffer *cmd)
3698 {
3699 struct tu_cs cs;
3700 uint32_t dirty_draw_states = 0;
3701
3702 #define EMIT_STATE(name) \
3703 emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state, \
3704 ARRAY_SIZE(tu_##name##_state))
3705 #define DRAW_STATE_COND(name, id, extra_cond, ...) \
3706 if ((EMIT_STATE(name) || (extra_cond)) && \
3707 !(cmd->state.pipeline_draw_states & (1u << id))) { \
3708 unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__); \
3709 if (size > 0) { \
3710 tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \
3711 tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
3712 cmd->state.dynamic_state[id] = \
3713 tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
3714 } else { \
3715 cmd->state.dynamic_state[id] = {}; \
3716 } \
3717 dirty_draw_states |= (1u << id); \
3718 }
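/* Variant for draw states affected by the fragment density map: when the FS
 * uses FDM the state is emitted via tu6_emit_*_fdm() into a writeable
 * sub-stream so it can be patched per bin/view at render time (our
 * understanding of the FDM path); otherwise it falls back to the normal
 * size/emit pair like DRAW_STATE_COND.
 */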
3719 #define DRAW_STATE_FDM(name, id, ...) \
3720 if ((EMIT_STATE(name) || (cmd->state.dirty & TU_CMD_DIRTY_FDM)) && \
3721 !(cmd->state.pipeline_draw_states & (1u << id))) { \
3722 if (cmd->state.shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm) { \
3723 tu_cs_set_writeable(&cmd->sub_cs, true); \
3724 tu6_emit_##name##_fdm(&cs, cmd, __VA_ARGS__); \
3725 cmd->state.dynamic_state[id] = \
3726 tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
3727 tu_cs_set_writeable(&cmd->sub_cs, false); \
3728 } else { \
3729 unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__); \
3730 if (size > 0) { \
3731 tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs); \
3732 tu6_emit_##name<CHIP>(&cs, __VA_ARGS__); \
3733 cmd->state.dynamic_state[id] = \
3734 tu_cs_end_draw_state(&cmd->sub_cs, &cs); \
3735 } else { \
3736 cmd->state.dynamic_state[id] = {}; \
3737 } \
3744 } \
3745 dirty_draw_states |= (1u << id); \
3746 }
3747 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, false, __VA_ARGS__)
3748
3749 DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3750 cmd->vk.dynamic_graphics_state.vi);
3751
3752 /* Vertex input stride is special because it's part of the vertex input in
3753 * the pipeline but a separate array when it's dynamic state so we have to
3754 * use two separate functions.
3755 */
3756 #define tu6_emit_vertex_stride tu6_emit_vertex_stride_dyn
3757 #define tu6_vertex_stride_size tu6_vertex_stride_size_dyn
3758
3759 DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3760 cmd->vk.dynamic_graphics_state.vi_binding_strides,
3761 cmd->vk.dynamic_graphics_state.vi_bindings_valid);
3762
3763 #undef tu6_emit_vertex_stride
3764 #undef tu6_vertex_stride_size
3765
3766 DRAW_STATE_FDM(viewport, TU_DYNAMIC_STATE_VIEWPORT,
3767 &cmd->vk.dynamic_graphics_state.vp,
3768 &cmd->vk.dynamic_graphics_state.rs);
3769 DRAW_STATE_FDM(scissor, TU_DYNAMIC_STATE_SCISSOR,
3770 &cmd->vk.dynamic_graphics_state.vp);
3771 DRAW_STATE(sample_locations,
3772 TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3773 cmd->vk.dynamic_graphics_state.ms.sample_locations_enable,
3774 cmd->vk.dynamic_graphics_state.ms.sample_locations);
3775 DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3776 &cmd->vk.dynamic_graphics_state.rs);
3777 DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND,
3778 &cmd->vk.dynamic_graphics_state.cb,
3779 &cmd->vk.dynamic_graphics_state.cal,
3780 cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
3781 cmd->vk.dynamic_graphics_state.ms.alpha_to_one_enable,
3782 cmd->vk.dynamic_graphics_state.ms.sample_mask);
3783 if (EMIT_STATE(blend_lrz) ||
3784 ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3785 !cmd->state.pipeline_blend_lrz)) {
3786 bool blend_reads_dest = tu6_calc_blend_lrz(&cmd->vk.dynamic_graphics_state.cb,
3787 &cmd->state.vk_rp);
3788 if (blend_reads_dest != cmd->state.blend_reads_dest) {
3789 cmd->state.blend_reads_dest = blend_reads_dest;
3790 cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
3791 }
3792 }
3793 if (EMIT_STATE(bandwidth) ||
3794 ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3795 !cmd->state.pipeline_bandwidth))
3796 tu_calc_bandwidth(&cmd->state.bandwidth, &cmd->vk.dynamic_graphics_state.cb,
3797 &cmd->state.vk_rp);
3798 DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3799 &cmd->vk.dynamic_graphics_state.cb);
3800
3801 if (cmd->device->physical_device->info->a6xx.has_attachment_shading_rate) {
3802 DRAW_STATE_COND(fragment_shading_rate,
3803 TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
3804 cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS | TU_CMD_DIRTY_SHADING_RATE),
3805 &cmd->vk.dynamic_graphics_state.fsr,
3806 cmd->state.subpass->fsr_attachment != VK_ATTACHMENT_UNUSED,
3807 cmd->state.program.writes_shading_rate,
3808 cmd->state.program.reads_shading_rate,
3809 cmd->state.program.accesses_smask);
3810 }
3811 DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3812 cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS |
3813 TU_CMD_DIRTY_PER_VIEW_VIEWPORT),
3814 &cmd->vk.dynamic_graphics_state.rs,
3815 &cmd->vk.dynamic_graphics_state.vp,
3816 cmd->state.vk_rp.view_mask != 0,
3817 cmd->state.per_view_viewport);
3818 DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3819 cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3820 &cmd->vk.dynamic_graphics_state.ds,
3821 &cmd->state.vk_rp);
3822 DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3823 cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3824 &cmd->vk.dynamic_graphics_state.ds,
3825 &cmd->state.vk_rp,
3826 &cmd->vk.dynamic_graphics_state.rs);
3827 DRAW_STATE_COND(patch_control_points,
3828 TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3829 cmd->state.dirty & TU_CMD_DIRTY_PROGRAM,
3830 cmd->state.shaders[MESA_SHADER_VERTEX],
3831 cmd->state.shaders[MESA_SHADER_TESS_CTRL],
3832 cmd->state.shaders[MESA_SHADER_TESS_EVAL],
3833 &cmd->state.program,
3834 cmd->vk.dynamic_graphics_state.ts.patch_control_points);
3835 if (!cmd->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3836 DRAW_STATE_COND(prim_mode_sysmem,
3837 TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3838 cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER |
3839 TU_CMD_DIRTY_FEEDBACK_LOOPS |
3840 TU_CMD_DIRTY_FS),
3841 cmd->state.shaders[MESA_SHADER_FRAGMENT],
3842 cmd->state.raster_order_attachment_access,
3843 cmd->vk.dynamic_graphics_state.feedback_loops |
3844 cmd->state.pipeline_feedback_loops,
3845 &cmd->state.rp.sysmem_single_prim_mode);
3846 }
3847 #undef DRAW_STATE
3848 #undef DRAW_STATE_COND
3849 #undef EMIT_STATE
3850
3851 return dirty_draw_states;
3852 }
3853 TU_GENX(tu_emit_draw_state);
3854
3855 static void
3856 tu_pipeline_builder_parse_depth_stencil(
3857 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3858 {
3859 const VkPipelineDepthStencilStateCreateInfo *ds_info =
3860 builder->create_info->pDepthStencilState;
3861
3862 if ((builder->graphics_state.rp->attachments ==
3863 MESA_VK_RP_ATTACHMENT_INFO_INVALID) ||
3864 (builder->graphics_state.rp->attachments &
3865 MESA_VK_RP_ATTACHMENT_DEPTH_BIT)) {
3866 pipeline->ds.raster_order_attachment_access =
3867 ds_info && (ds_info->flags &
3868 (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
3869 VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT));
3870 }
3871 }
3872
3873 static void
3874 tu_pipeline_builder_parse_multisample_and_color_blend(
3875 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3876 {
3877 /* The spec says:
3878 *
3879 * pMultisampleState is a pointer to an instance of the
3880 * VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
3881 * has rasterization disabled.
3882 *
3883 * Also,
3884 *
3885 * pColorBlendState is a pointer to an instance of the
3886 * VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
3887 * pipeline has rasterization disabled or if the subpass of the render
3888 * pass the pipeline is created against does not use any color
3889 * attachments.
3890 *
3891 * We leave the relevant registers stale when rasterization is disabled.
3892 */
3893 if (builder->rasterizer_discard) {
3894 return;
3895 }
3896
3897 static const VkPipelineColorBlendStateCreateInfo dummy_blend_info = {};
3898
3899 const VkPipelineColorBlendStateCreateInfo *blend_info =
3900 (builder->graphics_state.rp->attachments &
3901 MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)
3902 ? builder->create_info->pColorBlendState
3903 : &dummy_blend_info;
3904
3905 if (builder->graphics_state.rp->attachments &
3906 MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS) {
3907 pipeline->output.raster_order_attachment_access =
3908 blend_info && (blend_info->flags &
3909 VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT);
3910 }
3911 }
3912
3913 static void
3914 tu_pipeline_builder_parse_rasterization_order(
3915 struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3916 {
3917 if (builder->rasterizer_discard)
3918 return;
3919
3920 bool raster_order_attachment_access =
3921 pipeline->output.raster_order_attachment_access ||
3922 pipeline->ds.raster_order_attachment_access ||
3923 TU_DEBUG(RAST_ORDER);
3924
3925 /* VK_EXT_blend_operation_advanced would also require ordered access
3926 * when implemented in the future.
3927 */
3928
3929 enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
3930
3931 if (raster_order_attachment_access) {
3932 /* VK_EXT_rasterization_order_attachment_access:
3933 *
3934 * This extension allows access to framebuffer attachments when used as
3935 * both input and color attachments from one fragment to the next,
3936 * in rasterization order, without explicit synchronization.
3937 */
3938 gmem_prim_mode = FLUSH_PER_OVERLAP;
3939 }
3940
3941 struct tu_cs cs;
3942
3943 pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3944 tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3945 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3946 A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
3947 }
3948
3949 static void
3950 tu_pipeline_finish(struct tu_pipeline *pipeline,
3951 struct tu_device *dev,
3952 const VkAllocationCallbacks *alloc)
3953 {
3954 tu_cs_finish(&pipeline->cs);
3955 TU_RMV(resource_destroy, dev, &pipeline->bo);
3956
3957 mtx_lock(&dev->pipeline_mutex);
3958 tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
3959 mtx_unlock(&dev->pipeline_mutex);
3960
3961 if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
3962 struct tu_graphics_lib_pipeline *library =
3963 tu_pipeline_to_graphics_lib(pipeline);
3964
3965 if (library->nir_shaders)
3966 vk_pipeline_cache_object_unref(&dev->vk,
3967 &library->nir_shaders->base);
3968
3969 for (unsigned i = 0; i < library->num_sets; i++) {
3970 if (library->layouts[i])
3971 vk_descriptor_set_layout_unref(&dev->vk, &library->layouts[i]->vk);
3972 }
3973
3974 vk_free2(&dev->vk.alloc, alloc, library->state_data);
3975 }
3976
3977 for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
3978 if (pipeline->shaders[i])
3979 vk_pipeline_cache_object_unref(&dev->vk,
3980 &pipeline->shaders[i]->base);
3981 }
3982
3983 ralloc_free(pipeline->executables_mem_ctx);
3984 }
3985
3986 static VkGraphicsPipelineLibraryFlagBitsEXT
3987 vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage)
3988 {
3989 assert(util_bitcount(stage) == 1);
3990 switch (stage) {
3991 case VK_SHADER_STAGE_VERTEX_BIT:
3992 case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
3993 case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
3994 case VK_SHADER_STAGE_GEOMETRY_BIT:
3995 case VK_SHADER_STAGE_TASK_BIT_EXT:
3996 case VK_SHADER_STAGE_MESH_BIT_EXT:
3997 return VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT;
3998 case VK_SHADER_STAGE_FRAGMENT_BIT:
3999 return VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
4000 default:
4001 unreachable("Invalid shader stage");
4002 }
4003 }
4004
4005 template <chip CHIP>
4006 static VkResult
4007 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
4008 struct tu_pipeline **pipeline)
4009 {
4010 VkResult result;
4011
4012 if (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
4013 *pipeline = (struct tu_pipeline *) vk_object_zalloc(
4014 &builder->device->vk, builder->alloc,
4015 sizeof(struct tu_graphics_lib_pipeline),
4016 VK_OBJECT_TYPE_PIPELINE);
4017 if (!*pipeline)
4018 return VK_ERROR_OUT_OF_HOST_MEMORY;
4019 (*pipeline)->type = TU_PIPELINE_GRAPHICS_LIB;
4020 } else {
4021 *pipeline = (struct tu_pipeline *) vk_object_zalloc(
4022 &builder->device->vk, builder->alloc,
4023 sizeof(struct tu_graphics_pipeline),
4024 VK_OBJECT_TYPE_PIPELINE);
4025 if (!*pipeline)
4026 return VK_ERROR_OUT_OF_HOST_MEMORY;
4027 (*pipeline)->type = TU_PIPELINE_GRAPHICS;
4028 }
4029
4030 (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
4031 util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
4032
4033 tu_pipeline_builder_parse_libraries(builder, *pipeline);
4034
4035 VkShaderStageFlags stages = 0;
4036 for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
4037 VkShaderStageFlagBits stage = builder->create_info->pStages[i].stage;
4038
4039 /* Ignore shader stages that don't need to be imported. */
4040 if (!(vk_shader_stage_to_pipeline_library_flags(stage) & builder->state))
4041 continue;
4042
4043 stages |= stage;
4044 }
4045 builder->active_stages = stages;
4046
4047 (*pipeline)->active_stages = stages;
4048 for (unsigned i = 0; i < builder->num_libraries; i++)
4049 (*pipeline)->active_stages |= builder->libraries[i]->base.active_stages;
4050
4051 /* Compile and upload shaders unless a library has already done that. */
4052 if ((*pipeline)->program.vs_state.size == 0) {
4053 tu_pipeline_builder_parse_layout(builder, *pipeline);
4054
4055 result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
4056 if (result != VK_SUCCESS) {
4057 tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
4058 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
4059 return result;
4060 }
4061 }
4062
4063 result = tu_pipeline_allocate_cs(builder->device, *pipeline,
4064 &builder->layout, builder, NULL);
4065
4066
4067 if (set_combined_state(builder, *pipeline,
4068 VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4069 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
4070 if (result != VK_SUCCESS) {
4071 vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
4072 return result;
4073 }
4074
4075 tu_emit_program_state<CHIP>(&(*pipeline)->cs, &(*pipeline)->program,
4076 (*pipeline)->shaders);
4077
4078 if (CHIP == A6XX) {
4079 /* The blob doesn't preload state on A7XX, likely because preloading
4080 * either doesn't work or doesn't provide benefits.
4081 */
4082 tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
4083 }
4084 }
4085
4086 if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
4087 tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
4088 }
4089
4090 if (builder->state &
4091 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
4092 tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
4093 }
4094
4095 if (set_combined_state(builder, *pipeline,
4096 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4097 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
4098 tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
4099 }
4100
4101 tu_pipeline_builder_emit_state<CHIP>(builder, *pipeline);
4102
4103 if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) {
4104 struct tu_graphics_lib_pipeline *library =
4105 tu_pipeline_to_graphics_lib(*pipeline);
4106 result = vk_graphics_pipeline_state_copy(&builder->device->vk,
4107 &library->graphics_state,
4108 &builder->graphics_state,
4109 builder->alloc,
4110 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4111 &library->state_data);
4112 if (result != VK_SUCCESS) {
4113 tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
4114 return result;
4115 }
4116 } else {
4117 struct tu_graphics_pipeline *gfx_pipeline =
4118 tu_pipeline_to_graphics(*pipeline);
4119 gfx_pipeline->dynamic_state.ms.sample_locations =
4120 &gfx_pipeline->sample_locations;
4121 vk_dynamic_graphics_state_fill(&gfx_pipeline->dynamic_state,
4122 &builder->graphics_state);
4123 gfx_pipeline->feedback_loops =
4124 vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags);
4125 gfx_pipeline->feedback_loop_may_involve_textures =
4126 builder->graphics_state.feedback_loop_not_input_only;
4127 }
4128
4129 return VK_SUCCESS;
4130 }
4131
4132 static void
4133 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
4134 {
4135 ralloc_free(builder->mem_ctx);
4136 }
4137
4138 void
4139 tu_fill_render_pass_state(struct vk_render_pass_state *rp,
4140 const struct tu_render_pass *pass,
4141 const struct tu_subpass *subpass)
4142 {
4143 rp->view_mask = subpass->multiview_mask;
4144 rp->color_attachment_count = subpass->color_count;
4145
4146 const uint32_t a = subpass->depth_stencil_attachment.attachment;
4147 rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
4148 rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
4149 rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
4150 if (a != VK_ATTACHMENT_UNUSED) {
4151 VkFormat ds_format = pass->attachments[a].format;
4152 if (vk_format_has_depth(ds_format) && subpass->depth_used) {
4153 rp->depth_attachment_format = ds_format;
4154 rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
4155 }
4156 if (vk_format_has_stencil(ds_format) && subpass->stencil_used) {
4157 rp->stencil_attachment_format = ds_format;
4158 rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
4159 }
4160 }
4161
4162 for (uint32_t i = 0; i < subpass->color_count; i++) {
4163 const uint32_t a = subpass->color_attachments[i].attachment;
4164 if (a == VK_ATTACHMENT_UNUSED) {
4165 rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
4166 continue;
4167 }
4168
4169 rp->color_attachment_formats[i] = pass->attachments[a].format;
4170 rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
4171 }
4172 }
4173
4174 static void
4175 tu_pipeline_builder_init_graphics(
4176 struct tu_pipeline_builder *builder,
4177 struct tu_device *dev,
4178 struct vk_pipeline_cache *cache,
4179 const VkGraphicsPipelineCreateInfo *create_info,
4180 VkPipelineCreateFlags2KHR flags,
4181 const VkAllocationCallbacks *alloc)
4182 {
4183 *builder = (struct tu_pipeline_builder) {
4184 .device = dev,
4185 .mem_ctx = ralloc_context(NULL),
4186 .cache = cache,
4187 .alloc = alloc,
4188 .create_info = create_info,
4189 .create_flags = flags,
4190 };
4191
4192 const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
4193 vk_find_struct_const(builder->create_info->pNext,
4194 GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
4195
4196 const VkPipelineLibraryCreateInfoKHR *library_info =
4197 vk_find_struct_const(builder->create_info->pNext,
4198 PIPELINE_LIBRARY_CREATE_INFO_KHR);
4199
4200 if (gpl_info) {
4201 builder->state = gpl_info->flags;
4202 } else {
4203 /* Implement this bit of spec text:
4204 *
4205 * If this structure is omitted, and either
4206 * VkGraphicsPipelineCreateInfo::flags includes
4207 * VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
4208 * VkGraphicsPipelineCreateInfo::pNext chain includes a
4209 * VkPipelineLibraryCreateInfoKHR structure with a libraryCount
4210 * greater than 0, it is as if flags is 0. Otherwise if this
4211 * structure is omitted, it is as if flags includes all possible
4212 * subsets of the graphics pipeline (i.e. a complete graphics
4213 * pipeline).
4214 */
4215 if ((library_info && library_info->libraryCount > 0) ||
4216 (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR)) {
4217 builder->state = 0;
4218 } else {
4219 builder->state =
4220 VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
4221 VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4222 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4223 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
4224 }
4225 }
4226
4227 bool rasterizer_discard_dynamic = false;
4228 if (create_info->pDynamicState) {
4229 for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
4230 if (create_info->pDynamicState->pDynamicStates[i] ==
4231 VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
4232 rasterizer_discard_dynamic = true;
4233 break;
4234 }
4235 }
4236 }
4237
4238 builder->rasterizer_discard =
4239 (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
4240 !rasterizer_discard_dynamic &&
4241 builder->create_info->pRasterizationState->rasterizerDiscardEnable;
4242
4243 struct vk_render_pass_state rp_state = {};
4244 const struct vk_render_pass_state *driver_rp = NULL;
4245 VkPipelineCreateFlags2KHR rp_flags = 0;
4246
4247 builder->unscaled_input_fragcoord = 0;
4248
4249 /* Extract information we need from the turnip renderpass. This will be
4250 * filled out automatically if the app is using dynamic rendering or
4251 * renderpasses are emulated.
4252 */
4253 if (!TU_DEBUG(DYNAMIC) &&
4254 (builder->state &
4255 (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4256 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4257 VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
4258 builder->create_info->renderPass) {
4259 const struct tu_render_pass *pass =
4260 tu_render_pass_from_handle(create_info->renderPass);
4261 const struct tu_subpass *subpass =
4262 &pass->subpasses[create_info->subpass];
4263
4264 tu_fill_render_pass_state(&rp_state, pass, subpass);
4265
4266 for (unsigned i = 0; i < subpass->input_count; i++) {
4267 /* Input attachments stored in GMEM must be loaded with unscaled
4268 * FragCoord.
4269 */
4270 if (subpass->input_attachments[i].patch_input_gmem)
4271 builder->unscaled_input_fragcoord |= 1u << i;
4272 }
4273
4274 if (subpass->feedback_loop_color) {
4275 rp_flags |=
4276 VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4277 }
4278
4279 if (subpass->feedback_loop_ds) {
4280 rp_flags |=
4281 VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4282 }
4283
4284 if (pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
4285 rp_flags |=
4286 VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT;
4287 }
4288
4289 if (subpass->fsr_attachment != VK_ATTACHMENT_UNUSED) {
4290 rp_flags |=
4291 VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
4292 }
4293
4303 driver_rp = &rp_state;
4304 }
4305
4306 vk_graphics_pipeline_state_fill(&dev->vk,
4307 &builder->graphics_state,
4308 builder->create_info,
4309 driver_rp,
4310 rp_flags,
4311 &builder->all_state,
4312 NULL, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4313 NULL);
4314
4315 if (builder->graphics_state.rp) {
4316 builder->fragment_density_map = (builder->graphics_state.pipeline_flags &
4317 VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT) ||
4318 TU_DEBUG(FDM);
4319 }
4320 }
4321
4322 template <chip CHIP>
4323 static VkResult
4324 tu_graphics_pipeline_create(VkDevice device,
4325 VkPipelineCache pipelineCache,
4326 const VkGraphicsPipelineCreateInfo *pCreateInfo,
4327 VkPipelineCreateFlags2KHR flags,
4328 const VkAllocationCallbacks *pAllocator,
4329 VkPipeline *pPipeline)
4330 {
4331 VK_FROM_HANDLE(tu_device, dev, device);
4332 VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4333
4334 cache = cache ? cache : dev->mem_cache;
4335
4336 struct tu_pipeline_builder builder;
4337 tu_pipeline_builder_init_graphics(&builder, dev, cache,
4338 pCreateInfo, flags, pAllocator);
4339
4340 struct tu_pipeline *pipeline = NULL;
4341 VkResult result = tu_pipeline_builder_build<CHIP>(&builder, &pipeline);
4342 tu_pipeline_builder_finish(&builder);
4343
4344 if (result == VK_SUCCESS) {
4345 TU_RMV(graphics_pipeline_create, dev, tu_pipeline_to_graphics(pipeline));
4346
4347 *pPipeline = tu_pipeline_to_handle(pipeline);
4348 } else
4349 *pPipeline = VK_NULL_HANDLE;
4350
4351 return result;
4352 }
4353
4354 template <chip CHIP>
4355 VKAPI_ATTR VkResult VKAPI_CALL
4356 tu_CreateGraphicsPipelines(VkDevice device,
4357 VkPipelineCache pipelineCache,
4358 uint32_t count,
4359 const VkGraphicsPipelineCreateInfo *pCreateInfos,
4360 const VkAllocationCallbacks *pAllocator,
4361 VkPipeline *pPipelines)
4362 {
4363 MESA_TRACE_FUNC();
4364 VkResult final_result = VK_SUCCESS;
4365 uint32_t i = 0;
4366
4367 for (; i < count; i++) {
4368 VkPipelineCreateFlags2KHR flags =
4369 vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
4370
4371 VkResult result =
4372 tu_graphics_pipeline_create<CHIP>(device, pipelineCache,
4373 &pCreateInfos[i], flags,
4374 pAllocator, &pPipelines[i]);
4375
4376 if (result != VK_SUCCESS) {
4377 final_result = result;
4378 pPipelines[i] = VK_NULL_HANDLE;
4379
4380 if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4381 break;
4382 }
4383 }
4384
4385 for (; i < count; i++)
4386 pPipelines[i] = VK_NULL_HANDLE;
4387
4388 return final_result;
4389 }
4390 TU_GENX(tu_CreateGraphicsPipelines);
4391
4392 template <chip CHIP>
4393 static VkResult
4394 tu_compute_pipeline_create(VkDevice device,
4395 VkPipelineCache pipelineCache,
4396 const VkComputePipelineCreateInfo *pCreateInfo,
4397 VkPipelineCreateFlags2KHR flags,
4398 const VkAllocationCallbacks *pAllocator,
4399 VkPipeline *pPipeline)
4400 {
4401 VK_FROM_HANDLE(tu_device, dev, device);
4402 VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4403 VK_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
4404 const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
4405 VkResult result;
4406 const struct ir3_shader_variant *v = NULL;
4407
4408 cache = cache ? cache : dev->mem_cache;
4409
4410 struct tu_compute_pipeline *pipeline;
4411
4412 *pPipeline = VK_NULL_HANDLE;
4413
4414 VkPipelineCreationFeedback pipeline_feedback = {
4415 .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4416 };
4417
4418 const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
4419 vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
4420
4421 int64_t pipeline_start = os_time_get_nano();
4422
4423 pipeline = (struct tu_compute_pipeline *) vk_object_zalloc(
4424 &dev->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE);
4425 if (!pipeline)
4426 return VK_ERROR_OUT_OF_HOST_MEMORY;
4427 pipeline->base.type = TU_PIPELINE_COMPUTE;
4428
4429 pipeline->base.executables_mem_ctx = ralloc_context(NULL);
4430 util_dynarray_init(&pipeline->base.executables, pipeline->base.executables_mem_ctx);
4431 pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4432
4433 struct tu_shader_key key = { };
4434 bool allow_varying_subgroup_size =
4435 (stage_info->flags &
4436 VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
4437 bool require_full_subgroups =
4438 stage_info->flags &
4439 VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
4440 const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info =
4441 vk_find_struct_const(stage_info,
4442 PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
4443 tu_shader_key_subgroup_size(&key, allow_varying_subgroup_size,
4444 require_full_subgroups, subgroup_info,
4445 dev);
4446
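/* Robustness settings from both the pipeline and the stage pNext chains
 * become part of the shader key, and therefore of the cache hash below.
 */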
4447 struct vk_pipeline_robustness_state rs;
4448 vk_pipeline_robustness_state_fill(&dev->vk, &rs,
4449 pCreateInfo->pNext,
4450 stage_info->pNext);
4451 tu_shader_key_robustness(&key, &rs);
4452
4453 void *pipeline_mem_ctx = ralloc_context(NULL);
4454
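/* Hash everything that affects compilation; this SHA1 is the key used for
 * the shader cache lookup/insert below.
 */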
4455 unsigned char pipeline_sha1[20];
4456 tu_hash_compute(pipeline_sha1, flags, stage_info, layout, &key);
4457
4458 struct tu_shader *shader = NULL;
4459
4460 const bool executable_info = flags &
4461 VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
4462
4463 bool application_cache_hit = false;
4464
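/* When internal representations are requested we always recompile, so only
 * consult the cache otherwise.
 */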
4465 if (!executable_info) {
4466 shader =
4467 tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
4468 &application_cache_hit);
4469 }
4470
4471 if (application_cache_hit && cache != dev->mem_cache) {
4472 pipeline_feedback.flags |=
4473 VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4474 }
4475
4476 char *nir_initial_disasm = NULL;
4477
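/* Cache miss: compile from SPIR-V, unless the app asked us to fail instead
 * of compiling.
 */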
4478 if (!shader) {
4479 if (flags &
4480 VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
4481 result = VK_PIPELINE_COMPILE_REQUIRED;
4482 goto fail;
4483 }
4484
4485 struct ir3_shader_key ir3_key = {};
4486
4487 nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, flags,
4488 stage_info, &key, MESA_SHADER_COMPUTE);
4489
4490 nir_initial_disasm = executable_info ?
4491 nir_shader_as_str(nir, pipeline->base.executables_mem_ctx) : NULL;
4492
4493 result = tu_shader_create(dev, &shader, nir, &key, &ir3_key,
4494 pipeline_sha1, sizeof(pipeline_sha1), layout,
4495 executable_info);
4496 if (!shader) {
4497 goto fail;
4498 }
4499
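/* The cache may return an existing identical shader object; keep using
 * whatever it hands back.
 */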
4500 shader = tu_pipeline_cache_insert(cache, shader);
4501 }
4502
4503 pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4504
4505 if (creation_feedback) {
4506 *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
4507 assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
4508 creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
4509 }
4510
4511 pipeline->base.active_desc_sets = shader->active_desc_sets;
4512
4513 v = shader->variant;
4514
4515 tu_pipeline_set_linkage(&pipeline->base.program.link[MESA_SHADER_COMPUTE],
4516 &shader->const_state, v);
4517
4518 result = tu_pipeline_allocate_cs(dev, &pipeline->base, layout, NULL, v);
4519 if (result != VK_SUCCESS)
4520 goto fail;
4521
4522 for (int i = 0; i < 3; i++)
4523 pipeline->local_size[i] = v->local_size[i];
4524
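/* The descriptor load-state IB is only emitted on a6xx. */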
4525 if (CHIP == A6XX) {
4526 tu6_emit_load_state(dev, &pipeline->base, layout);
4527 }
4528
4529 tu_append_executable(&pipeline->base, v, nir_initial_disasm);
4530
4531 pipeline->instrlen = v->instrlen;
4532
4533 pipeline->base.shaders[MESA_SHADER_COMPUTE] = shader;
4534
4535 ralloc_free(pipeline_mem_ctx);
4536
4537 TU_RMV(compute_pipeline_create, dev, pipeline);
4538
4539 *pPipeline = tu_pipeline_to_handle(&pipeline->base);
4540
4541 return VK_SUCCESS;
4542
4543 fail:
4544 if (shader)
4545 vk_pipeline_cache_object_unref(&dev->vk, &shader->base);
4546
4547 ralloc_free(pipeline_mem_ctx);
4548
4549 vk_object_free(&dev->vk, pAllocator, pipeline);
4550
4551 return result;
4552 }
4553
4554 template <chip CHIP>
4555 VKAPI_ATTR VkResult VKAPI_CALL
4556 tu_CreateComputePipelines(VkDevice device,
4557 VkPipelineCache pipelineCache,
4558 uint32_t count,
4559 const VkComputePipelineCreateInfo *pCreateInfos,
4560 const VkAllocationCallbacks *pAllocator,
4561 VkPipeline *pPipelines)
4562 {
4563 MESA_TRACE_FUNC();
4564 VkResult final_result = VK_SUCCESS;
4565 uint32_t i = 0;
4566
4567 for (; i < count; i++) {
4568 VkPipelineCreateFlags2KHR flags =
4569 vk_compute_pipeline_create_flags(&pCreateInfos[i]);
4570
4571 VkResult result =
4572 tu_compute_pipeline_create<CHIP>(device, pipelineCache,
4573 &pCreateInfos[i], flags,
4574 pAllocator, &pPipelines[i]);
4575 if (result != VK_SUCCESS) {
4576 final_result = result;
4577 pPipelines[i] = VK_NULL_HANDLE;
4578
4579 if (flags &
4580 VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4581 break;
4582 }
4583 }
4584
4585 for (; i < count; i++)
4586 pPipelines[i] = VK_NULL_HANDLE;
4587
4588 return final_result;
4589 }
4590 TU_GENX(tu_CreateComputePipelines);
4591
4592 VKAPI_ATTR void VKAPI_CALL
4593 tu_DestroyPipeline(VkDevice _device,
4594 VkPipeline _pipeline,
4595 const VkAllocationCallbacks *pAllocator)
4596 {
4597 VK_FROM_HANDLE(tu_device, dev, _device);
4598 VK_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
4599
4600 if (!_pipeline)
4601 return;
4602
4603 TU_RMV(resource_destroy, dev, pipeline);
4604
4605 tu_pipeline_finish(pipeline, dev, pAllocator);
4606 vk_object_free(&dev->vk, pAllocator, pipeline);
4607 }
4608
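/* snprintf into a fixed-size char field, zeroing it first and asserting that
 * the formatted string fits.
 */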
4609 #define WRITE_STR(field, ...) ({ \
4610 memset(field, 0, sizeof(field)); \
4611 UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
4612 assert(_i > 0 && _i < sizeof(field)); \
4613 })
4614
4615 static const struct tu_pipeline_executable *
4616 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
4617 {
4618 assert(index < util_dynarray_num_elements(&pipeline->executables,
4619 struct tu_pipeline_executable));
4620 return util_dynarray_element(
4621 &pipeline->executables, struct tu_pipeline_executable, index);
4622 }
4623
4624 VKAPI_ATTR VkResult VKAPI_CALL
4625 tu_GetPipelineExecutablePropertiesKHR(
4626 VkDevice _device,
4627 const VkPipelineInfoKHR* pPipelineInfo,
4628 uint32_t* pExecutableCount,
4629 VkPipelineExecutablePropertiesKHR* pProperties)
4630 {
4631 VK_FROM_HANDLE(tu_device, dev, _device);
4632 VK_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
4633 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
4634 pProperties, pExecutableCount);
4635
4636 util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
4637 vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
4638 gl_shader_stage stage = exe->stage;
4639 props->stages = mesa_to_vk_shader_stage(stage);
4640
4641 if (!exe->is_binning)
4642 WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
4643 else
4644 WRITE_STR(props->name, "Binning VS");
4645
4646 WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
4647
4648 props->subgroupSize =
4649 dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
4650 }
4651 }
4652
4653 return vk_outarray_status(&out);
4654 }
4655
4656 VKAPI_ATTR VkResult VKAPI_CALL
4657 tu_GetPipelineExecutableStatisticsKHR(
4658 VkDevice _device,
4659 const VkPipelineExecutableInfoKHR* pExecutableInfo,
4660 uint32_t* pStatisticCount,
4661 VkPipelineExecutableStatisticKHR* pStatistics)
4662 {
4663 VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4664 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
4665 pStatistics, pStatisticCount);
4666
4667 const struct tu_pipeline_executable *exe =
4668 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4669
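/* Emit one statistic entry per IR3 stat; vk_outarray handles both the count
 * query and the fill call.
 */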
4670 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4671 WRITE_STR(stat->name, "Max Waves Per Core");
4672 WRITE_STR(stat->description,
4673 "Maximum number of simultaneous waves per core.");
4674 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4675 stat->value.u64 = exe->stats.max_waves;
4676 }
4677
4678 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4679 WRITE_STR(stat->name, "Instruction Count");
4680 WRITE_STR(stat->description,
4681 "Total number of IR3 instructions in the final generated "
4682 "shader executable.");
4683 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4684 stat->value.u64 = exe->stats.instrs_count;
4685 }
4686
4687 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4688 WRITE_STR(stat->name, "Code size");
4689 WRITE_STR(stat->description,
4690 "Total number of dwords in the final generated "
4691 "shader executable.");
4692 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4693 stat->value.u64 = exe->stats.sizedwords;
4694 }
4695
4696 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4697 WRITE_STR(stat->name, "NOPs Count");
4698 WRITE_STR(stat->description,
4699 "Number of NOP instructions in the final generated "
4700 "shader executable.");
4701 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4702 stat->value.u64 = exe->stats.nops_count;
4703 }
4704
4705 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4706 WRITE_STR(stat->name, "MOV Count");
4707 WRITE_STR(stat->description,
4708 "Number of MOV instructions in the final generated "
4709 "shader executable.");
4710 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4711 stat->value.u64 = exe->stats.mov_count;
4712 }
4713
4714 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4715 WRITE_STR(stat->name, "COV Count");
4716 WRITE_STR(stat->description,
4717 "Number of COV instructions in the final generated "
4718 "shader executable.");
4719 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4720 stat->value.u64 = exe->stats.cov_count;
4721 }
4722
4723 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4724 WRITE_STR(stat->name, "Registers used");
4725 WRITE_STR(stat->description,
4726 "Number of registers used in the final generated "
4727 "shader executable.");
4728 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4729 stat->value.u64 = exe->stats.max_reg + 1;
4730 }
4731
4732 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4733 WRITE_STR(stat->name, "Half-registers used");
4734 WRITE_STR(stat->description,
4735 "Number of half-registers used in the final generated "
4736 "shader executable.");
4737 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4738 stat->value.u64 = exe->stats.max_half_reg + 1;
4739 }
4740
4741 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4742 WRITE_STR(stat->name, "Last interpolation instruction");
4743 WRITE_STR(stat->description,
4744 "The instruction where varying storage in Local Memory is released");
4745 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4746 stat->value.u64 = exe->stats.last_baryf;
4747 }
4748
4749 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4750 WRITE_STR(stat->name, "Last helper instruction");
4751 WRITE_STR(stat->description,
4752 "The instruction where helper invocations are killed");
4753 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4754 stat->value.u64 = exe->stats.last_helper;
4755 }
4756
4757 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4758 WRITE_STR(stat->name, "Instructions with SS sync bit");
4759 WRITE_STR(stat->description,
4760 "SS bit is set for instructions which depend on a result "
4761 "of \"long\" instructions to prevent RAW hazard.");
4762 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4763 stat->value.u64 = exe->stats.ss;
4764 }
4765
4766 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4767 WRITE_STR(stat->name, "Instructions with SY sync bit");
4768 WRITE_STR(stat->description,
4769 "SY bit is set for instructions which depend on a result "
4770 "of loads from global memory to prevent RAW hazard.");
4771 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4772 stat->value.u64 = exe->stats.sy;
4773 }
4774
4775 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4776 WRITE_STR(stat->name, "Estimated cycles stalled on SS");
4777 WRITE_STR(stat->description,
4778 "A better metric to estimate the impact of SS syncs.");
4779 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4780 stat->value.u64 = exe->stats.sstall;
4781 }
4782
4783 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4784 WRITE_STR(stat->name, "Estimated cycles stalled on SY");
4785 WRITE_STR(stat->description,
4786 "A better metric to estimate the impact of SY syncs.");
4787 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4788 stat->value.u64 = exe->stats.systall;
4789 }
4790
4791 for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
4792 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4793 WRITE_STR(stat->name, "cat%d instructions", i);
4794 WRITE_STR(stat->description,
4795 "Number of cat%d instructions.", i);
4796 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4797 stat->value.u64 = exe->stats.instrs_per_cat[i];
4798 }
4799 }
4800
4801 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4802 WRITE_STR(stat->name, "STP Count");
4803 WRITE_STR(stat->description,
4804 "Number of STore Private instructions in the final generated "
4805 "shader executable.");
4806 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4807 stat->value.u64 = exe->stats.stp_count;
4808 }
4809
4810 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4811 WRITE_STR(stat->name, "LDP Count");
4812 WRITE_STR(stat->description,
4813 "Number of LoaD Private instructions in the final generated "
4814 "shader executable.");
4815 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4816 stat->value.u64 = exe->stats.ldp_count;
4817 }
4818
4819 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4820 WRITE_STR(stat->name, "Preamble Instruction Count");
4821 WRITE_STR(stat->description,
4822 "Total number of IR3 instructions in the preamble.");
4823 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4824 stat->value.u64 = exe->stats.preamble_instrs_count;
4825 }
4826
4827 vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4828 WRITE_STR(stat->name, "Early preamble");
4829 WRITE_STR(stat->description,
4830 "Whether the preamble will be executed early.");
4831 stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR;
4832 stat->value.b32 = exe->stats.early_preamble;
4833 }
4834
4835 return vk_outarray_status(&out);
4836 }
4837
4838 static bool
4839 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
4840 const char *data)
4841 {
4842 ir->isText = VK_TRUE;
4843
4844 size_t data_len = strlen(data) + 1;
4845
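/* Standard Vulkan two-call idiom: with pData == NULL only report the
 * required size, otherwise copy as much as fits and tell the caller whether
 * the string was truncated.
 */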
4846 if (ir->pData == NULL) {
4847 ir->dataSize = data_len;
4848 return true;
4849 }
4850
4851 strncpy((char *) ir->pData, data, ir->dataSize);
4852 if (ir->dataSize < data_len)
4853 return false;
4854
4855 ir->dataSize = data_len;
4856 return true;
4857 }
4858
4859 VKAPI_ATTR VkResult VKAPI_CALL
4860 tu_GetPipelineExecutableInternalRepresentationsKHR(
4861 VkDevice _device,
4862 const VkPipelineExecutableInfoKHR* pExecutableInfo,
4863 uint32_t* pInternalRepresentationCount,
4864 VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
4865 {
4866 VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4867 VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
4868 pInternalRepresentations, pInternalRepresentationCount);
4869 bool incomplete_text = false;
4870
4871 const struct tu_pipeline_executable *exe =
4872 tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4873
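/* Report whichever representations were captured for this executable:
 * the initial NIR, the final NIR, and the IR3 disassembly.
 */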
4874 if (exe->nir_from_spirv) {
4875 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4876 WRITE_STR(ir->name, "NIR from SPIRV");
4877 WRITE_STR(ir->description,
4878 "Initial NIR before any optimizations");
4879
4880 if (!write_ir_text(ir, exe->nir_from_spirv))
4881 incomplete_text = true;
4882 }
4883 }
4884
4885 if (exe->nir_final) {
4886 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4887 WRITE_STR(ir->name, "Final NIR");
4888 WRITE_STR(ir->description,
4889 "Final NIR before going into the back-end compiler");
4890
4891 if (!write_ir_text(ir, exe->nir_final))
4892 incomplete_text = true;
4893 }
4894 }
4895
4896 if (exe->disasm) {
4897 vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4898 WRITE_STR(ir->name, "IR3 Assembly");
4899 WRITE_STR(ir->description,
4900 "Final IR3 assembly for the generated shader binary");
4901
4902 if (!write_ir_text(ir, exe->disasm))
4903 incomplete_text = true;
4904 }
4905 }
4906
4907 return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
4908 }
4909