1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  * SPDX-License-Identifier: MIT
5  *
6  * based in part on anv driver which is:
7  * Copyright © 2015 Intel Corporation
8  */
9 
10 #include "tu_pipeline.h"
11 
12 #include "common/freedreno_guardband.h"
13 
14 #include "ir3/ir3_nir.h"
15 #include "nir/nir.h"
16 #include "nir/nir_builder.h"
17 #include "nir/nir_serialize.h"
18 #include "spirv/nir_spirv.h"
19 #include "util/u_debug.h"
20 #include "util/mesa-sha1.h"
21 #include "vk_nir.h"
22 #include "vk_pipeline.h"
23 #include "vk_render_pass.h"
24 #include "vk_util.h"
25 
26 #include "tu_cmd_buffer.h"
27 #include "tu_cs.h"
28 #include "tu_device.h"
29 #include "tu_knl.h"
30 #include "tu_formats.h"
31 #include "tu_lrz.h"
32 #include "tu_pass.h"
33 #include "tu_rmv.h"
34 
35 /* Emit IB that preloads the descriptors that the shader uses */
36 
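/* emit_load_state() below emits a single 4-dword CP_LOAD_STATE6 packet in
 * SS6_BINDLESS mode. The 64-bit payload packs the descriptor set index into
 * bits [31:28] of the low dword (base << 28), with the remaining bits giving
 * the dword offset of the first descriptor within that set; presumably the
 * set index selects which BINDLESS_BASE register the offset is relative to.
 */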
37 static void
38 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
39                 enum a6xx_state_block sb, unsigned base, unsigned offset,
40                 unsigned count)
41 {
42    /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
43     * clear if emitting more packets will even help anything. Presumably the
44     * descriptor cache is relatively small, and these packets stop doing
45     * anything when there are too many descriptors.
46     */
47    tu_cs_emit_pkt7(cs, opcode, 3);
48    tu_cs_emit(cs,
49               CP_LOAD_STATE6_0_STATE_TYPE(st) |
50               CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
51               CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
52               CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
53    tu_cs_emit_qw(cs, offset | (base << 28));
54 }
55 
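/* Upper bound, in dwords, on the command stream emitted by
 * tu6_emit_load_state(), used to size its sub-stream up front. Each
 * emit_load_state() call costs load_state_size (4) dwords: one pkt7 header
 * plus 3 payload dwords. The switch below mirrors the one in
 * tu6_emit_load_state().
 */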
56 static unsigned
57 tu6_load_state_size(struct tu_pipeline *pipeline,
58                     struct tu_pipeline_layout *layout)
59 {
60    const unsigned load_state_size = 4;
61    unsigned size = 0;
62    for (unsigned i = 0; i < layout->num_sets; i++) {
63       if (!(pipeline->active_desc_sets & (1u << i)))
64          continue;
65 
66       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
67       for (unsigned j = 0; j < set_layout->binding_count; j++) {
68          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
69          unsigned count = 0;
70          /* See comment in tu6_emit_load_state(). */
71          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
72          unsigned stage_count = util_bitcount(stages);
73 
74          if (!binding->array_size)
75             continue;
76 
77          switch (binding->type) {
78          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
79          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
80          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
81          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
82             /* IBO-backed resources only need one packet for all graphics stages */
83             if (stage_count)
84                count += 1;
85             break;
86          case VK_DESCRIPTOR_TYPE_SAMPLER:
87          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
88          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
89          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
90          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
91             /* Textures and UBOs need a packet for each stage */
92             count = stage_count;
93             break;
94          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
95             /* Because of how we pack combined images and samplers, we
96              * currently can't use one packet for the whole array.
97              */
98             count = stage_count * binding->array_size * 2;
99             break;
100          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
101          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
102          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
103             break;
104          default:
105             unreachable("bad descriptor type");
106          }
107          size += count * load_state_size;
108       }
109    }
110    return size;
111 }
112 
113 static void
114 tu6_emit_load_state(struct tu_device *device,
115                     struct tu_pipeline *pipeline,
116                     struct tu_pipeline_layout *layout)
117 {
118    unsigned size = tu6_load_state_size(pipeline, layout);
119    if (size == 0)
120       return;
121 
122    struct tu_cs cs;
123    tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
124 
125    for (unsigned i = 0; i < layout->num_sets; i++) {
126       /* From 13.2.7. Descriptor Set Binding:
127        *
128        *    A compatible descriptor set must be bound for all set numbers that
129        *    any shaders in a pipeline access, at the time that a draw or
130        *    dispatch command is recorded to execute using that pipeline.
131        *    However, if none of the shaders in a pipeline statically use any
132        *    bindings with a particular set number, then no descriptor set need
133        *    be bound for that set number, even if the pipeline layout includes
134        *    a non-trivial descriptor set layout for that set number.
135        *
136        * This means that descriptor sets unused by the pipeline may have a
137        * garbage or 0 BINDLESS_BASE register, which will cause context faults
138        * when prefetching descriptors from these sets. Skip prefetching for
139        * descriptors from them to avoid this. This is also an optimization,
140        * since these prefetches would be useless.
141        */
142       if (!(pipeline->active_desc_sets & (1u << i)))
143          continue;
144 
145       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
146       for (unsigned j = 0; j < set_layout->binding_count; j++) {
147          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
148          unsigned base = i;
149          unsigned offset = binding->offset / 4;
150          /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
151           * zink has descriptors for each stage in the push layout even if some
152           * stages aren't present in a used pipeline.  We don't want to emit
153           * loads for unused descriptors.
154           */
155          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
156          unsigned count = binding->array_size;
157 
158          /* If this is a variable-count descriptor, then the array_size is an
159           * upper bound on the size, but we don't know how many descriptors
160           * will actually be used. Therefore we can't pre-load them here.
161           */
162          if (j == set_layout->binding_count - 1 &&
163              set_layout->has_variable_descriptors)
164             continue;
165 
166          if (count == 0 || stages == 0)
167             continue;
168          switch (binding->type) {
169          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
170             assert(device->physical_device->reserved_set_idx >= 0);
171             base = device->physical_device->reserved_set_idx;
172             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
173                       binding->dynamic_offset_offset) / 4;
174             FALLTHROUGH;
175          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
176          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
177          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
178             unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
179             /* IBO-backed resources only need one packet for all graphics stages */
180             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
181                emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
182                                base, offset, count * mul);
183             }
184             if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
185                emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
186                                base, offset, count * mul);
187             }
188             break;
189          }
190          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
191          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
192          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
193             /* nothing - input attachments and inline uniforms don't use bindless */
194             break;
195          case VK_DESCRIPTOR_TYPE_SAMPLER:
196          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
197          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
198             tu_foreach_stage(stage, stages) {
199                emit_load_state(&cs, tu6_stage2opcode(stage),
200                                binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
201                                ST6_SHADER : ST6_CONSTANTS,
202                                tu6_stage2texsb(stage), base, offset, count);
203             }
204             break;
205          }
206          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
207             assert(device->physical_device->reserved_set_idx >= 0);
208             base = device->physical_device->reserved_set_idx;
209             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
210                       binding->dynamic_offset_offset) / 4;
211             FALLTHROUGH;
212          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
213             tu_foreach_stage(stage, stages) {
214                emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
215                                tu6_stage2shadersb(stage), base, offset, count);
216             }
217             break;
218          }
219          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
220             tu_foreach_stage(stage, stages) {
221                /* TODO: We could emit less CP_LOAD_STATE6 if we used
222                 * struct-of-arrays instead of array-of-structs.
223                 */
224                for (unsigned i = 0; i < count; i++) {
225                   unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
226                   unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
227                   emit_load_state(&cs, tu6_stage2opcode(stage),
228                                   ST6_CONSTANTS, tu6_stage2texsb(stage),
229                                   base, tex_offset, 1);
230                   emit_load_state(&cs, tu6_stage2opcode(stage),
231                                   ST6_SHADER, tu6_stage2texsb(stage),
232                                   base, sam_offset, 1);
233                }
234             }
235             break;
236          }
237          default:
238             unreachable("bad descriptor type");
239          }
240       }
241    }
242 
243    pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
244 }
245 
246 struct tu_pipeline_builder
247 {
248    struct tu_device *device;
249    void *mem_ctx;
250    struct vk_pipeline_cache *cache;
251    const VkAllocationCallbacks *alloc;
252    const VkGraphicsPipelineCreateInfo *create_info;
253    VkPipelineCreateFlags2KHR create_flags;
254 
255    struct tu_pipeline_layout layout;
256 
257    struct tu_pvtmem_config pvtmem;
258 
259    bool rasterizer_discard;
260    /* these states are affected by rasterizer_discard */
261    uint8_t unscaled_input_fragcoord;
262 
263    /* Each library defines at least one piece of state in
264     * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
265     * there can be at most as many libraries as pieces of state, of which
266     * there are currently 4.
267     */
268 #define MAX_LIBRARIES 4
269 
270    unsigned num_libraries;
271    struct tu_graphics_lib_pipeline *libraries[MAX_LIBRARIES];
272 
273    /* This is just the state that we are compiling now, whereas the final
274     * pipeline will include the state from the libraries.
275     */
276    VkGraphicsPipelineLibraryFlagsEXT state;
277 
278    /* The stages we are compiling now. */
279    VkShaderStageFlags active_stages;
280 
281    bool fragment_density_map;
282 
283    struct vk_graphics_pipeline_all_state all_state;
284    struct vk_graphics_pipeline_state graphics_state;
285 };
286 
287 static bool
288 tu_logic_op_reads_dst(VkLogicOp op)
289 {
290    switch (op) {
291    case VK_LOGIC_OP_CLEAR:
292    case VK_LOGIC_OP_COPY:
293    case VK_LOGIC_OP_COPY_INVERTED:
294    case VK_LOGIC_OP_SET:
295       return false;
296    default:
297       return true;
298    }
299 }
300 
301 static bool
302 tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb)
303 {
304    for (unsigned i = 0; i < cb->attachment_count; i++) {
305       if (tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_color_blend_factor) ||
306           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_color_blend_factor) ||
307           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_alpha_blend_factor) ||
308           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_alpha_blend_factor))
309          return true;
310    }
311 
312    return false;
313 }
314 
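/* Summary of the choice made below:
 *   no push constants                 -> IR3_PUSH_CONSTS_NONE
 *   TU_DEBUG(PUSH_CONSTS_PER_STAGE)   -> IR3_PUSH_CONSTS_PER_STAGE
 *   tu6_shared_constants_enable()     -> IR3_PUSH_CONSTS_SHARED
 *   otherwise on a7xx+ (gen >= 7)     -> IR3_PUSH_CONSTS_SHARED_PREAMBLE
 *   otherwise                         -> IR3_PUSH_CONSTS_PER_STAGE
 */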
315 enum ir3_push_consts_type
316 tu_push_consts_type(const struct tu_pipeline_layout *layout,
317                     const struct ir3_compiler *compiler)
318 {
319    if (!layout->push_constant_size)
320       return IR3_PUSH_CONSTS_NONE;
321 
322    if (TU_DEBUG(PUSH_CONSTS_PER_STAGE))
323       return IR3_PUSH_CONSTS_PER_STAGE;
324 
325    if (tu6_shared_constants_enable(layout, compiler)) {
326       return IR3_PUSH_CONSTS_SHARED;
327    } else {
328       if (compiler->gen >= 7) {
329          return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
330       } else {
331          return IR3_PUSH_CONSTS_PER_STAGE;
332       }
333    }
334 }
335 
336 template <chip CHIP>
337 struct xs_config {
338    uint16_t reg_sp_xs_config;
339    uint16_t reg_hlsq_xs_ctrl;
340 };
341 
342 template <chip CHIP>
343 static const xs_config<CHIP> xs_configs[] = {
344    [MESA_SHADER_VERTEX] = {
345       REG_A6XX_SP_VS_CONFIG,
346       CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
347    },
348    [MESA_SHADER_TESS_CTRL] = {
349       REG_A6XX_SP_HS_CONFIG,
350       CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
351    },
352    [MESA_SHADER_TESS_EVAL] = {
353       REG_A6XX_SP_DS_CONFIG,
354       CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
355    },
356    [MESA_SHADER_GEOMETRY] = {
357       REG_A6XX_SP_GS_CONFIG,
358       CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
359    },
360    [MESA_SHADER_FRAGMENT] = {
361       REG_A6XX_SP_FS_CONFIG,
362       CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
363    },
364    [MESA_SHADER_COMPUTE] = {
365       REG_A6XX_SP_CS_CONFIG,
366       CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
367    },
368 };
369 
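/* Note: the SP_xS_CONFIG and HLSQ_xS_CNTL bitfields appear to share the same
 * layout across all shader stages, which is why the VS field builders
 * (A6XX_SP_VS_CONFIG_*, A6XX_HLSQ_VS_CNTL_*) are reused below regardless of
 * which stage's registers are being written.
 */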
370 template <chip CHIP>
371 void
372 tu6_emit_xs_config(struct tu_cs *cs,
373                    gl_shader_stage stage, /* xs->type, but xs may be NULL */
374                    const struct ir3_shader_variant *xs)
375 {
376    const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[stage];
377 
378    if (!xs) {
379       /* shader stage disabled */
380       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
381       tu_cs_emit(cs, 0);
382 
383       tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
384       tu_cs_emit(cs, 0);
385       return;
386    }
387 
388    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
389    tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
390                   COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
391                   COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
392                   COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
393                   COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
394                   A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
395                   A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
396 
397    tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
398    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
399                      A6XX_HLSQ_VS_CNTL_ENABLED |
400                      COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE,
401                           A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS));
402 }
403 TU_GENX(tu6_emit_xs_config);
404 
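/* Upload the per-set start offsets of the dynamic descriptors, scaled to
 * descriptor-sized units (A6XX_TEX_CONST_DWORDS dwords each), so shaders can
 * index into the reserved dynamic-descriptor set. On GPUs that load shader
 * consts via the preamble the offsets are passed through a dedicated UBO
 * (dynamic_offsets_ubo); otherwise they are written directly into the const
 * file at dynamic_offset_loc.
 */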
405 static void
406 tu6_emit_dynamic_offset(struct tu_cs *cs,
407                         const struct ir3_shader_variant *xs,
408                         const struct tu_shader *shader,
409                         const struct tu_program_state *program)
410 {
411    const struct tu_physical_device *phys_dev = cs->device->physical_device;
412 
413    if (!xs)
414       return;
415 
416    if (cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
417       if (shader->const_state.dynamic_offsets_ubo.size == 0)
418          return;
419 
420       uint32_t offsets[MAX_SETS];
421       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
422          unsigned dynamic_offset_start =
423             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
424          offsets[i] = dynamic_offset_start;
425       }
426 
427       /* A7XX TODO: Emit data via sub_cs instead of NOP */
428       uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
429       uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
430 
431       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
432       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
433                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
434                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
435                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
436                CP_LOAD_STATE6_0_NUM_UNIT(1));
437       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
438       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
439       int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
440       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
441    } else {
442       if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
443          return;
444 
445       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
446       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
447                CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
448                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
449                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
450                CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
451       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
452       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
453 
454       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
455          unsigned dynamic_offset_start =
456             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
457          tu_cs_emit(cs, dynamic_offset_start);
458       }
459    }
460 }
461 
462 template <chip CHIP>
463 void
464 tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
465 {
466    if (CHIP == A6XX) {
467       /* Enable/disable shared constants */
468       tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
469    } else {
470       assert(!enable);
471    }
472 
473    tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
474                                             .isammode = ISAMMODE_GL,
475                                             .shared_consts_enable = enable));
476 }
477 TU_GENX(tu6_emit_shared_consts_enable);
478 
479 template <chip CHIP>
480 static void
481 tu6_setup_streamout(struct tu_cs *cs,
482                     const struct ir3_shader_variant *v,
483                     const struct ir3_shader_linkage *l)
484 {
485    const struct ir3_stream_output_info *info = &v->stream_output;
486    /* Note: 64 here comes from the HW layout of the program RAM. The program
487     * for stream N is at DWORD 64 * N.
488     */
489 #define A6XX_SO_PROG_DWORDS 64
490    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
491    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
492 
493    /* TODO: streamout state should be in a non-GMEM draw state */
494 
495    /* no streamout: */
496    if (info->num_outputs == 0) {
497       unsigned sizedw = 4;
498       if (cs->device->physical_device->info->a6xx.tess_use_shared)
499          sizedw += 2;
500 
501       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, sizedw);
502       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
503       tu_cs_emit(cs, 0);
504       tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
505       tu_cs_emit(cs, 0);
506 
507       if (cs->device->physical_device->info->a6xx.tess_use_shared) {
508          tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
509          tu_cs_emit(cs, 0);
510       }
511 
512       return;
513    }
514 
515    for (unsigned i = 0; i < info->num_outputs; i++) {
516       const struct ir3_stream_output *out = &info->output[i];
517       unsigned k = out->register_index;
518       unsigned idx;
519 
520       /* Skip it, if it's an output that was never assigned a register. */
521       if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
522          continue;
523 
524       /* The linkage map is sorted in the order the fragment shader wants
525        * things, so this lookup is a bit less than ideal..
526        */
527       for (idx = 0; idx < l->cnt; idx++)
528          if (l->var[idx].slot == v->outputs[k].slot)
529             break;
530 
531       assert(idx < l->cnt);
532 
533       for (unsigned j = 0; j < out->num_components; j++) {
534          unsigned c   = j + out->start_component;
535          unsigned loc = l->var[idx].loc + c;
536          unsigned off = j + out->dst_offset;  /* in dwords */
537 
538          assert(loc < A6XX_SO_PROG_DWORDS * 2);
539          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
540          if (loc & 1) {
541             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
542                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
543                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
544          } else {
545             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
546                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
547                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
548          }
549          BITSET_SET(valid_dwords, dword);
550       }
551    }
552 
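   /* Count entries for the register bunch below: each contiguous range of
    * valid program dwords costs one VPC_SO_CNTL write plus one VPC_SO_PROG
    * write per dword, i.e. (end - start) + 1 register/value pairs, which is
    * why the "+ 1" is folded into prog_count here.
    */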
553    unsigned prog_count = 0;
554    unsigned start, end;
555    BITSET_FOREACH_RANGE(start, end, valid_dwords,
556                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
557       prog_count += end - start + 1;
558    }
559 
560    const bool emit_pc_so_stream_cntl =
561       cs->device->physical_device->info->a6xx.tess_use_shared &&
562       v->type == MESA_SHADER_TESS_EVAL;
563 
564    if (emit_pc_so_stream_cntl)
565       prog_count += 1;
566 
567    tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
568    tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
569    tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
570                   COND(info->stride[0] > 0,
571                        A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
572                   COND(info->stride[1] > 0,
573                        A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
574                   COND(info->stride[2] > 0,
575                        A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
576                   COND(info->stride[3] > 0,
577                        A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
578    for (uint32_t i = 0; i < 4; i++) {
579       tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
580       tu_cs_emit(cs, info->stride[i]);
581    }
582    bool first = true;
583    BITSET_FOREACH_RANGE(start, end, valid_dwords,
584                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
585       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
586       tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
587                      A6XX_VPC_SO_CNTL_ADDR(start));
588       for (unsigned i = start; i < end; i++) {
589          tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
590          tu_cs_emit(cs, prog[i]);
591       }
592       first = false;
593    }
594 
595    if (emit_pc_so_stream_cntl) {
596       /* Possibly not tess_use_shared related, but the combination of
597        * tess + xfb fails some tests if we don't emit this.
598        */
599       tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
600       tu_cs_emit(cs, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written));
601    }
602 }
603 
604 enum tu_geom_consts_type
605 {
606    TU_CONSTS_PRIMITIVE_MAP,
607    TU_CONSTS_PRIMITIVE_PARAM,
608 };
609 
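/* Emit geometry/tessellation driver params. On GPUs without preamble const
 * loading the values are emitted inline as ST6_CONSTANTS (clamped against
 * constlen); with a7xx-style preamble const loading they are written to
 * memory and bound as a 1-unit UBO descriptor instead.
 */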
610 static void
611 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
612                const struct ir3_const_state *const_state,
613                unsigned constlen, enum a6xx_state_block block,
614                uint32_t offset, uint32_t size, const uint32_t *dwords) {
615    assert(size % 4 == 0);
616    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
617 
618    if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
619       uint32_t base;
620       switch (type) {
621       case TU_CONSTS_PRIMITIVE_MAP:
622          base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
623          break;
624       case TU_CONSTS_PRIMITIVE_PARAM:
625          base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
626          break;
627       default:
628          unreachable("bad consts type");
629       }
630 
631       int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
632       if (adjusted_size <= 0)
633          return;
634 
635       tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
636       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
637             CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
638             CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
639             CP_LOAD_STATE6_0_STATE_BLOCK(block) |
640             CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
641 
642       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
643       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
644 
645       tu_cs_emit_array(cs, dwords, adjusted_size);
646    } else {
647       uint32_t base;
648       switch (type) {
649       case TU_CONSTS_PRIMITIVE_MAP:
650          base = const_state->primitive_map_ubo.idx;
651          break;
652       case TU_CONSTS_PRIMITIVE_PARAM:
653          base = const_state->primitive_param_ubo.idx;
654          break;
655       default:
656          unreachable("bad consts type");
657       }
658       if (base == -1)
659          return;
660 
661       /* A7XX TODO: Emit data via sub_cs instead of NOP */
662       uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
663 
664       tu_cs_emit_pkt7(cs, opcode, 5);
665       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
666                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
667                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
668                CP_LOAD_STATE6_0_STATE_BLOCK(block) |
669                CP_LOAD_STATE6_0_NUM_UNIT(1));
670       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
671       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
672       int size_vec4s = DIV_ROUND_UP(size, 4);
673       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
674    }
675 }
676 
677 static void
678 tu6_emit_link_map(struct tu_cs *cs,
679                   const struct ir3_shader_variant *producer,
680                   const struct ir3_shader_variant *consumer,
681                   enum a6xx_state_block sb)
682 {
683    const struct ir3_const_state *const_state = ir3_const_state(consumer);
684    uint32_t size = ALIGN(consumer->input_size, 4);
685 
686    if (size == 0)
687       return;
688 
689    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
690                   const_state, consumer->constlen, sb, 0, size, producer->output_loc);
691 }
692 
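/* Fill in the 2-bit-per-component interpolation and point-sprite replacement
 * modes for FS input "index" and return how many bits of packed mode state it
 * consumes (two bits per enabled component of the compmask).
 */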
693 static int
694 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
695                      const struct ir3_shader_variant *last_shader,
696                      uint32_t index,
697                      uint8_t *interp_mode,
698                      uint8_t *ps_repl_mode)
699 {
700    const uint32_t compmask = fs->inputs[index].compmask;
701 
702    /* NOTE: varyings are packed, so if compmask is 0xb then the first, second,
703     * and fourth components occupy three consecutive varying slots
704     */
705    int shift = 0;
706    *interp_mode = 0;
707    *ps_repl_mode = 0;
708    if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
709       if (compmask & 0x1) {
710          *ps_repl_mode |= PS_REPL_S << shift;
711          shift += 2;
712       }
713       if (compmask & 0x2) {
714          *ps_repl_mode |= PS_REPL_T << shift;
715          shift += 2;
716       }
717       if (compmask & 0x4) {
718          *interp_mode |= INTERP_ZERO << shift;
719          shift += 2;
720       }
721       if (compmask & 0x8) {
722          *interp_mode |= INTERP_ONE << 6;
723          shift += 2;
724       }
725    } else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
726               fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
727       /* If the last geometry shader doesn't statically write these, they're
728        * implicitly zero and the FS is supposed to read zero.
729        */
730       const gl_varying_slot slot = (gl_varying_slot) fs->inputs[index].slot;
731       if (ir3_find_output(last_shader, slot) < 0 &&
732           (compmask & 0x1)) {
733          *interp_mode |= INTERP_ZERO;
734       } else {
735          *interp_mode |= INTERP_FLAT;
736       }
737    } else if (fs->inputs[index].flat) {
738       for (int i = 0; i < 4; i++) {
739          if (compmask & (1 << i)) {
740             *interp_mode |= INTERP_FLAT << shift;
741             shift += 2;
742          }
743       }
744    }
745 
746    return util_bitcount(compmask) * 2;
747 }
748 
749 static void
750 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
751                            const struct ir3_shader_variant *fs,
752                            const struct ir3_shader_variant *last_shader)
753 {
754    uint32_t interp_modes[8] = { 0 };
755    uint32_t ps_repl_modes[8] = { 0 };
756    uint32_t interp_regs = 0;
757 
758    if (fs) {
759       for (int i = -1;
760            (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
761 
762          /* get the mode for input i */
763          uint8_t interp_mode;
764          uint8_t ps_repl_mode;
765          const int bits =
766             tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
767 
768          /* OR the mode into the array */
769          const uint32_t inloc = fs->inputs[i].inloc * 2;
770          uint32_t n = inloc / 32;
771          uint32_t shift = inloc % 32;
772          interp_modes[n] |= interp_mode << shift;
773          ps_repl_modes[n] |= ps_repl_mode << shift;
774          if (shift + bits > 32) {
775             n++;
776             shift = 32 - shift;
777 
778             interp_modes[n] |= interp_mode >> shift;
779             ps_repl_modes[n] |= ps_repl_mode >> shift;
780          }
781          interp_regs = MAX2(interp_regs, n + 1);
782       }
783    }
784 
785    if (interp_regs) {
786       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), interp_regs);
787       tu_cs_emit_array(cs, interp_modes, interp_regs);
788 
789       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), interp_regs);
790       tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
791    }
792 }
793 
794 template <chip CHIP>
795 void
796 tu6_emit_vpc(struct tu_cs *cs,
797              const struct ir3_shader_variant *vs,
798              const struct ir3_shader_variant *hs,
799              const struct ir3_shader_variant *ds,
800              const struct ir3_shader_variant *gs,
801              const struct ir3_shader_variant *fs)
802 {
803    /* note: doesn't compile as static because of the array regs.. */
804    const struct reg_config {
805       uint16_t reg_sp_xs_out_reg;
806       uint16_t reg_sp_xs_vpc_dst_reg;
807       uint16_t reg_vpc_xs_pack;
808       uint16_t reg_vpc_xs_clip_cntl;
809       uint16_t reg_vpc_xs_clip_cntl_v2;
810       uint16_t reg_gras_xs_cl_cntl;
811       uint16_t reg_pc_xs_out_cntl;
812       uint16_t reg_sp_xs_primitive_cntl;
813       uint16_t reg_vpc_xs_layer_cntl;
814       uint16_t reg_vpc_xs_layer_cntl_v2;
815       uint16_t reg_gras_xs_layer_cntl;
816    } reg_config[] = {
817       [MESA_SHADER_VERTEX] = {
818          REG_A6XX_SP_VS_OUT_REG(0),
819          REG_A6XX_SP_VS_VPC_DST_REG(0),
820          REG_A6XX_VPC_VS_PACK,
821          REG_A6XX_VPC_VS_CLIP_CNTL,
822          REG_A6XX_VPC_VS_CLIP_CNTL_V2,
823          REG_A6XX_GRAS_VS_CL_CNTL,
824          REG_A6XX_PC_VS_OUT_CNTL,
825          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
826          REG_A6XX_VPC_VS_LAYER_CNTL,
827          REG_A6XX_VPC_VS_LAYER_CNTL_V2,
828          REG_A6XX_GRAS_VS_LAYER_CNTL
829       },
830       [MESA_SHADER_TESS_CTRL] = {
831          0,
832          0,
833          0,
834          0,
835          0,
836          0,
837          REG_A6XX_PC_HS_OUT_CNTL,
838          0,
839          0,
840          0
841       },
842       [MESA_SHADER_TESS_EVAL] = {
843          REG_A6XX_SP_DS_OUT_REG(0),
844          REG_A6XX_SP_DS_VPC_DST_REG(0),
845          REG_A6XX_VPC_DS_PACK,
846          REG_A6XX_VPC_DS_CLIP_CNTL,
847          REG_A6XX_VPC_DS_CLIP_CNTL_V2,
848          REG_A6XX_GRAS_DS_CL_CNTL,
849          REG_A6XX_PC_DS_OUT_CNTL,
850          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
851          REG_A6XX_VPC_DS_LAYER_CNTL,
852          REG_A6XX_VPC_DS_LAYER_CNTL_V2,
853          REG_A6XX_GRAS_DS_LAYER_CNTL
854       },
855       [MESA_SHADER_GEOMETRY] = {
856          REG_A6XX_SP_GS_OUT_REG(0),
857          REG_A6XX_SP_GS_VPC_DST_REG(0),
858          REG_A6XX_VPC_GS_PACK,
859          REG_A6XX_VPC_GS_CLIP_CNTL,
860          REG_A6XX_VPC_GS_CLIP_CNTL_V2,
861          REG_A6XX_GRAS_GS_CL_CNTL,
862          REG_A6XX_PC_GS_OUT_CNTL,
863          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
864          REG_A6XX_VPC_GS_LAYER_CNTL,
865          REG_A6XX_VPC_GS_LAYER_CNTL_V2,
866          REG_A6XX_GRAS_GS_LAYER_CNTL
867       },
868    };
869 
870    const struct ir3_shader_variant *last_shader;
871    if (gs) {
872       last_shader = gs;
873    } else if (hs) {
874       last_shader = ds;
875    } else {
876       last_shader = vs;
877    }
878 
879    const struct reg_config *cfg = &reg_config[last_shader->type];
880 
881    struct ir3_shader_linkage linkage = {
882       .primid_loc = 0xff,
883       .clip0_loc = 0xff,
884       .clip1_loc = 0xff,
885    };
886    if (fs)
887       ir3_link_shaders(&linkage, last_shader, fs, true);
888 
889    if (last_shader->stream_output.num_outputs)
890       ir3_link_stream_out(&linkage, last_shader);
891 
892    /* a6xx finds position/pointsize at the end */
893    const uint32_t pointsize_regid =
894       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
895    const uint32_t layer_regid =
896       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
897    const uint32_t view_regid =
898       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
899    const uint32_t clip0_regid =
900       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
901    const uint32_t clip1_regid =
902       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
903    uint32_t flags_regid = gs ?
904       ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
905    const uint32_t shading_rate_regid =
906       ir3_find_output_regid(last_shader, VARYING_SLOT_PRIMITIVE_SHADING_RATE);
907 
908    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
909    uint32_t shading_rate_loc = 0xff;
910 
911    if (layer_regid != regid(63, 0)) {
912       layer_loc = linkage.max_loc;
913       ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
914    }
915 
916    if (view_regid != regid(63, 0)) {
917       view_loc = linkage.max_loc;
918       ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
919    }
920 
921    if (shading_rate_regid != regid(63, 0)) {
922       shading_rate_loc = linkage.max_loc;
923       ir3_link_add(&linkage, VARYING_SLOT_PRIMITIVE_SHADING_RATE,
924                    shading_rate_regid, 0x1, linkage.max_loc);
925    }
926 
927    unsigned extra_pos = 0;
928 
929    for (unsigned i = 0; i < last_shader->outputs_count; i++) {
930       if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
931          continue;
932 
933       if (position_loc == 0xff)
934          position_loc = linkage.max_loc;
935 
936       ir3_link_add(&linkage, last_shader->outputs[i].slot,
937                    last_shader->outputs[i].regid,
938                    0xf, position_loc + 4 * last_shader->outputs[i].view);
939       extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
940    }
941 
942    if (pointsize_regid != regid(63, 0)) {
943       pointsize_loc = linkage.max_loc;
944       ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
945    }
946 
947    uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
948 
949    /* Handle the case where clip/cull distances aren't read by the FS */
950    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
951    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
952       clip0_loc = linkage.max_loc;
953       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
954                    clip_cull_mask & 0xf, linkage.max_loc);
955    }
956    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
957       clip1_loc = linkage.max_loc;
958       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
959                    clip_cull_mask >> 4, linkage.max_loc);
960    }
961 
962    tu6_setup_streamout<CHIP>(cs, last_shader, &linkage);
963 
964    /* There is a hardware bug on a750 where STRIDE_IN_VPC of 5 to 8 in GS with
965     * an input primitive type with adjacency, an output primitive type of
966     * points, and a high enough vertex count causes a hang.
967     */
968    if (cs->device->physical_device->info->a7xx.gs_vpc_adjacency_quirk &&
969        gs && gs->gs.output_primitive == MESA_PRIM_POINTS &&
970        linkage.max_loc > 4) {
971       linkage.max_loc = MAX2(linkage.max_loc, 9);
972    }
973 
974    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
975     * at least when a DS is the last stage, so add a dummy output to keep it
976     * happy if there aren't any. We do this late in order to avoid emitting
977     * any unused code and make sure that optimizations don't remove it.
978     */
979    if (linkage.cnt == 0)
980       ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
981 
982    /* map outputs of the last shader to VPC */
983    assert(linkage.cnt <= 32);
984    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
985    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
986    uint32_t sp_out[16] = {0};
987    uint32_t sp_vpc_dst[8] = {0};
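   /* Each 32-bit SP_xS_OUT_REG holds two 16-bit {regid, compmask} entries and
    * each SP_xS_VPC_DST_REG holds four 8-bit OUTLOC entries, hence the
    * uint16_t/uint8_t views of the dword arrays below.
    */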
988    for (uint32_t i = 0; i < linkage.cnt; i++) {
989       ((uint16_t *) sp_out)[i] =
990          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
991          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
992       ((uint8_t *) sp_vpc_dst)[i] =
993          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
994    }
995 
996    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
997    tu_cs_emit_array(cs, sp_out, sp_out_count);
998 
999    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
1000    tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
1001 
1002    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
1003    tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
1004                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
1005                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
1006                   A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
1007 
1008    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
1009    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1010                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1011                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1012    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl_v2, 1);
1013    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1014                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1015                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1016 
1017    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
1018    tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
1019                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1020 
1021    const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1022 
1023    for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1024       const struct ir3_shader_variant *shader = geom_shaders[i];
1025       if (!shader)
1026          continue;
1027 
1028       bool primid = shader->type != MESA_SHADER_VERTEX &&
1029          VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1030 
1031       tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1032       if (shader == last_shader) {
1033          tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1034                         CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1035                         CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1036                         CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1037                         COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1038                         A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask) |
1039                         CONDREG(shading_rate_regid, A6XX_PC_VS_OUT_CNTL_SHADINGRATE));
1040       } else {
1041          tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1042       }
1043    }
1044 
1045    /* if vertex_flags somehow gets optimized out, you're going to have a bad time: */
1046    if (gs)
1047       assert(flags_regid != INVALID_REG);
1048 
1049    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1050    tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1051                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1052 
1053    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1054    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1055                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1056                   A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(shading_rate_loc));
1057    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl_v2, 1);
1058    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1059                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1060                   A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(shading_rate_loc));
1061 
1062    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1063    tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1064                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1065 
1066    tu6_emit_vpc_varying_modes(cs, fs, last_shader);
1067 }
1068 TU_GENX(tu6_emit_vpc);
1069 
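/* The four vs_params dwords below are the VS primitive-param driver params:
 * { primitive stride, vertex stride, 0, 0 }, both apparently in bytes
 * (output_size is in dwords, hence the * 4). For example, with
 * param_stride = 8 and num_vertices = 3 this yields { 96, 32, 0, 0 }.
 */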
1070 static void
1071 tu6_emit_vs_params(struct tu_cs *cs,
1072                    const struct ir3_const_state *const_state,
1073                    unsigned constlen,
1074                    unsigned param_stride,
1075                    unsigned num_vertices)
1076 {
1077    uint32_t vs_params[4] = {
1078       param_stride * num_vertices * 4,  /* vs primitive stride */
1079       param_stride * 4,                 /* vs vertex stride */
1080       0,
1081       0,
1082    };
1083    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1084                   const_state, constlen, SB6_VS_SHADER, 0,
1085                   ARRAY_SIZE(vs_params), vs_params);
1086 }
1087 
1088 static void
1089 tu_get_tess_iova(struct tu_device *dev,
1090                  uint64_t *tess_factor_iova,
1091                  uint64_t *tess_param_iova)
1092 {
1093    /* Create the shared tess factor BO the first time tess is used on the device. */
1094    if (!dev->tess_bo) {
1095       mtx_lock(&dev->mutex);
1096       if (!dev->tess_bo) {
1097          tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
1098                         TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
1099       }
1100       mtx_unlock(&dev->mutex);
1101    }
1102 
1103    *tess_factor_iova = dev->tess_bo->iova;
1104    *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
1105 }
1106 
1107 static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
1108    MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
1109 };
1110 
1111 #define HS_PARAMS_SIZE 8
1112 
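/* Worst-case size, in dwords, of what tu6_emit_patch_control_points() emits:
 * two tu6_emit_const() uploads (the 4-dword VS params and the HS_PARAMS_SIZE
 * HS params) plus three 2-dword packets (PC_HS_INPUT_SIZE,
 * SP_HS_WAVE_INPUT_SIZE and CP_SET_SUBDRAW_SIZE). EMIT_CONST_DWORDS mirrors
 * the two emission paths in tu6_emit_const().
 */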
1113 template <chip CHIP>
1114 static unsigned
1115 tu6_patch_control_points_size(struct tu_device *dev,
1116                               const struct tu_shader *vs,
1117                               const struct tu_shader *tcs,
1118                               const struct tu_shader *tes,
1119                               const struct tu_program_state *program,
1120                               uint32_t patch_control_points)
1121 {
1122    if (dev->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1123 #define EMIT_CONST_DWORDS(const_dwords) (6 + const_dwords + 4)
1124       return EMIT_CONST_DWORDS(4) +
1125          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1126 #undef EMIT_CONST_DWORDS
1127    } else {
1128 #define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
1129       return EMIT_CONST_DWORDS(4) +
1130          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1131 #undef EMIT_CONST_DWORDS
1132    }
1133 }
1134 
1135 template <chip CHIP>
1136 void
1137 tu6_emit_patch_control_points(struct tu_cs *cs,
1138                               const struct tu_shader *vs,
1139                               const struct tu_shader *tcs,
1140                               const struct tu_shader *tes,
1141                               const struct tu_program_state *program,
1142                               uint32_t patch_control_points)
1143 {
1144    if (!tcs->variant)
1145       return;
1146 
1147    struct tu_device *dev = cs->device;
1148 
1149    tu6_emit_vs_params(cs,
1150                       &program->link[MESA_SHADER_VERTEX].const_state,
1151                       program->link[MESA_SHADER_VERTEX].constlen,
1152                       vs->variant->output_size,
1153                       patch_control_points);
1154 
1155    uint64_t tess_factor_iova, tess_param_iova;
1156    tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1157 
1158    uint32_t hs_params[HS_PARAMS_SIZE] = {
1159       vs->variant->output_size * patch_control_points * 4,  /* hs primitive stride */
1160       vs->variant->output_size * 4,                         /* hs vertex stride */
1161       tcs->variant->output_size,
1162       patch_control_points,
1163       tess_param_iova,
1164       tess_param_iova >> 32,
1165       tess_factor_iova,
1166       tess_factor_iova >> 32,
1167    };
1168 
1169    const struct ir3_const_state *hs_const =
1170       &program->link[MESA_SHADER_TESS_CTRL].const_state;
1171    unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
1172    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1173                   hs_const, hs_constlen, SB6_HS_SHADER, 0,
1174                   ARRAY_SIZE(hs_params), hs_params);
1175 
1176    uint32_t patch_local_mem_size_16b =
1177       patch_control_points * vs->variant->output_size / 4;
1178 
1179    /* Total attribute slots in HS incoming patch. */
1180    tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1181    tu_cs_emit(cs, patch_local_mem_size_16b);
1182 
1183    const uint32_t wavesize = 64;
1184    const uint32_t vs_hs_local_mem_size = 16384;
1185 
1186    uint32_t max_patches_per_wave;
1187    if (dev->physical_device->info->a6xx.tess_use_shared) {
1188       /* HS invocations for a patch are always within the same wave,
1189        * making barriers less expensive. VS can't have barriers so we
1190        * don't care about VS invocations being in the same wave.
1191        */
1192       max_patches_per_wave = wavesize / tcs->variant->tess.tcs_vertices_out;
1193    } else {
1194       /* VS is also in the same wave */
1195       max_patches_per_wave =
1196          wavesize / MAX2(patch_control_points,
1197                          tcs->variant->tess.tcs_vertices_out);
1198    }
1199 
1200    uint32_t patches_per_wave =
1201       MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
1202            max_patches_per_wave);
1203 
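   /* patch_local_mem_size_16b is in 16-byte (vec4) units, so * 16 converts to
    * bytes; SP_HS_WAVE_INPUT_SIZE then takes the per-wave input size in
    * 256-byte granules (an assumption based on the DIV_ROUND_UP below).
    */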
1204    uint32_t wave_input_size = DIV_ROUND_UP(
1205       patches_per_wave * patch_local_mem_size_16b * 16, 256);
1206 
1207    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1208    tu_cs_emit(cs, wave_input_size);
1209 
1210    /* maximum number of patches that can fit in tess factor/param buffers */
1211    uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
1212                         TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
1213    /* convert from # of patches to draw count */
1214    subdraw_size *= patch_control_points;
1215 
1216    tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
1217    tu_cs_emit(cs, subdraw_size);
1218 }
1219 
1220 static void
1221 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1222                           const struct ir3_shader_variant *vs,
1223                           const struct ir3_shader_variant *hs,
1224                           const struct ir3_shader_variant *ds,
1225                           const struct ir3_shader_variant *gs)
1226 {
1227    struct tu_device *dev = cs->device;
1228 
1229    if (gs && !hs) {
1230       tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
1231                          vs->output_size, gs->gs.vertices_in);
1232    }
1233 
1234    if (hs) {
1235       uint64_t tess_factor_iova, tess_param_iova;
1236       tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1237 
1238       uint32_t ds_params[8] = {
1239          gs ? ds->output_size * gs->gs.vertices_in * 4 : 0,  /* ds primitive stride */
1240          ds->output_size * 4,                                /* ds vertex stride */
1241          hs->output_size,                                    /* hs vertex stride (dwords) */
1242          hs->tess.tcs_vertices_out,
1243          tess_param_iova,
1244          tess_param_iova >> 32,
1245          tess_factor_iova,
1246          tess_factor_iova >> 32,
1247       };
1248 
1249       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1250                      ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
1251                      ARRAY_SIZE(ds_params), ds_params);
1252    }
1253 
1254    if (gs) {
1255       const struct ir3_shader_variant *prev = ds ? ds : vs;
1256       uint32_t gs_params[4] = {
1257          prev->output_size * gs->gs.vertices_in * 4,  /* gs primitive stride */
1258          prev->output_size * 4,                 /* gs vertex stride */
1259          0,
1260          0,
1261       };
1262       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1263                      gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
1264                      ARRAY_SIZE(gs_params), gs_params);
1265    }
1266 }
1267 
1268 template <chip CHIP>
1269 static void
1270 tu6_emit_program_config(struct tu_cs *cs,
1271                         const struct tu_program_state *prog,
1272                         struct tu_shader **shaders,
1273                         const struct ir3_shader_variant **variants)
1274 {
1275    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1276 
1277    bool shared_consts_enable =
1278       prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
1279    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1280 
1281    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1282          .vs_state = true,
1283          .hs_state = true,
1284          .ds_state = true,
1285          .gs_state = true,
1286          .fs_state = true,
1287          .gfx_ibo = true,
1288          .gfx_shared_const = shared_consts_enable));
1289    for (size_t stage_idx = MESA_SHADER_VERTEX;
1290         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1291       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1292       tu6_emit_xs_config<CHIP>(cs, stage, variants[stage]);
1293    }
1294 
1295    for (size_t stage_idx = MESA_SHADER_VERTEX;
1296         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1297       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1298       tu6_emit_dynamic_offset(cs, variants[stage], shaders[stage], prog);
1299    }
1300 
1301    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
1302    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
1303    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
1304    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
1305 
1306    if (hs) {
1307       tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1308       tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1309    }
1310 
1311    if (gs) {
1312       if (hs) {
1313          tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1314       } else {
1315          tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1316       }
1317 
1318       uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1319 
1320       if (CHIP == A6XX) {
1321          /* Size of per-primitive allocation in ldlw memory in vec4s. */
1322          uint32_t vec4_size = gs->gs.vertices_in *
1323                               DIV_ROUND_UP(prev_stage_output_size, 4);
1324 
1325          tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1326          tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1327       }
1328 
1329       uint32_t prim_size = prev_stage_output_size;
1330       if (prim_size > 64)
1331          prim_size = 64;
1332       else if (prim_size == 64)
1333          prim_size = 63;
1334       tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1335       tu_cs_emit(cs, prim_size);
1336    }
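   /* Worked example for the sizing above (illustrative only): if the stage
    * feeding the GS writes 10 dwords per vertex and the GS inputs are
    * triangles, then on A6XX STRIDE_IN_VPC = 3 * DIV_ROUND_UP(10, 4) = 9
    * vec4s and SP_GS_PRIM_SIZE = 10. A 64-dword input would instead be
    * programmed as 63, per the clamp above.
    */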
1337 
1338    if (gs || hs) {
1339       tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
1340    }
1341 }
1342 
1343 static bool
1344 contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
1345 {
1346    return (state &
1347       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1348        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
1349       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1350        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
1351 }
1352 
1353 static bool
1354 pipeline_contains_all_shader_state(struct tu_pipeline *pipeline)
1355 {
1356    return pipeline->type == TU_PIPELINE_GRAPHICS ||
1357       pipeline->type == TU_PIPELINE_COMPUTE ||
1358       contains_all_shader_state(tu_pipeline_to_graphics_lib(pipeline)->state);
1359 }
1360 
1361 /* Return true if this pipeline contains all of the GPL stages listed but none
1362  * of the libraries it uses do, so this is "the first time" that all of them
1363  * are defined together. This is useful for state that needs to be combined
1364  * from multiple GPL stages.
1365  */
1366 
1367 static bool
1368 set_combined_state(struct tu_pipeline_builder *builder,
1369                    struct tu_pipeline *pipeline,
1370                    VkGraphicsPipelineLibraryFlagsEXT state)
1371 {
1372    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB &&
1373        (tu_pipeline_to_graphics_lib(pipeline)->state & state) != state)
1374       return false;
1375 
1376    for (unsigned i = 0; i < builder->num_libraries; i++) {
1377       if ((builder->libraries[i]->state & state) == state)
1378          return false;
1379    }
1380 
1381    return true;
1382 }
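/* For example, a pipeline that links two libraries, one carrying only
 * PRE_RASTERIZATION_SHADERS and one carrying only FRAGMENT_SHADER, gets true
 * here for the combined shader-state mask, since neither library alone had
 * both bits. A pipeline wrapping a single library that already had both bits
 * gets false, because that library already emitted the combined state.
 */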
1383 
1384 #define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
1385 
1386 static VkResult
1387 tu_pipeline_allocate_cs(struct tu_device *dev,
1388                         struct tu_pipeline *pipeline,
1389                         struct tu_pipeline_layout *layout,
1390                         struct tu_pipeline_builder *builder,
1391                         const struct ir3_shader_variant *compute)
1392 {
1393    uint32_t size = 1024;
1394 
1395    /* graphics case: */
1396    if (builder) {
1397       if (builder->state &
1398           VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
1399          size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
1400       }
1401 
1402       if (set_combined_state(builder, pipeline,
1403                              VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1404                              VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
1405          size += tu6_load_state_size(pipeline, layout);
1406       }
1407    } else {
1408       size += tu6_load_state_size(pipeline, layout);
1409    }
1410 
1411    /* Allocate the space for the pipeline out of the device's RO suballocator.
1412     *
1413     * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
1414     * BOs at exec time.
1415     *
1416     * The pipeline cache would seem like a natural place to stick the
1417     * suballocator, except that it is not guaranteed to outlive the pipelines
1418     * created from it, so you can't store any long-lived state there, and you
1419     * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
1420     * pipeline destroy isn't synchronized by the cache.
1421     */
1422    mtx_lock(&dev->pipeline_mutex);
1423    VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
1424                                           size * 4, 128);
1425    mtx_unlock(&dev->pipeline_mutex);
1426    if (result != VK_SUCCESS)
1427       return result;
1428 
1429    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &pipeline->bo);
1430    tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
1431 
1432    return VK_SUCCESS;
1433 }
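/* Rough sizing sketch for the allocation above: the fixed 1024 dwords cover
 * the draw states emitted into pipeline->cs, TU6_EMIT_VERTEX_INPUT_MAX_DWORDS
 * (MAX_VERTEX_ATTRIBS * 2 + 1) is added only when this pipeline owns vertex
 * input state, and tu6_load_state_size() is added once all shader stages are
 * first combined, so the suballocation is sized as an upper bound rather than
 * an exact fit.
 */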
1434 
1435 static void
1436 tu_append_executable(struct tu_pipeline *pipeline,
1437                      const struct ir3_shader_variant *variant,
1438                      char *nir_from_spirv)
1439 {
1440    struct tu_pipeline_executable exe = {
1441       .stage = variant->type,
1442       .stats = variant->info,
1443       .is_binning = variant->binning_pass,
1444       .nir_from_spirv = nir_from_spirv,
1445       .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
1446       .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
1447    };
1448 
1449    util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
1450 }
1451 
1452 static void
1453 tu_hash_stage(struct mesa_sha1 *ctx,
1454               VkPipelineCreateFlags2KHR pipeline_flags,
1455               const VkPipelineShaderStageCreateInfo *stage,
1456               const nir_shader *nir,
1457               const struct tu_shader_key *key)
1458 {
1459 
1460    if (nir) {
1461       struct blob blob;
1462       blob_init(&blob);
1463       nir_serialize(&blob, nir, true);
1464       _mesa_sha1_update(ctx, blob.data, blob.size);
1465       blob_finish(&blob);
1466    } else {
1467       unsigned char stage_hash[SHA1_DIGEST_LENGTH];
1468       vk_pipeline_hash_shader_stage(pipeline_flags, stage, NULL, stage_hash);
1469       _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
1470    }
1471    _mesa_sha1_update(ctx, key, sizeof(*key));
1472 }
1473 
1474 static void
1475 tu_hash_shaders(unsigned char *hash,
1476                 VkPipelineCreateFlags2KHR pipeline_flags,
1477                 const VkPipelineShaderStageCreateInfo **stages,
1478                 nir_shader *const *nir,
1479                 const struct tu_pipeline_layout *layout,
1480                 const struct tu_shader_key *keys,
1481                 VkGraphicsPipelineLibraryFlagsEXT state)
1482 {
1483    struct mesa_sha1 ctx;
1484 
1485    _mesa_sha1_init(&ctx);
1486 
1487    if (layout)
1488       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1489 
1490    for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
1491       if (stages[i] || nir[i]) {
1492          tu_hash_stage(&ctx, pipeline_flags, stages[i], nir[i], &keys[i]);
1493       }
1494    }
1495    _mesa_sha1_update(&ctx, &state, sizeof(state));
1496    enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
1497    _mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
1498    _mesa_sha1_final(&ctx, hash);
1499 }
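/* The resulting SHA1 therefore covers: the pipeline layout hash, each active
 * stage (its serialized NIR bytes when a library already provided NIR,
 * otherwise the stage hash from vk_pipeline_hash_shader_stage) together with
 * its tu_shader_key, the GPL state bits being compiled, and the ir3 debug
 * flags that can affect codegen. Changing any of these yields a different
 * cache key.
 */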
1500 
1501 static void
1502 tu_hash_compute(unsigned char *hash,
1503                 VkPipelineCreateFlags2KHR pipeline_flags,
1504                 const VkPipelineShaderStageCreateInfo *stage,
1505                 const struct tu_pipeline_layout *layout,
1506                 const struct tu_shader_key *key)
1507 {
1508    struct mesa_sha1 ctx;
1509 
1510    _mesa_sha1_init(&ctx);
1511 
1512    if (layout)
1513       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1514 
1515    tu_hash_stage(&ctx, pipeline_flags, stage, NULL, key);
1516    enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
1517    _mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
1518 
1519    _mesa_sha1_final(&ctx, hash);
1520 }
1521 
1522 static struct tu_shader *
1523 tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
1524                          const void *key_data, size_t key_size,
1525                          bool *application_cache_hit)
1526 {
1527    struct vk_pipeline_cache_object *object =
1528       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1529                                       &tu_shader_ops, application_cache_hit);
1530    if (object)
1531       return container_of(object, struct tu_shader, base);
1532    else
1533       return NULL;
1534 }
1535 
1536 static struct tu_shader *
1537 tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
1538                          struct tu_shader *shader)
1539 {
1540    struct vk_pipeline_cache_object *object =
1541       vk_pipeline_cache_add_object(cache, &shader->base);
1542    return container_of(object, struct tu_shader, base);
1543 }
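/* Note: vk_pipeline_cache_add_object() may hand back an already-cached object
 * with the same key instead of the one passed in (deduplication), so callers
 * must keep using the returned tu_shader rather than their original pointer.
 */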
1544 
1545 static bool
1546 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1547                          struct blob *blob);
1548 
1549 static struct vk_pipeline_cache_object *
1550 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1551                            const void *key_data,
1552                            size_t key_size,
1553                            struct blob_reader *blob);
1554 
1555 static void
1556 tu_nir_shaders_destroy(struct vk_device *device,
1557                        struct vk_pipeline_cache_object *object)
1558 {
1559    struct tu_nir_shaders *shaders =
1560       container_of(object, struct tu_nir_shaders, base);
1561 
1562    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
1563       ralloc_free(shaders->nir[i]);
1564 
1565    vk_pipeline_cache_object_finish(&shaders->base);
1566    vk_free(&device->alloc, shaders);
1567 }
1568 
1569 const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
1570    .serialize = tu_nir_shaders_serialize,
1571    .deserialize = tu_nir_shaders_deserialize,
1572    .destroy = tu_nir_shaders_destroy,
1573 };
1574 
1575 static struct tu_nir_shaders *
1576 tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
1577 {
1578    VK_MULTIALLOC(ma);
1579    VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
1580    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
1581 
1582    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
1583                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
1584       return NULL;
1585 
1586    memcpy(obj_key_data, key_data, key_size);
1587    vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
1588                                  &tu_nir_shaders_ops, obj_key_data, key_size);
1589 
1590    return shaders;
1591 }
1592 
1593 static bool
1594 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1595                          struct blob *blob)
1596 {
1597    struct tu_nir_shaders *shaders =
1598       container_of(object, struct tu_nir_shaders, base);
1599 
1600    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1601       if (shaders->nir[i]) {
1602          blob_write_uint8(blob, 1);
1603          nir_serialize(blob, shaders->nir[i], true);
1604       } else {
1605          blob_write_uint8(blob, 0);
1606       }
1607    }
1608 
1609    return true;
1610 }
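/* Resulting blob layout, mirroring the loop above and consumed by
 * tu_nir_shaders_deserialize() below:
 *
 *    for each gl_shader_stage slot:
 *       u8  present              1 if NIR was retained for this stage, else 0
 *       nir_serialize() payload  only when present == 1
 */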
1611 
1612 static struct vk_pipeline_cache_object *
1613 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1614                            const void *key_data,
1615                            size_t key_size,
1616                            struct blob_reader *blob)
1617 {
1618    struct tu_device *dev =
1619       container_of(cache->base.device, struct tu_device, vk);
1620    struct tu_nir_shaders *shaders =
1621       tu_nir_shaders_init(dev, key_data, key_size);
1622 
1623    if (!shaders)
1624       return NULL;
1625 
1626    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1627       if (blob_read_uint8(blob)) {
1628          shaders->nir[i] =
1629             nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
1630       }
1631    }
1632 
1633    return &shaders->base;
1634 }
1635 
1636 static struct tu_nir_shaders *
1637 tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
1638                     const void *key_data, size_t key_size,
1639                     bool *application_cache_hit)
1640 {
1641    struct vk_pipeline_cache_object *object =
1642       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1643                                       &tu_nir_shaders_ops, application_cache_hit);
1644    if (object)
1645       return container_of(object, struct tu_nir_shaders, base);
1646    else
1647       return NULL;
1648 }
1649 
1650 static struct tu_nir_shaders *
1651 tu_nir_cache_insert(struct vk_pipeline_cache *cache,
1652                     struct tu_nir_shaders *shaders)
1653 {
1654    struct vk_pipeline_cache_object *object =
1655       vk_pipeline_cache_add_object(cache, &shaders->base);
1656    return container_of(object, struct tu_nir_shaders, base);
1657 }
1658 
1659 static VkResult
1660 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
1661                                     struct tu_pipeline *pipeline)
1662 {
1663    VkResult result = VK_SUCCESS;
1664    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
1665       NULL
1666    };
1667    VkPipelineCreationFeedback pipeline_feedback = {
1668       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
1669    };
1670    VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
1671 
1672    const bool executable_info =
1673       builder->create_flags &
1674       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
1675 
1676    bool retain_nir =
1677       builder->create_flags &
1678       VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
1679 
1680    int64_t pipeline_start = os_time_get_nano();
1681 
1682    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
1683       vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
1684 
1685    bool must_compile = false;
1686    for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
1687       if (!(builder->active_stages & builder->create_info->pStages[i].stage))
1688          continue;
1689 
1690       gl_shader_stage stage =
1691          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
1692       stage_infos[stage] = &builder->create_info->pStages[i];
1693       must_compile = true;
1694    }
1695 
1696    /* Forward declare everything due to the goto usage */
1697    nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
1698    struct tu_shader *shaders[ARRAY_SIZE(stage_infos)] = { NULL };
1699    nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
1700    char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
1701    bool cache_hit = false;
1702 
1703    struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
1704    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1705         stage < ARRAY_SIZE(keys); stage = (gl_shader_stage) (stage+1)) {
1706       const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info = NULL;
1707       if (stage_infos[stage])
1708          subgroup_info = vk_find_struct_const(stage_infos[stage],
1709                                               PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
1710       bool allow_varying_subgroup_size =
1711          !stage_infos[stage] ||
1712          (stage_infos[stage]->flags &
1713           VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
1714       bool require_full_subgroups =
1715          stage_infos[stage] &&
1716          (stage_infos[stage]->flags &
1717           VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
1718       tu_shader_key_subgroup_size(&keys[stage], allow_varying_subgroup_size,
1719                                   require_full_subgroups, subgroup_info,
1720                                   builder->device);
1721 
1722       if (stage_infos[stage]) {
1723          struct vk_pipeline_robustness_state rs;
1724          vk_pipeline_robustness_state_fill(&builder->device->vk, &rs,
1725                                            builder->create_info->pNext,
1726                                            stage_infos[stage]->pNext);
1727          tu_shader_key_robustness(&keys[stage], &rs);
1728          if (builder->create_flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR)
1729             keys[stage].lower_view_index_to_device_index = true;
1730       }
1731    }
1732 
1733    if ((builder->state &
1734         VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
1735        builder->graphics_state.ial &&
1736        builder->create_info->renderPass == VK_NULL_HANDLE) {
1737       const struct vk_input_attachment_location_state *ial =
1738          builder->graphics_state.ial;
1739 
1740       keys[MESA_SHADER_FRAGMENT].dynamic_renderpass = true;
1741 
1742       uint32_t attachments_referenced = 0;
1743 
1744       if (ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN) {
1745          attachments_referenced |=
1746             BITFIELD_MASK(MAX_RTS) << TU_DYN_INPUT_ATT_OFFSET;
1747       } else {
1748          for (unsigned i = 0; i < ial->color_attachment_count; i++) {
1749             if (ial->color_map[i] != MESA_VK_ATTACHMENT_UNUSED) {
1750                attachments_referenced |=
1751                   (1u << (ial->color_map[i] + TU_DYN_INPUT_ATT_OFFSET));
1752             }
1753          }
1754       }
1755 
1756       if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
1757          if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX)
1758             attachments_referenced |= 1;
1759          else
1760             attachments_referenced |= 1u << (ial->depth_att + 1);
1761       }
1762 
1763       if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
1764          if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX)
1765             attachments_referenced |= 1;
1766          else
1767             attachments_referenced |= 1u << (ial->stencil_att + 1);
1768       }
1769 
1770       keys[MESA_SHADER_FRAGMENT].read_only_input_attachments =
1771          ~attachments_referenced;
1772    }
1773 
1774    if (builder->create_flags &
1775        VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
1776       for (unsigned i = 0; i < builder->num_libraries; i++) {
1777          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1778 
1779          for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
1780             if (library->shaders[j].nir) {
1781                assert(!nir[j]);
1782                nir[j] = nir_shader_clone(builder->mem_ctx,
1783                      library->shaders[j].nir);
1784                keys[j] = library->shaders[j].key;
1785                must_compile = true;
1786             }
1787          }
1788       }
1789    }
1790 
1791    struct tu_nir_shaders *nir_shaders = NULL;
1792    if (!must_compile)
1793       goto done;
1794 
1795    if (builder->state &
1796        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1797       keys[MESA_SHADER_VERTEX].multiview_mask =
1798          builder->graphics_state.rp->view_mask;
1799    }
1800 
1801    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1802       keys[MESA_SHADER_FRAGMENT].multiview_mask =
1803          builder->graphics_state.rp->view_mask;
1804       keys[MESA_SHADER_FRAGMENT].fragment_density_map =
1805          builder->fragment_density_map;
1806       keys[MESA_SHADER_FRAGMENT].unscaled_input_fragcoord =
1807          builder->unscaled_input_fragcoord;
1808 
1809       const VkPipelineMultisampleStateCreateInfo *msaa_info =
1810          builder->create_info->pMultisampleState;
1811 
1812       /* The 1.3.215 spec says:
1813        *
1814        *    Sample shading can be used to specify a minimum number of unique
1815        *    samples to process for each fragment. If sample shading is enabled,
1816        *    an implementation must provide a minimum of
1817        *
1818        *       max(ceil(minSampleShadingFactor * totalSamples), 1)
1819        *
1820        *    unique associated data for each fragment, where
1821        *    minSampleShadingFactor is the minimum fraction of sample shading.
1822        *
1823        * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
1824        * They both require unique associated data.
1825        *
1826        * There are discussions to change the definition, such that
1827        * sampleShadingEnable does not imply unique associated data.  Before the
1828        * discussions are settled and before apps (i.e., ANGLE) are fixed to
1829        * follow the new and incompatible definition, we should stick to the
1830        * current definition.
1831        *
1832        * Note that ir3_shader_key::sample_shading is not actually used by ir3,
1833        * just checked in tu6_emit_fs_inputs.  We will also copy the value to
1834        * tu_shader_key::force_sample_interp in a bit.
1835        */
1836       keys[MESA_SHADER_FRAGMENT].force_sample_interp =
1837          !builder->rasterizer_discard && msaa_info && msaa_info->sampleShadingEnable;
1838    }
1839 
1840    unsigned char pipeline_sha1[20];
1841    tu_hash_shaders(pipeline_sha1, builder->create_flags, stage_infos, nir,
1842                    &builder->layout, keys, builder->state);
1843 
1844    unsigned char nir_sha1[21];
1845    memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1846    nir_sha1[20] = 'N';
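   /* Cache key scheme used below: each key is the 20-byte pipeline SHA1 plus
    * one trailing discriminator byte, either the gl_shader_stage index for
    * per-stage tu_shader objects or the literal 'N' for the retained-NIR
    * bundle, so all of them can share one vk_pipeline_cache without
    * colliding.
    */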
1847 
1848    if (!executable_info) {
1849       cache_hit = true;
1850       bool application_cache_hit = false;
1851 
1852       unsigned char shader_sha1[21];
1853       memcpy(shader_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1854 
1855       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1856            stage = (gl_shader_stage) (stage + 1)) {
1857          if (stage_infos[stage] || nir[stage]) {
1858             bool shader_application_cache_hit;
1859             shader_sha1[20] = (unsigned char) stage;
1860             shaders[stage] =
1861                tu_pipeline_cache_lookup(builder->cache, &shader_sha1,
1862                                         sizeof(shader_sha1),
1863                                         &shader_application_cache_hit);
1864             if (!shaders[stage]) {
1865                cache_hit = false;
1866                break;
1867             }
1868             application_cache_hit &= shader_application_cache_hit;
1869          }
1870       }
1871 
1872       /* If the user asks us to keep the NIR around, we need to have it for a
1873        * successful cache hit. If we only have a "partial" cache hit, then we
1874        * still need to recompile in order to get the NIR.
1875        */
1876       if (cache_hit &&
1877           (builder->create_flags &
1878            VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
1879          bool nir_application_cache_hit = false;
1880          nir_shaders =
1881             tu_nir_cache_lookup(builder->cache, &nir_sha1,
1882                                 sizeof(nir_sha1),
1883                                 &nir_application_cache_hit);
1884 
1885          application_cache_hit &= nir_application_cache_hit;
1886          cache_hit &= !!nir_shaders;
1887       }
1888 
1889       if (application_cache_hit && builder->cache != builder->device->mem_cache) {
1890          pipeline_feedback.flags |=
1891             VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
1892       }
1893    }
1894 
1895    if (!cache_hit) {
1896       if (builder->create_flags &
1897           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
1898          return VK_PIPELINE_COMPILE_REQUIRED;
1899       }
1900 
1901       result = tu_compile_shaders(builder->device,
1902                                   builder->create_flags,
1903                                   stage_infos,
1904                                   nir,
1905                                   keys,
1906                                   &builder->layout,
1907                                   pipeline_sha1,
1908                                   shaders,
1909                                   executable_info ? nir_initial_disasm : NULL,
1910                                   pipeline->executables_mem_ctx,
1911                                   retain_nir ? post_link_nir : NULL,
1912                                   stage_feedbacks);
1913 
1914       if (result != VK_SUCCESS)
1915          goto fail;
1916 
1917       if (retain_nir) {
1918          nir_shaders =
1919             tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
1920          for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1921               stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1922             if (!post_link_nir[stage])
1923                continue;
1924 
1925             nir_shaders->nir[stage] = post_link_nir[stage];
1926          }
1927 
1928          nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
1929       }
1930 
1931       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1932            stage = (gl_shader_stage) (stage + 1)) {
1933          if (!nir[stage])
1934             continue;
1935 
1936          shaders[stage] = tu_pipeline_cache_insert(builder->cache, shaders[stage]);
1937       }
1938    }
1939 
1940 done:
1941 
1942    /* Create empty shaders which contain the draw states to initialize
1943     * registers for unused shader stages.
1944     */
1945    if (builder->state &
1946        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1947       if (!shaders[MESA_SHADER_TESS_CTRL]) {
1948          shaders[MESA_SHADER_TESS_CTRL] = builder->device->empty_tcs;
1949          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_CTRL]->base);
1950       }
1951       if (!shaders[MESA_SHADER_TESS_EVAL]) {
1952          shaders[MESA_SHADER_TESS_EVAL] = builder->device->empty_tes;
1953          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_EVAL]->base);
1954       }
1955       if (!shaders[MESA_SHADER_GEOMETRY]) {
1956          shaders[MESA_SHADER_GEOMETRY] = builder->device->empty_gs;
1957          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_GEOMETRY]->base);
1958       }
1959    }
1960 
1961    if (builder->state &
1962        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1963       if (!shaders[MESA_SHADER_FRAGMENT]) {
1964          shaders[MESA_SHADER_FRAGMENT] =
1965             builder->fragment_density_map ?
1966             builder->device->empty_fs_fdm : builder->device->empty_fs;
1967          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_FRAGMENT]->base);
1968       }
1969    }
1970 
1971    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1972         stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1973       if (shaders[stage] && shaders[stage]->variant) {
1974          tu_append_executable(pipeline, shaders[stage]->variant,
1975                               nir_initial_disasm[stage]);
1976       }
1977    }
1978 
1979    /* We may have deduplicated a cache entry, in which case our original
1980     * post_link_nir may be gone.
1981     */
1982    if (nir_shaders) {
1983       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1984            stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1985          if (nir_shaders->nir[stage]) {
1986             post_link_nir[stage] = nir_shaders->nir[stage];
1987          }
1988       }
1989    }
1990 
1991    /* In the case where we're building a library without link-time
1992     * optimization but with sub-libraries that retain LTO info, we should
1993     * retain it ourselves in case another pipeline includes us with LTO.
1994     */
1995    for (unsigned i = 0; i < builder->num_libraries; i++) {
1996       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1997       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1998            stage < ARRAY_SIZE(library->shaders);
1999            stage = (gl_shader_stage) (stage + 1)) {
2000          if (!post_link_nir[stage] && library->shaders[stage].nir) {
2001             post_link_nir[stage] = library->shaders[stage].nir;
2002             keys[stage] = library->shaders[stage].key;
2003          }
2004 
2005          if (!shaders[stage] && library->base.shaders[stage]) {
2006             shaders[stage] = library->base.shaders[stage];
2007             vk_pipeline_cache_object_ref(&shaders[stage]->base);
2008          }
2009       }
2010    }
2011 
2012    if (shaders[MESA_SHADER_VERTEX]) {
2013       const struct ir3_shader_variant *vs =
2014          shaders[MESA_SHADER_VERTEX]->variant;
2015 
2016       if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
2017          tu_append_executable(pipeline, vs->binning, NULL);
2018       }
2019    }
2020 
2021    if (pipeline_contains_all_shader_state(pipeline)) {
2022       /* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
2023        * when compiling all stages, but make sure we don't leak.
2024        */
2025       if (nir_shaders)
2026          vk_pipeline_cache_object_unref(&builder->device->vk,
2027                                         &nir_shaders->base);
2028    } else {
2029       struct tu_graphics_lib_pipeline *library =
2030          tu_pipeline_to_graphics_lib(pipeline);
2031       library->nir_shaders = nir_shaders;
2032       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2033            stage < ARRAY_SIZE(library->shaders);
2034            stage = (gl_shader_stage) (stage + 1)) {
2035          library->shaders[stage].nir = post_link_nir[stage];
2036          library->shaders[stage].key = keys[stage];
2037       }
2038    }
2039 
2040    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2041         stage < ARRAY_SIZE(shaders); stage = (gl_shader_stage) (stage + 1)) {
2042       pipeline->shaders[stage] = shaders[stage];
2043       if (shaders[stage])
2044          pipeline->active_desc_sets |= shaders[stage]->active_desc_sets;
2045    }
2046 
2047    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
2048    if (creation_feedback) {
2049       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
2050 
2051       for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2052          gl_shader_stage s =
2053             vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2054          creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
2055       }
2056    }
2057 
2058    return VK_SUCCESS;
2059 
2060 fail:
2061    if (nir_shaders)
2062       vk_pipeline_cache_object_unref(&builder->device->vk,
2063                                      &nir_shaders->base);
2064 
2065    return result;
2066 }
2067 
2068 static void
2069 tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
2070                                     struct tu_pipeline *pipeline)
2071 {
2072    const VkPipelineLibraryCreateInfoKHR *library_info =
2073       vk_find_struct_const(builder->create_info->pNext,
2074                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
2075 
2076    if (library_info) {
2077       assert(library_info->libraryCount <= MAX_LIBRARIES);
2078       builder->num_libraries = library_info->libraryCount;
2079       for (unsigned i = 0; i < library_info->libraryCount; i++) {
2080          VK_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
2081          builder->libraries[i] = tu_pipeline_to_graphics_lib(library);
2082       }
2083    }
2084 
2085    /* Merge in the state from libraries. The program state is a bit special
2086     * and is handled separately.
2087     */
2088    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2089       tu_pipeline_to_graphics_lib(pipeline)->state = builder->state;
2090    for (unsigned i = 0; i < builder->num_libraries; i++) {
2091       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2092       if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2093          tu_pipeline_to_graphics_lib(pipeline)->state |= library->state;
2094 
2095       if (library->state &
2096           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
2097          pipeline->output = library->base.output;
2098          pipeline->lrz_blend.reads_dest |= library->base.lrz_blend.reads_dest;
2099          pipeline->lrz_blend.valid |= library->base.lrz_blend.valid;
2100          pipeline->prim_order = library->base.prim_order;
2101       }
2102 
2103       if ((library->state &
2104            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
2105           (library->state &
2106            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
2107          pipeline->prim_order = library->base.prim_order;
2108       }
2109 
2110       pipeline->set_state_mask |= library->base.set_state_mask;
2111 
2112       u_foreach_bit (i, library->base.set_state_mask) {
2113          pipeline->dynamic_state[i] = library->base.dynamic_state[i];
2114       }
2115 
2116       if (contains_all_shader_state(library->state)) {
2117          pipeline->program = library->base.program;
2118          pipeline->load_state = library->base.load_state;
2119          for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
2120             if (library->base.shaders[i]) {
2121                pipeline->shaders[i] = library->base.shaders[i];
2122                vk_pipeline_cache_object_ref(&pipeline->shaders[i]->base);
2123             }
2124          }
2125       }
2126 
2127       BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask,
2128                 library->base.static_state_mask);
2129 
2130       vk_graphics_pipeline_state_merge(&builder->graphics_state,
2131                                        &library->graphics_state);
2132    }
2133 }
2134 
2135 static void
2136 tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
2137                                  struct tu_pipeline *pipeline)
2138 {
2139    VK_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
2140 
2141    if (layout) {
2142       /* Note: it's still valid to have a layout even if there are libraries.
2143        * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
2144        * a non-INDEPENDENT_SET layout which may make us use a faster path,
2145        * currently this just affects dynamic offset descriptors.
2146        */
2147       builder->layout = *layout;
2148    } else {
2149       for (unsigned i = 0; i < builder->num_libraries; i++) {
2150          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2151          builder->layout.num_sets = MAX2(builder->layout.num_sets,
2152                                          library->num_sets);
2153          assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
2154          for (unsigned j = 0; j < library->num_sets; j++) {
2155             builder->layout.set[j].layout = library->layouts[j];
2156          }
2157 
2158          builder->layout.push_constant_size = library->push_constant_size;
2159       }
2160 
2161       tu_pipeline_layout_init(&builder->layout);
2162    }
2163 
2164    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
2165       struct tu_graphics_lib_pipeline *library =
2166          tu_pipeline_to_graphics_lib(pipeline);
2167       library->num_sets = builder->layout.num_sets;
2168       for (unsigned i = 0; i < library->num_sets; i++) {
2169          library->layouts[i] = builder->layout.set[i].layout;
2170          if (library->layouts[i])
2171             vk_descriptor_set_layout_ref(&library->layouts[i]->vk);
2172       }
2173       library->push_constant_size = builder->layout.push_constant_size;
2174    }
2175 }
2176 
2177 static void
2178 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2179                         struct tu_const_state *const_state,
2180                         const struct ir3_shader_variant *v)
2181 {
2182    link->const_state = *ir3_const_state(v);
2183    link->tu_const_state = *const_state;
2184    link->constlen = v->constlen;
2185 }
2186 
2187 template <chip CHIP>
2188 static void
2189 tu_emit_program_state(struct tu_cs *sub_cs,
2190                       struct tu_program_state *prog,
2191                       struct tu_shader **shaders)
2192 {
2193    struct tu_device *dev = sub_cs->device;
2194    struct tu_cs prog_cs;
2195 
2196    const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
2197    struct tu_draw_state draw_states[MESA_SHADER_STAGES];
2198 
2199    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2200         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2201       variants[stage] = shaders[stage] ? shaders[stage]->variant : NULL;
2202    }
2203 
2204    uint32_t safe_variants =
2205       ir3_trim_constlen(variants, dev->compiler);
2206 
2207    unsigned dynamic_descriptor_sizes[MAX_SETS] = { };
2208 
2209    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2210         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2211       if (shaders[stage]) {
2212          if (safe_variants & (1u << stage)) {
2213             variants[stage] = shaders[stage]->safe_const_variant;
2214             draw_states[stage] = shaders[stage]->safe_const_state;
2215          } else {
2216             draw_states[stage] = shaders[stage]->state;
2217          }
2218 
2219          for (unsigned i = 0; i < MAX_SETS; i++) {
2220             if (shaders[stage]->dynamic_descriptor_sizes[i] >= 0) {
2221                dynamic_descriptor_sizes[i] =
2222                   shaders[stage]->dynamic_descriptor_sizes[i];
2223             }
2224          }
2225       }
2226    }
2227 
2228    for (unsigned i = 0; i < ARRAY_SIZE(variants); i++) {
2229       if (!variants[i])
2230          continue;
2231 
2232       tu_pipeline_set_linkage(&prog->link[i],
2233                               &shaders[i]->const_state,
2234                               variants[i]);
2235 
2236       struct tu_push_constant_range *push_consts =
2237          &shaders[i]->const_state.push_consts;
2238       if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
2239           push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
2240          prog->shared_consts = *push_consts;
2241       }
2242    }
2243 
2244    unsigned dynamic_descriptor_offset = 0;
2245    for (unsigned i = 0; i < MAX_SETS; i++) {
2246       prog->dynamic_descriptor_offsets[i] = dynamic_descriptor_offset;
2247       dynamic_descriptor_offset += dynamic_descriptor_sizes[i];
2248    }
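   /* Illustrative example: dynamic_descriptor_sizes = { 32, 0, 16, ... }
    * produces the prefix sums { 0, 32, 32, 48, ... }, i.e. each set's dynamic
    * descriptors start right after the previous set's in the flat dynamic
    * descriptor area.
    */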
2249 
2250    /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
2251     * else that could depend on that state (like push constants)
2252     *
2253     * Note also that this always uses the full VS even in binning pass.  The
2254     * binning pass variant has the same const layout as the full VS, and
2255     * the constlen for the VS will be the same or greater than the constlen
2256     * for the binning pass variant.  It is required that the constlen state
2257     * matches between binning and draw passes, as some parts of the push
2258     * consts are emitted in state groups that are shared between the binning
2259     * and draw passes.
2260     */
2261    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2262    tu6_emit_program_config<CHIP>(&prog_cs, prog, shaders, variants);
2263    prog->config_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2264 
2265    prog->vs_state = draw_states[MESA_SHADER_VERTEX];
2266 
2267   /* Don't use the binning pass variant when GS is present because we don't
2268    * support compiling correct binning pass variants with GS.
2269    */
2270    if (variants[MESA_SHADER_GEOMETRY]) {
2271       prog->vs_binning_state = prog->vs_state;
2272    } else {
2273       prog->vs_binning_state =
2274          shaders[MESA_SHADER_VERTEX]->binning_state;
2275    }
2276 
2277    prog->hs_state = draw_states[MESA_SHADER_TESS_CTRL];
2278    prog->ds_state = draw_states[MESA_SHADER_TESS_EVAL];
2279    prog->gs_state = draw_states[MESA_SHADER_GEOMETRY];
2280    prog->gs_binning_state =
2281       shaders[MESA_SHADER_GEOMETRY]->binning_state;
2282    prog->fs_state = draw_states[MESA_SHADER_FRAGMENT];
2283 
2284    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
2285    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
2286    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
2287    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
2288    const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
2289 
2290    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2291    tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
2292    prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2293 
2294    const struct ir3_shader_variant *last_shader;
2295    if (gs)
2296       last_shader = gs;
2297    else if (ds)
2298       last_shader = ds;
2299    else
2300       last_shader = vs;
2301 
2302    prog->per_view_viewport =
2303       !last_shader->writes_viewport &&
2304       shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm &&
2305       dev->physical_device->info->a6xx.has_per_view_viewport;
2306    prog->writes_shading_rate = last_shader->writes_shading_rate;
2307    prog->reads_shading_rate = fs->reads_shading_rate;
2308    prog->accesses_smask = fs->reads_smask || fs->writes_smask;
2309 }
2310 
2311 static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = {
2312    MESA_VK_DYNAMIC_VI,
2313 };
2314 
2315 template <chip CHIP>
2316 static unsigned
2317 tu6_vertex_input_size(struct tu_device *dev,
2318                       const struct vk_vertex_input_state *vi)
2319 {
2320    return 1 + 2 * util_last_bit(vi->attributes_valid);
2321 }
2322 
2323 template <chip CHIP>
2324 static void
2325 tu6_emit_vertex_input(struct tu_cs *cs,
2326                       const struct vk_vertex_input_state *vi)
2327 {
2328    unsigned attr_count = util_last_bit(vi->attributes_valid);
2329    if (attr_count != 0)
2330       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2);
2331 
2332    for (uint32_t loc = 0; loc < attr_count; loc++) {
2333       const struct vk_vertex_attribute_state *attr = &vi->attributes[loc];
2334 
2335       if (vi->attributes_valid & (1u << loc)) {
2336          const struct vk_vertex_binding_state *binding =
2337             &vi->bindings[attr->binding];
2338 
2339          enum pipe_format pipe_format = vk_format_to_pipe_format(attr->format);
2340          const struct tu_native_format format = tu6_format_vtx(pipe_format);
2341          tu_cs_emit(cs, A6XX_VFD_DECODE_INSTR(0,
2342                           .idx = attr->binding,
2343                           .offset = attr->offset,
2344                           .instanced = binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2345                           .format = format.fmt,
2346                           .swap = format.swap,
2347                           .unk30 = 1,
2348                           ._float = !util_format_is_pure_integer(pipe_format)).value);
2349          tu_cs_emit(cs, A6XX_VFD_DECODE_STEP_RATE(0, binding->divisor).value);
2350       } else {
2351          tu_cs_emit(cs, 0);
2352          tu_cs_emit(cs, 0);
2353       }
2354    }
2355 }
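/* Example of the two dwords emitted per attribute slot above (assumed values,
 * for illustration only): a VK_FORMAT_R32G32B32_SFLOAT attribute at location
 * 0, binding 0, offset 12 with per-vertex input rate becomes one
 * VFD_DECODE_INSTR dword (idx = 0, offset = 12, instanced = false, float
 * fetch) followed by one VFD_DECODE_STEP_RATE dword carrying that binding's
 * divisor, while unused locations below the highest valid one get a pair of
 * zero dwords.
 */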
2356 
2357 static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = {
2358    MESA_VK_DYNAMIC_VI_BINDINGS_VALID,
2359    MESA_VK_DYNAMIC_VI_BINDING_STRIDES,
2360 };
2361 
2362 template <chip CHIP>
2363 static unsigned
2364 tu6_vertex_stride_size(struct tu_device *dev,
2365                        const struct vk_vertex_input_state *vi)
2366 {
2367    return 1 + 2 * util_last_bit(vi->bindings_valid);
2368 }
2369 
2370 template <chip CHIP>
2371 static void
2372 tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi)
2373 {
2374    if (vi->bindings_valid) {
2375       unsigned bindings_count = util_last_bit(vi->bindings_valid);
2376       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2377       for (unsigned i = 0; i < bindings_count; i++) {
2378          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2379          tu_cs_emit(cs, vi->bindings[i].stride);
2380       }
2381    }
2382 }
2383 
2384 template <chip CHIP>
2385 static unsigned
2386 tu6_vertex_stride_size_dyn(struct tu_device *dev,
2387                            const uint16_t *vi_binding_stride,
2388                            uint32_t bindings_valid)
2389 {
2390    return 1 + 2 * util_last_bit(bindings_valid);
2391 }
2392 
2393 template <chip CHIP>
2394 static void
2395 tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride,
2396                            uint32_t bindings_valid)
2397 {
2398    if (bindings_valid) {
2399       unsigned bindings_count = util_last_bit(bindings_valid);
2400       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2401       for (unsigned i = 0; i < bindings_count; i++) {
2402          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2403          tu_cs_emit(cs, vi_binding_stride[i]);
2404       }
2405    }
2406 }
2407 
2408 static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = {
2409    MESA_VK_DYNAMIC_VP_VIEWPORTS,
2410    MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT,
2411    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
2412    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
2413 };
2414 
2415 template <chip CHIP>
2416 static unsigned
2417 tu6_viewport_size(struct tu_device *dev,
2418                   const struct vk_viewport_state *vp,
2419                   const struct vk_rasterization_state *rs)
2420 {
2421    return 1 + vp->viewport_count * 6 + 1 + vp->viewport_count * 2 +
2422       1 + vp->viewport_count * 2 + 5;
2423 }
2424 
2425 template <chip CHIP>
2426 static void
2427 tu6_emit_viewport(struct tu_cs *cs,
2428                   const struct vk_viewport_state *vp,
2429                   const struct vk_rasterization_state *rs)
2430 {
2431    VkExtent2D guardband = {511, 511};
2432 
2433    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), vp->viewport_count * 6);
2434    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2435       const VkViewport *viewport = &vp->viewports[i];
2436       float offsets[3];
2437       float scales[3];
2438       scales[0] = viewport->width / 2.0f;
2439       scales[1] = viewport->height / 2.0f;
2440       if (vp->depth_clip_negative_one_to_one) {
2441          scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
2442       } else {
2443          scales[2] = viewport->maxDepth - viewport->minDepth;
2444       }
2445 
2446       offsets[0] = viewport->x + scales[0];
2447       offsets[1] = viewport->y + scales[1];
2448       if (vp->depth_clip_negative_one_to_one) {
2449          offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
2450       } else {
2451          offsets[2] = viewport->minDepth;
2452       }
2453 
2454       for (uint32_t j = 0; j < 3; j++) {
2455          tu_cs_emit(cs, fui(offsets[j]));
2456          tu_cs_emit(cs, fui(scales[j]));
2457       }
2458 
2459       guardband.width =
2460          MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
2461       guardband.height =
2462          MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
2463    }
2464 
2465    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), vp->viewport_count * 2);
2466    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2467       const VkViewport *viewport = &vp->viewports[i];
2468       VkOffset2D min;
2469       VkOffset2D max;
2470       min.x = (int32_t) viewport->x;
2471       max.x = (int32_t) ceilf(viewport->x + viewport->width);
2472       if (viewport->height >= 0.0f) {
2473          min.y = (int32_t) viewport->y;
2474          max.y = (int32_t) ceilf(viewport->y + viewport->height);
2475       } else {
2476          min.y = (int32_t)(viewport->y + viewport->height);
2477          max.y = (int32_t) ceilf(viewport->y);
2478       }
2479       /* the spec allows viewport->height to be 0.0f */
2480       if (min.y == max.y)
2481          max.y++;
2482       /* allow viewport->width = 0.0f for un-initialized viewports: */
2483       if (min.x == max.x)
2484          max.x++;
2485 
2486       min.x = MAX2(min.x, 0);
2487       min.y = MAX2(min.y, 0);
2488       max.x = MAX2(max.x, 1);
2489       max.y = MAX2(max.y, 1);
2490 
2491       assert(min.x < max.x);
2492       assert(min.y < max.y);
2493 
2494       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
2495                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
2496       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
2497                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
2498    }
2499 
2500    /* A7XX+ doesn't clamp to [0,1] when depth clamp is disabled, so to support
2501     * VK_EXT_depth_clamp_zero_one we have to always enable clamping and manually
2502     * set the range to [0,1] when rs->depth_clamp_enable is false.
2503     */
2504    bool zero_one_depth_clamp = CHIP >= A7XX && !rs->depth_clamp_enable;
2505 
2506    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), vp->viewport_count * 2);
2507    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2508       const VkViewport *viewport = &vp->viewports[i];
2509       if (zero_one_depth_clamp) {
2510          tu_cs_emit(cs, fui(0.0f));
2511          tu_cs_emit(cs, fui(1.0f));
2512       } else {
2513          tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
2514          tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
2515       }
2516    }
2517    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
2518    tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
2519                   A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
2520 
2521    /* TODO: what to do about this with multiple viewports? */
2522    float z_clamp_min = vp->viewport_count ? MIN2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2523    float z_clamp_max = vp->viewport_count ? MAX2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2524    if (zero_one_depth_clamp) {
2525       z_clamp_min = 0.0f;
2526       z_clamp_max = 1.0f;
2527    }
2528 
2529    tu_cs_emit_regs(cs,
2530                    A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
2531                    A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
2532 }
2533 
2534 struct apply_viewport_state {
2535    struct vk_viewport_state vp;
2536    struct vk_rasterization_state rs;
2537    bool share_scale;
2538 };
2539 
2540 /* It's a hardware restriction that the window offset (i.e. bin.offset) must
2541  * be the same for all views. This means that GMEM coordinates cannot be a
2542  * simple scaling of framebuffer coordinates, because this would require us to
2543  * scale the window offset and the scale may be different per view. Instead we
2544  * have to apply a per-bin offset to the GMEM coordinate transform to make
2545  * sure that the window offset maps to itself. Specifically we need an offset
2546  * o to the transform:
2547  *
2548  * x' = s * x + o
2549  *
2550  * so that when we plug in the bin start b_s:
2551  *
2552  * b_s = s * b_s + o
2553  *
2554  * and we get:
2555  *
2556  * o = b_s - s * b_s
2557  *
2558  * We use this form exactly, because we know the bin offset is a multiple of
2559  * the frag area so s * b_s is an integer and we can compute an exact result
2560  * easily.
2561  */
2562 
2563 VkOffset2D
2564 tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
2565 {
2566    assert(bin.offset.x % frag_area.width == 0);
2567    assert(bin.offset.y % frag_area.height == 0);
2568 
2569    return (VkOffset2D) {
2570       bin.offset.x - bin.offset.x / frag_area.width,
2571       bin.offset.y - bin.offset.y / frag_area.height
2572    };
2573 }
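/* Numeric example (illustrative): with frag_area = 2x2 and a bin starting at
 * (96, 64), the scale is s = 1/2 and the offset comes out to
 * o = (96 - 96/2, 64 - 64/2) = (48, 32). Plugging the bin start back into
 * x' = s * x + o maps 96 -> 96 and 64 -> 64, as required above.
 */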
2574 
2575 static void
2576 fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2577                     VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2578 {
2579    const struct apply_viewport_state *state =
2580       (const struct apply_viewport_state *)data;
2581 
2582    struct vk_viewport_state vp = state->vp;
2583 
2584    for (unsigned i = 0; i < state->vp.viewport_count; i++) {
2585       /* Note: If we're using shared scaling, the scale should already be the
2586        * same across all views, so we can pick any view. However the number
2587        * of viewports and number of views is not guaranteed the same, so we
2588        * need to pick the 0'th view which always exists to be safe.
2589        *
2590        * Conversely, if we're not using shared scaling then the rasterizer in
2591        * the original pipeline is using only the first viewport, so we need to
2592        * replicate it across all viewports.
2593        */
2594       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2595       VkViewport viewport =
2596          state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
2597       if (frag_area.width == 1 && frag_area.height == 1) {
2598          vp.viewports[i] = viewport;
2599          continue;
2600       }
2601 
2602       float scale_x = (float) 1.0f / frag_area.width;
2603       float scale_y = (float) 1.0f / frag_area.height;
2604 
2605       vp.viewports[i].minDepth = viewport.minDepth;
2606       vp.viewports[i].maxDepth = viewport.maxDepth;
2607       vp.viewports[i].width = viewport.width * scale_x;
2608       vp.viewports[i].height = viewport.height * scale_y;
2609 
2610       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2611 
2612       vp.viewports[i].x = scale_x * viewport.x + offset.x;
2613       vp.viewports[i].y = scale_y * viewport.y + offset.y;
2614    }
2615 
2616    TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp, &state->rs);
2617 }
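/* Sketch of the transform above with hypothetical numbers: a viewport at
 * x = 0 with width = 1920, a 2x2 fragment area and a bin offset of (96, 128)
 * gives scale_x = 0.5 and a per-bin offset of (48, 64), so the patched
 * viewport is x = 0.5 * 0 + 48 = 48 with width = 960; the depth range is
 * copied through unchanged.
 */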
2618 
2619 static void
2620 tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2621                       const struct vk_viewport_state *vp,
2622                       const struct vk_rasterization_state *rs)
2623 {
2624    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2625    struct apply_viewport_state state = {
2626       .vp = *vp,
2627       .rs = *rs,
2628       .share_scale = !cmd->state.per_view_viewport,
2629    };
2630    if (!state.share_scale)
2631       state.vp.viewport_count = num_views;
2632    unsigned size = TU_CALLX(cmd->device, tu6_viewport_size)(cmd->device, &state.vp, &state.rs);
2633    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2634    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_viewports, state);
2635 }
2636 
2637 static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = {
2638    MESA_VK_DYNAMIC_VP_SCISSORS,
2639    MESA_VK_DYNAMIC_VP_SCISSOR_COUNT,
2640 };
2641 
2642 template <chip CHIP>
2643 static unsigned
2644 tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp)
2645 {
2646    return 1 + vp->scissor_count * 2;
2647 }
2648 
2649 template <chip CHIP>
2650 void
2651 tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
2652 {
2653    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), vp->scissor_count * 2);
2654 
2655    for (uint32_t i = 0; i < vp->scissor_count; i++) {
2656       const VkRect2D *scissor = &vp->scissors[i];
2657 
2658       uint32_t min_x = scissor->offset.x;
2659       uint32_t min_y = scissor->offset.y;
2660       uint32_t max_x = min_x + scissor->extent.width - 1;
2661       uint32_t max_y = min_y + scissor->extent.height - 1;
2662 
2663       if (!scissor->extent.width || !scissor->extent.height) {
2664          min_x = min_y = 1;
2665          max_x = max_y = 0;
2666       } else {
2667          /* avoid overflow */
2668          uint32_t scissor_max = BITFIELD_MASK(15);
2669          min_x = MIN2(scissor_max, min_x);
2670          min_y = MIN2(scissor_max, min_y);
2671          max_x = MIN2(scissor_max, max_x);
2672          max_y = MIN2(scissor_max, max_y);
2673       }
2674 
2675       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
2676                      A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
2677       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
2678                      A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
2679    }
2680 }
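/* Note that the zero-extent case above encodes an empty scissor by setting
 * TL = (1, 1) and BR = (0, 0); with TL > BR no pixel can pass the scissor
 * test, which is the intended result for a zero-sized scissor.
 */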
2681 
2682 static void
2683 fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2684                    VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2685 {
2686    const struct apply_viewport_state *state =
2687       (const struct apply_viewport_state *)data;
2688 
2689    struct vk_viewport_state vp = state->vp;
2690 
2691    for (unsigned i = 0; i < vp.scissor_count; i++) {
2692       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2693       VkRect2D scissor =
2694          state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
2695       if (frag_area.width == 1 && frag_area.height == 1) {
2696          vp.scissors[i] = scissor;
2697          continue;
2698       }
2699 
2700       /* Transform the scissor following the viewport. It's unclear how this
2701        * is supposed to handle cases where the scissor isn't aligned to the
2702        * fragment area, but we round outwards to always render partial
2703        * fragments if the scissor size equals the framebuffer size and it
2704        * isn't aligned to the fragment area.
2705        */
2706       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2707       VkOffset2D min = {
2708          scissor.offset.x / frag_area.width + offset.x,
2709          scissor.offset.y / frag_area.height + offset.y,
2710       };
2711       VkOffset2D max = {
2712          DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
2713          DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
2714       };
2715 
2716       /* Intersect scissor with the scaled bin, this essentially replaces the
2717        * window scissor.
2718        */
2719       uint32_t scaled_width = bin.extent.width / frag_area.width;
2720       uint32_t scaled_height = bin.extent.height / frag_area.height;
2721       vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
2722       vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
2723       vp.scissors[i].extent.width =
2724          MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
2725       vp.scissors[i].extent.height =
2726          MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
2727    }
2728 
2729    TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
2730 }
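/* Rounding example (hypothetical numbers, ignoring the per-bin offset): a
 * scissor covering x in [3, 10) with a fragment area width of 2 yields
 * min.x = 3 / 2 = 1 (rounded down) and max.x = DIV_ROUND_UP(10, 2) = 5
 * (rounded up), so fragments that are only partially covered by the original
 * scissor are still rasterized, matching the "round outwards" policy
 * described above.
 */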
2731 
2732 static void
2733 tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2734                      const struct vk_viewport_state *vp)
2735 {
2736    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2737    struct apply_viewport_state state = {
2738       .vp = *vp,
2739       .share_scale = !cmd->state.per_view_viewport,
2740    };
2741    if (!state.share_scale)
2742       state.vp.scissor_count = num_views;
2743    unsigned size = TU_CALLX(cmd->device, tu6_scissor_size)(cmd->device, &state.vp);
2744    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2745    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_scissors, state);
2746 }
2747 
2748 static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = {
2749    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE,
2750    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS,
2751 };
2752 
2753 template <chip CHIP>
2754 static unsigned
2755 tu6_sample_locations_size(struct tu_device *dev, bool enable,
2756                           const struct vk_sample_locations_state *samp_loc)
2757 {
2758    return 6 + (enable ? 9 : 0);
2759 }
2760 
2761 template <chip CHIP>
2762 void
2763 tu6_emit_sample_locations(struct tu_cs *cs, bool enable,
2764                           const struct vk_sample_locations_state *samp_loc)
2765 {
2766    uint32_t sample_config =
2767       COND(enable, A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE);
2768 
2769    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
2770    tu_cs_emit(cs, sample_config);
2771 
2772    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
2773    tu_cs_emit(cs, sample_config);
2774 
2775    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
2776    tu_cs_emit(cs, sample_config);
2777 
2778    if (!enable)
2779       return;
2780 
2781    assert(samp_loc->grid_size.width == 1);
2782    assert(samp_loc->grid_size.height == 1);
2783 
2784    uint64_t sample_locations = 0;
2785    for (uint32_t i = 0; i < samp_loc->per_pixel; i++) {
2786       /* From VkSampleLocationEXT:
2787        *
2788        *    The values specified in a VkSampleLocationEXT structure are always
2789        *    clamped to the implementation-dependent sample location coordinate
2790        *    range
2791        *    [sampleLocationCoordinateRange[0],sampleLocationCoordinateRange[1]]
2792        */
2793       float x = CLAMP(samp_loc->locations[i].x, SAMPLE_LOCATION_MIN,
2794                       SAMPLE_LOCATION_MAX);
2795       float y = CLAMP(samp_loc->locations[i].y, SAMPLE_LOCATION_MIN,
2796                       SAMPLE_LOCATION_MAX);
2797 
2798       sample_locations |=
2799          ((uint64_t)(A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
2800                      A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y))) << i*8;
2801    }
2802 
2803    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_LOCATION_0, 2);
2804    tu_cs_emit_qw(cs, sample_locations);
2805 
2806    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_LOCATION_0, 2);
2807    tu_cs_emit_qw(cs, sample_locations);
2808 
2809    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_LOCATION_0, 2);
2810    tu_cs_emit_qw(cs, sample_locations);
2811 }
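/* Packing recap: each sample's clamped (x, y) pair occupies one byte of the
 * 64-bit value built above (sample 0 in bits [7:0], sample 1 in bits [15:8],
 * and so on), so up to 8 programmable locations fit, and the same packed
 * value is mirrored into the GRAS, RB and SP_TP register banks.
 */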
2812 
2813 static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = {
2814    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS,
2815 };
2816 
2817 template <chip CHIP>
2818 static unsigned
2819 tu6_depth_bias_size(struct tu_device *dev,
2820                     const struct vk_rasterization_state *rs)
2821 {
2822    return 4;
2823 }
2824 
2825 template <chip CHIP>
2826 void
2827 tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs)
2828 {
2829    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2830    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(rs->depth_bias.slope_factor).value);
2831    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(rs->depth_bias.constant_factor).value);
2832    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(rs->depth_bias.clamp).value);
2833 }
2834 
2835 static const enum mesa_vk_dynamic_graphics_state tu_bandwidth_state[] = {
2836    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2837    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2838    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2839    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2840    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2841    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2842 };
2843 
2844 static void
2845 tu_calc_bandwidth(struct tu_bandwidth *bandwidth,
2846                   const struct vk_color_blend_state *cb,
2847                   const struct vk_render_pass_state *rp)
2848 {
2849    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2850 
2851    uint32_t total_bpp = 0;
2852    for (unsigned i = 0; i < cb->attachment_count; i++) {
2853       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2854       if (!(cb->color_write_enables & (1u << i)))
2855          continue;
2856 
2857       const VkFormat format = rp->color_attachment_formats[i];
2858 
2859       uint32_t write_bpp = 0;
2860       if (format == VK_FORMAT_UNDEFINED) {
2861          /* do nothing */
2862       } else if (att->write_mask == 0xf) {
2863          write_bpp = vk_format_get_blocksizebits(format);
2864       } else {
2865          const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
2866          for (uint32_t i = 0; i < 4; i++) {
2867             if (att->write_mask & (1 << i)) {
2868                write_bpp += util_format_get_component_bits(pipe_format,
2869                      UTIL_FORMAT_COLORSPACE_RGB, i);
2870             }
2871          }
2872       }
2873       total_bpp += write_bpp;
2874 
2875       if (rop_reads_dst || att->blend_enable) {
2876          total_bpp += write_bpp;
2877       }
2878    }
2879 
2880    bandwidth->color_bandwidth_per_sample = total_bpp / 8;
2881 
2882    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
2883       bandwidth->depth_cpp_per_sample = util_format_get_component_bits(
2884             vk_format_to_pipe_format(rp->depth_attachment_format),
2885             UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
2886    }
2887 
2888    if (rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
2889       bandwidth->stencil_cpp_per_sample = util_format_get_component_bits(
2890             vk_format_to_pipe_format(rp->stencil_attachment_format),
2891             UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
2892    }
2893 }
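/* Rough example of the accounting above (hypothetical formats): a single
 * VK_FORMAT_R8G8B8A8_UNORM attachment with a full write mask writes 32 bits;
 * with blending enabled (or a dst-reading logic op) it is counted a second
 * time for the read, giving color_bandwidth_per_sample = 64 / 8 = 8 bytes.
 * A D24_UNORM_S8_UINT attachment would contribute depth_cpp_per_sample = 3
 * and stencil_cpp_per_sample = 1.
 */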
2894 
2895 /* Return true if the blend state reads the color attachments. */
2896 static bool
2897 tu6_calc_blend_lrz(const struct vk_color_blend_state *cb,
2898                    const struct vk_render_pass_state *rp)
2899 {
2900    if (cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op))
2901       return true;
2902 
2903    for (unsigned i = 0; i < cb->attachment_count; i++) {
2904       if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
2905          continue;
2906 
2907       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2908       if (att->blend_enable)
2909          return true;
2910       if (!(cb->color_write_enables & (1u << i)))
2911          return true;
2912       unsigned mask =
2913          MASK(vk_format_get_nr_components(rp->color_attachment_formats[i]));
2914       if ((att->write_mask & mask) != mask)
2915          return true;
2916    }
2917 
2918    return false;
2919 }
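/* For instance, an RGBA8 attachment written with write_mask = 0x7 (RGB only)
 * makes this return true: the preserved alpha channel has to come from the
 * existing contents, so the blend state effectively reads the attachment.
 */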
2920 
2921 static const enum mesa_vk_dynamic_graphics_state tu_blend_lrz_state[] = {
2922    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2923    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2924    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2925    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2926    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2927    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2928 };
2929 
2930 static void
2931 tu_emit_blend_lrz(struct tu_lrz_blend *lrz,
2932                   const struct vk_color_blend_state *cb,
2933                   const struct vk_render_pass_state *rp)
2934 {
2935    lrz->reads_dest = tu6_calc_blend_lrz(cb, rp);
2936    lrz->valid = true;
2937 }
2938 
2939 static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = {
2940    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2941    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2942    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2943    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2944    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2945    MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS,
2946    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2947    MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
2948    MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE,
2949    MESA_VK_DYNAMIC_MS_SAMPLE_MASK,
2950    MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP,
2951 };
2952 
2953 template <chip CHIP>
2954 static unsigned
2955 tu6_blend_size(struct tu_device *dev,
2956                const struct vk_color_blend_state *cb,
2957                const struct vk_color_attachment_location_state *cal,
2958                bool alpha_to_coverage_enable,
2959                bool alpha_to_one_enable,
2960                uint32_t sample_mask)
2961 {
2962    unsigned num_rts = alpha_to_coverage_enable ?
2963       MAX2(cb->attachment_count, 1) : cb->attachment_count;
2964    return 8 + 3 * num_rts;
2965 }
2966 
2967 template <chip CHIP>
2968 static void
2969 tu6_emit_blend(struct tu_cs *cs,
2970                const struct vk_color_blend_state *cb,
2971                const struct vk_color_attachment_location_state *cal,
2972                bool alpha_to_coverage_enable,
2973                bool alpha_to_one_enable,
2974                uint32_t sample_mask)
2975 {
2976    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2977    enum a3xx_rop_code rop = tu6_rop((VkLogicOp)cb->logic_op);
2978 
2979    uint32_t blend_enable_mask = 0;
2980    for (unsigned i = 0; i < cb->attachment_count; i++) {
2981       if (!(cb->color_write_enables & (1u << i)) ||
2982           cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
2983          continue;
2984 
2985       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2986 
2987       if (rop_reads_dst || att->blend_enable) {
2988          blend_enable_mask |= 1u << cal->color_map[i];
2989       }
2990    }
2991 
2992    /* This will emit a dummy RB_MRT_*_CONTROL below if alpha-to-coverage is
2993     * enabled but there are no color attachments, in addition to changing
2994     * *_FS_OUTPUT_CNTL1.
2995     */
2996    unsigned num_rts = alpha_to_coverage_enable ?
2997       MAX2(cb->attachment_count, 1) : cb->attachment_count;
2998 
2999    bool dual_src_blend = tu_blend_state_is_dual_src(cb);
3000 
3001    tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = num_rts));
3002    tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = num_rts));
3003    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
3004                                           .unk8 = true,
3005                                           .dual_color_in_enable =
3006                                              dual_src_blend,
3007                                           .alpha_to_coverage =
3008                                              alpha_to_coverage_enable));
3009    /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
3010    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
3011                                           .independent_blend = true,
3012                                           .dual_color_in_enable =
3013                                              dual_src_blend,
3014                                           .alpha_to_coverage =
3015                                              alpha_to_coverage_enable,
3016                                           .alpha_to_one = alpha_to_one_enable,
3017                                           .sample_mask = sample_mask));
3018 
3019    for (unsigned i = 0; i < num_rts; i++) {
3020       if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
3021          continue;
3022       unsigned remapped_idx = cal->color_map[i];
3023       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
3024       if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
3025          const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
3026          const enum adreno_rb_blend_factor src_color_factor =
3027             tu6_blend_factor((VkBlendFactor)att->src_color_blend_factor);
3028          const enum adreno_rb_blend_factor dst_color_factor =
3029             tu6_blend_factor((VkBlendFactor)att->dst_color_blend_factor);
3030          const enum a3xx_rb_blend_opcode alpha_op =
3031             tu6_blend_op(att->alpha_blend_op);
3032          const enum adreno_rb_blend_factor src_alpha_factor =
3033             tu6_blend_factor((VkBlendFactor)att->src_alpha_blend_factor);
3034          const enum adreno_rb_blend_factor dst_alpha_factor =
3035             tu6_blend_factor((VkBlendFactor)att->dst_alpha_blend_factor);
3036 
3037          tu_cs_emit_regs(cs,
3038                          A6XX_RB_MRT_CONTROL(remapped_idx,
3039                                              .blend = att->blend_enable,
3040                                              .blend2 = att->blend_enable,
3041                                              .rop_enable = cb->logic_op_enable,
3042                                              .rop_code = rop,
3043                                              .component_enable = att->write_mask),
3044                          A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,
3045                                                    .rgb_src_factor = src_color_factor,
3046                                                    .rgb_blend_opcode = color_op,
3047                                                    .rgb_dest_factor = dst_color_factor,
3048                                                    .alpha_src_factor = src_alpha_factor,
3049                                                    .alpha_blend_opcode = alpha_op,
3050                                                    .alpha_dest_factor = dst_alpha_factor));
3051       } else {
3052          tu_cs_emit_regs(cs,
3053                          A6XX_RB_MRT_CONTROL(remapped_idx,),
3054                          A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
3055       }
3056    }
3057 }
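/* Remapping example (hypothetical): with color_map = {1, 0}, attachment 0's
 * blend state is written to the MRT 1 registers and, if it blends, bit 1 of
 * blend_enable_mask is set, while attachment 1 lands in the MRT 0 registers.
 * Entries mapped to MESA_VK_ATTACHMENT_UNUSED are skipped entirely.
 */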
3058 
3059 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
3060    MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS,
3061 };
3062 
3063 template <chip CHIP>
3064 static unsigned
3065 tu6_blend_constants_size(struct tu_device *dev,
3066                          const struct vk_color_blend_state *cb)
3067 {
3068    return 5;
3069 }
3070 
3071 template <chip CHIP>
3072 static void
3073 tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb)
3074 {
3075    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3076    tu_cs_emit_array(cs, (const uint32_t *) cb->blend_constants, 4);
3077 }
3078 
3079 static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = {
3080    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3081    MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE,
3082    MESA_VK_DYNAMIC_RS_POLYGON_MODE,
3083    MESA_VK_DYNAMIC_RS_CULL_MODE,
3084    MESA_VK_DYNAMIC_RS_FRONT_FACE,
3085    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE,
3086    MESA_VK_DYNAMIC_RS_LINE_MODE,
3087    MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE,
3088    MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM,
3089    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
3090    MESA_VK_DYNAMIC_RS_LINE_WIDTH,
3091 };
3092 
3093 template <chip CHIP>
3094 uint32_t
3095 tu6_rast_size(struct tu_device *dev,
3096               const struct vk_rasterization_state *rs,
3097               const struct vk_viewport_state *vp,
3098               bool multiview,
3099               bool per_view_viewport)
3100 {
3101    if (CHIP == A6XX) {
3102       return 15 + (dev->physical_device->info->a6xx.has_legacy_pipeline_shading_rate ? 8 : 0);
3103    } else {
3104       return 17;
3105    }
3106 }
3107 
3108 template <chip CHIP>
3109 void
3110 tu6_emit_rast(struct tu_cs *cs,
3111               const struct vk_rasterization_state *rs,
3112               const struct vk_viewport_state *vp,
3113               bool multiview,
3114               bool per_view_viewport)
3115 {
3116    enum a5xx_line_mode line_mode =
3117       rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR ?
3118       BRESENHAM : RECTANGULAR;
3119    tu_cs_emit_regs(cs,
3120                    A6XX_GRAS_SU_CNTL(
3121                      .cull_front = rs->cull_mode & VK_CULL_MODE_FRONT_BIT,
3122                      .cull_back = rs->cull_mode & VK_CULL_MODE_BACK_BIT,
3123                      .front_cw = rs->front_face == VK_FRONT_FACE_CLOCKWISE,
3124                      .linehalfwidth = rs->line.width / 2.0f,
3125                      .poly_offset = rs->depth_bias.enable,
3126                      .line_mode = line_mode,
3127                      .multiview_enable = multiview,
3128                      .rendertargetindexincr = multiview,
3129                      .viewportindexincr = multiview && per_view_viewport));
3130 
3131    bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
3132 
3133    tu_cs_emit_regs(cs,
3134                    A6XX_GRAS_CL_CNTL(
3135                      .znear_clip_disable = !depth_clip_enable,
3136                      .zfar_clip_disable = !depth_clip_enable,
3137                      /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3138                      .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3139                      .zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
3140                      .vp_clip_code_ignore = 1));
3141 
3142    enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
3143 
3144    tu_cs_emit_regs(cs,
3145                    A6XX_VPC_POLYGON_MODE(polygon_mode));
3146 
3147    tu_cs_emit_regs(cs,
3148                    PC_POLYGON_MODE(CHIP, polygon_mode));
3149 
3150    if (CHIP == A7XX) {
3151       tu_cs_emit_regs(cs,
3152                      A7XX_VPC_POLYGON_MODE2(polygon_mode));
3153    }
3154 
3155    tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP,
3156       .stream = rs->rasterization_stream,
3157       .discard = rs->rasterizer_discard_enable));
3158    if (CHIP == A6XX) {
3159       tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107(
3160          .raster_discard = rs->rasterizer_discard_enable));
3161    } else {
3162       tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2(
3163          .stream = rs->rasterization_stream,
3164          .discard = rs->rasterizer_discard_enable));
3165    }
3166 
3167    /* move to hw ctx init? */
3168    tu_cs_emit_regs(cs,
3169                    A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
3170                    A6XX_GRAS_SU_POINT_SIZE(1.0f));
3171 
3172    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_legacy_pipeline_shading_rate) {
3173       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A00());
3174       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A10());
3175       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A20());
3176       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A30());
3177    }
3178 }
3179 
3180 static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = {
3181    MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE,
3182    MESA_VK_DYNAMIC_DS_STENCIL_OP,
3183    MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK,
3184    MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK,
3185    MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE,
3186    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS,
3187 };
3188 
3189 template <chip CHIP>
3190 static unsigned
3191 tu6_ds_size(struct tu_device *dev,
3192                  const struct vk_depth_stencil_state *ds,
3193                  const struct vk_render_pass_state *rp)
3194 {
3195    return 13;
3196 }
3197 
3198 template <chip CHIP>
3199 static void
3200 tu6_emit_ds(struct tu_cs *cs,
3201             const struct vk_depth_stencil_state *ds,
3202             const struct vk_render_pass_state *rp)
3203 {
3204    bool stencil_test_enable =
3205       ds->stencil.test_enable && rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
3206    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3207       .stencil_enable = stencil_test_enable,
3208       .stencil_enable_bf = stencil_test_enable,
3209       .stencil_read = stencil_test_enable,
3210       .func = tu6_compare_func((VkCompareOp)ds->stencil.front.op.compare),
3211       .fail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.fail),
3212       .zpass = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.pass),
3213       .zfail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.depth_fail),
3214       .func_bf = tu6_compare_func((VkCompareOp)ds->stencil.back.op.compare),
3215       .fail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.fail),
3216       .zpass_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.pass),
3217       .zfail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.depth_fail)));
3218    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(stencil_test_enable));
3219 
3220    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(
3221       .mask = ds->stencil.front.compare_mask,
3222       .bfmask = ds->stencil.back.compare_mask));
3223 
3224    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(
3225       .wrmask = ds->stencil.front.write_mask,
3226       .bfwrmask = ds->stencil.back.write_mask));
3227 
3228    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(
3229       .ref = ds->stencil.front.reference,
3230       .bfref = ds->stencil.back.reference));
3231 
3232    tu_cs_emit_regs(cs,
3233                    A6XX_RB_Z_BOUNDS_MIN(ds->depth.bounds_test.min),
3234                    A6XX_RB_Z_BOUNDS_MAX(ds->depth.bounds_test.max));
3235 }
3236 
3237 static const enum mesa_vk_dynamic_graphics_state tu_rb_depth_cntl_state[] = {
3238    MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE,
3239    MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE,
3240    MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP,
3241    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE,
3242    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3243 };
3244 
3245 template <chip CHIP>
3246 static unsigned
3247 tu6_rb_depth_cntl_size(struct tu_device *dev,
3248                        const struct vk_depth_stencil_state *ds,
3249                        const struct vk_render_pass_state *rp,
3250                        const struct vk_rasterization_state *rs)
3251 {
3252    return 4;
3253 }
3254 
3255 template <chip CHIP>
3256 static void
3257 tu6_emit_rb_depth_cntl(struct tu_cs *cs,
3258                        const struct vk_depth_stencil_state *ds,
3259                        const struct vk_render_pass_state *rp,
3260                        const struct vk_rasterization_state *rs)
3261 {
3262    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
3263       bool depth_test = ds->depth.test_enable;
3264       enum adreno_compare_func zfunc = tu6_compare_func(ds->depth.compare_op);
3265 
3266       /* On some GPUs it is necessary to enable z test for depth bounds test
3267        * when UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is
3268        * required to pass z test. Relevant tests:
3269        *  dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
3270        *  dEQP-VK.dynamic_state.ds_state.depth_bounds_1
3271        */
3272       if (ds->depth.bounds_test.enable &&
3273           !ds->depth.test_enable &&
3274           cs->device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk) {
3275          depth_test = true;
3276          zfunc = FUNC_ALWAYS;
3277       }
3278 
3279       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3280          .z_test_enable = depth_test,
3281          .z_write_enable = ds->depth.test_enable && ds->depth.write_enable,
3282          .zfunc = zfunc,
3283          /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3284          .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3285          /* TODO don't set for ALWAYS/NEVER */
3286          .z_read_enable = ds->depth.test_enable || ds->depth.bounds_test.enable,
3287          .z_bounds_enable = ds->depth.bounds_test.enable));
3288       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(depth_test));
3289    } else {
3290       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
3291       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
3292    }
3293 }
3294 
3295 static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = {
3296    MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE,
3297 };
3298 
3299 template <chip CHIP>
3300 static unsigned
3301 tu6_prim_mode_sysmem_size(struct tu_device *dev,
3302                           struct tu_shader *fs,
3303                           bool raster_order_attachment_access,
3304                           VkImageAspectFlags feedback_loops,
3305                           bool *sysmem_single_prim_mode)
3306 {
3307    return 2;
3308 }
3309 
3310 template <chip CHIP>
3311 static void
3312 tu6_emit_prim_mode_sysmem(struct tu_cs *cs,
3313                           struct tu_shader *fs,
3314                           bool raster_order_attachment_access,
3315                           VkImageAspectFlags feedback_loops,
3316                           bool *sysmem_single_prim_mode)
3317 {
3318    /* VK_EXT_rasterization_order_attachment_access:
3319     *
3320     * This extension allows access to framebuffer attachments that are used as
3321     * both input and color attachments, from one fragment to the next, in
3322     * rasterization order, without explicit synchronization.
3323     */
3324    raster_order_attachment_access |= TU_DEBUG(RAST_ORDER);
3325 
3326    /* If there is a feedback loop, then the shader can read the previous value
3327     * of a pixel being written out. It can also write some components and then
3328     * read different components without a barrier in between. This is a
3329     * problem in sysmem mode with UBWC, because the main buffer and flags
3330     * buffer can get out-of-sync if only one is flushed. We fix this by
3331     * setting the SINGLE_PRIM_MODE field to the same value that the blob does
3332     * for advanced_blend in sysmem mode if a feedback loop is detected.
3333     */
3334    enum a6xx_single_prim_mode sysmem_prim_mode =
3335       (raster_order_attachment_access || feedback_loops ||
3336        fs->fs.dynamic_input_attachments_used) ?
3337       FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH;
3338 
3339    if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE)
3340       *sysmem_single_prim_mode = true;
3341 
3342    tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
3343                                          .single_prim_mode = sysmem_prim_mode));
3344 }
3345 
3346 static const enum mesa_vk_dynamic_graphics_state tu_fragment_shading_rate_state[] = {
3347    MESA_VK_DYNAMIC_FSR,
3348 };
3349 
3350 template <chip CHIP>
3351 static unsigned
3352 tu6_fragment_shading_rate_size(struct tu_device *dev,
3353                                const vk_fragment_shading_rate_state *fsr,
3354                                bool enable_att_fsr,
3355                                bool enable_prim_fsr,
3356                                bool fs_reads_fsr,
3357                                bool sample_shading)
3358 {
3359    return 6;
3360 }
3361 
3362 template <chip CHIP>
3363 static void
3364 tu6_emit_fragment_shading_rate(struct tu_cs *cs,
3365                                const vk_fragment_shading_rate_state *fsr,
3366                                bool enable_att_fsr,
3367                                bool enable_prim_fsr,
3368                                bool fs_reads_fsr,
3369                                bool accesses_smask)
3370 {
3371    /* gl_ShadingRateEXT doesn't read back as 1x1 when the config registers are
3372     * left null, so if the shader reads it we have to emit the config anyway.
3373     */
3374    if (!fsr || (!fs_reads_fsr && vk_fragment_shading_rate_is_disabled(fsr))) {
3375       tu_cs_emit_regs(cs, A6XX_RB_FSR_CONFIG());
3376       tu_cs_emit_regs(cs, A7XX_SP_FSR_CONFIG());
3377       tu_cs_emit_regs(cs, A7XX_GRAS_FSR_CONFIG());
3378       return;
3379    }
3380 
3381    uint32_t frag_width = fsr->fragment_size.width;
3382    uint32_t frag_height = fsr->fragment_size.height;
3383 
3384    bool enable_draw_fsr = true;
3385    if (enable_att_fsr) {
3386       if (fsr->combiner_ops[1] ==
3387           VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
3388          enable_draw_fsr = false;
3389          enable_prim_fsr = false;
3390       } else if (fsr->combiner_ops[1] ==
3391                  VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
3392          enable_att_fsr = false;
3393       }
3394    }
3395    if (enable_prim_fsr) {
3396       if (fsr->combiner_ops[0] ==
3397           VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
3398          enable_draw_fsr = false;
3399       } else if (fsr->combiner_ops[0] ==
3400                  VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
3401          enable_prim_fsr = false;
3402       }
3403    }
3404 
3405    /* Force 1x1 FSR because we don't support
3406     * fragmentShadingRateWithShaderSampleMask.
3407     */
3408    if (accesses_smask) {
3409       enable_att_fsr = enable_prim_fsr = false;
3410       frag_width = frag_height = 1;
3411       enable_draw_fsr = true;
3412    }
3413 
3414    tu_cs_emit_regs(
3415       cs,
3416       A6XX_RB_FSR_CONFIG(.unk2 = true, .pipeline_fsr_enable = enable_draw_fsr,
3417                          .attachment_fsr_enable = enable_att_fsr,
3418                          .primitive_fsr_enable = enable_prim_fsr));
3419    tu_cs_emit_regs(
3420       cs, A7XX_SP_FSR_CONFIG(.pipeline_fsr_enable = enable_draw_fsr,
3421                              .attachment_fsr_enable = enable_att_fsr,
3422                              .primitive_fsr_enable = enable_prim_fsr));
3423    tu_cs_emit_regs(
3424       cs, A7XX_GRAS_FSR_CONFIG(
3425                 .pipeline_fsr_enable = enable_draw_fsr,
3426                 .frag_size_x = util_logbase2(frag_width),
3427                 .frag_size_y = util_logbase2(frag_height),
3428                 .combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
3429                 .combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
3430                 .attachment_fsr_enable = enable_att_fsr,
3431                 .primitive_fsr_enable = enable_prim_fsr));
3432 }
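/* Combiner resolution example (hypothetical): with an FSR attachment bound
 * and combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR, the
 * attachment rate is dropped (enable_att_fsr is cleared above) and only the
 * pipeline/draw rate applies; with REPLACE it is the draw and primitive
 * rates that get disabled instead.
 */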
3433 
3434 
3435 static inline bool
3436 emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
3437                     BITSET_WORD *pipeline_set,
3438                     const enum mesa_vk_dynamic_graphics_state *state_array,
3439                     unsigned num_states, bool extra_cond,
3440                     struct tu_pipeline_builder *builder)
3441 {
3442    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3443 
3444    /* Unrolling this loop should produce a constant value once the function is
3445     * inlined, because state_array and num_states are a per-draw-state
3446     * constant, but GCC seems to need a little encouragement. clang does a
3447     * little better but still needs a pragma when there are a large number of
3448     * states.
3449     */
3450 #if defined(__clang__)
3451 #pragma clang loop unroll(full)
3452 #elif defined(__GNUC__) && __GNUC__ >= 8
3453 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3454 #endif
3455    for (unsigned i = 0; i < num_states; i++) {
3456       BITSET_SET(state, state_array[i]);
3457    }
3458 
3459    /* If all of the state is set, then after we emit it we can tentatively
3460     * remove it from the states to set for the pipeline by making it dynamic.
3461     * If we can't emit it, though, we need to keep around the partial state so
3462     * that we can emit it later, even if another draw state consumes it. That
3463     * is, we have to cancel any tentative removal.
3464     */
3465    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3466    memcpy(temp, pipeline_set, sizeof(temp));
3467    BITSET_AND(temp, temp, state);
3468    if (!BITSET_EQUAL(temp, state) || !extra_cond) {
3469       __bitset_or(keep, keep, temp, ARRAY_SIZE(temp));
3470       return false;
3471    }
3472    __bitset_or(remove, remove, state, ARRAY_SIZE(state));
3473    return true;
3474 }
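/* Usage sketch: if the pipeline provides every state in state_array (e.g.
 * all of the CB_* states for the blend group) and extra_cond holds, the group
 * is emitted now and its bits land in "remove" so it can become dynamic. If
 * even one state is missing, as can happen with partial graphics pipeline
 * libraries, the bits that are present go into "keep" so a later group's
 * tentative removal cannot drop them.
 */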
3475 
3476 template <chip CHIP>
3477 static void
3478 tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
3479                                struct tu_pipeline *pipeline)
3480 {
3481    struct tu_cs cs;
3482    BITSET_DECLARE(keep, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3483    BITSET_DECLARE(remove, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3484    BITSET_DECLARE(pipeline_set, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3485 
3486    vk_graphics_pipeline_get_state(&builder->graphics_state, pipeline_set);
3487 
3488 #define EMIT_STATE(name, extra_cond)                                          \
3489    emit_pipeline_state(keep, remove, pipeline_set, tu_##name##_state,         \
3490                        ARRAY_SIZE(tu_##name##_state), extra_cond, builder)
3491 
3492 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3493    if (EMIT_STATE(name, extra_cond)) {                                        \
3494       unsigned size = tu6_##name##_size<CHIP>(builder->device, __VA_ARGS__);  \
3495       if (size > 0) {                                                         \
3496          tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);                    \
3497          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3498          pipeline->dynamic_state[id] =                                        \
3499             tu_cs_end_draw_state(&pipeline->cs, &cs);                         \
3500       }                                                                       \
3501       pipeline->set_state_mask |= (1u << id);                                 \
3502    }
3503 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, true, __VA_ARGS__)
3504 
3505    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3506               builder->graphics_state.vi);
3507    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3508               builder->graphics_state.vi);
3509    /* If (a) per-view viewport is used or (b) we don't know yet, then we need
3510     * to set viewport and scissor state dynamically.
3511     */
3512    bool no_per_view_viewport = pipeline_contains_all_shader_state(pipeline) &&
3513       !pipeline->program.per_view_viewport;
3514    DRAW_STATE_COND(viewport, TU_DYNAMIC_STATE_VIEWPORT, no_per_view_viewport,
3515                    builder->graphics_state.vp,
3516                    builder->graphics_state.rs);
3517    DRAW_STATE_COND(scissor, TU_DYNAMIC_STATE_SCISSOR, no_per_view_viewport,
3518               builder->graphics_state.vp);
3519    DRAW_STATE(sample_locations,
3520               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3521               builder->graphics_state.ms->sample_locations_enable,
3522               builder->graphics_state.ms->sample_locations);
3523    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3524               builder->graphics_state.rs);
3525    bool attachments_valid =
3526       builder->graphics_state.rp &&
3527       vk_render_pass_state_has_attachment_info(builder->graphics_state.rp);
3528    struct vk_color_blend_state dummy_cb = {};
3529    const struct vk_color_blend_state *cb = builder->graphics_state.cb;
3530    if (attachments_valid &&
3531        !(builder->graphics_state.rp->attachments &
3532          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3533       /* If there are no color attachments, then the original blend state may
3534        * be NULL and the common code sanitizes it to always be NULL. In this
3535        * case we want to emit an empty blend/bandwidth/etc.  rather than
3536        * letting it be dynamic (and potentially garbage).
3537        */
3538       cb = &dummy_cb;
3539       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3540       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3541       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3542       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3543       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3544       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3545       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3546       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3547    }
3548    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND, cb,
3549               builder->graphics_state.cal,
3550               builder->graphics_state.ms->alpha_to_coverage_enable,
3551               builder->graphics_state.ms->alpha_to_one_enable,
3552               builder->graphics_state.ms->sample_mask);
3553    if (EMIT_STATE(blend_lrz, attachments_valid))
3554       tu_emit_blend_lrz(&pipeline->lrz_blend, cb,
3555                         builder->graphics_state.rp);
3556    if (EMIT_STATE(bandwidth, attachments_valid))
3557       tu_calc_bandwidth(&pipeline->bandwidth, cb,
3558                         builder->graphics_state.rp);
3559    DRAW_STATE(blend_constants, TU_DYNAMIC_STATE_BLEND_CONSTANTS, cb);
3560 
3561    if (attachments_valid &&
3562        !(builder->graphics_state.rp->attachments &
3563          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3564       /* Don't actually make anything dynamic as that may mean a partially-set
3565        * state group where the group is NULL which angers common code.
3566        */
3567       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3568       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3569       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3570       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3571       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3572       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3573       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3574       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3575    }
3576    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3577                    pipeline_contains_all_shader_state(pipeline),
3578                    builder->graphics_state.rs,
3579                    builder->graphics_state.vp,
3580                    builder->graphics_state.rp->view_mask != 0,
3581                    pipeline->program.per_view_viewport);
3582    DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3583               attachments_valid,
3584               builder->graphics_state.ds,
3585               builder->graphics_state.rp);
3586    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3587                    attachments_valid,
3588                    builder->graphics_state.ds,
3589                    builder->graphics_state.rp,
3590                    builder->graphics_state.rs);
3591    DRAW_STATE_COND(patch_control_points,
3592                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3593                    pipeline_contains_all_shader_state(pipeline),
3594                    pipeline->shaders[MESA_SHADER_VERTEX],
3595                    pipeline->shaders[MESA_SHADER_TESS_CTRL],
3596                    pipeline->shaders[MESA_SHADER_TESS_EVAL],
3597                    &pipeline->program,
3598                    builder->graphics_state.ts->patch_control_points);
3599    bool has_raster_order_state = false;
3600    if (pipeline->type == TU_PIPELINE_GRAPHICS) {
3601       has_raster_order_state = true;
3602    } else {
3603       struct tu_graphics_lib_pipeline *lib =
3604          tu_pipeline_to_graphics_lib(pipeline);
3605       has_raster_order_state =
3606          (lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
3607          (lib->state &
3608           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT);
3609    }
3610    if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3611       DRAW_STATE_COND(prim_mode_sysmem,
3612                       TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3613                       has_raster_order_state,
3614                       pipeline->shaders[MESA_SHADER_FRAGMENT],
3615                       pipeline->output.raster_order_attachment_access ||
3616                       pipeline->ds.raster_order_attachment_access,
3617                       vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags),
3618                       &pipeline->prim_order.sysmem_single_prim_mode);
3619    }
3620 
3621    if (builder->device->physical_device->info->a6xx.has_attachment_shading_rate) {
3622       bool has_fsr_att =
3623          builder->graphics_state.pipeline_flags &
3624          VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
3625       DRAW_STATE_COND(fragment_shading_rate,
3626                       TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
3627                       attachments_valid && pipeline_contains_all_shader_state(pipeline),
3628                       builder->graphics_state.fsr,
3629                       has_fsr_att,
3630                       pipeline->program.writes_shading_rate,
3631                       pipeline->program.reads_shading_rate,
3632                       pipeline->program.accesses_smask);
3633    }
3634 #undef DRAW_STATE
3635 #undef DRAW_STATE_COND
3636 #undef EMIT_STATE
3637 
3638    /* LRZ always needs depth/stencil state at draw time */
3639    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
3640    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
3641    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
3642    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP);
3643    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
3644    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_OP);
3645    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
3646    BITSET_SET(keep, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
3647 
3648    /* MSAA needs line mode */
3649    BITSET_SET(keep, MESA_VK_DYNAMIC_RS_LINE_MODE);
3650 
3651    /* The patch control points is part of the draw */
3652    BITSET_SET(keep, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS);
3653 
3654    /* Vertex buffer state needs to know the max valid binding */
3655    BITSET_SET(keep, MESA_VK_DYNAMIC_VI_BINDINGS_VALID);
3656 
3657    /* Remove state which has been emitted and we no longer need to set when
3658     * binding the pipeline by making it "dynamic".
3659     */
3660    BITSET_ANDNOT(remove, remove, keep);
3661 
3662    BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask, remove);
3663 
3664    BITSET_OR(builder->graphics_state.dynamic, builder->graphics_state.dynamic,
3665              remove);
3666 }
3667 
3668 static inline bool
3669 emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state,
3670                 const enum mesa_vk_dynamic_graphics_state *state_array,
3671                 unsigned num_states)
3672 {
3673    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3674 
3675    /* Unrolling this loop should produce a constant value once the function is
3676     * inlined, because state_array and num_states are a per-draw-state
3677     * constant, but GCC seems to need a little encouragement. clang does a
3678     * little better but still needs a pragma when there are a large number of
3679     * states.
3680     */
3681 #if defined(__clang__)
3682 #pragma clang loop unroll(full)
3683 #elif defined(__GNUC__) && __GNUC__ >= 8
3684 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3685 #endif
3686    for (unsigned i = 0; i < num_states; i++) {
3687       BITSET_SET(state, state_array[i]);
3688    }
3689 
3690    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3691    BITSET_AND(temp, state, dynamic_state->dirty);
3692    return !BITSET_IS_EMPTY(temp);
3693 }
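/* In other words: return true iff any state in state_array is dirty, e.g.
 * the scissor group is re-emitted when either MESA_VK_DYNAMIC_VP_SCISSORS or
 * MESA_VK_DYNAMIC_VP_SCISSOR_COUNT changed since the last draw.
 */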
3694 
3695 template <chip CHIP>
3696 uint32_t
3697 tu_emit_draw_state(struct tu_cmd_buffer *cmd)
3698 {
3699    struct tu_cs cs;
3700    uint32_t dirty_draw_states = 0;
3701 
3702 #define EMIT_STATE(name)                                                      \
3703    emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state,        \
3704                    ARRAY_SIZE(tu_##name##_state))
3705 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3706    if ((EMIT_STATE(name) || (extra_cond)) &&                                  \
3707        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3708       unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);      \
3709       if (size > 0) {                                                         \
3710          tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                     \
3711          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3712          cmd->state.dynamic_state[id] =                                       \
3713             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3714       } else {                                                                \
3715          cmd->state.dynamic_state[id] = {};                                   \
3716       }                                                                       \
3717       dirty_draw_states |= (1u << id);                                        \
3718    }
3719 #define DRAW_STATE_FDM(name, id, ...)                                         \
3720    if ((EMIT_STATE(name) || (cmd->state.dirty & TU_CMD_DIRTY_FDM)) &&         \
3721        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3722       if (cmd->state.shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm) {             \
3723          tu_cs_set_writeable(&cmd->sub_cs, true);                             \
3724          tu6_emit_##name##_fdm(&cs, cmd, __VA_ARGS__);                        \
3725          cmd->state.dynamic_state[id] =                                       \
3726             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3727          tu_cs_set_writeable(&cmd->sub_cs, false);                            \
3728       } else {                                                                \
3729          unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);   \
3730          if (size > 0) {                                                      \
3731             tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                  \
3732             tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                          \
3733             cmd->state.dynamic_state[id] =                                    \
3734                tu_cs_end_draw_state(&cmd->sub_cs, &cs);                       \
3735          } else {                                                             \
3736             cmd->state.dynamic_state[id] = {};                                \
3737          }                                                                    \
3744       }                                                                       \
3745       dirty_draw_states |= (1u << id);                                        \
3746    }
3747 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, false, __VA_ARGS__)
3748 
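   /* Illustrative expansion (written out by hand, not generated) of the first
    * DRAW_STATE() use below, to make the macro plumbing easier to follow.
    * EMIT_STATE() is left symbolic; everything else is exactly what
    * DRAW_STATE_COND() pastes together:
    *
    *    if ((EMIT_STATE(vertex_input) || (false)) &&
    *        !(cmd->state.pipeline_draw_states &
    *          (1u << TU_DYNAMIC_STATE_VERTEX_INPUT))) {
    *       unsigned size = tu6_vertex_input_size<CHIP>(
    *          cmd->device, cmd->vk.dynamic_graphics_state.vi);
    *       if (size > 0) {
    *          tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);
    *          tu6_emit_vertex_input<CHIP>(&cs, cmd->vk.dynamic_graphics_state.vi);
    *          cmd->state.dynamic_state[TU_DYNAMIC_STATE_VERTEX_INPUT] =
    *             tu_cs_end_draw_state(&cmd->sub_cs, &cs);
    *       } else {
    *          cmd->state.dynamic_state[TU_DYNAMIC_STATE_VERTEX_INPUT] = {};
    *       }
    *       dirty_draw_states |= (1u << TU_DYNAMIC_STATE_VERTEX_INPUT);
    *    }
    *
    * DRAW_STATE_FDM() follows the same pattern, except that when the bound
    * fragment shader uses a fragment density map it emits through
    * tu6_emit_*_fdm() into a temporarily writeable sub-stream, presumably so
    * those commands can be patched later.
    */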
3749    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3750               cmd->vk.dynamic_graphics_state.vi);
3751 
3752    /* Vertex input stride is special because it's part of the vertex input in
3753     * the pipeline but a separate array when it's dynamic state, so we have to
3754     * use two separate functions.
3755     */
3756 #define tu6_emit_vertex_stride tu6_emit_vertex_stride_dyn
3757 #define tu6_vertex_stride_size tu6_vertex_stride_size_dyn
3758 
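   /* Note this relies on standard preprocessor behavior: the tokens produced
    * by ##-pasting inside DRAW_STATE() (tu6_emit_vertex_stride and
    * tu6_vertex_stride_size) are rescanned and hit the #defines above, so the
    * invocation below actually calls the *_dyn variants.
    */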
3759    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3760               cmd->vk.dynamic_graphics_state.vi_binding_strides,
3761               cmd->vk.dynamic_graphics_state.vi_bindings_valid);
3762 
3763 #undef tu6_emit_vertex_stride
3764 #undef tu6_vertex_stride_size
3765 
3766    DRAW_STATE_FDM(viewport, TU_DYNAMIC_STATE_VIEWPORT,
3767                   &cmd->vk.dynamic_graphics_state.vp,
3768                   &cmd->vk.dynamic_graphics_state.rs);
3769    DRAW_STATE_FDM(scissor, TU_DYNAMIC_STATE_SCISSOR,
3770                   &cmd->vk.dynamic_graphics_state.vp);
3771    DRAW_STATE(sample_locations,
3772               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3773               cmd->vk.dynamic_graphics_state.ms.sample_locations_enable,
3774               cmd->vk.dynamic_graphics_state.ms.sample_locations);
3775    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3776               &cmd->vk.dynamic_graphics_state.rs);
3777    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND,
3778               &cmd->vk.dynamic_graphics_state.cb,
3779               &cmd->vk.dynamic_graphics_state.cal,
3780               cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
3781               cmd->vk.dynamic_graphics_state.ms.alpha_to_one_enable,
3782               cmd->vk.dynamic_graphics_state.ms.sample_mask);
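   /* Unlike the DRAW_STATE* groups above, the next two checks emit no draw
    * state at all; they only refresh derived CPU-side state when the blend
    * state or subpass changes: whether blending reads the destination (which
    * feeds LRZ validity) and the attachment bandwidth estimate.
    */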
3783    if (EMIT_STATE(blend_lrz) ||
3784        ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3785         !cmd->state.pipeline_blend_lrz)) {
3786       bool blend_reads_dest = tu6_calc_blend_lrz(&cmd->vk.dynamic_graphics_state.cb,
3787                                                  &cmd->state.vk_rp);
3788       if (blend_reads_dest != cmd->state.blend_reads_dest) {
3789          cmd->state.blend_reads_dest = blend_reads_dest;
3790          cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
3791       }
3792    }
3793    if (EMIT_STATE(bandwidth) ||
3794        ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3795         !cmd->state.pipeline_bandwidth))
3796       tu_calc_bandwidth(&cmd->state.bandwidth, &cmd->vk.dynamic_graphics_state.cb,
3797                         &cmd->state.vk_rp);
3798    DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3799               &cmd->vk.dynamic_graphics_state.cb);
3800 
3801    if (cmd->device->physical_device->info->a6xx.has_attachment_shading_rate) {
3802       DRAW_STATE_COND(fragment_shading_rate,
3803                TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
3804                cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS | TU_CMD_DIRTY_SHADING_RATE),
3805                &cmd->vk.dynamic_graphics_state.fsr,
3806                cmd->state.subpass->fsr_attachment != VK_ATTACHMENT_UNUSED,
3807                cmd->state.program.writes_shading_rate,
3808                cmd->state.program.reads_shading_rate,
3809                cmd->state.program.accesses_smask);
3810    }
3811    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3812                    cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS |
3813                                        TU_CMD_DIRTY_PER_VIEW_VIEWPORT),
3814                    &cmd->vk.dynamic_graphics_state.rs,
3815                    &cmd->vk.dynamic_graphics_state.vp,
3816                    cmd->state.vk_rp.view_mask != 0,
3817                    cmd->state.per_view_viewport);
3818    DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3819               cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3820               &cmd->vk.dynamic_graphics_state.ds,
3821               &cmd->state.vk_rp);
3822    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3823                    cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3824                    &cmd->vk.dynamic_graphics_state.ds,
3825                    &cmd->state.vk_rp,
3826                    &cmd->vk.dynamic_graphics_state.rs);
3827    DRAW_STATE_COND(patch_control_points,
3828                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3829                    cmd->state.dirty & TU_CMD_DIRTY_PROGRAM,
3830                    cmd->state.shaders[MESA_SHADER_VERTEX],
3831                    cmd->state.shaders[MESA_SHADER_TESS_CTRL],
3832                    cmd->state.shaders[MESA_SHADER_TESS_EVAL],
3833                    &cmd->state.program,
3834                    cmd->vk.dynamic_graphics_state.ts.patch_control_points);
3835    if (!cmd->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3836       DRAW_STATE_COND(prim_mode_sysmem,
3837                       TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3838                       cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER |
3839                                           TU_CMD_DIRTY_FEEDBACK_LOOPS |
3840                                           TU_CMD_DIRTY_FS),
3841                       cmd->state.shaders[MESA_SHADER_FRAGMENT],
3842                       cmd->state.raster_order_attachment_access,
3843                       cmd->vk.dynamic_graphics_state.feedback_loops |
3844                       cmd->state.pipeline_feedback_loops,
3845                       &cmd->state.rp.sysmem_single_prim_mode);
3846    }
3847 #undef DRAW_STATE
3848 #undef DRAW_STATE_COND
3849 #undef EMIT_STATE
3850 
3851    return dirty_draw_states;
3852 }
3853 TU_GENX(tu_emit_draw_state);
3854 
3855 static void
3856 tu_pipeline_builder_parse_depth_stencil(
3857    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3858 {
3859    const VkPipelineDepthStencilStateCreateInfo *ds_info =
3860       builder->create_info->pDepthStencilState;
3861 
3862    if ((builder->graphics_state.rp->attachments ==
3863         MESA_VK_RP_ATTACHMENT_INFO_INVALID) ||
3864        (builder->graphics_state.rp->attachments &
3865         MESA_VK_RP_ATTACHMENT_DEPTH_BIT)) {
3866       pipeline->ds.raster_order_attachment_access =
3867          ds_info && (ds_info->flags &
3868          (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
3869           VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT));
3870    }
3871 }
3872 
3873 static void
3874 tu_pipeline_builder_parse_multisample_and_color_blend(
3875    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3876 {
3877    /* The spec says:
3878     *
3879     *    pMultisampleState is a pointer to an instance of the
3880     *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
3881     *    has rasterization disabled.
3882     *
3883     * Also,
3884     *
3885     *    pColorBlendState is a pointer to an instance of the
3886     *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
3887     *    pipeline has rasterization disabled or if the subpass of the render
3888     *    pass the pipeline is created against does not use any color
3889     *    attachments.
3890     *
3891     * We leave the relevant registers stale when rasterization is disabled.
3892     */
3893    if (builder->rasterizer_discard) {
3894       return;
3895    }
3896 
3897    static const VkPipelineColorBlendStateCreateInfo dummy_blend_info = {};
3898 
3899    const VkPipelineColorBlendStateCreateInfo *blend_info =
3900       (builder->graphics_state.rp->attachments &
3901        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)
3902       ? builder->create_info->pColorBlendState
3903       : &dummy_blend_info;
3904 
3905    if (builder->graphics_state.rp->attachments &
3906        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS) {
3907       pipeline->output.raster_order_attachment_access =
3908          blend_info && (blend_info->flags &
3909             VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT);
3910    }
3911 }
3912 
3913 static void
3914 tu_pipeline_builder_parse_rasterization_order(
3915    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3916 {
3917    if (builder->rasterizer_discard)
3918       return;
3919 
3920    bool raster_order_attachment_access =
3921       pipeline->output.raster_order_attachment_access ||
3922       pipeline->ds.raster_order_attachment_access ||
3923       TU_DEBUG(RAST_ORDER);
3924 
3925    /* VK_EXT_blend_operation_advanced would also require ordered access
3926     * when implemented in the future.
3927     */
3928 
3929    enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
3930 
3931    if (raster_order_attachment_access) {
3932       /* VK_EXT_rasterization_order_attachment_access:
3933        *
3934        * This extension allows access to framebuffer attachments when used as
3935        * both input and color attachments from one fragment to the next,
3936        * in rasterization order, without explicit synchronization.
3937        */
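      /* Assumption: FLUSH_PER_OVERLAP makes the hardware flush its caches
       * whenever an incoming primitive overlaps one already in flight, so
       * later fragments observe earlier results; the exact hardware behavior
       * isn't documented here.
       */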
3938       gmem_prim_mode = FLUSH_PER_OVERLAP;
3939    }
3940 
3941    struct tu_cs cs;
3942 
3943    pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3944    tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3945                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3946                         A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
3947 }
3948 
3949 static void
3950 tu_pipeline_finish(struct tu_pipeline *pipeline,
3951                    struct tu_device *dev,
3952                    const VkAllocationCallbacks *alloc)
3953 {
3954    tu_cs_finish(&pipeline->cs);
3955    TU_RMV(resource_destroy, dev, &pipeline->bo);
3956 
3957    mtx_lock(&dev->pipeline_mutex);
3958    tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
3959    mtx_unlock(&dev->pipeline_mutex);
3960 
3961    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
3962       struct tu_graphics_lib_pipeline *library =
3963          tu_pipeline_to_graphics_lib(pipeline);
3964 
3965       if (library->nir_shaders)
3966          vk_pipeline_cache_object_unref(&dev->vk,
3967                                         &library->nir_shaders->base);
3968 
3969       for (unsigned i = 0; i < library->num_sets; i++) {
3970          if (library->layouts[i])
3971             vk_descriptor_set_layout_unref(&dev->vk, &library->layouts[i]->vk);
3972       }
3973 
3974       vk_free2(&dev->vk.alloc, alloc, library->state_data);
3975    }
3976 
3977    for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
3978       if (pipeline->shaders[i])
3979          vk_pipeline_cache_object_unref(&dev->vk,
3980                                         &pipeline->shaders[i]->base);
3981    }
3982 
3983    ralloc_free(pipeline->executables_mem_ctx);
3984 }
3985 
3986 static VkGraphicsPipelineLibraryFlagBitsEXT
3987 vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage)
3988 {
3989    assert(util_bitcount(stage) == 1);
3990    switch (stage) {
3991    case VK_SHADER_STAGE_VERTEX_BIT:
3992    case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
3993    case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
3994    case VK_SHADER_STAGE_GEOMETRY_BIT:
3995    case VK_SHADER_STAGE_TASK_BIT_EXT:
3996    case VK_SHADER_STAGE_MESH_BIT_EXT:
3997       return VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT;
3998    case VK_SHADER_STAGE_FRAGMENT_BIT:
3999       return VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
4000    default:
4001       unreachable("Invalid shader stage");
4002    }
4003 }
4004 
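/* Rough flow of a graphics pipeline (or pipeline library) build, as
 * implemented below: allocate the right object type, merge imported
 * libraries, compile and upload shaders unless a library already did, emit
 * program/load state once both the pre-rasterization and fragment-shader
 * halves are present, then either snapshot the vk_graphics_pipeline_state
 * for a library or bake the dynamic state for a complete pipeline.
 */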
4005 template <chip CHIP>
4006 static VkResult
4007 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
4008                           struct tu_pipeline **pipeline)
4009 {
4010    VkResult result;
4011 
4012    if (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
4013       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
4014          &builder->device->vk, builder->alloc,
4015          sizeof(struct tu_graphics_lib_pipeline),
4016          VK_OBJECT_TYPE_PIPELINE);
4017       if (!*pipeline)
4018          return VK_ERROR_OUT_OF_HOST_MEMORY;
4019       (*pipeline)->type = TU_PIPELINE_GRAPHICS_LIB;
4020    } else {
4021       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
4022          &builder->device->vk, builder->alloc,
4023          sizeof(struct tu_graphics_pipeline),
4024          VK_OBJECT_TYPE_PIPELINE);
4025       if (!*pipeline)
4026          return VK_ERROR_OUT_OF_HOST_MEMORY;
4027       (*pipeline)->type = TU_PIPELINE_GRAPHICS;
4028    }
4029 
4030    (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
4031    util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
4032 
4033    tu_pipeline_builder_parse_libraries(builder, *pipeline);
4034 
4035    VkShaderStageFlags stages = 0;
4036    for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
4037       VkShaderStageFlagBits stage = builder->create_info->pStages[i].stage;
4038 
4039       /* Ignore shader stages that don't need to be imported. */
4040       if (!(vk_shader_stage_to_pipeline_library_flags(stage) & builder->state))
4041          continue;
4042 
4043       stages |= stage;
4044    }
4045    builder->active_stages = stages;
4046 
4047    (*pipeline)->active_stages = stages;
4048    for (unsigned i = 0; i < builder->num_libraries; i++)
4049       (*pipeline)->active_stages |= builder->libraries[i]->base.active_stages;
4050 
4051    /* Compile and upload shaders unless a library has already done that. */
4052    if ((*pipeline)->program.vs_state.size == 0) {
4053       tu_pipeline_builder_parse_layout(builder, *pipeline);
4054 
4055       result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
4056       if (result != VK_SUCCESS) {
4057          tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
4058          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
4059          return result;
4060       }
4061    }
4062 
4063    result = tu_pipeline_allocate_cs(builder->device, *pipeline,
4064                                     &builder->layout, builder, NULL);
4065 
4066 
4067    if (set_combined_state(builder, *pipeline,
4068                           VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4069                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
4070       if (result != VK_SUCCESS) {
4071          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
4072          return result;
4073       }
4074 
4075       tu_emit_program_state<CHIP>(&(*pipeline)->cs, &(*pipeline)->program,
4076                                   (*pipeline)->shaders);
4077 
4078       if (CHIP == A6XX) {
4079          /* The blob doesn't preload state on A7XX, presumably because preloading
4080           * either doesn't work there or doesn't provide any benefit.
4081           */
4082          tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
4083       }
4084    }
4085 
4086    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
4087       tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
4088    }
4089 
4090    if (builder->state &
4091        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
4092       tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
4093    }
4094 
4095    if (set_combined_state(builder, *pipeline,
4096                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4097                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
4098       tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
4099    }
4100 
4101    tu_pipeline_builder_emit_state<CHIP>(builder, *pipeline);
4102 
4103    if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) {
4104       struct tu_graphics_lib_pipeline *library =
4105          tu_pipeline_to_graphics_lib(*pipeline);
4106       result = vk_graphics_pipeline_state_copy(&builder->device->vk,
4107                                                &library->graphics_state,
4108                                                &builder->graphics_state,
4109                                                builder->alloc,
4110                                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4111                                                &library->state_data);
4112       if (result != VK_SUCCESS) {
4113          tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
4114          return result;
4115       }
4116    } else {
4117       struct tu_graphics_pipeline *gfx_pipeline =
4118          tu_pipeline_to_graphics(*pipeline);
4119       gfx_pipeline->dynamic_state.ms.sample_locations =
4120          &gfx_pipeline->sample_locations;
4121       vk_dynamic_graphics_state_fill(&gfx_pipeline->dynamic_state,
4122                                      &builder->graphics_state);
4123       gfx_pipeline->feedback_loops =
4124          vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags);
4125       gfx_pipeline->feedback_loop_may_involve_textures =
4126          builder->graphics_state.feedback_loop_not_input_only;
4127    }
4128 
4129    return VK_SUCCESS;
4130 }
4131 
4132 static void
4133 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
4134 {
4135    ralloc_free(builder->mem_ctx);
4136 }
4137 
4138 void
4139 tu_fill_render_pass_state(struct vk_render_pass_state *rp,
4140                           const struct tu_render_pass *pass,
4141                           const struct tu_subpass *subpass)
4142 {
4143    rp->view_mask = subpass->multiview_mask;
4144    rp->color_attachment_count = subpass->color_count;
4145 
4146    const uint32_t a = subpass->depth_stencil_attachment.attachment;
4147    rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
4148    rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
4149    rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
4150    if (a != VK_ATTACHMENT_UNUSED) {
4151       VkFormat ds_format = pass->attachments[a].format;
4152       if (vk_format_has_depth(ds_format) && subpass->depth_used) {
4153          rp->depth_attachment_format = ds_format;
4154          rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
4155       }
4156       if (vk_format_has_stencil(ds_format) && subpass->stencil_used) {
4157          rp->stencil_attachment_format = ds_format;
4158          rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
4159       }
4160    }
4161 
4162    for (uint32_t i = 0; i < subpass->color_count; i++) {
4163       const uint32_t a = subpass->color_attachments[i].attachment;
4164       if (a == VK_ATTACHMENT_UNUSED) {
4165          rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
4166          continue;
4167       }
4168 
4169       rp->color_attachment_formats[i] = pass->attachments[a].format;
4170       rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
4171    }
4172 }
4173 
4174 static void
4175 tu_pipeline_builder_init_graphics(
4176    struct tu_pipeline_builder *builder,
4177    struct tu_device *dev,
4178    struct vk_pipeline_cache *cache,
4179    const VkGraphicsPipelineCreateInfo *create_info,
4180    VkPipelineCreateFlags2KHR flags,
4181    const VkAllocationCallbacks *alloc)
4182 {
4183    *builder = (struct tu_pipeline_builder) {
4184       .device = dev,
4185       .mem_ctx = ralloc_context(NULL),
4186       .cache = cache,
4187       .alloc = alloc,
4188       .create_info = create_info,
4189       .create_flags = flags,
4190    };
4191 
4192    const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
4193       vk_find_struct_const(builder->create_info->pNext,
4194                            GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
4195 
4196    const VkPipelineLibraryCreateInfoKHR *library_info =
4197       vk_find_struct_const(builder->create_info->pNext,
4198                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
4199 
4200    if (gpl_info) {
4201       builder->state = gpl_info->flags;
4202    } else {
4203       /* Implement this bit of spec text:
4204        *
4205        *    If this structure is omitted, and either
4206        *    VkGraphicsPipelineCreateInfo::flags includes
4207        *    VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
4208        *    VkGraphicsPipelineCreateInfo::pNext chain includes a
4209        *    VkPipelineLibraryCreateInfoKHR structure with a libraryCount
4210        *    greater than 0, it is as if flags is 0. Otherwise if this
4211        *    structure is omitted, it is as if flags includes all possible
4212        *    subsets of the graphics pipeline (i.e. a complete graphics
4213        *    pipeline).
4214        */
4215       if ((library_info && library_info->libraryCount > 0) ||
4216           (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR)) {
4217          builder->state = 0;
4218       } else {
4219          builder->state =
4220             VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
4221             VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4222             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4223             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
4224       }
4225    }
4226 
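   /* rasterizerDiscardEnable from pRasterizationState only counts when it is
    * statically known; if VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE is
    * listed, the static value is ignored and we treat discard as disabled
    * here.
    */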
4227    bool rasterizer_discard_dynamic = false;
4228    if (create_info->pDynamicState) {
4229       for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
4230          if (create_info->pDynamicState->pDynamicStates[i] ==
4231                VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
4232             rasterizer_discard_dynamic = true;
4233             break;
4234          }
4235       }
4236    }
4237 
4238    builder->rasterizer_discard =
4239       (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
4240       !rasterizer_discard_dynamic &&
4241       builder->create_info->pRasterizationState->rasterizerDiscardEnable;
4242 
4243    struct vk_render_pass_state rp_state = {};
4244    const struct vk_render_pass_state *driver_rp = NULL;
4245    VkPipelineCreateFlags2KHR rp_flags = 0;
4246 
4247    builder->unscaled_input_fragcoord = 0;
4248 
4249    /* Extract information we need from the turnip renderpass. This will be
4250     * filled out automatically if the app is using dynamic rendering or
4251     * renderpasses are emulated.
4252     */
4253    if (!TU_DEBUG(DYNAMIC) &&
4254        (builder->state &
4255         (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4256          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4257          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
4258        builder->create_info->renderPass) {
4259       const struct tu_render_pass *pass =
4260          tu_render_pass_from_handle(create_info->renderPass);
4261       const struct tu_subpass *subpass =
4262          &pass->subpasses[create_info->subpass];
4263 
4264       tu_fill_render_pass_state(&rp_state, pass, subpass);
4265 
4266       for (unsigned i = 0; i < subpass->input_count; i++) {
4267          /* Input attachments stored in GMEM must be loaded with unscaled
4268           * FragCoord.
4269           */
4270          if (subpass->input_attachments[i].patch_input_gmem)
4271             builder->unscaled_input_fragcoord |= 1u << i;
4272       }
4273 
4274       if (subpass->feedback_loop_color) {
4275          rp_flags |=
4276             VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4277       }
4278 
4279       if (subpass->feedback_loop_ds) {
4280          rp_flags |=
4281             VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4282       }
4283 
4284       if (pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
4285          rp_flags |=
4286             VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT;
4287       }
4288 
4289       if (subpass->fsr_attachment != VK_ATTACHMENT_UNUSED) {
4290          rp_flags |=
4291             VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
4292       }
4293 
4302 
4303       driver_rp = &rp_state;
4304    }
4305 
4306    vk_graphics_pipeline_state_fill(&dev->vk,
4307                                    &builder->graphics_state,
4308                                    builder->create_info,
4309                                    driver_rp,
4310                                    rp_flags,
4311                                    &builder->all_state,
4312                                    NULL, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4313                                    NULL);
4314 
4315    if (builder->graphics_state.rp) {
4316       builder->fragment_density_map = (builder->graphics_state.pipeline_flags &
4317          VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT) ||
4318          TU_DEBUG(FDM);
4319    }
4320 }
4321 
4322 template <chip CHIP>
4323 static VkResult
4324 tu_graphics_pipeline_create(VkDevice device,
4325                             VkPipelineCache pipelineCache,
4326                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
4327                             VkPipelineCreateFlags2KHR flags,
4328                             const VkAllocationCallbacks *pAllocator,
4329                             VkPipeline *pPipeline)
4330 {
4331    VK_FROM_HANDLE(tu_device, dev, device);
4332    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4333 
4334    cache = cache ? cache : dev->mem_cache;
4335 
4336    struct tu_pipeline_builder builder;
4337    tu_pipeline_builder_init_graphics(&builder, dev, cache,
4338                                      pCreateInfo, flags, pAllocator);
4339 
4340    struct tu_pipeline *pipeline = NULL;
4341    VkResult result = tu_pipeline_builder_build<CHIP>(&builder, &pipeline);
4342    tu_pipeline_builder_finish(&builder);
4343 
4344    if (result == VK_SUCCESS) {
4345       TU_RMV(graphics_pipeline_create, dev, tu_pipeline_to_graphics(pipeline));
4346 
4347       *pPipeline = tu_pipeline_to_handle(pipeline);
4348    } else
4349       *pPipeline = VK_NULL_HANDLE;
4350 
4351    return result;
4352 }
4353 
4354 template <chip CHIP>
4355 VKAPI_ATTR VkResult VKAPI_CALL
4356 tu_CreateGraphicsPipelines(VkDevice device,
4357                            VkPipelineCache pipelineCache,
4358                            uint32_t count,
4359                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
4360                            const VkAllocationCallbacks *pAllocator,
4361                            VkPipeline *pPipelines)
4362 {
4363    MESA_TRACE_FUNC();
4364    VkResult final_result = VK_SUCCESS;
4365    uint32_t i = 0;
4366 
4367    for (; i < count; i++) {
4368       VkPipelineCreateFlags2KHR flags =
4369          vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
4370 
4371       VkResult result =
4372          tu_graphics_pipeline_create<CHIP>(device, pipelineCache,
4373                                            &pCreateInfos[i], flags,
4374                                            pAllocator, &pPipelines[i]);
4375 
4376       if (result != VK_SUCCESS) {
4377          final_result = result;
4378          pPipelines[i] = VK_NULL_HANDLE;
4379 
4380          if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4381             break;
4382       }
4383    }
4384 
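   /* Anything we never attempted because of the early-return break above is
    * reported as VK_NULL_HANDLE.
    */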
4385    for (; i < count; i++)
4386       pPipelines[i] = VK_NULL_HANDLE;
4387 
4388    return final_result;
4389 }
4390 TU_GENX(tu_CreateGraphicsPipelines);
4391 
4392 template <chip CHIP>
4393 static VkResult
4394 tu_compute_pipeline_create(VkDevice device,
4395                            VkPipelineCache pipelineCache,
4396                            const VkComputePipelineCreateInfo *pCreateInfo,
4397                            VkPipelineCreateFlags2KHR flags,
4398                            const VkAllocationCallbacks *pAllocator,
4399                            VkPipeline *pPipeline)
4400 {
4401    VK_FROM_HANDLE(tu_device, dev, device);
4402    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4403    VK_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
4404    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
4405    VkResult result;
4406    const struct ir3_shader_variant *v = NULL;
4407 
4408    cache = cache ? cache : dev->mem_cache;
4409 
4410    struct tu_compute_pipeline *pipeline;
4411 
4412    *pPipeline = VK_NULL_HANDLE;
4413 
4414    VkPipelineCreationFeedback pipeline_feedback = {
4415       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4416    };
4417 
4418    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
4419       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
4420 
4421    int64_t pipeline_start = os_time_get_nano();
4422 
4423    pipeline = (struct tu_compute_pipeline *) vk_object_zalloc(
4424       &dev->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE);
4425    if (!pipeline)
4426       return VK_ERROR_OUT_OF_HOST_MEMORY;
4427    pipeline->base.type = TU_PIPELINE_COMPUTE;
4428 
4429    pipeline->base.executables_mem_ctx = ralloc_context(NULL);
4430    util_dynarray_init(&pipeline->base.executables, pipeline->base.executables_mem_ctx);
4431    pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4432 
4433    struct tu_shader_key key = { };
4434    bool allow_varying_subgroup_size =
4435       (stage_info->flags &
4436        VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
4437    bool require_full_subgroups =
4438       stage_info->flags &
4439       VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
4440    const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info =
4441       vk_find_struct_const(stage_info,
4442                            PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
4443    tu_shader_key_subgroup_size(&key, allow_varying_subgroup_size,
4444                                require_full_subgroups, subgroup_info,
4445                                dev);
4446 
4447    struct vk_pipeline_robustness_state rs;
4448    vk_pipeline_robustness_state_fill(&dev->vk, &rs,
4449                                      pCreateInfo->pNext,
4450                                      stage_info->pNext);
4451    tu_shader_key_robustness(&key, &rs);
4452 
4453    void *pipeline_mem_ctx = ralloc_context(NULL);
4454 
4455    unsigned char pipeline_sha1[20];
4456    tu_hash_compute(pipeline_sha1, flags, stage_info, layout, &key);
4457 
4458    struct tu_shader *shader = NULL;
4459 
4460    const bool executable_info = flags &
4461       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
4462 
4463    bool application_cache_hit = false;
4464 
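   /* The cache lookup is skipped on purpose when internal representations
    * were requested: only a fresh compile below produces the NIR and
    * disassembly strings handed back by the pipeline-executable queries.
    */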
4465    if (!executable_info) {
4466       shader =
4467          tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
4468                                   &application_cache_hit);
4469    }
4470 
4471    if (application_cache_hit && cache != dev->mem_cache) {
4472       pipeline_feedback.flags |=
4473          VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4474    }
4475 
4476    char *nir_initial_disasm = NULL;
4477 
4478    if (!shader) {
4479       if (flags &
4480           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
4481          result = VK_PIPELINE_COMPILE_REQUIRED;
4482          goto fail;
4483       }
4484 
4485       struct ir3_shader_key ir3_key = {};
4486 
4487       nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, flags,
4488                                         stage_info, &key, MESA_SHADER_COMPUTE);
4489 
4490       nir_initial_disasm = executable_info ?
4491          nir_shader_as_str(nir, pipeline->base.executables_mem_ctx) : NULL;
4492 
4493       result = tu_shader_create(dev, &shader, nir, &key, &ir3_key,
4494                                 pipeline_sha1, sizeof(pipeline_sha1), layout,
4495                                 executable_info);
4496       if (!shader) {
4497          goto fail;
4498       }
4499 
4500       shader = tu_pipeline_cache_insert(cache, shader);
4501    }
4502 
4503    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4504 
4505    if (creation_feedback) {
4506       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
4507       assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
4508       creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
4509    }
4510 
4511    pipeline->base.active_desc_sets = shader->active_desc_sets;
4512 
4513    v = shader->variant;
4514 
4515    tu_pipeline_set_linkage(&pipeline->base.program.link[MESA_SHADER_COMPUTE],
4516                            &shader->const_state, v);
4517 
4518    result = tu_pipeline_allocate_cs(dev, &pipeline->base, layout, NULL, v);
4519    if (result != VK_SUCCESS)
4520       goto fail;
4521 
4522    for (int i = 0; i < 3; i++)
4523       pipeline->local_size[i] = v->local_size[i];
4524 
4525    if (CHIP == A6XX) {
4526       tu6_emit_load_state(dev, &pipeline->base, layout);
4527    }
4528 
4529    tu_append_executable(&pipeline->base, v, nir_initial_disasm);
4530 
4531    pipeline->instrlen = v->instrlen;
4532 
4533    pipeline->base.shaders[MESA_SHADER_COMPUTE] = shader;
4534 
4535    ralloc_free(pipeline_mem_ctx);
4536 
4537    TU_RMV(compute_pipeline_create, dev, pipeline);
4538 
4539    *pPipeline = tu_pipeline_to_handle(&pipeline->base);
4540 
4541    return VK_SUCCESS;
4542 
4543 fail:
4544    if (shader)
4545       vk_pipeline_cache_object_unref(&dev->vk, &shader->base);
4546 
4547    ralloc_free(pipeline_mem_ctx);
4548 
4549    vk_object_free(&dev->vk, pAllocator, pipeline);
4550 
4551    return result;
4552 }
4553 
4554 template <chip CHIP>
4555 VKAPI_ATTR VkResult VKAPI_CALL
4556 tu_CreateComputePipelines(VkDevice device,
4557                           VkPipelineCache pipelineCache,
4558                           uint32_t count,
4559                           const VkComputePipelineCreateInfo *pCreateInfos,
4560                           const VkAllocationCallbacks *pAllocator,
4561                           VkPipeline *pPipelines)
4562 {
4563    MESA_TRACE_FUNC();
4564    VkResult final_result = VK_SUCCESS;
4565    uint32_t i = 0;
4566 
4567    for (; i < count; i++) {
4568       VkPipelineCreateFlags2KHR flags =
4569          vk_compute_pipeline_create_flags(&pCreateInfos[i]);
4570 
4571       VkResult result =
4572          tu_compute_pipeline_create<CHIP>(device, pipelineCache,
4573                                           &pCreateInfos[i], flags,
4574                                           pAllocator, &pPipelines[i]);
4575       if (result != VK_SUCCESS) {
4576          final_result = result;
4577          pPipelines[i] = VK_NULL_HANDLE;
4578 
4579          if (flags &
4580              VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4581             break;
4582       }
4583    }
4584 
4585    for (; i < count; i++)
4586       pPipelines[i] = VK_NULL_HANDLE;
4587 
4588    return final_result;
4589 }
4590 TU_GENX(tu_CreateComputePipelines);
4591 
4592 VKAPI_ATTR void VKAPI_CALL
4593 tu_DestroyPipeline(VkDevice _device,
4594                    VkPipeline _pipeline,
4595                    const VkAllocationCallbacks *pAllocator)
4596 {
4597    VK_FROM_HANDLE(tu_device, dev, _device);
4598    VK_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
4599 
4600    if (!_pipeline)
4601       return;
4602 
4603    TU_RMV(resource_destroy, dev, pipeline);
4604 
4605    tu_pipeline_finish(pipeline, dev, pAllocator);
4606    vk_object_free(&dev->vk, pAllocator, pipeline);
4607 }
4608 
4609 #define WRITE_STR(field, ...) ({                                \
4610    memset(field, 0, sizeof(field));                             \
4611    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
4612    assert(_i > 0 && _i < sizeof(field));                        \
4613 })
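/* Typical use is WRITE_STR(props->name, "%s", ...): the destination is a
 * fixed-size char array, so the macro zeroes it first and asserts that the
 * formatted string (including the terminating NUL) actually fit.
 */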
4614 
4615 static const struct tu_pipeline_executable *
4616 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
4617 {
4618    assert(index < util_dynarray_num_elements(&pipeline->executables,
4619                                              struct tu_pipeline_executable));
4620    return util_dynarray_element(
4621       &pipeline->executables, struct tu_pipeline_executable, index);
4622 }
4623 
4624 VKAPI_ATTR VkResult VKAPI_CALL
4625 tu_GetPipelineExecutablePropertiesKHR(
4626       VkDevice _device,
4627       const VkPipelineInfoKHR* pPipelineInfo,
4628       uint32_t* pExecutableCount,
4629       VkPipelineExecutablePropertiesKHR* pProperties)
4630 {
4631    VK_FROM_HANDLE(tu_device, dev, _device);
4632    VK_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
4633    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
4634                           pProperties, pExecutableCount);
4635 
4636    util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
4637       vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
4638          gl_shader_stage stage = exe->stage;
4639          props->stages = mesa_to_vk_shader_stage(stage);
4640 
4641          if (!exe->is_binning)
4642             WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
4643          else
4644             WRITE_STR(props->name, "Binning VS");
4645 
4646          WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
4647 
4648          props->subgroupSize =
4649             dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
4650       }
4651    }
4652 
4653    return vk_outarray_status(&out);
4654 }
4655 
4656 VKAPI_ATTR VkResult VKAPI_CALL
4657 tu_GetPipelineExecutableStatisticsKHR(
4658       VkDevice _device,
4659       const VkPipelineExecutableInfoKHR* pExecutableInfo,
4660       uint32_t* pStatisticCount,
4661       VkPipelineExecutableStatisticKHR* pStatistics)
4662 {
4663    VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4664    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
4665                           pStatistics, pStatisticCount);
4666 
4667    const struct tu_pipeline_executable *exe =
4668       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4669 
4670    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4671       WRITE_STR(stat->name, "Max Waves Per Core");
4672       WRITE_STR(stat->description,
4673                 "Maximum number of simultaneous waves per core.");
4674       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4675       stat->value.u64 = exe->stats.max_waves;
4676    }
4677 
4678    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4679       WRITE_STR(stat->name, "Instruction Count");
4680       WRITE_STR(stat->description,
4681                 "Total number of IR3 instructions in the final generated "
4682                 "shader executable.");
4683       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4684       stat->value.u64 = exe->stats.instrs_count;
4685    }
4686 
4687    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4688       WRITE_STR(stat->name, "Code size");
4689       WRITE_STR(stat->description,
4690                 "Total number of dwords in the final generated "
4691                 "shader executable.");
4692       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4693       stat->value.u64 = exe->stats.sizedwords;
4694    }
4695 
4696    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4697       WRITE_STR(stat->name, "NOPs Count");
4698       WRITE_STR(stat->description,
4699                 "Number of NOP instructions in the final generated "
4700                 "shader executable.");
4701       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4702       stat->value.u64 = exe->stats.nops_count;
4703    }
4704 
4705    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4706       WRITE_STR(stat->name, "MOV Count");
4707       WRITE_STR(stat->description,
4708                 "Number of MOV instructions in the final generated "
4709                 "shader executable.");
4710       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4711       stat->value.u64 = exe->stats.mov_count;
4712    }
4713 
4714    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4715       WRITE_STR(stat->name, "COV Count");
4716       WRITE_STR(stat->description,
4717                 "Number of COV instructions in the final generated "
4718                 "shader executable.");
4719       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4720       stat->value.u64 = exe->stats.cov_count;
4721    }
4722 
4723    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4724       WRITE_STR(stat->name, "Registers used");
4725       WRITE_STR(stat->description,
4726                 "Number of registers used in the final generated "
4727                 "shader executable.");
4728       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4729       stat->value.u64 = exe->stats.max_reg + 1;
4730    }
4731 
4732    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4733       WRITE_STR(stat->name, "Half-registers used");
4734       WRITE_STR(stat->description,
4735                 "Number of half-registers used in the final generated "
4736                 "shader executable.");
4737       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4738       stat->value.u64 = exe->stats.max_half_reg + 1;
4739    }
4740 
4741    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4742       WRITE_STR(stat->name, "Last interpolation instruction");
4743       WRITE_STR(stat->description,
4744                 "The instruction where varying storage in Local Memory is released");
4745       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4746       stat->value.u64 = exe->stats.last_baryf;
4747    }
4748 
4749    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4750       WRITE_STR(stat->name, "Last helper instruction");
4751       WRITE_STR(stat->description,
4752                 "The instruction where helper invocations are killed");
4753       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4754       stat->value.u64 = exe->stats.last_helper;
4755    }
4756 
4757    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4758       WRITE_STR(stat->name, "Instructions with SS sync bit");
4759       WRITE_STR(stat->description,
4760                 "SS bit is set for instructions which depend on a result "
4761                 "of \"long\" instructions to prevent RAW hazard.");
4762       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4763       stat->value.u64 = exe->stats.ss;
4764    }
4765 
4766    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4767       WRITE_STR(stat->name, "Instructions with SY sync bit");
4768       WRITE_STR(stat->description,
4769                 "SY bit is set for instructions which depend on a result "
4770                 "of loads from global memory to prevent RAW hazard.");
4771       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4772       stat->value.u64 = exe->stats.sy;
4773    }
4774 
4775    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4776       WRITE_STR(stat->name, "Estimated cycles stalled on SS");
4777       WRITE_STR(stat->description,
4778                 "A better metric to estimate the impact of SS syncs.");
4779       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4780       stat->value.u64 = exe->stats.sstall;
4781    }
4782 
4783    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4784       WRITE_STR(stat->name, "Estimated cycles stalled on SY");
4785       WRITE_STR(stat->description,
4786                 "A better metric to estimate the impact of SY syncs.");
4787       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4788       stat->value.u64 = exe->stats.systall;
4789    }
4790 
4791    for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
4792       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4793          WRITE_STR(stat->name, "cat%d instructions", i);
4794          WRITE_STR(stat->description,
4795                   "Number of cat%d instructions.", i);
4796          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4797          stat->value.u64 = exe->stats.instrs_per_cat[i];
4798       }
4799    }
4800 
4801    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4802       WRITE_STR(stat->name, "STP Count");
4803       WRITE_STR(stat->description,
4804                 "Number of STore Private instructions in the final generated "
4805                 "shader executable.");
4806       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4807       stat->value.u64 = exe->stats.stp_count;
4808    }
4809 
4810    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4811       WRITE_STR(stat->name, "LDP Count");
4812       WRITE_STR(stat->description,
4813                 "Number of LoaD Private instructions in the final generated "
4814                 "shader executable.");
4815       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4816       stat->value.u64 = exe->stats.ldp_count;
4817    }
4818 
4819    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4820       WRITE_STR(stat->name, "Preamble Instruction Count");
4821       WRITE_STR(stat->description,
4822                 "Total number of IR3 instructions in the preamble.");
4823       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4824       stat->value.u64 = exe->stats.preamble_instrs_count;
4825    }
4826 
4827    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4828       WRITE_STR(stat->name, "Early preamble");
4829       WRITE_STR(stat->description,
4830                 "Whether the preamble will be executed early.");
4831       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR;
4832       stat->value.b32 = exe->stats.early_preamble;
4833    }
4834 
4835    return vk_outarray_status(&out);
4836 }
4837 
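/* Helper for the usual Vulkan two-call pattern on
 * VkPipelineExecutableInternalRepresentationKHR: with pData == NULL it just
 * reports the required dataSize, otherwise it copies as much as fits and
 * returns false on truncation so the caller can report VK_INCOMPLETE.
 */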
4838 static bool
4839 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
4840               const char *data)
4841 {
4842    ir->isText = VK_TRUE;
4843 
4844    size_t data_len = strlen(data) + 1;
4845 
4846    if (ir->pData == NULL) {
4847       ir->dataSize = data_len;
4848       return true;
4849    }
4850 
4851    strncpy((char *) ir->pData, data, ir->dataSize);
4852    if (ir->dataSize < data_len)
4853       return false;
4854 
4855    ir->dataSize = data_len;
4856    return true;
4857 }
4858 
4859 VKAPI_ATTR VkResult VKAPI_CALL
4860 tu_GetPipelineExecutableInternalRepresentationsKHR(
4861     VkDevice _device,
4862     const VkPipelineExecutableInfoKHR* pExecutableInfo,
4863     uint32_t* pInternalRepresentationCount,
4864     VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
4865 {
4866    VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4867    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
4868                           pInternalRepresentations, pInternalRepresentationCount);
4869    bool incomplete_text = false;
4870 
4871    const struct tu_pipeline_executable *exe =
4872       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4873 
4874    if (exe->nir_from_spirv) {
4875       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4876          WRITE_STR(ir->name, "NIR from SPIRV");
4877          WRITE_STR(ir->description,
4878                    "Initial NIR before any optimizations");
4879 
4880          if (!write_ir_text(ir, exe->nir_from_spirv))
4881             incomplete_text = true;
4882       }
4883    }
4884 
4885    if (exe->nir_final) {
4886       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4887          WRITE_STR(ir->name, "Final NIR");
4888          WRITE_STR(ir->description,
4889                    "Final NIR before going into the back-end compiler");
4890 
4891          if (!write_ir_text(ir, exe->nir_final))
4892             incomplete_text = true;
4893       }
4894    }
4895 
4896    if (exe->disasm) {
4897       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4898          WRITE_STR(ir->name, "IR3 Assembly");
4899          WRITE_STR(ir->description,
4900                    "Final IR3 assembly for the generated shader binary");
4901 
4902          if (!write_ir_text(ir, exe->disasm))
4903             incomplete_text = true;
4904       }
4905    }
4906 
4907    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
4908 }
4909