1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  * SPDX-License-Identifier: MIT
5  *
6  * based in part on anv driver which is:
7  * Copyright © 2015 Intel Corporation
8  */
9 
10 #include "tu_pipeline.h"
11 
12 #include "common/freedreno_guardband.h"
13 
14 #include "ir3/ir3_nir.h"
15 #include "nir/nir.h"
16 #include "nir/nir_builder.h"
17 #include "nir/nir_serialize.h"
18 #include "spirv/nir_spirv.h"
19 #include "util/u_debug.h"
20 #include "util/mesa-sha1.h"
21 #include "vk_nir.h"
22 #include "vk_pipeline.h"
23 #include "vk_render_pass.h"
24 #include "vk_util.h"
25 
26 #include "tu_cmd_buffer.h"
27 #include "tu_cs.h"
28 #include "tu_device.h"
29 #include "tu_knl.h"
30 #include "tu_formats.h"
31 #include "tu_lrz.h"
32 #include "tu_pass.h"
33 #include "tu_rmv.h"
34 
36 /* Emit an IB that preloads the descriptors that the shader uses */
36 
37 static void
38 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
39                 enum a6xx_state_block sb, unsigned base, unsigned offset,
40                 unsigned count)
41 {
42    /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
43     * clear if emitting more packets will even help anything. Presumably the
44     * descriptor cache is relatively small, and these packets stop doing
45     * anything when there are too many descriptors.
46     */
47    tu_cs_emit_pkt7(cs, opcode, 3);
48    tu_cs_emit(cs,
49               CP_LOAD_STATE6_0_STATE_TYPE(st) |
50               CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
51               CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
52               CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
53    tu_cs_emit_qw(cs, offset | (base << 28));
54 }
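/* With STATE_SRC = SS6_BINDLESS the 64-bit payload above is not a memory
 * address: per the offset | (base << 28) packing, the low bits carry the
 * dword offset of the first descriptor within its set and the descriptor-set
 * (bindless base) index lands in the upper bits of that dword. See the
 * CP_LOAD_STATE6 packet definition for the authoritative field layout.
 */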
55 
56 static unsigned
57 tu6_load_state_size(struct tu_pipeline *pipeline,
58                     struct tu_pipeline_layout *layout)
59 {
60    const unsigned load_state_size = 4;
61    unsigned size = 0;
62    for (unsigned i = 0; i < layout->num_sets; i++) {
63       if (!(pipeline->active_desc_sets & (1u << i)))
64          continue;
65 
66       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
67       for (unsigned j = 0; j < set_layout->binding_count; j++) {
68          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
69          unsigned count = 0;
70          /* See comment in tu6_emit_load_state(). */
71          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
72          unsigned stage_count = util_bitcount(stages);
73 
74          if (!binding->array_size)
75             continue;
76 
77          switch (binding->type) {
78          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
79          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
80          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
81          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
82          case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR:
83             /* IBO-backed resources only need one packet for all graphics stages */
84             if (stage_count)
85                count += 1;
86             break;
87          case VK_DESCRIPTOR_TYPE_SAMPLER:
88          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
89          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
90          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
91          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
92             /* Textures and UBOs need a packet for each stage */
93             count = stage_count;
94             break;
95          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
96             /* Because of how we pack combined images and samplers, we
97              * currently can't use one packet for the whole array.
98              */
99             count = stage_count * binding->array_size * 2;
100             break;
101          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
102          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
103          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
104             break;
105          default:
106             unreachable("bad descriptor type");
107          }
108          size += count * load_state_size;
109       }
110    }
111    return size;
112 }
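/* Rough example of the sizing above: a UBO binding with array_size 1 visible
 * to both VS and FS needs stage_count = 2 packets (8 dwords), while a
 * storage buffer with the same visibility needs only one packet (4 dwords)
 * since IBO-style descriptors are loaded once for all graphics stages.
 * load_state_size = 4 matches the 4 dwords emitted per packet in
 * emit_load_state() (a pkt7 header plus 3 payload dwords).
 */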
113 
114 static void
115 tu6_emit_load_state(struct tu_device *device,
116                     struct tu_pipeline *pipeline,
117                     struct tu_pipeline_layout *layout)
118 {
119    unsigned size = tu6_load_state_size(pipeline, layout);
120    if (size == 0)
121       return;
122 
123    struct tu_cs cs;
124    tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
125 
126    for (unsigned i = 0; i < layout->num_sets; i++) {
127       /* From 13.2.7. Descriptor Set Binding:
128        *
129        *    A compatible descriptor set must be bound for all set numbers that
130        *    any shaders in a pipeline access, at the time that a draw or
131        *    dispatch command is recorded to execute using that pipeline.
132        *    However, if none of the shaders in a pipeline statically use any
133        *    bindings with a particular set number, then no descriptor set need
134        *    be bound for that set number, even if the pipeline layout includes
135        *    a non-trivial descriptor set layout for that set number.
136        *
137        * This means that descriptor sets unused by the pipeline may have a
138        * garbage or 0 BINDLESS_BASE register, which will cause context faults
139        * when prefetching descriptors from these sets. Skip prefetching for
140        * descriptors from them to avoid this. This is also an optimization,
141        * since these prefetches would be useless.
142        */
143       if (!(pipeline->active_desc_sets & (1u << i)))
144          continue;
145 
146       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
147       for (unsigned j = 0; j < set_layout->binding_count; j++) {
148          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
149          unsigned base = i;
150          unsigned offset = binding->offset / 4;
151          /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
152           * zink has descriptors for each stage in the push layout even if some
153           * stages aren't present in a used pipeline.  We don't want to emit
154           * loads for unused descriptors.
155           */
156          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
157          unsigned count = binding->array_size;
158 
159          /* If this is a variable-count descriptor, then the array_size is an
160           * upper bound on the size, but we don't know how many descriptors
161           * will actually be used. Therefore we can't pre-load them here.
162           */
163          if (j == set_layout->binding_count - 1 &&
164              set_layout->has_variable_descriptors)
165             continue;
166 
167          if (count == 0 || stages == 0)
168             continue;
169          switch (binding->type) {
170          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
171             assert(device->physical_device->reserved_set_idx >= 0);
172             base = device->physical_device->reserved_set_idx;
173             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
174                       binding->dynamic_offset_offset) / 4;
175             FALLTHROUGH;
176          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
177          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
178          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
179          case VK_DESCRIPTOR_TYPE_ACCELERATION_STRUCTURE_KHR: {
180             unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
181             /* IBO-backed resources only need one packet for all graphics stages */
182             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
183                emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
184                                base, offset, count * mul);
185             }
186             if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
187                emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
188                                base, offset, count * mul);
189             }
190             break;
191          }
192          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
193          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
194          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
195             /* nothing - input attachments and inline uniforms don't use bindless */
196             break;
197          case VK_DESCRIPTOR_TYPE_SAMPLER:
198          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
199          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
200             tu_foreach_stage(stage, stages) {
201                emit_load_state(&cs, tu6_stage2opcode(stage),
202                                binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
203                                ST6_SHADER : ST6_CONSTANTS,
204                                tu6_stage2texsb(stage), base, offset, count);
205             }
206             break;
207          }
208          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
209             assert(device->physical_device->reserved_set_idx >= 0);
210             base = device->physical_device->reserved_set_idx;
211             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
212                       binding->dynamic_offset_offset) / 4;
213             FALLTHROUGH;
214          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
215             tu_foreach_stage(stage, stages) {
216                emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
217                                tu6_stage2shadersb(stage), base, offset, count);
218             }
219             break;
220          }
221          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
222             tu_foreach_stage(stage, stages) {
223                /* TODO: We could emit less CP_LOAD_STATE6 if we used
224                 * struct-of-arrays instead of array-of-structs.
225                 */
226                for (unsigned i = 0; i < count; i++) {
227                   unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
228                   unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
229                   emit_load_state(&cs, tu6_stage2opcode(stage),
230                                   ST6_CONSTANTS, tu6_stage2texsb(stage),
231                                   base, tex_offset, 1);
232                   emit_load_state(&cs, tu6_stage2opcode(stage),
233                                   ST6_SHADER, tu6_stage2texsb(stage),
234                                   base, sam_offset, 1);
235                }
236             }
237             break;
238          }
239          default:
240             unreachable("bad descriptor type");
241          }
242       }
243    }
244 
245    pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
246 }
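/* The draw state built here only encodes set-relative offsets; it is
 * expected to be emitted by the command buffer at bind time, once the
 * BINDLESS_BASE registers for the actually-bound descriptor sets are in
 * place, which is also why unused sets are skipped above.
 */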
247 
248 struct tu_pipeline_builder
249 {
250    struct tu_device *device;
251    void *mem_ctx;
252    struct vk_pipeline_cache *cache;
253    const VkAllocationCallbacks *alloc;
254    const VkGraphicsPipelineCreateInfo *create_info;
255    VkPipelineCreateFlags2KHR create_flags;
256 
257    struct tu_pipeline_layout layout;
258 
259    struct tu_pvtmem_config pvtmem;
260 
261    bool rasterizer_discard;
262    /* these states are affected by rasterizer_discard */
263    uint8_t unscaled_input_fragcoord;
264 
265    /* Each library defines at least one piece of state in
266     * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
267     * there can be at most as many libraries as pieces of state, of which
268     * there are currently 4.
269     */
270 #define MAX_LIBRARIES 4
271 
272    unsigned num_libraries;
273    struct tu_graphics_lib_pipeline *libraries[MAX_LIBRARIES];
274 
275    /* This is just the state that we are compiling now, whereas the final
276     * pipeline will include the state from the libraries.
277     */
278    VkGraphicsPipelineLibraryFlagsEXT state;
279 
280    /* The stages we are compiling now. */
281    VkShaderStageFlags active_stages;
282 
283    bool fragment_density_map;
284 
285    struct vk_graphics_pipeline_all_state all_state;
286    struct vk_graphics_pipeline_state graphics_state;
287 };
288 
289 static bool
290 tu_logic_op_reads_dst(VkLogicOp op)
291 {
292    switch (op) {
293    case VK_LOGIC_OP_CLEAR:
294    case VK_LOGIC_OP_COPY:
295    case VK_LOGIC_OP_COPY_INVERTED:
296    case VK_LOGIC_OP_SET:
297       return false;
298    default:
299       return true;
300    }
301 }
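/* CLEAR, COPY, COPY_INVERTED and SET are exactly the logic ops whose result
 * is independent of the existing framebuffer value (all zeros, s, ~s and all
 * ones, respectively), so the destination never has to be read for them.
 */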
302 
303 static bool
304 tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb)
305 {
306    for (unsigned i = 0; i < cb->attachment_count; i++) {
307       if (tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_color_blend_factor) ||
308           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_color_blend_factor) ||
309           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_alpha_blend_factor) ||
310           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_alpha_blend_factor))
311          return true;
312    }
313 
314    return false;
315 }
316 
317 enum ir3_push_consts_type
318 tu_push_consts_type(const struct tu_pipeline_layout *layout,
319                     const struct ir3_compiler *compiler)
320 {
321    if (!layout->push_constant_size)
322       return IR3_PUSH_CONSTS_NONE;
323 
324    if (TU_DEBUG(PUSH_CONSTS_PER_STAGE))
325       return IR3_PUSH_CONSTS_PER_STAGE;
326 
327    if (tu6_shared_constants_enable(layout, compiler)) {
328       return IR3_PUSH_CONSTS_SHARED;
329    } else {
330       if (compiler->gen >= 7) {
331          return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
332       } else {
333          return IR3_PUSH_CONSTS_PER_STAGE;
334       }
335    }
336 }
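/* To summarize the selection above: no push constants means NONE; the
 * TU_DEBUG(PUSH_CONSTS_PER_STAGE) option forces the per-stage path;
 * otherwise shared constants are used when tu6_shared_constants_enable()
 * allows it, and if not, gen >= 7 loads push constants via the shared-consts
 * preamble while older GPUs fall back to per-stage constants.
 */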
337 
338 template <chip CHIP>
339 struct xs_config {
340    uint16_t reg_sp_xs_config;
341    uint16_t reg_hlsq_xs_ctrl;
342 };
343 
344 template <chip CHIP>
345 static const xs_config<CHIP> xs_configs[] = {
346    [MESA_SHADER_VERTEX] = {
347       REG_A6XX_SP_VS_CONFIG,
348       CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
349    },
350    [MESA_SHADER_TESS_CTRL] = {
351       REG_A6XX_SP_HS_CONFIG,
352       CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
353    },
354    [MESA_SHADER_TESS_EVAL] = {
355       REG_A6XX_SP_DS_CONFIG,
356       CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
357    },
358    [MESA_SHADER_GEOMETRY] = {
359       REG_A6XX_SP_GS_CONFIG,
360       CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
361    },
362    [MESA_SHADER_FRAGMENT] = {
363       REG_A6XX_SP_FS_CONFIG,
364       CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
365    },
366    [MESA_SHADER_COMPUTE] = {
367       REG_A6XX_SP_CS_CONFIG,
368       CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
369    },
370 };
371 
372 template <chip CHIP>
373 void
374 tu6_emit_xs_config(struct tu_cs *cs,
375                    gl_shader_stage stage, /* xs->type, but xs may be NULL */
376                    const struct ir3_shader_variant *xs)
377 {
378    const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[stage];
379 
380    if (!xs) {
381       /* shader stage disabled */
382       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
383       tu_cs_emit(cs, 0);
384 
385       tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
386       tu_cs_emit(cs, 0);
387       return;
388    }
389 
390    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
391    tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
392                   COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
393                   COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
394                   COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
395                   COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
396                   A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
397                   A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
398 
399    tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
400    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
401                      A6XX_HLSQ_VS_CNTL_ENABLED |
402                      COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE,
403                           A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS));
404 }
405 TU_GENX(tu6_emit_xs_config);
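/* The A6XX_SP_VS_CONFIG_* and A6XX_HLSQ_VS_CNTL_* builders are reused for
 * every stage above; this works because the per-stage CONFIG/CNTL registers
 * share the same bitfield layout and only their offsets (looked up through
 * xs_configs) differ between stages and generations.
 */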
406 
407 static void
408 tu6_emit_dynamic_offset(struct tu_cs *cs,
409                         const struct ir3_shader_variant *xs,
410                         const struct tu_shader *shader,
411                         const struct tu_program_state *program)
412 {
413    const struct tu_physical_device *phys_dev = cs->device->physical_device;
414 
415    if (!xs)
416       return;
417 
418    if (cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
419       if (shader->const_state.dynamic_offsets_ubo.size == 0)
420          return;
421 
422       uint32_t offsets[MAX_SETS];
423       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
424          unsigned dynamic_offset_start =
425             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
426          offsets[i] = dynamic_offset_start;
427       }
428 
429       /* A7XX TODO: Emit data via sub_cs instead of NOP */
430       uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
431       uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
432 
433       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
434       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
435                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
436                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
437                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
438                CP_LOAD_STATE6_0_NUM_UNIT(1));
439       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
440       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
441       int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
442       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
443    } else {
444       if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
445          return;
446 
447       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
448       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
449                CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
450                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
451                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
452                CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
453       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
454       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
455 
456       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
457          unsigned dynamic_offset_start =
458             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
459          tu_cs_emit(cs, dynamic_offset_start);
460       }
461    }
462 }
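/* Two paths are used above to hand the per-set dynamic-descriptor start
 * offsets to the shader: on GPUs that load shader consts via the preamble
 * they are written into the command stream (as CP_NOP payload for now, per
 * the TODO) and bound as a small driver-internal UBO, while older GPUs load
 * the same values directly as immediate constants at dynamic_offset_loc.
 */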
463 
464 template <chip CHIP>
465 void
466 tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
467 {
468    if (CHIP == A6XX) {
469       /* Enable/disable shared constants */
470       tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
471    } else {
472       assert(!enable);
473    }
474 
475    tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
476                                             .isammode = ISAMMODE_GL,
477                                             .shared_consts_enable = enable));
478 }
479 TU_GENX(tu6_emit_shared_consts_enable);
480 
481 template <chip CHIP>
482 static void
483 tu6_setup_streamout(struct tu_cs *cs,
484                     const struct ir3_shader_variant *v,
485                     const struct ir3_shader_linkage *l)
486 {
487    const struct ir3_stream_output_info *info = &v->stream_output;
488    /* Note: 64 here comes from the HW layout of the program RAM. The program
489     * for stream N is at DWORD 64 * N.
490     */
491 #define A6XX_SO_PROG_DWORDS 64
492    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
493    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
494 
495    /* TODO: streamout state should be in a non-GMEM draw state */
496 
497    /* no streamout: */
498    if (info->num_outputs == 0) {
499       unsigned sizedw = 4;
500       if (cs->device->physical_device->info->a6xx.tess_use_shared)
501          sizedw += 2;
502 
503       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, sizedw);
504       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
505       tu_cs_emit(cs, 0);
506       tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
507       tu_cs_emit(cs, 0);
508 
509       if (cs->device->physical_device->info->a6xx.tess_use_shared) {
510          tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
511          tu_cs_emit(cs, 0);
512       }
513 
514       return;
515    }
516 
517    for (unsigned i = 0; i < info->num_outputs; i++) {
518       const struct ir3_stream_output *out = &info->output[i];
519       unsigned k = out->register_index;
520       unsigned idx;
521 
522       /* Skip it if it's an output that was never assigned a register. */
523       if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
524          continue;
525 
526       /* The linkage map is sorted in the order the fragment shader wants
527        * things, so this lookup is a bit less ideal here..
528        */
529       for (idx = 0; idx < l->cnt; idx++)
530          if (l->var[idx].slot == v->outputs[k].slot)
531             break;
532 
533       assert(idx < l->cnt);
534 
535       for (unsigned j = 0; j < out->num_components; j++) {
536          unsigned c   = j + out->start_component;
537          unsigned loc = l->var[idx].loc + c;
538          unsigned off = j + out->dst_offset;  /* in dwords */
539 
540          assert(loc < A6XX_SO_PROG_DWORDS * 2);
541          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
542          if (loc & 1) {
543             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
544                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
545                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
546          } else {
547             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
548                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
549                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
550          }
551          BITSET_SET(valid_dwords, dword);
552       }
553    }
554 
555    unsigned prog_count = 0;
556    unsigned start, end;
557    BITSET_FOREACH_RANGE(start, end, valid_dwords,
558                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
559       prog_count += end - start + 1;
560    }
561 
562    const bool emit_pc_so_stream_cntl =
563       cs->device->physical_device->info->a6xx.tess_use_shared &&
564       v->type == MESA_SHADER_TESS_EVAL;
565 
566    if (emit_pc_so_stream_cntl)
567       prog_count += 1;
568 
569    tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
570    tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
571    tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
572                   COND(info->stride[0] > 0,
573                        A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
574                   COND(info->stride[1] > 0,
575                        A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
576                   COND(info->stride[2] > 0,
577                        A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
578                   COND(info->stride[3] > 0,
579                        A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
580    for (uint32_t i = 0; i < 4; i++) {
581       tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
582       tu_cs_emit(cs, info->stride[i]);
583    }
584    bool first = true;
585    BITSET_FOREACH_RANGE(start, end, valid_dwords,
586                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
587       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
588       tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
589                      A6XX_VPC_SO_CNTL_ADDR(start));
590       for (unsigned i = start; i < end; i++) {
591          tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
592          tu_cs_emit(cs, prog[i]);
593       }
594       first = false;
595    }
596 
597    if (emit_pc_so_stream_cntl) {
598       /* Possibly not tess_use_shared related, but the combination of
599        * tess + xfb fails some tests if we don't emit this.
600        */
601       tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
602       tu_cs_emit(cs, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written));
603    }
604 }
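/* Notes on the SO program written above: each VPC_SO_PROG dword describes up
 * to two output components (the A and B halves), giving the SO buffer and
 * byte offset within a vertex for each, and the per-stream programs live at
 * fixed A6XX_SO_PROG_DWORDS (64-dword) strides in the program RAM. Only the
 * dword ranges that were actually populated get uploaded, with VPC_SO_CNTL
 * repositioning the write address between ranges.
 */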
605 
606 enum tu_geom_consts_type
607 {
608    TU_CONSTS_PRIMITIVE_MAP,
609    TU_CONSTS_PRIMITIVE_PARAM,
610 };
611 
612 static void
613 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
614                const struct ir3_const_state *const_state,
615                unsigned constlen, enum a6xx_state_block block,
616                uint32_t offset, uint32_t size, const uint32_t *dwords) {
617    assert(size % 4 == 0);
618    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
619 
620    if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
621       uint32_t base;
622       switch (type) {
623       case TU_CONSTS_PRIMITIVE_MAP:
624          base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_MAP].offset_vec4;
625          break;
626       case TU_CONSTS_PRIMITIVE_PARAM:
627          base = const_state->allocs.consts[IR3_CONST_ALLOC_PRIMITIVE_PARAM].offset_vec4;
628          break;
629       default:
630          unreachable("bad consts type");
631       }
632 
633       int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
634       if (adjusted_size <= 0)
635          return;
636 
637       tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
638       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
639             CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
640             CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
641             CP_LOAD_STATE6_0_STATE_BLOCK(block) |
642             CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
643 
644       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
645       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
646 
647       tu_cs_emit_array(cs, dwords, adjusted_size);
648    } else {
649       uint32_t base;
650       switch (type) {
651       case TU_CONSTS_PRIMITIVE_MAP:
652          base = const_state->primitive_map_ubo.idx;
653          break;
654       case TU_CONSTS_PRIMITIVE_PARAM:
655          base = const_state->primitive_param_ubo.idx;
656          break;
657       default:
658          unreachable("bad consts type");
659       }
660       if (base == -1)
661          return;
662 
663       /* A7XX TODO: Emit data via sub_cs instead of NOP */
664       uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
665 
666       tu_cs_emit_pkt7(cs, opcode, 5);
667       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
668                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
669                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
670                CP_LOAD_STATE6_0_STATE_BLOCK(block) |
671                CP_LOAD_STATE6_0_NUM_UNIT(1));
672       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
673       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
674       int size_vec4s = DIV_ROUND_UP(size, 4);
675       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
676    }
677 }
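/* As with the dynamic offsets, two encodings exist: the legacy path uploads
 * the data as immediate constants into the const file (clamped against
 * constlen in case the shader does not use the tail of the range), while the
 * preamble path embeds the data in the command stream and points a
 * driver-internal UBO (primitive_map_ubo / primitive_param_ubo) at it.
 */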
678 
679 static void
680 tu6_emit_link_map(struct tu_cs *cs,
681                   const struct ir3_shader_variant *producer,
682                   const struct ir3_shader_variant *consumer,
683                   enum a6xx_state_block sb)
684 {
685    const struct ir3_const_state *const_state = ir3_const_state(consumer);
686    uint32_t size = ALIGN(consumer->input_size, 4);
687 
688    if (size == 0)
689       return;
690 
691    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
692                   const_state, consumer->constlen, sb, 0, size, producer->output_loc);
693 }
694 
695 static int
696 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
697                      const struct ir3_shader_variant *last_shader,
698                      uint32_t index,
699                      uint8_t *interp_mode,
700                      uint8_t *ps_repl_mode)
701 {
702    const uint32_t compmask = fs->inputs[index].compmask;
703 
704    /* NOTE: varyings are packed, so if compmask is 0xb then the first, second,
705     * and fourth components occupy three consecutive varying slots
706     */
707    int shift = 0;
708    *interp_mode = 0;
709    *ps_repl_mode = 0;
710    if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
711       if (compmask & 0x1) {
712          *ps_repl_mode |= PS_REPL_S << shift;
713          shift += 2;
714       }
715       if (compmask & 0x2) {
716          *ps_repl_mode |= PS_REPL_T << shift;
717          shift += 2;
718       }
719       if (compmask & 0x4) {
720          *interp_mode |= INTERP_ZERO << shift;
721          shift += 2;
722       }
723       if (compmask & 0x8) {
724          *interp_mode |= INTERP_ONE << 6;
725          shift += 2;
726       }
727    } else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
728               fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
729       /* If the last geometry shader doesn't statically write these, they're
730        * implicitly zero and the FS is supposed to read zero.
731        */
732       const gl_varying_slot slot = (gl_varying_slot) fs->inputs[index].slot;
733       if (ir3_find_output(last_shader, slot) < 0 &&
734           (compmask & 0x1)) {
735          *interp_mode |= INTERP_ZERO;
736       } else {
737          *interp_mode |= INTERP_FLAT;
738       }
739    } else if (fs->inputs[index].flat) {
740       for (int i = 0; i < 4; i++) {
741          if (compmask & (1 << i)) {
742             *interp_mode |= INTERP_FLAT << shift;
743             shift += 2;
744          }
745       }
746    }
747 
748    return util_bitcount(compmask) * 2;
749 }
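/* interp_mode and ps_repl_mode are packed two bits per enabled component,
 * which is why this returns util_bitcount(compmask) * 2: e.g. a compmask of
 * 0xb (three components) consumes six bits in the VARYING_INTERP_MODE /
 * VARYING_PS_REPL_MODE arrays filled in below.
 */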
750 
751 static void
752 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
753                            const struct ir3_shader_variant *fs,
754                            const struct ir3_shader_variant *last_shader)
755 {
756    uint32_t interp_modes[8] = { 0 };
757    uint32_t ps_repl_modes[8] = { 0 };
758    uint32_t interp_regs = 0;
759 
760    if (fs) {
761       for (int i = -1;
762            (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
763 
764          /* get the mode for input i */
765          uint8_t interp_mode;
766          uint8_t ps_repl_mode;
767          const int bits =
768             tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
769 
770          /* OR the mode into the array */
771          const uint32_t inloc = fs->inputs[i].inloc * 2;
772          uint32_t n = inloc / 32;
773          uint32_t shift = inloc % 32;
774          interp_modes[n] |= interp_mode << shift;
775          ps_repl_modes[n] |= ps_repl_mode << shift;
776          if (shift + bits > 32) {
777             n++;
778             shift = 32 - shift;
779 
780             interp_modes[n] |= interp_mode >> shift;
781             ps_repl_modes[n] |= ps_repl_mode >> shift;
782          }
783          interp_regs = MAX2(interp_regs, n + 1);
784       }
785    }
786 
787    if (interp_regs) {
788       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), interp_regs);
789       tu_cs_emit_array(cs, interp_modes, interp_regs);
790 
791       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), interp_regs);
792       tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
793    }
794 }
795 
796 template <chip CHIP>
797 void
798 tu6_emit_vpc(struct tu_cs *cs,
799              const struct ir3_shader_variant *vs,
800              const struct ir3_shader_variant *hs,
801              const struct ir3_shader_variant *ds,
802              const struct ir3_shader_variant *gs,
803              const struct ir3_shader_variant *fs)
804 {
805    /* note: doesn't compile as static because of the array regs.. */
806    const struct reg_config {
807       uint16_t reg_sp_xs_out_reg;
808       uint16_t reg_sp_xs_vpc_dst_reg;
809       uint16_t reg_vpc_xs_pack;
810       uint16_t reg_vpc_xs_clip_cntl;
811       uint16_t reg_vpc_xs_clip_cntl_v2;
812       uint16_t reg_gras_xs_cl_cntl;
813       uint16_t reg_pc_xs_out_cntl;
814       uint16_t reg_sp_xs_primitive_cntl;
815       uint16_t reg_vpc_xs_layer_cntl;
816       uint16_t reg_vpc_xs_layer_cntl_v2;
817       uint16_t reg_gras_xs_layer_cntl;
818    } reg_config[] = {
819       [MESA_SHADER_VERTEX] = {
820          REG_A6XX_SP_VS_OUT_REG(0),
821          REG_A6XX_SP_VS_VPC_DST_REG(0),
822          REG_A6XX_VPC_VS_PACK,
823          REG_A6XX_VPC_VS_CLIP_CNTL,
824          REG_A6XX_VPC_VS_CLIP_CNTL_V2,
825          REG_A6XX_GRAS_VS_CL_CNTL,
826          REG_A6XX_PC_VS_OUT_CNTL,
827          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
828          REG_A6XX_VPC_VS_LAYER_CNTL,
829          REG_A6XX_VPC_VS_LAYER_CNTL_V2,
830          REG_A6XX_GRAS_VS_LAYER_CNTL
831       },
832       [MESA_SHADER_TESS_CTRL] = {
833          0,
834          0,
835          0,
836          0,
837          0,
838          0,
839          REG_A6XX_PC_HS_OUT_CNTL,
840          0,
841          0,
842          0
843       },
844       [MESA_SHADER_TESS_EVAL] = {
845          REG_A6XX_SP_DS_OUT_REG(0),
846          REG_A6XX_SP_DS_VPC_DST_REG(0),
847          REG_A6XX_VPC_DS_PACK,
848          REG_A6XX_VPC_DS_CLIP_CNTL,
849          REG_A6XX_VPC_DS_CLIP_CNTL_V2,
850          REG_A6XX_GRAS_DS_CL_CNTL,
851          REG_A6XX_PC_DS_OUT_CNTL,
852          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
853          REG_A6XX_VPC_DS_LAYER_CNTL,
854          REG_A6XX_VPC_DS_LAYER_CNTL_V2,
855          REG_A6XX_GRAS_DS_LAYER_CNTL
856       },
857       [MESA_SHADER_GEOMETRY] = {
858          REG_A6XX_SP_GS_OUT_REG(0),
859          REG_A6XX_SP_GS_VPC_DST_REG(0),
860          REG_A6XX_VPC_GS_PACK,
861          REG_A6XX_VPC_GS_CLIP_CNTL,
862          REG_A6XX_VPC_GS_CLIP_CNTL_V2,
863          REG_A6XX_GRAS_GS_CL_CNTL,
864          REG_A6XX_PC_GS_OUT_CNTL,
865          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
866          REG_A6XX_VPC_GS_LAYER_CNTL,
867          REG_A6XX_VPC_GS_LAYER_CNTL_V2,
868          REG_A6XX_GRAS_GS_LAYER_CNTL
869       },
870    };
871 
872    const struct ir3_shader_variant *last_shader;
873    if (gs) {
874       last_shader = gs;
875    } else if (hs) {
876       last_shader = ds;
877    } else {
878       last_shader = vs;
879    }
880 
881    const struct reg_config *cfg = &reg_config[last_shader->type];
882 
883    struct ir3_shader_linkage linkage = {
884       .primid_loc = 0xff,
885       .clip0_loc = 0xff,
886       .clip1_loc = 0xff,
887    };
888    if (fs)
889       ir3_link_shaders(&linkage, last_shader, fs, true);
890 
891    if (last_shader->stream_output.num_outputs)
892       ir3_link_stream_out(&linkage, last_shader);
893 
894    /* a6xx finds position/pointsize at the end */
895    const uint32_t pointsize_regid =
896       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
897    const uint32_t layer_regid =
898       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
899    const uint32_t view_regid =
900       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
901    const uint32_t clip0_regid =
902       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
903    const uint32_t clip1_regid =
904       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
905    uint32_t flags_regid = gs ?
906       ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
907    const uint32_t shading_rate_regid =
908       ir3_find_output_regid(last_shader, VARYING_SLOT_PRIMITIVE_SHADING_RATE);
909 
910    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
911    uint32_t shading_rate_loc = 0xff;
912 
913    if (layer_regid != regid(63, 0)) {
914       layer_loc = linkage.max_loc;
915       ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
916    }
917 
918    if (view_regid != regid(63, 0)) {
919       view_loc = linkage.max_loc;
920       ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
921    }
922 
923    if (shading_rate_regid != regid(63, 0)) {
924       shading_rate_loc = linkage.max_loc;
925       ir3_link_add(&linkage, VARYING_SLOT_PRIMITIVE_SHADING_RATE,
926                    shading_rate_regid, 0x1, linkage.max_loc);
927    }
928 
929    unsigned extra_pos = 0;
930 
931    for (unsigned i = 0; i < last_shader->outputs_count; i++) {
932       if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
933          continue;
934 
935       if (position_loc == 0xff)
936          position_loc = linkage.max_loc;
937 
938       ir3_link_add(&linkage, last_shader->outputs[i].slot,
939                    last_shader->outputs[i].regid,
940                    0xf, position_loc + 4 * last_shader->outputs[i].view);
941       extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
942    }
943 
944    if (pointsize_regid != regid(63, 0)) {
945       pointsize_loc = linkage.max_loc;
946       ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
947    }
948 
949    uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
950 
951    /* Handle the case where clip/cull distances aren't read by the FS */
952    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
953    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
954       clip0_loc = linkage.max_loc;
955       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
956                    clip_cull_mask & 0xf, linkage.max_loc);
957    }
958    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
959       clip1_loc = linkage.max_loc;
960       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
961                    clip_cull_mask >> 4, linkage.max_loc);
962    }
963 
964    tu6_setup_streamout<CHIP>(cs, last_shader, &linkage);
965 
966    /* There is a hardware bug on a750 where STRIDE_IN_VPC of 5 to 8 in GS with
967     * an input primitive type with adjacency, an output primitive type of
968     * points, and a high enough vertex count causes a hang.
969     */
970    if (cs->device->physical_device->info->a7xx.gs_vpc_adjacency_quirk &&
971        gs && gs->gs.output_primitive == MESA_PRIM_POINTS &&
972        linkage.max_loc > 4) {
973       linkage.max_loc = MAX2(linkage.max_loc, 9);
974    }
975 
976    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
977     * at least when a DS is the last stage, so add a dummy output to keep it
978     * happy if there aren't any. We do this late in order to avoid emitting
979     * any unused code and make sure that optimizations don't remove it.
980     */
981    if (linkage.cnt == 0)
982       ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
983 
984    /* map outputs of the last shader to VPC */
985    assert(linkage.cnt <= 32);
986    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
987    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
988    uint32_t sp_out[16] = {0};
989    uint32_t sp_vpc_dst[8] = {0};
990    for (uint32_t i = 0; i < linkage.cnt; i++) {
991       ((uint16_t *) sp_out)[i] =
992          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
993          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
994       ((uint8_t *) sp_vpc_dst)[i] =
995          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
996    }
997 
998    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
999    tu_cs_emit_array(cs, sp_out, sp_out_count);
1000 
1001    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
1002    tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
1003 
1004    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
1005    tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
1006                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
1007                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
1008                   A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
1009 
1010    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
1011    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1012                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1013                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1014    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl_v2, 1);
1015    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
1016                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
1017                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
1018 
1019    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
1020    tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
1021                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1022 
1023    const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1024 
1025    for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1026       const struct ir3_shader_variant *shader = geom_shaders[i];
1027       if (!shader)
1028          continue;
1029 
1030       bool primid = shader->type != MESA_SHADER_VERTEX &&
1031          VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1032 
1033       tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1034       if (shader == last_shader) {
1035          tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1036                         CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1037                         CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1038                         CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1039                         COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1040                         A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask) |
1041                         CONDREG(shading_rate_regid, A6XX_PC_VS_OUT_CNTL_SHADINGRATE));
1042       } else {
1043          tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1044       }
1045    }
1046 
1047    /* if vertex_flags somehow gets optimized out, you're going to have a bad time: */
1048    if (gs)
1049       assert(flags_regid != INVALID_REG);
1050 
1051    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1052    tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1053                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1054 
1055    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1056    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1057                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1058                   A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(shading_rate_loc));
1059    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl_v2, 1);
1060    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1061                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1062                   A6XX_VPC_VS_LAYER_CNTL_SHADINGRATELOC(shading_rate_loc));
1063 
1064    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1065    tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1066                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1067 
1068    tu6_emit_vpc_varying_modes(cs, fs, last_shader);
1069 }
1070 TU_GENX(tu6_emit_vpc);
1071 
1072 static void
1073 tu6_emit_vs_params(struct tu_cs *cs,
1074                    const struct ir3_const_state *const_state,
1075                    unsigned constlen,
1076                    unsigned param_stride,
1077                    unsigned num_vertices)
1078 {
1079    uint32_t vs_params[4] = {
1080       param_stride * num_vertices * 4,  /* vs primitive stride */
1081       param_stride * 4,                 /* vs vertex stride */
1082       0,
1083       0,
1084    };
1085    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1086                   const_state, constlen, SB6_VS_SHADER, 0,
1087                   ARRAY_SIZE(vs_params), vs_params);
1088 }
1089 
1090 static void
1091 tu_get_tess_iova(struct tu_device *dev,
1092                  uint64_t *tess_factor_iova,
1093                  uint64_t *tess_param_iova)
1094 {
1095    /* Create the shared tess factor BO the first time tess is used on the device. */
1096    if (!dev->tess_bo) {
1097       mtx_lock(&dev->mutex);
1098       if (!dev->tess_bo) {
1099          tu_bo_init_new(dev, NULL, &dev->tess_bo, TU_TESS_BO_SIZE,
1100                         TU_BO_ALLOC_INTERNAL_RESOURCE, "tess");
1101       }
1102       mtx_unlock(&dev->mutex);
1103    }
1104 
1105    *tess_factor_iova = dev->tess_bo->iova;
1106    *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
1107 }
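/* Double-checked creation of the shared tess BO: the unlocked test keeps the
 * common case (BO already created) off dev->mutex, and the second check
 * under the lock ensures only one thread actually allocates it.
 */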
1108 
1109 static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
1110    MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
1111 };
1112 
1113 #define HS_PARAMS_SIZE 8
1114 
1115 template <chip CHIP>
1116 static unsigned
1117 tu6_patch_control_points_size(struct tu_device *dev,
1118                               const struct tu_shader *vs,
1119                               const struct tu_shader *tcs,
1120                               const struct tu_shader *tes,
1121                               const struct tu_program_state *program,
1122                               uint32_t patch_control_points)
1123 {
1124    if (dev->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1125 #define EMIT_CONST_DWORDS(const_dwords) (6 + const_dwords + 4)
1126       return EMIT_CONST_DWORDS(4) +
1127          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1128 #undef EMIT_CONST_DWORDS
1129    } else {
1130 #define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
1131       return EMIT_CONST_DWORDS(4) +
1132          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1133 #undef EMIT_CONST_DWORDS
1134    }
1135 }
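/* The EMIT_CONST_DWORDS() accounting mirrors the two tu6_emit_const() paths:
 * the preamble/UBO path costs a 6-dword CP_LOAD_STATE6 plus the data
 * embedded behind a CP_NOP (the extra 4 dwords presumably covering the NOP
 * header and alignment), while the legacy path is a 4-dword header followed
 * directly by the payload. The trailing "+ 2 + 2 + 2" covers the three small
 * packets emitted at the end of tu6_emit_patch_control_points() below
 * (PC_HS_INPUT_SIZE, SP_HS_WAVE_INPUT_SIZE and CP_SET_SUBDRAW_SIZE).
 */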
1136 
1137 template <chip CHIP>
1138 void
1139 tu6_emit_patch_control_points(struct tu_cs *cs,
1140                               const struct tu_shader *vs,
1141                               const struct tu_shader *tcs,
1142                               const struct tu_shader *tes,
1143                               const struct tu_program_state *program,
1144                               uint32_t patch_control_points)
1145 {
1146    if (!tcs->variant)
1147       return;
1148 
1149    struct tu_device *dev = cs->device;
1150 
1151    tu6_emit_vs_params(cs,
1152                       &program->link[MESA_SHADER_VERTEX].const_state,
1153                       program->link[MESA_SHADER_VERTEX].constlen,
1154                       vs->variant->output_size,
1155                       patch_control_points);
1156 
1157    uint64_t tess_factor_iova, tess_param_iova;
1158    tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1159 
1160    uint32_t hs_params[HS_PARAMS_SIZE] = {
1161       vs->variant->output_size * patch_control_points * 4,  /* hs primitive stride */
1162       vs->variant->output_size * 4,                         /* hs vertex stride */
1163       tcs->variant->output_size,
1164       patch_control_points,
1165       tess_param_iova,
1166       tess_param_iova >> 32,
1167       tess_factor_iova,
1168       tess_factor_iova >> 32,
1169    };
1170 
1171    const struct ir3_const_state *hs_const =
1172       &program->link[MESA_SHADER_TESS_CTRL].const_state;
1173    unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
1174    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1175                   hs_const, hs_constlen, SB6_HS_SHADER, 0,
1176                   ARRAY_SIZE(hs_params), hs_params);
1177 
1178    uint32_t patch_local_mem_size_16b =
1179       patch_control_points * vs->variant->output_size / 4;
1180 
1181    /* Total attribute slots in HS incoming patch. */
1182    tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1183    tu_cs_emit(cs, patch_local_mem_size_16b);
1184 
1185    const uint32_t wavesize = 64;
1186    const uint32_t vs_hs_local_mem_size = 16384;
1187 
1188    uint32_t max_patches_per_wave;
1189    if (dev->physical_device->info->a6xx.tess_use_shared) {
1190       /* HS invocations for a patch are always within the same wave,
1191        * making barriers less expensive. VS can't have barriers so we
1192        * don't care about VS invocations being in the same wave.
1193        */
1194       max_patches_per_wave = wavesize / tcs->variant->tess.tcs_vertices_out;
1195    } else {
1196       /* VS is also in the same wave */
1197       max_patches_per_wave =
1198          wavesize / MAX2(patch_control_points,
1199                          tcs->variant->tess.tcs_vertices_out);
1200    }
1201 
1202    uint32_t patches_per_wave =
1203       MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
1204            max_patches_per_wave);
1205 
1206    uint32_t wave_input_size = DIV_ROUND_UP(
1207       patches_per_wave * patch_local_mem_size_16b * 16, 256);
1208 
1209    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1210    tu_cs_emit(cs, wave_input_size);
1211 
1212    /* maximum number of patches that can fit in tess factor/param buffers */
1213    uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
1214                         TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
1215    /* convert from # of patches to draw count */
1216    subdraw_size *= patch_control_points;
1217 
1218    tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
1219    tu_cs_emit(cs, subdraw_size);
1220 }
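/* CP_SET_SUBDRAW_SIZE appears to bound how many vertices are processed per
 * subdraw so that the fixed-size tess factor/param buffers are not overrun;
 * since those buffers are budgeted in patches, the patch count computed
 * above is multiplied by patch_control_points to express the limit as a
 * vertex count. wave_input_size just above is in 256-byte granules, which is
 * what the DIV_ROUND_UP(..., 256) accounts for.
 */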
1221 
1222 static void
1223 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1224                           const struct ir3_shader_variant *vs,
1225                           const struct ir3_shader_variant *hs,
1226                           const struct ir3_shader_variant *ds,
1227                           const struct ir3_shader_variant *gs)
1228 {
1229    struct tu_device *dev = cs->device;
1230 
1231    if (gs && !hs) {
1232       tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
1233                          vs->output_size, gs->gs.vertices_in);
1234    }
1235 
1236    if (hs) {
1237       uint64_t tess_factor_iova, tess_param_iova;
1238       tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1239 
1240       uint32_t ds_params[8] = {
1241          gs ? ds->output_size * gs->gs.vertices_in * 4 : 0,  /* ds primitive stride */
1242          ds->output_size * 4,                                /* ds vertex stride */
1243          hs->output_size,                                    /* hs vertex stride (dwords) */
1244          hs->tess.tcs_vertices_out,
1245          tess_param_iova,
1246          tess_param_iova >> 32,
1247          tess_factor_iova,
1248          tess_factor_iova >> 32,
1249       };
1250 
1251       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1252                      ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
1253                      ARRAY_SIZE(ds_params), ds_params);
1254    }
1255 
1256    if (gs) {
1257       const struct ir3_shader_variant *prev = ds ? ds : vs;
1258       uint32_t gs_params[4] = {
1259          prev->output_size * gs->gs.vertices_in * 4,  /* gs primitive stride */
1260          prev->output_size * 4,                 /* gs vertex stride */
1261          0,
1262          0,
1263       };
1264       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1265                      gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
1266                      ARRAY_SIZE(gs_params), gs_params);
1267    }
1268 }
1269 
1270 template <chip CHIP>
1271 static void
1272 tu6_emit_program_config(struct tu_cs *cs,
1273                         const struct tu_program_state *prog,
1274                         struct tu_shader **shaders,
1275                         const struct ir3_shader_variant **variants)
1276 {
1277    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1278 
1279    bool shared_consts_enable =
1280       prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
1281    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1282 
1283    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1284          .vs_state = true,
1285          .hs_state = true,
1286          .ds_state = true,
1287          .gs_state = true,
1288          .fs_state = true,
1289          .gfx_ibo = true,
1290          .gfx_shared_const = shared_consts_enable));
1291    for (size_t stage_idx = MESA_SHADER_VERTEX;
1292         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1293       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1294       tu6_emit_xs_config<CHIP>(cs, stage, variants[stage]);
1295    }
1296 
1297    for (size_t stage_idx = MESA_SHADER_VERTEX;
1298         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1299       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1300       tu6_emit_dynamic_offset(cs, variants[stage], shaders[stage], prog);
1301    }
1302 
1303    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
1304    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
1305    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
1306    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
1307 
1308    if (hs) {
1309       tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1310       tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1311    }
1312 
1313    if (gs) {
1314       if (hs) {
1315          tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1316       } else {
1317          tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1318       }
1319 
1320       uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1321 
1322       if (CHIP == A6XX) {
1323          /* Size of per-primitive allocation in ldlw memory in vec4s. */
1324          uint32_t vec4_size = gs->gs.vertices_in *
1325                               DIV_ROUND_UP(prev_stage_output_size, 4);
1326 
1327          tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1328          tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1329       }
1330 
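      /* Clamp the value programmed into SP_GS_PRIM_SIZE: sizes above 64
       * dwords are capped at 64 and exactly 64 is encoded as 63, presumably
       * to stay within the register field's limit.
       */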
1331       uint32_t prim_size = prev_stage_output_size;
1332       if (prim_size > 64)
1333          prim_size = 64;
1334       else if (prim_size == 64)
1335          prim_size = 63;
1336       tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1337       tu_cs_emit(cs, prim_size);
1338    }
1339 
1340    if (gs || hs) {
1341       tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
1342    }
1343 }
1344 
1345 static bool
1346 contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
1347 {
1348    return (state &
1349       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1350        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
1351       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1352        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
1353 }
1354 
1355 static bool
1356 pipeline_contains_all_shader_state(struct tu_pipeline *pipeline)
1357 {
1358    return pipeline->type == TU_PIPELINE_GRAPHICS ||
1359       pipeline->type == TU_PIPELINE_COMPUTE ||
1360       contains_all_shader_state(tu_pipeline_to_graphics_lib(pipeline)->state);
1361 }
1362 
1363 /* Return true if this pipeline contains all of the GPL stages listed but none
1364  * of the libraries it uses do, so this is "the first time" that all of them
1365  * are defined together. This is useful for state that needs to be combined
1366  * from multiple GPL stages.
1367  */
1368 
1369 static bool
1370 set_combined_state(struct tu_pipeline_builder *builder,
1371                    struct tu_pipeline *pipeline,
1372                    VkGraphicsPipelineLibraryFlagsEXT state)
1373 {
1374    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB &&
1375        (tu_pipeline_to_graphics_lib(pipeline)->state & state) != state)
1376       return false;
1377 
1378    for (unsigned i = 0; i < builder->num_libraries; i++) {
1379       if ((builder->libraries[i]->state & state) == state)
1380          return false;
1381    }
1382 
1383    return true;
1384 }
1385 
1386 #define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
1387 
1388 static VkResult
1389 tu_pipeline_allocate_cs(struct tu_device *dev,
1390                         struct tu_pipeline *pipeline,
1391                         struct tu_pipeline_layout *layout,
1392                         struct tu_pipeline_builder *builder,
1393                         const struct ir3_shader_variant *compute)
1394 {
1395    uint32_t size = 1024;
1396 
1397    /* graphics case: */
1398    if (builder) {
1399       if (builder->state &
1400           VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
1401          size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
1402       }
1403 
1404       if (set_combined_state(builder, pipeline,
1405                              VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1406                              VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
1407          size += tu6_load_state_size(pipeline, layout);
1408       }
1409    } else {
1410       size += tu6_load_state_size(pipeline, layout);
1411    }
1412 
1413    /* Allocate the space for the pipeline out of the device's RO suballocator.
1414     *
1415     * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
1416     * BOs at exec time.
1417     *
1418     * The pipeline cache would seem like a natural place to stick the
1419     * suballocator, except that it is not guaranteed to outlive the pipelines
1420     * created from it, so you can't store any long-lived state there, and you
1421     * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
1422     * pipeline destroy isn't synchronized by the cache.
1423     */
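   /* Note that size was accumulated in dwords, so the suballocation below is
    * size * 4 bytes; 128 appears to be the required alignment in bytes.
    */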
1424    mtx_lock(&dev->pipeline_mutex);
1425    VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
1426                                           size * 4, 128);
1427    mtx_unlock(&dev->pipeline_mutex);
1428    if (result != VK_SUCCESS)
1429       return result;
1430 
1431    TU_RMV(cmd_buffer_suballoc_bo_create, dev, &pipeline->bo);
1432    tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
1433 
1434    return VK_SUCCESS;
1435 }
1436 
1437 static void
1438 tu_append_executable(struct tu_pipeline *pipeline,
1439                      const struct ir3_shader_variant *variant,
1440                      char *nir_from_spirv)
1441 {
1442    struct tu_pipeline_executable exe = {
1443       .stage = variant->type,
1444       .stats = variant->info,
1445       .is_binning = variant->binning_pass,
1446       .nir_from_spirv = nir_from_spirv,
1447       .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
1448       .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
1449    };
1450 
1451    util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
1452 }
1453 
1454 static void
1455 tu_hash_stage(struct mesa_sha1 *ctx,
1456               VkPipelineCreateFlags2KHR pipeline_flags,
1457               const VkPipelineShaderStageCreateInfo *stage,
1458               const nir_shader *nir,
1459               const struct tu_shader_key *key)
1460 {
1461 
1462    if (nir) {
1463       struct blob blob;
1464       blob_init(&blob);
1465       nir_serialize(&blob, nir, true);
1466       _mesa_sha1_update(ctx, blob.data, blob.size);
1467       blob_finish(&blob);
1468    } else {
1469       unsigned char stage_hash[SHA1_DIGEST_LENGTH];
1470       vk_pipeline_hash_shader_stage(pipeline_flags, stage, NULL, stage_hash);
1471       _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
1472    }
1473    _mesa_sha1_update(ctx, key, sizeof(*key));
1474 }
1475 
1476 static void
1477 tu_hash_shaders(unsigned char *hash,
1478                 VkPipelineCreateFlags2KHR pipeline_flags,
1479                 const VkPipelineShaderStageCreateInfo **stages,
1480                 nir_shader *const *nir,
1481                 const struct tu_pipeline_layout *layout,
1482                 const struct tu_shader_key *keys,
1483                 VkGraphicsPipelineLibraryFlagsEXT state)
1484 {
1485    struct mesa_sha1 ctx;
1486 
1487    _mesa_sha1_init(&ctx);
1488 
1489    if (layout)
1490       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1491 
1492    for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
1493       if (stages[i] || nir[i]) {
1494          tu_hash_stage(&ctx, pipeline_flags, stages[i], nir[i], &keys[i]);
1495       }
1496    }
1497    _mesa_sha1_update(&ctx, &state, sizeof(state));
1498    enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
1499    _mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
1500    _mesa_sha1_final(&ctx, hash);
1501 }
1502 
1503 static void
1504 tu_hash_compute(unsigned char *hash,
1505                 VkPipelineCreateFlags2KHR pipeline_flags,
1506                 const VkPipelineShaderStageCreateInfo *stage,
1507                 const struct tu_pipeline_layout *layout,
1508                 const struct tu_shader_key *key)
1509 {
1510    struct mesa_sha1 ctx;
1511 
1512    _mesa_sha1_init(&ctx);
1513 
1514    if (layout)
1515       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1516 
1517    tu_hash_stage(&ctx, pipeline_flags, stage, NULL, key);
1518    enum ir3_shader_debug ir3_debug_key = ir3_shader_debug_hash_key();
1519    _mesa_sha1_update(&ctx, &ir3_debug_key, sizeof(ir3_debug_key));
1520 
1521    _mesa_sha1_final(&ctx, hash);
1522 }
1523 
1524 static struct tu_shader *
1525 tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
1526                          const void *key_data, size_t key_size,
1527                          bool *application_cache_hit)
1528 {
1529    struct vk_pipeline_cache_object *object =
1530       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1531                                       &tu_shader_ops, application_cache_hit);
1532    if (object)
1533       return container_of(object, struct tu_shader, base);
1534    else
1535       return NULL;
1536 }
1537 
1538 static struct tu_shader *
1539 tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
1540                          struct tu_shader *shader)
1541 {
1542    struct vk_pipeline_cache_object *object =
1543       vk_pipeline_cache_add_object(cache, &shader->base);
1544    return container_of(object, struct tu_shader, base);
1545 }
1546 
1547 static bool
1548 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1549                          struct blob *blob);
1550 
1551 static struct vk_pipeline_cache_object *
1552 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1553                            const void *key_data,
1554                            size_t key_size,
1555                            struct blob_reader *blob);
1556 
1557 static void
1558 tu_nir_shaders_destroy(struct vk_device *device,
1559                        struct vk_pipeline_cache_object *object)
1560 {
1561    struct tu_nir_shaders *shaders =
1562       container_of(object, struct tu_nir_shaders, base);
1563 
1564    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
1565       ralloc_free(shaders->nir[i]);
1566 
1567    vk_pipeline_cache_object_finish(&shaders->base);
1568    vk_free(&device->alloc, shaders);
1569 }
1570 
1571 const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
1572    .serialize = tu_nir_shaders_serialize,
1573    .deserialize = tu_nir_shaders_deserialize,
1574    .destroy = tu_nir_shaders_destroy,
1575 };
1576 
1577 static struct tu_nir_shaders *
1578 tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
1579 {
1580    VK_MULTIALLOC(ma);
1581    VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
1582    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
1583 
1584    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
1585                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
1586       return NULL;
1587 
1588    memcpy(obj_key_data, key_data, key_size);
1589    vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
1590                                  &tu_nir_shaders_ops, obj_key_data, key_size);
1591 
1592    return shaders;
1593 }
1594 
1595 static bool
1596 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1597                          struct blob *blob)
1598 {
1599    struct tu_nir_shaders *shaders =
1600       container_of(object, struct tu_nir_shaders, base);
1601 
1602    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1603       if (shaders->nir[i]) {
1604          blob_write_uint8(blob, 1);
1605          nir_serialize(blob, shaders->nir[i], true);
1606       } else {
1607          blob_write_uint8(blob, 0);
1608       }
1609    }
1610 
1611    return true;
1612 }
1613 
1614 static struct vk_pipeline_cache_object *
1615 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1616                            const void *key_data,
1617                            size_t key_size,
1618                            struct blob_reader *blob)
1619 {
1620    struct tu_device *dev =
1621       container_of(cache->base.device, struct tu_device, vk);
1622    struct tu_nir_shaders *shaders =
1623       tu_nir_shaders_init(dev, key_data, key_size);
1624 
1625    if (!shaders)
1626       return NULL;
1627 
1628    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1629       if (blob_read_uint8(blob)) {
1630          shaders->nir[i] =
1631             nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
1632       }
1633    }
1634 
1635    return &shaders->base;
1636 }
1637 
1638 static struct tu_nir_shaders *
1639 tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
1640                     const void *key_data, size_t key_size,
1641                     bool *application_cache_hit)
1642 {
1643    struct vk_pipeline_cache_object *object =
1644       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1645                                       &tu_nir_shaders_ops, application_cache_hit);
1646    if (object)
1647       return container_of(object, struct tu_nir_shaders, base);
1648    else
1649       return NULL;
1650 }
1651 
1652 static struct tu_nir_shaders *
1653 tu_nir_cache_insert(struct vk_pipeline_cache *cache,
1654                     struct tu_nir_shaders *shaders)
1655 {
1656    struct vk_pipeline_cache_object *object =
1657       vk_pipeline_cache_add_object(cache, &shaders->base);
1658    return container_of(object, struct tu_nir_shaders, base);
1659 }
1660 
1661 static VkResult
1662 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
1663                                     struct tu_pipeline *pipeline)
1664 {
1665    VkResult result = VK_SUCCESS;
1666    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
1667       NULL
1668    };
1669    VkPipelineCreationFeedback pipeline_feedback = {
1670       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
1671    };
1672    VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
1673 
1674    const bool executable_info =
1675       builder->create_flags &
1676       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
1677 
1678    bool retain_nir =
1679       builder->create_flags &
1680       VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
1681 
1682    int64_t pipeline_start = os_time_get_nano();
1683 
1684    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
1685       vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
1686 
1687    bool must_compile = false;
1688    for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
1689       if (!(builder->active_stages & builder->create_info->pStages[i].stage))
1690          continue;
1691 
1692       gl_shader_stage stage =
1693          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
1694       stage_infos[stage] = &builder->create_info->pStages[i];
1695       must_compile = true;
1696    }
1697 
1698    /* Forward declare everything due to the goto usage */
1699    nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
1700    struct tu_shader *shaders[ARRAY_SIZE(stage_infos)] = { NULL };
1701    nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
1702    char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
1703    bool cache_hit = false;
1704 
1705    struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
1706    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1707         stage < ARRAY_SIZE(keys); stage = (gl_shader_stage) (stage+1)) {
1708       const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info = NULL;
1709       if (stage_infos[stage])
1710          subgroup_info = vk_find_struct_const(stage_infos[stage],
1711                                               PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
1712       bool allow_varying_subgroup_size =
1713          !stage_infos[stage] ||
1714          (stage_infos[stage]->flags &
1715           VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
1716       bool require_full_subgroups =
1717          stage_infos[stage] &&
1718          (stage_infos[stage]->flags &
1719           VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
1720       tu_shader_key_subgroup_size(&keys[stage], allow_varying_subgroup_size,
1721                                   require_full_subgroups, subgroup_info,
1722                                   builder->device);
1723 
1724       if (stage_infos[stage]) {
1725          struct vk_pipeline_robustness_state rs;
1726          vk_pipeline_robustness_state_fill(&builder->device->vk, &rs,
1727                                            builder->create_info->pNext,
1728                                            stage_infos[stage]->pNext);
1729          tu_shader_key_robustness(&keys[stage], &rs);
1730          if (builder->create_flags & VK_PIPELINE_CREATE_2_VIEW_INDEX_FROM_DEVICE_INDEX_BIT_KHR)
1731             keys[stage].lower_view_index_to_device_index = true;
1732       }
1733    }
1734 
1735    if ((builder->state &
1736         VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
1737        builder->graphics_state.ial &&
1738        builder->create_info->renderPass == VK_NULL_HANDLE) {
1739       const struct vk_input_attachment_location_state *ial =
1740          builder->graphics_state.ial;
1741 
1742       keys[MESA_SHADER_FRAGMENT].dynamic_renderpass = true;
1743 
1744       uint32_t attachments_referenced = 0;
1745 
1746       if (ial->color_attachment_count == MESA_VK_COLOR_ATTACHMENT_COUNT_UNKNOWN) {
1747          attachments_referenced |=
1748             BITFIELD_MASK(MAX_RTS) << TU_DYN_INPUT_ATT_OFFSET;
1749       } else {
1750          for (unsigned i = 0; i < ial->color_attachment_count; i++) {
1751             if (ial->color_map[i] != MESA_VK_ATTACHMENT_UNUSED) {
1752                attachments_referenced |=
1753                   (1u << (ial->color_map[i] + TU_DYN_INPUT_ATT_OFFSET));
1754             }
1755          }
1756       }
1757 
1758       if (ial->depth_att != MESA_VK_ATTACHMENT_UNUSED) {
1759          if (ial->depth_att == MESA_VK_ATTACHMENT_NO_INDEX)
1760             attachments_referenced |= 1;
1761          else
1762             attachments_referenced |= 1u << (ial->depth_att + 1);
1763       }
1764 
1765       if (ial->stencil_att != MESA_VK_ATTACHMENT_UNUSED) {
1766          if (ial->stencil_att == MESA_VK_ATTACHMENT_NO_INDEX)
1767             attachments_referenced |= 1;
1768          else
1769             attachments_referenced |= 1u << (ial->stencil_att + 1);
1770       }
1771 
1772       keys[MESA_SHADER_FRAGMENT].read_only_input_attachments =
1773          ~attachments_referenced;
1774    }
1775 
1776    if (builder->create_flags &
1777        VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
1778       for (unsigned i = 0; i < builder->num_libraries; i++) {
1779          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1780 
1781          for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
1782             if (library->shaders[j].nir) {
1783                assert(!nir[j]);
1784                nir[j] = nir_shader_clone(builder->mem_ctx,
1785                      library->shaders[j].nir);
1786                keys[j] = library->shaders[j].key;
1787                must_compile = true;
1788             }
1789          }
1790       }
1791    }
1792 
1793    struct tu_nir_shaders *nir_shaders = NULL;
1794    if (!must_compile)
1795       goto done;
1796 
1797    if (builder->state &
1798        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1799       keys[MESA_SHADER_VERTEX].multiview_mask =
1800          builder->graphics_state.rp->view_mask;
1801    }
1802 
1803    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1804       keys[MESA_SHADER_FRAGMENT].multiview_mask =
1805          builder->graphics_state.rp->view_mask;
1806       keys[MESA_SHADER_FRAGMENT].fragment_density_map =
1807          builder->fragment_density_map;
1808       keys[MESA_SHADER_FRAGMENT].unscaled_input_fragcoord =
1809          builder->unscaled_input_fragcoord;
1810 
1811       const VkPipelineMultisampleStateCreateInfo *msaa_info =
1812          builder->create_info->pMultisampleState;
1813 
1814       /* The 1.3.215 spec says:
1815        *
1816        *    Sample shading can be used to specify a minimum number of unique
1817        *    samples to process for each fragment. If sample shading is enabled,
1818        *    an implementation must provide a minimum of
1819        *
1820        *       max(ceil(minSampleShadingFactor * totalSamples), 1)
1821        *
1822        *    unique associated data for each fragment, where
1823        *    minSampleShadingFactor is the minimum fraction of sample shading.
1824        *
1825        * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
1826        * They both require unique associated data.
1827        *
1828        * There are discussions to change the definition, such that
1829        * sampleShadingEnable does not imply unique associated data.  Before the
1830        * discussions are settled and before apps (i.e., ANGLE) are fixed to
1831        * follow the new and incompatible definition, we should stick to the
1832        * current definition.
1833        *
1834        * Note that ir3_shader_key::sample_shading is not actually used by ir3,
1835        * just checked in tu6_emit_fs_inputs.  We will also copy the value to
1836        * tu_shader_key::force_sample_interp in a bit.
1837        */
1838       keys[MESA_SHADER_FRAGMENT].force_sample_interp =
1839          !builder->rasterizer_discard && msaa_info && msaa_info->sampleShadingEnable;
1840    }
1841 
1842    unsigned char pipeline_sha1[20];
1843    tu_hash_shaders(pipeline_sha1, builder->create_flags, stage_infos, nir,
1844                    &builder->layout, keys, builder->state);
1845 
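   /* Cache keys are the 20-byte pipeline hash plus one tag byte: 'N' marks
    * the retained-NIR cache object, while the per-stage tu_shader objects
    * looked up below use the shader stage index as the tag.
    */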
1846    unsigned char nir_sha1[21];
1847    memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1848    nir_sha1[20] = 'N';
1849 
1850    if (!executable_info) {
1851       cache_hit = true;
1852       bool application_cache_hit = false;
1853 
1854       unsigned char shader_sha1[21];
1855       memcpy(shader_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1856 
1857       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1858            stage = (gl_shader_stage) (stage + 1)) {
1859          if (stage_infos[stage] || nir[stage]) {
1860             bool shader_application_cache_hit;
1861             shader_sha1[20] = (unsigned char) stage;
1862             shaders[stage] =
1863                tu_pipeline_cache_lookup(builder->cache, &shader_sha1,
1864                                         sizeof(shader_sha1),
1865                                         &shader_application_cache_hit);
1866             if (!shaders[stage]) {
1867                cache_hit = false;
1868                break;
1869             }
1870             application_cache_hit &= shader_application_cache_hit;
1871          }
1872       }
1873 
1874       /* If the user asks us to keep the NIR around, we need to have it for a
1875        * successful cache hit. If we only have a "partial" cache hit, then we
1876        * still need to recompile in order to get the NIR.
1877        */
1878       if (cache_hit &&
1879           (builder->create_flags &
1880            VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
1881          bool nir_application_cache_hit = false;
1882          nir_shaders =
1883             tu_nir_cache_lookup(builder->cache, &nir_sha1,
1884                                 sizeof(nir_sha1),
1885                                 &nir_application_cache_hit);
1886 
1887          application_cache_hit &= nir_application_cache_hit;
1888          cache_hit &= !!nir_shaders;
1889       }
1890 
1891       if (application_cache_hit && builder->cache != builder->device->mem_cache) {
1892          pipeline_feedback.flags |=
1893             VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
1894       }
1895    }
1896 
1897    if (!cache_hit) {
1898       if (builder->create_flags &
1899           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
1900          return VK_PIPELINE_COMPILE_REQUIRED;
1901       }
1902 
1903       result = tu_compile_shaders(builder->device,
1904                                   builder->create_flags,
1905                                   stage_infos,
1906                                   nir,
1907                                   keys,
1908                                   &builder->layout,
1909                                   pipeline_sha1,
1910                                   shaders,
1911                                   executable_info ? nir_initial_disasm : NULL,
1912                                   pipeline->executables_mem_ctx,
1913                                   retain_nir ? post_link_nir : NULL,
1914                                   stage_feedbacks);
1915 
1916       if (result != VK_SUCCESS)
1917          goto fail;
1918 
1919       if (retain_nir) {
1920          nir_shaders =
1921             tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
1922          for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1923               stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1924             if (!post_link_nir[stage])
1925                continue;
1926 
1927             nir_shaders->nir[stage] = post_link_nir[stage];
1928          }
1929 
1930          nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
1931       }
1932 
1933       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1934            stage = (gl_shader_stage) (stage + 1)) {
1935          if (!nir[stage])
1936             continue;
1937 
1938          shaders[stage] = tu_pipeline_cache_insert(builder->cache, shaders[stage]);
1939       }
1940    }
1941 
1942 done:
1943 
1944    /* Create empty shaders which contain the draw states to initialize
1945     * registers for unused shader stages.
1946     */
1947    if (builder->state &
1948        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1949       if (!shaders[MESA_SHADER_TESS_CTRL]) {
1950          shaders[MESA_SHADER_TESS_CTRL] = builder->device->empty_tcs;
1951          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_CTRL]->base);
1952       }
1953       if (!shaders[MESA_SHADER_TESS_EVAL]) {
1954          shaders[MESA_SHADER_TESS_EVAL] = builder->device->empty_tes;
1955          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_EVAL]->base);
1956       }
1957       if (!shaders[MESA_SHADER_GEOMETRY]) {
1958          shaders[MESA_SHADER_GEOMETRY] = builder->device->empty_gs;
1959          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_GEOMETRY]->base);
1960       }
1961    }
1962 
1963    if (builder->state &
1964        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1965       if (!shaders[MESA_SHADER_FRAGMENT]) {
1966          shaders[MESA_SHADER_FRAGMENT] =
1967             builder->fragment_density_map ?
1968             builder->device->empty_fs_fdm : builder->device->empty_fs;
1969          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_FRAGMENT]->base);
1970       }
1971    }
1972 
1973    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1974         stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1975       if (shaders[stage] && shaders[stage]->variant) {
1976          tu_append_executable(pipeline, shaders[stage]->variant,
1977                               nir_initial_disasm[stage]);
1978       }
1979    }
1980 
1981    /* We may have deduplicated a cache entry, in which case our original
1982     * post_link_nir may be gone.
1983     */
1984    if (nir_shaders) {
1985       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1986            stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1987          if (nir_shaders->nir[stage]) {
1988             post_link_nir[stage] = nir_shaders->nir[stage];
1989          }
1990       }
1991    }
1992 
1993    /* In the case where we're building a library without link-time
1994     * optimization but with sub-libraries that retain LTO info, we should
1995     * retain it ourselves in case another pipeline includes us with LTO.
1996     */
1997    for (unsigned i = 0; i < builder->num_libraries; i++) {
1998       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1999       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2000            stage < ARRAY_SIZE(library->shaders);
2001            stage = (gl_shader_stage) (stage + 1)) {
2002          if (!post_link_nir[stage] && library->shaders[stage].nir) {
2003             post_link_nir[stage] = library->shaders[stage].nir;
2004             keys[stage] = library->shaders[stage].key;
2005          }
2006 
2007          if (!shaders[stage] && library->base.shaders[stage]) {
2008             shaders[stage] = library->base.shaders[stage];
2009             vk_pipeline_cache_object_ref(&shaders[stage]->base);
2010          }
2011       }
2012    }
2013 
2014    if (shaders[MESA_SHADER_VERTEX]) {
2015       const struct ir3_shader_variant *vs =
2016          shaders[MESA_SHADER_VERTEX]->variant;
2017 
2018       if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
2019          tu_append_executable(pipeline, vs->binning, NULL);
2020       }
2021    }
2022 
2023    if (pipeline_contains_all_shader_state(pipeline)) {
2024       /* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
2025        * when compiling all stages, but make sure we don't leak.
2026        */
2027       if (nir_shaders)
2028          vk_pipeline_cache_object_unref(&builder->device->vk,
2029                                         &nir_shaders->base);
2030    } else {
2031       struct tu_graphics_lib_pipeline *library =
2032          tu_pipeline_to_graphics_lib(pipeline);
2033       library->nir_shaders = nir_shaders;
2034       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2035            stage < ARRAY_SIZE(library->shaders);
2036            stage = (gl_shader_stage) (stage + 1)) {
2037          library->shaders[stage].nir = post_link_nir[stage];
2038          library->shaders[stage].key = keys[stage];
2039       }
2040    }
2041 
2042    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2043         stage < ARRAY_SIZE(shaders); stage = (gl_shader_stage) (stage + 1)) {
2044       pipeline->shaders[stage] = shaders[stage];
2045       if (shaders[stage])
2046          pipeline->active_desc_sets |= shaders[stage]->active_desc_sets;
2047    }
2048 
2049    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
2050    if (creation_feedback) {
2051       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
2052 
2053       for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2054          gl_shader_stage s =
2055             vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2056          creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
2057       }
2058    }
2059 
2060    return VK_SUCCESS;
2061 
2062 fail:
2063    if (nir_shaders)
2064       vk_pipeline_cache_object_unref(&builder->device->vk,
2065                                      &nir_shaders->base);
2066 
2067    return result;
2068 }
2069 
2070 static void
2071 tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
2072                                     struct tu_pipeline *pipeline)
2073 {
2074    const VkPipelineLibraryCreateInfoKHR *library_info =
2075       vk_find_struct_const(builder->create_info->pNext,
2076                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
2077 
2078    if (library_info) {
2079       assert(library_info->libraryCount <= MAX_LIBRARIES);
2080       builder->num_libraries = library_info->libraryCount;
2081       for (unsigned i = 0; i < library_info->libraryCount; i++) {
2082          VK_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
2083          builder->libraries[i] = tu_pipeline_to_graphics_lib(library);
2084       }
2085    }
2086 
2087    /* Merge in the state from libraries. The program state is a bit special
2088     * and is handled separately.
2089     */
2090    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2091       tu_pipeline_to_graphics_lib(pipeline)->state = builder->state;
2092    for (unsigned i = 0; i < builder->num_libraries; i++) {
2093       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2094       if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2095          tu_pipeline_to_graphics_lib(pipeline)->state |= library->state;
2096 
2097       if (library->state &
2098           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
2099          pipeline->output = library->base.output;
2100          pipeline->lrz_blend.reads_dest |= library->base.lrz_blend.reads_dest;
2101          pipeline->lrz_blend.valid |= library->base.lrz_blend.valid;
2102          pipeline->prim_order = library->base.prim_order;
2103       }
2104 
2105       if ((library->state &
2106            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
2107           (library->state &
2108            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
2109          pipeline->prim_order = library->base.prim_order;
2110       }
2111 
2112       pipeline->set_state_mask |= library->base.set_state_mask;
2113 
2114       u_foreach_bit (i, library->base.set_state_mask) {
2115          pipeline->dynamic_state[i] = library->base.dynamic_state[i];
2116       }
2117 
2118       if (contains_all_shader_state(library->state)) {
2119          pipeline->program = library->base.program;
2120          pipeline->load_state = library->base.load_state;
2121          for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
2122             if (library->base.shaders[i]) {
2123                pipeline->shaders[i] = library->base.shaders[i];
2124                vk_pipeline_cache_object_ref(&pipeline->shaders[i]->base);
2125             }
2126          }
2127       }
2128 
2129       BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask,
2130                 library->base.static_state_mask);
2131 
2132       vk_graphics_pipeline_state_merge(&builder->graphics_state,
2133                                        &library->graphics_state);
2134    }
2135 }
2136 
2137 static void
2138 tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
2139                                  struct tu_pipeline *pipeline)
2140 {
2141    VK_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
2142 
2143    if (layout) {
2144       /* Note: it's still valid to have a layout even if there are libraries.
2145        * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
2146        * a non-INDEPENDENT_SET layout which may make us use a faster path,
2147        * currently this just affects dynamic offset descriptors.
2148        */
2149       builder->layout = *layout;
2150    } else {
2151       for (unsigned i = 0; i < builder->num_libraries; i++) {
2152          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2153          builder->layout.num_sets = MAX2(builder->layout.num_sets,
2154                                          library->num_sets);
2155          assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
2156          for (unsigned j = 0; j < library->num_sets; j++) {
2157             builder->layout.set[j].layout = library->layouts[j];
2158          }
2159 
2160          builder->layout.push_constant_size = library->push_constant_size;
2161       }
2162 
2163       tu_pipeline_layout_init(&builder->layout);
2164    }
2165 
2166    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
2167       struct tu_graphics_lib_pipeline *library =
2168          tu_pipeline_to_graphics_lib(pipeline);
2169       library->num_sets = builder->layout.num_sets;
2170       for (unsigned i = 0; i < library->num_sets; i++) {
2171          library->layouts[i] = builder->layout.set[i].layout;
2172          if (library->layouts[i])
2173             vk_descriptor_set_layout_ref(&library->layouts[i]->vk);
2174       }
2175       library->push_constant_size = builder->layout.push_constant_size;
2176    }
2177 }
2178 
2179 static void
2180 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2181                         struct tu_const_state *const_state,
2182                         const struct ir3_shader_variant *v)
2183 {
2184    link->const_state = *ir3_const_state(v);
2185    link->tu_const_state = *const_state;
2186    link->constlen = v->constlen;
2187 }
2188 
2189 template <chip CHIP>
2190 static void
2191 tu_emit_program_state(struct tu_cs *sub_cs,
2192                       struct tu_program_state *prog,
2193                       struct tu_shader **shaders)
2194 {
2195    struct tu_device *dev = sub_cs->device;
2196    struct tu_cs prog_cs;
2197 
2198    const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
2199    struct tu_draw_state draw_states[MESA_SHADER_STAGES];
2200 
2201    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2202         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2203       variants[stage] = shaders[stage] ? shaders[stage]->variant : NULL;
2204    }
2205 
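   /* ir3_trim_constlen returns a bitmask of stages whose const space had to
    * be trimmed, presumably so the combined constlen fits the hardware
    * limit; those stages switch to their safe_const_variant below.
    */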
2206    uint32_t safe_variants =
2207       ir3_trim_constlen(variants, dev->compiler);
2208 
2209    unsigned dynamic_descriptor_sizes[MAX_SETS] = { };
2210 
2211    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2212         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2213       if (shaders[stage]) {
2214          if (safe_variants & (1u << stage)) {
2215             variants[stage] = shaders[stage]->safe_const_variant;
2216             draw_states[stage] = shaders[stage]->safe_const_state;
2217          } else {
2218             draw_states[stage] = shaders[stage]->state;
2219          }
2220 
2221          for (unsigned i = 0; i < MAX_SETS; i++) {
2222             if (shaders[stage]->dynamic_descriptor_sizes[i] >= 0) {
2223                dynamic_descriptor_sizes[i] =
2224                   shaders[stage]->dynamic_descriptor_sizes[i];
2225             }
2226          }
2227       }
2228    }
2229 
2230    for (unsigned i = 0; i < ARRAY_SIZE(variants); i++) {
2231       if (!variants[i])
2232          continue;
2233 
2234       tu_pipeline_set_linkage(&prog->link[i],
2235                               &shaders[i]->const_state,
2236                               variants[i]);
2237 
2238       struct tu_push_constant_range *push_consts =
2239          &shaders[i]->const_state.push_consts;
2240       if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
2241           push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
2242          prog->shared_consts = *push_consts;
2243       }
2244 
2245       if (variants[i]->info.uses_ray_intersection)
2246          prog->uses_ray_intersection = true;
2247    }
2248 
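   /* Running prefix sum: each set's dynamic descriptors are laid out after
    * the dynamic descriptors of all lower-numbered sets.
    */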
2249    unsigned dynamic_descriptor_offset = 0;
2250    for (unsigned i = 0; i < MAX_SETS; i++) {
2251       prog->dynamic_descriptor_offsets[i] = dynamic_descriptor_offset;
2252       dynamic_descriptor_offset += dynamic_descriptor_sizes[i];
2253    }
2254 
2255    /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
2256     * else that could depend on that state (like push constants)
2257     *
2258     * Note also that this always uses the full VS even in binning pass.  The
2259     * binning pass variant has the same const layout as the full VS, and
2260     * the constlen for the VS will be the same or greater than the constlen
2261     * for the binning pass variant.  It is required that the constlen state
2262     * matches between binning and draw passes, as some parts of the push
2263     * consts are emitted in state groups that are shared between the binning
2264     * and draw passes.
2265     */
2266    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2267    tu6_emit_program_config<CHIP>(&prog_cs, prog, shaders, variants);
2268    prog->config_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2269 
2270    prog->vs_state = draw_states[MESA_SHADER_VERTEX];
2271 
2272    /* Don't use the binning pass variant when GS is present because we don't
2273     * support compiling correct binning pass variants with GS.
2274     */
2275    if (variants[MESA_SHADER_GEOMETRY]) {
2276       prog->vs_binning_state = prog->vs_state;
2277    } else {
2278       prog->vs_binning_state =
2279          shaders[MESA_SHADER_VERTEX]->binning_state;
2280    }
2281 
2282    prog->hs_state = draw_states[MESA_SHADER_TESS_CTRL];
2283    prog->ds_state = draw_states[MESA_SHADER_TESS_EVAL];
2284    prog->gs_state = draw_states[MESA_SHADER_GEOMETRY];
2285    prog->gs_binning_state =
2286       shaders[MESA_SHADER_GEOMETRY]->binning_state;
2287    prog->fs_state = draw_states[MESA_SHADER_FRAGMENT];
2288 
2289    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
2290    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
2291    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
2292    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
2293    const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
2294 
2295    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2296    tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
2297    prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2298 
2299    const struct ir3_shader_variant *last_shader;
2300    if (gs)
2301       last_shader = gs;
2302    else if (ds)
2303       last_shader = ds;
2304    else
2305       last_shader = vs;
2306 
2307    prog->per_view_viewport =
2308       !last_shader->writes_viewport &&
2309       shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm &&
2310       dev->physical_device->info->a6xx.has_per_view_viewport;
2311    prog->writes_shading_rate = last_shader->writes_shading_rate;
2312    prog->reads_shading_rate = fs->reads_shading_rate;
2313    prog->accesses_smask = fs->reads_smask || fs->writes_smask;
2314 }
2315 
2316 static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = {
2317    MESA_VK_DYNAMIC_VI,
2318 };
2319 
2320 template <chip CHIP>
2321 static unsigned
2322 tu6_vertex_input_size(struct tu_device *dev,
2323                       const struct vk_vertex_input_state *vi)
2324 {
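   /* Worst-case size: one dword for the packet header plus two dwords
    * (VFD_DECODE_INSTR and VFD_DECODE_STEP_RATE) per attribute slot, as
    * emitted by tu6_emit_vertex_input() below.
    */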
2325    return 1 + 2 * util_last_bit(vi->attributes_valid);
2326 }
2327 
2328 template <chip CHIP>
2329 static void
2330 tu6_emit_vertex_input(struct tu_cs *cs,
2331                       const struct vk_vertex_input_state *vi)
2332 {
2333    unsigned attr_count = util_last_bit(vi->attributes_valid);
2334    if (attr_count != 0)
2335       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2);
2336 
2337    for (uint32_t loc = 0; loc < attr_count; loc++) {
2338       const struct vk_vertex_attribute_state *attr = &vi->attributes[loc];
2339 
2340       if (vi->attributes_valid & (1u << loc)) {
2341          const struct vk_vertex_binding_state *binding =
2342             &vi->bindings[attr->binding];
2343 
2344          enum pipe_format pipe_format = vk_format_to_pipe_format(attr->format);
2345          const struct tu_native_format format = tu6_format_vtx(pipe_format);
2346          tu_cs_emit(cs, A6XX_VFD_DECODE_INSTR(0,
2347                           .idx = attr->binding,
2348                           .offset = attr->offset,
2349                           .instanced = binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2350                           .format = format.fmt,
2351                           .swap = format.swap,
2352                           .unk30 = 1,
2353                           ._float = !util_format_is_pure_integer(pipe_format)).value);
2354          tu_cs_emit(cs, A6XX_VFD_DECODE_STEP_RATE(0, binding->divisor).value);
2355       } else {
2356          tu_cs_emit(cs, 0);
2357          tu_cs_emit(cs, 0);
2358       }
2359    }
2360 }
2361 
2362 static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = {
2363    MESA_VK_DYNAMIC_VI_BINDINGS_VALID,
2364    MESA_VK_DYNAMIC_VI_BINDING_STRIDES,
2365 };
2366 
2367 template <chip CHIP>
2368 static unsigned
2369 tu6_vertex_stride_size(struct tu_device *dev,
2370                        const struct vk_vertex_input_state *vi)
2371 {
2372    return 1 + 2 * util_last_bit(vi->bindings_valid);
2373 }
2374 
2375 template <chip CHIP>
2376 static void
2377 tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi)
2378 {
2379    if (vi->bindings_valid) {
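      /* CP_CONTEXT_REG_BUNCH takes (register offset, value) pairs, which
       * lets us program only the per-binding VFD_FETCH_STRIDE registers
       * instead of a contiguous register range.
       */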
2380       unsigned bindings_count = util_last_bit(vi->bindings_valid);
2381       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2382       for (unsigned i = 0; i < bindings_count; i++) {
2383          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2384          tu_cs_emit(cs, vi->bindings[i].stride);
2385       }
2386    }
2387 }
2388 
2389 template <chip CHIP>
2390 static unsigned
2391 tu6_vertex_stride_size_dyn(struct tu_device *dev,
2392                            const uint16_t *vi_binding_stride,
2393                            uint32_t bindings_valid)
2394 {
2395    return 1 + 2 * util_last_bit(bindings_valid);
2396 }
2397 
2398 template <chip CHIP>
2399 static void
2400 tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride,
2401                            uint32_t bindings_valid)
2402 {
2403    if (bindings_valid) {
2404       unsigned bindings_count = util_last_bit(bindings_valid);
2405       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2406       for (unsigned i = 0; i < bindings_count; i++) {
2407          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2408          tu_cs_emit(cs, vi_binding_stride[i]);
2409       }
2410    }
2411 }
2412 
2413 static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = {
2414    MESA_VK_DYNAMIC_VP_VIEWPORTS,
2415    MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT,
2416    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
2417    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
2418 };
2419 
2420 template <chip CHIP>
2421 static unsigned
2422 tu6_viewport_size(struct tu_device *dev,
2423                   const struct vk_viewport_state *vp,
2424                   const struct vk_rasterization_state *rs)
2425 {
2426    return 1 + vp->viewport_count * 6 + 1 + vp->viewport_count * 2 +
2427       1 + vp->viewport_count * 2 + 5;
2428 }
2429 
2430 template <chip CHIP>
2431 static void
2432 tu6_emit_viewport(struct tu_cs *cs,
2433                   const struct vk_viewport_state *vp,
2434                   const struct vk_rasterization_state *rs)
2435 {
2436    VkExtent2D guardband = {511, 511};
2437 
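   /* Each viewport is programmed as three (offset, scale) pairs. X and Y map
    * NDC [-1,1] onto the viewport rectangle; Z uses either the [-1,1] or the
    * [0,1] convention depending on depth_clip_negative_one_to_one, which is
    * why the depth scale/offset math below differs between the two cases.
    */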
2438    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), vp->viewport_count * 6);
2439    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2440       const VkViewport *viewport = &vp->viewports[i];
2441       float offsets[3];
2442       float scales[3];
2443       scales[0] = viewport->width / 2.0f;
2444       scales[1] = viewport->height / 2.0f;
2445       if (vp->depth_clip_negative_one_to_one) {
2446          scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
2447       } else {
2448          scales[2] = viewport->maxDepth - viewport->minDepth;
2449       }
2450 
2451       offsets[0] = viewport->x + scales[0];
2452       offsets[1] = viewport->y + scales[1];
2453       if (vp->depth_clip_negative_one_to_one) {
2454          offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
2455       } else {
2456          offsets[2] = viewport->minDepth;
2457       }
2458 
2459       for (uint32_t j = 0; j < 3; j++) {
2460          tu_cs_emit(cs, fui(offsets[j]));
2461          tu_cs_emit(cs, fui(scales[j]));
2462       }
2463 
2464       guardband.width =
2465          MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
2466       guardband.height =
2467          MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
2468    }
2469 
2470    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), vp->viewport_count * 2);
2471    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2472       const VkViewport *viewport = &vp->viewports[i];
2473       VkOffset2D min;
2474       VkOffset2D max;
2475       min.x = (int32_t) viewport->x;
2476       max.x = (int32_t) ceilf(viewport->x + viewport->width);
2477       if (viewport->height >= 0.0f) {
2478          min.y = (int32_t) viewport->y;
2479          max.y = (int32_t) ceilf(viewport->y + viewport->height);
2480       } else {
2481          min.y = (int32_t)(viewport->y + viewport->height);
2482          max.y = (int32_t) ceilf(viewport->y);
2483       }
2484       /* the spec allows viewport->height to be 0.0f */
2485       if (min.y == max.y)
2486          max.y++;
2487       /* allow viewport->width = 0.0f for un-initialized viewports: */
2488       if (min.x == max.x)
2489          max.x++;
2490 
2491       min.x = MAX2(min.x, 0);
2492       min.y = MAX2(min.y, 0);
2493       max.x = MAX2(max.x, 1);
2494       max.y = MAX2(max.y, 1);
2495 
2496       assert(min.x < max.x);
2497       assert(min.y < max.y);
2498 
2499       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
2500                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
2501       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
2502                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
2503    }
2504 
2505    /* A7XX+ doesn't clamp to [0,1] when depth clamp is disabled, so to support
2506     * VK_EXT_depth_clamp_zero_one we have to always enable clamping and manually
2507     * set the range to [0,1] when rs->depth_clamp_enable is false.
2508     */
2509    bool zero_one_depth_clamp = CHIP >= A7XX && !rs->depth_clamp_enable;
2510 
2511    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), vp->viewport_count * 2);
2512    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2513       const VkViewport *viewport = &vp->viewports[i];
2514       if (zero_one_depth_clamp) {
2515          tu_cs_emit(cs, fui(0.0f));
2516          tu_cs_emit(cs, fui(1.0f));
2517       } else {
2518          tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
2519          tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
2520       }
2521    }
2522    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
2523    tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
2524                   A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
2525 
2526    /* TODO: what to do about this with multiple viewports? */
2527    float z_clamp_min = vp->viewport_count ? MIN2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2528    float z_clamp_max = vp->viewport_count ? MAX2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2529    if (zero_one_depth_clamp) {
2530       z_clamp_min = 0.0f;
2531       z_clamp_max = 1.0f;
2532    }
2533 
2534    tu_cs_emit_regs(cs,
2535                    A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
2536                    A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
2537 }
2538 
2539 struct apply_viewport_state {
2540    struct vk_viewport_state vp;
2541    struct vk_rasterization_state rs;
2542    bool share_scale;
2543 };
2544 
2545 /* It's a hardware restriction that the window offset (i.e. bin.offset) must
2546  * be the same for all views. This means that GMEM coordinates cannot be a
2547  * simple scaling of framebuffer coordinates, because this would require us to
2548  * scale the window offset and the scale may be different per view. Instead we
2549  * have to apply a per-bin offset to the GMEM coordinate transform to make
2550  * sure that the window offset maps to itself. Specifically we need an offset
2551  * o to the transform:
2552  *
2553  * x' = s * x + o
2554  *
2555  * so that when we plug in the bin start b_s:
2556  *
2557  * b_s = s * b_s + o
2558  *
2559  * and we get:
2560  *
2561  * o = b_s - s * b_s
2562  *
2563  * We use this form exactly, because we know the bin offset is a multiple of
2564  * the frag area so s * b_s is an integer and we can compute an exact result
2565  * easily.
2566  */
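/* Worked example with illustrative numbers: for frag_area = 2x2 and
 * bin.offset = (96, 64) we get s = 1/2 and o = (96 - 96/2, 64 - 64/2) =
 * (48, 32); plugging the bin start back in gives 96 * 1/2 + 48 = 96, so the
 * window offset indeed maps to itself.
 */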
2567 
2568 VkOffset2D
2569 tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
2570 {
2571    assert(bin.offset.x % frag_area.width == 0);
2572    assert(bin.offset.y % frag_area.height == 0);
2573 
2574    return (VkOffset2D) {
2575       bin.offset.x - bin.offset.x / frag_area.width,
2576       bin.offset.y - bin.offset.y / frag_area.height
2577    };
2578 }
2579 
2580 static void
2581 fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2582                     VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2583 {
2584    const struct apply_viewport_state *state =
2585       (const struct apply_viewport_state *)data;
2586 
2587    struct vk_viewport_state vp = state->vp;
2588 
2589    for (unsigned i = 0; i < state->vp.viewport_count; i++) {
2590       /* Note: If we're using shared scaling, the scale should already be the
2591        * same across all views, so we can pick any view. However, the number
2592        * of viewports and the number of views are not guaranteed to be the
2593        * same, so to be safe we pick the 0'th view, which always exists.
2594        *
2595        * Conversely, if we're not using shared scaling then the rasterizer in
2596        * the original pipeline is using only the first viewport, so we need to
2597        * replicate it across all viewports.
2598        */
2599       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2600       VkViewport viewport =
2601          state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
2602       if (frag_area.width == 1 && frag_area.height == 1) {
2603          vp.viewports[i] = viewport;
2604          continue;
2605       }
2606 
2607       float scale_x = (float) 1.0f / frag_area.width;
2608       float scale_y = (float) 1.0f / frag_area.height;
2609 
2610       vp.viewports[i].minDepth = viewport.minDepth;
2611       vp.viewports[i].maxDepth = viewport.maxDepth;
2612       vp.viewports[i].width = viewport.width * scale_x;
2613       vp.viewports[i].height = viewport.height * scale_y;
2614 
2615       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2616 
2617       vp.viewports[i].x = scale_x * viewport.x + offset.x;
2618       vp.viewports[i].y = scale_y * viewport.y + offset.y;
2619    }
2620 
2621    TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp, &state->rs);
2622 }
2623 
2624 static void
2625 tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2626                       const struct vk_viewport_state *vp,
2627                       const struct vk_rasterization_state *rs)
2628 {
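        /* The final viewport values depend on each bin's fragment area, which
         * is only known when the bins are processed, so reserve space for the
         * draw state now and register a patchpoint that lets
         * fdm_apply_viewports() fill in the actual values per bin.
         */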
2629    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2630    struct apply_viewport_state state = {
2631       .vp = *vp,
2632       .rs = *rs,
2633       .share_scale = !cmd->state.per_view_viewport,
2634    };
2635    if (!state.share_scale)
2636       state.vp.viewport_count = num_views;
2637    unsigned size = TU_CALLX(cmd->device, tu6_viewport_size)(cmd->device, &state.vp, &state.rs);
2638    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2639    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_viewports, state);
2640    cmd->state.rp.shared_viewport |= !cmd->state.per_view_viewport;
2641 }
2642 
2643 static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = {
2644    MESA_VK_DYNAMIC_VP_SCISSORS,
2645    MESA_VK_DYNAMIC_VP_SCISSOR_COUNT,
2646 };
2647 
2648 template <chip CHIP>
2649 static unsigned
2650 tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp)
2651 {
2652    return 1 + vp->scissor_count * 2;
2653 }
2654 
2655 template <chip CHIP>
2656 void
2657 tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
2658 {
2659    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), vp->scissor_count * 2);
2660 
2661    for (uint32_t i = 0; i < vp->scissor_count; i++) {
2662       const VkRect2D *scissor = &vp->scissors[i];
2663 
2664       uint32_t min_x = scissor->offset.x;
2665       uint32_t min_y = scissor->offset.y;
2666       uint32_t max_x = min_x + scissor->extent.width - 1;
2667       uint32_t max_y = min_y + scissor->extent.height - 1;
2668 
2669       if (!scissor->extent.width || !scissor->extent.height) {
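              /* Empty scissor: program an inverted rect (TL > BR) so that
               * nothing passes the scissor test.
               */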
2670          min_x = min_y = 1;
2671          max_x = max_y = 0;
2672       } else {
2673          /* avoid overflow */
2674          uint32_t scissor_max = BITFIELD_MASK(15);
2675          min_x = MIN2(scissor_max, min_x);
2676          min_y = MIN2(scissor_max, min_y);
2677          max_x = MIN2(scissor_max, max_x);
2678          max_y = MIN2(scissor_max, max_y);
2679       }
2680 
2681       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
2682                      A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
2683       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
2684                      A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
2685    }
2686 }
2687 
2688 static void
2689 fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2690                    VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2691 {
2692    const struct apply_viewport_state *state =
2693       (const struct apply_viewport_state *)data;
2694 
2695    struct vk_viewport_state vp = state->vp;
2696 
2697    for (unsigned i = 0; i < vp.scissor_count; i++) {
2698       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2699       VkRect2D scissor =
2700          state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
2701       if (frag_area.width == 1 && frag_area.height == 1) {
2702          vp.scissors[i] = scissor;
2703          continue;
2704       }
2705 
2706       /* Transform the scissor following the viewport. It's unclear how this
2707        * is supposed to handle cases where the scissor isn't aligned to the
2708        * fragment area, but we round outwards so that partial fragments are
2709        * still rendered when the scissor size equals the framebuffer size but
2710        * isn't aligned to the fragment area.
2711        */
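           /* For example, a scissor starting at x = 3 with width 100 and a 2x2
            * fragment area maps to the scaled range [1, 52) plus the per-bin
            * offset, rounding the exact range [1.5, 51.5] outwards.
            */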
2712       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2713       VkOffset2D min = {
2714          scissor.offset.x / frag_area.width + offset.x,
2715          scissor.offset.y / frag_area.height + offset.y,
2716       };
2717       VkOffset2D max = {
2718          DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
2719          DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
2720       };
2721 
2722       /* Intersect scissor with the scaled bin, this essentially replaces the
2723        * window scissor.
2724        */
2725       uint32_t scaled_width = bin.extent.width / frag_area.width;
2726       uint32_t scaled_height = bin.extent.height / frag_area.height;
2727       vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
2728       vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
2729       vp.scissors[i].extent.width =
2730          MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
2731       vp.scissors[i].extent.height =
2732          MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
2733    }
2734 
2735    TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
2736 }
2737 
2738 static void
2739 tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2740                      const struct vk_viewport_state *vp)
2741 {
2742    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2743    struct apply_viewport_state state = {
2744       .vp = *vp,
2745       .share_scale = !cmd->state.per_view_viewport,
2746    };
2747    if (!state.share_scale)
2748       state.vp.scissor_count = num_views;
2749    unsigned size = TU_CALLX(cmd->device, tu6_scissor_size)(cmd->device, &state.vp);
2750    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2751    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_scissors, state);
2752 }
2753 
2754 static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = {
2755    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE,
2756    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS,
2757 };
2758 
2759 template <chip CHIP>
2760 static unsigned
2761 tu6_sample_locations_size(struct tu_device *dev, bool enable,
2762                           const struct vk_sample_locations_state *samp_loc)
2763 {
2764    return 6 + (enable ? 9 : 0);
2765 }
2766 
2767 template <chip CHIP>
2768 void
2769 tu6_emit_sample_locations(struct tu_cs *cs, bool enable,
2770                           const struct vk_sample_locations_state *samp_loc)
2771 {
2772    uint32_t sample_config =
2773       COND(enable, A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE);
2774 
2775    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
2776    tu_cs_emit(cs, sample_config);
2777 
2778    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
2779    tu_cs_emit(cs, sample_config);
2780 
2781    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
2782    tu_cs_emit(cs, sample_config);
2783 
2784    if (!enable)
2785       return;
2786 
2787    assert(samp_loc->grid_size.width == 1);
2788    assert(samp_loc->grid_size.height == 1);
2789 
2790    uint64_t sample_locations = 0;
2791    for (uint32_t i = 0; i < samp_loc->per_pixel; i++) {
2792       /* From VkSampleLocationEXT:
2793        *
2794        *    The values specified in a VkSampleLocationEXT structure are always
2795        *    clamped to the implementation-dependent sample location coordinate
2796        *    range
2797        *    [sampleLocationCoordinateRange[0],sampleLocationCoordinateRange[1]]
2798        */
2799       float x = CLAMP(samp_loc->locations[i].x, SAMPLE_LOCATION_MIN,
2800                       SAMPLE_LOCATION_MAX);
2801       float y = CLAMP(samp_loc->locations[i].y, SAMPLE_LOCATION_MIN,
2802                       SAMPLE_LOCATION_MAX);
2803 
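           /* Each sample's packed X/Y location occupies one byte of the 64-bit
            * SAMPLE_LOCATION register pair, hence the shift by i * 8.
            */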
2804       sample_locations |=
2805          ((uint64_t)(A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
2806                      A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y))) << i*8;
2807    }
2808 
2809    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_LOCATION_0, 2);
2810    tu_cs_emit_qw(cs, sample_locations);
2811 
2812    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_LOCATION_0, 2);
2813    tu_cs_emit_qw(cs, sample_locations);
2814 
2815    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_LOCATION_0, 2);
2816    tu_cs_emit_qw(cs, sample_locations);
2817 }
2818 
2819 static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = {
2820    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS,
2821 };
2822 
2823 template <chip CHIP>
2824 static unsigned
2825 tu6_depth_bias_size(struct tu_device *dev,
2826                     const struct vk_rasterization_state *rs)
2827 {
2828    return 4;
2829 }
2830 
2831 template <chip CHIP>
2832 void
2833 tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs)
2834 {
2835    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2836    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(rs->depth_bias.slope_factor).value);
2837    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(rs->depth_bias.constant_factor).value);
2838    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(rs->depth_bias.clamp).value);
2839 }
2840 
2841 static const enum mesa_vk_dynamic_graphics_state tu_bandwidth_state[] = {
2842    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2843    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2844    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2845    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2846    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2847    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2848 };
2849 
2850 static void
2851 tu_calc_bandwidth(struct tu_bandwidth *bandwidth,
2852                   const struct vk_color_blend_state *cb,
2853                   const struct vk_render_pass_state *rp)
2854 {
2855    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2856 
2857    uint32_t total_bpp = 0;
2858    for (unsigned i = 0; i < cb->attachment_count; i++) {
2859       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2860       if (!(cb->color_write_enables & (1u << i)))
2861          continue;
2862 
2863       const VkFormat format = rp->color_attachment_formats[i];
2864 
2865       uint32_t write_bpp = 0;
2866       if (format == VK_FORMAT_UNDEFINED) {
2867          /* do nothing */
2868       } else if (att->write_mask == 0xf) {
2869          write_bpp = vk_format_get_blocksizebits(format);
2870       } else {
2871          const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
2872          for (uint32_t i = 0; i < 4; i++) {
2873             if (att->write_mask & (1 << i)) {
2874                write_bpp += util_format_get_component_bits(pipe_format,
2875                      UTIL_FORMAT_COLORSPACE_RGB, i);
2876             }
2877          }
2878       }
2879       total_bpp += write_bpp;
2880 
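           /* Blending or a dst-reading logic op has to read the destination
            * back, so count the written bits twice: e.g. a blended RGBA8
            * attachment costs 32 bits written + 32 bits read = 8 bytes per
            * sample.
            */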
2881       if (rop_reads_dst || att->blend_enable) {
2882          total_bpp += write_bpp;
2883       }
2884    }
2885 
2886    bandwidth->color_bandwidth_per_sample = total_bpp / 8;
2887 
2888    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
2889       bandwidth->depth_cpp_per_sample = util_format_get_component_bits(
2890             vk_format_to_pipe_format(rp->depth_attachment_format),
2891             UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
2892    }
2893 
2894    if (rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
2895       bandwidth->stencil_cpp_per_sample = util_format_get_component_bits(
2896             vk_format_to_pipe_format(rp->stencil_attachment_format),
2897             UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
2898    }
2899 }
2900 
2901 /* Return true if the blend state reads the color attachments. */
2902 static bool
2903 tu6_calc_blend_lrz(const struct vk_color_blend_state *cb,
2904                    const struct vk_render_pass_state *rp)
2905 {
2906    if (cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op))
2907       return true;
2908 
2909    for (unsigned i = 0; i < cb->attachment_count; i++) {
2910       if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
2911          continue;
2912 
2913       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2914       if (att->blend_enable)
2915          return true;
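           /* Disabled color writes or a partial write mask preserve the
            * existing attachment contents, which counts as reading the
            * destination here.
            */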
2916       if (!(cb->color_write_enables & (1u << i)))
2917          return true;
2918       unsigned mask =
2919          MASK(vk_format_get_nr_components(rp->color_attachment_formats[i]));
2920       if ((att->write_mask & mask) != mask)
2921          return true;
2922    }
2923 
2924    return false;
2925 }
2926 
2927 static const enum mesa_vk_dynamic_graphics_state tu_blend_lrz_state[] = {
2928    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2929    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2930    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2931    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2932    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2933    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2934 };
2935 
2936 static void
2937 tu_emit_blend_lrz(struct tu_lrz_blend *lrz,
2938                   const struct vk_color_blend_state *cb,
2939                   const struct vk_render_pass_state *rp)
2940 {
2941    lrz->reads_dest = tu6_calc_blend_lrz(cb, rp);
2942    lrz->valid = true;
2943 }
2944 
2945 static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = {
2946    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2947    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2948    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2949    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2950    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2951    MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS,
2952    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2953    MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
2954    MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE,
2955    MESA_VK_DYNAMIC_MS_SAMPLE_MASK,
2956    MESA_VK_DYNAMIC_COLOR_ATTACHMENT_MAP,
2957 };
2958 
2959 template <chip CHIP>
2960 static unsigned
2961 tu6_blend_size(struct tu_device *dev,
2962                const struct vk_color_blend_state *cb,
2963                const struct vk_color_attachment_location_state *cal,
2964                bool alpha_to_coverage_enable,
2965                bool alpha_to_one_enable,
2966                uint32_t sample_mask)
2967 {
2968    unsigned num_rts = alpha_to_coverage_enable ?
2969       MAX2(cb->attachment_count, 1) : cb->attachment_count;
2970    return 8 + 3 * num_rts;
2971 }
2972 
2973 template <chip CHIP>
2974 static void
2975 tu6_emit_blend(struct tu_cs *cs,
2976                const struct vk_color_blend_state *cb,
2977                const struct vk_color_attachment_location_state *cal,
2978                bool alpha_to_coverage_enable,
2979                bool alpha_to_one_enable,
2980                uint32_t sample_mask)
2981 {
2982    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2983    enum a3xx_rop_code rop = tu6_rop((VkLogicOp)cb->logic_op);
2984 
2985    uint32_t blend_enable_mask = 0;
2986    for (unsigned i = 0; i < cb->attachment_count; i++) {
2987       if (!(cb->color_write_enables & (1u << i)) ||
2988           cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
2989          continue;
2990 
2991       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2992 
2993       if (rop_reads_dst || att->blend_enable) {
2994          blend_enable_mask |= 1u << cal->color_map[i];
2995       }
2996    }
2997 
2998    /* This will emit a dummy RB_MRT_*_CONTROL below if alpha-to-coverage is
2999     * enabled but there are no color attachments, in addition to changing
3000     * *_FS_OUTPUT_CNTL1.
3001     */
3002    unsigned num_rts = alpha_to_coverage_enable ?
3003       MAX2(cb->attachment_count, 1) : cb->attachment_count;
3004 
3005    bool dual_src_blend = tu_blend_state_is_dual_src(cb);
3006 
3007    tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = num_rts));
3008    tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = num_rts));
3009    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
3010                                           .unk8 = true,
3011                                           .dual_color_in_enable =
3012                                              dual_src_blend,
3013                                           .alpha_to_coverage =
3014                                              alpha_to_coverage_enable));
3015    /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
3016    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
3017                                           .independent_blend = true,
3018                                           .dual_color_in_enable =
3019                                              dual_src_blend,
3020                                           .alpha_to_coverage =
3021                                              alpha_to_coverage_enable,
3022                                           .alpha_to_one = alpha_to_one_enable,
3023                                           .sample_mask = sample_mask));
3024 
3025    for (unsigned i = 0; i < num_rts; i++) {
3026       if (cal->color_map[i] == MESA_VK_ATTACHMENT_UNUSED)
3027          continue;
3028       unsigned remapped_idx = cal->color_map[i];
3029       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
3030       if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
3031          const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
3032          const enum adreno_rb_blend_factor src_color_factor =
3033             tu6_blend_factor((VkBlendFactor)att->src_color_blend_factor);
3034          const enum adreno_rb_blend_factor dst_color_factor =
3035             tu6_blend_factor((VkBlendFactor)att->dst_color_blend_factor);
3036          const enum a3xx_rb_blend_opcode alpha_op =
3037             tu6_blend_op(att->alpha_blend_op);
3038          const enum adreno_rb_blend_factor src_alpha_factor =
3039             tu6_blend_factor((VkBlendFactor)att->src_alpha_blend_factor);
3040          const enum adreno_rb_blend_factor dst_alpha_factor =
3041             tu6_blend_factor((VkBlendFactor)att->dst_alpha_blend_factor);
3042 
3043          tu_cs_emit_regs(cs,
3044                          A6XX_RB_MRT_CONTROL(remapped_idx,
3045                                              .blend = att->blend_enable,
3046                                              .blend2 = att->blend_enable,
3047                                              .rop_enable = cb->logic_op_enable,
3048                                              .rop_code = rop,
3049                                              .component_enable = att->write_mask),
3050                          A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,
3051                                                    .rgb_src_factor = src_color_factor,
3052                                                    .rgb_blend_opcode = color_op,
3053                                                    .rgb_dest_factor = dst_color_factor,
3054                                                    .alpha_src_factor = src_alpha_factor,
3055                                                    .alpha_blend_opcode = alpha_op,
3056                                                    .alpha_dest_factor = dst_alpha_factor));
3057       } else {
3058          tu_cs_emit_regs(cs,
3059                          A6XX_RB_MRT_CONTROL(remapped_idx,),
3060                          A6XX_RB_MRT_BLEND_CONTROL(remapped_idx,));
3061       }
3062    }
3063 }
3064 
3065 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
3066    MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS,
3067 };
3068 
3069 template <chip CHIP>
3070 static unsigned
3071 tu6_blend_constants_size(struct tu_device *dev,
3072                          const struct vk_color_blend_state *cb)
3073 {
3074    return 5;
3075 }
3076 
3077 template <chip CHIP>
3078 static void
3079 tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb)
3080 {
3081    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3082    tu_cs_emit_array(cs, (const uint32_t *) cb->blend_constants, 4);
3083 }
3084 
3085 static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = {
3086    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3087    MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE,
3088    MESA_VK_DYNAMIC_RS_POLYGON_MODE,
3089    MESA_VK_DYNAMIC_RS_CULL_MODE,
3090    MESA_VK_DYNAMIC_RS_FRONT_FACE,
3091    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE,
3092    MESA_VK_DYNAMIC_RS_LINE_MODE,
3093    MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE,
3094    MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM,
3095    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
3096    MESA_VK_DYNAMIC_RS_LINE_WIDTH,
3097    MESA_VK_DYNAMIC_RS_CONSERVATIVE_MODE,
3098    MESA_VK_DYNAMIC_RS_EXTRA_PRIMITIVE_OVERESTIMATION_SIZE,
3099 };
3100 
3101 template <chip CHIP>
3102 uint32_t
3103 tu6_rast_size(struct tu_device *dev,
3104               const struct vk_rasterization_state *rs,
3105               const struct vk_viewport_state *vp,
3106               bool multiview,
3107               bool per_view_viewport)
3108 {
3109    if (CHIP == A6XX) {
3110       return 15 + (dev->physical_device->info->a6xx.has_legacy_pipeline_shading_rate ? 8 : 0);
3111    } else {
3112       return 25;
3113    }
3114 }
3115 
3116 template <chip CHIP>
3117 void
3118 tu6_emit_rast(struct tu_cs *cs,
3119               const struct vk_rasterization_state *rs,
3120               const struct vk_viewport_state *vp,
3121               bool multiview,
3122               bool per_view_viewport)
3123 {
3124    enum a5xx_line_mode line_mode =
3125       rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR ?
3126       BRESENHAM : RECTANGULAR;
3127    tu_cs_emit_regs(cs,
3128                    A6XX_GRAS_SU_CNTL(
3129                      .cull_front = rs->cull_mode & VK_CULL_MODE_FRONT_BIT,
3130                      .cull_back = rs->cull_mode & VK_CULL_MODE_BACK_BIT,
3131                      .front_cw = rs->front_face == VK_FRONT_FACE_CLOCKWISE,
3132                      .linehalfwidth = rs->line.width / 2.0f,
3133                      .poly_offset = rs->depth_bias.enable,
3134                      .line_mode = line_mode,
3135                      .multiview_enable = multiview,
3136                      .rendertargetindexincr = multiview,
3137                      .viewportindexincr = multiview && per_view_viewport));
3138 
3139    bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
3140 
3141    tu_cs_emit_regs(cs,
3142                    A6XX_GRAS_CL_CNTL(
3143                      .znear_clip_disable = !depth_clip_enable,
3144                      .zfar_clip_disable = !depth_clip_enable,
3145                      /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3146                      .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3147                      .zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
3148                      .vp_clip_code_ignore = 1));
3149 
3150    enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
3151 
3152    tu_cs_emit_regs(cs,
3153                    A6XX_VPC_POLYGON_MODE(polygon_mode));
3154 
3155    tu_cs_emit_regs(cs,
3156                    PC_POLYGON_MODE(CHIP, polygon_mode));
3157 
3158    if (CHIP == A7XX) {
3159       tu_cs_emit_regs(cs,
3160                      A7XX_VPC_POLYGON_MODE2(polygon_mode));
3161    }
3162 
3163    tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP,
3164       .stream = rs->rasterization_stream,
3165       .discard = rs->rasterizer_discard_enable));
3166    if (CHIP == A6XX) {
3167       tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107(
3168          .raster_discard = rs->rasterizer_discard_enable));
3169    } else {
3170       tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2(
3171          .stream = rs->rasterization_stream,
3172          .discard = rs->rasterizer_discard_enable));
3173 
3174       bool conservative_ras_en =
3175          rs->conservative_mode ==
3176          VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT;
3177 
3178       tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP,
3179             .raster_mode = TYPE_TILED,
3180             .raster_direction = LR_TB,
3181             .conservativerasen = conservative_ras_en));
3182       tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
3183       tu_cs_emit_regs(cs,
3184                       A6XX_PC_DGEN_SU_CONSERVATIVE_RAS_CNTL(conservative_ras_en));
3185 
3186       /* shift_amount selects between normal and conservative rasterization:
3187        * - shift_amount = 0 (NO_SHIFT) - normal rasterization
3188        * - shift_amount = 1 (HALF_PIXEL_SHIFT) - overestimate by half a pixel
3189        *   plus the rasterization grid size (1/256)
3190        * - shift_amount = 2 (FULL_PIXEL_SHIFT) - overestimate by another half
3191        *   a pixel
3192        *
3193        * We expose a max of 0.5 and a granularity of 0.5, so the app should
3194        * only give us 0 or 0.5 which correspond to HALF_PIXEL_SHIFT and
3195        * FULL_PIXEL_SHIFT respectively. If they give us anything else just
3196        * assume they meant 0.5 as the most conservative choice.
3197        */
3198       enum a6xx_shift_amount shift_amount = conservative_ras_en ?
3199          (rs->extra_primitive_overestimation_size != 0. ?
3200             FULL_PIXEL_SHIFT : HALF_PIXEL_SHIFT) : NO_SHIFT;
3201       tu_cs_emit_regs(cs, A6XX_GRAS_SU_CONSERVATIVE_RAS_CNTL(
3202             .conservativerasen = conservative_ras_en,
3203             .shiftamount = shift_amount));
3204    }
3205 
3206    /* move to hw ctx init? */
3207    tu_cs_emit_regs(cs,
3208                    A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
3209                    A6XX_GRAS_SU_POINT_SIZE(1.0f));
3210 
3211    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_legacy_pipeline_shading_rate) {
3212       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A00());
3213       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A10());
3214       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A20());
3215       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A30());
3216    }
3217 }
3218 
3219 static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = {
3220    MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE,
3221    MESA_VK_DYNAMIC_DS_STENCIL_OP,
3222    MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK,
3223    MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK,
3224    MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE,
3225    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS,
3226 };
3227 
3228 template <chip CHIP>
3229 static unsigned
3230 tu6_ds_size(struct tu_device *dev,
3231                  const struct vk_depth_stencil_state *ds,
3232                  const struct vk_render_pass_state *rp)
3233 {
3234    return 13;
3235 }
3236 
3237 template <chip CHIP>
3238 static void
3239 tu6_emit_ds(struct tu_cs *cs,
3240             const struct vk_depth_stencil_state *ds,
3241             const struct vk_render_pass_state *rp)
3242 {
3243    bool stencil_test_enable =
3244       ds->stencil.test_enable && rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
3245    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3246       .stencil_enable = stencil_test_enable,
3247       .stencil_enable_bf = stencil_test_enable,
3248       .stencil_read = stencil_test_enable,
3249       .func = tu6_compare_func((VkCompareOp)ds->stencil.front.op.compare),
3250       .fail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.fail),
3251       .zpass = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.pass),
3252       .zfail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.depth_fail),
3253       .func_bf = tu6_compare_func((VkCompareOp)ds->stencil.back.op.compare),
3254       .fail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.fail),
3255       .zpass_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.pass),
3256       .zfail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.depth_fail)));
3257    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(stencil_test_enable));
3258 
3259    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(
3260       .mask = ds->stencil.front.compare_mask,
3261       .bfmask = ds->stencil.back.compare_mask));
3262 
3263    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(
3264       .wrmask = ds->stencil.front.write_mask,
3265       .bfwrmask = ds->stencil.back.write_mask));
3266 
3267    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(
3268       .ref = ds->stencil.front.reference,
3269       .bfref = ds->stencil.back.reference));
3270 
3271    tu_cs_emit_regs(cs,
3272                    A6XX_RB_Z_BOUNDS_MIN(ds->depth.bounds_test.min),
3273                    A6XX_RB_Z_BOUNDS_MAX(ds->depth.bounds_test.max));
3274 }
3275 
3276 static const enum mesa_vk_dynamic_graphics_state tu_rb_depth_cntl_state[] = {
3277    MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE,
3278    MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE,
3279    MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP,
3280    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE,
3281    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3282 };
3283 
3284 template <chip CHIP>
3285 static unsigned
3286 tu6_rb_depth_cntl_size(struct tu_device *dev,
3287                        const struct vk_depth_stencil_state *ds,
3288                        const struct vk_render_pass_state *rp,
3289                        const struct vk_rasterization_state *rs)
3290 {
3291    return 4;
3292 }
3293 
3294 template <chip CHIP>
3295 static void
3296 tu6_emit_rb_depth_cntl(struct tu_cs *cs,
3297                        const struct vk_depth_stencil_state *ds,
3298                        const struct vk_render_pass_state *rp,
3299                        const struct vk_rasterization_state *rs)
3300 {
3301    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
3302       bool depth_test = ds->depth.test_enable;
3303       enum adreno_compare_func zfunc = tu6_compare_func(ds->depth.compare_op);
3304 
3305       /* On some GPUs it is necessary to enable z test for depth bounds test
3306        * when UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is
3307        * required to pass z test. Relevant tests:
3308        *  dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
3309        *  dEQP-VK.dynamic_state.ds_state.depth_bounds_1
3310        */
3311       if (ds->depth.bounds_test.enable &&
3312           !ds->depth.test_enable &&
3313           cs->device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk) {
3314          depth_test = true;
3315          zfunc = FUNC_ALWAYS;
3316       }
3317 
3318       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3319          .z_test_enable = depth_test,
3320          .z_write_enable = ds->depth.test_enable && ds->depth.write_enable,
3321          .zfunc = zfunc,
3322          /* To support VK_EXT_depth_clamp_zero_one on a7xx+ */
3323          .z_clamp_enable = rs->depth_clamp_enable || CHIP >= A7XX,
3324          /* TODO don't set for ALWAYS/NEVER */
3325          .z_read_enable = ds->depth.test_enable || ds->depth.bounds_test.enable,
3326          .z_bounds_enable = ds->depth.bounds_test.enable));
3327       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(depth_test));
3328    } else {
3329       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
3330       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
3331    }
3332 }
3333 
3334 static const enum mesa_vk_dynamic_graphics_state tu_prim_mode_sysmem_state[] = {
3335    MESA_VK_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE,
3336 };
3337 
3338 template <chip CHIP>
3339 static unsigned
3340 tu6_prim_mode_sysmem_size(struct tu_device *dev,
3341                           struct tu_shader *fs,
3342                           bool raster_order_attachment_access,
3343                           VkImageAspectFlags feedback_loops,
3344                           bool *sysmem_single_prim_mode)
3345 {
3346    return 2;
3347 }
3348 
3349 template <chip CHIP>
3350 static void
3351 tu6_emit_prim_mode_sysmem(struct tu_cs *cs,
3352                           struct tu_shader *fs,
3353                           bool raster_order_attachment_access,
3354                           VkImageAspectFlags feedback_loops,
3355                           bool *sysmem_single_prim_mode)
3356 {
3357    /* VK_EXT_rasterization_order_attachment_access:
3358     *
3359     * This extension allows access to framebuffer attachments when used as both
3360     * input and color attachments from one fragment to the next, in
3361     * rasterization order, without explicit synchronization.
3362     */
3363    raster_order_attachment_access |= TU_DEBUG(RAST_ORDER);
3364 
3365    /* If there is a feedback loop, then the shader can read the previous value
3366     * of a pixel being written out. It can also write some components and then
3367     * read different components without a barrier in between. This is a
3368     * problem in sysmem mode with UBWC, because the main buffer and flags
3369     * buffer can get out-of-sync if only one is flushed. We fix this by
3370     * setting the SINGLE_PRIM_MODE field to the same value that the blob does
3371     * for advanced_blend in sysmem mode if a feedback loop is detected.
3372     */
3373    enum a6xx_single_prim_mode sysmem_prim_mode =
3374       (raster_order_attachment_access || feedback_loops ||
3375        fs->fs.dynamic_input_attachments_used) ?
3376       FLUSH_PER_OVERLAP_AND_OVERWRITE : NO_FLUSH;
3377 
3378    if (sysmem_prim_mode == FLUSH_PER_OVERLAP_AND_OVERWRITE)
3379       *sysmem_single_prim_mode = true;
3380 
3381    tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2,
3382                                          .single_prim_mode = sysmem_prim_mode));
3383 }
3384 
3385 static const enum mesa_vk_dynamic_graphics_state tu_fragment_shading_rate_state[] = {
3386    MESA_VK_DYNAMIC_FSR,
3387 };
3388 
3389 template <chip CHIP>
3390 static unsigned
3391 tu6_fragment_shading_rate_size(struct tu_device *dev,
3392                                const vk_fragment_shading_rate_state *fsr,
3393                                bool enable_att_fsr,
3394                                bool enable_prim_fsr,
3395                                bool fs_reads_fsr,
3396                                bool sample_shading)
3397 {
3398    return 6;
3399 }
3400 
3401 template <chip CHIP>
3402 static void
3403 tu6_emit_fragment_shading_rate(struct tu_cs *cs,
3404                                const vk_fragment_shading_rate_state *fsr,
3405                                bool enable_att_fsr,
3406                                bool enable_prim_fsr,
3407                                bool fs_reads_fsr,
3408                                bool accesses_smask)
3409 {
3410    /* gl_ShadingRateEXT doesn't read back a 1x1 value with a null config, so
3411     * if it is read we have to emit the config.
3412     */
3413    if (!fsr || (!fs_reads_fsr && vk_fragment_shading_rate_is_disabled(fsr))) {
3414       tu_cs_emit_regs(cs, A6XX_RB_FSR_CONFIG());
3415       tu_cs_emit_regs(cs, A7XX_SP_FSR_CONFIG());
3416       tu_cs_emit_regs(cs, A7XX_GRAS_FSR_CONFIG());
3417       return;
3418    }
3419 
3420    uint32_t frag_width = fsr->fragment_size.width;
3421    uint32_t frag_height = fsr->fragment_size.height;
3422 
3423    bool enable_draw_fsr = true;
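        /* Per the Vulkan spec, combiner_ops[0] combines the pipeline (draw)
         * rate with the primitive rate, and combiner_ops[1] combines that
         * result with the attachment rate. KEEP selects the first operand and
         * REPLACE the second, so a rate source can be dropped entirely when
         * the combiner makes it irrelevant.
         */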
3424    if (enable_att_fsr) {
3425       if (fsr->combiner_ops[1] ==
3426           VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
3427          enable_draw_fsr = false;
3428          enable_prim_fsr = false;
3429       } else if (fsr->combiner_ops[1] ==
3430                  VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
3431          enable_att_fsr = false;
3432       }
3433    }
3434    if (enable_prim_fsr) {
3435       if (fsr->combiner_ops[0] ==
3436           VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR) {
3437          enable_draw_fsr = false;
3438       } else if (fsr->combiner_ops[0] ==
3439                  VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR) {
3440          enable_prim_fsr = false;
3441       }
3442    }
3443 
3444    /* Force 1x1 FSR because we don't support
3445     * fragmentShadingRateWithShaderSampleMask.
3446     */
3447    if (accesses_smask) {
3448       enable_att_fsr = enable_prim_fsr = false;
3449       frag_width = frag_height = 1;
3450       enable_draw_fsr = true;
3451    }
3452 
3453    tu_cs_emit_regs(
3454       cs,
3455       A6XX_RB_FSR_CONFIG(.unk2 = true, .pipeline_fsr_enable = enable_draw_fsr,
3456                          .attachment_fsr_enable = enable_att_fsr,
3457                          .primitive_fsr_enable = enable_prim_fsr));
3458    tu_cs_emit_regs(
3459       cs, A7XX_SP_FSR_CONFIG(.pipeline_fsr_enable = enable_draw_fsr,
3460                              .attachment_fsr_enable = enable_att_fsr,
3461                              .primitive_fsr_enable = enable_prim_fsr));
3462    tu_cs_emit_regs(
3463       cs, A7XX_GRAS_FSR_CONFIG(
3464                 .pipeline_fsr_enable = enable_draw_fsr,
3465                 .frag_size_x = util_logbase2(frag_width),
3466                 .frag_size_y = util_logbase2(frag_height),
3467                 .combiner_op_1 = (a6xx_fsr_combiner) fsr->combiner_ops[0],
3468                 .combiner_op_2 = (a6xx_fsr_combiner) fsr->combiner_ops[1],
3469                 .attachment_fsr_enable = enable_att_fsr,
3470                 .primitive_fsr_enable = enable_prim_fsr));
3471 }
3472 
3473 
3474 static inline bool
3475 emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
3476                     BITSET_WORD *pipeline_set,
3477                     const enum mesa_vk_dynamic_graphics_state *state_array,
3478                     unsigned num_states, bool extra_cond,
3479                     struct tu_pipeline_builder *builder)
3480 {
3481    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3482 
3483    /* Unrolling this loop should produce a constant value once the function is
3484     * inlined, because state_array and num_states are a per-draw-state
3485     * constant, but GCC seems to need a little encouragement. clang does a
3486     * little better but still needs a pragma when there are a large number of
3487     * states.
3488     */
3489 #if defined(__clang__)
3490 #pragma clang loop unroll(full)
3491 #elif defined(__GNUC__) && __GNUC__ >= 8
3492 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3493 #endif
3494    for (unsigned i = 0; i < num_states; i++) {
3495       BITSET_SET(state, state_array[i]);
3496    }
3497 
3498    /* If all of the state is set, then after we emit it we can tentatively
3499     * remove it from the states to set for the pipeline by making it dynamic.
3500     * If we can't emit it, though, we need to keep around the partial state so
3501     * that we can emit it later, even if another draw state consumes it. That
3502     * is, we have to cancel any tentative removal.
3503     */
3504    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3505    memcpy(temp, pipeline_set, sizeof(temp));
3506    BITSET_AND(temp, temp, state);
3507    if (!BITSET_EQUAL(temp, state) || !extra_cond) {
3508       __bitset_or(keep, keep, temp, ARRAY_SIZE(temp));
3509       return false;
3510    }
3511    __bitset_or(remove, remove, state, ARRAY_SIZE(state));
3512    return true;
3513 }
3514 
3515 template <chip CHIP>
3516 static void
3517 tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
3518                                struct tu_pipeline *pipeline)
3519 {
3520    struct tu_cs cs;
3521    BITSET_DECLARE(keep, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3522    BITSET_DECLARE(remove, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3523    BITSET_DECLARE(pipeline_set, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3524 
3525    vk_graphics_pipeline_get_state(&builder->graphics_state, pipeline_set);
3526 
3527 #define EMIT_STATE(name, extra_cond)                                          \
3528    emit_pipeline_state(keep, remove, pipeline_set, tu_##name##_state,         \
3529                        ARRAY_SIZE(tu_##name##_state), extra_cond, builder)
3530 
3531 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3532    if (EMIT_STATE(name, extra_cond)) {                                        \
3533       unsigned size = tu6_##name##_size<CHIP>(builder->device, __VA_ARGS__);  \
3534       if (size > 0) {                                                         \
3535          tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);                    \
3536          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3537          pipeline->dynamic_state[id] =                                        \
3538             tu_cs_end_draw_state(&pipeline->cs, &cs);                         \
3539       }                                                                       \
3540       pipeline->set_state_mask |= (1u << id);                                 \
3541    }
3542 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, true, __VA_ARGS__)
3543 
3544    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3545               builder->graphics_state.vi);
3546    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3547               builder->graphics_state.vi);
3548    /* If (a) per-view viewport is used or (b) we don't know yet, then we need
3549     * to set viewport and scissor state dynamically.
3550     */
3551    bool no_per_view_viewport = pipeline_contains_all_shader_state(pipeline) &&
3552       !pipeline->program.per_view_viewport;
3553    DRAW_STATE_COND(viewport, TU_DYNAMIC_STATE_VIEWPORT, no_per_view_viewport,
3554                    builder->graphics_state.vp,
3555                    builder->graphics_state.rs);
3556    DRAW_STATE_COND(scissor, TU_DYNAMIC_STATE_SCISSOR, no_per_view_viewport,
3557               builder->graphics_state.vp);
3558    DRAW_STATE(sample_locations,
3559               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3560               builder->graphics_state.ms->sample_locations_enable,
3561               builder->graphics_state.ms->sample_locations);
3562    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3563               builder->graphics_state.rs);
3564    bool attachments_valid =
3565       builder->graphics_state.rp &&
3566       vk_render_pass_state_has_attachment_info(builder->graphics_state.rp);
3567    struct vk_color_blend_state dummy_cb = {};
3568    const struct vk_color_blend_state *cb = builder->graphics_state.cb;
3569    if (attachments_valid &&
3570        !(builder->graphics_state.rp->attachments &
3571          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3572       /* If there are no color attachments, then the original blend state may
3573        * be NULL and the common code sanitizes it to always be NULL. In this
3574        * case we want to emit an empty blend/bandwidth/etc.  rather than
3575        * letting it be dynamic (and potentially garbage).
3576        */
3577       cb = &dummy_cb;
3578       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3579       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3580       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3581       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3582       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3583       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3584       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3585       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3586    }
3587    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND, cb,
3588               builder->graphics_state.cal,
3589               builder->graphics_state.ms->alpha_to_coverage_enable,
3590               builder->graphics_state.ms->alpha_to_one_enable,
3591               builder->graphics_state.ms->sample_mask);
3592    if (EMIT_STATE(blend_lrz, attachments_valid))
3593       tu_emit_blend_lrz(&pipeline->lrz_blend, cb,
3594                         builder->graphics_state.rp);
3595    if (EMIT_STATE(bandwidth, attachments_valid))
3596       tu_calc_bandwidth(&pipeline->bandwidth, cb,
3597                         builder->graphics_state.rp);
3598    DRAW_STATE(blend_constants, TU_DYNAMIC_STATE_BLEND_CONSTANTS, cb);
3599 
3600    if (attachments_valid &&
3601        !(builder->graphics_state.rp->attachments &
3602          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3603       /* Don't actually make anything dynamic as that may mean a partially-set
3604        * state group where the group is NULL which angers common code.
3605        */
3606       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3607       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3608       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3609       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3610       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3611       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3612       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3613       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3614    }
3615    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3616                    pipeline_contains_all_shader_state(pipeline),
3617                    builder->graphics_state.rs,
3618                    builder->graphics_state.vp,
3619                    builder->graphics_state.rp->view_mask != 0,
3620                    pipeline->program.per_view_viewport);
3621    DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3622               attachments_valid,
3623               builder->graphics_state.ds,
3624               builder->graphics_state.rp);
3625    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3626                    attachments_valid,
3627                    builder->graphics_state.ds,
3628                    builder->graphics_state.rp,
3629                    builder->graphics_state.rs);
3630    DRAW_STATE_COND(patch_control_points,
3631                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3632                    pipeline_contains_all_shader_state(pipeline),
3633                    pipeline->shaders[MESA_SHADER_VERTEX],
3634                    pipeline->shaders[MESA_SHADER_TESS_CTRL],
3635                    pipeline->shaders[MESA_SHADER_TESS_EVAL],
3636                    &pipeline->program,
3637                    builder->graphics_state.ts->patch_control_points);
3638    bool has_raster_order_state = false;
3639    if (pipeline->type == TU_PIPELINE_GRAPHICS) {
3640       has_raster_order_state = true;
3641    } else {
3642       struct tu_graphics_lib_pipeline *lib =
3643          tu_pipeline_to_graphics_lib(pipeline);
3644       has_raster_order_state =
3645          (lib->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
3646          (lib->state &
3647           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT);
3648    }
3649    if (!builder->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3650       DRAW_STATE_COND(prim_mode_sysmem,
3651                       TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3652                       has_raster_order_state,
3653                       pipeline->shaders[MESA_SHADER_FRAGMENT],
3654                       pipeline->output.raster_order_attachment_access ||
3655                       pipeline->ds.raster_order_attachment_access,
3656                       vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags),
3657                       &pipeline->prim_order.sysmem_single_prim_mode);
3658    }
3659 
3660    if (builder->device->physical_device->info->a6xx.has_attachment_shading_rate) {
3661       bool has_fsr_att =
3662          builder->graphics_state.pipeline_flags &
3663          VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
3664       DRAW_STATE_COND(fragment_shading_rate,
3665                       TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
3666                       attachments_valid && pipeline_contains_all_shader_state(pipeline),
3667                       builder->graphics_state.fsr,
3668                       has_fsr_att,
3669                       pipeline->program.writes_shading_rate,
3670                       pipeline->program.reads_shading_rate,
3671                       pipeline->program.accesses_smask);
3672    }
3673 #undef DRAW_STATE
3674 #undef DRAW_STATE_COND
3675 #undef EMIT_STATE
3676 
3677    /* LRZ always needs depth/stencil state at draw time */
3678    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
3679    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
3680    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
3681    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP);
3682    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
3683    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_OP);
3684    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
3685    BITSET_SET(keep, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
3686 
3687    /* MSAA needs line mode */
3688    BITSET_SET(keep, MESA_VK_DYNAMIC_RS_LINE_MODE);
3689 
3690    /* The patch control points is part of the draw */
3691    BITSET_SET(keep, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS);
3692 
3693    /* Vertex buffer state needs to know the max valid binding */
3694    BITSET_SET(keep, MESA_VK_DYNAMIC_VI_BINDINGS_VALID);
3695 
3696    /* Remove state which has been emitted and we no longer need to set when
3697     * binding the pipeline by making it "dynamic".
3698     */
3699    BITSET_ANDNOT(remove, remove, keep);
3700 
3701    BITSET_OR(pipeline->static_state_mask, pipeline->static_state_mask, remove);
3702 
3703    BITSET_OR(builder->graphics_state.dynamic, builder->graphics_state.dynamic,
3704              remove);
3705 }
3706 
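     /* Returns true if any of the dynamic states in state_array is dirty in
      * the command buffer, i.e. the corresponding draw state must be
      * re-emitted.
      */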
3707 static inline bool
3708 emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state,
3709                 const enum mesa_vk_dynamic_graphics_state *state_array,
3710                 unsigned num_states)
3711 {
3712    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3713 
3714    /* Unrolling this loop should produce a constant value once the function is
3715     * inlined, because state_array and num_states are a per-draw-state
3716     * constant, but GCC seems to need a little encouragement. clang does a
3717     * little better but still needs a pragma when there are a large number of
3718     * states.
3719     */
3720 #if defined(__clang__)
3721 #pragma clang loop unroll(full)
3722 #elif defined(__GNUC__) && __GNUC__ >= 8
3723 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3724 #endif
3725    for (unsigned i = 0; i < num_states; i++) {
3726       BITSET_SET(state, state_array[i]);
3727    }
3728 
3729    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3730    BITSET_AND(temp, state, dynamic_state->dirty);
3731    return !BITSET_IS_EMPTY(temp);
3732 }
3733 
3734 template <chip CHIP>
3735 uint32_t
3736 tu_emit_draw_state(struct tu_cmd_buffer *cmd)
3737 {
3738    struct tu_cs cs;
3739    uint32_t dirty_draw_states = 0;
3740 
3741 #define EMIT_STATE(name)                                                      \
3742    emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state,        \
3743                    ARRAY_SIZE(tu_##name##_state))
3744 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3745    if ((EMIT_STATE(name) || (extra_cond)) &&                                  \
3746        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3747       unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);      \
3748       if (size > 0) {                                                         \
3749          tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                     \
3750          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3751          cmd->state.dynamic_state[id] =                                       \
3752             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3753       } else {                                                                \
3754          cmd->state.dynamic_state[id] = {};                                   \
3755       }                                                                       \
3756       dirty_draw_states |= (1u << id);                                        \
3757    }
3758 #define DRAW_STATE_FDM(name, id, ...)                                         \
3759    if ((EMIT_STATE(name) || (cmd->state.dirty & TU_CMD_DIRTY_FDM)) &&         \
3760        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3761       if (cmd->state.shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm) {             \
3762          tu_cs_set_writeable(&cmd->sub_cs, true);                             \
3763          tu6_emit_##name##_fdm(&cs, cmd, __VA_ARGS__);                        \
3764          cmd->state.dynamic_state[id] =                                       \
3765             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3766          tu_cs_set_writeable(&cmd->sub_cs, false);                            \
3767       } else {                                                                \
3768          unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);   \
3769          if (size > 0) {                                                      \
3770             tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                  \
3771             tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                          \
3772             cmd->state.dynamic_state[id] =                                    \
3773                tu_cs_end_draw_state(&cmd->sub_cs, &cs);                       \
3774          } else {                                                             \
3775             cmd->state.dynamic_state[id] = {};                                \
3776          }                                                                    \
3783       }                                                                       \
3784       dirty_draw_states |= (1u << id);                                        \
3785    }
3786 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, false, __VA_ARGS__)
3787 
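   /* Illustrative expansion (not actual driver code): a use such as
    * DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS, &rs) roughly becomes
    *
    *    if (emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_depth_bias_state,
    *                        ARRAY_SIZE(tu_depth_bias_state)) &&
    *        !(cmd->state.pipeline_draw_states &
    *          (1u << TU_DYNAMIC_STATE_DEPTH_BIAS))) {
    *       unsigned size = tu6_depth_bias_size<CHIP>(cmd->device, &rs);
    *       if (size > 0) {
    *          tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);
    *          tu6_emit_depth_bias<CHIP>(&cs, &rs);
    *          cmd->state.dynamic_state[TU_DYNAMIC_STATE_DEPTH_BIAS] =
    *             tu_cs_end_draw_state(&cmd->sub_cs, &cs);
    *       } else {
    *          cmd->state.dynamic_state[TU_DYNAMIC_STATE_DEPTH_BIAS] = {};
    *       }
    *       dirty_draw_states |= (1u << TU_DYNAMIC_STATE_DEPTH_BIAS);
    *    }
    *
    * where &rs stands in for the actual argument list.
    */
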
3788    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3789               cmd->vk.dynamic_graphics_state.vi);
3790 
3791    /* Vertex input stride is special because it's part of the vertex input in
3792     * the pipeline but a separate array when it's dynamic state, so we have to
3793     * use two separate functions.
3794     */
3795 #define tu6_emit_vertex_stride tu6_emit_vertex_stride_dyn
3796 #define tu6_vertex_stride_size tu6_vertex_stride_size_dyn
3797 
3798    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3799               cmd->vk.dynamic_graphics_state.vi_binding_strides,
3800               cmd->vk.dynamic_graphics_state.vi_bindings_valid);
3801 
3802 #undef tu6_emit_vertex_stride
3803 #undef tu6_vertex_stride_size
3804 
3805    DRAW_STATE_FDM(viewport, TU_DYNAMIC_STATE_VIEWPORT,
3806                   &cmd->vk.dynamic_graphics_state.vp,
3807                   &cmd->vk.dynamic_graphics_state.rs);
3808    DRAW_STATE_FDM(scissor, TU_DYNAMIC_STATE_SCISSOR,
3809                   &cmd->vk.dynamic_graphics_state.vp);
3810    DRAW_STATE(sample_locations,
3811               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3812               cmd->vk.dynamic_graphics_state.ms.sample_locations_enable,
3813               cmd->vk.dynamic_graphics_state.ms.sample_locations);
3814    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3815               &cmd->vk.dynamic_graphics_state.rs);
3816    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND,
3817               &cmd->vk.dynamic_graphics_state.cb,
3818               &cmd->vk.dynamic_graphics_state.cal,
3819               cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
3820               cmd->vk.dynamic_graphics_state.ms.alpha_to_one_enable,
3821               cmd->vk.dynamic_graphics_state.ms.sample_mask);
3822    if (!cmd->state.pipeline_blend_lrz &&
3823        (EMIT_STATE(blend_lrz) || (cmd->state.dirty & TU_CMD_DIRTY_SUBPASS))) {
3824       bool blend_reads_dest = tu6_calc_blend_lrz(&cmd->vk.dynamic_graphics_state.cb,
3825                                                  &cmd->state.vk_rp);
3826       if (blend_reads_dest != cmd->state.blend_reads_dest) {
3827          cmd->state.blend_reads_dest = blend_reads_dest;
3828          cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
3829       }
3830    }
3831    if (!cmd->state.pipeline_bandwidth &&
3832        (EMIT_STATE(bandwidth) || (cmd->state.dirty & TU_CMD_DIRTY_SUBPASS)))
3833       tu_calc_bandwidth(&cmd->state.bandwidth, &cmd->vk.dynamic_graphics_state.cb,
3834                         &cmd->state.vk_rp);
3835    DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3836               &cmd->vk.dynamic_graphics_state.cb);
3837 
3838    if (cmd->device->physical_device->info->a6xx.has_attachment_shading_rate) {
3839       DRAW_STATE_COND(fragment_shading_rate,
3840                TU_DYNAMIC_STATE_A7XX_FRAGMENT_SHADING_RATE,
3841                cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS | TU_CMD_DIRTY_SHADING_RATE),
3842                &cmd->vk.dynamic_graphics_state.fsr,
3843                cmd->state.subpass->fsr_attachment != VK_ATTACHMENT_UNUSED,
3844                cmd->state.program.writes_shading_rate,
3845                cmd->state.program.reads_shading_rate,
3846                cmd->state.program.accesses_smask);
3847    }
3848    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3849                    cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS |
3850                                        TU_CMD_DIRTY_PER_VIEW_VIEWPORT),
3851                    &cmd->vk.dynamic_graphics_state.rs,
3852                    &cmd->vk.dynamic_graphics_state.vp,
3853                    cmd->state.vk_rp.view_mask != 0,
3854                    cmd->state.per_view_viewport);
3855    DRAW_STATE_COND(ds, TU_DYNAMIC_STATE_DS,
3856               cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3857               &cmd->vk.dynamic_graphics_state.ds,
3858               &cmd->state.vk_rp);
3859    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3860                    cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3861                    &cmd->vk.dynamic_graphics_state.ds,
3862                    &cmd->state.vk_rp,
3863                    &cmd->vk.dynamic_graphics_state.rs);
3864    DRAW_STATE_COND(patch_control_points,
3865                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3866                    cmd->state.dirty & TU_CMD_DIRTY_PROGRAM,
3867                    cmd->state.shaders[MESA_SHADER_VERTEX],
3868                    cmd->state.shaders[MESA_SHADER_TESS_CTRL],
3869                    cmd->state.shaders[MESA_SHADER_TESS_EVAL],
3870                    &cmd->state.program,
3871                    cmd->vk.dynamic_graphics_state.ts.patch_control_points);
3872    if (!cmd->device->physical_device->info->a6xx.has_coherent_ubwc_flag_caches) {
3873       DRAW_STATE_COND(prim_mode_sysmem,
3874                       TU_DYNAMIC_STATE_PRIM_MODE_SYSMEM,
3875                       cmd->state.dirty & (TU_CMD_DIRTY_RAST_ORDER |
3876                                           TU_CMD_DIRTY_FEEDBACK_LOOPS |
3877                                           TU_CMD_DIRTY_FS),
3878                       cmd->state.shaders[MESA_SHADER_FRAGMENT],
3879                       cmd->state.raster_order_attachment_access,
3880                       cmd->vk.dynamic_graphics_state.feedback_loops |
3881                       cmd->state.pipeline_feedback_loops,
3882                       &cmd->state.rp.sysmem_single_prim_mode);
3883    }
3884 #undef DRAW_STATE
3885 #undef DRAW_STATE_COND
3886 #undef EMIT_STATE
3887 
3888    return dirty_draw_states;
3889 }
3890 TU_GENX(tu_emit_draw_state);
3891 
3892 static void
3893 tu_pipeline_builder_parse_depth_stencil(
3894    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3895 {
3896    const VkPipelineDepthStencilStateCreateInfo *ds_info =
3897       builder->create_info->pDepthStencilState;
3898 
3899    if ((builder->graphics_state.rp->attachments ==
3900         MESA_VK_RP_ATTACHMENT_INFO_INVALID) ||
3901        (builder->graphics_state.rp->attachments &
3902         MESA_VK_RP_ATTACHMENT_DEPTH_BIT)) {
3903       pipeline->ds.raster_order_attachment_access =
3904          ds_info && (ds_info->flags &
3905          (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
3906           VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT));
3907    }
3908 }
3909 
3910 static void
3911 tu_pipeline_builder_parse_multisample_and_color_blend(
3912    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3913 {
3914    /* The spec says:
3915     *
3916     *    pMultisampleState is a pointer to an instance of the
3917     *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
3918     *    has rasterization disabled.
3919     *
3920     * Also,
3921     *
3922     *    pColorBlendState is a pointer to an instance of the
3923     *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
3924     *    pipeline has rasterization disabled or if the subpass of the render
3925     *    pass the pipeline is created against does not use any color
3926     *    attachments.
3927     *
3928     * We leave the relevant registers stale when rasterization is disabled.
3929     */
3930    if (builder->rasterizer_discard) {
3931       return;
3932    }
3933 
3934    static const VkPipelineColorBlendStateCreateInfo dummy_blend_info = {};
3935 
3936    const VkPipelineColorBlendStateCreateInfo *blend_info =
3937       (builder->graphics_state.rp->attachments &
3938        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)
3939       ? builder->create_info->pColorBlendState
3940       : &dummy_blend_info;
3941 
3942    if (builder->graphics_state.rp->attachments &
3943        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS) {
3944       pipeline->output.raster_order_attachment_access =
3945          blend_info && (blend_info->flags &
3946             VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT);
3947    }
3948 }
3949 
3950 static void
3951 tu_pipeline_builder_parse_rasterization_order(
3952    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3953 {
3954    if (builder->rasterizer_discard)
3955       return;
3956 
3957    bool raster_order_attachment_access =
3958       pipeline->output.raster_order_attachment_access ||
3959       pipeline->ds.raster_order_attachment_access ||
3960       TU_DEBUG(RAST_ORDER);
3961 
3962    /* VK_EXT_blend_operation_advanced would also require ordered access
3963     * if and when it is implemented in the future.
3964     */
3965 
3966    enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
3967 
3968    if (raster_order_attachment_access) {
3969       /* VK_EXT_rasterization_order_attachment_access:
3970        *
3971        * This extension allows access to framebuffer attachments when used as
3972        * both input and color attachments from one fragment to the next,
3973        * in rasterization order, without explicit synchronization.
3974        */
3975       gmem_prim_mode = FLUSH_PER_OVERLAP;
3976    }
3977 
3978    struct tu_cs cs;
3979 
3980    pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3981    tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3982                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3983                         A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
3984 }
3985 
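/* Illustrative app-side sketch (not driver code): the FLUSH_PER_OVERLAP path
 * above is taken when the application opts into
 * VK_EXT_rasterization_order_attachment_access on the pipeline, e.g.
 *
 *    VkPipelineColorBlendStateCreateInfo cb = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
 *       .flags =
 *          VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT,
 *       ...
 *    };
 *
 * (or the equivalent depth/stencil flags), which sets the
 * raster_order_attachment_access bits consumed here and makes GMEM rendering
 * flush per overlapping primitive.
 */
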
3986 static void
3987 tu_pipeline_finish(struct tu_pipeline *pipeline,
3988                    struct tu_device *dev,
3989                    const VkAllocationCallbacks *alloc)
3990 {
3991    tu_cs_finish(&pipeline->cs);
3992    TU_RMV(resource_destroy, dev, &pipeline->bo);
3993 
3994    mtx_lock(&dev->pipeline_mutex);
3995    tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
3996    mtx_unlock(&dev->pipeline_mutex);
3997 
3998    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
3999       struct tu_graphics_lib_pipeline *library =
4000          tu_pipeline_to_graphics_lib(pipeline);
4001 
4002       if (library->nir_shaders)
4003          vk_pipeline_cache_object_unref(&dev->vk,
4004                                         &library->nir_shaders->base);
4005 
4006       for (unsigned i = 0; i < library->num_sets; i++) {
4007          if (library->layouts[i])
4008             vk_descriptor_set_layout_unref(&dev->vk, &library->layouts[i]->vk);
4009       }
4010 
4011       vk_free2(&dev->vk.alloc, alloc, library->state_data);
4012    }
4013 
4014    for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
4015       if (pipeline->shaders[i])
4016          vk_pipeline_cache_object_unref(&dev->vk,
4017                                         &pipeline->shaders[i]->base);
4018    }
4019 
4020    ralloc_free(pipeline->executables_mem_ctx);
4021 }
4022 
4023 static VkGraphicsPipelineLibraryFlagBitsEXT
4024 vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage)
4025 {
4026    assert(util_bitcount(stage) == 1);
4027    switch (stage) {
4028    case VK_SHADER_STAGE_VERTEX_BIT:
4029    case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
4030    case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
4031    case VK_SHADER_STAGE_GEOMETRY_BIT:
4032    case VK_SHADER_STAGE_TASK_BIT_EXT:
4033    case VK_SHADER_STAGE_MESH_BIT_EXT:
4034       return VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT;
4035    case VK_SHADER_STAGE_FRAGMENT_BIT:
4036       return VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
4037    default:
4038       unreachable("Invalid shader stage");
4039    }
4040 }
4041 
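/* For example, a pipeline library created with only
 * VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT imports only the
 * VK_SHADER_STAGE_FRAGMENT_BIT entry from pStages; vertex, tessellation,
 * geometry, task and mesh stages map to the pre-rasterization subset and are
 * skipped by the stage filtering below.
 */
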
4042 template <chip CHIP>
4043 static VkResult
4044 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
4045                           struct tu_pipeline **pipeline)
4046 {
4047    VkResult result;
4048 
4049    if (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
4050       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
4051          &builder->device->vk, builder->alloc,
4052          sizeof(struct tu_graphics_lib_pipeline),
4053          VK_OBJECT_TYPE_PIPELINE);
4054       if (!*pipeline)
4055          return VK_ERROR_OUT_OF_HOST_MEMORY;
4056       (*pipeline)->type = TU_PIPELINE_GRAPHICS_LIB;
4057    } else {
4058       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
4059          &builder->device->vk, builder->alloc,
4060          sizeof(struct tu_graphics_pipeline),
4061          VK_OBJECT_TYPE_PIPELINE);
4062       if (!*pipeline)
4063          return VK_ERROR_OUT_OF_HOST_MEMORY;
4064       (*pipeline)->type = TU_PIPELINE_GRAPHICS;
4065    }
4066 
4067    (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
4068    util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
4069 
4070    tu_pipeline_builder_parse_libraries(builder, *pipeline);
4071 
4072    VkShaderStageFlags stages = 0;
4073    for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
4074       VkShaderStageFlagBits stage = builder->create_info->pStages[i].stage;
4075 
4076       /* Ignore shader stages that don't need to be imported. */
4077       if (!(vk_shader_stage_to_pipeline_library_flags(stage) & builder->state))
4078          continue;
4079 
4080       stages |= stage;
4081    }
4082    builder->active_stages = stages;
4083 
4084    (*pipeline)->active_stages = stages;
4085    for (unsigned i = 0; i < builder->num_libraries; i++)
4086       (*pipeline)->active_stages |= builder->libraries[i]->base.active_stages;
4087 
4088    /* Compile and upload shaders unless a library has already done that. */
4089    if ((*pipeline)->program.vs_state.size == 0) {
4090       tu_pipeline_builder_parse_layout(builder, *pipeline);
4091 
4092       result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
4093       if (result != VK_SUCCESS) {
4094          tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
4095          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
4096          return result;
4097       }
4098    }
4099 
4100    result = tu_pipeline_allocate_cs(builder->device, *pipeline,
4101                                     &builder->layout, builder, NULL);
4102 
4103 
4104    if (set_combined_state(builder, *pipeline,
4105                           VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4106                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
4107       if (result != VK_SUCCESS) {
4108          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
4109          return result;
4110       }
4111 
4112       tu_emit_program_state<CHIP>(&(*pipeline)->cs, &(*pipeline)->program,
4113                                   (*pipeline)->shaders);
4114 
4115       if (CHIP == A6XX) {
4116          /* The blob doesn't preload state on A7XX, likely because preloading
4117           * either doesn't work or doesn't provide any benefit.
4118           */
4119          tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
4120       }
4121    }
4122 
4123    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
4124       tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
4125    }
4126 
4127    if (builder->state &
4128        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
4129       tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
4130    }
4131 
4132    if (set_combined_state(builder, *pipeline,
4133                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4134                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
4135       tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
4136    }
4137 
4138    tu_pipeline_builder_emit_state<CHIP>(builder, *pipeline);
4139 
4140    if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) {
4141       struct tu_graphics_lib_pipeline *library =
4142          tu_pipeline_to_graphics_lib(*pipeline);
4143       result = vk_graphics_pipeline_state_copy(&builder->device->vk,
4144                                                &library->graphics_state,
4145                                                &builder->graphics_state,
4146                                                builder->alloc,
4147                                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4148                                                &library->state_data);
4149       if (result != VK_SUCCESS) {
4150          tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
4151          return result;
4152       }
4153    } else {
4154       struct tu_graphics_pipeline *gfx_pipeline =
4155          tu_pipeline_to_graphics(*pipeline);
4156       gfx_pipeline->dynamic_state.ms.sample_locations =
4157          &gfx_pipeline->sample_locations;
4158       vk_dynamic_graphics_state_fill(&gfx_pipeline->dynamic_state,
4159                                      &builder->graphics_state);
4160       gfx_pipeline->feedback_loops =
4161          vk_pipeline_flags_feedback_loops(builder->graphics_state.pipeline_flags);
4162       gfx_pipeline->feedback_loop_may_involve_textures =
4163          builder->graphics_state.feedback_loop_not_input_only;
4164    }
4165 
4166    return VK_SUCCESS;
4167 }
4168 
4169 static void
4170 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
4171 {
4172    ralloc_free(builder->mem_ctx);
4173 }
4174 
4175 void
4176 tu_fill_render_pass_state(struct vk_render_pass_state *rp,
4177                           const struct tu_render_pass *pass,
4178                           const struct tu_subpass *subpass)
4179 {
4180    rp->view_mask = subpass->multiview_mask;
4181    rp->color_attachment_count = subpass->color_count;
4182 
4183    const uint32_t a = subpass->depth_stencil_attachment.attachment;
4184    rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
4185    rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
4186    rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
4187    if (a != VK_ATTACHMENT_UNUSED) {
4188       VkFormat ds_format = pass->attachments[a].format;
4189       if (vk_format_has_depth(ds_format) && subpass->depth_used) {
4190          rp->depth_attachment_format = ds_format;
4191          rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
4192       }
4193       if (vk_format_has_stencil(ds_format) && subpass->stencil_used) {
4194          rp->stencil_attachment_format = ds_format;
4195          rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
4196       }
4197    }
4198 
4199    for (uint32_t i = 0; i < subpass->color_count; i++) {
4200       const uint32_t a = subpass->color_attachments[i].attachment;
4201       if (a == VK_ATTACHMENT_UNUSED) {
4202          rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
4203          continue;
4204       }
4205 
4206       rp->color_attachment_formats[i] = pass->attachments[a].format;
4207       rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
4208    }
4209 }
4210 
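/* For example, a subpass with a VK_FORMAT_D24_UNORM_S8_UINT attachment whose
 * stencil aspect is never used ends up with depth_attachment_format set,
 * stencil_attachment_format left as VK_FORMAT_UNDEFINED, and only
 * MESA_VK_RP_ATTACHMENT_DEPTH_BIT recorded in rp->attachments.
 */
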
4211 static void
4212 tu_pipeline_builder_init_graphics(
4213    struct tu_pipeline_builder *builder,
4214    struct tu_device *dev,
4215    struct vk_pipeline_cache *cache,
4216    const VkGraphicsPipelineCreateInfo *create_info,
4217    VkPipelineCreateFlags2KHR flags,
4218    const VkAllocationCallbacks *alloc)
4219 {
4220    *builder = (struct tu_pipeline_builder) {
4221       .device = dev,
4222       .mem_ctx = ralloc_context(NULL),
4223       .cache = cache,
4224       .alloc = alloc,
4225       .create_info = create_info,
4226       .create_flags = flags,
4227    };
4228 
4229    const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
4230       vk_find_struct_const(builder->create_info->pNext,
4231                            GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
4232 
4233    const VkPipelineLibraryCreateInfoKHR *library_info =
4234       vk_find_struct_const(builder->create_info->pNext,
4235                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
4236 
4237    if (gpl_info) {
4238       builder->state = gpl_info->flags;
4239    } else {
4240       /* Implement this bit of spec text:
4241        *
4242        *    If this structure is omitted, and either
4243        *    VkGraphicsPipelineCreateInfo::flags includes
4244        *    VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
4245        *    VkGraphicsPipelineCreateInfo::pNext chain includes a
4246        *    VkPipelineLibraryCreateInfoKHR structure with a libraryCount
4247        *    greater than 0, it is as if flags is 0. Otherwise if this
4248        *    structure is omitted, it is as if flags includes all possible
4249        *    subsets of the graphics pipeline (i.e. a complete graphics
4250        *    pipeline).
4251        */
4252       if ((library_info && library_info->libraryCount > 0) ||
4253           (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR)) {
4254          builder->state = 0;
4255       } else {
4256          builder->state =
4257             VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
4258             VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4259             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4260             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
4261       }
4262    }
4263 
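   /* Illustrative app-side sketch (not driver code): a pre-rasterization
    * library takes the gpl_info path above, e.g.
    *
    *    const VkGraphicsPipelineLibraryCreateInfoEXT gpl = {
    *       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT,
    *       .flags = VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT,
    *    };
    *    const VkGraphicsPipelineCreateInfo ci = {
    *       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
    *       .pNext = &gpl,
    *       .flags = VK_PIPELINE_CREATE_LIBRARY_BIT_KHR,
    *       ...
    *    };
    *
    * while a link-only pipeline chains a VkPipelineLibraryCreateInfoKHR with
    * libraryCount > 0 and no VkGraphicsPipelineLibraryCreateInfoEXT, landing
    * in the builder->state = 0 branch.
    */
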
4264    bool rasterizer_discard_dynamic = false;
4265    if (create_info->pDynamicState) {
4266       for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
4267          if (create_info->pDynamicState->pDynamicStates[i] ==
4268                VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
4269             rasterizer_discard_dynamic = true;
4270             break;
4271          }
4272       }
4273    }
4274 
4275    builder->rasterizer_discard =
4276       (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
4277       !rasterizer_discard_dynamic &&
4278       builder->create_info->pRasterizationState->rasterizerDiscardEnable;
4279 
4280    struct vk_render_pass_state rp_state = {};
4281    const struct vk_render_pass_state *driver_rp = NULL;
4282    VkPipelineCreateFlags2KHR rp_flags = 0;
4283 
4284    builder->unscaled_input_fragcoord = 0;
4285 
4286    /* Extract information we need from the turnip renderpass. This will be
4287     * filled out automatically if the app is using dynamic rendering or
4288     * renderpasses are emulated.
4289     */
4290    if (!TU_DEBUG(DYNAMIC) &&
4291        (builder->state &
4292         (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
4293          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
4294          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
4295        builder->create_info->renderPass) {
4296       const struct tu_render_pass *pass =
4297          tu_render_pass_from_handle(create_info->renderPass);
4298       const struct tu_subpass *subpass =
4299          &pass->subpasses[create_info->subpass];
4300 
4301       tu_fill_render_pass_state(&rp_state, pass, subpass);
4302 
4303       for (unsigned i = 0; i < subpass->input_count; i++) {
4304          /* Input attachments stored in GMEM must be loaded with unscaled
4305           * FragCoord.
4306           */
4307          if (subpass->input_attachments[i].patch_input_gmem)
4308             builder->unscaled_input_fragcoord |= 1u << i;
4309       }
4310 
4311       if (subpass->feedback_loop_color) {
4312          rp_flags |=
4313             VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4314       }
4315 
4316       if (subpass->feedback_loop_ds) {
4317          rp_flags |=
4318             VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
4319       }
4320 
4321       if (pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
4322          rp_flags |=
4323             VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT;
4324       }
4325 
4326       if (subpass->fsr_attachment != VK_ATTACHMENT_UNUSED) {
4327          rp_flags |=
4328             VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR;
4329       }
4330 
4340       driver_rp = &rp_state;
4341    }
4342 
4343    vk_graphics_pipeline_state_fill(&dev->vk,
4344                                    &builder->graphics_state,
4345                                    builder->create_info,
4346                                    driver_rp,
4347                                    rp_flags,
4348                                    &builder->all_state,
4349                                    NULL, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4350                                    NULL);
4351 
4352    if (builder->graphics_state.rp) {
4353       builder->fragment_density_map = (builder->graphics_state.pipeline_flags &
4354          VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT) ||
4355          TU_DEBUG(FDM);
4356    }
4357 }
4358 
4359 template <chip CHIP>
4360 static VkResult
4361 tu_graphics_pipeline_create(VkDevice device,
4362                             VkPipelineCache pipelineCache,
4363                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
4364                             VkPipelineCreateFlags2KHR flags,
4365                             const VkAllocationCallbacks *pAllocator,
4366                             VkPipeline *pPipeline)
4367 {
4368    VK_FROM_HANDLE(tu_device, dev, device);
4369    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4370 
4371    cache = cache ? cache : dev->mem_cache;
4372 
4373    struct tu_pipeline_builder builder;
4374    tu_pipeline_builder_init_graphics(&builder, dev, cache,
4375                                      pCreateInfo, flags, pAllocator);
4376 
4377    struct tu_pipeline *pipeline = NULL;
4378    VkResult result = tu_pipeline_builder_build<CHIP>(&builder, &pipeline);
4379    tu_pipeline_builder_finish(&builder);
4380 
4381    if (result == VK_SUCCESS) {
4382       TU_RMV(graphics_pipeline_create, dev, tu_pipeline_to_graphics(pipeline));
4383 
4384       *pPipeline = tu_pipeline_to_handle(pipeline);
4385    } else
4386       *pPipeline = VK_NULL_HANDLE;
4387 
4388    return result;
4389 }
4390 
4391 template <chip CHIP>
4392 VKAPI_ATTR VkResult VKAPI_CALL
4393 tu_CreateGraphicsPipelines(VkDevice device,
4394                            VkPipelineCache pipelineCache,
4395                            uint32_t count,
4396                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
4397                            const VkAllocationCallbacks *pAllocator,
4398                            VkPipeline *pPipelines)
4399 {
4400    MESA_TRACE_FUNC();
4401    VkResult final_result = VK_SUCCESS;
4402    uint32_t i = 0;
4403 
4404    for (; i < count; i++) {
4405       VkPipelineCreateFlags2KHR flags =
4406          vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
4407 
4408       VkResult result =
4409          tu_graphics_pipeline_create<CHIP>(device, pipelineCache,
4410                                            &pCreateInfos[i], flags,
4411                                            pAllocator, &pPipelines[i]);
4412 
4413       if (result != VK_SUCCESS) {
4414          final_result = result;
4415          pPipelines[i] = VK_NULL_HANDLE;
4416 
4417          if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4418             break;
4419       }
4420    }
4421 
4422    for (; i < count; i++)
4423       pPipelines[i] = VK_NULL_HANDLE;
4424 
4425    return final_result;
4426 }
4427 TU_GENX(tu_CreateGraphicsPipelines);
4428 
4429 template <chip CHIP>
4430 static VkResult
4431 tu_compute_pipeline_create(VkDevice device,
4432                            VkPipelineCache pipelineCache,
4433                            const VkComputePipelineCreateInfo *pCreateInfo,
4434                            VkPipelineCreateFlags2KHR flags,
4435                            const VkAllocationCallbacks *pAllocator,
4436                            VkPipeline *pPipeline)
4437 {
4438    VK_FROM_HANDLE(tu_device, dev, device);
4439    VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4440    VK_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
4441    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
4442    VkResult result;
4443    const struct ir3_shader_variant *v = NULL;
4444 
4445    cache = cache ? cache : dev->mem_cache;
4446 
4447    struct tu_compute_pipeline *pipeline;
4448 
4449    *pPipeline = VK_NULL_HANDLE;
4450 
4451    VkPipelineCreationFeedback pipeline_feedback = {
4452       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4453    };
4454 
4455    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
4456       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
4457 
4458    int64_t pipeline_start = os_time_get_nano();
4459 
4460    pipeline = (struct tu_compute_pipeline *) vk_object_zalloc(
4461       &dev->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE);
4462    if (!pipeline)
4463       return VK_ERROR_OUT_OF_HOST_MEMORY;
4464    pipeline->base.type = TU_PIPELINE_COMPUTE;
4465 
4466    pipeline->base.executables_mem_ctx = ralloc_context(NULL);
4467    util_dynarray_init(&pipeline->base.executables, pipeline->base.executables_mem_ctx);
4468    pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4469 
4470    struct tu_shader_key key = { };
4471    bool allow_varying_subgroup_size =
4472       (stage_info->flags &
4473        VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
4474    bool require_full_subgroups =
4475       stage_info->flags &
4476       VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
4477    const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info =
4478       vk_find_struct_const(stage_info,
4479                            PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
4480    tu_shader_key_subgroup_size(&key, allow_varying_subgroup_size,
4481                                require_full_subgroups, subgroup_info,
4482                                dev);
4483 
4484    struct vk_pipeline_robustness_state rs;
4485    vk_pipeline_robustness_state_fill(&dev->vk, &rs,
4486                                      pCreateInfo->pNext,
4487                                      stage_info->pNext);
4488    tu_shader_key_robustness(&key, &rs);
4489 
4490    void *pipeline_mem_ctx = ralloc_context(NULL);
4491 
4492    unsigned char pipeline_sha1[20];
4493    tu_hash_compute(pipeline_sha1, flags, stage_info, layout, &key);
4494 
4495    struct tu_shader *shader = NULL;
4496 
4497    const bool executable_info = flags &
4498       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
4499 
4500    bool application_cache_hit = false;
4501 
4502    if (!executable_info) {
4503       shader =
4504          tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
4505                                   &application_cache_hit);
4506    }
4507 
4508    if (application_cache_hit && cache != dev->mem_cache) {
4509       pipeline_feedback.flags |=
4510          VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4511    }
4512 
4513    char *nir_initial_disasm = NULL;
4514 
4515    if (!shader) {
4516       if (flags &
4517           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
4518          result = VK_PIPELINE_COMPILE_REQUIRED;
4519          goto fail;
4520       }
4521 
4522       struct ir3_shader_key ir3_key = {};
4523 
4524       nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, flags,
4525                                         stage_info, &key, MESA_SHADER_COMPUTE);
4526 
4527       nir_initial_disasm = executable_info ?
4528          nir_shader_as_str(nir, pipeline->base.executables_mem_ctx) : NULL;
4529 
4530       result = tu_shader_create(dev, &shader, nir, &key, &ir3_key,
4531                                 pipeline_sha1, sizeof(pipeline_sha1), layout,
4532                                 executable_info);
4533       if (!shader) {
4534          goto fail;
4535       }
4536 
4537       shader = tu_pipeline_cache_insert(cache, shader);
4538    }
4539 
4540    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4541 
4542    if (creation_feedback) {
4543       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
4544       assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
4545       creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
4546    }
4547 
4548    pipeline->base.active_desc_sets = shader->active_desc_sets;
4549 
4550    v = shader->variant;
4551 
4552    tu_pipeline_set_linkage(&pipeline->base.program.link[MESA_SHADER_COMPUTE],
4553                            &shader->const_state, v);
4554 
4555    result = tu_pipeline_allocate_cs(dev, &pipeline->base, layout, NULL, v);
4556    if (result != VK_SUCCESS)
4557       goto fail;
4558 
4559    for (int i = 0; i < 3; i++)
4560       pipeline->local_size[i] = v->local_size[i];
4561 
4562    if (CHIP == A6XX) {
4563       tu6_emit_load_state(dev, &pipeline->base, layout);
4564    }
4565 
4566    tu_append_executable(&pipeline->base, v, nir_initial_disasm);
4567 
4568    pipeline->instrlen = v->instrlen;
4569 
4570    pipeline->base.shaders[MESA_SHADER_COMPUTE] = shader;
4571 
4572    ralloc_free(pipeline_mem_ctx);
4573 
4574    TU_RMV(compute_pipeline_create, dev, pipeline);
4575 
4576    *pPipeline = tu_pipeline_to_handle(&pipeline->base);
4577 
4578    return VK_SUCCESS;
4579 
4580 fail:
4581    if (shader)
4582       vk_pipeline_cache_object_unref(&dev->vk, &shader->base);
4583 
4584    ralloc_free(pipeline_mem_ctx);
4585 
4586    vk_object_free(&dev->vk, pAllocator, pipeline);
4587 
4588    return result;
4589 }
4590 
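/* Illustrative app-side sketch (not driver code): the creation-feedback
 * handling above is exercised by chaining something like
 *
 *    VkPipelineCreationFeedback pipeline_fb = {0}, stage_fb = {0};
 *    const VkPipelineCreationFeedbackCreateInfo fb = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_CREATION_FEEDBACK_CREATE_INFO,
 *       .pPipelineCreationFeedback = &pipeline_fb,
 *       .pipelineStageCreationFeedbackCount = 1,
 *       .pPipelineStageCreationFeedbacks = &stage_fb,
 *    };
 *
 * into VkComputePipelineCreateInfo::pNext; after creation, pipeline_fb.flags
 * reports whether the application's pipeline cache was hit and
 * pipeline_fb.duration holds the compile time in nanoseconds.
 */
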
4591 template <chip CHIP>
4592 VKAPI_ATTR VkResult VKAPI_CALL
4593 tu_CreateComputePipelines(VkDevice device,
4594                           VkPipelineCache pipelineCache,
4595                           uint32_t count,
4596                           const VkComputePipelineCreateInfo *pCreateInfos,
4597                           const VkAllocationCallbacks *pAllocator,
4598                           VkPipeline *pPipelines)
4599 {
4600    MESA_TRACE_FUNC();
4601    VkResult final_result = VK_SUCCESS;
4602    uint32_t i = 0;
4603 
4604    for (; i < count; i++) {
4605       VkPipelineCreateFlags2KHR flags =
4606          vk_compute_pipeline_create_flags(&pCreateInfos[i]);
4607 
4608       VkResult result =
4609          tu_compute_pipeline_create<CHIP>(device, pipelineCache,
4610                                           &pCreateInfos[i], flags,
4611                                           pAllocator, &pPipelines[i]);
4612       if (result != VK_SUCCESS) {
4613          final_result = result;
4614          pPipelines[i] = VK_NULL_HANDLE;
4615 
4616          if (flags &
4617              VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4618             break;
4619       }
4620    }
4621 
4622    for (; i < count; i++)
4623       pPipelines[i] = VK_NULL_HANDLE;
4624 
4625    return final_result;
4626 }
4627 TU_GENX(tu_CreateComputePipelines);
4628 
4629 VKAPI_ATTR void VKAPI_CALL
4630 tu_DestroyPipeline(VkDevice _device,
4631                    VkPipeline _pipeline,
4632                    const VkAllocationCallbacks *pAllocator)
4633 {
4634    VK_FROM_HANDLE(tu_device, dev, _device);
4635    VK_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
4636 
4637    if (!_pipeline)
4638       return;
4639 
4640    TU_RMV(resource_destroy, dev, pipeline);
4641 
4642    tu_pipeline_finish(pipeline, dev, pAllocator);
4643    vk_object_free(&dev->vk, pAllocator, pipeline);
4644 }
4645 
4646 #define WRITE_STR(field, ...) ({                                \
4647    memset(field, 0, sizeof(field));                             \
4648    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
4649    assert(_i > 0 && _i < sizeof(field));                        \
4650 })
4651 
4652 static const struct tu_pipeline_executable *
4653 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
4654 {
4655    assert(index < util_dynarray_num_elements(&pipeline->executables,
4656                                              struct tu_pipeline_executable));
4657    return util_dynarray_element(
4658       &pipeline->executables, struct tu_pipeline_executable, index);
4659 }
4660 
4661 VKAPI_ATTR VkResult VKAPI_CALL
4662 tu_GetPipelineExecutablePropertiesKHR(
4663       VkDevice _device,
4664       const VkPipelineInfoKHR* pPipelineInfo,
4665       uint32_t* pExecutableCount,
4666       VkPipelineExecutablePropertiesKHR* pProperties)
4667 {
4668    VK_FROM_HANDLE(tu_device, dev, _device);
4669    VK_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
4670    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
4671                           pProperties, pExecutableCount);
4672 
4673    util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
4674       vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
4675          gl_shader_stage stage = exe->stage;
4676          props->stages = mesa_to_vk_shader_stage(stage);
4677 
4678          if (!exe->is_binning)
4679             WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
4680          else
4681             WRITE_STR(props->name, "Binning VS");
4682 
4683          WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
4684 
4685          props->subgroupSize =
4686             dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
4687       }
4688    }
4689 
4690    return vk_outarray_status(&out);
4691 }
4692 
4693 VKAPI_ATTR VkResult VKAPI_CALL
4694 tu_GetPipelineExecutableStatisticsKHR(
4695       VkDevice _device,
4696       const VkPipelineExecutableInfoKHR* pExecutableInfo,
4697       uint32_t* pStatisticCount,
4698       VkPipelineExecutableStatisticKHR* pStatistics)
4699 {
4700    VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4701    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
4702                           pStatistics, pStatisticCount);
4703 
4704    const struct tu_pipeline_executable *exe =
4705       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4706 
4707    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4708       WRITE_STR(stat->name, "Max Waves Per Core");
4709       WRITE_STR(stat->description,
4710                 "Maximum number of simultaneous waves per core.");
4711       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4712       stat->value.u64 = exe->stats.max_waves;
4713    }
4714 
4715    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4716       WRITE_STR(stat->name, "Instruction Count");
4717       WRITE_STR(stat->description,
4718                 "Total number of IR3 instructions in the final generated "
4719                 "shader executable.");
4720       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4721       stat->value.u64 = exe->stats.instrs_count;
4722    }
4723 
4724    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4725       WRITE_STR(stat->name, "Code size");
4726       WRITE_STR(stat->description,
4727                 "Total number of dwords in the final generated "
4728                 "shader executable.");
4729       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4730       stat->value.u64 = exe->stats.sizedwords;
4731    }
4732 
4733    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4734       WRITE_STR(stat->name, "NOPs Count");
4735       WRITE_STR(stat->description,
4736                 "Number of NOP instructions in the final generated "
4737                 "shader executable.");
4738       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4739       stat->value.u64 = exe->stats.nops_count;
4740    }
4741 
4742    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4743       WRITE_STR(stat->name, "MOV Count");
4744       WRITE_STR(stat->description,
4745                 "Number of MOV instructions in the final generated "
4746                 "shader executable.");
4747       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4748       stat->value.u64 = exe->stats.mov_count;
4749    }
4750 
4751    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4752       WRITE_STR(stat->name, "COV Count");
4753       WRITE_STR(stat->description,
4754                 "Number of COV instructions in the final generated "
4755                 "shader executable.");
4756       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4757       stat->value.u64 = exe->stats.cov_count;
4758    }
4759 
4760    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4761       WRITE_STR(stat->name, "Registers used");
4762       WRITE_STR(stat->description,
4763                 "Number of registers used in the final generated "
4764                 "shader executable.");
4765       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4766       stat->value.u64 = exe->stats.max_reg + 1;
4767    }
4768 
4769    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4770       WRITE_STR(stat->name, "Half-registers used");
4771       WRITE_STR(stat->description,
4772                 "Number of half-registers used in the final generated "
4773                 "shader executable.");
4774       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4775       stat->value.u64 = exe->stats.max_half_reg + 1;
4776    }
4777 
4778    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4779       WRITE_STR(stat->name, "Last interpolation instruction");
4780       WRITE_STR(stat->description,
4781                 "The instruction where varying storage in Local Memory is released");
4782       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4783       stat->value.u64 = exe->stats.last_baryf;
4784    }
4785 
4786    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4787       WRITE_STR(stat->name, "Last helper instruction");
4788       WRITE_STR(stat->description,
4789                 "The instruction where helper invocations are killed");
4790       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4791       stat->value.u64 = exe->stats.last_helper;
4792    }
4793 
4794    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4795       WRITE_STR(stat->name, "Instructions with SS sync bit");
4796       WRITE_STR(stat->description,
4797                 "SS bit is set for instructions which depend on a result "
4798                 "of \"long\" instructions to prevent a RAW hazard.");
4799       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4800       stat->value.u64 = exe->stats.ss;
4801    }
4802 
4803    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4804       WRITE_STR(stat->name, "Instructions with SY sync bit");
4805       WRITE_STR(stat->description,
4806                 "SY bit is set for instructions which depend on a result "
4807                 "of loads from global memory to prevent a RAW hazard.");
4808       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4809       stat->value.u64 = exe->stats.sy;
4810    }
4811 
4812    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4813       WRITE_STR(stat->name, "Estimated cycles stalled on SS");
4814       WRITE_STR(stat->description,
4815                 "A better metric to estimate the impact of SS syncs.");
4816       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4817       stat->value.u64 = exe->stats.sstall;
4818    }
4819 
4820    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4821       WRITE_STR(stat->name, "Estimated cycles stalled on SY");
4822       WRITE_STR(stat->description,
4823                 "A better metric to estimate the impact of SY syncs.");
4824       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4825       stat->value.u64 = exe->stats.systall;
4826    }
4827 
4828    for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
4829       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4830          WRITE_STR(stat->name, "cat%d instructions", i);
4831          WRITE_STR(stat->description,
4832                   "Number of cat%d instructions.", i);
4833          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4834          stat->value.u64 = exe->stats.instrs_per_cat[i];
4835       }
4836    }
4837 
4838    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4839       WRITE_STR(stat->name, "STP Count");
4840       WRITE_STR(stat->description,
4841                 "Number of STore Private instructions in the final generated "
4842                 "shader executable.");
4843       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4844       stat->value.u64 = exe->stats.stp_count;
4845    }
4846 
4847    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4848       WRITE_STR(stat->name, "LDP Count");
4849       WRITE_STR(stat->description,
4850                 "Number of LoaD Private instructions in the final generated "
4851                 "shader executable.");
4852       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4853       stat->value.u64 = exe->stats.ldp_count;
4854    }
4855 
4856    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4857       WRITE_STR(stat->name, "Preamble Instruction Count");
4858       WRITE_STR(stat->description,
4859                 "Total number of IR3 instructions in the preamble.");
4860       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4861       stat->value.u64 = exe->stats.preamble_instrs_count;
4862    }
4863 
4864    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4865       WRITE_STR(stat->name, "Early preamble");
4866       WRITE_STR(stat->description,
4867                 "Whether the preamble will be executed early.");
4868       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_BOOL32_KHR;
4869       stat->value.b32 = exe->stats.early_preamble;
4870    }
4871 
4872    return vk_outarray_status(&out);
4873 }
4874 
4875 static bool
4876 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
4877               const char *data)
4878 {
4879    ir->isText = VK_TRUE;
4880 
4881    size_t data_len = strlen(data) + 1;
4882 
4883    if (ir->pData == NULL) {
4884       ir->dataSize = data_len;
4885       return true;
4886    }
4887 
4888    strncpy((char *) ir->pData, data, ir->dataSize);
4889    if (ir->dataSize < data_len)
4890       return false;
4891 
4892    ir->dataSize = data_len;
4893    return true;
4894 }
4895 
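/* Illustrative app-side sketch (not driver code): write_ir_text() implements
 * the usual Vulkan two-call idiom, where exe_info is a filled-in
 * VkPipelineExecutableInfoKHR:
 *
 *    uint32_t count = 0;
 *    vkGetPipelineExecutableInternalRepresentationsKHR(dev, &exe_info,
 *                                                      &count, NULL);
 *    VkPipelineExecutableInternalRepresentationKHR *irs =
 *       calloc(count, sizeof(*irs));
 *    for (uint32_t i = 0; i < count; i++)
 *       irs[i].sType =
 *          VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INTERNAL_REPRESENTATION_KHR;
 *    vkGetPipelineExecutableInternalRepresentationsKHR(dev, &exe_info,
 *                                                      &count, irs);
 *
 * After this call each irs[i].dataSize holds the required size; the caller
 * allocates irs[i].pData and calls again to receive the text, with
 * VK_INCOMPLETE returned if any representation was truncated.
 */
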
4896 VKAPI_ATTR VkResult VKAPI_CALL
4897 tu_GetPipelineExecutableInternalRepresentationsKHR(
4898     VkDevice _device,
4899     const VkPipelineExecutableInfoKHR* pExecutableInfo,
4900     uint32_t* pInternalRepresentationCount,
4901     VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
4902 {
4903    VK_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4904    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
4905                           pInternalRepresentations, pInternalRepresentationCount);
4906    bool incomplete_text = false;
4907 
4908    const struct tu_pipeline_executable *exe =
4909       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4910 
4911    if (exe->nir_from_spirv) {
4912       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4913          WRITE_STR(ir->name, "NIR from SPIRV");
4914          WRITE_STR(ir->description,
4915                    "Initial NIR before any optimizations");
4916 
4917          if (!write_ir_text(ir, exe->nir_from_spirv))
4918             incomplete_text = true;
4919       }
4920    }
4921 
4922    if (exe->nir_final) {
4923       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4924          WRITE_STR(ir->name, "Final NIR");
4925          WRITE_STR(ir->description,
4926                    "Final NIR before going into the back-end compiler");
4927 
4928          if (!write_ir_text(ir, exe->nir_final))
4929             incomplete_text = true;
4930       }
4931    }
4932 
4933    if (exe->disasm) {
4934       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4935          WRITE_STR(ir->name, "IR3 Assembly");
4936          WRITE_STR(ir->description,
4937                    "Final IR3 assembly for the generated shader binary");
4938 
4939          if (!write_ir_text(ir, exe->disasm))
4940             incomplete_text = true;
4941       }
4942    }
4943 
4944    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
4945 }
4946