1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  * SPDX-License-Identifier: MIT
5  *
6  * based in part on anv driver which is:
7  * Copyright © 2015 Intel Corporation
8  */
9 
10 #include "tu_pipeline.h"
11 
12 #include "common/freedreno_guardband.h"
13 
14 #include "ir3/ir3_nir.h"
15 #include "nir/nir.h"
16 #include "nir/nir_builder.h"
17 #include "nir/nir_serialize.h"
18 #include "spirv/nir_spirv.h"
19 #include "util/u_debug.h"
20 #include "util/mesa-sha1.h"
21 #include "vk_nir.h"
22 #include "vk_pipeline.h"
23 #include "vk_render_pass.h"
24 #include "vk_util.h"
25 
26 #include "tu_cmd_buffer.h"
27 #include "tu_cs.h"
28 #include "tu_device.h"
29 #include "tu_knl.h"
30 #include "tu_formats.h"
31 #include "tu_lrz.h"
32 #include "tu_pass.h"
33 
34 /* Emit IB that preloads the descriptors that the shader uses */
35 
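/* Emit a single CP_LOAD_STATE6* packet that prefetches `count` units of
 * bindless state (SS6_BINDLESS); the descriptor set index `base` is packed
 * into bits [31:28] alongside the dword `offset` into the set.
 *
 * Illustrative sketch only (hypothetical values): prefetching N UBO
 * descriptors for the fragment shader out of set 1 at dword offset `off`
 * would look roughly like
 *
 *    emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_UBO, SB6_FS_SHADER,
 *                    1, off, N);
 */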
36 static void
37 emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
38                 enum a6xx_state_block sb, unsigned base, unsigned offset,
39                 unsigned count)
40 {
41    /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
42     * clear if emitting more packets will even help anything. Presumably the
43     * descriptor cache is relatively small, and these packets stop doing
44     * anything when there are too many descriptors.
45     */
46    tu_cs_emit_pkt7(cs, opcode, 3);
47    tu_cs_emit(cs,
48               CP_LOAD_STATE6_0_STATE_TYPE(st) |
49               CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
50               CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
51               CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
52    tu_cs_emit_qw(cs, offset | (base << 28));
53 }
54 
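/* Conservative upper bound, in dwords, on the prefetch command stream built
 * by tu6_emit_load_state(): each emit_load_state() call costs 4 dwords (one
 * CP_LOAD_STATE6 header plus 3 payload dwords).
 */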
55 static unsigned
56 tu6_load_state_size(struct tu_pipeline *pipeline,
57                     struct tu_pipeline_layout *layout)
58 {
59    const unsigned load_state_size = 4;
60    unsigned size = 0;
61    for (unsigned i = 0; i < layout->num_sets; i++) {
62       if (!(pipeline->active_desc_sets & (1u << i)))
63          continue;
64 
65       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
66       for (unsigned j = 0; j < set_layout->binding_count; j++) {
67          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
68          unsigned count = 0;
69          /* See comment in tu6_emit_load_state(). */
70          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
71          unsigned stage_count = util_bitcount(stages);
72 
73          if (!binding->array_size)
74             continue;
75 
76          switch (binding->type) {
77          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
78          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
79          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
80          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
81             /* IBO-backed resources only need one packet for all graphics stages */
82             if (stage_count)
83                count += 1;
84             break;
85          case VK_DESCRIPTOR_TYPE_SAMPLER:
86          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
87          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
88          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
89          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
90             /* Textures and UBOs need a packet for each stage */
91             count = stage_count;
92             break;
93          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
94             /* Because of how we pack combined images and samplers, we
95              * currently can't use one packet for the whole array.
96              */
97             count = stage_count * binding->array_size * 2;
98             break;
99          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
100          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
101          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
102             break;
103          default:
104             unreachable("bad descriptor type");
105          }
106          size += count * load_state_size;
107       }
108    }
109    return size;
110 }
111 
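/* Build the descriptor-prefetch IB for this pipeline: walk every binding of
 * every descriptor set the pipeline statically uses and emit the
 * CP_LOAD_STATE6 packets that warm the descriptor cache for the active
 * stages. The result is stored as a draw state in pipeline->load_state.
 */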
112 static void
113 tu6_emit_load_state(struct tu_device *device,
114                     struct tu_pipeline *pipeline,
115                     struct tu_pipeline_layout *layout)
116 {
117    unsigned size = tu6_load_state_size(pipeline, layout);
118    if (size == 0)
119       return;
120 
121    struct tu_cs cs;
122    tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
123 
124    for (unsigned i = 0; i < layout->num_sets; i++) {
125       /* From 13.2.7. Descriptor Set Binding:
126        *
127        *    A compatible descriptor set must be bound for all set numbers that
128        *    any shaders in a pipeline access, at the time that a draw or
129        *    dispatch command is recorded to execute using that pipeline.
130        *    However, if none of the shaders in a pipeline statically use any
131        *    bindings with a particular set number, then no descriptor set need
132        *    be bound for that set number, even if the pipeline layout includes
133        *    a non-trivial descriptor set layout for that set number.
134        *
135        * This means that descriptor sets unused by the pipeline may have a
136        * garbage or 0 BINDLESS_BASE register, which will cause context faults
137        * when prefetching descriptors from these sets. Skip prefetching for
138        * descriptors from them to avoid this. This is also an optimization,
139        * since these prefetches would be useless.
140        */
141       if (!(pipeline->active_desc_sets & (1u << i)))
142          continue;
143 
144       struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
145       for (unsigned j = 0; j < set_layout->binding_count; j++) {
146          struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
147          unsigned base = i;
148          unsigned offset = binding->offset / 4;
149          /* Note: amber sets VK_SHADER_STAGE_ALL for its descriptor layout, and
150           * zink has descriptors for each stage in the push layout even if some
151           * stages aren't present in a used pipeline.  We don't want to emit
152           * loads for unused descriptors.
153           */
154          VkShaderStageFlags stages = pipeline->active_stages & binding->shader_stages;
155          unsigned count = binding->array_size;
156 
157          /* If this is a variable-count descriptor, then the array_size is an
158           * upper bound on the size, but we don't know how many descriptors
159           * will actually be used. Therefore we can't pre-load them here.
160           */
161          if (j == set_layout->binding_count - 1 &&
162              set_layout->has_variable_descriptors)
163             continue;
164 
165          if (count == 0 || stages == 0)
166             continue;
167          switch (binding->type) {
168          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
169             assert(device->physical_device->reserved_set_idx >= 0);
170             base = device->physical_device->reserved_set_idx;
171             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
172                       binding->dynamic_offset_offset) / 4;
173             FALLTHROUGH;
174          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
175          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
176          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
177             unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
178             /* IBO-backed resources only need one packet for all graphics stages */
179             if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
180                emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
181                                base, offset, count * mul);
182             }
183             if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
184                emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
185                                base, offset, count * mul);
186             }
187             break;
188          }
189          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
190          case VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK:
191          case VK_DESCRIPTOR_TYPE_MUTABLE_EXT:
192             /* nothing - input attachments and inline uniforms don't use bindless */
193             break;
194          case VK_DESCRIPTOR_TYPE_SAMPLER:
195          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
196          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
197             tu_foreach_stage(stage, stages) {
198                emit_load_state(&cs, tu6_stage2opcode(stage),
199                                binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
200                                ST6_SHADER : ST6_CONSTANTS,
201                                tu6_stage2texsb(stage), base, offset, count);
202             }
203             break;
204          }
205          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
206             assert(device->physical_device->reserved_set_idx >= 0);
207             base = device->physical_device->reserved_set_idx;
208             offset = (pipeline->program.dynamic_descriptor_offsets[i] +
209                       binding->dynamic_offset_offset) / 4;
210             FALLTHROUGH;
211          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
212             tu_foreach_stage(stage, stages) {
213                emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
214                                tu6_stage2shadersb(stage), base, offset, count);
215             }
216             break;
217          }
218          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
219             tu_foreach_stage(stage, stages) {
220                /* TODO: We could emit less CP_LOAD_STATE6 if we used
221                 * struct-of-arrays instead of array-of-structs.
222                 */
223                for (unsigned i = 0; i < count; i++) {
224                   unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
225                   unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
226                   emit_load_state(&cs, tu6_stage2opcode(stage),
227                                   ST6_CONSTANTS, tu6_stage2texsb(stage),
228                                   base, tex_offset, 1);
229                   emit_load_state(&cs, tu6_stage2opcode(stage),
230                                   ST6_SHADER, tu6_stage2texsb(stage),
231                                   base, sam_offset, 1);
232                }
233             }
234             break;
235          }
236          default:
237             unreachable("bad descriptor type");
238          }
239       }
240    }
241 
242    pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
243 }
244 
245 struct tu_pipeline_builder
246 {
247    struct tu_device *device;
248    void *mem_ctx;
249    struct vk_pipeline_cache *cache;
250    const VkAllocationCallbacks *alloc;
251    const VkGraphicsPipelineCreateInfo *create_info;
252    VkPipelineCreateFlags2KHR create_flags;
253 
254    struct tu_pipeline_layout layout;
255 
256    struct tu_pvtmem_config pvtmem;
257 
258    bool rasterizer_discard;
259    /* these states are affected by rasterizer_discard */
260    uint8_t unscaled_input_fragcoord;
261 
262    /* Each library defines at least one piece of state in
263     * VkGraphicsPipelineLibraryFlagsEXT, and libraries cannot overlap, so
264     * there can be at most as many libraries as pieces of state, of which
265     * there are currently 4.
266     */
267 #define MAX_LIBRARIES 4
268 
269    unsigned num_libraries;
270    struct tu_graphics_lib_pipeline *libraries[MAX_LIBRARIES];
271 
272    /* This is just the state that we are compiling now, whereas the final
273     * pipeline will include the state from the libraries.
274     */
275    VkGraphicsPipelineLibraryFlagsEXT state;
276 
277    /* The stages we are compiling now. */
278    VkShaderStageFlags active_stages;
279 
280    bool fragment_density_map;
281 
282    struct vk_graphics_pipeline_all_state all_state;
283    struct vk_graphics_pipeline_state graphics_state;
284 };
285 
286 static bool
287 tu_logic_op_reads_dst(VkLogicOp op)
288 {
289    switch (op) {
290    case VK_LOGIC_OP_CLEAR:
291    case VK_LOGIC_OP_COPY:
292    case VK_LOGIC_OP_COPY_INVERTED:
293    case VK_LOGIC_OP_SET:
294       return false;
295    default:
296       return true;
297    }
298 }
299 
300 static bool
301 tu_blend_state_is_dual_src(const struct vk_color_blend_state *cb)
302 {
303    for (unsigned i = 0; i < cb->attachment_count; i++) {
304       if (tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_color_blend_factor) ||
305           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_color_blend_factor) ||
306           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].src_alpha_blend_factor) ||
307           tu_blend_factor_is_dual_src((VkBlendFactor)cb->attachments[i].dst_alpha_blend_factor))
308          return true;
309    }
310 
311    return false;
312 }
313 
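/* Pick how push constants are lowered for this pipeline: not at all, as
 * regular per-stage constants, as HW shared constants, or (gen7+) as shared
 * constants loaded from the shader preamble.
 */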
314 enum ir3_push_consts_type
315 tu_push_consts_type(const struct tu_pipeline_layout *layout,
316                     const struct ir3_compiler *compiler)
317 {
318    if (!layout->push_constant_size)
319       return IR3_PUSH_CONSTS_NONE;
320 
321    if (TU_DEBUG(PUSH_CONSTS_PER_STAGE))
322       return IR3_PUSH_CONSTS_PER_STAGE;
323 
324    if (tu6_shared_constants_enable(layout, compiler)) {
325       return IR3_PUSH_CONSTS_SHARED;
326    } else {
327       if (compiler->gen >= 7) {
328          return IR3_PUSH_CONSTS_SHARED_PREAMBLE;
329       } else {
330          return IR3_PUSH_CONSTS_PER_STAGE;
331       }
332    }
333 }
334 
335 template <chip CHIP>
336 struct xs_config {
337    uint16_t reg_sp_xs_config;
338    uint16_t reg_hlsq_xs_ctrl;
339 };
340 
341 template <chip CHIP>
342 static const xs_config<CHIP> xs_configs[] = {
343    [MESA_SHADER_VERTEX] = {
344       REG_A6XX_SP_VS_CONFIG,
345       CHIP == A6XX ? REG_A6XX_HLSQ_VS_CNTL : REG_A7XX_HLSQ_VS_CNTL,
346    },
347    [MESA_SHADER_TESS_CTRL] = {
348       REG_A6XX_SP_HS_CONFIG,
349       CHIP == A6XX ? REG_A6XX_HLSQ_HS_CNTL : REG_A7XX_HLSQ_HS_CNTL,
350    },
351    [MESA_SHADER_TESS_EVAL] = {
352       REG_A6XX_SP_DS_CONFIG,
353       CHIP == A6XX ? REG_A6XX_HLSQ_DS_CNTL : REG_A7XX_HLSQ_DS_CNTL,
354    },
355    [MESA_SHADER_GEOMETRY] = {
356       REG_A6XX_SP_GS_CONFIG,
357       CHIP == A6XX ? REG_A6XX_HLSQ_GS_CNTL : REG_A7XX_HLSQ_GS_CNTL,
358    },
359    [MESA_SHADER_FRAGMENT] = {
360       REG_A6XX_SP_FS_CONFIG,
361       CHIP == A6XX ? REG_A6XX_HLSQ_FS_CNTL : REG_A7XX_HLSQ_FS_CNTL,
362    },
363    [MESA_SHADER_COMPUTE] = {
364       REG_A6XX_SP_CS_CONFIG,
365       CHIP == A6XX ? REG_A6XX_HLSQ_CS_CNTL : REG_A7XX_HLSQ_CS_CNTL,
366    },
367 };
368 
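/* Program the per-stage SP_xS_CONFIG / HLSQ_xS_CNTL register pair. A NULL
 * variant disables the stage by writing zeros; otherwise the stage is
 * enabled with its bindless flags, tex/sampler counts, and constlen.
 */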
369 template <chip CHIP>
370 void
371 tu6_emit_xs_config(struct tu_cs *cs,
372                    gl_shader_stage stage, /* xs->type, but xs may be NULL */
373                    const struct ir3_shader_variant *xs)
374 {
375    const struct xs_config<CHIP> *cfg = &xs_configs<CHIP>[stage];
376 
377    if (!xs) {
378       /* shader stage disabled */
379       tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
380       tu_cs_emit(cs, 0);
381 
382       tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
383       tu_cs_emit(cs, 0);
384       return;
385    }
386 
387    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
388    tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
389                   COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
390                   COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
391                   COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
392                   COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
393                   A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
394                   A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));
395 
396    tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
397    tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
398                      A6XX_HLSQ_VS_CNTL_ENABLED |
399                      COND(xs->shader_options.push_consts_type == IR3_PUSH_CONSTS_SHARED_PREAMBLE,
400                           A7XX_HLSQ_VS_CNTL_READ_IMM_SHARED_CONSTS));
401 }
402 TU_GENX(tu6_emit_xs_config);
403 
404 static void
405 tu6_emit_dynamic_offset(struct tu_cs *cs,
406                         const struct ir3_shader_variant *xs,
407                         const struct tu_shader *shader,
408                         const struct tu_program_state *program)
409 {
410    const struct tu_physical_device *phys_dev = cs->device->physical_device;
411 
412    if (!xs)
413       return;
414 
415    if (cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
416       if (shader->const_state.dynamic_offsets_ubo.size == 0)
417          return;
418 
419       uint32_t offsets[MAX_SETS];
420       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
421          unsigned dynamic_offset_start =
422             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
423          offsets[i] = dynamic_offset_start;
424       }
425 
426       /* A7XX TODO: Emit data via sub_cs instead of NOP */
427       uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
428       uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
429 
430       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
431       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
432                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
433                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
434                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
435                CP_LOAD_STATE6_0_NUM_UNIT(1));
436       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
437       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
438       int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
439       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
440    } else {
441       if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
442          return;
443 
444       tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
445       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
446                CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
447                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
448                CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
449                CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
450       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
451       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
452 
453       for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
454          unsigned dynamic_offset_start =
455             program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
456          tu_cs_emit(cs, dynamic_offset_start);
457       }
458    }
459 }
460 
461 template <chip CHIP>
462 void
463 tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
464 {
465    if (CHIP == A6XX) {
466       /* Enable/disable shared constants */
467       tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
468    } else {
469       assert(!enable);
470    }
471 
472    tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
473                                             .isammode = ISAMMODE_GL,
474                                             .shared_consts_enable = enable));
475 }
476 TU_GENX(tu6_emit_shared_consts_enable);
477 
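/* Program the VPC stream-output (transform feedback) state. The SO program
 * RAM maps VPC output locations to (buffer, dword offset) pairs, two
 * locations per program dword, and everything is written as register/value
 * pairs inside a CP_CONTEXT_REG_BUNCH packet.
 */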
478 template <chip CHIP>
479 static void
480 tu6_setup_streamout(struct tu_cs *cs,
481                     const struct ir3_shader_variant *v,
482                     const struct ir3_shader_linkage *l)
483 {
484    const struct ir3_stream_output_info *info = &v->stream_output;
485    /* Note: 64 here comes from the HW layout of the program RAM. The program
486     * for stream N is at DWORD 64 * N.
487     */
488 #define A6XX_SO_PROG_DWORDS 64
489    uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
490    BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};
491 
492    /* TODO: streamout state should be in a non-GMEM draw state */
493 
494    /* no streamout: */
495    if (info->num_outputs == 0) {
496       unsigned sizedw = 4;
497       if (cs->device->physical_device->info->a6xx.tess_use_shared)
498          sizedw += 2;
499 
500       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, sizedw);
501       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
502       tu_cs_emit(cs, 0);
503       tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
504       tu_cs_emit(cs, 0);
505 
506       if (cs->device->physical_device->info->a6xx.tess_use_shared) {
507          tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
508          tu_cs_emit(cs, 0);
509       }
510 
511       return;
512    }
513 
514    for (unsigned i = 0; i < info->num_outputs; i++) {
515       const struct ir3_stream_output *out = &info->output[i];
516       unsigned k = out->register_index;
517       unsigned idx;
518 
519       /* Skip it if it's an output that was never assigned a register. */
520       if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
521          continue;
522 
523       /* The linkage map is sorted in the order the frag shader wants
524        * things, so this lookup is a bit less ideal.
525        */
526       for (idx = 0; idx < l->cnt; idx++)
527          if (l->var[idx].slot == v->outputs[k].slot)
528             break;
529 
530       assert(idx < l->cnt);
531 
532       for (unsigned j = 0; j < out->num_components; j++) {
533          unsigned c   = j + out->start_component;
534          unsigned loc = l->var[idx].loc + c;
535          unsigned off = j + out->dst_offset;  /* in dwords */
536 
537          assert(loc < A6XX_SO_PROG_DWORDS * 2);
538          unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
539          if (loc & 1) {
540             prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
541                            A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
542                            A6XX_VPC_SO_PROG_B_OFF(off * 4);
543          } else {
544             prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
545                            A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
546                            A6XX_VPC_SO_PROG_A_OFF(off * 4);
547          }
548          BITSET_SET(valid_dwords, dword);
549       }
550    }
551 
552    unsigned prog_count = 0;
553    unsigned start, end;
554    BITSET_FOREACH_RANGE(start, end, valid_dwords,
555                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
556       prog_count += end - start + 1;
557    }
558 
559    const bool emit_pc_so_stream_cntl =
560       cs->device->physical_device->info->a6xx.tess_use_shared &&
561       v->type == MESA_SHADER_TESS_EVAL;
562 
563    if (emit_pc_so_stream_cntl)
564       prog_count += 1;
565 
566    tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
567    tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
568    tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
569                   COND(info->stride[0] > 0,
570                        A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
571                   COND(info->stride[1] > 0,
572                        A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
573                   COND(info->stride[2] > 0,
574                        A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
575                   COND(info->stride[3] > 0,
576                        A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
577    for (uint32_t i = 0; i < 4; i++) {
578       tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
579       tu_cs_emit(cs, info->stride[i]);
580    }
581    bool first = true;
582    BITSET_FOREACH_RANGE(start, end, valid_dwords,
583                         A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
584       tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
585       tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
586                      A6XX_VPC_SO_CNTL_ADDR(start));
587       for (unsigned i = start; i < end; i++) {
588          tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
589          tu_cs_emit(cs, prog[i]);
590       }
591       first = false;
592    }
593 
594    if (emit_pc_so_stream_cntl) {
595       /* Possibly not tess_use_shared related, but the combination of
596        * tess + xfb fails some tests if we don't emit this.
597        */
598       tu_cs_emit(cs, REG_A6XX_PC_SO_STREAM_CNTL);
599       tu_cs_emit(cs, A6XX_PC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written));
600    }
601 }
602 
603 enum tu_geom_consts_type
604 {
605    TU_CONSTS_PRIMITIVE_MAP,
606    TU_CONSTS_PRIMITIVE_PARAM,
607 };
608 
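/* Upload a block of geometry/tess driver params. On a6xx (and always for the
 * VS stage block) this goes out as immediate constants via CP_LOAD_STATE6,
 * clamped to the stage's constlen. On a7xx, where shader consts are loaded
 * via the preamble, the data is instead written to memory and bound as a
 * driver-internal UBO.
 */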
609 static void
610 tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
611                const struct ir3_const_state *const_state,
612                unsigned constlen, enum a6xx_state_block block,
613                uint32_t offset, uint32_t size, const uint32_t *dwords) {
614    assert(size % 4 == 0);
615    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
616 
617    if (block == SB6_VS_SHADER || !cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
618       uint32_t base;
619       switch (type) {
620       case TU_CONSTS_PRIMITIVE_MAP:
621          base = const_state->offsets.primitive_map;
622          break;
623       case TU_CONSTS_PRIMITIVE_PARAM:
624          base = const_state->offsets.primitive_param;
625          break;
626       default:
627          unreachable("bad consts type");
628       }
629 
630       int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
631       if (adjusted_size <= 0)
632          return;
633 
634       tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
635       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
636             CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
637             CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
638             CP_LOAD_STATE6_0_STATE_BLOCK(block) |
639             CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
640 
641       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
642       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
643 
644       tu_cs_emit_array(cs, dwords, adjusted_size);
645    } else {
646       uint32_t base;
647       switch (type) {
648       case TU_CONSTS_PRIMITIVE_MAP:
649          base = const_state->primitive_map_ubo.idx;
650          break;
651       case TU_CONSTS_PRIMITIVE_PARAM:
652          base = const_state->primitive_param_ubo.idx;
653          break;
654       default:
655          unreachable("bad consts type");
656       }
657       if (base == -1)
658          return;
659 
660       /* A7XX TODO: Emit data via sub_cs instead of NOP */
661       uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
662 
663       tu_cs_emit_pkt7(cs, opcode, 5);
664       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
665                CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
666                CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
667                CP_LOAD_STATE6_0_STATE_BLOCK(block) |
668                CP_LOAD_STATE6_0_NUM_UNIT(1));
669       tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
670       tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
671       int size_vec4s = DIV_ROUND_UP(size, 4);
672       tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
673    }
674 }
675 
676 static void
677 tu6_emit_link_map(struct tu_cs *cs,
678                   const struct ir3_shader_variant *producer,
679                   const struct ir3_shader_variant *consumer,
680                   enum a6xx_state_block sb)
681 {
682    const struct ir3_const_state *const_state = ir3_const_state(consumer);
683    uint32_t size = ALIGN(consumer->input_size, 4);
684 
685    if (size == 0)
686       return;
687 
688    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
689                   const_state, consumer->constlen, sb, 0, size, producer->output_loc);
690 }
691 
692 static int
693 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
694                      const struct ir3_shader_variant *last_shader,
695                      uint32_t index,
696                      uint8_t *interp_mode,
697                      uint8_t *ps_repl_mode)
698 {
699    const uint32_t compmask = fs->inputs[index].compmask;
700 
701    /* NOTE: varyings are packed, so if compmask is 0xb then the first, second,
702     * and fourth components occupy three consecutive varying slots
703     */
704    int shift = 0;
705    *interp_mode = 0;
706    *ps_repl_mode = 0;
707    if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
708       if (compmask & 0x1) {
709          *ps_repl_mode |= PS_REPL_S << shift;
710          shift += 2;
711       }
712       if (compmask & 0x2) {
713          *ps_repl_mode |= PS_REPL_T << shift;
714          shift += 2;
715       }
716       if (compmask & 0x4) {
717          *interp_mode |= INTERP_ZERO << shift;
718          shift += 2;
719       }
720       if (compmask & 0x8) {
721          *interp_mode |= INTERP_ONE << 6;
722          shift += 2;
723       }
724    } else if (fs->inputs[index].slot == VARYING_SLOT_LAYER ||
725               fs->inputs[index].slot == VARYING_SLOT_VIEWPORT) {
726       /* If the last geometry shader doesn't statically write these, they're
727        * implicitly zero and the FS is supposed to read zero.
728        */
729       const gl_varying_slot slot = (gl_varying_slot) fs->inputs[index].slot;
730       if (ir3_find_output(last_shader, slot) < 0 &&
731           (compmask & 0x1)) {
732          *interp_mode |= INTERP_ZERO;
733       } else {
734          *interp_mode |= INTERP_FLAT;
735       }
736    } else if (fs->inputs[index].flat) {
737       for (int i = 0; i < 4; i++) {
738          if (compmask & (1 << i)) {
739             *interp_mode |= INTERP_FLAT << shift;
740             shift += 2;
741          }
742       }
743    }
744 
745    return util_bitcount(compmask) * 2;
746 }
747 
748 static void
749 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
750                            const struct ir3_shader_variant *fs,
751                            const struct ir3_shader_variant *last_shader)
752 {
753    uint32_t interp_modes[8] = { 0 };
754    uint32_t ps_repl_modes[8] = { 0 };
755    uint32_t interp_regs = 0;
756 
757    if (fs) {
758       for (int i = -1;
759            (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
760 
761          /* get the mode for input i */
762          uint8_t interp_mode;
763          uint8_t ps_repl_mode;
764          const int bits =
765             tu6_vpc_varying_mode(fs, last_shader, i, &interp_mode, &ps_repl_mode);
766 
767          /* OR the mode into the array */
768          const uint32_t inloc = fs->inputs[i].inloc * 2;
769          uint32_t n = inloc / 32;
770          uint32_t shift = inloc % 32;
771          interp_modes[n] |= interp_mode << shift;
772          ps_repl_modes[n] |= ps_repl_mode << shift;
773          if (shift + bits > 32) {
774             n++;
775             shift = 32 - shift;
776 
777             interp_modes[n] |= interp_mode >> shift;
778             ps_repl_modes[n] |= ps_repl_mode >> shift;
779          }
780          interp_regs = MAX2(interp_regs, n + 1);
781       }
782    }
783 
784    if (interp_regs) {
785       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), interp_regs);
786       tu_cs_emit_array(cs, interp_modes, interp_regs);
787 
788       tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), interp_regs);
789       tu_cs_emit_array(cs, ps_repl_modes, interp_regs);
790    }
791 }
792 
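/* Set up the linkage between the last geometry stage and the FS: build the
 * ir3_shader_linkage, append position/psize/layer/viewport/clip outputs,
 * program the SP/VPC/GRAS/PC output registers for whichever stage is last,
 * and emit streamout and varying interpolation state.
 */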
793 template <chip CHIP>
794 void
795 tu6_emit_vpc(struct tu_cs *cs,
796              const struct ir3_shader_variant *vs,
797              const struct ir3_shader_variant *hs,
798              const struct ir3_shader_variant *ds,
799              const struct ir3_shader_variant *gs,
800              const struct ir3_shader_variant *fs)
801 {
802    /* note: doesn't compile as static because of the array regs.. */
803    const struct reg_config {
804       uint16_t reg_sp_xs_out_reg;
805       uint16_t reg_sp_xs_vpc_dst_reg;
806       uint16_t reg_vpc_xs_pack;
807       uint16_t reg_vpc_xs_clip_cntl;
808       uint16_t reg_vpc_xs_clip_cntl_v2;
809       uint16_t reg_gras_xs_cl_cntl;
810       uint16_t reg_pc_xs_out_cntl;
811       uint16_t reg_sp_xs_primitive_cntl;
812       uint16_t reg_vpc_xs_layer_cntl;
813       uint16_t reg_vpc_xs_layer_cntl_v2;
814       uint16_t reg_gras_xs_layer_cntl;
815    } reg_config[] = {
816       [MESA_SHADER_VERTEX] = {
817          REG_A6XX_SP_VS_OUT_REG(0),
818          REG_A6XX_SP_VS_VPC_DST_REG(0),
819          REG_A6XX_VPC_VS_PACK,
820          REG_A6XX_VPC_VS_CLIP_CNTL,
821          REG_A6XX_VPC_VS_CLIP_CNTL_V2,
822          REG_A6XX_GRAS_VS_CL_CNTL,
823          REG_A6XX_PC_VS_OUT_CNTL,
824          REG_A6XX_SP_VS_PRIMITIVE_CNTL,
825          REG_A6XX_VPC_VS_LAYER_CNTL,
826          REG_A6XX_VPC_VS_LAYER_CNTL_V2,
827          REG_A6XX_GRAS_VS_LAYER_CNTL
828       },
829       [MESA_SHADER_TESS_CTRL] = {
830          0,
831          0,
832          0,
833          0,
834          0,
835          0,
836          REG_A6XX_PC_HS_OUT_CNTL,
837          0,
838          0,
839          0
840       },
841       [MESA_SHADER_TESS_EVAL] = {
842          REG_A6XX_SP_DS_OUT_REG(0),
843          REG_A6XX_SP_DS_VPC_DST_REG(0),
844          REG_A6XX_VPC_DS_PACK,
845          REG_A6XX_VPC_DS_CLIP_CNTL,
846          REG_A6XX_VPC_DS_CLIP_CNTL_V2,
847          REG_A6XX_GRAS_DS_CL_CNTL,
848          REG_A6XX_PC_DS_OUT_CNTL,
849          REG_A6XX_SP_DS_PRIMITIVE_CNTL,
850          REG_A6XX_VPC_DS_LAYER_CNTL,
851          REG_A6XX_VPC_DS_LAYER_CNTL_V2,
852          REG_A6XX_GRAS_DS_LAYER_CNTL
853       },
854       [MESA_SHADER_GEOMETRY] = {
855          REG_A6XX_SP_GS_OUT_REG(0),
856          REG_A6XX_SP_GS_VPC_DST_REG(0),
857          REG_A6XX_VPC_GS_PACK,
858          REG_A6XX_VPC_GS_CLIP_CNTL,
859          REG_A6XX_VPC_GS_CLIP_CNTL_V2,
860          REG_A6XX_GRAS_GS_CL_CNTL,
861          REG_A6XX_PC_GS_OUT_CNTL,
862          REG_A6XX_SP_GS_PRIMITIVE_CNTL,
863          REG_A6XX_VPC_GS_LAYER_CNTL,
864          REG_A6XX_VPC_GS_LAYER_CNTL_V2,
865          REG_A6XX_GRAS_GS_LAYER_CNTL
866       },
867    };
868 
869    const struct ir3_shader_variant *last_shader;
870    if (gs) {
871       last_shader = gs;
872    } else if (hs) {
873       last_shader = ds;
874    } else {
875       last_shader = vs;
876    }
877 
878    const struct reg_config *cfg = &reg_config[last_shader->type];
879 
880    struct ir3_shader_linkage linkage = {
881       .primid_loc = 0xff,
882       .clip0_loc = 0xff,
883       .clip1_loc = 0xff,
884    };
885    if (fs)
886       ir3_link_shaders(&linkage, last_shader, fs, true);
887 
888    if (last_shader->stream_output.num_outputs)
889       ir3_link_stream_out(&linkage, last_shader);
890 
891    /* a6xx finds position/pointsize at the end */
892    const uint32_t pointsize_regid =
893       ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
894    const uint32_t layer_regid =
895       ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
896    const uint32_t view_regid =
897       ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
898    const uint32_t clip0_regid =
899       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
900    const uint32_t clip1_regid =
901       ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
902    uint32_t flags_regid = gs ?
903       ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;
904 
905    uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;
906 
907    if (layer_regid != regid(63, 0)) {
908       layer_loc = linkage.max_loc;
909       ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
910    }
911 
912    if (view_regid != regid(63, 0)) {
913       view_loc = linkage.max_loc;
914       ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
915    }
916 
917    unsigned extra_pos = 0;
918 
919    for (unsigned i = 0; i < last_shader->outputs_count; i++) {
920       if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
921          continue;
922 
923       if (position_loc == 0xff)
924          position_loc = linkage.max_loc;
925 
926       ir3_link_add(&linkage, last_shader->outputs[i].slot,
927                    last_shader->outputs[i].regid,
928                    0xf, position_loc + 4 * last_shader->outputs[i].view);
929       extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
930    }
931 
932    if (pointsize_regid != regid(63, 0)) {
933       pointsize_loc = linkage.max_loc;
934       ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
935    }
936 
937    uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;
938 
939    /* Handle the case where clip/cull distances aren't read by the FS */
940    uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
941    if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
942       clip0_loc = linkage.max_loc;
943       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
944                    clip_cull_mask & 0xf, linkage.max_loc);
945    }
946    if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
947       clip1_loc = linkage.max_loc;
948       ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
949                    clip_cull_mask >> 4, linkage.max_loc);
950    }
951 
952    tu6_setup_streamout<CHIP>(cs, last_shader, &linkage);
953 
954    /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
955     * at least when a DS is the last stage, so add a dummy output to keep it
956     * happy if there aren't any. We do this late in order to avoid emitting
957     * any unused code and make sure that optimizations don't remove it.
958     */
959    if (linkage.cnt == 0)
960       ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);
961 
962    /* map outputs of the last shader to VPC */
963    assert(linkage.cnt <= 32);
964    const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
965    const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
966    uint32_t sp_out[16] = {0};
967    uint32_t sp_vpc_dst[8] = {0};
968    for (uint32_t i = 0; i < linkage.cnt; i++) {
969       ((uint16_t *) sp_out)[i] =
970          A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
971          A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
972       ((uint8_t *) sp_vpc_dst)[i] =
973          A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
974    }
975 
976    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
977    tu_cs_emit_array(cs, sp_out, sp_out_count);
978 
979    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
980    tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);
981 
982    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
983    tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
984                   A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
985                   A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
986                   A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));
987 
988    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
989    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
990                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
991                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
992    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl_v2, 1);
993    tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
994                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
995                   A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));
996 
997    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
998    tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
999                   A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));
1000 
1001    const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };
1002 
1003    for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
1004       const struct ir3_shader_variant *shader = geom_shaders[i];
1005       if (!shader)
1006          continue;
1007 
1008       bool primid = shader->type != MESA_SHADER_VERTEX &&
1009          VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));
1010 
1011       tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
1012       if (shader == last_shader) {
1013          tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
1014                         CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
1015                         CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
1016                         CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
1017                         COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
1018                         A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
1019       } else {
1020          tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
1021       }
1022    }
1023 
1024    /* if vertex_flags somehow gets optimized out, you're gonna have a bad time: */
1025    if (gs)
1026       assert(flags_regid != INVALID_REG);
1027 
1028    tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
1029    tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
1030                   A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));
1031 
1032    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
1033    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1034                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1035                   0xff0000);
1036    tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl_v2, 1);
1037    tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
1038                   A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc) |
1039                   0xff0000);
1040 
1041    tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
1042    tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
1043                   CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));
1044 
1045    tu6_emit_vpc_varying_modes(cs, fs, last_shader);
1046 }
1047 TU_GENX(tu6_emit_vpc);
1048 
1049 static void
1050 tu6_emit_vs_params(struct tu_cs *cs,
1051                    const struct ir3_const_state *const_state,
1052                    unsigned constlen,
1053                    unsigned param_stride,
1054                    unsigned num_vertices)
1055 {
1056    uint32_t vs_params[4] = {
1057       param_stride * num_vertices * 4,  /* vs primitive stride */
1058       param_stride * 4,                 /* vs vertex stride */
1059       0,
1060       0,
1061    };
1062    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1063                   const_state, constlen, SB6_VS_SHADER, 0,
1064                   ARRAY_SIZE(vs_params), vs_params);
1065 }
1066 
1067 static void
1068 tu_get_tess_iova(struct tu_device *dev,
1069                  uint64_t *tess_factor_iova,
1070                  uint64_t *tess_param_iova)
1071 {
1072    /* Create the shared tess factor BO the first time tess is used on the device. */
1073    if (!dev->tess_bo) {
1074       mtx_lock(&dev->mutex);
1075       if (!dev->tess_bo)
1076          tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS, "tess");
1077       mtx_unlock(&dev->mutex);
1078    }
1079 
1080    *tess_factor_iova = dev->tess_bo->iova;
1081    *tess_param_iova = dev->tess_bo->iova + TU_TESS_FACTOR_SIZE;
1082 }
1083 
1084 static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[] = {
1085    MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
1086 };
1087 
1088 #define HS_PARAMS_SIZE 8
1089 
1090 template <chip CHIP>
1091 static unsigned
1092 tu6_patch_control_points_size(struct tu_device *dev,
1093                               const struct tu_shader *vs,
1094                               const struct tu_shader *tcs,
1095                               const struct tu_shader *tes,
1096                               const struct tu_program_state *program,
1097                               uint32_t patch_control_points)
1098 {
1099    if (dev->physical_device->info->a7xx.load_shader_consts_via_preamble) {
1100 #define EMIT_CONST_DWORDS(const_dwords) (5 + const_dwords + 4)
1101       return EMIT_CONST_DWORDS(4) +
1102          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1103 #undef EMIT_CONST_DWORDS
1104    } else {
1105 #define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
1106       return EMIT_CONST_DWORDS(4) +
1107          EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
1108 #undef EMIT_CONST_DWORDS
1109    }
1110 }
1111 
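/* Emit the state that depends on the dynamic patch-control-points value:
 * VS/HS driver params (strides and tess buffer addresses), the HS input and
 * wave input sizes, and CP_SET_SUBDRAW_SIZE so that large tessellated draws
 * are split before the shared tess factor/param BOs overflow.
 */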
1112 template <chip CHIP>
1113 void
1114 tu6_emit_patch_control_points(struct tu_cs *cs,
1115                               const struct tu_shader *vs,
1116                               const struct tu_shader *tcs,
1117                               const struct tu_shader *tes,
1118                               const struct tu_program_state *program,
1119                               uint32_t patch_control_points)
1120 {
1121    if (!tcs->variant)
1122       return;
1123 
1124    struct tu_device *dev = cs->device;
1125 
1126    tu6_emit_vs_params(cs,
1127                       &program->link[MESA_SHADER_VERTEX].const_state,
1128                       program->link[MESA_SHADER_VERTEX].constlen,
1129                       vs->variant->output_size,
1130                       patch_control_points);
1131 
1132    uint64_t tess_factor_iova, tess_param_iova;
1133    tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1134 
1135    uint32_t hs_params[HS_PARAMS_SIZE] = {
1136       vs->variant->output_size * patch_control_points * 4,  /* hs primitive stride */
1137       vs->variant->output_size * 4,                         /* hs vertex stride */
1138       tcs->variant->output_size,
1139       patch_control_points,
1140       tess_param_iova,
1141       tess_param_iova >> 32,
1142       tess_factor_iova,
1143       tess_factor_iova >> 32,
1144    };
1145 
1146    const struct ir3_const_state *hs_const =
1147       &program->link[MESA_SHADER_TESS_CTRL].const_state;
1148    unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
1149    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1150                   hs_const, hs_constlen, SB6_HS_SHADER, 0,
1151                   ARRAY_SIZE(hs_params), hs_params);
1152 
1153    uint32_t patch_local_mem_size_16b =
1154       patch_control_points * vs->variant->output_size / 4;
1155 
1156    /* Total attribute slots in HS incoming patch. */
1157    tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
1158    tu_cs_emit(cs, patch_local_mem_size_16b);
1159 
1160    const uint32_t wavesize = 64;
1161    const uint32_t vs_hs_local_mem_size = 16384;
1162 
1163    uint32_t max_patches_per_wave;
1164    if (dev->physical_device->info->a6xx.tess_use_shared) {
1165       /* HS invocations for a patch are always within the same wave,
1166        * making barriers less expensive. VS can't have barriers so we
1167        * don't care about VS invocations being in the same wave.
1168        */
1169       max_patches_per_wave = wavesize / tcs->variant->tess.tcs_vertices_out;
1170    } else {
1171       /* VS is also in the same wave */
1172       max_patches_per_wave =
1173          wavesize / MAX2(patch_control_points,
1174                          tcs->variant->tess.tcs_vertices_out);
1175    }
1176 
1177    uint32_t patches_per_wave =
1178       MIN2(vs_hs_local_mem_size / (patch_local_mem_size_16b * 16),
1179            max_patches_per_wave);
1180 
1181    uint32_t wave_input_size = DIV_ROUND_UP(
1182       patches_per_wave * patch_local_mem_size_16b * 16, 256);
1183 
1184    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1185    tu_cs_emit(cs, wave_input_size);
1186 
1187    /* maximum number of patches that can fit in tess factor/param buffers */
1188    uint32_t subdraw_size = MIN2(TU_TESS_FACTOR_SIZE / ir3_tess_factor_stride(tes->variant->key.tessellation),
1189                         TU_TESS_PARAM_SIZE / (tcs->variant->output_size * 4));
1190    /* convert from # of patches to draw count */
1191    subdraw_size *= patch_control_points;
1192 
1193    tu_cs_emit_pkt7(cs, CP_SET_SUBDRAW_SIZE, 1);
1194    tu_cs_emit(cs, subdraw_size);
1195 }
1196 
1197 static void
1198 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1199                           const struct ir3_shader_variant *vs,
1200                           const struct ir3_shader_variant *hs,
1201                           const struct ir3_shader_variant *ds,
1202                           const struct ir3_shader_variant *gs)
1203 {
1204    struct tu_device *dev = cs->device;
1205 
1206    if (gs && !hs) {
1207       tu6_emit_vs_params(cs, ir3_const_state(vs), vs->constlen,
1208                          vs->output_size, gs->gs.vertices_in);
1209    }
1210 
1211    if (hs) {
1212       uint64_t tess_factor_iova, tess_param_iova;
1213       tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
1214 
1215       uint32_t ds_params[8] = {
1216          gs ? ds->output_size * gs->gs.vertices_in * 4 : 0,  /* ds primitive stride */
1217          ds->output_size * 4,                                /* ds vertex stride */
1218          hs->output_size,                                    /* hs vertex stride (dwords) */
1219          hs->tess.tcs_vertices_out,
1220          tess_param_iova,
1221          tess_param_iova >> 32,
1222          tess_factor_iova,
1223          tess_factor_iova >> 32,
1224       };
1225 
1226       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1227                      ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
1228                      ARRAY_SIZE(ds_params), ds_params);
1229    }
1230 
1231    if (gs) {
1232       const struct ir3_shader_variant *prev = ds ? ds : vs;
1233       uint32_t gs_params[4] = {
1234          prev->output_size * gs->gs.vertices_in * 4,  /* gs primitive stride */
1235          prev->output_size * 4,                 /* gs vertex stride */
1236          0,
1237          0,
1238       };
1239       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
1240                      gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
1241                      ARRAY_SIZE(gs_params), gs_params);
1242    }
1243 }
1244 
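/* Emit the program-level config shared by all stages: shared-const enable,
 * HLSQ state invalidation, per-stage xs_config and dynamic descriptor
 * offsets, the HS/DS/GS link maps, and the geometry/tess driver params.
 */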
1245 template <chip CHIP>
1246 static void
1247 tu6_emit_program_config(struct tu_cs *cs,
1248                         const struct tu_program_state *prog,
1249                         struct tu_shader **shaders,
1250                         const struct ir3_shader_variant **variants)
1251 {
1252    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1253 
1254    bool shared_consts_enable =
1255       prog->shared_consts.type == IR3_PUSH_CONSTS_SHARED;
1256    tu6_emit_shared_consts_enable<CHIP>(cs, shared_consts_enable);
1257 
1258    tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
1259          .vs_state = true,
1260          .hs_state = true,
1261          .ds_state = true,
1262          .gs_state = true,
1263          .fs_state = true,
1264          .gfx_ibo = true,
1265          .gfx_shared_const = shared_consts_enable));
1266    for (size_t stage_idx = MESA_SHADER_VERTEX;
1267         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1268       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1269       tu6_emit_xs_config<CHIP>(cs, stage, variants[stage]);
1270    }
1271 
1272    for (size_t stage_idx = MESA_SHADER_VERTEX;
1273         stage_idx <= MESA_SHADER_FRAGMENT; stage_idx++) {
1274       gl_shader_stage stage = (gl_shader_stage) stage_idx;
1275       tu6_emit_dynamic_offset(cs, variants[stage], shaders[stage], prog);
1276    }
1277 
1278    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
1279    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
1280    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
1281    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
1282 
1283    if (hs) {
1284       tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
1285       tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
1286    }
1287 
1288    if (gs) {
1289       if (hs) {
1290          tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
1291       } else {
1292          tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
1293       }
1294 
1295       uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;
1296 
1297       if (CHIP == A6XX) {
1298          /* Size of the per-primitive allocation in ldlw memory, in vec4s. */
1299          uint32_t vec4_size = gs->gs.vertices_in *
1300                               DIV_ROUND_UP(prev_stage_output_size, 4);
1301 
1302          tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
1303          tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));
1304       }
1305 
1306       uint32_t prim_size = prev_stage_output_size;
1307       if (prim_size > 64)
1308          prim_size = 64;
1309       else if (prim_size == 64)
1310          prim_size = 63;
1311       tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
1312       tu_cs_emit(cs, prim_size);
1313    }
1314 
1315    if (gs || hs) {
1316       tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs);
1317    }
1318 }
1319 
1320 static bool
1321 contains_all_shader_state(VkGraphicsPipelineLibraryFlagsEXT state)
1322 {
1323    return (state &
1324       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1325        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) ==
1326       (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1327        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT);
1328 }
1329 
1330 static bool
1331 pipeline_contains_all_shader_state(struct tu_pipeline *pipeline)
1332 {
1333    return pipeline->type == TU_PIPELINE_GRAPHICS ||
1334       pipeline->type == TU_PIPELINE_COMPUTE ||
1335       contains_all_shader_state(tu_pipeline_to_graphics_lib(pipeline)->state);
1336 }
1337 
1338 /* Return true if this pipeline contains all of the GPL stages listed but none
1339  * of the libraries it uses do, so this is "the first time" that all of them
1340  * are defined together. This is useful for state that needs to be combined
1341  * from multiple GPL stages.
1342  */
1343 
1344 static bool
1345 set_combined_state(struct tu_pipeline_builder *builder,
1346                    struct tu_pipeline *pipeline,
1347                    VkGraphicsPipelineLibraryFlagsEXT state)
1348 {
1349    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB &&
1350        (tu_pipeline_to_graphics_lib(pipeline)->state & state) != state)
1351       return false;
1352 
1353    for (unsigned i = 0; i < builder->num_libraries; i++) {
1354       if ((builder->libraries[i]->state & state) == state)
1355          return false;
1356    }
1357 
1358    return true;
1359 }
1360 
1361 #define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 2 + 1)
1362 
1363 static VkResult
1364 tu_pipeline_allocate_cs(struct tu_device *dev,
1365                         struct tu_pipeline *pipeline,
1366                         struct tu_pipeline_layout *layout,
1367                         struct tu_pipeline_builder *builder,
1368                         const struct ir3_shader_variant *compute)
1369 {
1370    uint32_t size = 1024;
1371 
1372    /* graphics case: */
1373    if (builder) {
1374       if (builder->state &
1375           VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT) {
1376          size += TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
1377       }
1378 
1379       if (set_combined_state(builder, pipeline,
1380                              VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
1381                              VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
1382          size += tu6_load_state_size(pipeline, layout);
1383       }
1384    } else {
1385       size += tu6_load_state_size(pipeline, layout);
1386    }
1387 
1388    /* Allocate the space for the pipeline out of the device's RO suballocator.
1389     *
1390     * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
1391     * BOs at exec time.
1392     *
1393     * The pipeline cache would seem like a natural place to stick the
1394     * suballocator, except that it is not guaranteed to outlive the pipelines
1395     * created from it, so you can't store any long-lived state there, and you
1396     * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
1397     * pipeline destroy isn't synchronized by the cache.
1398     */
1399    mtx_lock(&dev->pipeline_mutex);
1400    VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
1401                                           size * 4, 128);
1402    mtx_unlock(&dev->pipeline_mutex);
1403    if (result != VK_SUCCESS)
1404       return result;
1405 
1406    tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
1407 
1408    return VK_SUCCESS;
1409 }
1410 
1411 static void
1412 tu_append_executable(struct tu_pipeline *pipeline,
1413                      const struct ir3_shader_variant *variant,
1414                      char *nir_from_spirv)
1415 {
1416    struct tu_pipeline_executable exe = {
1417       .stage = variant->type,
1418       .stats = variant->info,
1419       .is_binning = variant->binning_pass,
1420       .nir_from_spirv = nir_from_spirv,
1421       .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
1422       .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
1423    };
1424 
1425    util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
1426 }
1427 
1428 static void
1429 tu_hash_stage(struct mesa_sha1 *ctx,
1430               const VkPipelineShaderStageCreateInfo *stage,
1431               const nir_shader *nir,
1432               const struct tu_shader_key *key)
1433 {
1434 
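   /* If NIR for this stage was retained (e.g. cloned from a pipeline
    * library), hash the serialized NIR itself; otherwise hash the shader
    * stage create info.
    */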
1435    if (nir) {
1436       struct blob blob;
1437       blob_init(&blob);
1438       nir_serialize(&blob, nir, true);
1439       _mesa_sha1_update(ctx, blob.data, blob.size);
1440       blob_finish(&blob);
1441    } else {
1442       unsigned char stage_hash[SHA1_DIGEST_LENGTH];
1443       vk_pipeline_hash_shader_stage(stage, NULL, stage_hash);
1444       _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
1445    }
1446    _mesa_sha1_update(ctx, key, sizeof(*key));
1447 }
1448 
1449 /* Hash flags which can affect ir3 shader compilation which aren't known until
1450  * logical device creation.
1451  */
1452 static void
1453 tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler)
1454 {
1455    _mesa_sha1_update(ctx, &compiler->options.robust_buffer_access2,
1456                      sizeof(compiler->options.robust_buffer_access2));
1457    _mesa_sha1_update(ctx, &ir3_shader_debug, sizeof(ir3_shader_debug));
1458 }
1459 
1460 static void
1461 tu_hash_shaders(unsigned char *hash,
1462                 const VkPipelineShaderStageCreateInfo **stages,
1463                 nir_shader *const *nir,
1464                 const struct tu_pipeline_layout *layout,
1465                 const struct tu_shader_key *keys,
1466                 VkGraphicsPipelineLibraryFlagsEXT state,
1467                 const struct ir3_compiler *compiler)
1468 {
1469    struct mesa_sha1 ctx;
1470 
1471    _mesa_sha1_init(&ctx);
1472 
1473    if (layout)
1474       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1475 
1476    for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
1477       if (stages[i] || nir[i]) {
1478          tu_hash_stage(&ctx, stages[i], nir[i], &keys[i]);
1479       }
1480    }
1481    _mesa_sha1_update(&ctx, &state, sizeof(state));
1482    tu_hash_compiler(&ctx, compiler);
1483    _mesa_sha1_final(&ctx, hash);
1484 }
1485 
1486 static void
1487 tu_hash_compute(unsigned char *hash,
1488                 const VkPipelineShaderStageCreateInfo *stage,
1489                 const struct tu_pipeline_layout *layout,
1490                 const struct tu_shader_key *key,
1491                 const struct ir3_compiler *compiler)
1492 {
1493    struct mesa_sha1 ctx;
1494 
1495    _mesa_sha1_init(&ctx);
1496 
1497    if (layout)
1498       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
1499 
1500    tu_hash_stage(&ctx, stage, NULL, key);
1501 
1502    tu_hash_compiler(&ctx, compiler);
1503    _mesa_sha1_final(&ctx, hash);
1504 }
1505 
1506 static struct tu_shader *
1507 tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
1508                          const void *key_data, size_t key_size,
1509                          bool *application_cache_hit)
1510 {
1511    struct vk_pipeline_cache_object *object =
1512       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1513                                       &tu_shader_ops, application_cache_hit);
1514    if (object)
1515       return container_of(object, struct tu_shader, base);
1516    else
1517       return NULL;
1518 }
1519 
1520 static struct tu_shader *
1521 tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
1522                          struct tu_shader *shader)
1523 {
1524    struct vk_pipeline_cache_object *object =
1525       vk_pipeline_cache_add_object(cache, &shader->base);
1526    return container_of(object, struct tu_shader, base);
1527 }
1528 
1529 static bool
1530 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1531                          struct blob *blob);
1532 
1533 static struct vk_pipeline_cache_object *
1534 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1535                            const void *key_data,
1536                            size_t key_size,
1537                            struct blob_reader *blob);
1538 
1539 static void
1540 tu_nir_shaders_destroy(struct vk_device *device,
1541                        struct vk_pipeline_cache_object *object)
1542 {
1543    struct tu_nir_shaders *shaders =
1544       container_of(object, struct tu_nir_shaders, base);
1545 
1546    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++)
1547       ralloc_free(shaders->nir[i]);
1548 
1549    vk_pipeline_cache_object_finish(&shaders->base);
1550    vk_free(&device->alloc, shaders);
1551 }
1552 
1553 const struct vk_pipeline_cache_object_ops tu_nir_shaders_ops = {
1554    .serialize = tu_nir_shaders_serialize,
1555    .deserialize = tu_nir_shaders_deserialize,
1556    .destroy = tu_nir_shaders_destroy,
1557 };
1558 
1559 static struct tu_nir_shaders *
1560 tu_nir_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
1561 {
1562    VK_MULTIALLOC(ma);
1563    VK_MULTIALLOC_DECL(&ma, struct tu_nir_shaders, shaders, 1);
1564    VK_MULTIALLOC_DECL_SIZE(&ma, char, obj_key_data, key_size);
1565 
1566    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
1567                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
1568       return NULL;
1569 
1570    memcpy(obj_key_data, key_data, key_size);
1571    vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
1572                                  &tu_nir_shaders_ops, obj_key_data, key_size);
1573 
1574    return shaders;
1575 }
1576 
1577 static bool
1578 tu_nir_shaders_serialize(struct vk_pipeline_cache_object *object,
1579                          struct blob *blob)
1580 {
1581    struct tu_nir_shaders *shaders =
1582       container_of(object, struct tu_nir_shaders, base);
1583 
1584    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1585       if (shaders->nir[i]) {
1586          blob_write_uint8(blob, 1);
1587          nir_serialize(blob, shaders->nir[i], true);
1588       } else {
1589          blob_write_uint8(blob, 0);
1590       }
1591    }
1592 
1593    return true;
1594 }
1595 
1596 static struct vk_pipeline_cache_object *
1597 tu_nir_shaders_deserialize(struct vk_pipeline_cache *cache,
1598                            const void *key_data,
1599                            size_t key_size,
1600                            struct blob_reader *blob)
1601 {
1602    struct tu_device *dev =
1603       container_of(cache->base.device, struct tu_device, vk);
1604    struct tu_nir_shaders *shaders =
1605       tu_nir_shaders_init(dev, key_data, key_size);
1606 
1607    if (!shaders)
1608       return NULL;
1609 
1610    for (unsigned i = 0; i < ARRAY_SIZE(shaders->nir); i++) {
1611       if (blob_read_uint8(blob)) {
1612          shaders->nir[i] =
1613             nir_deserialize(NULL, ir3_get_compiler_options(dev->compiler), blob);
1614       }
1615    }
1616 
1617    return &shaders->base;
1618 }
1619 
1620 static struct tu_nir_shaders *
1621 tu_nir_cache_lookup(struct vk_pipeline_cache *cache,
1622                     const void *key_data, size_t key_size,
1623                     bool *application_cache_hit)
1624 {
1625    struct vk_pipeline_cache_object *object =
1626       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
1627                                       &tu_nir_shaders_ops, application_cache_hit);
1628    if (object)
1629       return container_of(object, struct tu_nir_shaders, base);
1630    else
1631       return NULL;
1632 }
1633 
1634 static struct tu_nir_shaders *
1635 tu_nir_cache_insert(struct vk_pipeline_cache *cache,
1636                     struct tu_nir_shaders *shaders)
1637 {
1638    struct vk_pipeline_cache_object *object =
1639       vk_pipeline_cache_add_object(cache, &shaders->base);
1640    return container_of(object, struct tu_nir_shaders, base);
1641 }
1642 
1643 static VkResult
1644 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
1645                                     struct tu_pipeline *pipeline)
1646 {
1647    VkResult result = VK_SUCCESS;
1648    const struct ir3_compiler *compiler = builder->device->compiler;
1649    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
1650       NULL
1651    };
1652    VkPipelineCreationFeedback pipeline_feedback = {
1653       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
1654    };
1655    VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
1656 
1657    const bool executable_info =
1658       builder->create_flags &
1659       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
1660 
1661    bool retain_nir =
1662       builder->create_flags &
1663       VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT;
1664 
1665    int64_t pipeline_start = os_time_get_nano();
1666 
1667    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
1668       vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
1669 
1670    bool must_compile = false;
1671    for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
1672       if (!(builder->active_stages & builder->create_info->pStages[i].stage))
1673          continue;
1674 
1675       gl_shader_stage stage =
1676          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
1677       stage_infos[stage] = &builder->create_info->pStages[i];
1678       must_compile = true;
1679    }
1680 
1681    /* Forward declare everything due to the goto usage */
1682    nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
1683    struct tu_shader *shaders[ARRAY_SIZE(stage_infos)] = { NULL };
1684    nir_shader *post_link_nir[ARRAY_SIZE(nir)] = { NULL };
1685    char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
1686    bool cache_hit = false;
1687 
1688    struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
1689    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1690         stage < ARRAY_SIZE(keys); stage = (gl_shader_stage) (stage+1)) {
1691       const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info = NULL;
1692       if (stage_infos[stage])
1693          subgroup_info = vk_find_struct_const(stage_infos[stage],
1694                                               PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
1695       bool allow_varying_subgroup_size =
1696          !stage_infos[stage] ||
1697          (stage_infos[stage]->flags &
1698           VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
1699       bool require_full_subgroups =
1700          stage_infos[stage] &&
1701          (stage_infos[stage]->flags &
1702           VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT);
1703       tu_shader_key_subgroup_size(&keys[stage], allow_varying_subgroup_size,
1704                                   require_full_subgroups, subgroup_info,
1705                                   builder->device);
1706    }
1707 
1708    if (builder->create_flags &
1709        VK_PIPELINE_CREATE_2_LINK_TIME_OPTIMIZATION_BIT_EXT) {
1710       for (unsigned i = 0; i < builder->num_libraries; i++) {
1711          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1712 
1713          for (unsigned j = 0; j < ARRAY_SIZE(library->shaders); j++) {
1714             if (library->shaders[j].nir) {
1715                assert(!nir[j]);
1716                nir[j] = nir_shader_clone(builder->mem_ctx,
1717                      library->shaders[j].nir);
1718                keys[j] = library->shaders[j].key;
1719                must_compile = true;
1720             }
1721          }
1722       }
1723    }
1724 
1725    struct tu_nir_shaders *nir_shaders = NULL;
1726    if (!must_compile)
1727       goto done;
1728 
1729    if (builder->state &
1730        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1731       keys[MESA_SHADER_VERTEX].multiview_mask =
1732          builder->graphics_state.rp->view_mask;
1733    }
1734 
1735    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1736       keys[MESA_SHADER_FRAGMENT].multiview_mask =
1737          builder->graphics_state.rp->view_mask;
1738       keys[MESA_SHADER_FRAGMENT].fragment_density_map =
1739          builder->fragment_density_map;
1740       keys[MESA_SHADER_FRAGMENT].unscaled_input_fragcoord =
1741          builder->unscaled_input_fragcoord;
1742 
1743       const VkPipelineMultisampleStateCreateInfo *msaa_info =
1744          builder->create_info->pMultisampleState;
1745 
1746       /* The 1.3.215 spec says:
1747        *
1748        *    Sample shading can be used to specify a minimum number of unique
1749        *    samples to process for each fragment. If sample shading is enabled,
1750        *    an implementation must provide a minimum of
1751        *
1752        *       max(ceil(minSampleShadingFactor * totalSamples), 1)
1753        *
1754        *    unique associated data for each fragment, where
1755        *    minSampleShadingFactor is the minimum fraction of sample shading.
1756        *
1757        * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
1758        * They both require unique associated data.
1759        *
1760        * There are discussions to change the definition, such that
1761        * sampleShadingEnable does not imply unique associated data.  Before the
1762        * discussions are settled and before apps (i.e., ANGLE) are fixed to
1763        * follow the new and incompatible definition, we should stick to the
1764        * current definition.
1765        *
1766        * Note that ir3_shader_key::sample_shading is not actually used by ir3,
1767        * just checked in tu6_emit_fs_inputs.  We will also copy the value to
1768        * tu_shader_key::force_sample_interp in a bit.
1769        */
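      /* For example, with minSampleShading = 0.5 and 4 samples, at least
       * max(ceil(0.5 * 4), 1) = 2 unique sets of associated data must be
       * produced per fragment.
       */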
1770       keys[MESA_SHADER_FRAGMENT].force_sample_interp =
1771          !builder->rasterizer_discard && msaa_info && msaa_info->sampleShadingEnable;
1772    }
1773 
1774    unsigned char pipeline_sha1[20];
1775    tu_hash_shaders(pipeline_sha1, stage_infos, nir, &builder->layout, keys,
1776                    builder->state, compiler);
1777 
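   /* Cache keys are the 20-byte pipeline hash plus one discriminator byte:
    * 'N' for the retained-NIR cache object, and the shader stage index for
    * the per-stage shader objects looked up below.
    */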
1778    unsigned char nir_sha1[21];
1779    memcpy(nir_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1780    nir_sha1[20] = 'N';
1781 
1782    if (!executable_info) {
1783       cache_hit = true;
1784       bool application_cache_hit = true;
1785 
1786       unsigned char shader_sha1[21];
1787       memcpy(shader_sha1, pipeline_sha1, sizeof(pipeline_sha1));
1788 
1789       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1790            stage = (gl_shader_stage) (stage + 1)) {
1791          if (stage_infos[stage] || nir[stage]) {
1792             bool shader_application_cache_hit;
1793             shader_sha1[20] = (unsigned char) stage;
1794             shaders[stage] =
1795                tu_pipeline_cache_lookup(builder->cache, &shader_sha1,
1796                                         sizeof(shader_sha1),
1797                                         &shader_application_cache_hit);
1798             if (!shaders[stage]) {
1799                cache_hit = false;
1800                break;
1801             }
1802             application_cache_hit &= shader_application_cache_hit;
1803          }
1804       }
1805 
1806       /* If the user asks us to keep the NIR around, we need to have it for a
1807        * successful cache hit. If we only have a "partial" cache hit, then we
1808        * still need to recompile in order to get the NIR.
1809        */
1810       if (cache_hit &&
1811           (builder->create_flags &
1812            VK_PIPELINE_CREATE_2_RETAIN_LINK_TIME_OPTIMIZATION_INFO_BIT_EXT)) {
1813          bool nir_application_cache_hit = false;
1814          nir_shaders =
1815             tu_nir_cache_lookup(builder->cache, &nir_sha1,
1816                                 sizeof(nir_sha1),
1817                                 &nir_application_cache_hit);
1818 
1819          application_cache_hit &= nir_application_cache_hit;
1820          cache_hit &= !!nir_shaders;
1821       }
1822 
1823       if (application_cache_hit && builder->cache != builder->device->mem_cache) {
1824          pipeline_feedback.flags |=
1825             VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
1826       }
1827    }
1828 
1829    if (!cache_hit) {
1830       if (builder->create_flags &
1831           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
1832          return VK_PIPELINE_COMPILE_REQUIRED;
1833       }
1834 
1835       result = tu_compile_shaders(builder->device,
1836                                   stage_infos,
1837                                   nir,
1838                                   keys,
1839                                   &builder->layout,
1840                                   pipeline_sha1,
1841                                   shaders,
1842                                   executable_info ? nir_initial_disasm : NULL,
1843                                   pipeline->executables_mem_ctx,
1844                                   retain_nir ? post_link_nir : NULL,
1845                                   stage_feedbacks);
1846 
1847       if (result != VK_SUCCESS)
1848          goto fail;
1849 
1850       if (retain_nir) {
1851          nir_shaders =
1852             tu_nir_shaders_init(builder->device, &nir_sha1, sizeof(nir_sha1));
1853          for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1854               stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1855             if (!post_link_nir[stage])
1856                continue;
1857 
1858             nir_shaders->nir[stage] = post_link_nir[stage];
1859          }
1860 
1861          nir_shaders = tu_nir_cache_insert(builder->cache, nir_shaders);
1862       }
1863 
1864       for (gl_shader_stage stage = MESA_SHADER_VERTEX; stage < ARRAY_SIZE(nir);
1865            stage = (gl_shader_stage) (stage + 1)) {
1866          if (!nir[stage])
1867             continue;
1868 
1869          shaders[stage] = tu_pipeline_cache_insert(builder->cache, shaders[stage]);
1870       }
1871    }
1872 
1873 done:
1874 
1875    /* Create empty shaders which contain the draw states to initialize
1876     * registers for unused shader stages.
1877     */
1878    if (builder->state &
1879        VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) {
1880       if (!shaders[MESA_SHADER_TESS_CTRL]) {
1881          shaders[MESA_SHADER_TESS_CTRL] = builder->device->empty_tcs;
1882          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_CTRL]->base);
1883       }
1884       if (!shaders[MESA_SHADER_TESS_EVAL]) {
1885          shaders[MESA_SHADER_TESS_EVAL] = builder->device->empty_tes;
1886          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_TESS_EVAL]->base);
1887       }
1888       if (!shaders[MESA_SHADER_GEOMETRY]) {
1889          shaders[MESA_SHADER_GEOMETRY] = builder->device->empty_gs;
1890          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_GEOMETRY]->base);
1891       }
1892    }
1893 
1894    if (builder->state &
1895        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
1896       if (!shaders[MESA_SHADER_FRAGMENT]) {
1897          shaders[MESA_SHADER_FRAGMENT] =
1898             builder->fragment_density_map ?
1899             builder->device->empty_fs_fdm : builder->device->empty_fs;
1900          vk_pipeline_cache_object_ref(&shaders[MESA_SHADER_FRAGMENT]->base);
1901       }
1902    }
1903 
1904    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1905         stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1906       if (shaders[stage] && shaders[stage]->variant) {
1907          tu_append_executable(pipeline, shaders[stage]->variant,
1908                               nir_initial_disasm[stage]);
1909       }
1910    }
1911 
1912    /* We may have deduplicated a cache entry, in which case our original
1913     * post_link_nir may be gone.
1914     */
1915    if (nir_shaders) {
1916       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1917            stage < ARRAY_SIZE(nir); stage = (gl_shader_stage) (stage + 1)) {
1918          if (nir_shaders->nir[stage]) {
1919             post_link_nir[stage] = nir_shaders->nir[stage];
1920          }
1921       }
1922    }
1923 
1924    /* In the case where we're building a library without link-time
1925     * optimization but with sub-libraries that retain LTO info, we should
1926     * retain it ourselves in case another pipeline includes us with LTO.
1927     */
1928    for (unsigned i = 0; i < builder->num_libraries; i++) {
1929       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
1930       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1931            stage < ARRAY_SIZE(library->shaders);
1932            stage = (gl_shader_stage) (stage + 1)) {
1933          if (!post_link_nir[stage] && library->shaders[stage].nir) {
1934             post_link_nir[stage] = library->shaders[stage].nir;
1935             keys[stage] = library->shaders[stage].key;
1936          }
1937 
1938          if (!shaders[stage] && library->base.shaders[stage]) {
1939             shaders[stage] = library->base.shaders[stage];
1940             vk_pipeline_cache_object_ref(&shaders[stage]->base);
1941          }
1942       }
1943    }
1944 
1945    if (shaders[MESA_SHADER_VERTEX]) {
1946       const struct ir3_shader_variant *vs =
1947          shaders[MESA_SHADER_VERTEX]->variant;
1948 
1949       if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
1950          tu_append_executable(pipeline, vs->binning, NULL);
1951       }
1952    }
1953 
1954    if (pipeline_contains_all_shader_state(pipeline)) {
1955       /* It doesn't make much sense to use RETAIN_LINK_TIME_OPTIMIZATION_INFO
1956        * when compiling all stages, but make sure we don't leak.
1957        */
1958       if (nir_shaders)
1959          vk_pipeline_cache_object_unref(&builder->device->vk,
1960                                         &nir_shaders->base);
1961    } else {
1962       struct tu_graphics_lib_pipeline *library =
1963          tu_pipeline_to_graphics_lib(pipeline);
1964       library->nir_shaders = nir_shaders;
1965       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1966            stage < ARRAY_SIZE(library->shaders);
1967            stage = (gl_shader_stage) (stage + 1)) {
1968          library->shaders[stage].nir = post_link_nir[stage];
1969          library->shaders[stage].key = keys[stage];
1970       }
1971    }
1972 
1973    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
1974         stage < ARRAY_SIZE(shaders); stage = (gl_shader_stage) (stage + 1)) {
1975       pipeline->shaders[stage] = shaders[stage];
1976       if (shaders[stage])
1977          pipeline->active_desc_sets |= shaders[stage]->active_desc_sets;
1978    }
1979 
1980    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
1981    if (creation_feedback) {
1982       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
1983 
1984       for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
1985          gl_shader_stage s =
1986             vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
1987          creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
1988       }
1989    }
1990 
1991    return VK_SUCCESS;
1992 
1993 fail:
1994    if (nir_shaders)
1995       vk_pipeline_cache_object_unref(&builder->device->vk,
1996                                      &nir_shaders->base);
1997 
1998    return result;
1999 }
2000 
2001 static void
2002 tu_pipeline_builder_parse_libraries(struct tu_pipeline_builder *builder,
2003                                     struct tu_pipeline *pipeline)
2004 {
2005    const VkPipelineLibraryCreateInfoKHR *library_info =
2006       vk_find_struct_const(builder->create_info->pNext,
2007                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
2008 
2009    if (library_info) {
2010       assert(library_info->libraryCount <= MAX_LIBRARIES);
2011       builder->num_libraries = library_info->libraryCount;
2012       for (unsigned i = 0; i < library_info->libraryCount; i++) {
2013          TU_FROM_HANDLE(tu_pipeline, library, library_info->pLibraries[i]);
2014          builder->libraries[i] = tu_pipeline_to_graphics_lib(library);
2015       }
2016    }
2017 
2018    /* Merge in the state from libraries. The program state is a bit special
2019     * and is handled separately.
2020     */
2021    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2022       tu_pipeline_to_graphics_lib(pipeline)->state = builder->state;
2023    for (unsigned i = 0; i < builder->num_libraries; i++) {
2024       struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2025       if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB)
2026          tu_pipeline_to_graphics_lib(pipeline)->state |= library->state;
2027 
2028       if (library->state &
2029           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
2030          pipeline->output = library->base.output;
2031          pipeline->lrz_blend.reads_dest |= library->base.lrz_blend.reads_dest;
2032          pipeline->lrz_blend.valid |= library->base.lrz_blend.valid;
2033          pipeline->prim_order = library->base.prim_order;
2034       }
2035 
2036       if ((library->state &
2037            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) &&
2038           (library->state &
2039            VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
2040          pipeline->prim_order = library->base.prim_order;
2041       }
2042 
2043       pipeline->set_state_mask |= library->base.set_state_mask;
2044 
2045       u_foreach_bit (i, library->base.set_state_mask) {
2046          pipeline->dynamic_state[i] = library->base.dynamic_state[i];
2047       }
2048 
2049       if (contains_all_shader_state(library->state)) {
2050          pipeline->program = library->base.program;
2051          pipeline->load_state = library->base.load_state;
2052          for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
2053             if (library->base.shaders[i]) {
2054                pipeline->shaders[i] = library->base.shaders[i];
2055                vk_pipeline_cache_object_ref(&pipeline->shaders[i]->base);
2056             }
2057          }
2058       }
2059 
2060       vk_graphics_pipeline_state_merge(&builder->graphics_state,
2061                                        &library->graphics_state);
2062    }
2063 }
2064 
2065 static void
2066 tu_pipeline_builder_parse_layout(struct tu_pipeline_builder *builder,
2067                                  struct tu_pipeline *pipeline)
2068 {
2069    TU_FROM_HANDLE(tu_pipeline_layout, layout, builder->create_info->layout);
2070 
2071    if (layout) {
2072       /* Note: it's still valid to have a layout even if there are libraries.
2073        * This allows the app to e.g. overwrite an INDEPENDENT_SET layout with
2074        * a non-INDEPENDENT_SET layout, which may let us use a faster path.
2075        * Currently this only affects dynamic offset descriptors.
2076        */
2077       builder->layout = *layout;
2078    } else {
2079       for (unsigned i = 0; i < builder->num_libraries; i++) {
2080          struct tu_graphics_lib_pipeline *library = builder->libraries[i];
2081          builder->layout.num_sets = MAX2(builder->layout.num_sets,
2082                                          library->num_sets);
2083          assert(builder->layout.num_sets <= builder->device->physical_device->usable_sets);
2084          for (unsigned j = 0; j < library->num_sets; j++) {
2085             builder->layout.set[j].layout = library->layouts[j];
2086          }
2087 
2088          builder->layout.push_constant_size = library->push_constant_size;
2089       }
2090 
2091       tu_pipeline_layout_init(&builder->layout);
2092    }
2093 
2094    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
2095       struct tu_graphics_lib_pipeline *library =
2096          tu_pipeline_to_graphics_lib(pipeline);
2097       library->num_sets = builder->layout.num_sets;
2098       for (unsigned i = 0; i < library->num_sets; i++) {
2099          library->layouts[i] = builder->layout.set[i].layout;
2100          if (library->layouts[i])
2101             vk_descriptor_set_layout_ref(&library->layouts[i]->vk);
2102       }
2103       library->push_constant_size = builder->layout.push_constant_size;
2104    }
2105 }
2106 
2107 static void
2108 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
2109                         struct tu_const_state *const_state,
2110                         const struct ir3_shader_variant *v)
2111 {
2112    link->const_state = *ir3_const_state(v);
2113    link->tu_const_state = *const_state;
2114    link->constlen = v->constlen;
2115 }
2116 
2117 template <chip CHIP>
2118 static void
2119 tu_emit_program_state(struct tu_cs *sub_cs,
2120                       struct tu_program_state *prog,
2121                       struct tu_shader **shaders)
2122 {
2123    struct tu_device *dev = sub_cs->device;
2124    struct tu_cs prog_cs;
2125 
2126    const struct ir3_shader_variant *variants[MESA_SHADER_STAGES];
2127    struct tu_draw_state draw_states[MESA_SHADER_STAGES];
2128 
2129    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2130         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2131       variants[stage] = shaders[stage] ? shaders[stage]->variant : NULL;
2132    }
2133 
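   /* ir3_trim_constlen() returns a per-stage bitmask of variants whose
    * constlen had to be trimmed to fit the combined limits; those stages use
    * the safe-const variant and draw state below.
    */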
2134    uint32_t safe_variants =
2135       ir3_trim_constlen(variants, dev->compiler);
2136 
2137    unsigned dynamic_descriptor_sizes[MAX_SETS] = { };
2138 
2139    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2140         stage < ARRAY_SIZE(variants); stage = (gl_shader_stage) (stage+1)) {
2141       if (shaders[stage]) {
2142          if (safe_variants & (1u << stage)) {
2143             variants[stage] = shaders[stage]->safe_const_variant;
2144             draw_states[stage] = shaders[stage]->safe_const_state;
2145          } else {
2146             draw_states[stage] = shaders[stage]->state;
2147          }
2148 
2149          for (unsigned i = 0; i < MAX_SETS; i++) {
2150             if (shaders[stage]->dynamic_descriptor_sizes[i] >= 0) {
2151                dynamic_descriptor_sizes[i] =
2152                   shaders[stage]->dynamic_descriptor_sizes[i];
2153             }
2154          }
2155       }
2156    }
2157 
2158    for (unsigned i = 0; i < ARRAY_SIZE(variants); i++) {
2159       if (!variants[i])
2160          continue;
2161 
2162       tu_pipeline_set_linkage(&prog->link[i],
2163                               &shaders[i]->const_state,
2164                               variants[i]);
2165 
2166       struct tu_push_constant_range *push_consts =
2167          &shaders[i]->const_state.push_consts;
2168       if (push_consts->type == IR3_PUSH_CONSTS_SHARED ||
2169           push_consts->type == IR3_PUSH_CONSTS_SHARED_PREAMBLE) {
2170          prog->shared_consts = *push_consts;
2171       }
2172    }
2173 
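   /* Dynamic descriptors are laid out back-to-back per set: each set's
    * offset is the running sum of the previous sets' dynamic descriptor
    * sizes.
    */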
2174    unsigned dynamic_descriptor_offset = 0;
2175    for (unsigned i = 0; i < MAX_SETS; i++) {
2176       prog->dynamic_descriptor_offsets[i] = dynamic_descriptor_offset;
2177       dynamic_descriptor_offset += dynamic_descriptor_sizes[i];
2178    }
2179 
2180    /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
2181     * else that could depend on that state (like push constants)
2182     *
2183     * Note also that this always uses the full VS even in binning pass.  The
2184     * binning pass variant has the same const layout as the full VS, and
2185     * the constlen for the VS will be the same or greater than the constlen
2186     * for the binning pass variant.  It is required that the constlen state
2187     * matches between binning and draw passes, as some parts of the push
2188     * consts are emitted in state groups that are shared between the binning
2189     * and draw passes.
2190     */
2191    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2192    tu6_emit_program_config<CHIP>(&prog_cs, prog, shaders, variants);
2193    prog->config_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2194 
2195    prog->vs_state = draw_states[MESA_SHADER_VERTEX];
2196 
2197   /* Don't use the binning pass variant when GS is present because we don't
2198    * support compiling correct binning pass variants with GS.
2199    */
2200    if (variants[MESA_SHADER_GEOMETRY]) {
2201       prog->vs_binning_state = prog->vs_state;
2202    } else {
2203       prog->vs_binning_state =
2204          shaders[MESA_SHADER_VERTEX]->binning_state;
2205    }
2206 
2207    prog->hs_state = draw_states[MESA_SHADER_TESS_CTRL];
2208    prog->ds_state = draw_states[MESA_SHADER_TESS_EVAL];
2209    prog->gs_state = draw_states[MESA_SHADER_GEOMETRY];
2210    prog->gs_binning_state =
2211       shaders[MESA_SHADER_GEOMETRY]->binning_state;
2212    prog->fs_state = draw_states[MESA_SHADER_FRAGMENT];
2213 
2214    const struct ir3_shader_variant *vs = variants[MESA_SHADER_VERTEX];
2215    const struct ir3_shader_variant *hs = variants[MESA_SHADER_TESS_CTRL];
2216    const struct ir3_shader_variant *ds = variants[MESA_SHADER_TESS_EVAL];
2217    const struct ir3_shader_variant *gs = variants[MESA_SHADER_GEOMETRY];
2218    const struct ir3_shader_variant *fs = variants[MESA_SHADER_FRAGMENT];
2219 
2220    tu_cs_begin_sub_stream(sub_cs, 512, &prog_cs);
2221    tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
2222    prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
2223 
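   /* Find the last pre-rasterization stage; if it writes the viewport index
    * then per-view viewports cannot be used for FDM below.
    */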
2224    const struct ir3_shader_variant *last_shader;
2225    if (gs)
2226       last_shader = gs;
2227    else if (ds)
2228       last_shader = ds;
2229    else
2230       last_shader = vs;
2231 
2232    prog->per_view_viewport =
2233       !last_shader->writes_viewport &&
2234       shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm &&
2235       dev->physical_device->info->a6xx.has_per_view_viewport;
2236 }
2237 
2238 static const enum mesa_vk_dynamic_graphics_state tu_vertex_input_state[] = {
2239    MESA_VK_DYNAMIC_VI,
2240 };
2241 
2242 template <chip CHIP>
2243 static unsigned
2244 tu6_vertex_input_size(struct tu_device *dev,
2245                       const struct vk_vertex_input_state *vi)
2246 {
2247    return 1 + 2 * util_last_bit(vi->attributes_valid);
2248 }
2249 
2250 template <chip CHIP>
2251 static void
2252 tu6_emit_vertex_input(struct tu_cs *cs,
2253                       const struct vk_vertex_input_state *vi)
2254 {
2255    unsigned attr_count = util_last_bit(vi->attributes_valid);
2256    if (attr_count != 0)
2257       tu_cs_emit_pkt4(cs, REG_A6XX_VFD_DECODE_INSTR(0), attr_count * 2);
2258 
2259    for (uint32_t loc = 0; loc < attr_count; loc++) {
2260       const struct vk_vertex_attribute_state *attr = &vi->attributes[loc];
2261 
2262       if (vi->attributes_valid & (1u << loc)) {
2263          const struct vk_vertex_binding_state *binding =
2264             &vi->bindings[attr->binding];
2265 
2266          enum pipe_format pipe_format = vk_format_to_pipe_format(attr->format);
2267          const struct tu_native_format format = tu6_format_vtx(pipe_format);
2268          tu_cs_emit(cs, A6XX_VFD_DECODE_INSTR(0,
2269                           .idx = attr->binding,
2270                           .offset = attr->offset,
2271                           .instanced = binding->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2272                           .format = format.fmt,
2273                           .swap = format.swap,
2274                           .unk30 = 1,
2275                           ._float = !util_format_is_pure_integer(pipe_format)).value);
2276          tu_cs_emit(cs, A6XX_VFD_DECODE_STEP_RATE(0, binding->divisor).value);
2277       } else {
2278          tu_cs_emit(cs, 0);
2279          tu_cs_emit(cs, 0);
2280       }
2281    }
2282 }
2283 
2284 static const enum mesa_vk_dynamic_graphics_state tu_vertex_stride_state[] = {
2285    MESA_VK_DYNAMIC_VI_BINDINGS_VALID,
2286    MESA_VK_DYNAMIC_VI_BINDING_STRIDES,
2287 };
2288 
2289 template <chip CHIP>
2290 static unsigned
2291 tu6_vertex_stride_size(struct tu_device *dev,
2292                        const struct vk_vertex_input_state *vi)
2293 {
2294    return 1 + 2 * util_last_bit(vi->bindings_valid);
2295 }
2296 
2297 template <chip CHIP>
2298 static void
2299 tu6_emit_vertex_stride(struct tu_cs *cs, const struct vk_vertex_input_state *vi)
2300 {
2301    if (vi->bindings_valid) {
2302       unsigned bindings_count = util_last_bit(vi->bindings_valid);
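      /* CP_CONTEXT_REG_BUNCH takes (register offset, value) pairs, one pair
       * per vertex binding stride.
       */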
2303       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2304       for (unsigned i = 0; i < bindings_count; i++) {
2305          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2306          tu_cs_emit(cs, vi->bindings[i].stride);
2307       }
2308    }
2309 }
2310 
2311 template <chip CHIP>
2312 static unsigned
2313 tu6_vertex_stride_size_dyn(struct tu_device *dev,
2314                            const uint16_t *vi_binding_stride,
2315                            uint32_t bindings_valid)
2316 {
2317    return 1 + 2 * util_last_bit(bindings_valid);
2318 }
2319 
2320 template <chip CHIP>
2321 static void
2322 tu6_emit_vertex_stride_dyn(struct tu_cs *cs, const uint16_t *vi_binding_stride,
2323                            uint32_t bindings_valid)
2324 {
2325    if (bindings_valid) {
2326       unsigned bindings_count = util_last_bit(bindings_valid);
2327       tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 2 * bindings_count);
2328       for (unsigned i = 0; i < bindings_count; i++) {
2329          tu_cs_emit(cs, REG_A6XX_VFD_FETCH_STRIDE(i));
2330          tu_cs_emit(cs, vi_binding_stride[i]);
2331       }
2332    }
2333 }
2334 
2335 static const enum mesa_vk_dynamic_graphics_state tu_viewport_state[] = {
2336    MESA_VK_DYNAMIC_VP_VIEWPORTS,
2337    MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT,
2338    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
2339 };
2340 
2341 template <chip CHIP>
2342 static unsigned
2343 tu6_viewport_size(struct tu_device *dev, const struct vk_viewport_state *vp)
2344 {
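   /* A packet header plus 6 dwords per viewport (offset/scale), a header
    * plus 2 dwords per viewport scissor, a header plus 2 dwords per Z clamp,
    * and 5 dwords for the guardband and RB_Z_CLAMP registers.
    */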
2345    return 1 + vp->viewport_count * 6 + 1 + vp->viewport_count * 2 +
2346       1 + vp->viewport_count * 2 + 5;
2347 }
2348 
2349 template <chip CHIP>
2350 static void
2351 tu6_emit_viewport(struct tu_cs *cs, const struct vk_viewport_state *vp)
2352 {
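   /* Start from 511, presumably the largest value the
    * GRAS_CL_GUARDBAND_CLIP_ADJ fields can hold, and shrink it to the most
    * conservative guardband across all viewports below.
    */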
2353    VkExtent2D guardband = {511, 511};
2354 
2355    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), vp->viewport_count * 6);
2356    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2357       const VkViewport *viewport = &vp->viewports[i];
2358       float offsets[3];
2359       float scales[3];
2360       scales[0] = viewport->width / 2.0f;
2361       scales[1] = viewport->height / 2.0f;
2362       if (vp->depth_clip_negative_one_to_one) {
2363          scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
2364       } else {
2365          scales[2] = viewport->maxDepth - viewport->minDepth;
2366       }
2367 
2368       offsets[0] = viewport->x + scales[0];
2369       offsets[1] = viewport->y + scales[1];
2370       if (vp->depth_clip_negative_one_to_one) {
2371          offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
2372       } else {
2373          offsets[2] = viewport->minDepth;
2374       }
2375 
2376       for (uint32_t j = 0; j < 3; j++) {
2377          tu_cs_emit(cs, fui(offsets[j]));
2378          tu_cs_emit(cs, fui(scales[j]));
2379       }
2380 
2381       guardband.width =
2382          MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
2383       guardband.height =
2384          MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
2385    }
2386 
2387    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), vp->viewport_count * 2);
2388    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2389       const VkViewport *viewport = &vp->viewports[i];
2390       VkOffset2D min;
2391       VkOffset2D max;
2392       min.x = (int32_t) viewport->x;
2393       max.x = (int32_t) ceilf(viewport->x + viewport->width);
2394       if (viewport->height >= 0.0f) {
2395          min.y = (int32_t) viewport->y;
2396          max.y = (int32_t) ceilf(viewport->y + viewport->height);
2397       } else {
2398          min.y = (int32_t)(viewport->y + viewport->height);
2399          max.y = (int32_t) ceilf(viewport->y);
2400       }
2401       /* the spec allows viewport->height to be 0.0f */
2402       if (min.y == max.y)
2403          max.y++;
2404       /* allow viewport->width = 0.0f for un-initialized viewports: */
2405       if (min.x == max.x)
2406          max.x++;
2407 
2408       min.x = MAX2(min.x, 0);
2409       min.y = MAX2(min.y, 0);
2410       max.x = MAX2(max.x, 1);
2411       max.y = MAX2(max.y, 1);
2412 
2413       assert(min.x < max.x);
2414       assert(min.y < max.y);
2415 
2416       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
2417                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
2418       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
2419                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
2420    }
2421 
2422    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), vp->viewport_count * 2);
2423    for (uint32_t i = 0; i < vp->viewport_count; i++) {
2424       const VkViewport *viewport = &vp->viewports[i];
2425       tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
2426       tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
2427    }
2428    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
2429    tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
2430                   A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
2431 
2432    /* TODO: what to do about this with multiple viewports? */
2433    float z_clamp_min = vp->viewport_count ? MIN2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2434    float z_clamp_max = vp->viewport_count ? MAX2(vp->viewports[0].minDepth, vp->viewports[0].maxDepth) : 0;
2435 
2436    tu_cs_emit_regs(cs,
2437                    A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
2438                    A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
2439 }
2440 
2441 struct apply_viewport_state {
2442    struct vk_viewport_state vp;
2443    bool share_scale;
2444 };
2445 
2446 /* It's a hardware restriction that the window offset (i.e. bin.offset) must
2447  * be the same for all views. This means that GMEM coordinates cannot be a
2448  * simple scaling of framebuffer coordinates, because this would require us to
2449  * scale the window offset and the scale may be different per view. Instead we
2450  * have to apply a per-bin offset to the GMEM coordinate transform to make
2451  * sure that the window offset maps to itself. Specifically we need an offset
2452  * o to the transform:
2453  *
2454  * x' = s * x + o
2455  *
2456  * so that when we plug in the bin start b_s:
2457  *
2458  * b_s = s * b_s + o
2459  *
2460  * and we get:
2461  *
2462  * o = b_s - s * b_s
2463  *
2464  * We use this form exactly, because we know the bin offset is a multiple of
2465  * the frag area so s * b_s is an integer and we can compute an exact result
2466  * easily.
2467  */
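/* For example, with a 2x2 fragment area and bin.offset = (64, 32): s = 1/2,
 * so o = (64 - 64/2, 32 - 32/2) = (32, 16), and the bin start maps to itself:
 * (1/2) * 64 + 32 = 64 and (1/2) * 32 + 16 = 32.
 */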
2468 
2469 VkOffset2D
2470 tu_fdm_per_bin_offset(VkExtent2D frag_area, VkRect2D bin)
2471 {
2472    assert(bin.offset.x % frag_area.width == 0);
2473    assert(bin.offset.y % frag_area.height == 0);
2474 
2475    return (VkOffset2D) {
2476       bin.offset.x - bin.offset.x / frag_area.width,
2477       bin.offset.y - bin.offset.y / frag_area.height
2478    };
2479 }
2480 
2481 static void
2482 fdm_apply_viewports(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2483                     VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2484 {
2485    const struct apply_viewport_state *state =
2486       (const struct apply_viewport_state *)data;
2487 
2488    struct vk_viewport_state vp = state->vp;
2489 
2490    for (unsigned i = 0; i < state->vp.viewport_count; i++) {
2491       /* Note: If we're using shared scaling, the scale should already be the
2492        * same across all views, so we can pick any view. However, the number
2493        * of viewports and the number of views are not guaranteed to be the
2494        * same, so to be safe we pick the 0th view, which always exists.
2495        *
2496        * Conversely, if we're not using shared scaling then the rasterizer in
2497        * the original pipeline uses only the first viewport, so we need to
2498        * replicate it across all viewports.
2499        */
2500       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2501       VkViewport viewport =
2502          state->share_scale ? state->vp.viewports[i] : state->vp.viewports[0];
2503       if (frag_area.width == 1 && frag_area.height == 1) {
2504          vp.viewports[i] = viewport;
2505          continue;
2506       }
2507 
2508       float scale_x = (float) 1.0f / frag_area.width;
2509       float scale_y = (float) 1.0f / frag_area.height;
2510 
2511       vp.viewports[i].minDepth = viewport.minDepth;
2512       vp.viewports[i].maxDepth = viewport.maxDepth;
2513       vp.viewports[i].width = viewport.width * scale_x;
2514       vp.viewports[i].height = viewport.height * scale_y;
2515 
2516       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2517 
2518       vp.viewports[i].x = scale_x * viewport.x + offset.x;
2519       vp.viewports[i].y = scale_y * viewport.y + offset.y;
2520    }
2521 
2522    TU_CALLX(cs->device, tu6_emit_viewport)(cs, &vp);
2523 }
2524 
2525 static void
2526 tu6_emit_viewport_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2527                       const struct vk_viewport_state *vp)
2528 {
2529    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2530    struct apply_viewport_state state = {
2531       .vp = *vp,
2532       .share_scale = !cmd->state.per_view_viewport,
2533    };
2534    if (!state.share_scale)
2535       state.vp.viewport_count = num_views;
2536    unsigned size = TU_CALLX(cmd->device, tu6_viewport_size)(cmd->device, &state.vp);
2537    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2538    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_viewports, state);
2539 }
2540 
2541 static const enum mesa_vk_dynamic_graphics_state tu_scissor_state[] = {
2542    MESA_VK_DYNAMIC_VP_SCISSORS,
2543    MESA_VK_DYNAMIC_VP_SCISSOR_COUNT,
2544 };
2545 
2546 template <chip CHIP>
2547 static unsigned
2548 tu6_scissor_size(struct tu_device *dev, const struct vk_viewport_state *vp)
2549 {
2550    return 1 + vp->scissor_count * 2;
2551 }
2552 
2553 template <chip CHIP>
2554 void
2555 tu6_emit_scissor(struct tu_cs *cs, const struct vk_viewport_state *vp)
2556 {
2557    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), vp->scissor_count * 2);
2558 
2559    for (uint32_t i = 0; i < vp->scissor_count; i++) {
2560       const VkRect2D *scissor = &vp->scissors[i];
2561 
2562       uint32_t min_x = scissor->offset.x;
2563       uint32_t min_y = scissor->offset.y;
2564       uint32_t max_x = min_x + scissor->extent.width - 1;
2565       uint32_t max_y = min_y + scissor->extent.height - 1;
2566 
2567       if (!scissor->extent.width || !scissor->extent.height) {
2568          min_x = min_y = 1;
2569          max_x = max_y = 0;
2570       } else {
2571          /* avoid overflow */
2572          uint32_t scissor_max = BITFIELD_MASK(15);
2573          min_x = MIN2(scissor_max, min_x);
2574          min_y = MIN2(scissor_max, min_y);
2575          max_x = MIN2(scissor_max, max_x);
2576          max_y = MIN2(scissor_max, max_y);
2577       }
2578 
2579       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
2580                      A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
2581       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
2582                      A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
2583    }
2584 }
2585 
2586 static void
2587 fdm_apply_scissors(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
2588                    VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
2589 {
2590    const struct apply_viewport_state *state =
2591       (const struct apply_viewport_state *)data;
2592 
2593    struct vk_viewport_state vp = state->vp;
2594 
2595    for (unsigned i = 0; i < vp.scissor_count; i++) {
2596       VkExtent2D frag_area = state->share_scale ? frag_areas[0] : frag_areas[i];
2597       VkRect2D scissor =
2598          state->share_scale ? state->vp.scissors[i] : state->vp.scissors[0];
2599       if (frag_area.width == 1 && frag_area.height == 1) {
2600          vp.scissors[i] = scissor;
2601          continue;
2602       }
2603 
2604       /* Transform the scissor following the viewport. It's unclear how this
2605        * is supposed to handle cases where the scissor isn't aligned to the
2606        * fragment area, but we round outwards so that partial fragments are
2607        * always rendered when the scissor size equals the framebuffer size
2608        * but isn't aligned to the fragment area.
2609        */
2610       VkOffset2D offset = tu_fdm_per_bin_offset(frag_area, bin);
2611       VkOffset2D min = {
2612          scissor.offset.x / frag_area.width + offset.x,
2613          scissor.offset.y / frag_area.height + offset.y,
2614       };
2615       VkOffset2D max = {
2616          DIV_ROUND_UP(scissor.offset.x + scissor.extent.width, frag_area.width) + offset.x,
2617          DIV_ROUND_UP(scissor.offset.y + scissor.extent.height, frag_area.height) + offset.y,
2618       };
2619 
2620       /* Intersect scissor with the scaled bin, this essentially replaces the
2621        * window scissor.
2622        */
2623       uint32_t scaled_width = bin.extent.width / frag_area.width;
2624       uint32_t scaled_height = bin.extent.height / frag_area.height;
2625       vp.scissors[i].offset.x = MAX2(min.x, bin.offset.x);
2626       vp.scissors[i].offset.y = MAX2(min.y, bin.offset.y);
2627       vp.scissors[i].extent.width =
2628          MIN2(max.x, bin.offset.x + scaled_width) - vp.scissors[i].offset.x;
2629       vp.scissors[i].extent.height =
2630          MIN2(max.y, bin.offset.y + scaled_height) - vp.scissors[i].offset.y;
2631    }
2632 
2633    TU_CALLX(cs->device, tu6_emit_scissor)(cs, &vp);
2634 }
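/* Worked example (illustrative, numbers invented): with frag_area = 2x2 and a
 * scissor of offset (100, 50), extent 120x60, the scaled bounds become
 * min = (100/2, 50/2) = (50, 25) and
 * max = (DIV_ROUND_UP(220, 2), DIV_ROUND_UP(110, 2)) = (110, 55),
 * both shifted by the per-bin offset from tu_fdm_per_bin_offset() (not derived
 * here). The result is then intersected with the bin scaled by the same
 * fragment area.
 */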
2635 
2636 static void
2637 tu6_emit_scissor_fdm(struct tu_cs *cs, struct tu_cmd_buffer *cmd,
2638                      const struct vk_viewport_state *vp)
2639 {
2640    unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
2641    struct apply_viewport_state state = {
2642       .vp = *vp,
2643       .share_scale = !cmd->state.per_view_viewport,
2644    };
2645    if (!state.share_scale)
2646       state.vp.scissor_count = num_views;
2647    unsigned size = TU_CALLX(cmd->device, tu6_scissor_size)(cmd->device, &state.vp);
2648    tu_cs_begin_sub_stream(&cmd->sub_cs, size, cs);
2649    tu_create_fdm_bin_patchpoint(cmd, cs, size, fdm_apply_scissors, state);
2650 }
2651 
2652 static const enum mesa_vk_dynamic_graphics_state tu_sample_locations_state[] = {
2653    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS_ENABLE,
2654    MESA_VK_DYNAMIC_MS_SAMPLE_LOCATIONS,
2655 };
2656 
2657 template <chip CHIP>
2658 static unsigned
2659 tu6_sample_locations_size(struct tu_device *dev, bool enable,
2660                           const struct vk_sample_locations_state *samp_loc)
2661 {
2662    return 6 + (enable ? 6 : 0);
2663 }
2664 
2665 template <chip CHIP>
2666 void
2667 tu6_emit_sample_locations(struct tu_cs *cs, bool enable,
2668                           const struct vk_sample_locations_state *samp_loc)
2669 {
2670    uint32_t sample_config =
2671       COND(enable, A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE);
2672 
2673    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
2674    tu_cs_emit(cs, sample_config);
2675 
2676    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
2677    tu_cs_emit(cs, sample_config);
2678 
2679    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
2680    tu_cs_emit(cs, sample_config);
2681 
2682    if (!enable)
2683       return;
2684 
2685    assert(samp_loc->grid_size.width == 1);
2686    assert(samp_loc->grid_size.height == 1);
2687 
2688    uint32_t sample_locations = 0;
2689    for (uint32_t i = 0; i < samp_loc->per_pixel; i++) {
2690       /* From VkSampleLocationEXT:
2691        *
2692        *    The values specified in a VkSampleLocationEXT structure are always
2693        *    clamped to the implementation-dependent sample location coordinate
2694        *    range
2695        *    [sampleLocationCoordinateRange[0],sampleLocationCoordinateRange[1]]
2696        */
2697       float x = CLAMP(samp_loc->locations[i].x, SAMPLE_LOCATION_MIN,
2698                       SAMPLE_LOCATION_MAX);
2699       float y = CLAMP(samp_loc->locations[i].y, SAMPLE_LOCATION_MIN,
2700                       SAMPLE_LOCATION_MAX);
2701 
2702       sample_locations |=
2703          (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(x) |
2704           A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(y)) << i*8;
2705    }
2706 
2707    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_LOCATION_0, 1);
2708    tu_cs_emit(cs, sample_locations);
2709 
2710    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_LOCATION_0, 1);
2711    tu_cs_emit(cs, sample_locations);
2712 
2713    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_LOCATION_0, 1);
2714    tu_cs_emit(cs, sample_locations);
2715 }
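/* Packing sketch (illustrative): judging from the i*8 shift above, each sample
 * contributes a 4-bit X and a 4-bit Y fixed-point location, so sample i
 * occupies bits [i*8, i*8+7] of the 32-bit *_SAMPLE_LOCATION_0 register and up
 * to four per-pixel samples fit in one register. E.g. the standard 1xMSAA
 * location (0.5, 0.5) lands in the low byte; the exact fixed-point encoding is
 * handled by the A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_* packing macros and is
 * assumed here.
 */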
2716 
2717 static const enum mesa_vk_dynamic_graphics_state tu_depth_bias_state[] = {
2718    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS,
2719 };
2720 
2721 template <chip CHIP>
2722 static unsigned
2723 tu6_depth_bias_size(struct tu_device *dev,
2724                     const struct vk_rasterization_state *rs)
2725 {
2726    return 4;
2727 }
2728 
2729 template <chip CHIP>
2730 void
2731 tu6_emit_depth_bias(struct tu_cs *cs, const struct vk_rasterization_state *rs)
2732 {
2733    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2734    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(rs->depth_bias.slope).value);
2735    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(rs->depth_bias.constant).value);
2736    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(rs->depth_bias.clamp).value);
2737 }
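/* Editorial note: each tu6_*_size() helper must return at least as many dwords
 * as the matching tu6_emit_*() writes, since the sub-stream is sized from it.
 * For depth bias that is 4: one type-4 packet header plus the three
 * GRAS_SU_POLY_OFFSET_* payload dwords emitted above.
 */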
2738 
2739 static const enum mesa_vk_dynamic_graphics_state tu_bandwidth_state[] = {
2740    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2741    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2742    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2743    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2744    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2745    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2746 };
2747 
2748 static void
2749 tu_calc_bandwidth(struct tu_bandwidth *bandwidth,
2750                   const struct vk_color_blend_state *cb,
2751                   const struct vk_render_pass_state *rp)
2752 {
2753    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2754 
2755    uint32_t total_bpp = 0;
2756    for (unsigned i = 0; i < cb->attachment_count; i++) {
2757       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2758       if (!(cb->color_write_enables & (1u << i)))
2759          continue;
2760 
2761       const VkFormat format = rp->color_attachment_formats[i];
2762 
2763       uint32_t write_bpp = 0;
2764       if (format == VK_FORMAT_UNDEFINED) {
2765          /* do nothing */
2766       } else if (att->write_mask == 0xf) {
2767          write_bpp = vk_format_get_blocksizebits(format);
2768       } else {
2769          const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
2770          for (uint32_t c = 0; c < 4; c++) {
2771             if (att->write_mask & (1 << c)) {
2772                write_bpp += util_format_get_component_bits(pipe_format,
2773                      UTIL_FORMAT_COLORSPACE_RGB, c);
2774             }
2775          }
2776       }
2777       total_bpp += write_bpp;
2778 
2779       if (rop_reads_dst || att->blend_enable) {
2780          total_bpp += write_bpp;
2781       }
2782    }
2783 
2784    bandwidth->color_bandwidth_per_sample = total_bpp / 8;
2785 
2786    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
2787       bandwidth->depth_cpp_per_sample = util_format_get_component_bits(
2788             vk_format_to_pipe_format(rp->depth_attachment_format),
2789             UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
2790    }
2791 
2792    if (rp->attachments & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
2793       bandwidth->stencil_cpp_per_sample = util_format_get_component_bits(
2794             vk_format_to_pipe_format(rp->stencil_attachment_format),
2795             UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
2796    }
2797 }
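/* Worked example (illustrative): one VK_FORMAT_R8G8B8A8_UNORM attachment with
 * the full 0xf write mask and blending enabled contributes 32 bpp written plus
 * 32 bpp read back, so color_bandwidth_per_sample = 64 / 8 = 8 bytes. With a
 * D24_UNORM_S8_UINT depth/stencil attachment, depth_cpp_per_sample = 24 / 8 = 3
 * and stencil_cpp_per_sample = 8 / 8 = 1.
 */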
2798 
2799 /* Return true if the blend state reads the color attachments. */
2800 static bool
2801 tu6_calc_blend_lrz(const struct vk_color_blend_state *cb,
2802                    const struct vk_render_pass_state *rp)
2803 {
2804    if (cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op))
2805       return true;
2806 
2807    for (unsigned i = 0; i < cb->attachment_count; i++) {
2808       if (rp->color_attachment_formats[i] == VK_FORMAT_UNDEFINED)
2809          continue;
2810 
2811       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2812       if (att->blend_enable)
2813          return true;
2814       if (!(cb->color_write_enables & (1u << i)))
2815          return true;
2816       unsigned mask =
2817          MASK(vk_format_get_nr_components(rp->color_attachment_formats[i]));
2818       if ((att->write_mask & mask) != mask)
2819          return true;
2820    }
2821 
2822    return false;
2823 }
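/* Example (illustrative): an RGBA8 attachment written with write_mask = 0x7
 * (alpha masked off) reads the destination, because MASK(4) = 0xf and
 * 0x7 != 0xf; the same write mask on an R8 attachment does not, since only the
 * single R component exists and (0x7 & 0x1) == 0x1 already covers it (assuming
 * no blending or dst-reading logic op and writes enabled).
 */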
2824 
2825 static const enum mesa_vk_dynamic_graphics_state tu_blend_lrz_state[] = {
2826    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2827    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2828    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2829    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2830    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2831    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2832 };
2833 
2834 static void
2835 tu_emit_blend_lrz(struct tu_lrz_blend *lrz,
2836                   const struct vk_color_blend_state *cb,
2837                   const struct vk_render_pass_state *rp)
2838 {
2839    lrz->reads_dest = tu6_calc_blend_lrz(cb, rp);
2840    lrz->valid = true;
2841 }
2842 
2843 static const enum mesa_vk_dynamic_graphics_state tu_blend_state[] = {
2844    MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE,
2845    MESA_VK_DYNAMIC_CB_LOGIC_OP,
2846    MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT,
2847    MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES,
2848    MESA_VK_DYNAMIC_CB_BLEND_ENABLES,
2849    MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS,
2850    MESA_VK_DYNAMIC_CB_WRITE_MASKS,
2851    MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE,
2852    MESA_VK_DYNAMIC_MS_ALPHA_TO_ONE_ENABLE,
2853    MESA_VK_DYNAMIC_MS_SAMPLE_MASK,
2854 };
2855 
2856 template <chip CHIP>
2857 static unsigned
2858 tu6_blend_size(struct tu_device *dev,
2859                const struct vk_color_blend_state *cb,
2860                bool alpha_to_coverage_enable,
2861                bool alpha_to_one_enable,
2862                uint32_t sample_mask)
2863 {
2864    unsigned num_rts = alpha_to_coverage_enable ?
2865       MAX2(cb->attachment_count, 1) : cb->attachment_count;
2866    return 8 + 3 * num_rts;
2867 }
2868 
2869 template <chip CHIP>
2870 static void
2871 tu6_emit_blend(struct tu_cs *cs,
2872                const struct vk_color_blend_state *cb,
2873                bool alpha_to_coverage_enable,
2874                bool alpha_to_one_enable,
2875                uint32_t sample_mask)
2876 {
2877    bool rop_reads_dst = cb->logic_op_enable && tu_logic_op_reads_dst((VkLogicOp)cb->logic_op);
2878    enum a3xx_rop_code rop = tu6_rop((VkLogicOp)cb->logic_op);
2879 
2880    uint32_t blend_enable_mask = 0;
2881    for (unsigned i = 0; i < cb->attachment_count; i++) {
2882       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2883       if (!(cb->color_write_enables & (1u << i)))
2884          continue;
2885 
2886       if (rop_reads_dst || att->blend_enable) {
2887          blend_enable_mask |= 1u << i;
2888       }
2889    }
2890 
2891    /* This will emit a dummy RB_MRT_*_CONTROL below if alpha-to-coverage is
2892     * enabled but there are no color attachments, in addition to changing
2893     * *_FS_OUTPUT_CNTL1.
2894     */
2895    unsigned num_rts = alpha_to_coverage_enable ?
2896       MAX2(cb->attachment_count, 1) : cb->attachment_count;
2897 
2898    bool dual_src_blend = tu_blend_state_is_dual_src(cb);
2899 
2900    tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = num_rts));
2901    tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = num_rts));
2902    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
2903                                           .unk8 = true,
2904                                           .dual_color_in_enable =
2905                                              dual_src_blend,
2906                                           .alpha_to_coverage =
2907                                              alpha_to_coverage_enable));
2908    /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
2909    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
2910                                           .independent_blend = true,
2911                                           .dual_color_in_enable =
2912                                              dual_src_blend,
2913                                           .alpha_to_coverage =
2914                                              alpha_to_coverage_enable,
2915                                           .alpha_to_one = alpha_to_one_enable,
2916                                           .sample_mask = sample_mask));
2917 
2918    for (unsigned i = 0; i < num_rts; i++) {
2919       const struct vk_color_blend_attachment_state *att = &cb->attachments[i];
2920       if ((cb->color_write_enables & (1u << i)) && i < cb->attachment_count) {
2921          const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->color_blend_op);
2922          const enum adreno_rb_blend_factor src_color_factor =
2923             tu6_blend_factor((VkBlendFactor)att->src_color_blend_factor);
2924          const enum adreno_rb_blend_factor dst_color_factor =
2925             tu6_blend_factor((VkBlendFactor)att->dst_color_blend_factor);
2926          const enum a3xx_rb_blend_opcode alpha_op =
2927             tu6_blend_op(att->alpha_blend_op);
2928          const enum adreno_rb_blend_factor src_alpha_factor =
2929             tu6_blend_factor((VkBlendFactor)att->src_alpha_blend_factor);
2930          const enum adreno_rb_blend_factor dst_alpha_factor =
2931             tu6_blend_factor((VkBlendFactor)att->dst_alpha_blend_factor);
2932 
2933          tu_cs_emit_regs(cs,
2934                          A6XX_RB_MRT_CONTROL(i,
2935                                              .blend = att->blend_enable,
2936                                              .blend2 = att->blend_enable,
2937                                              .rop_enable = cb->logic_op_enable,
2938                                              .rop_code = rop,
2939                                              .component_enable = att->write_mask),
2940                          A6XX_RB_MRT_BLEND_CONTROL(i,
2941                                                    .rgb_src_factor = src_color_factor,
2942                                                    .rgb_blend_opcode = color_op,
2943                                                    .rgb_dest_factor = dst_color_factor,
2944                                                    .alpha_src_factor = src_alpha_factor,
2945                                                    .alpha_blend_opcode = alpha_op,
2946                                                    .alpha_dest_factor = dst_alpha_factor));
2947       } else {
2948          tu_cs_emit_regs(cs,
2949                          A6XX_RB_MRT_CONTROL(i,),
2950                          A6XX_RB_MRT_BLEND_CONTROL(i,));
2951       }
2952    }
2953 }
2954 
2955 static const enum mesa_vk_dynamic_graphics_state tu_blend_constants_state[] = {
2956    MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS,
2957 };
2958 
2959 template <chip CHIP>
2960 static unsigned
2961 tu6_blend_constants_size(struct tu_device *dev,
2962                          const struct vk_color_blend_state *cb)
2963 {
2964    return 5;
2965 }
2966 
2967 template <chip CHIP>
2968 static void
2969 tu6_emit_blend_constants(struct tu_cs *cs, const struct vk_color_blend_state *cb)
2970 {
2971    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLEND_RED_F32, 4);
2972    tu_cs_emit_array(cs, (const uint32_t *) cb->blend_constants, 4);
2973 }
2974 
2975 static const enum mesa_vk_dynamic_graphics_state tu_rast_state[] = {
2976    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
2977    MESA_VK_DYNAMIC_RS_DEPTH_CLIP_ENABLE,
2978    MESA_VK_DYNAMIC_RS_POLYGON_MODE,
2979    MESA_VK_DYNAMIC_RS_CULL_MODE,
2980    MESA_VK_DYNAMIC_RS_FRONT_FACE,
2981    MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE,
2982    MESA_VK_DYNAMIC_RS_LINE_MODE,
2983    MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE,
2984    MESA_VK_DYNAMIC_RS_RASTERIZATION_STREAM,
2985    MESA_VK_DYNAMIC_VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE,
2986 };
2987 
2988 template <chip CHIP>
2989 uint32_t
2990 tu6_rast_size(struct tu_device *dev,
2991               const struct vk_rasterization_state *rs,
2992               const struct vk_viewport_state *vp,
2993               bool multiview,
2994               bool per_view_viewport)
2995 {
2996    if (CHIP == A6XX) {
2997       return 15 + (dev->physical_device->info->a6xx.has_shading_rate ? 8 : 0);
2998    } else {
2999       return 17;
3000    }
3001 }
3002 
3003 template <chip CHIP>
3004 void
3005 tu6_emit_rast(struct tu_cs *cs,
3006               const struct vk_rasterization_state *rs,
3007               const struct vk_viewport_state *vp,
3008               bool multiview,
3009               bool per_view_viewport)
3010 {
3011    enum a5xx_line_mode line_mode =
3012       rs->line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT ?
3013       BRESENHAM : RECTANGULAR;
3014    tu_cs_emit_regs(cs,
3015                    A6XX_GRAS_SU_CNTL(
3016                      .cull_front = rs->cull_mode & VK_CULL_MODE_FRONT_BIT,
3017                      .cull_back = rs->cull_mode & VK_CULL_MODE_BACK_BIT,
3018                      .front_cw = rs->front_face == VK_FRONT_FACE_CLOCKWISE,
3019                      .linehalfwidth = rs->line.width / 2.0f,
3020                      .poly_offset = rs->depth_bias.enable,
3021                      .line_mode = line_mode,
3022                      .multiview_enable = multiview,
3023                      .rendertargetindexincr = multiview,
3024                      .viewportindexincr = multiview && per_view_viewport));
3025 
3026    bool depth_clip_enable = vk_rasterization_state_depth_clip_enable(rs);
3027 
3028    tu_cs_emit_regs(cs,
3029                    A6XX_GRAS_CL_CNTL(
3030                      .znear_clip_disable = !depth_clip_enable,
3031                      .zfar_clip_disable = !depth_clip_enable,
3032                      .z_clamp_enable = rs->depth_clamp_enable,
3033                      .zero_gb_scale_z = vp->depth_clip_negative_one_to_one ? 0 : 1,
3034                      .vp_clip_code_ignore = 1));
3035 
3036    enum a6xx_polygon_mode polygon_mode = tu6_polygon_mode(rs->polygon_mode);
3037 
3038    tu_cs_emit_regs(cs,
3039                    A6XX_VPC_POLYGON_MODE(polygon_mode));
3040 
3041    tu_cs_emit_regs(cs,
3042                    PC_POLYGON_MODE(CHIP, polygon_mode));
3043 
3044    if (CHIP == A7XX) {
3045       tu_cs_emit_regs(cs,
3046                      A7XX_VPC_POLYGON_MODE2(polygon_mode));
3047    }
3048 
3049    tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP,
3050       .stream = rs->rasterization_stream,
3051       .discard = rs->rasterizer_discard_enable));
3052    if (CHIP == A6XX) {
3053       tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107(
3054          .raster_discard = rs->rasterizer_discard_enable));
3055    } else {
3056       tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2(
3057          .stream = rs->rasterization_stream,
3058          .discard = rs->rasterizer_discard_enable));
3059    }
3060 
3061    /* move to hw ctx init? */
3062    tu_cs_emit_regs(cs,
3063                    A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
3064                    A6XX_GRAS_SU_POINT_SIZE(1.0f));
3065 
3066    if (CHIP == A6XX && cs->device->physical_device->info->a6xx.has_shading_rate) {
3067       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A00());
3068       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A10());
3069       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A20());
3070       tu_cs_emit_regs(cs, A6XX_RB_UNKNOWN_8A30());
3071    }
3072 }
3073 
3074 static const enum mesa_vk_dynamic_graphics_state tu_ds_state[] = {
3075    MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE,
3076    MESA_VK_DYNAMIC_DS_STENCIL_OP,
3077    MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK,
3078    MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK,
3079    MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE,
3080    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS,
3081 };
3082 
3083 template <chip CHIP>
3084 static unsigned
3085 tu6_ds_size(struct tu_device *dev,
3086                  const struct vk_depth_stencil_state *ds)
3087 {
3088    return 13;
3089 }
3090 
3091 template <chip CHIP>
3092 static void
3093 tu6_emit_ds(struct tu_cs *cs,
3094             const struct vk_depth_stencil_state *ds)
3095 {
3096    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3097       .stencil_enable = ds->stencil.test_enable,
3098       .stencil_enable_bf = ds->stencil.test_enable,
3099       .stencil_read = ds->stencil.test_enable,
3100       .func = tu6_compare_func((VkCompareOp)ds->stencil.front.op.compare),
3101       .fail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.fail),
3102       .zpass = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.pass),
3103       .zfail = tu6_stencil_op((VkStencilOp)ds->stencil.front.op.depth_fail),
3104       .func_bf = tu6_compare_func((VkCompareOp)ds->stencil.back.op.compare),
3105       .fail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.fail),
3106       .zpass_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.pass),
3107       .zfail_bf = tu6_stencil_op((VkStencilOp)ds->stencil.back.op.depth_fail)));
3108    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(ds->stencil.test_enable));
3109 
3110    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(
3111       .mask = ds->stencil.front.compare_mask,
3112       .bfmask = ds->stencil.back.compare_mask));
3113 
3114    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(
3115       .wrmask = ds->stencil.front.write_mask,
3116       .bfwrmask = ds->stencil.back.write_mask));
3117 
3118    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(
3119       .ref = ds->stencil.front.reference,
3120       .bfref = ds->stencil.back.reference));
3121 
3122    tu_cs_emit_regs(cs,
3123                    A6XX_RB_Z_BOUNDS_MIN(ds->depth.bounds_test.min),
3124                    A6XX_RB_Z_BOUNDS_MAX(ds->depth.bounds_test.max));
3125 }
3126 
3127 static const enum mesa_vk_dynamic_graphics_state tu_rb_depth_cntl_state[] = {
3128    MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE,
3129    MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE,
3130    MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP,
3131    MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE,
3132    MESA_VK_DYNAMIC_RS_DEPTH_CLAMP_ENABLE,
3133 };
3134 
3135 template <chip CHIP>
3136 static unsigned
3137 tu6_rb_depth_cntl_size(struct tu_device *dev,
3138                        const struct vk_depth_stencil_state *ds,
3139                        const struct vk_render_pass_state *rp,
3140                        const struct vk_rasterization_state *rs)
3141 {
3142    return 4;
3143 }
3144 
3145 template <chip CHIP>
3146 static void
3147 tu6_emit_rb_depth_cntl(struct tu_cs *cs,
3148                        const struct vk_depth_stencil_state *ds,
3149                        const struct vk_render_pass_state *rp,
3150                        const struct vk_rasterization_state *rs)
3151 {
3152    if (rp->attachments & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
3153       bool depth_test = ds->depth.test_enable;
3154       enum adreno_compare_func zfunc = tu6_compare_func(ds->depth.compare_op);
3155 
3156       /* On some GPUs it is necessary to enable z test for depth bounds test
3157        * when UBWC is enabled. Otherwise, the GPU would hang. FUNC_ALWAYS is
3158        * required to pass z test. Relevant tests:
3159        *  dEQP-VK.pipeline.extended_dynamic_state.two_draws_dynamic.depth_bounds_test_disable
3160        *  dEQP-VK.dynamic_state.ds_state.depth_bounds_1
3161        */
3162       if (ds->depth.bounds_test.enable &&
3163           !ds->depth.test_enable &&
3164           cs->device->physical_device->info->a6xx.depth_bounds_require_depth_test_quirk) {
3165          depth_test = true;
3166          zfunc = FUNC_ALWAYS;
3167       }
3168 
3169       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3170          .z_test_enable = depth_test,
3171          .z_write_enable = ds->depth.test_enable && ds->depth.write_enable,
3172          .zfunc = zfunc,
3173          .z_clamp_enable = rs->depth_clamp_enable,
3174          /* TODO don't set for ALWAYS/NEVER */
3175          .z_read_enable = ds->depth.test_enable || ds->depth.bounds_test.enable,
3176          .z_bounds_enable = ds->depth.bounds_test.enable));
3177       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(depth_test));
3178    } else {
3179       tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
3180       tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
3181    }
3182 }
3183 
3184 static inline bool
3185 emit_pipeline_state(BITSET_WORD *keep, BITSET_WORD *remove,
3186                     BITSET_WORD *pipeline_set,
3187                     const enum mesa_vk_dynamic_graphics_state *state_array,
3188                     unsigned num_states, bool extra_cond,
3189                     struct tu_pipeline_builder *builder)
3190 {
3191    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3192 
3193    /* Unrolling this loop should produce a constant value once the function is
3194     * inlined, because state_array and num_states are a per-draw-state
3195     * constant, but GCC seems to need a little encouragement. clang does a
3196     * little better but still needs a pragma when there are a large number of
3197     * states.
3198     */
3199 #if defined(__clang__)
3200 #pragma clang loop unroll(full)
3201 #elif defined(__GNUC__) && __GNUC__ >= 8
3202 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3203 #endif
3204    for (unsigned i = 0; i < num_states; i++) {
3205       BITSET_SET(state, state_array[i]);
3206    }
3207 
3208    /* If all of the state is set, then after we emit it we can tentatively
3209     * remove it from the states to set for the pipeline by making it dynamic.
3210     * If we can't emit it, though, we need to keep around the partial state so
3211     * that we can emit it later, even if another draw state consumes it. That
3212     * is, we have to cancel any tentative removal.
3213     */
3214    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3215    memcpy(temp, pipeline_set, sizeof(temp));
3216    BITSET_AND(temp, temp, state);
3217    if (!BITSET_EQUAL(temp, state) || !extra_cond) {
3218       __bitset_or(keep, keep, temp, ARRAY_SIZE(temp));
3219       return false;
3220    }
3221    __bitset_or(remove, remove, state, ARRAY_SIZE(state));
3222    return true;
3223 }
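/* Example (illustrative): for tu_depth_bias_state the bitset built above
 * contains only MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS. If the pipeline
 * provides that state (and extra_cond holds), it is added to `remove` and the
 * caller emits the draw state now; if it is missing or extra_cond is false,
 * whatever subset the pipeline did provide is added to `keep` so a later
 * draw-state group that also consumes it cannot tentatively drop it.
 */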
3224 
3225 template <chip CHIP>
3226 static void
3227 tu_pipeline_builder_emit_state(struct tu_pipeline_builder *builder,
3228                                struct tu_pipeline *pipeline)
3229 {
3230    struct tu_cs cs;
3231    BITSET_DECLARE(keep, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3232    BITSET_DECLARE(remove, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3233    BITSET_DECLARE(pipeline_set, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3234 
3235    vk_graphics_pipeline_get_state(&builder->graphics_state, pipeline_set);
3236 
3237 #define EMIT_STATE(name, extra_cond)                                          \
3238    emit_pipeline_state(keep, remove, pipeline_set, tu_##name##_state,         \
3239                        ARRAY_SIZE(tu_##name##_state), extra_cond, builder)
3240 
3241 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3242    if (EMIT_STATE(name, extra_cond)) {                                        \
3243       unsigned size = tu6_##name##_size<CHIP>(builder->device, __VA_ARGS__);  \
3244       if (size > 0) {                                                         \
3245          tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);                    \
3246          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3247          pipeline->dynamic_state[id] =                                        \
3248             tu_cs_end_draw_state(&pipeline->cs, &cs);                         \
3249       }                                                                       \
3250       pipeline->set_state_mask |= (1u << id);                                 \
3251    }
3252 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, true, __VA_ARGS__)
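/* For reference (illustrative expansion, not generated code): a call such as
 * DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS, builder->graphics_state.rs)
 * expands to roughly:
 *
 *    if (emit_pipeline_state(keep, remove, pipeline_set, tu_depth_bias_state,
 *                            ARRAY_SIZE(tu_depth_bias_state), true, builder)) {
 *       unsigned size = tu6_depth_bias_size<CHIP>(builder->device,
 *                                                 builder->graphics_state.rs);
 *       if (size > 0) {
 *          tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);
 *          tu6_emit_depth_bias<CHIP>(&cs, builder->graphics_state.rs);
 *          pipeline->dynamic_state[TU_DYNAMIC_STATE_DEPTH_BIAS] =
 *             tu_cs_end_draw_state(&pipeline->cs, &cs);
 *       }
 *       pipeline->set_state_mask |= (1u << TU_DYNAMIC_STATE_DEPTH_BIAS);
 *    }
 */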
3253 
3254    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3255               builder->graphics_state.vi);
3256    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3257               builder->graphics_state.vi);
3258    /* If (a) per-view viewport is used or (b) we don't know yet, then we need
3259     * to set viewport and scissor state dynamically.
3260     */
3261    bool no_per_view_viewport = pipeline_contains_all_shader_state(pipeline) &&
3262       !pipeline->program.per_view_viewport;
3263    DRAW_STATE_COND(viewport, TU_DYNAMIC_STATE_VIEWPORT, no_per_view_viewport,
3264                    builder->graphics_state.vp);
3265    DRAW_STATE_COND(scissor, TU_DYNAMIC_STATE_SCISSOR, no_per_view_viewport,
3266               builder->graphics_state.vp);
3267    DRAW_STATE(sample_locations,
3268               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3269               builder->graphics_state.ms->sample_locations_enable,
3270               builder->graphics_state.ms->sample_locations);
3271    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3272               builder->graphics_state.rs);
3273    bool attachments_valid =
3274       builder->graphics_state.rp &&
3275       vk_render_pass_state_has_attachment_info(builder->graphics_state.rp);
3276    struct vk_color_blend_state dummy_cb = {};
3277    const struct vk_color_blend_state *cb = builder->graphics_state.cb;
3278    if (attachments_valid &&
3279        !(builder->graphics_state.rp->attachments &
3280          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3281       /* If there are no color attachments, then the original blend state may
3282        * be NULL and the common code sanitizes it to always be NULL. In this
3283        * case we want to emit an empty blend/bandwidth/etc.  rather than
3284        * letting it be dynamic (and potentially garbage).
3285        */
3286       cb = &dummy_cb;
3287       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3288       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3289       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3290       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3291       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3292       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3293       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3294       BITSET_SET(pipeline_set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3295    }
3296    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND, cb,
3297               builder->graphics_state.ms->alpha_to_coverage_enable,
3298               builder->graphics_state.ms->alpha_to_one_enable,
3299               builder->graphics_state.ms->sample_mask);
3300    if (EMIT_STATE(blend_lrz, attachments_valid))
3301       tu_emit_blend_lrz(&pipeline->lrz_blend, cb,
3302                         builder->graphics_state.rp);
3303    if (EMIT_STATE(bandwidth, attachments_valid))
3304       tu_calc_bandwidth(&pipeline->bandwidth, cb,
3305                         builder->graphics_state.rp);
3306    DRAW_STATE(blend_constants, TU_DYNAMIC_STATE_BLEND_CONSTANTS, cb);
3307    if (attachments_valid &&
3308        !(builder->graphics_state.rp->attachments &
3309          MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)) {
3310       /* Don't actually make anything dynamic as that may mean a partially-set
3311        * state group where the group is NULL which angers common code.
3312        */
3313       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP_ENABLE);
3314       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_LOGIC_OP);
3315       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_ATTACHMENT_COUNT);
3316       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
3317       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
3318       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
3319       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
3320       BITSET_CLEAR(remove, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
3321    }
3322    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3323                    pipeline_contains_all_shader_state(pipeline),
3324                    builder->graphics_state.rs,
3325                    builder->graphics_state.vp,
3326                    builder->graphics_state.rp->view_mask != 0,
3327                    pipeline->program.per_view_viewport);
3328    DRAW_STATE(ds, TU_DYNAMIC_STATE_DS,
3329               builder->graphics_state.ds);
3330    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3331                    attachments_valid,
3332                    builder->graphics_state.ds,
3333                    builder->graphics_state.rp,
3334                    builder->graphics_state.rs);
3335    DRAW_STATE_COND(patch_control_points,
3336                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3337                    pipeline_contains_all_shader_state(pipeline),
3338                    pipeline->shaders[MESA_SHADER_VERTEX],
3339                    pipeline->shaders[MESA_SHADER_TESS_CTRL],
3340                    pipeline->shaders[MESA_SHADER_TESS_EVAL],
3341                    &pipeline->program,
3342                    builder->graphics_state.ts->patch_control_points);
3343 #undef DRAW_STATE
3344 #undef DRAW_STATE_COND
3345 #undef EMIT_STATE
3346 
3347    /* LRZ always needs depth/stencil state at draw time */
3348    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
3349    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
3350    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
3351    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP);
3352    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
3353    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_OP);
3354    BITSET_SET(keep, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
3355    BITSET_SET(keep, MESA_VK_DYNAMIC_MS_ALPHA_TO_COVERAGE_ENABLE);
3356 
3357    /* MSAA needs line mode */
3358    BITSET_SET(keep, MESA_VK_DYNAMIC_RS_LINE_MODE);
3359 
3360    /* The patch control point count is part of the draw */
3361    BITSET_SET(keep, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS);
3362 
3363    /* Vertex buffer state needs to know the max valid binding */
3364    BITSET_SET(keep, MESA_VK_DYNAMIC_VI_BINDINGS_VALID);
3365 
3366    /* Remove state which has already been emitted and which we no longer
3367     * need to set when binding the pipeline, by marking it "dynamic".
3368     */
3369    BITSET_ANDNOT(remove, remove, keep);
3370    BITSET_OR(builder->graphics_state.dynamic, builder->graphics_state.dynamic,
3371              remove);
3372 }
3373 
3374 static inline bool
3375 emit_draw_state(const struct vk_dynamic_graphics_state *dynamic_state,
3376                 const enum mesa_vk_dynamic_graphics_state *state_array,
3377                 unsigned num_states)
3378 {
3379    BITSET_DECLARE(state, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX) = {};
3380 
3381    /* Unrolling this loop should produce a constant value once the function is
3382     * inlined, because state_array and num_states are a per-draw-state
3383     * constant, but GCC seems to need a little encouragement. clang does a
3384     * little better but still needs a pragma when there are a large number of
3385     * states.
3386     */
3387 #if defined(__clang__)
3388 #pragma clang loop unroll(full)
3389 #elif defined(__GNUC__) && __GNUC__ >= 8
3390 #pragma GCC unroll MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX
3391 #endif
3392    for (unsigned i = 0; i < num_states; i++) {
3393       BITSET_SET(state, state_array[i]);
3394    }
3395 
3396    BITSET_DECLARE(temp, MESA_VK_DYNAMIC_GRAPHICS_STATE_ENUM_MAX);
3397    BITSET_AND(temp, state, dynamic_state->dirty);
3398    return !BITSET_IS_EMPTY(temp);
3399 }
3400 
3401 template <chip CHIP>
3402 uint32_t
3403 tu_emit_draw_state(struct tu_cmd_buffer *cmd)
3404 {
3405    struct tu_cs cs;
3406    uint32_t dirty_draw_states = 0;
3407 
3408 #define EMIT_STATE(name)                                                      \
3409    emit_draw_state(&cmd->vk.dynamic_graphics_state, tu_##name##_state,        \
3410                    ARRAY_SIZE(tu_##name##_state))
3411 #define DRAW_STATE_COND(name, id, extra_cond, ...)                            \
3412    if ((EMIT_STATE(name) || extra_cond) &&                                    \
3413        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3414       unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);      \
3415       if (size > 0) {                                                         \
3416          tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                     \
3417          tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                             \
3418          cmd->state.dynamic_state[id] =                                       \
3419             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3420       } else {                                                                \
3421          cmd->state.dynamic_state[id] = {};                                   \
3422       }                                                                       \
3423       dirty_draw_states |= (1u << id);                                        \
3424    }
3425 #define DRAW_STATE_FDM(name, id, ...)                                         \
3426    if ((EMIT_STATE(name) || (cmd->state.dirty & TU_CMD_DIRTY_FDM)) &&         \
3427        !(cmd->state.pipeline_draw_states & (1u << id))) {                     \
3428       if (cmd->state.shaders[MESA_SHADER_FRAGMENT]->fs.has_fdm) {             \
3429          tu_cs_set_writeable(&cmd->sub_cs, true);                             \
3430          tu6_emit_##name##_fdm(&cs, cmd, __VA_ARGS__);                        \
3431          cmd->state.dynamic_state[id] =                                       \
3432             tu_cs_end_draw_state(&cmd->sub_cs, &cs);                          \
3433          tu_cs_set_writeable(&cmd->sub_cs, false);                            \
3434       } else {                                                                \
3435          unsigned size = tu6_##name##_size<CHIP>(cmd->device, __VA_ARGS__);   \
3436          if (size > 0) {                                                      \
3437             tu_cs_begin_sub_stream(&cmd->sub_cs, size, &cs);                  \
3438             tu6_emit_##name<CHIP>(&cs, __VA_ARGS__);                          \
3439             cmd->state.dynamic_state[id] =                                    \
3440                tu_cs_end_draw_state(&cmd->sub_cs, &cs);                       \
3441          } else {                                                             \
3442             cmd->state.dynamic_state[id] = {};                                \
3443          }                                                                    \
3450       }                                                                       \
3451       dirty_draw_states |= (1u << id);                                        \
3452    }
3453 #define DRAW_STATE(name, id, ...) DRAW_STATE_COND(name, id, false, __VA_ARGS__)
3454 
3455    DRAW_STATE(vertex_input, TU_DYNAMIC_STATE_VERTEX_INPUT,
3456               cmd->vk.dynamic_graphics_state.vi);
3457 
3458    /* Vertex input stride is special because it's part of the vertex input in
3459     * the pipeline but a separate array when it's dynamic state so we have to
3460     * use two separate functions.
3461     */
3462 #define tu6_emit_vertex_stride tu6_emit_vertex_stride_dyn
3463 #define tu6_vertex_stride_size tu6_vertex_stride_size_dyn
3464 
3465    DRAW_STATE(vertex_stride, TU_DYNAMIC_STATE_VB_STRIDE,
3466               cmd->vk.dynamic_graphics_state.vi_binding_strides,
3467               cmd->vk.dynamic_graphics_state.vi_bindings_valid);
3468 
3469 #undef tu6_emit_vertex_stride
3470 #undef tu6_vertex_stride_size
3471 
3472    DRAW_STATE_FDM(viewport, TU_DYNAMIC_STATE_VIEWPORT,
3473                   &cmd->vk.dynamic_graphics_state.vp);
3474    DRAW_STATE_FDM(scissor, TU_DYNAMIC_STATE_SCISSOR,
3475                   &cmd->vk.dynamic_graphics_state.vp);
3476    DRAW_STATE(sample_locations,
3477               TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3478               cmd->vk.dynamic_graphics_state.ms.sample_locations_enable,
3479               cmd->vk.dynamic_graphics_state.ms.sample_locations);
3480    DRAW_STATE(depth_bias, TU_DYNAMIC_STATE_DEPTH_BIAS,
3481               &cmd->vk.dynamic_graphics_state.rs);
3482    DRAW_STATE(blend, TU_DYNAMIC_STATE_BLEND,
3483               &cmd->vk.dynamic_graphics_state.cb,
3484               cmd->vk.dynamic_graphics_state.ms.alpha_to_coverage_enable,
3485               cmd->vk.dynamic_graphics_state.ms.alpha_to_one_enable,
3486               cmd->vk.dynamic_graphics_state.ms.sample_mask);
3487    if (EMIT_STATE(blend_lrz) ||
3488        ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3489         !cmd->state.pipeline_blend_lrz)) {
3490       bool blend_reads_dest = tu6_calc_blend_lrz(&cmd->vk.dynamic_graphics_state.cb,
3491                                                  &cmd->state.vk_rp);
3492       if (blend_reads_dest != cmd->state.blend_reads_dest) {
3493          cmd->state.blend_reads_dest = blend_reads_dest;
3494          cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
3495       }
3496    }
3497    if (EMIT_STATE(bandwidth) ||
3498        ((cmd->state.dirty & TU_CMD_DIRTY_SUBPASS) &&
3499         !cmd->state.pipeline_bandwidth))
3500       tu_calc_bandwidth(&cmd->state.bandwidth, &cmd->vk.dynamic_graphics_state.cb,
3501                         &cmd->state.vk_rp);
3502    DRAW_STATE(blend_constants, VK_DYNAMIC_STATE_BLEND_CONSTANTS,
3503               &cmd->vk.dynamic_graphics_state.cb);
3504    DRAW_STATE_COND(rast, TU_DYNAMIC_STATE_RAST,
3505                    cmd->state.dirty & (TU_CMD_DIRTY_SUBPASS |
3506                                        TU_CMD_DIRTY_PER_VIEW_VIEWPORT),
3507                    &cmd->vk.dynamic_graphics_state.rs,
3508                    &cmd->vk.dynamic_graphics_state.vp,
3509                    cmd->state.vk_rp.view_mask != 0,
3510                    cmd->state.per_view_viewport);
3511    DRAW_STATE(ds, TU_DYNAMIC_STATE_DS,
3512               &cmd->vk.dynamic_graphics_state.ds);
3513    DRAW_STATE_COND(rb_depth_cntl, TU_DYNAMIC_STATE_RB_DEPTH_CNTL,
3514                    cmd->state.dirty & TU_CMD_DIRTY_SUBPASS,
3515                    &cmd->vk.dynamic_graphics_state.ds,
3516                    &cmd->state.vk_rp,
3517                    &cmd->vk.dynamic_graphics_state.rs);
3518    DRAW_STATE_COND(patch_control_points,
3519                    TU_DYNAMIC_STATE_PATCH_CONTROL_POINTS,
3520                    cmd->state.dirty & TU_CMD_DIRTY_PROGRAM,
3521                    cmd->state.shaders[MESA_SHADER_VERTEX],
3522                    cmd->state.shaders[MESA_SHADER_TESS_CTRL],
3523                    cmd->state.shaders[MESA_SHADER_TESS_EVAL],
3524                    &cmd->state.program,
3525                    cmd->vk.dynamic_graphics_state.ts.patch_control_points);
3526 #undef DRAW_STATE
3527 #undef DRAW_STATE_COND
3528 #undef EMIT_STATE
3529 
3530    return dirty_draw_states;
3531 }
3532 TU_GENX(tu_emit_draw_state);
3533 
3534 static void
3535 tu_pipeline_builder_parse_depth_stencil(
3536    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3537 {
3538    const VkPipelineDepthStencilStateCreateInfo *ds_info =
3539       builder->create_info->pDepthStencilState;
3540 
3541    if ((builder->graphics_state.rp->attachments ==
3542         MESA_VK_RP_ATTACHMENT_INFO_INVALID) ||
3543        (builder->graphics_state.rp->attachments &
3544         MESA_VK_RP_ATTACHMENT_DEPTH_BIT)) {
3545       pipeline->ds.raster_order_attachment_access =
3546          ds_info && (ds_info->flags &
3547          (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_EXT |
3548           VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_EXT));
3549    }
3550 }
3551 
3552 static void
3553 tu_pipeline_builder_parse_multisample_and_color_blend(
3554    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3555 {
3556    /* The spec says:
3557     *
3558     *    pMultisampleState is a pointer to an instance of the
3559     *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
3560     *    has rasterization disabled.
3561     *
3562     * Also,
3563     *
3564     *    pColorBlendState is a pointer to an instance of the
3565     *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
3566     *    pipeline has rasterization disabled or if the subpass of the render
3567     *    pass the pipeline is created against does not use any color
3568     *    attachments.
3569     *
3570     * We leave the relevant registers stale when rasterization is disabled.
3571     */
3572    if (builder->rasterizer_discard) {
3573       return;
3574    }
3575 
3576    static const VkPipelineColorBlendStateCreateInfo dummy_blend_info = {};
3577 
3578    const VkPipelineColorBlendStateCreateInfo *blend_info =
3579       (builder->graphics_state.rp->attachments &
3580        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS)
3581       ? builder->create_info->pColorBlendState
3582       : &dummy_blend_info;
3583 
3584    if (builder->graphics_state.rp->attachments &
3585        MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS) {
3586       pipeline->output.raster_order_attachment_access =
3587          blend_info && (blend_info->flags &
3588             VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_EXT);
3589    }
3590 }
3591 
3592 static void
3593 tu_pipeline_builder_parse_rasterization_order(
3594    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3595 {
3596    if (builder->rasterizer_discard)
3597       return;
3598 
3599    bool raster_order_attachment_access =
3600       pipeline->output.raster_order_attachment_access ||
3601       pipeline->ds.raster_order_attachment_access ||
3602       TU_DEBUG(RAST_ORDER);
3603 
3604    /* VK_EXT_blend_operation_advanced would also require ordered access
3605     * when implemented in the future.
3606     */
3607 
3608    enum a6xx_single_prim_mode sysmem_prim_mode = NO_FLUSH;
3609    enum a6xx_single_prim_mode gmem_prim_mode = NO_FLUSH;
3610 
3611    if (raster_order_attachment_access) {
3612       /* VK_EXT_rasterization_order_attachment_access:
3613        *
3614        * This extension allows access to framebuffer attachments when used as
3615        * both input and color attachments from one fragment to the next,
3616        * in rasterization order, without explicit synchronization.
3617        */
3618       sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
3619       gmem_prim_mode = FLUSH_PER_OVERLAP;
3620       pipeline->prim_order.sysmem_single_prim_mode = true;
3621    } else {
3622       /* If there is a feedback loop, then the shader can read the previous value
3623        * of a pixel being written out. It can also write some components and then
3624        * read different components without a barrier in between. This is a
3625        * problem in sysmem mode with UBWC, because the main buffer and flags
3626        * buffer can get out-of-sync if only one is flushed. We fix this by
3627        * setting the SINGLE_PRIM_MODE field to the same value that the blob does
3628        * for advanced_blend in sysmem mode if a feedback loop is detected.
3629        */
3630       if (builder->graphics_state.pipeline_flags &
3631           (VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT |
3632            VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT)) {
3633          sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
3634          pipeline->prim_order.sysmem_single_prim_mode = true;
3635       }
3636    }
3637 
3638    struct tu_cs cs;
3639 
3640    pipeline->prim_order.state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3641    tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3642                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3643                         A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
3644 
3645    pipeline->prim_order.state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3646    tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3647                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3648                         A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode));
3649 }
3650 
3651 static void
3652 tu_pipeline_finish(struct tu_pipeline *pipeline,
3653                    struct tu_device *dev,
3654                    const VkAllocationCallbacks *alloc)
3655 {
3656    tu_cs_finish(&pipeline->cs);
3657    mtx_lock(&dev->pipeline_mutex);
3658    tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
3659    mtx_unlock(&dev->pipeline_mutex);
3660 
3661    if (pipeline->type == TU_PIPELINE_GRAPHICS_LIB) {
3662       struct tu_graphics_lib_pipeline *library =
3663          tu_pipeline_to_graphics_lib(pipeline);
3664 
3665       if (library->nir_shaders)
3666          vk_pipeline_cache_object_unref(&dev->vk,
3667                                         &library->nir_shaders->base);
3668 
3669       for (unsigned i = 0; i < library->num_sets; i++) {
3670          if (library->layouts[i])
3671             vk_descriptor_set_layout_unref(&dev->vk, &library->layouts[i]->vk);
3672       }
3673 
3674       vk_free2(&dev->vk.alloc, alloc, library->state_data);
3675    }
3676 
3677    for (unsigned i = 0; i < ARRAY_SIZE(pipeline->shaders); i++) {
3678       if (pipeline->shaders[i])
3679          vk_pipeline_cache_object_unref(&dev->vk,
3680                                         &pipeline->shaders[i]->base);
3681    }
3682 
3683    ralloc_free(pipeline->executables_mem_ctx);
3684 }
3685 
3686 static VkGraphicsPipelineLibraryFlagBitsEXT
3687 vk_shader_stage_to_pipeline_library_flags(VkShaderStageFlagBits stage)
3688 {
3689    assert(util_bitcount(stage) == 1);
3690    switch (stage) {
3691    case VK_SHADER_STAGE_VERTEX_BIT:
3692    case VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT:
3693    case VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT:
3694    case VK_SHADER_STAGE_GEOMETRY_BIT:
3695    case VK_SHADER_STAGE_TASK_BIT_EXT:
3696    case VK_SHADER_STAGE_MESH_BIT_EXT:
3697       return VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT;
3698    case VK_SHADER_STAGE_FRAGMENT_BIT:
3699       return VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT;
3700    default:
3701       unreachable("Invalid shader stage");
3702    }
3703 }
3704 
3705 template <chip CHIP>
3706 static VkResult
3707 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
3708                           struct tu_pipeline **pipeline)
3709 {
3710    VkResult result;
3711 
3712    if (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR) {
3713       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
3714          &builder->device->vk, builder->alloc,
3715          sizeof(struct tu_graphics_lib_pipeline),
3716          VK_OBJECT_TYPE_PIPELINE);
3717       if (!*pipeline)
3718          return VK_ERROR_OUT_OF_HOST_MEMORY;
3719       (*pipeline)->type = TU_PIPELINE_GRAPHICS_LIB;
3720    } else {
3721       *pipeline = (struct tu_pipeline *) vk_object_zalloc(
3722          &builder->device->vk, builder->alloc,
3723          sizeof(struct tu_graphics_pipeline),
3724          VK_OBJECT_TYPE_PIPELINE);
3725       if (!*pipeline)
3726          return VK_ERROR_OUT_OF_HOST_MEMORY;
3727       (*pipeline)->type = TU_PIPELINE_GRAPHICS;
3728    }
3729 
3730    (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
3731    util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
3732 
3733    tu_pipeline_builder_parse_libraries(builder, *pipeline);
3734 
3735    VkShaderStageFlags stages = 0;
3736    for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
3737       VkShaderStageFlagBits stage = builder->create_info->pStages[i].stage;
3738 
3739       /* Ignore shader stages that don't need to be imported. */
3740       if (!(vk_shader_stage_to_pipeline_library_flags(stage) & builder->state))
3741          continue;
3742 
3743       stages |= stage;
3744    }
3745    builder->active_stages = stages;
3746 
3747    (*pipeline)->active_stages = stages;
3748    for (unsigned i = 0; i < builder->num_libraries; i++)
3749       (*pipeline)->active_stages |= builder->libraries[i]->base.active_stages;
3750 
3751    /* Compile and upload shaders unless a library has already done that. */
3752    if ((*pipeline)->program.vs_state.size == 0) {
3753       tu_pipeline_builder_parse_layout(builder, *pipeline);
3754 
3755       result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
3756       if (result != VK_SUCCESS) {
3757          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3758          return result;
3759       }
3760    }
3761 
3762    result = tu_pipeline_allocate_cs(builder->device, *pipeline,
3763                                     &builder->layout, builder, NULL);
3764 
3765 
3766    if (set_combined_state(builder, *pipeline,
3767                           VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
3768                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) {
3769       if (result != VK_SUCCESS) {
3770          vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3771          return result;
3772       }
3773 
3774       tu_emit_program_state<CHIP>(&(*pipeline)->cs, &(*pipeline)->program,
3775                                   (*pipeline)->shaders);
3776 
3777       if (CHIP == A6XX) {
3778          /* The blob doesn't preload state on A7XX; preloading likely either
3779           * doesn't work there or doesn't provide any benefit.
3780           */
3781          tu6_emit_load_state(builder->device, *pipeline, &builder->layout);
3782       }
3783    }
3784 
3785    if (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT) {
3786       tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
3787    }
3788 
3789    if (builder->state &
3790        VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT) {
3791       tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
3792    }
3793 
3794    if (set_combined_state(builder, *pipeline,
3795                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
3796                           VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) {
3797       tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
3798    }
3799 
3800    tu_pipeline_builder_emit_state<CHIP>(builder, *pipeline);
3801 
3802    if ((*pipeline)->type == TU_PIPELINE_GRAPHICS_LIB) {
3803       struct tu_graphics_lib_pipeline *library =
3804          tu_pipeline_to_graphics_lib(*pipeline);
3805       result = vk_graphics_pipeline_state_copy(&builder->device->vk,
3806                                                &library->graphics_state,
3807                                                &builder->graphics_state,
3808                                                builder->alloc,
3809                                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
3810                                                &library->state_data);
3811       if (result != VK_SUCCESS) {
3812          tu_pipeline_finish(*pipeline, builder->device, builder->alloc);
3813          return result;
3814       }
3815    } else {
3816       struct tu_graphics_pipeline *gfx_pipeline =
3817          tu_pipeline_to_graphics(*pipeline);
3818       gfx_pipeline->dynamic_state.ms.sample_locations =
3819          &gfx_pipeline->sample_locations;
3820       vk_dynamic_graphics_state_fill(&gfx_pipeline->dynamic_state,
3821                                      &builder->graphics_state);
3822       gfx_pipeline->feedback_loop_color =
3823          (builder->graphics_state.pipeline_flags &
3824           VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT);
3825       gfx_pipeline->feedback_loop_ds =
3826          (builder->graphics_state.pipeline_flags &
3827           VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT);
3828       gfx_pipeline->feedback_loop_may_involve_textures =
3829          builder->graphics_state.feedback_loop_not_input_only;
3830    }
3831 
3832    return VK_SUCCESS;
3833 }
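
/* Illustrative only: an app using VK_EXT_graphics_pipeline_library typically
 * hits the TU_PIPELINE_GRAPHICS_LIB path above once per library, then links
 * the libraries into a complete pipeline, which takes the TU_PIPELINE_GRAPHICS
 * path with active_stages accumulated from every library.  All handles in the
 * sketch below are hypothetical:
 *
 *    VkPipeline libs[] = { vertex_input_lib, pre_raster_lib,
 *                          fragment_lib, fragment_output_lib };
 *    const VkPipelineLibraryCreateInfoKHR link = {
 *       .sType = VK_STRUCTURE_TYPE_PIPELINE_LIBRARY_CREATE_INFO_KHR,
 *       .libraryCount = 4,
 *       .pLibraries = libs,
 *    };
 *    const VkGraphicsPipelineCreateInfo info = {
 *       .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
 *       .pNext = &link,
 *       .layout = layout,
 *    };
 *    vkCreateGraphicsPipelines(device, cache, 1, &info, NULL, &pipeline);
 */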
3834 
3835 static void
3836 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
3837 {
3838    ralloc_free(builder->mem_ctx);
3839 }
3840 
3841 void
3842 tu_fill_render_pass_state(struct vk_render_pass_state *rp,
3843                           const struct tu_render_pass *pass,
3844                           const struct tu_subpass *subpass)
3845 {
3846    rp->view_mask = subpass->multiview_mask;
3847    rp->color_attachment_count = subpass->color_count;
3848 
3849    const uint32_t a = subpass->depth_stencil_attachment.attachment;
3850    rp->depth_attachment_format = VK_FORMAT_UNDEFINED;
3851    rp->stencil_attachment_format = VK_FORMAT_UNDEFINED;
3852    rp->attachments = MESA_VK_RP_ATTACHMENT_NONE;
3853    if (a != VK_ATTACHMENT_UNUSED) {
3854       VkFormat ds_format = pass->attachments[a].format;
3855       if (vk_format_has_depth(ds_format)) {
3856          rp->depth_attachment_format = ds_format;
3857          rp->attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
3858       }
3859       if (vk_format_has_stencil(ds_format)) {
3860          rp->stencil_attachment_format = ds_format;
3861          rp->attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
3862       }
3863    }
3864 
3865    for (uint32_t i = 0; i < subpass->color_count; i++) {
3866       const uint32_t a = subpass->color_attachments[i].attachment;
3867       if (a == VK_ATTACHMENT_UNUSED) {
3868          rp->color_attachment_formats[i] = VK_FORMAT_UNDEFINED;
3869          continue;
3870       }
3871 
3872       rp->color_attachment_formats[i] = pass->attachments[a].format;
3873       rp->attachments |= MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
3874    }
3875 }
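
/* For example, a combined VK_FORMAT_D24_UNORM_S8_UINT attachment sets both the
 * depth and stencil formats and both attachment bits, while a depth-only
 * VK_FORMAT_D32_SFLOAT attachment leaves the stencil format as
 * VK_FORMAT_UNDEFINED, mirroring the state an app would provide through
 * VkPipelineRenderingCreateInfo when using dynamic rendering.
 */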
3876 
3877 static void
3878 tu_pipeline_builder_init_graphics(
3879    struct tu_pipeline_builder *builder,
3880    struct tu_device *dev,
3881    struct vk_pipeline_cache *cache,
3882    const VkGraphicsPipelineCreateInfo *create_info,
3883    VkPipelineCreateFlags2KHR flags,
3884    const VkAllocationCallbacks *alloc)
3885 {
3886    *builder = (struct tu_pipeline_builder) {
3887       .device = dev,
3888       .mem_ctx = ralloc_context(NULL),
3889       .cache = cache,
3890       .alloc = alloc,
3891       .create_info = create_info,
3892       .create_flags = flags,
3893    };
3894 
3895    const VkGraphicsPipelineLibraryCreateInfoEXT *gpl_info =
3896       vk_find_struct_const(builder->create_info->pNext,
3897                            GRAPHICS_PIPELINE_LIBRARY_CREATE_INFO_EXT);
3898 
3899    const VkPipelineLibraryCreateInfoKHR *library_info =
3900       vk_find_struct_const(builder->create_info->pNext,
3901                            PIPELINE_LIBRARY_CREATE_INFO_KHR);
3902 
3903    if (gpl_info) {
3904       builder->state = gpl_info->flags;
3905    } else {
3906       /* Implement this bit of spec text:
3907        *
3908        *    If this structure is omitted, and either
3909        *    VkGraphicsPipelineCreateInfo::flags includes
3910        *    VK_PIPELINE_CREATE_LIBRARY_BIT_KHR or the
3911        *    VkGraphicsPipelineCreateInfo::pNext chain includes a
3912        *    VkPipelineLibraryCreateInfoKHR structure with a libraryCount
3913        *    greater than 0, it is as if flags is 0. Otherwise if this
3914        *    structure is omitted, it is as if flags includes all possible
3915        *    subsets of the graphics pipeline (i.e. a complete graphics
3916        *    pipeline).
3917        */
3918       if ((library_info && library_info->libraryCount > 0) ||
3919           (builder->create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR)) {
3920          builder->state = 0;
3921       } else {
3922          builder->state =
3923             VK_GRAPHICS_PIPELINE_LIBRARY_VERTEX_INPUT_INTERFACE_BIT_EXT |
3924             VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
3925             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
3926             VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT;
3927       }
3928    }
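
   /* In practice: a plain, non-library pipeline created without any GPL
    * structures ends up with all four subset bits set and is built as a
    * complete pipeline, while a VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR create
    * without an explicit VkGraphicsPipelineLibraryCreateInfoEXT contributes
    * no state of its own.
    */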
3929 
3930    bool rasterizer_discard_dynamic = false;
3931    if (create_info->pDynamicState) {
3932       for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
3933          if (create_info->pDynamicState->pDynamicStates[i] ==
3934                VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
3935             rasterizer_discard_dynamic = true;
3936             break;
3937          }
3938       }
3939    }
3940 
3941    builder->rasterizer_discard =
3942       (builder->state & VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT) &&
3943       !rasterizer_discard_dynamic &&
3944       builder->create_info->pRasterizationState->rasterizerDiscardEnable;
3945 
3946    struct vk_render_pass_state rp_state = {};
3947    const struct vk_render_pass_state *driver_rp = NULL;
3948    VkPipelineCreateFlags2KHR rp_flags = 0;
3949 
3950    builder->unscaled_input_fragcoord = 0;
3951 
3952    /* Extract information we need from the turnip renderpass. This will be
3953     * filled out automatically if the app is using dynamic rendering or
3954     * renderpasses are emulated.
3955     */
3956    if (!TU_DEBUG(DYNAMIC) &&
3957        (builder->state &
3958         (VK_GRAPHICS_PIPELINE_LIBRARY_PRE_RASTERIZATION_SHADERS_BIT_EXT |
3959          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT |
3960          VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_OUTPUT_INTERFACE_BIT_EXT)) &&
3961        builder->create_info->renderPass) {
3962       const struct tu_render_pass *pass =
3963          tu_render_pass_from_handle(create_info->renderPass);
3964       const struct tu_subpass *subpass =
3965          &pass->subpasses[create_info->subpass];
3966 
3967       tu_fill_render_pass_state(&rp_state, pass, subpass);
3968 
3969       for (unsigned i = 0; i < subpass->input_count; i++) {
3970          /* Input attachments stored in GMEM must be loaded with unscaled
3971           * FragCoord.
3972           */
3973          if (subpass->input_attachments[i].patch_input_gmem)
3974             builder->unscaled_input_fragcoord |= 1u << i;
3975       }
3976 
3977       if (subpass->feedback_loop_color) {
3978          rp_flags |=
3979             VK_PIPELINE_CREATE_2_COLOR_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
3980       }
3981 
3982       if (subpass->feedback_loop_ds) {
3983          rp_flags |=
3984             VK_PIPELINE_CREATE_2_DEPTH_STENCIL_ATTACHMENT_FEEDBACK_LOOP_BIT_EXT;
3985       }
3986 
3987       if (pass->fragment_density_map.attachment != VK_ATTACHMENT_UNUSED) {
3988          rp_flags |=
3989             VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT;
3990       }
3991 
4001       driver_rp = &rp_state;
4002    }
4003 
4004    vk_graphics_pipeline_state_fill(&dev->vk,
4005                                    &builder->graphics_state,
4006                                    builder->create_info,
4007                                    driver_rp,
4008                                    rp_flags,
4009                                    &builder->all_state,
4010                                    NULL, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT,
4011                                    NULL);
4012 
4013    if (builder->graphics_state.rp) {
4014       builder->fragment_density_map = (builder->graphics_state.pipeline_flags &
4015          VK_PIPELINE_CREATE_2_RENDERING_FRAGMENT_DENSITY_MAP_ATTACHMENT_BIT_EXT) ||
4016          TU_DEBUG(FDM);
4017    }
4018 }
4019 
4020 template <chip CHIP>
4021 static VkResult
4022 tu_graphics_pipeline_create(VkDevice device,
4023                             VkPipelineCache pipelineCache,
4024                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
4025                             VkPipelineCreateFlags2KHR flags,
4026                             const VkAllocationCallbacks *pAllocator,
4027                             VkPipeline *pPipeline)
4028 {
4029    TU_FROM_HANDLE(tu_device, dev, device);
4030    TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4031 
4032    cache = cache ? cache : dev->mem_cache;
4033 
4034    struct tu_pipeline_builder builder;
4035    tu_pipeline_builder_init_graphics(&builder, dev, cache,
4036                                      pCreateInfo, flags, pAllocator);
4037 
4038    struct tu_pipeline *pipeline = NULL;
4039    VkResult result = tu_pipeline_builder_build<CHIP>(&builder, &pipeline);
4040    tu_pipeline_builder_finish(&builder);
4041 
4042    if (result == VK_SUCCESS)
4043       *pPipeline = tu_pipeline_to_handle(pipeline);
4044    else
4045       *pPipeline = VK_NULL_HANDLE;
4046 
4047    return result;
4048 }
4049 
4050 template <chip CHIP>
4051 VKAPI_ATTR VkResult VKAPI_CALL
4052 tu_CreateGraphicsPipelines(VkDevice device,
4053                            VkPipelineCache pipelineCache,
4054                            uint32_t count,
4055                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
4056                            const VkAllocationCallbacks *pAllocator,
4057                            VkPipeline *pPipelines)
4058 {
4059    MESA_TRACE_FUNC();
4060    VkResult final_result = VK_SUCCESS;
4061    uint32_t i = 0;
4062 
4063    for (; i < count; i++) {
4064       VkPipelineCreateFlags2KHR flags =
4065          vk_graphics_pipeline_create_flags(&pCreateInfos[i]);
4066 
4067       VkResult result =
4068          tu_graphics_pipeline_create<CHIP>(device, pipelineCache,
4069                                            &pCreateInfos[i], flags,
4070                                            pAllocator, &pPipelines[i]);
4071 
4072       if (result != VK_SUCCESS) {
4073          final_result = result;
4074          pPipelines[i] = VK_NULL_HANDLE;
4075 
4076          if (flags & VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4077             break;
4078       }
4079    }
4080 
4081    for (; i < count; i++)
4082       pPipelines[i] = VK_NULL_HANDLE;
4083 
4084    return final_result;
4085 }
4086 TU_GENX(tu_CreateGraphicsPipelines);
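/* TU_GENX() provides the explicit per-generation instantiations of the
 * <CHIP>-templated entry point (A6XX, A7XX, ...), so the per-generation
 * dispatch set up at device creation can pick the matching variant.
 */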
4087 
4088 template <chip CHIP>
4089 static VkResult
4090 tu_compute_pipeline_create(VkDevice device,
4091                            VkPipelineCache pipelineCache,
4092                            const VkComputePipelineCreateInfo *pCreateInfo,
4093                            VkPipelineCreateFlags2KHR flags,
4094                            const VkAllocationCallbacks *pAllocator,
4095                            VkPipeline *pPipeline)
4096 {
4097    TU_FROM_HANDLE(tu_device, dev, device);
4098    TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4099    TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
4100    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
4101    VkResult result;
4102    const struct ir3_shader_variant *v = NULL;
4103 
4104    cache = cache ? cache : dev->mem_cache;
4105 
4106    struct tu_compute_pipeline *pipeline;
4107 
4108    *pPipeline = VK_NULL_HANDLE;
4109 
4110    VkPipelineCreationFeedback pipeline_feedback = {
4111       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4112    };
4113 
4114    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
4115       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
4116 
4117    int64_t pipeline_start = os_time_get_nano();
4118 
4119    pipeline = (struct tu_compute_pipeline *) vk_object_zalloc(
4120       &dev->vk, pAllocator, sizeof(*pipeline), VK_OBJECT_TYPE_PIPELINE);
4121    if (!pipeline)
4122       return VK_ERROR_OUT_OF_HOST_MEMORY;
4123    pipeline->base.type = TU_PIPELINE_COMPUTE;
4124 
4125    pipeline->base.executables_mem_ctx = ralloc_context(NULL);
4126    util_dynarray_init(&pipeline->base.executables, pipeline->base.executables_mem_ctx);
4127    pipeline->base.active_stages = VK_SHADER_STAGE_COMPUTE_BIT;
4128 
4129    struct tu_shader_key key = { };
4130    bool allow_varying_subgroup_size =
4131       (stage_info->flags &
4132        VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT);
4133    bool require_full_subgroups =
4134       stage_info->flags &
4135       VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT;
4136    const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *subgroup_info =
4137       vk_find_struct_const(stage_info,
4138                            PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
4139    tu_shader_key_subgroup_size(&key, allow_varying_subgroup_size,
4140                                require_full_subgroups, subgroup_info,
4141                                dev);
4142 
4143    void *pipeline_mem_ctx = ralloc_context(NULL);
4144 
4145    unsigned char pipeline_sha1[20];
4146    tu_hash_compute(pipeline_sha1, stage_info, layout, &key, dev->compiler);
4147 
4148    struct tu_shader *shader = NULL;
4149 
4150    const bool executable_info = flags &
4151       VK_PIPELINE_CREATE_2_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
4152 
4153    bool application_cache_hit = false;
4154 
4155    if (!executable_info) {
4156       shader =
4157          tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
4158                                   &application_cache_hit);
4159    }
4160 
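   /* A hit in the device-internal fallback cache (used when the app passed
    * VK_NULL_HANDLE for pipelineCache) is deliberately not reported as an
    * application cache hit in the creation feedback.
    */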
4161    if (application_cache_hit && cache != dev->mem_cache) {
4162       pipeline_feedback.flags |=
4163          VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4164    }
4165 
4166    char *nir_initial_disasm = NULL;
4167 
4168    if (!shader) {
4169       if (flags &
4170           VK_PIPELINE_CREATE_2_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT_KHR) {
4171          result = VK_PIPELINE_COMPILE_REQUIRED;
4172          goto fail;
4173       }
4174 
4175       struct ir3_shader_key ir3_key = {};
4176 
4177       nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, stage_info,
4178                                         MESA_SHADER_COMPUTE);
4179 
4180       nir_initial_disasm = executable_info ?
4181          nir_shader_as_str(nir, pipeline->base.executables_mem_ctx) : NULL;
4182 
4183       result = tu_shader_create(dev, &shader, nir, &key, &ir3_key,
4184                                 pipeline_sha1, sizeof(pipeline_sha1), layout,
4185                                 executable_info);
4186       if (!shader) {
4187          goto fail;
4188       }
4189 
4190       shader = tu_pipeline_cache_insert(cache, shader);
4191    }
4192 
4193    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4194 
4195    if (creation_feedback) {
4196       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
4197       assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
4198       creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
4199    }
4200 
4201    pipeline->base.active_desc_sets = shader->active_desc_sets;
4202 
4203    v = shader->variant;
4204 
4205    tu_pipeline_set_linkage(&pipeline->base.program.link[MESA_SHADER_COMPUTE],
4206                            &shader->const_state, v);
4207 
4208    result = tu_pipeline_allocate_cs(dev, &pipeline->base, layout, NULL, v);
4209    if (result != VK_SUCCESS)
4210       goto fail;
4211 
4212    for (int i = 0; i < 3; i++)
4213       pipeline->local_size[i] = v->local_size[i];
4214 
4215    if (CHIP == A6XX) {
4216       tu6_emit_load_state(dev, &pipeline->base, layout);
4217    }
4218 
4219    tu_append_executable(&pipeline->base, v, nir_initial_disasm);
4220 
4221    pipeline->instrlen = v->instrlen;
4222 
4223    pipeline->base.shaders[MESA_SHADER_COMPUTE] = shader;
4224 
4225    ralloc_free(pipeline_mem_ctx);
4226 
4227    *pPipeline = tu_pipeline_to_handle(&pipeline->base);
4228 
4229    return VK_SUCCESS;
4230 
4231 fail:
4232    if (shader)
4233       vk_pipeline_cache_object_unref(&dev->vk, &shader->base);
4234 
4235    ralloc_free(pipeline_mem_ctx);
4236 
4237    vk_object_free(&dev->vk, pAllocator, pipeline);
4238 
4239    return result;
4240 }
4241 
4242 template <chip CHIP>
4243 VKAPI_ATTR VkResult VKAPI_CALL
4244 tu_CreateComputePipelines(VkDevice device,
4245                           VkPipelineCache pipelineCache,
4246                           uint32_t count,
4247                           const VkComputePipelineCreateInfo *pCreateInfos,
4248                           const VkAllocationCallbacks *pAllocator,
4249                           VkPipeline *pPipelines)
4250 {
4251    MESA_TRACE_FUNC();
4252    VkResult final_result = VK_SUCCESS;
4253    uint32_t i = 0;
4254 
4255    for (; i < count; i++) {
4256       VkPipelineCreateFlags2KHR flags =
4257          vk_compute_pipeline_create_flags(&pCreateInfos[i]);
4258 
4259       VkResult result =
4260          tu_compute_pipeline_create<CHIP>(device, pipelineCache,
4261                                           &pCreateInfos[i], flags,
4262                                           pAllocator, &pPipelines[i]);
4263       if (result != VK_SUCCESS) {
4264          final_result = result;
4265          pPipelines[i] = VK_NULL_HANDLE;
4266 
4267          if (flags &
4268              VK_PIPELINE_CREATE_2_EARLY_RETURN_ON_FAILURE_BIT_KHR)
4269             break;
4270       }
4271    }
4272 
4273    for (; i < count; i++)
4274       pPipelines[i] = VK_NULL_HANDLE;
4275 
4276    return final_result;
4277 }
4278 TU_GENX(tu_CreateComputePipelines);
4279 
4280 VKAPI_ATTR void VKAPI_CALL
4281 tu_DestroyPipeline(VkDevice _device,
4282                    VkPipeline _pipeline,
4283                    const VkAllocationCallbacks *pAllocator)
4284 {
4285    TU_FROM_HANDLE(tu_device, dev, _device);
4286    TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
4287 
4288    if (!_pipeline)
4289       return;
4290 
4291    tu_pipeline_finish(pipeline, dev, pAllocator);
4292    vk_object_free(&dev->vk, pAllocator, pipeline);
4293 }
4294 
4295 #define WRITE_STR(field, ...) ({                                \
4296    memset(field, 0, sizeof(field));                             \
4297    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
4298    assert(_i > 0 && _i < sizeof(field));                        \
4299 })
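
/* WRITE_STR() zero-fills the destination and asserts that the formatted text
 * is non-empty and was not truncated; the ({ ... }) wrapper is a GNU C
 * statement expression.  Typical use:
 *
 *    WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
 */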
4300 
4301 static const struct tu_pipeline_executable *
4302 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
4303 {
4304    assert(index < util_dynarray_num_elements(&pipeline->executables,
4305                                              struct tu_pipeline_executable));
4306    return util_dynarray_element(
4307       &pipeline->executables, struct tu_pipeline_executable, index);
4308 }
4309 
4310 VKAPI_ATTR VkResult VKAPI_CALL
4311 tu_GetPipelineExecutablePropertiesKHR(
4312       VkDevice _device,
4313       const VkPipelineInfoKHR* pPipelineInfo,
4314       uint32_t* pExecutableCount,
4315       VkPipelineExecutablePropertiesKHR* pProperties)
4316 {
4317    TU_FROM_HANDLE(tu_device, dev, _device);
4318    TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
4319    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
4320                           pProperties, pExecutableCount);
4321 
4322    util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
4323       vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
4324          gl_shader_stage stage = exe->stage;
4325          props->stages = mesa_to_vk_shader_stage(stage);
4326 
4327          if (!exe->is_binning)
4328             WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
4329          else
4330             WRITE_STR(props->name, "Binning VS");
4331 
4332          WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
4333 
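         /* Wave size: threadsize_base (64 on current Adreno GPUs), doubled to
          * 128 when the variant was compiled with double threadsize.
          */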
4334          props->subgroupSize =
4335             dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
4336       }
4337    }
4338 
4339    return vk_outarray_status(&out);
4340 }
4341 
4342 VKAPI_ATTR VkResult VKAPI_CALL
4343 tu_GetPipelineExecutableStatisticsKHR(
4344       VkDevice _device,
4345       const VkPipelineExecutableInfoKHR* pExecutableInfo,
4346       uint32_t* pStatisticCount,
4347       VkPipelineExecutableStatisticKHR* pStatistics)
4348 {
4349    TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4350    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
4351                           pStatistics, pStatisticCount);
4352 
4353    const struct tu_pipeline_executable *exe =
4354       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4355 
4356    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4357       WRITE_STR(stat->name, "Max Waves Per Core");
4358       WRITE_STR(stat->description,
4359                 "Maximum number of simultaneous waves per core.");
4360       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4361       stat->value.u64 = exe->stats.max_waves;
4362    }
4363 
4364    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4365       WRITE_STR(stat->name, "Instruction Count");
4366       WRITE_STR(stat->description,
4367                 "Total number of IR3 instructions in the final generated "
4368                 "shader executable.");
4369       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4370       stat->value.u64 = exe->stats.instrs_count;
4371    }
4372 
4373    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4374       WRITE_STR(stat->name, "Code size");
4375       WRITE_STR(stat->description,
4376                 "Total number of dwords in the final generated "
4377                 "shader executable.");
4378       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4379       stat->value.u64 = exe->stats.sizedwords;
4380    }
4381 
4382    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4383       WRITE_STR(stat->name, "NOPs Count");
4384       WRITE_STR(stat->description,
4385                 "Number of NOP instructions in the final generated "
4386                 "shader executable.");
4387       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4388       stat->value.u64 = exe->stats.nops_count;
4389    }
4390 
4391    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4392       WRITE_STR(stat->name, "MOV Count");
4393       WRITE_STR(stat->description,
4394                 "Number of MOV instructions in the final generated "
4395                 "shader executable.");
4396       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4397       stat->value.u64 = exe->stats.mov_count;
4398    }
4399 
4400    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4401       WRITE_STR(stat->name, "COV Count");
4402       WRITE_STR(stat->description,
4403                 "Number of COV instructions in the final generated "
4404                 "shader executable.");
4405       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4406       stat->value.u64 = exe->stats.cov_count;
4407    }
4408 
4409    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4410       WRITE_STR(stat->name, "Registers used");
4411       WRITE_STR(stat->description,
4412                 "Number of registers used in the final generated "
4413                 "shader executable.");
4414       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4415       stat->value.u64 = exe->stats.max_reg + 1;
4416    }
4417 
4418    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4419       WRITE_STR(stat->name, "Half-registers used");
4420       WRITE_STR(stat->description,
4421                 "Number of half-registers used in the final generated "
4422                 "shader executable.");
4423       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4424       stat->value.u64 = exe->stats.max_half_reg + 1;
4425    }
4426 
4427    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4428       WRITE_STR(stat->name, "Last interpolation instruction");
4429       WRITE_STR(stat->description,
4430                 "The instruction where varying storage in Local Memory is released");
4431       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4432       stat->value.u64 = exe->stats.last_baryf;
4433    }
4434 
4435    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4436       WRITE_STR(stat->name, "Last helper instruction");
4437       WRITE_STR(stat->description,
4438                 "The instruction where helper invocations are killed");
4439       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4440       stat->value.u64 = exe->stats.last_helper;
4441    }
4442 
4443    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4444       WRITE_STR(stat->name, "Instructions with SS sync bit");
4445       WRITE_STR(stat->description,
4446                 "SS bit is set for instructions which depend on a result "
4447                 "of \"long\" instructions to prevent RAW hazard.");
4448       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4449       stat->value.u64 = exe->stats.ss;
4450    }
4451 
4452    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4453       WRITE_STR(stat->name, "Instructions with SY sync bit");
4454       WRITE_STR(stat->description,
4455                 "SY bit is set for instructions which depend on a result "
4456                 "of loads from global memory to prevent RAW hazard.");
4457       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4458       stat->value.u64 = exe->stats.sy;
4459    }
4460 
4461    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4462       WRITE_STR(stat->name, "Estimated cycles stalled on SS");
4463       WRITE_STR(stat->description,
4464                 "A better metric to estimate the impact of SS syncs.");
4465       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4466       stat->value.u64 = exe->stats.sstall;
4467    }
4468 
4469    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4470       WRITE_STR(stat->name, "Estimated cycles stalled on SY");
4471       WRITE_STR(stat->description,
4472                 "A better metric to estimate the impact of SY syncs.");
4473       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4474       stat->value.u64 = exe->stats.systall;
4475    }
4476 
4477    for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
4478       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4479          WRITE_STR(stat->name, "cat%d instructions", i);
4480          WRITE_STR(stat->description,
4481                   "Number of cat%d instructions.", i);
4482          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4483          stat->value.u64 = exe->stats.instrs_per_cat[i];
4484       }
4485    }
4486 
4487    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4488       WRITE_STR(stat->name, "STP Count");
4489       WRITE_STR(stat->description,
4490                 "Number of STore Private instructions in the final generated "
4491                 "shader executable.");
4492       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4493       stat->value.u64 = exe->stats.stp_count;
4494    }
4495 
4496    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4497       WRITE_STR(stat->name, "LDP Count");
4498       WRITE_STR(stat->description,
4499                 "Number of LoaD Private instructions in the final generated "
4500                 "shader executable.");
4501       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4502       stat->value.u64 = exe->stats.ldp_count;
4503    }
4504 
4505    return vk_outarray_status(&out);
4506 }
4507 
4508 static bool
4509 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
4510               const char *data)
4511 {
4512    ir->isText = VK_TRUE;
4513 
4514    size_t data_len = strlen(data) + 1;
4515 
4516    if (ir->pData == NULL) {
4517       ir->dataSize = data_len;
4518       return true;
4519    }
4520 
4521    strncpy((char *) ir->pData, data, ir->dataSize);
4522    if (ir->dataSize < data_len)
4523       return false;
4524 
4525    ir->dataSize = data_len;
4526    return true;
4527 }
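
/* Standard Vulkan two-call idiom: when pData is NULL only the required size is
 * written back; otherwise the text is copied into the caller's buffer and a
 * false return flags that the buffer was too small, which the caller below
 * turns into VK_INCOMPLETE.
 */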
4528 
4529 VKAPI_ATTR VkResult VKAPI_CALL
4530 tu_GetPipelineExecutableInternalRepresentationsKHR(
4531     VkDevice _device,
4532     const VkPipelineExecutableInfoKHR* pExecutableInfo,
4533     uint32_t* pInternalRepresentationCount,
4534     VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
4535 {
4536    TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4537    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
4538                           pInternalRepresentations, pInternalRepresentationCount);
4539    bool incomplete_text = false;
4540 
4541    const struct tu_pipeline_executable *exe =
4542       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4543 
4544    if (exe->nir_from_spirv) {
4545       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4546          WRITE_STR(ir->name, "NIR from SPIRV");
4547          WRITE_STR(ir->description,
4548                    "Initial NIR before any optimizations");
4549 
4550          if (!write_ir_text(ir, exe->nir_from_spirv))
4551             incomplete_text = true;
4552       }
4553    }
4554 
4555    if (exe->nir_final) {
4556       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4557          WRITE_STR(ir->name, "Final NIR");
4558          WRITE_STR(ir->description,
4559                    "Final NIR before going into the back-end compiler");
4560 
4561          if (!write_ir_text(ir, exe->nir_final))
4562             incomplete_text = true;
4563       }
4564    }
4565 
4566    if (exe->disasm) {
4567       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4568          WRITE_STR(ir->name, "IR3 Assembly");
4569          WRITE_STR(ir->description,
4570                    "Final IR3 assembly for the generated shader binary");
4571 
4572          if (!write_ir_text(ir, exe->disasm))
4573             incomplete_text = true;
4574       }
4575    }
4576 
4577    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
4578 }
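
/* Note that VK_INCOMPLETE is returned both when the caller provided too few
 * pInternalRepresentations entries and when any of the returned IR texts had
 * to be truncated by write_ir_text().
 */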
4579