/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#include "tu_pipeline.h"

#include "common/freedreno_guardband.h"

#include "ir3/ir3_nir.h"
#include "main/menums.h"
#include "nir/nir.h"
#include "nir/nir_builder.h"
#include "spirv/nir_spirv.h"
#include "util/debug.h"
#include "util/mesa-sha1.h"
#include "vk_pipeline.h"
#include "vk_render_pass.h"
#include "vk_util.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_formats.h"
#include "tu_lrz.h"
#include "tu_pass.h"

/* Emit IB that preloads the descriptors that the shader uses */

static void
emit_load_state(struct tu_cs *cs, unsigned opcode, enum a6xx_state_type st,
                enum a6xx_state_block sb, unsigned base, unsigned offset,
                unsigned count)
{
   /* Note: just emit one packet, even if count overflows NUM_UNIT. It's not
    * clear if emitting more packets will even help anything. Presumably the
    * descriptor cache is relatively small, and these packets stop doing
    * anything when there are too many descriptors.
    */
   tu_cs_emit_pkt7(cs, opcode, 3);
   tu_cs_emit(cs,
              CP_LOAD_STATE6_0_STATE_TYPE(st) |
              CP_LOAD_STATE6_0_STATE_SRC(SS6_BINDLESS) |
              CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
              CP_LOAD_STATE6_0_NUM_UNIT(MIN2(count, 1024-1)));
   tu_cs_emit_qw(cs, offset | (base << 28));
}

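/* emit_load_state() produces 4-dword packets (a pkt7 header plus three
 * payload dwords), so the count returned here sizes the sub-stream that
 * tu6_emit_load_state() fills below.
 */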
static unsigned
tu6_load_state_size(struct tu_pipeline *pipeline,
                    struct tu_pipeline_layout *layout, bool compute)
{
   const unsigned load_state_size = 4;
   unsigned size = 0;
   for (unsigned i = 0; i < layout->num_sets; i++) {
      if (!(pipeline->active_desc_sets & (1u << i)))
         continue;

      struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
      for (unsigned j = 0; j < set_layout->binding_count; j++) {
         struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
         unsigned count = 0;
         /* Note: some users, like amber for example, pass in
          * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
          * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
          */
         VkShaderStageFlags stages = compute ?
            binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
            binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
         unsigned stage_count = util_bitcount(stages);

         if (!binding->array_size)
            continue;

         switch (binding->type) {
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
            /* IBO-backed resources only need one packet for all graphics stages */
            if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT)
               count += 1;
            if (stages & VK_SHADER_STAGE_COMPUTE_BIT)
               count += 1;
            break;
         case VK_DESCRIPTOR_TYPE_SAMPLER:
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
            /* Textures and UBOs need a packet for each stage */
            count = stage_count;
            break;
         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
            /* Because of how we pack combined images and samplers, we
             * currently can't use one packet for the whole array.
             */
            count = stage_count * binding->array_size * 2;
            break;
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
            break;
         default:
            unreachable("bad descriptor type");
         }
         size += count * load_state_size;
      }
   }
   return size;
}

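/* Fill the pipeline's descriptor-prefetch IB: one CP_LOAD_STATE6 packet per
 * descriptor range counted above, saved as a draw state in
 * pipeline->load_state.
 */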
static void
tu6_emit_load_state(struct tu_pipeline *pipeline,
                    struct tu_pipeline_layout *layout, bool compute)
{
   unsigned size = tu6_load_state_size(pipeline, layout, compute);
   if (size == 0)
      return;

   struct tu_cs cs;
   tu_cs_begin_sub_stream(&pipeline->cs, size, &cs);

   for (unsigned i = 0; i < layout->num_sets; i++) {
      /* From 13.2.7. Descriptor Set Binding:
       *
       *    A compatible descriptor set must be bound for all set numbers that
       *    any shaders in a pipeline access, at the time that a draw or
       *    dispatch command is recorded to execute using that pipeline.
       *    However, if none of the shaders in a pipeline statically use any
       *    bindings with a particular set number, then no descriptor set need
       *    be bound for that set number, even if the pipeline layout includes
       *    a non-trivial descriptor set layout for that set number.
       *
       * This means that descriptor sets unused by the pipeline may have a
       * garbage or 0 BINDLESS_BASE register, which will cause context faults
       * when prefetching descriptors from these sets. Skip prefetching for
       * descriptors from them to avoid this. This is also an optimization,
       * since these prefetches would be useless.
       */
      if (!(pipeline->active_desc_sets & (1u << i)))
         continue;

      struct tu_descriptor_set_layout *set_layout = layout->set[i].layout;
      for (unsigned j = 0; j < set_layout->binding_count; j++) {
         struct tu_descriptor_set_binding_layout *binding = &set_layout->binding[j];
         unsigned base = i;
         unsigned offset = binding->offset / 4;
         /* Note: some users, like amber for example, pass in
          * VK_SHADER_STAGE_ALL which includes a bunch of extra bits, so
          * filter these out by using VK_SHADER_STAGE_ALL_GRAPHICS explicitly.
          */
         VkShaderStageFlags stages = compute ?
            binding->shader_stages & VK_SHADER_STAGE_COMPUTE_BIT :
            binding->shader_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
         unsigned count = binding->array_size;

         /* If this is a variable-count descriptor, then the array_size is an
          * upper bound on the size, but we don't know how many descriptors
          * will actually be used. Therefore we can't pre-load them here.
          */
         if (j == set_layout->binding_count - 1 &&
             set_layout->has_variable_descriptors)
            continue;

         if (count == 0 || stages == 0)
            continue;
         switch (binding->type) {
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC:
            base = MAX_SETS;
            offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) / 4;
            FALLTHROUGH;
         case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
         case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE:
         case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: {
            unsigned mul = binding->size / (A6XX_TEX_CONST_DWORDS * 4);
            /* IBO-backed resources only need one packet for all graphics stages */
            if (stages & ~VK_SHADER_STAGE_COMPUTE_BIT) {
               emit_load_state(&cs, CP_LOAD_STATE6, ST6_SHADER, SB6_IBO,
                               base, offset, count * mul);
            }
            if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
               emit_load_state(&cs, CP_LOAD_STATE6_FRAG, ST6_IBO, SB6_CS_SHADER,
                               base, offset, count * mul);
            }
            break;
         }
         case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT:
         case VK_DESCRIPTOR_TYPE_MUTABLE_VALVE:
            /* nothing - input attachment doesn't use bindless */
            break;
         case VK_DESCRIPTOR_TYPE_SAMPLER:
         case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
         case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: {
            tu_foreach_stage(stage, stages) {
               emit_load_state(&cs, tu6_stage2opcode(stage),
                               binding->type == VK_DESCRIPTOR_TYPE_SAMPLER ?
                               ST6_SHADER : ST6_CONSTANTS,
                               tu6_stage2texsb(stage), base, offset, count);
            }
            break;
         }
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
            base = MAX_SETS;
            offset = (layout->set[i].dynamic_offset_start +
                      binding->dynamic_offset_offset) / 4;
            FALLTHROUGH;
         case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: {
            tu_foreach_stage(stage, stages) {
               emit_load_state(&cs, tu6_stage2opcode(stage), ST6_UBO,
                               tu6_stage2shadersb(stage), base, offset, count);
            }
            break;
         }
         case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: {
            tu_foreach_stage(stage, stages) {
               /* TODO: We could emit fewer CP_LOAD_STATE6 packets if we used
                * struct-of-arrays instead of array-of-structs.
                */
               for (unsigned i = 0; i < count; i++) {
                  unsigned tex_offset = offset + 2 * i * A6XX_TEX_CONST_DWORDS;
                  unsigned sam_offset = offset + (2 * i + 1) * A6XX_TEX_CONST_DWORDS;
                  emit_load_state(&cs, tu6_stage2opcode(stage),
                                  ST6_CONSTANTS, tu6_stage2texsb(stage),
                                  base, tex_offset, 1);
                  emit_load_state(&cs, tu6_stage2opcode(stage),
                                  ST6_SHADER, tu6_stage2texsb(stage),
                                  base, sam_offset, 1);
               }
            }
            break;
         }
         default:
            unreachable("bad descriptor type");
         }
      }
   }

   pipeline->load_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
}

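/* Transient state used while turning a VkGraphicsPipelineCreateInfo into a
 * tu_pipeline; it only lives for the duration of pipeline creation.
 */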
struct tu_pipeline_builder
{
   struct tu_device *device;
   void *mem_ctx;
   struct vk_pipeline_cache *cache;
   struct tu_pipeline_layout *layout;
   const VkAllocationCallbacks *alloc;
   const VkGraphicsPipelineCreateInfo *create_info;

   struct tu_compiled_shaders *shaders;
   struct ir3_shader_variant *binning_variant;
   uint64_t shader_iova[MESA_SHADER_FRAGMENT + 1];
   uint64_t binning_vs_iova;

   uint32_t additional_cs_reserve_size;

   struct tu_pvtmem_config pvtmem;

   bool rasterizer_discard;
   /* these states are affected by rasterizer_discard */
   bool emit_msaa_state;
   bool depth_clip_disable;
   VkSampleCountFlagBits samples;
   bool use_color_attachments;
   bool use_dual_src_blend;
   bool alpha_to_coverage;
   uint32_t color_attachment_count;
   VkFormat color_attachment_formats[MAX_RTS];
   VkFormat depth_attachment_format;
   uint32_t render_components;
   uint32_t multiview_mask;

   bool subpass_raster_order_attachment_access;
   bool subpass_feedback_loop_color;
   bool subpass_feedback_loop_ds;
};

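/* Logic ops whose result is independent of the destination value don't
 * require reading back the color attachment.
 */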
static bool
tu_logic_op_reads_dst(VkLogicOp op)
{
   switch (op) {
   case VK_LOGIC_OP_CLEAR:
   case VK_LOGIC_OP_COPY:
   case VK_LOGIC_OP_COPY_INVERTED:
   case VK_LOGIC_OP_SET:
      return false;
   default:
      return true;
   }
}

static VkBlendFactor
tu_blend_factor_no_dst_alpha(VkBlendFactor factor)
{
   /* treat dst alpha as 1.0 and avoid reading it */
   switch (factor) {
   case VK_BLEND_FACTOR_DST_ALPHA:
      return VK_BLEND_FACTOR_ONE;
   case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
      return VK_BLEND_FACTOR_ZERO;
   default:
      return factor;
   }
}

static bool tu_blend_factor_is_dual_src(VkBlendFactor factor)
{
   switch (factor) {
   case VK_BLEND_FACTOR_SRC1_COLOR:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR:
   case VK_BLEND_FACTOR_SRC1_ALPHA:
   case VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA:
      return true;
   default:
      return false;
   }
}

static bool
tu_blend_state_is_dual_src(const VkPipelineColorBlendStateCreateInfo *info)
{
   if (!info)
      return false;

   for (unsigned i = 0; i < info->attachmentCount; i++) {
      const VkPipelineColorBlendAttachmentState *blend = &info->pAttachments[i];
      if (tu_blend_factor_is_dual_src(blend->srcColorBlendFactor) ||
          tu_blend_factor_is_dual_src(blend->dstColorBlendFactor) ||
          tu_blend_factor_is_dual_src(blend->srcAlphaBlendFactor) ||
          tu_blend_factor_is_dual_src(blend->dstAlphaBlendFactor))
         return true;
   }

   return false;
}

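/* Every shader stage has a parallel bank of SP/HLSQ registers. This table
 * maps a gl_shader_stage to its per-stage register offsets so the emit
 * helpers below can be written once, against the VS variants of the
 * bitfield macros.
 */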
static const struct xs_config {
   uint16_t reg_sp_xs_ctrl;
   uint16_t reg_sp_xs_config;
   uint16_t reg_sp_xs_instrlen;
   uint16_t reg_hlsq_xs_ctrl;
   uint16_t reg_sp_xs_first_exec_offset;
   uint16_t reg_sp_xs_pvt_mem_hw_stack_offset;
} xs_config[] = {
   [MESA_SHADER_VERTEX] = {
      REG_A6XX_SP_VS_CTRL_REG0,
      REG_A6XX_SP_VS_CONFIG,
      REG_A6XX_SP_VS_INSTRLEN,
      REG_A6XX_HLSQ_VS_CNTL,
      REG_A6XX_SP_VS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_TESS_CTRL] = {
      REG_A6XX_SP_HS_CTRL_REG0,
      REG_A6XX_SP_HS_CONFIG,
      REG_A6XX_SP_HS_INSTRLEN,
      REG_A6XX_HLSQ_HS_CNTL,
      REG_A6XX_SP_HS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_HS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_TESS_EVAL] = {
      REG_A6XX_SP_DS_CTRL_REG0,
      REG_A6XX_SP_DS_CONFIG,
      REG_A6XX_SP_DS_INSTRLEN,
      REG_A6XX_HLSQ_DS_CNTL,
      REG_A6XX_SP_DS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_DS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_GEOMETRY] = {
      REG_A6XX_SP_GS_CTRL_REG0,
      REG_A6XX_SP_GS_CONFIG,
      REG_A6XX_SP_GS_INSTRLEN,
      REG_A6XX_HLSQ_GS_CNTL,
      REG_A6XX_SP_GS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_GS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_FRAGMENT] = {
      REG_A6XX_SP_FS_CTRL_REG0,
      REG_A6XX_SP_FS_CONFIG,
      REG_A6XX_SP_FS_INSTRLEN,
      REG_A6XX_HLSQ_FS_CNTL,
      REG_A6XX_SP_FS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_FS_PVT_MEM_HW_STACK_OFFSET,
   },
   [MESA_SHADER_COMPUTE] = {
      REG_A6XX_SP_CS_CTRL_REG0,
      REG_A6XX_SP_CS_CONFIG,
      REG_A6XX_SP_CS_INSTRLEN,
      REG_A6XX_HLSQ_CS_CNTL,
      REG_A6XX_SP_CS_OBJ_FIRST_EXEC_OFFSET,
      REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET,
   },
};

static uint32_t
tu_xs_get_immediates_packet_size_dwords(const struct ir3_shader_variant *xs)
{
   const struct ir3_const_state *const_state = ir3_const_state(xs);
   uint32_t base = const_state->offsets.immediate;
   int32_t size = DIV_ROUND_UP(const_state->immediates_count, 4);

   /* truncate size to avoid writing constants that the shader
    * does not use:
    */
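   /* (for example, with base == 8 and 24 immediates, i.e. 6 vec4s, a
    * constlen of 12 leaves room for only 4 vec4s, so 16 dwords are written)
    */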
   size = MIN2(size + base, xs->constlen) - base;

   return MAX2(size, 0) * 4;
}

/* We allocate fixed-length substreams for shader state, however some
 * parts of the state may have unbounded length. Their additional space
 * requirements should be calculated here.
 */
static uint32_t
tu_xs_get_additional_cs_size_dwords(const struct ir3_shader_variant *xs)
{
   const struct ir3_const_state *const_state = ir3_const_state(xs);

   uint32_t size = tu_xs_get_immediates_packet_size_dwords(xs);

   /* Variable number of UBO upload ranges. */
   size += 4 * const_state->ubo_state.num_enabled;

   /* Variable number of dwords for the primitive map */
   size += xs->input_size;

   size += xs->constant_data_size / 4;

   return size;
}

void
tu6_emit_xs_config(struct tu_cs *cs,
                   gl_shader_stage stage, /* xs->type, but xs may be NULL */
                   const struct ir3_shader_variant *xs)
{
   const struct xs_config *cfg = &xs_config[stage];

   if (!xs) {
      /* shader stage disabled */
      tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
      tu_cs_emit(cs, 0);

      tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
      tu_cs_emit(cs, 0);
      return;
   }

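   /* The per-stage CONFIG and HLSQ CNTL registers share one bitfield layout
    * across stages, which is why the VS macros are used for every stage
    * below.
    */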
   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_config, 1);
   tu_cs_emit(cs, A6XX_SP_VS_CONFIG_ENABLED |
                  COND(xs->bindless_tex, A6XX_SP_VS_CONFIG_BINDLESS_TEX) |
                  COND(xs->bindless_samp, A6XX_SP_VS_CONFIG_BINDLESS_SAMP) |
                  COND(xs->bindless_ibo, A6XX_SP_VS_CONFIG_BINDLESS_IBO) |
                  COND(xs->bindless_ubo, A6XX_SP_VS_CONFIG_BINDLESS_UBO) |
                  A6XX_SP_VS_CONFIG_NTEX(xs->num_samp) |
                  A6XX_SP_VS_CONFIG_NSAMP(xs->num_samp));

   tu_cs_emit_pkt4(cs, cfg->reg_hlsq_xs_ctrl, 1);
   tu_cs_emit(cs, A6XX_HLSQ_VS_CNTL_CONSTLEN(xs->constlen) |
                  A6XX_HLSQ_VS_CNTL_ENABLED);
}

void
tu6_emit_xs(struct tu_cs *cs,
            gl_shader_stage stage, /* xs->type, but xs may be NULL */
            const struct ir3_shader_variant *xs,
            const struct tu_pvtmem_config *pvtmem,
            uint64_t binary_iova)
{
   const struct xs_config *cfg = &xs_config[stage];

   if (!xs) {
      /* shader stage disabled */
      return;
   }

   enum a6xx_threadsize thrsz =
      xs->info.double_threadsize ? THREAD128 : THREAD64;
   switch (stage) {
   case MESA_SHADER_VERTEX:
      tu_cs_emit_regs(cs, A6XX_SP_VS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
               .mergedregs = xs->mergedregs,
      ));
      break;
   case MESA_SHADER_TESS_CTRL:
      tu_cs_emit_regs(cs, A6XX_SP_HS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
      ));
      break;
   case MESA_SHADER_TESS_EVAL:
      tu_cs_emit_regs(cs, A6XX_SP_DS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
      ));
      break;
   case MESA_SHADER_GEOMETRY:
      tu_cs_emit_regs(cs, A6XX_SP_GS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
      ));
      break;
   case MESA_SHADER_FRAGMENT:
      tu_cs_emit_regs(cs, A6XX_SP_FS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
               .mergedregs = xs->mergedregs,
               .threadsize = thrsz,
               .pixlodenable = xs->need_pixlod,
               .diff_fine = xs->need_fine_derivatives,
               .varying = xs->total_in != 0,
               /* unknown bit, seems unnecessary */
               .unk24 = true,
      ));
      break;
   case MESA_SHADER_COMPUTE:
      tu_cs_emit_regs(cs, A6XX_SP_CS_CTRL_REG0(
               .fullregfootprint = xs->info.max_reg + 1,
               .halfregfootprint = xs->info.max_half_reg + 1,
               .branchstack = ir3_shader_branchstack_hw(xs),
               .mergedregs = xs->mergedregs,
               .threadsize = thrsz,
      ));
      break;
   default:
      unreachable("bad shader stage");
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_instrlen, 1);
   tu_cs_emit(cs, xs->instrlen);

   /* emit program binary & private memory layout
    * binary_iova should be aligned to 1 instrlen unit (128 bytes)
    */

   assert((binary_iova & 0x7f) == 0);
   assert((pvtmem->iova & 0x1f) == 0);

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_first_exec_offset, 7);
   tu_cs_emit(cs, 0);
   tu_cs_emit_qw(cs, binary_iova);
   tu_cs_emit(cs,
              A6XX_SP_VS_PVT_MEM_PARAM_MEMSIZEPERITEM(pvtmem->per_fiber_size));
   tu_cs_emit_qw(cs, pvtmem->iova);
   tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_SIZE_TOTALPVTMEMSIZE(pvtmem->per_sp_size) |
                  COND(pvtmem->per_wave, A6XX_SP_VS_PVT_MEM_SIZE_PERWAVEMEMLAYOUT));

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_pvt_mem_hw_stack_offset, 1);
   tu_cs_emit(cs, A6XX_SP_VS_PVT_MEM_HW_STACK_OFFSET_OFFSET(pvtmem->per_sp_size));

   uint32_t shader_preload_size =
      MIN2(xs->instrlen, cs->device->physical_device->info->a6xx.instr_cache_size);

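   /* Preload as much of the binary as fits in the instruction cache; the
    * remainder is presumably fetched on demand during execution.
    */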
   tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                  CP_LOAD_STATE6_0_NUM_UNIT(shader_preload_size));
   tu_cs_emit_qw(cs, binary_iova);

   /* emit immediates */

   const struct ir3_const_state *const_state = ir3_const_state(xs);
   uint32_t base = const_state->offsets.immediate;
   unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);

   if (immediate_size > 0) {
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(immediate_size / 4));
      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));

      tu_cs_emit_array(cs, const_state->immediates, immediate_size);
   }

   if (const_state->constant_data_ubo != -1) {
      uint64_t iova = binary_iova + xs->info.constant_data_offset;

      /* Upload UBO state for the constant data. */
      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
      tu_cs_emit(cs,
                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(1));
      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
      int size_vec4s = DIV_ROUND_UP(xs->constant_data_size, 16);
      tu_cs_emit_qw(cs,
                    iova |
                    (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32);

      /* Upload the constant data to the const file if needed. */
      const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;

      for (int i = 0; i < ubo_state->num_enabled; i++) {
         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
             ubo_state->range[i].ubo.bindless) {
            continue;
         }

         uint32_t start = ubo_state->range[i].start;
         uint32_t end = ubo_state->range[i].end;
         uint32_t size = MIN2(end - start,
                              (16 * xs->constlen) - ubo_state->range[i].offset);

         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
         tu_cs_emit(cs,
                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
         tu_cs_emit_qw(cs, iova + start);
      }
   }

   /* emit FS driver param */
   if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
      uint32_t base = const_state->offsets.driver_param;
      int32_t size = DIV_ROUND_UP(const_state->num_driver_params, 4);
      size = MAX2(MIN2(size + base, xs->constlen) - base, 0);

      if (size > 0) {
         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + size * 4);
         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(size));
         tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
         tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));

         /* the only FS driver param emitted here is the subgroup size,
          * padded out to a vec4
          */
         assert(size == 1);
         tu_cs_emit(cs, xs->info.double_threadsize ? 128 : 64);
         tu_cs_emit(cs, 0);
         tu_cs_emit(cs, 0);
         tu_cs_emit(cs, 0);
      }
   }
}

static void
tu6_emit_shared_consts_enable(struct tu_cs *cs, bool enable)
{
   /* Enable/disable shared constants */
   tu_cs_emit_regs(cs, A6XX_HLSQ_SHARED_CONSTS(.enable = enable));
   tu_cs_emit_regs(cs, A6XX_SP_MODE_CONTROL(.constant_demotion_enable = true,
                                            .isammode = ISAMMODE_GL,
                                            .shared_consts_enable = enable));
}

static void
tu6_emit_cs_config(struct tu_cs *cs,
                   const struct ir3_shader_variant *v,
                   const struct tu_pvtmem_config *pvtmem,
                   uint64_t binary_iova)
{
   bool shared_consts_enable = ir3_const_state(v)->shared_consts_enable;
   tu6_emit_shared_consts_enable(cs, shared_consts_enable);

   tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
         .cs_state = true,
         .cs_ibo = true,
         .cs_shared_const = shared_consts_enable));

   tu6_emit_xs_config(cs, MESA_SHADER_COMPUTE, v);
   tu6_emit_xs(cs, MESA_SHADER_COMPUTE, v, pvtmem, binary_iova);

   uint32_t shared_size = MAX2(((int)v->shared_size - 1) / 1024, 1);
   tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_UNKNOWN_A9B1, 1);
   tu_cs_emit(cs, A6XX_SP_CS_UNKNOWN_A9B1_SHARED_SIZE(shared_size) |
                  A6XX_SP_CS_UNKNOWN_A9B1_UNK6);

   if (cs->device->physical_device->info->a6xx.has_lpac) {
      tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_UNKNOWN_B9D0, 1);
      tu_cs_emit(cs, A6XX_HLSQ_CS_UNKNOWN_B9D0_SHARED_SIZE(shared_size) |
                     A6XX_HLSQ_CS_UNKNOWN_B9D0_UNK6);
   }

   uint32_t local_invocation_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
   uint32_t work_group_id =
      ir3_find_sysval_regid(v, SYSTEM_VALUE_WORKGROUP_ID);

   enum a6xx_threadsize thrsz = v->info.double_threadsize ? THREAD128 : THREAD64;
   tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CS_CNTL_0, 2);
   tu_cs_emit(cs,
              A6XX_HLSQ_CS_CNTL_0_WGIDCONSTID(work_group_id) |
              A6XX_HLSQ_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
              A6XX_HLSQ_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
   tu_cs_emit(cs, A6XX_HLSQ_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                  A6XX_HLSQ_CS_CNTL_1_THREADSIZE(thrsz));

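   /* On a6xx variants with LPAC ("low priority async compute", apparently a
    * separate compute pipe), the SP has its own copy of these CS control
    * registers.
    */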
   if (cs->device->physical_device->info->a6xx.has_lpac) {
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_CS_CNTL_0, 2);
      tu_cs_emit(cs,
                 A6XX_SP_CS_CNTL_0_WGIDCONSTID(work_group_id) |
                 A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
                 A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
                 A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
      tu_cs_emit(cs, A6XX_SP_CS_CNTL_1_LINEARLOCALIDREGID(regid(63, 0)) |
                     A6XX_SP_CS_CNTL_1_THREADSIZE(thrsz));
   }
}

static void
tu6_emit_vs_system_values(struct tu_cs *cs,
                          const struct ir3_shader_variant *vs,
                          const struct ir3_shader_variant *hs,
                          const struct ir3_shader_variant *ds,
                          const struct ir3_shader_variant *gs,
                          bool primid_passthru)
{
   /* regid(63, 0) is the "invalid register" value, used below for sysvals
    * that a stage doesn't read.
    */
   const uint32_t vertexid_regid =
         ir3_find_sysval_regid(vs, SYSTEM_VALUE_VERTEX_ID);
   const uint32_t instanceid_regid =
         ir3_find_sysval_regid(vs, SYSTEM_VALUE_INSTANCE_ID);
   const uint32_t tess_coord_x_regid = hs ?
         ir3_find_sysval_regid(ds, SYSTEM_VALUE_TESS_COORD) :
         regid(63, 0);
   const uint32_t tess_coord_y_regid = VALIDREG(tess_coord_x_regid) ?
         tess_coord_x_regid + 1 :
         regid(63, 0);
   const uint32_t hs_rel_patch_regid = hs ?
         ir3_find_sysval_regid(hs, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
         regid(63, 0);
   const uint32_t ds_rel_patch_regid = hs ?
         ir3_find_sysval_regid(ds, SYSTEM_VALUE_REL_PATCH_ID_IR3) :
         regid(63, 0);
   const uint32_t hs_invocation_regid = hs ?
         ir3_find_sysval_regid(hs, SYSTEM_VALUE_TCS_HEADER_IR3) :
         regid(63, 0);
   const uint32_t gs_primitiveid_regid = gs ?
         ir3_find_sysval_regid(gs, SYSTEM_VALUE_PRIMITIVE_ID) :
         regid(63, 0);
   const uint32_t vs_primitiveid_regid = hs ?
         ir3_find_sysval_regid(hs, SYSTEM_VALUE_PRIMITIVE_ID) :
         gs_primitiveid_regid;
   const uint32_t ds_primitiveid_regid = ds ?
         ir3_find_sysval_regid(ds, SYSTEM_VALUE_PRIMITIVE_ID) :
         regid(63, 0);
   const uint32_t gsheader_regid = gs ?
         ir3_find_sysval_regid(gs, SYSTEM_VALUE_GS_HEADER_IR3) :
         regid(63, 0);

   /* Note: we currently don't support multiview with tess or GS. If we did,
    * and the HW actually works, then we'd have to somehow share this across
    * stages. Note that the blob doesn't support this either.
    */
   const uint32_t viewid_regid =
      ir3_find_sysval_regid(vs, SYSTEM_VALUE_VIEW_INDEX);

   tu_cs_emit_pkt4(cs, REG_A6XX_VFD_CONTROL_1, 6);
   tu_cs_emit(cs, A6XX_VFD_CONTROL_1_REGID4VTX(vertexid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4INST(instanceid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4PRIMID(vs_primitiveid_regid) |
                  A6XX_VFD_CONTROL_1_REGID4VIEWID(viewid_regid));
   tu_cs_emit(cs, A6XX_VFD_CONTROL_2_REGID_HSRELPATCHID(hs_rel_patch_regid) |
                  A6XX_VFD_CONTROL_2_REGID_INVOCATIONID(hs_invocation_regid));
   tu_cs_emit(cs, A6XX_VFD_CONTROL_3_REGID_DSRELPATCHID(ds_rel_patch_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSX(tess_coord_x_regid) |
                  A6XX_VFD_CONTROL_3_REGID_TESSY(tess_coord_y_regid) |
                  A6XX_VFD_CONTROL_3_REGID_DSPRIMID(ds_primitiveid_regid));
   tu_cs_emit(cs, 0x000000fc); /* VFD_CONTROL_4 */
   tu_cs_emit(cs, A6XX_VFD_CONTROL_5_REGID_GSHEADER(gsheader_regid) |
                  0xfc00); /* VFD_CONTROL_5 */
   tu_cs_emit(cs, COND(primid_passthru, A6XX_VFD_CONTROL_6_PRIMID_PASSTHRU)); /* VFD_CONTROL_6 */
}

static void
tu6_setup_streamout(struct tu_cs *cs,
                    const struct ir3_shader_variant *v,
                    struct ir3_shader_linkage *l)
{
   const struct ir3_stream_output_info *info = &v->stream_output;
   /* Note: 64 here comes from the HW layout of the program RAM. The program
    * for stream N is at DWORD 64 * N.
    */
#define A6XX_SO_PROG_DWORDS 64
   uint32_t prog[A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS] = {};
   BITSET_DECLARE(valid_dwords, A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) = {0};

   /* TODO: streamout state should be in a non-GMEM draw state */

   /* no streamout: */
   if (info->num_outputs == 0) {
      tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 4);
      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
      tu_cs_emit(cs, 0);
      tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
      tu_cs_emit(cs, 0);
      return;
   }

   for (unsigned i = 0; i < info->num_outputs; i++) {
      const struct ir3_stream_output *out = &info->output[i];
      unsigned k = out->register_index;
      unsigned idx;

      /* Skip it if it's an output that was never assigned a register. */
      if (k >= v->outputs_count || v->outputs[k].regid == INVALID_REG)
         continue;

      /* The linkage map is sorted in the order the fragment shader wants
       * things, so this linear lookup is a bit less than ideal here..
       */
      for (idx = 0; idx < l->cnt; idx++)
         if (l->var[idx].slot == v->outputs[k].slot)
            break;

      assert(idx < l->cnt);

      for (unsigned j = 0; j < out->num_components; j++) {
         unsigned c   = j + out->start_component;
         unsigned loc = l->var[idx].loc + c;
         unsigned off = j + out->dst_offset;  /* in dwords */

         assert(loc < A6XX_SO_PROG_DWORDS * 2);
         unsigned dword = out->stream * A6XX_SO_PROG_DWORDS + loc/2;
         if (loc & 1) {
            prog[dword] |= A6XX_VPC_SO_PROG_B_EN |
                           A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_B_OFF(off * 4);
         } else {
            prog[dword] |= A6XX_VPC_SO_PROG_A_EN |
                           A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
                           A6XX_VPC_SO_PROG_A_OFF(off * 4);
         }
         BITSET_SET(valid_dwords, dword);
      }
   }

   unsigned prog_count = 0;
   unsigned start, end;
   BITSET_FOREACH_RANGE(start, end, valid_dwords,
                        A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      prog_count += end - start + 1;
   }

   tu_cs_emit_pkt7(cs, CP_CONTEXT_REG_BUNCH, 10 + 2 * prog_count);
   tu_cs_emit(cs, REG_A6XX_VPC_SO_STREAM_CNTL);
   tu_cs_emit(cs, A6XX_VPC_SO_STREAM_CNTL_STREAM_ENABLE(info->streams_written) |
                  COND(info->stride[0] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF0_STREAM(1 + info->buffer_to_stream[0])) |
                  COND(info->stride[1] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF1_STREAM(1 + info->buffer_to_stream[1])) |
                  COND(info->stride[2] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF2_STREAM(1 + info->buffer_to_stream[2])) |
                  COND(info->stride[3] > 0,
                       A6XX_VPC_SO_STREAM_CNTL_BUF3_STREAM(1 + info->buffer_to_stream[3])));
   for (uint32_t i = 0; i < 4; i++) {
      tu_cs_emit(cs, REG_A6XX_VPC_SO_BUFFER_STRIDE(i));
      tu_cs_emit(cs, info->stride[i]);
   }
   bool first = true;
   BITSET_FOREACH_RANGE(start, end, valid_dwords,
                        A6XX_SO_PROG_DWORDS * IR3_MAX_SO_STREAMS) {
      tu_cs_emit(cs, REG_A6XX_VPC_SO_CNTL);
      tu_cs_emit(cs, COND(first, A6XX_VPC_SO_CNTL_RESET) |
                     A6XX_VPC_SO_CNTL_ADDR(start));
      for (unsigned i = start; i < end; i++) {
         tu_cs_emit(cs, REG_A6XX_VPC_SO_PROG);
         tu_cs_emit(cs, prog[i]);
      }
      first = false;
   }
}

static void
tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
               enum a6xx_state_block block, uint32_t offset,
               uint32_t size, const uint32_t *dwords)
{
   assert(size % 4 == 0);

   tu_cs_emit_pkt7(cs, opcode, 3 + size);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
         CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
         CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
         CP_LOAD_STATE6_0_STATE_BLOCK(block) |
         CP_LOAD_STATE6_0_NUM_UNIT(size / 4));

   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   dwords = (uint32_t *)&((uint8_t *)dwords)[offset];

   tu_cs_emit_array(cs, dwords, size);
}

static void
tu6_emit_link_map(struct tu_cs *cs,
                  const struct ir3_shader_variant *producer,
                  const struct ir3_shader_variant *consumer,
                  enum a6xx_state_block sb)
{
   const struct ir3_const_state *const_state = ir3_const_state(consumer);
   uint32_t base = const_state->offsets.primitive_map;
   int size = DIV_ROUND_UP(consumer->input_size, 4);

   size = (MIN2(size + base, consumer->constlen) - base) * 4;
   if (size <= 0)
      return;

   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
                  producer->output_loc);
}

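/* The HW describes the GS output primitive type with the same enum it uses
 * for the tessellator output, so map gl-style primitives onto it here.
 */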
static uint16_t
primitive_to_tess(enum shader_prim primitive) {
   switch (primitive) {
   case SHADER_PRIM_POINTS:
      return TESS_POINTS;
   case SHADER_PRIM_LINE_STRIP:
      return TESS_LINES;
   case SHADER_PRIM_TRIANGLE_STRIP:
      return TESS_CW_TRIS;
   default:
      unreachable("");
   }
}

void
tu6_emit_vpc(struct tu_cs *cs,
             const struct ir3_shader_variant *vs,
             const struct ir3_shader_variant *hs,
             const struct ir3_shader_variant *ds,
             const struct ir3_shader_variant *gs,
             const struct ir3_shader_variant *fs,
             uint32_t patch_control_points)
{
   /* note: doesn't compile as static because of the array regs.. */
   const struct reg_config {
      uint16_t reg_sp_xs_out_reg;
      uint16_t reg_sp_xs_vpc_dst_reg;
      uint16_t reg_vpc_xs_pack;
      uint16_t reg_vpc_xs_clip_cntl;
      uint16_t reg_gras_xs_cl_cntl;
      uint16_t reg_pc_xs_out_cntl;
      uint16_t reg_sp_xs_primitive_cntl;
      uint16_t reg_vpc_xs_layer_cntl;
      uint16_t reg_gras_xs_layer_cntl;
   } reg_config[] = {
      [MESA_SHADER_VERTEX] = {
         REG_A6XX_SP_VS_OUT_REG(0),
         REG_A6XX_SP_VS_VPC_DST_REG(0),
         REG_A6XX_VPC_VS_PACK,
         REG_A6XX_VPC_VS_CLIP_CNTL,
         REG_A6XX_GRAS_VS_CL_CNTL,
         REG_A6XX_PC_VS_OUT_CNTL,
         REG_A6XX_SP_VS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_VS_LAYER_CNTL,
         REG_A6XX_GRAS_VS_LAYER_CNTL
      },
      [MESA_SHADER_TESS_CTRL] = {
         0,
         0,
         0,
         0,
         0,
         REG_A6XX_PC_HS_OUT_CNTL,
         0,
         0,
         0
      },
      [MESA_SHADER_TESS_EVAL] = {
         REG_A6XX_SP_DS_OUT_REG(0),
         REG_A6XX_SP_DS_VPC_DST_REG(0),
         REG_A6XX_VPC_DS_PACK,
         REG_A6XX_VPC_DS_CLIP_CNTL,
         REG_A6XX_GRAS_DS_CL_CNTL,
         REG_A6XX_PC_DS_OUT_CNTL,
         REG_A6XX_SP_DS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_DS_LAYER_CNTL,
         REG_A6XX_GRAS_DS_LAYER_CNTL
      },
      [MESA_SHADER_GEOMETRY] = {
         REG_A6XX_SP_GS_OUT_REG(0),
         REG_A6XX_SP_GS_VPC_DST_REG(0),
         REG_A6XX_VPC_GS_PACK,
         REG_A6XX_VPC_GS_CLIP_CNTL,
         REG_A6XX_GRAS_GS_CL_CNTL,
         REG_A6XX_PC_GS_OUT_CNTL,
         REG_A6XX_SP_GS_PRIMITIVE_CNTL,
         REG_A6XX_VPC_GS_LAYER_CNTL,
         REG_A6XX_GRAS_GS_LAYER_CNTL
      },
   };

   const struct ir3_shader_variant *last_shader;
   if (gs) {
      last_shader = gs;
   } else if (hs) {
      last_shader = ds;
   } else {
      last_shader = vs;
   }

   const struct reg_config *cfg = &reg_config[last_shader->type];

   struct ir3_shader_linkage linkage = {
      .primid_loc = 0xff,
      .clip0_loc = 0xff,
      .clip1_loc = 0xff,
   };
   if (fs)
      ir3_link_shaders(&linkage, last_shader, fs, true);

   if (last_shader->stream_output.num_outputs)
      ir3_link_stream_out(&linkage, last_shader);

   /* We do this after linking shaders in order to know whether PrimID
    * passthrough needs to be enabled.
    */
   bool primid_passthru = linkage.primid_loc != 0xff;
   tu6_emit_vs_system_values(cs, vs, hs, ds, gs, primid_passthru);

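   /* VPC_VAR_DISABLE is a per-component disable mask, so turn off every
    * varying component the linkage doesn't use.
    */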
   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VAR_DISABLE(0), 4);
   tu_cs_emit(cs, ~linkage.varmask[0]);
   tu_cs_emit(cs, ~linkage.varmask[1]);
   tu_cs_emit(cs, ~linkage.varmask[2]);
   tu_cs_emit(cs, ~linkage.varmask[3]);

   /* a6xx finds position/pointsize at the end */
   const uint32_t pointsize_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_PSIZ);
   const uint32_t layer_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_LAYER);
   const uint32_t view_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_VIEWPORT);
   const uint32_t clip0_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST0);
   const uint32_t clip1_regid =
      ir3_find_output_regid(last_shader, VARYING_SLOT_CLIP_DIST1);
   uint32_t flags_regid = gs ?
      ir3_find_output_regid(gs, VARYING_SLOT_GS_VERTEX_FLAGS_IR3) : 0;

   uint32_t pointsize_loc = 0xff, position_loc = 0xff, layer_loc = 0xff, view_loc = 0xff;

   if (layer_regid != regid(63, 0)) {
      layer_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_LAYER, layer_regid, 0x1, linkage.max_loc);
   }

   if (view_regid != regid(63, 0)) {
      view_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_VIEWPORT, view_regid, 0x1, linkage.max_loc);
   }

   unsigned extra_pos = 0;

   for (unsigned i = 0; i < last_shader->outputs_count; i++) {
      if (last_shader->outputs[i].slot != VARYING_SLOT_POS)
         continue;

      if (position_loc == 0xff)
         position_loc = linkage.max_loc;

      ir3_link_add(&linkage, last_shader->outputs[i].slot,
                   last_shader->outputs[i].regid,
                   0xf, position_loc + 4 * last_shader->outputs[i].view);
      extra_pos = MAX2(extra_pos, last_shader->outputs[i].view);
   }

   if (pointsize_regid != regid(63, 0)) {
      pointsize_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_PSIZ, pointsize_regid, 0x1, linkage.max_loc);
   }

   uint8_t clip_cull_mask = last_shader->clip_mask | last_shader->cull_mask;

   /* Handle the case where clip/cull distances aren't read by the FS */
   uint32_t clip0_loc = linkage.clip0_loc, clip1_loc = linkage.clip1_loc;
   if (clip0_loc == 0xff && clip0_regid != regid(63, 0)) {
      clip0_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST0, clip0_regid,
                   clip_cull_mask & 0xf, linkage.max_loc);
   }
   if (clip1_loc == 0xff && clip1_regid != regid(63, 0)) {
      clip1_loc = linkage.max_loc;
      ir3_link_add(&linkage, VARYING_SLOT_CLIP_DIST1, clip1_regid,
                   clip_cull_mask >> 4, linkage.max_loc);
   }

   tu6_setup_streamout(cs, last_shader, &linkage);

   /* The GPU hangs on some models when there are no outputs (xs_pack::CNT),
    * at least when a DS is the last stage, so add a dummy output to keep it
    * happy if there aren't any. We do this late in order to avoid emitting
    * any unused code and make sure that optimizations don't remove it.
    */
   if (linkage.cnt == 0)
      ir3_link_add(&linkage, 0, 0, 0x1, linkage.max_loc);

   /* map outputs of the last shader to VPC */
   assert(linkage.cnt <= 32);
   const uint32_t sp_out_count = DIV_ROUND_UP(linkage.cnt, 2);
   const uint32_t sp_vpc_dst_count = DIV_ROUND_UP(linkage.cnt, 4);
   uint32_t sp_out[16] = {0};
   uint32_t sp_vpc_dst[8] = {0};
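   /* Each SP_xS_OUT_REG dword packs two 16-bit output records and each
    * SP_xS_VPC_DST_REG dword packs four 8-bit locations, hence the
    * uint16_t/uint8_t views of the arrays below.
    */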
   for (uint32_t i = 0; i < linkage.cnt; i++) {
      ((uint16_t *) sp_out)[i] =
         A6XX_SP_VS_OUT_REG_A_REGID(linkage.var[i].regid) |
         A6XX_SP_VS_OUT_REG_A_COMPMASK(linkage.var[i].compmask);
      ((uint8_t *) sp_vpc_dst)[i] =
         A6XX_SP_VS_VPC_DST_REG_OUTLOC0(linkage.var[i].loc);
   }

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_out_reg, sp_out_count);
   tu_cs_emit_array(cs, sp_out, sp_out_count);

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_vpc_dst_reg, sp_vpc_dst_count);
   tu_cs_emit_array(cs, sp_vpc_dst, sp_vpc_dst_count);

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_pack, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_PACK_POSITIONLOC(position_loc) |
                  A6XX_VPC_VS_PACK_PSIZELOC(pointsize_loc) |
                  A6XX_VPC_VS_PACK_STRIDE_IN_VPC(linkage.max_loc) |
                  A6XX_VPC_VS_PACK_EXTRAPOS(extra_pos));

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_clip_cntl, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_CLIP_CNTL_CLIP_MASK(clip_cull_mask) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_03_LOC(clip0_loc) |
                  A6XX_VPC_VS_CLIP_CNTL_CLIP_DIST_47_LOC(clip1_loc));

   tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_cl_cntl, 1);
   tu_cs_emit(cs, A6XX_GRAS_VS_CL_CNTL_CLIP_MASK(last_shader->clip_mask) |
                  A6XX_GRAS_VS_CL_CNTL_CULL_MASK(last_shader->cull_mask));

   const struct ir3_shader_variant *geom_shaders[] = { vs, hs, ds, gs };

   for (unsigned i = 0; i < ARRAY_SIZE(geom_shaders); i++) {
      const struct ir3_shader_variant *shader = geom_shaders[i];
      if (!shader)
         continue;

      bool primid = shader->type != MESA_SHADER_VERTEX &&
         VALIDREG(ir3_find_sysval_regid(shader, SYSTEM_VALUE_PRIMITIVE_ID));

      tu_cs_emit_pkt4(cs, reg_config[shader->type].reg_pc_xs_out_cntl, 1);
      if (shader == last_shader) {
         tu_cs_emit(cs, A6XX_PC_VS_OUT_CNTL_STRIDE_IN_VPC(linkage.max_loc) |
                        CONDREG(pointsize_regid, A6XX_PC_VS_OUT_CNTL_PSIZE) |
                        CONDREG(layer_regid, A6XX_PC_VS_OUT_CNTL_LAYER) |
                        CONDREG(view_regid, A6XX_PC_VS_OUT_CNTL_VIEW) |
                        COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID) |
                        A6XX_PC_VS_OUT_CNTL_CLIP_MASK(clip_cull_mask));
      } else {
         tu_cs_emit(cs, COND(primid, A6XX_PC_VS_OUT_CNTL_PRIMITIVE_ID));
      }
   }

   /* if vertex_flags somehow gets optimized out, you're going to have a bad time: */
   if (gs)
      assert(flags_regid != INVALID_REG);

   tu_cs_emit_pkt4(cs, cfg->reg_sp_xs_primitive_cntl, 1);
   tu_cs_emit(cs, A6XX_SP_VS_PRIMITIVE_CNTL_OUT(linkage.cnt) |
                  A6XX_SP_GS_PRIMITIVE_CNTL_FLAGS_REGID(flags_regid));

   tu_cs_emit_pkt4(cs, cfg->reg_vpc_xs_layer_cntl, 1);
   tu_cs_emit(cs, A6XX_VPC_VS_LAYER_CNTL_LAYERLOC(layer_loc) |
                  A6XX_VPC_VS_LAYER_CNTL_VIEWLOC(view_loc));

   tu_cs_emit_pkt4(cs, cfg->reg_gras_xs_layer_cntl, 1);
   tu_cs_emit(cs, CONDREG(layer_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_LAYER) |
                  CONDREG(view_regid, A6XX_GRAS_GS_LAYER_CNTL_WRITES_VIEW));

   tu_cs_emit_regs(cs, A6XX_PC_PRIMID_PASSTHRU(primid_passthru));

   tu_cs_emit_pkt4(cs, REG_A6XX_VPC_CNTL_0, 1);
   tu_cs_emit(cs, A6XX_VPC_CNTL_0_NUMNONPOSVAR(fs ? fs->total_in : 0) |
                  COND(fs && fs->total_in, A6XX_VPC_CNTL_0_VARYING) |
                  A6XX_VPC_CNTL_0_PRIMIDLOC(linkage.primid_loc) |
                  A6XX_VPC_CNTL_0_VIEWIDLOC(linkage.viewid_loc));

   if (hs) {
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_NUM_VERTEX, 1);
      tu_cs_emit(cs, hs->tess.tcs_vertices_out);

      /* Total attribute slots in HS incoming patch. */
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_HS_INPUT_SIZE, 1);
      tu_cs_emit(cs, patch_control_points * vs->output_size / 4);

      const uint32_t wavesize = 64;
      const uint32_t max_wave_input_size = 64;

      /* note: if HS is really just the VS extended, then this
       * should be by MAX2(patch_control_points, hs->tess.tcs_vertices_out)
       * however that doesn't match the blob, and fails some dEQP tests.
       */
      uint32_t prims_per_wave = wavesize / hs->tess.tcs_vertices_out;
      uint32_t max_prims_per_wave =
         max_wave_input_size * wavesize / (vs->output_size * patch_control_points);
      prims_per_wave = MIN2(prims_per_wave, max_prims_per_wave);

      uint32_t total_size = vs->output_size * patch_control_points * prims_per_wave;
      uint32_t wave_input_size = DIV_ROUND_UP(total_size, wavesize);
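      /* For example, tcs_vertices_out == 4 allows 16 patches per wave; with
       * vs->output_size == 8 and 3 control points that is 8 * 3 * 16 = 384,
       * giving DIV_ROUND_UP(384, 64) == 6 as the wave input size.
       */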

      tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
      tu_cs_emit(cs, wave_input_size);

      /* In SPIR-V generated from GLSL, the tessellation primitive params are
       * specified in the tess eval shader, but in SPIR-V generated from
       * HLSL, they are specified in the tess control shader. */
      const struct ir3_shader_variant *tess =
         ds->tess.spacing == TESS_SPACING_UNSPECIFIED ? hs : ds;
      tu_cs_emit_pkt4(cs, REG_A6XX_PC_TESS_CNTL, 1);
      uint32_t output;
      if (tess->tess.point_mode)
         output = TESS_POINTS;
      else if (tess->tess.primitive_mode == TESS_PRIMITIVE_ISOLINES)
         output = TESS_LINES;
      else if (tess->tess.ccw)
         output = TESS_CCW_TRIS;
      else
         output = TESS_CW_TRIS;

      enum a6xx_tess_spacing spacing;
      switch (tess->tess.spacing) {
      case TESS_SPACING_EQUAL:
         spacing = TESS_EQUAL;
         break;
      case TESS_SPACING_FRACTIONAL_ODD:
         spacing = TESS_FRACTIONAL_ODD;
         break;
      case TESS_SPACING_FRACTIONAL_EVEN:
         spacing = TESS_FRACTIONAL_EVEN;
         break;
      case TESS_SPACING_UNSPECIFIED:
      default:
         unreachable("invalid tess spacing");
      }
      tu_cs_emit(cs, A6XX_PC_TESS_CNTL_SPACING(spacing) |
            A6XX_PC_TESS_CNTL_OUTPUT(output));

      tu6_emit_link_map(cs, vs, hs, SB6_HS_SHADER);
      tu6_emit_link_map(cs, hs, ds, SB6_DS_SHADER);
   }

   if (gs) {
      uint32_t vertices_out, invocations, output, vec4_size;
      uint32_t prev_stage_output_size = ds ? ds->output_size : vs->output_size;

      if (hs) {
         tu6_emit_link_map(cs, ds, gs, SB6_GS_SHADER);
      } else {
         tu6_emit_link_map(cs, vs, gs, SB6_GS_SHADER);
      }
      vertices_out = gs->gs.vertices_out - 1;
      output = primitive_to_tess(gs->gs.output_primitive);
      invocations = gs->gs.invocations - 1;
      /* Size of the per-primitive allocation in ldlw memory, in vec4s. */
      vec4_size = gs->gs.vertices_in *
                  DIV_ROUND_UP(prev_stage_output_size, 4);

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_5, 1);
      tu_cs_emit(cs,
            A6XX_PC_PRIMITIVE_CNTL_5_GS_VERTICES_OUT(vertices_out) |
            A6XX_PC_PRIMITIVE_CNTL_5_GS_OUTPUT(output) |
            A6XX_PC_PRIMITIVE_CNTL_5_GS_INVOCATIONS(invocations));

      tu_cs_emit_pkt4(cs, REG_A6XX_VPC_GS_PARAM, 1);
      tu_cs_emit(cs, 0xff);

      tu_cs_emit_pkt4(cs, REG_A6XX_PC_PRIMITIVE_CNTL_6, 1);
      tu_cs_emit(cs, A6XX_PC_PRIMITIVE_CNTL_6_STRIDE_IN_VPC(vec4_size));

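      /* The precise encoding of SP_GS_PRIM_SIZE isn't fully understood:
       * sizes above 64 are clamped, and 64 itself is written as 63,
       * presumably an inclusive-maximum quirk of the field.
       */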
      uint32_t prim_size = prev_stage_output_size;
      if (prim_size > 64)
         prim_size = 64;
      else if (prim_size == 64)
         prim_size = 63;
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_GS_PRIM_SIZE, 1);
      tu_cs_emit(cs, prim_size);
   }
}

1298 static int
tu6_vpc_varying_mode(const struct ir3_shader_variant * fs,uint32_t index,uint8_t * interp_mode,uint8_t * ps_repl_mode)1299 tu6_vpc_varying_mode(const struct ir3_shader_variant *fs,
1300                      uint32_t index,
1301                      uint8_t *interp_mode,
1302                      uint8_t *ps_repl_mode)
1303 {
1304    enum
1305    {
1306       INTERP_SMOOTH = 0,
1307       INTERP_FLAT = 1,
1308       INTERP_ZERO = 2,
1309       INTERP_ONE = 3,
1310    };
1311    enum
1312    {
1313       PS_REPL_NONE = 0,
1314       PS_REPL_S = 1,
1315       PS_REPL_T = 2,
1316       PS_REPL_ONE_MINUS_T = 3,
1317    };
1318 
1319    const uint32_t compmask = fs->inputs[index].compmask;
1320 
1321    /* NOTE: varyings are packed, so if compmask is 0xb then the first,
1322     * second, and fourth components occupy three consecutive varying slots
1323     */
1324    int shift = 0;
1325    *interp_mode = 0;
1326    *ps_repl_mode = 0;
1327    if (fs->inputs[index].slot == VARYING_SLOT_PNTC) {
1328       if (compmask & 0x1) {
1329          *ps_repl_mode |= PS_REPL_S << shift;
1330          shift += 2;
1331       }
1332       if (compmask & 0x2) {
1333          *ps_repl_mode |= PS_REPL_T << shift;
1334          shift += 2;
1335       }
1336       if (compmask & 0x4) {
1337          *interp_mode |= INTERP_ZERO << shift;
1338          shift += 2;
1339       }
1340       if (compmask & 0x8) {
1341          *interp_mode |= INTERP_ONE << shift;
1342          shift += 2;
1343       }
1344    } else if (fs->inputs[index].flat) {
1345       for (int i = 0; i < 4; i++) {
1346          if (compmask & (1 << i)) {
1347             *interp_mode |= INTERP_FLAT << shift;
1348             shift += 2;
1349          }
1350       }
1351    }
1352 
1353    return shift;
1354 }
1355 
1356 static void
1357 tu6_emit_vpc_varying_modes(struct tu_cs *cs,
1358                            const struct ir3_shader_variant *fs)
1359 {
1360    uint32_t interp_modes[8] = { 0 };
1361    uint32_t ps_repl_modes[8] = { 0 };
1362 
1363    if (fs) {
1364       for (int i = -1;
1365            (i = ir3_next_varying(fs, i)) < (int) fs->inputs_count;) {
1366 
1367          /* get the mode for input i */
1368          uint8_t interp_mode;
1369          uint8_t ps_repl_mode;
1370          const int bits =
1371             tu6_vpc_varying_mode(fs, i, &interp_mode, &ps_repl_mode);
1372 
1373          /* OR the mode into the array */
1374          const uint32_t inloc = fs->inputs[i].inloc * 2;
1375          uint32_t n = inloc / 32;
1376          uint32_t shift = inloc % 32;
1377          interp_modes[n] |= interp_mode << shift;
1378          ps_repl_modes[n] |= ps_repl_mode << shift;
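         /* If the 2-bit fields straddle a 32-bit boundary, carry the high
          * bits into the next dword: e.g. an input at inloc 15 starts at bit
          * 30, so anything wider than 2 bits spills into the next word.
          */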
1379          if (shift + bits > 32) {
1380             n++;
1381             shift = 32 - shift;
1382 
1383             interp_modes[n] |= interp_mode >> shift;
1384             ps_repl_modes[n] |= ps_repl_mode >> shift;
1385          }
1386       }
1387    }
1388 
1389    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
1390    tu_cs_emit_array(cs, interp_modes, 8);
1391 
1392    tu_cs_emit_pkt4(cs, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
1393    tu_cs_emit_array(cs, ps_repl_modes, 8);
1394 }
1395 
1396 void
1397 tu6_emit_fs_inputs(struct tu_cs *cs, const struct ir3_shader_variant *fs)
1398 {
1399    uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
1400    uint32_t ij_regid[IJ_COUNT];
1401    uint32_t smask_in_regid;
1402 
1403    bool sample_shading = fs->per_samp | fs->key.sample_shading;
1404    bool enable_varyings = fs->total_in > 0;
1405 
1406    samp_id_regid   = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_ID);
1407    smask_in_regid  = ir3_find_sysval_regid(fs, SYSTEM_VALUE_SAMPLE_MASK_IN);
1408    face_regid      = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRONT_FACE);
1409    coord_regid     = ir3_find_sysval_regid(fs, SYSTEM_VALUE_FRAG_COORD);
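   /* gl_FragCoord is allocated as a vec4, with z/w in the two scalar regs
    * after x/y, hence coord_regid + 2; regid(63, 0) marks it unused.
    */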
1410    zwcoord_regid   = VALIDREG(coord_regid) ? coord_regid + 2 : regid(63, 0);
1411    for (unsigned i = 0; i < ARRAY_SIZE(ij_regid); i++)
1412       ij_regid[i] = ir3_find_sysval_regid(fs, SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL + i);
1413 
1414    if (fs->num_sampler_prefetch > 0) {
1415       assert(VALIDREG(ij_regid[IJ_PERSP_PIXEL]));
1416       /* also, it seems like ij_pix is *required* to be r0.x */
1417       assert(ij_regid[IJ_PERSP_PIXEL] == regid(0, 0));
1418    }
1419 
1420    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_PREFETCH_CNTL, 1 + fs->num_sampler_prefetch);
1421    tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CNTL_COUNT(fs->num_sampler_prefetch) |
1422          A6XX_SP_FS_PREFETCH_CNTL_UNK4(regid(63, 0)) |
1423          0x7000);    /* XXX: unknown bits */
1424    for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1425       const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1426       tu_cs_emit(cs, A6XX_SP_FS_PREFETCH_CMD_SRC(prefetch->src) |
1427                      A6XX_SP_FS_PREFETCH_CMD_SAMP_ID(prefetch->samp_id) |
1428                      A6XX_SP_FS_PREFETCH_CMD_TEX_ID(prefetch->tex_id) |
1429                      A6XX_SP_FS_PREFETCH_CMD_DST(prefetch->dst) |
1430                      A6XX_SP_FS_PREFETCH_CMD_WRMASK(prefetch->wrmask) |
1431                      COND(prefetch->half_precision, A6XX_SP_FS_PREFETCH_CMD_HALF) |
1432                      A6XX_SP_FS_PREFETCH_CMD_CMD(prefetch->cmd));
1433    }
1434 
1435    if (fs->num_sampler_prefetch > 0) {
1436       tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_BINDLESS_PREFETCH_CMD(0), fs->num_sampler_prefetch);
1437       for (int i = 0; i < fs->num_sampler_prefetch; i++) {
1438          const struct ir3_sampler_prefetch *prefetch = &fs->sampler_prefetch[i];
1439          tu_cs_emit(cs,
1440                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_SAMP_ID(prefetch->samp_bindless_id) |
1441                     A6XX_SP_FS_BINDLESS_PREFETCH_CMD_TEX_ID(prefetch->tex_bindless_id));
1442       }
1443    }
1444 
1445    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
1446    tu_cs_emit(cs, 0x7);
1447    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
1448                   A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
1449                   A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
1450                   A6XX_HLSQ_CONTROL_2_REG_CENTERRHW(ij_regid[IJ_PERSP_CENTER_RHW]));
1451    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_PIXEL(ij_regid[IJ_PERSP_PIXEL]) |
1452                   A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_PIXEL(ij_regid[IJ_LINEAR_PIXEL]) |
1453                   A6XX_HLSQ_CONTROL_3_REG_IJ_PERSP_CENTROID(ij_regid[IJ_PERSP_CENTROID]) |
1454                   A6XX_HLSQ_CONTROL_3_REG_IJ_LINEAR_CENTROID(ij_regid[IJ_LINEAR_CENTROID]));
1455    tu_cs_emit(cs, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
1456                   A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
1457                   A6XX_HLSQ_CONTROL_4_REG_IJ_PERSP_SAMPLE(ij_regid[IJ_PERSP_SAMPLE]) |
1458                   A6XX_HLSQ_CONTROL_4_REG_IJ_LINEAR_SAMPLE(ij_regid[IJ_LINEAR_SAMPLE]));
1459    tu_cs_emit(cs, 0xfcfc);
1460 
1461    enum a6xx_threadsize thrsz = fs->info.double_threadsize ? THREAD128 : THREAD64;
1462    tu_cs_emit_pkt4(cs, REG_A6XX_HLSQ_FS_CNTL_0, 1);
1463    tu_cs_emit(cs, A6XX_HLSQ_FS_CNTL_0_THREADSIZE(thrsz) |
1464                   COND(enable_varyings, A6XX_HLSQ_FS_CNTL_0_VARYINGS));
1465 
1466    bool need_size = fs->frag_face || fs->fragcoord_compmask != 0;
1467    bool need_size_persamp = false;
1468    if (VALIDREG(ij_regid[IJ_PERSP_CENTER_RHW])) {
1469       if (sample_shading)
1470          need_size_persamp = true;
1471       else
1472          need_size = true;
1473    }
1474 
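   /* It appears that reading gl_FragCoord xy, faceness, or the center-RHW
    * barycentric also requires requesting the "size" (linear pixel/sample)
    * varying in GRAS_CNTL / RB_RENDER_CONTROL0 below, even when the shader
    * doesn't consume it; need_size and need_size_persamp track this.
    */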
1475    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CNTL, 1);
1476    tu_cs_emit(cs,
1477          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_GRAS_CNTL_IJ_PERSP_PIXEL) |
1478          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_GRAS_CNTL_IJ_PERSP_CENTROID) |
1479          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_GRAS_CNTL_IJ_PERSP_SAMPLE) |
1480          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1481          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_GRAS_CNTL_IJ_LINEAR_CENTROID) |
1482          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1483          COND(need_size, A6XX_GRAS_CNTL_IJ_LINEAR_PIXEL) |
1484          COND(need_size_persamp, A6XX_GRAS_CNTL_IJ_LINEAR_SAMPLE) |
1485          COND(fs->fragcoord_compmask != 0, A6XX_GRAS_CNTL_COORD_MASK(fs->fragcoord_compmask)));
1486 
1487    tu_cs_emit_pkt4(cs, REG_A6XX_RB_RENDER_CONTROL0, 2);
1488    tu_cs_emit(cs,
1489          CONDREG(ij_regid[IJ_PERSP_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_PIXEL) |
1490          CONDREG(ij_regid[IJ_PERSP_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_CENTROID) |
1491          CONDREG(ij_regid[IJ_PERSP_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_PERSP_SAMPLE) |
1492          CONDREG(ij_regid[IJ_LINEAR_PIXEL], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1493          CONDREG(ij_regid[IJ_LINEAR_CENTROID], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_CENTROID) |
1494          CONDREG(ij_regid[IJ_LINEAR_SAMPLE], A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1495          COND(need_size, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_PIXEL) |
1496          COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
1497          COND(need_size_persamp, A6XX_RB_RENDER_CONTROL0_IJ_LINEAR_SAMPLE) |
1498          COND(fs->fragcoord_compmask != 0,
1499                            A6XX_RB_RENDER_CONTROL0_COORD_MASK(fs->fragcoord_compmask)));
1500    tu_cs_emit(cs,
1501          A6XX_RB_RENDER_CONTROL1_FRAGCOORDSAMPLEMODE(
1502             sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER) |
1503          CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
1504          CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
1505          CONDREG(ij_regid[IJ_PERSP_CENTER_RHW], A6XX_RB_RENDER_CONTROL1_CENTERRHW) |
1506          COND(fs->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));
1507 
1508    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CNTL, 1);
1509    tu_cs_emit(cs, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));
1510 
1511    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_LRZ_PS_INPUT_CNTL, 1);
1512    tu_cs_emit(cs, CONDREG(samp_id_regid, A6XX_GRAS_LRZ_PS_INPUT_CNTL_SAMPLEID) |
1513               A6XX_GRAS_LRZ_PS_INPUT_CNTL_FRAGCOORDSAMPLEMODE(
1514                  sample_shading ? FRAGCOORD_SAMPLE : FRAGCOORD_CENTER));
1515 
1516    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
1517    tu_cs_emit(cs, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));
1518 }
1519 
1520 static void
1521 tu6_emit_fs_outputs(struct tu_cs *cs,
1522                     const struct ir3_shader_variant *fs,
1523                     uint32_t mrt_count, bool dual_src_blend,
1524                     uint32_t render_components,
1525                     bool no_earlyz,
1526                     struct tu_pipeline *pipeline)
1527 {
1528    uint32_t smask_regid, posz_regid, stencilref_regid;
1529 
1530    posz_regid      = ir3_find_output_regid(fs, FRAG_RESULT_DEPTH);
1531    smask_regid     = ir3_find_output_regid(fs, FRAG_RESULT_SAMPLE_MASK);
1532    stencilref_regid = ir3_find_output_regid(fs, FRAG_RESULT_STENCIL);
1533 
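   /* color0_mrt means the shader writes a single gl_FragColor-style output
    * that is broadcast to every render target, so replicate its register id
    * across all MRT slots.
    */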
1534    uint32_t fragdata_regid[8];
1535    if (fs->color0_mrt) {
1536       fragdata_regid[0] = ir3_find_output_regid(fs, FRAG_RESULT_COLOR);
1537       for (uint32_t i = 1; i < ARRAY_SIZE(fragdata_regid); i++)
1538          fragdata_regid[i] = fragdata_regid[0];
1539    } else {
1540       for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++)
1541          fragdata_regid[i] = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + i);
1542    }
1543 
1544    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
1545    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
1546                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
1547                   A6XX_SP_FS_OUTPUT_CNTL0_STENCILREF_REGID(stencilref_regid) |
1548                   COND(dual_src_blend, A6XX_SP_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1549    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
1550 
1551    uint32_t fs_render_components = 0;
1552 
1553    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
1554    for (uint32_t i = 0; i < ARRAY_SIZE(fragdata_regid); i++) {
1555       tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(fragdata_regid[i]) |
1556                      (COND(fragdata_regid[i] & HALF_REG_ID,
1557                            A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION)));
1558 
1559       if (VALIDREG(fragdata_regid[i])) {
1560          fs_render_components |= 0xf << (i * 4);
1561       }
1562    }
1563 
1564    /* dual source blending has an extra fs output in the 2nd slot */
1565    if (dual_src_blend) {
1566       fs_render_components |= 0xf << 4;
1567    }
1568 
1569    /* There is no point in enabling a component that is not written by the
1570     * shader. Per the VK spec this is UB; however, a few apps depend on the
1571     * attachment not being changed if the FS lacks the corresponding output.
1572     */
1573    fs_render_components &= render_components;
1574 
1575    tu_cs_emit_regs(cs,
1576                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = fs_render_components));
1577 
1578    tu_cs_emit_pkt4(cs, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
1579    tu_cs_emit(cs, COND(fs->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
1580                   COND(fs->writes_smask, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK) |
1581                   COND(fs->writes_stencilref, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_STENCILREF) |
1582                   COND(dual_src_blend, A6XX_RB_FS_OUTPUT_CNTL0_DUAL_COLOR_IN_ENABLE));
1583    tu_cs_emit(cs, A6XX_RB_FS_OUTPUT_CNTL1_MRT(mrt_count));
1584 
1585    tu_cs_emit_regs(cs,
1586                    A6XX_RB_RENDER_COMPONENTS(.dword = fs_render_components));
1587 
1588    if (pipeline) {
1589       pipeline->lrz.fs_has_kill = fs->has_kill;
1590       pipeline->lrz.early_fragment_tests = fs->fs.early_fragment_tests;
1591 
1592       if (!fs->fs.early_fragment_tests &&
1593           (fs->no_earlyz || fs->has_kill || fs->writes_pos || fs->writes_stencilref || no_earlyz || fs->writes_smask)) {
1594          pipeline->lrz.force_late_z = true;
1595       }
1596    }
1597 }
1598 
1599 static void
1600 tu6_emit_geom_tess_consts(struct tu_cs *cs,
1601                           const struct ir3_shader_variant *vs,
1602                           const struct ir3_shader_variant *hs,
1603                           const struct ir3_shader_variant *ds,
1604                           const struct ir3_shader_variant *gs,
1605                           uint32_t cps_per_patch)
1606 {
1607    struct tu_device *dev = cs->device;
1608 
1609    uint32_t num_vertices =
1610          hs ? cps_per_patch : gs->gs.vertices_in;
1611 
1612    uint32_t vs_params[4] = {
1613       vs->output_size * num_vertices * 4,  /* vs primitive stride */
1614       vs->output_size * 4,                 /* vs vertex stride */
1615       0,
1616       0,
1617    };
1618    uint32_t vs_base = ir3_const_state(vs)->offsets.primitive_param;
1619    tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
1620                   ARRAY_SIZE(vs_params), vs_params);
1621 
1622    if (hs) {
1623       assert(ds->type != MESA_SHADER_NONE);
1624 
1625       /* Create the shared tess factor BO the first time tess is used on the device. */
1626       mtx_lock(&dev->mutex);
1627       if (!dev->tess_bo)
1628          tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS);
1629       mtx_unlock(&dev->mutex);
1630 
1631       uint64_t tess_factor_iova = dev->tess_bo->iova;
1632       uint64_t tess_param_iova = tess_factor_iova + TU_TESS_FACTOR_SIZE;
1633 
1634       uint32_t hs_params[8] = {
1635          vs->output_size * num_vertices * 4,  /* hs primitive stride */
1636          vs->output_size * 4,                 /* hs vertex stride */
1637          hs->output_size,
1638          cps_per_patch,
1639          tess_param_iova,
1640          tess_param_iova >> 32,
1641          tess_factor_iova,
1642          tess_factor_iova >> 32,
1643       };
1644 
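      /* The variant's constlen (in vec4 units, hence the * 4) may be too
       * small to hold the full param struct; emit only the dwords that fit.
       */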
1645       uint32_t hs_base = hs->const_state->offsets.primitive_param;
1646       uint32_t hs_param_dwords = MIN2((hs->constlen - hs_base) * 4, ARRAY_SIZE(hs_params));
1647       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
1648                      hs_param_dwords, hs_params);
1649       if (gs)
1650          num_vertices = gs->gs.vertices_in;
1651 
1652       uint32_t ds_params[8] = {
1653          ds->output_size * num_vertices * 4,  /* ds primitive stride */
1654          ds->output_size * 4,                 /* ds vertex stride */
1655          hs->output_size,                     /* hs vertex stride (dwords) */
1656          hs->tess.tcs_vertices_out,
1657          tess_param_iova,
1658          tess_param_iova >> 32,
1659          tess_factor_iova,
1660          tess_factor_iova >> 32,
1661       };
1662 
1663       uint32_t ds_base = ds->const_state->offsets.primitive_param;
1664       uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params));
1665       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
1666                      ds_param_dwords, ds_params);
1667    }
1668 
1669    if (gs) {
1670       const struct ir3_shader_variant *prev = ds ? ds : vs;
1671       uint32_t gs_params[4] = {
1672          prev->output_size * num_vertices * 4,  /* gs primitive stride */
1673          prev->output_size * 4,                 /* gs vertex stride */
1674          0,
1675          0,
1676       };
1677       uint32_t gs_base = gs->const_state->offsets.primitive_param;
1678       tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
1679                      ARRAY_SIZE(gs_params), gs_params);
1680    }
1681 }
1682 
1683 static void
1684 tu6_emit_program_config(struct tu_cs *cs,
1685                         struct tu_pipeline_builder *builder)
1686 {
1687    gl_shader_stage stage = MESA_SHADER_VERTEX;
1688 
1689    STATIC_ASSERT(MESA_SHADER_VERTEX == 0);
1690 
1691    bool shared_consts_enable = tu6_shared_constants_enable(builder->layout,
1692          builder->device->compiler);
1693    tu6_emit_shared_consts_enable(cs, shared_consts_enable);
1694 
1695    tu_cs_emit_regs(cs, A6XX_HLSQ_INVALIDATE_CMD(
1696          .vs_state = true,
1697          .hs_state = true,
1698          .ds_state = true,
1699          .gs_state = true,
1700          .fs_state = true,
1701          .gfx_ibo = true,
1702          .gfx_shared_const = shared_consts_enable));
1703    for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) {
1704       tu6_emit_xs_config(cs, stage, builder->shaders->variants[stage]);
1705    }
1706 }
1707 
1708 static void
1709 tu6_emit_program(struct tu_cs *cs,
1710                  struct tu_pipeline_builder *builder,
1711                  bool binning_pass,
1712                  struct tu_pipeline *pipeline)
1713 {
1714    const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX];
1715    const struct ir3_shader_variant *bs = builder->binning_variant;
1716    const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL];
1717    const struct ir3_shader_variant *ds = builder->shaders->variants[MESA_SHADER_TESS_EVAL];
1718    const struct ir3_shader_variant *gs = builder->shaders->variants[MESA_SHADER_GEOMETRY];
1719    const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT];
1720    gl_shader_stage stage = MESA_SHADER_VERTEX;
1721    uint32_t cps_per_patch = builder->create_info->pTessellationState ?
1722       builder->create_info->pTessellationState->patchControlPoints : 0;
1723    bool multi_pos_output = builder->shaders->multi_pos_output;
1724 
1725    /* Don't use the binning pass variant when GS is present because we don't
1726     * support compiling correct binning pass variants with GS.
1727     */
1728    if (binning_pass && !gs) {
1729       vs = bs;
1730       tu6_emit_xs(cs, stage, bs, &builder->pvtmem, builder->binning_vs_iova);
1731       stage++;
1732    }
1733 
1734    for (; stage < ARRAY_SIZE(builder->shader_iova); stage++) {
1735       const struct ir3_shader_variant *xs = builder->shaders->variants[stage];
1736 
1737       if (stage == MESA_SHADER_FRAGMENT && binning_pass)
1738          fs = xs = NULL;
1739 
1740       tu6_emit_xs(cs, stage, xs, &builder->pvtmem, builder->shader_iova[stage]);
1741    }
1742 
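
   /* util_logbase2(mask) + 1 gives the view count for a view mask that is
    * contiguous from bit 0; e.g. a multiview_mask of 0xf yields 4 views.
    * Sparse masks are additionally programmed via PC_MULTIVIEW_MASK below
    * when the HW supports it.
    */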
1743    uint32_t multiview_views = util_logbase2(builder->multiview_mask) + 1;
1744    uint32_t multiview_cntl = builder->multiview_mask ?
1745       A6XX_PC_MULTIVIEW_CNTL_ENABLE |
1746       A6XX_PC_MULTIVIEW_CNTL_VIEWS(multiview_views) |
1747       COND(!multi_pos_output, A6XX_PC_MULTIVIEW_CNTL_DISABLEMULTIPOS)
1748       : 0;
1749 
1750    /* Copy what the blob does here. This will emit an extra 0x3f
1751     * CP_EVENT_WRITE when multiview is disabled. I'm not exactly sure what
1752     * this is working around yet.
1753     */
1754    if (builder->device->physical_device->info->a6xx.has_cp_reg_write) {
1755       tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
1756       tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(UNK_EVENT_WRITE));
1757       tu_cs_emit(cs, REG_A6XX_PC_MULTIVIEW_CNTL);
1758    } else {
1759       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_CNTL, 1);
1760    }
1761    tu_cs_emit(cs, multiview_cntl);
1762 
1763    tu_cs_emit_pkt4(cs, REG_A6XX_VFD_MULTIVIEW_CNTL, 1);
1764    tu_cs_emit(cs, multiview_cntl);
1765 
1766    if (multiview_cntl &&
1767        builder->device->physical_device->info->a6xx.supports_multiview_mask) {
1768       tu_cs_emit_pkt4(cs, REG_A6XX_PC_MULTIVIEW_MASK, 1);
1769       tu_cs_emit(cs, builder->multiview_mask);
1770    }
1771 
1772    tu_cs_emit_pkt4(cs, REG_A6XX_SP_HS_WAVE_INPUT_SIZE, 1);
1773    tu_cs_emit(cs, 0);
1774 
1775    tu6_emit_vpc(cs, vs, hs, ds, gs, fs, cps_per_patch);
1776    tu6_emit_vpc_varying_modes(cs, fs);
1777 
1778    bool no_earlyz = builder->depth_attachment_format == VK_FORMAT_S8_UINT;
1779    uint32_t mrt_count = builder->color_attachment_count;
1780    uint32_t render_components = builder->render_components;
1781 
1782    if (builder->alpha_to_coverage) {
1783       /* alpha to coverage can behave like a discard */
1784       no_earlyz = true;
1785       /* alpha value comes from first mrt */
1786       render_components |= 0xf;
1787       if (!mrt_count) {
1788          mrt_count = 1;
1789          /* Disable memory write for dummy mrt because it doesn't get set otherwise */
1790          tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0, .component_enable = 0));
1791       }
1792    }
1793 
1794    if (fs) {
1795       tu6_emit_fs_inputs(cs, fs);
1796       tu6_emit_fs_outputs(cs, fs, mrt_count,
1797                           builder->use_dual_src_blend,
1798                           render_components,
1799                           no_earlyz,
1800                           pipeline);
1801    } else {
1802       /* TODO: check if these can be skipped if fs is disabled */
1803       struct ir3_shader_variant dummy_variant = {};
1804       tu6_emit_fs_inputs(cs, &dummy_variant);
1805       tu6_emit_fs_outputs(cs, &dummy_variant, mrt_count,
1806                           builder->use_dual_src_blend,
1807                           render_components,
1808                           no_earlyz,
1809                           NULL);
1810    }
1811 
1812    if (gs || hs) {
1813       tu6_emit_geom_tess_consts(cs, vs, hs, ds, gs, cps_per_patch);
1814    }
1815 }
1816 
1817 #define TU6_EMIT_VERTEX_INPUT_MAX_DWORDS (MAX_VERTEX_ATTRIBS * 5 + 4)
1818 
1819 static void
1820 tu6_emit_vertex_input(struct tu_pipeline *pipeline,
1821                       struct tu_draw_state *vi_state,
1822                       const struct ir3_shader_variant *vs,
1823                       const VkPipelineVertexInputStateCreateInfo *info)
1824 {
1825    uint32_t binding_instanced = 0; /* bitmask of instanced bindings */
1826    uint32_t step_rate[MAX_VBS];
1827 
1828    struct tu_cs cs;
1829    tu_cs_begin_sub_stream(&pipeline->cs,
1830                           TU6_EMIT_VERTEX_INPUT_MAX_DWORDS, &cs);
1831 
1832    for (uint32_t i = 0; i < info->vertexBindingDescriptionCount; i++) {
1833       const VkVertexInputBindingDescription *binding =
1834          &info->pVertexBindingDescriptions[i];
1835 
1836       if (!(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_VB_STRIDE))) {
1837          tu_cs_emit_regs(&cs,
1838                         A6XX_VFD_FETCH_STRIDE(binding->binding, binding->stride));
1839       }
1840 
1841       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1842          binding_instanced |= 1 << binding->binding;
1843 
1844       step_rate[binding->binding] = 1;
1845    }
1846 
1847    const VkPipelineVertexInputDivisorStateCreateInfoEXT *div_state =
1848       vk_find_struct_const(info->pNext, PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT);
1849    if (div_state) {
1850       for (uint32_t i = 0; i < div_state->vertexBindingDivisorCount; i++) {
1851          const VkVertexInputBindingDivisorDescriptionEXT *desc =
1852             &div_state->pVertexBindingDivisors[i];
1853          step_rate[desc->binding] = desc->divisor;
1854       }
1855    }
1856 
1857    int32_t input_for_attr[MAX_VERTEX_ATTRIBS];
1858    uint32_t used_attrs_count = 0;
1859 
1860    for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
1861       input_for_attr[attr_idx] = -1;
1862       for (uint32_t input_idx = 0; input_idx < vs->inputs_count; input_idx++) {
1863          if ((vs->inputs[input_idx].slot - VERT_ATTRIB_GENERIC0) ==
1864              info->pVertexAttributeDescriptions[attr_idx].location) {
1865             input_for_attr[attr_idx] = input_idx;
1866             used_attrs_count++;
1867             break;
1868          }
1869       }
1870    }
1871 
1872    if (used_attrs_count)
1873       tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DECODE_INSTR(0), used_attrs_count * 2);
1874 
1875    for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
1876       const VkVertexInputAttributeDescription *attr =
1877          &info->pVertexAttributeDescriptions[attr_idx];
1878 
1879       if (input_for_attr[attr_idx] == -1)
1880          continue;
1881 
1882       const struct tu_native_format format = tu6_format_vtx(attr->format);
1883       tu_cs_emit(&cs, A6XX_VFD_DECODE_INSTR(0,
1884                        .idx = attr->binding,
1885                        .offset = attr->offset,
1886                        .instanced = binding_instanced & (1 << attr->binding),
1887                        .format = format.fmt,
1888                        .swap = format.swap,
1889                        .unk30 = 1,
1890                        ._float = !vk_format_is_int(attr->format)).value);
1891       tu_cs_emit(&cs, A6XX_VFD_DECODE_STEP_RATE(0, step_rate[attr->binding]).value);
1892    }
1893 
1894    if (used_attrs_count)
1895       tu_cs_emit_pkt4(&cs, REG_A6XX_VFD_DEST_CNTL_INSTR(0), used_attrs_count);
1896 
1897    for (uint32_t attr_idx = 0; attr_idx < info->vertexAttributeDescriptionCount; attr_idx++) {
1898       int32_t input_idx = input_for_attr[attr_idx];
1899       if (input_idx == -1)
1900          continue;
1901 
1902       tu_cs_emit(&cs, A6XX_VFD_DEST_CNTL_INSTR(0,
1903                        .writemask = vs->inputs[input_idx].compmask,
1904                        .regid = vs->inputs[input_idx].regid).value);
1905    }
1906 
1907    tu_cs_emit_regs(&cs,
1908                    A6XX_VFD_CONTROL_0(
1909                      .fetch_cnt = used_attrs_count, /* decode_cnt for binning pass ? */
1910                      .decode_cnt = used_attrs_count));
1911 
1912    *vi_state = tu_cs_end_draw_state(&pipeline->cs, &cs);
1913 }
1914 
1915 void
1916 tu6_emit_viewport(struct tu_cs *cs, const VkViewport *viewports, uint32_t num_viewport,
1917                   bool z_negative_one_to_one)
1918 {
1919    VkExtent2D guardband = {511, 511};
1920 
1921    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_VPORT_XOFFSET(0), num_viewport * 6);
1922    for (uint32_t i = 0; i < num_viewport; i++) {
1923       const VkViewport *viewport = &viewports[i];
1924       float offsets[3];
1925       float scales[3];
1926       scales[0] = viewport->width / 2.0f;
1927       scales[1] = viewport->height / 2.0f;
1928       if (z_negative_one_to_one) {
1929          scales[2] = 0.5 * (viewport->maxDepth - viewport->minDepth);
1930       } else {
1931          scales[2] = viewport->maxDepth - viewport->minDepth;
1932       }
1933 
1934       offsets[0] = viewport->x + scales[0];
1935       offsets[1] = viewport->y + scales[1];
1936       if (z_negative_one_to_one) {
1937          offsets[2] = 0.5 * (viewport->minDepth + viewport->maxDepth);
1938       } else {
1939          offsets[2] = viewport->minDepth;
1940       }
1941 
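      /* Illustrative mapping: a 1920x1080 viewport at (0, 0) yields
       * scale = (960, 540) and offset = (960, 540), transforming NDC x,y in
       * [-1, 1] to window coordinates [0, 1920] x [0, 1080].
       */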
1942       for (uint32_t j = 0; j < 3; j++) {
1943          tu_cs_emit(cs, fui(offsets[j]));
1944          tu_cs_emit(cs, fui(scales[j]));
1945       }
1946 
1947       guardband.width =
1948          MIN2(guardband.width, fd_calc_guardband(offsets[0], scales[0], false));
1949       guardband.height =
1950          MIN2(guardband.height, fd_calc_guardband(offsets[1], scales[1], false));
1951    }
1952 
1953    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0), num_viewport * 2);
1954    for (uint32_t i = 0; i < num_viewport; i++) {
1955       const VkViewport *viewport = &viewports[i];
1956       VkOffset2D min;
1957       VkOffset2D max;
1958       min.x = (int32_t) viewport->x;
1959       max.x = (int32_t) ceilf(viewport->x + viewport->width);
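      /* A negative viewport height (VK_KHR_maintenance1-style y-flip) puts
       * the window-space top edge at y + height, so the bounds swap below.
       */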
1960       if (viewport->height >= 0.0f) {
1961          min.y = (int32_t) viewport->y;
1962          max.y = (int32_t) ceilf(viewport->y + viewport->height);
1963       } else {
1964          min.y = (int32_t)(viewport->y + viewport->height);
1965          max.y = (int32_t) ceilf(viewport->y);
1966       }
1967       /* the spec allows viewport->height to be 0.0f */
1968       if (min.y == max.y)
1969          max.y++;
1970       /* allow viewport->width = 0.0f for uninitialized viewports: */
1971       if (min.x == max.x)
1972          max.x++;
1973 
1974       min.x = MAX2(min.x, 0);
1975       min.y = MAX2(min.y, 0);
1976       max.x = MAX2(max.x, 1);
1977       max.y = MAX2(max.y, 1);
1978 
1979       assert(min.x < max.x);
1980       assert(min.y < max.y);
1981 
1982       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_X(min.x) |
1983                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_Y(min.y));
1984       tu_cs_emit(cs, A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_X(max.x - 1) |
1985                      A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR_Y(max.y - 1));
1986    }
1987 
1988    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_Z_CLAMP(0), num_viewport * 2);
1989    for (uint32_t i = 0; i < num_viewport; i++) {
1990       const VkViewport *viewport = &viewports[i];
1991       tu_cs_emit(cs, fui(MIN2(viewport->minDepth, viewport->maxDepth)));
1992       tu_cs_emit(cs, fui(MAX2(viewport->minDepth, viewport->maxDepth)));
1993    }
1994    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
1995    tu_cs_emit(cs, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband.width) |
1996                   A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband.height));
1997 
1998    /* TODO: what to do about this with multiple viewports? */
1999    float z_clamp_min = num_viewport ? MIN2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
2000    float z_clamp_max = num_viewport ? MAX2(viewports[0].minDepth, viewports[0].maxDepth) : 0;
2001 
2002    tu_cs_emit_regs(cs,
2003                    A6XX_RB_Z_CLAMP_MIN(z_clamp_min),
2004                    A6XX_RB_Z_CLAMP_MAX(z_clamp_max));
2005 }
2006 
2007 void
2008 tu6_emit_scissor(struct tu_cs *cs, const VkRect2D *scissors, uint32_t scissor_count)
2009 {
2010    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0), scissor_count * 2);
2011 
2012    for (uint32_t i = 0; i < scissor_count; i++) {
2013       const VkRect2D *scissor = &scissors[i];
2014 
2015       uint32_t min_x = scissor->offset.x;
2016       uint32_t min_y = scissor->offset.y;
2017       uint32_t max_x = min_x + scissor->extent.width - 1;
2018       uint32_t max_y = min_y + scissor->extent.height - 1;
2019 
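      /* An empty scissor cannot be expressed directly, so program an
       * inverted window (min > max), which rejects every fragment.
       */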
2020       if (!scissor->extent.width || !scissor->extent.height) {
2021          min_x = min_y = 1;
2022          max_x = max_y = 0;
2023       } else {
2024          /* avoid overflow */
2025          uint32_t scissor_max = BITFIELD_MASK(15);
2026          min_x = MIN2(scissor_max, min_x);
2027          min_y = MIN2(scissor_max, min_y);
2028          max_x = MIN2(scissor_max, max_x);
2029          max_y = MIN2(scissor_max, max_y);
2030       }
2031 
2032       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_X(min_x) |
2033                      A6XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(min_y));
2034       tu_cs_emit(cs, A6XX_GRAS_SC_SCREEN_SCISSOR_BR_X(max_x) |
2035                      A6XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(max_y));
2036    }
2037 }
2038 
2039 void
2040 tu6_emit_sample_locations(struct tu_cs *cs, const VkSampleLocationsInfoEXT *samp_loc)
2041 {
2042    if (!samp_loc) {
2043       tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 1);
2044       tu_cs_emit(cs, 0);
2045 
2046       tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 1);
2047       tu_cs_emit(cs, 0);
2048 
2049       tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 1);
2050       tu_cs_emit(cs, 0);
2051       return;
2052    }
2053 
2054    assert(samp_loc->sampleLocationsPerPixel == samp_loc->sampleLocationsCount);
2055    assert(samp_loc->sampleLocationGridSize.width == 1);
2056    assert(samp_loc->sampleLocationGridSize.height == 1);
2057 
2058    uint32_t sample_config =
2059       A6XX_RB_SAMPLE_CONFIG_LOCATION_ENABLE;
2060    uint32_t sample_locations = 0;
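   /* Each sample appears to get an 8-bit field (a 4-bit fixed-point X/Y
    * pair), so sample i is packed at bit offset i * 8, up to 4 samples per
    * dword.
    */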
2061    for (uint32_t i = 0; i < samp_loc->sampleLocationsCount; i++) {
2062       sample_locations |=
2063          (A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_X(samp_loc->pSampleLocations[i].x) |
2064           A6XX_RB_SAMPLE_LOCATION_0_SAMPLE_0_Y(samp_loc->pSampleLocations[i].y)) << i*8;
2065    }
2066 
2067    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SAMPLE_CONFIG, 2);
2068    tu_cs_emit(cs, sample_config);
2069    tu_cs_emit(cs, sample_locations);
2070 
2071    tu_cs_emit_pkt4(cs, REG_A6XX_RB_SAMPLE_CONFIG, 2);
2072    tu_cs_emit(cs, sample_config);
2073    tu_cs_emit(cs, sample_locations);
2074 
2075    tu_cs_emit_pkt4(cs, REG_A6XX_SP_TP_SAMPLE_CONFIG, 2);
2076    tu_cs_emit(cs, sample_config);
2077    tu_cs_emit(cs, sample_locations);
2078 }
2079 
2080 static uint32_t
2081 tu6_gras_su_cntl(const VkPipelineRasterizationStateCreateInfo *rast_info,
2082                  enum a5xx_line_mode line_mode,
2083                  bool multiview)
2084 {
2085    uint32_t gras_su_cntl = 0;
2086 
2087    if (rast_info->cullMode & VK_CULL_MODE_FRONT_BIT)
2088       gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_FRONT;
2089    if (rast_info->cullMode & VK_CULL_MODE_BACK_BIT)
2090       gras_su_cntl |= A6XX_GRAS_SU_CNTL_CULL_BACK;
2091 
2092    if (rast_info->frontFace == VK_FRONT_FACE_CLOCKWISE)
2093       gras_su_cntl |= A6XX_GRAS_SU_CNTL_FRONT_CW;
2094 
2095    gras_su_cntl |=
2096       A6XX_GRAS_SU_CNTL_LINEHALFWIDTH(rast_info->lineWidth / 2.0f);
2097 
2098    if (rast_info->depthBiasEnable)
2099       gras_su_cntl |= A6XX_GRAS_SU_CNTL_POLY_OFFSET;
2100 
2101    gras_su_cntl |= A6XX_GRAS_SU_CNTL_LINE_MODE(line_mode);
2102 
2103    if (multiview) {
2104       gras_su_cntl |=
2105          A6XX_GRAS_SU_CNTL_UNK17 |
2106          A6XX_GRAS_SU_CNTL_MULTIVIEW_ENABLE;
2107    }
2108 
2109    return gras_su_cntl;
2110 }
2111 
2112 void
2113 tu6_emit_depth_bias(struct tu_cs *cs,
2114                     float constant_factor,
2115                     float clamp,
2116                     float slope_factor)
2117 {
2118    tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_SU_POLY_OFFSET_SCALE, 3);
2119    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_SCALE(slope_factor).value);
2120    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET(constant_factor).value);
2121    tu_cs_emit(cs, A6XX_GRAS_SU_POLY_OFFSET_OFFSET_CLAMP(clamp).value);
2122 }
2123 
2124 static uint32_t
2125 tu6_rb_mrt_blend_control(const VkPipelineColorBlendAttachmentState *att,
2126                          bool has_alpha)
2127 {
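   /* When the attachment format lacks an alpha channel, destination alpha is
    * defined to read as 1.0, so color factors that reference dst alpha are
    * swapped (tu_blend_factor_no_dst_alpha) for equivalents that don't.
    */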
2128    const enum a3xx_rb_blend_opcode color_op = tu6_blend_op(att->colorBlendOp);
2129    const enum adreno_rb_blend_factor src_color_factor = tu6_blend_factor(
2130       has_alpha ? att->srcColorBlendFactor
2131                 : tu_blend_factor_no_dst_alpha(att->srcColorBlendFactor));
2132    const enum adreno_rb_blend_factor dst_color_factor = tu6_blend_factor(
2133       has_alpha ? att->dstColorBlendFactor
2134                 : tu_blend_factor_no_dst_alpha(att->dstColorBlendFactor));
2135    const enum a3xx_rb_blend_opcode alpha_op = tu6_blend_op(att->alphaBlendOp);
2136    const enum adreno_rb_blend_factor src_alpha_factor =
2137       tu6_blend_factor(att->srcAlphaBlendFactor);
2138    const enum adreno_rb_blend_factor dst_alpha_factor =
2139       tu6_blend_factor(att->dstAlphaBlendFactor);
2140 
2141    return A6XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(src_color_factor) |
2142           A6XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(color_op) |
2143           A6XX_RB_MRT_BLEND_CONTROL_RGB_DEST_FACTOR(dst_color_factor) |
2144           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(src_alpha_factor) |
2145           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(alpha_op) |
2146           A6XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(dst_alpha_factor);
2147 }
2148 
2149 static uint32_t
2150 tu6_rb_mrt_control(const VkPipelineColorBlendAttachmentState *att,
2151                    uint32_t rb_mrt_control_rop,
2152                    bool has_alpha)
2153 {
2154    uint32_t rb_mrt_control =
2155       A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE(att->colorWriteMask);
2156 
2157    rb_mrt_control |= rb_mrt_control_rop;
2158 
2159    if (att->blendEnable) {
2160       rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND;
2161 
2162       if (has_alpha)
2163          rb_mrt_control |= A6XX_RB_MRT_CONTROL_BLEND2;
2164    }
2165 
2166    return rb_mrt_control;
2167 }
2168 
2169 uint32_t
2170 tu6_rb_mrt_control_rop(VkLogicOp op, bool *rop_reads_dst)
2171 {
2172    *rop_reads_dst = tu_logic_op_reads_dst(op);
2173    return A6XX_RB_MRT_CONTROL_ROP_ENABLE |
2174           A6XX_RB_MRT_CONTROL_ROP_CODE(tu6_rop(op));
2175 }
2176 
2177 static void
2178 tu6_emit_rb_mrt_controls(struct tu_pipeline *pipeline,
2179                          const VkPipelineColorBlendStateCreateInfo *blend_info,
2180                          const VkFormat attachment_formats[MAX_RTS],
2181                          bool *rop_reads_dst,
2182                          uint32_t *color_bandwidth_per_sample)
2183 {
2184    const VkPipelineColorWriteCreateInfoEXT *color_info =
2185       vk_find_struct_const(blend_info->pNext,
2186                            PIPELINE_COLOR_WRITE_CREATE_INFO_EXT);
2187 
2188    /* The static state is ignored if it's dynamic. In that case assume
2189     * everything is enabled and the appropriate registers will be zeroed
2190     * dynamically.
2191     */
2192    if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE))
2193       color_info = NULL;
2194 
2195    *rop_reads_dst = false;
2196    *color_bandwidth_per_sample = 0;
2197 
2198    uint32_t rb_mrt_control_rop = 0;
2199    if (blend_info->logicOpEnable) {
2200       pipeline->logic_op_enabled = true;
2201       rb_mrt_control_rop = tu6_rb_mrt_control_rop(blend_info->logicOp,
2202                                                   rop_reads_dst);
2203    }
2204 
2205    uint32_t total_bpp = 0;
2206    pipeline->num_rts = blend_info->attachmentCount;
2207    for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
2208       const VkPipelineColorBlendAttachmentState *att =
2209          &blend_info->pAttachments[i];
2210       const VkFormat format = attachment_formats[i];
2211 
2212       uint32_t rb_mrt_control = 0;
2213       uint32_t rb_mrt_blend_control = 0;
2214       if (format != VK_FORMAT_UNDEFINED &&
2215           (!color_info || color_info->pColorWriteEnables[i])) {
2216          const bool has_alpha = vk_format_has_alpha(format);
2217 
2218          rb_mrt_control =
2219             tu6_rb_mrt_control(att, rb_mrt_control_rop, has_alpha);
2220          rb_mrt_blend_control = tu6_rb_mrt_blend_control(att, has_alpha);
2221 
2222          /* calculate bpp based on format and write mask */
2223          uint32_t write_bpp = 0;
2224          if (att->colorWriteMask == 0xf) {
2225             write_bpp = vk_format_get_blocksizebits(format);
2226          } else {
2227             const enum pipe_format pipe_format = vk_format_to_pipe_format(format);
2228             for (uint32_t comp = 0; comp < 4; comp++) {
2229                if (att->colorWriteMask & (1 << comp)) {
2230                   write_bpp += util_format_get_component_bits(pipe_format,
2231                         UTIL_FORMAT_COLORSPACE_RGB, comp);
2232                }
2233             }
2234          }
2235          total_bpp += write_bpp;
2236 
2237          pipeline->color_write_enable |= BIT(i);
2238          if (att->blendEnable)
2239             pipeline->blend_enable |= BIT(i);
2240 
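         /* Blending, or a logic op that reads the destination, adds a read
          * of the same components on top of the write, so write_bpp is
          * counted twice when estimating bandwidth.
          */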
2241          if (att->blendEnable || *rop_reads_dst) {
2242             total_bpp += write_bpp;
2243          }
2244       }
2245 
2246       pipeline->rb_mrt_control[i] = rb_mrt_control & pipeline->rb_mrt_control_mask;
2247       pipeline->rb_mrt_blend_control[i] = rb_mrt_blend_control;
2248    }
2249 
2250    *color_bandwidth_per_sample = total_bpp / 8;
2251 }
2252 
2253 static void
2254 tu6_emit_blend_control(struct tu_pipeline *pipeline,
2255                        uint32_t blend_enable_mask,
2256                        bool dual_src_blend,
2257                        const VkPipelineMultisampleStateCreateInfo *msaa_info)
2258 {
2259    const uint32_t sample_mask =
2260       msaa_info->pSampleMask ? (*msaa_info->pSampleMask & 0xffff)
2261                              : ((1 << msaa_info->rasterizationSamples) - 1);
2262 
2263 
2264    pipeline->sp_blend_cntl =
2265        A6XX_SP_BLEND_CNTL(.enable_blend = blend_enable_mask,
2266                           .dual_color_in_enable = dual_src_blend,
2267                           .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2268                           .unk8 = true).value & pipeline->sp_blend_cntl_mask;
2269 
2270    /* set A6XX_RB_BLEND_CNTL_INDEPENDENT_BLEND only when enabled? */
2271    pipeline->rb_blend_cntl =
2272        A6XX_RB_BLEND_CNTL(.enable_blend = blend_enable_mask,
2273                           .independent_blend = true,
2274                           .sample_mask = sample_mask,
2275                           .dual_color_in_enable = dual_src_blend,
2276                           .alpha_to_coverage = msaa_info->alphaToCoverageEnable,
2277                           .alpha_to_one = msaa_info->alphaToOneEnable).value &
2278       pipeline->rb_blend_cntl_mask;
2279 }
2280 
2281 static void
2282 tu6_emit_blend(struct tu_cs *cs,
2283                struct tu_pipeline *pipeline)
2284 {
2285    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL(.dword = pipeline->sp_blend_cntl));
2286    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.dword = pipeline->rb_blend_cntl));
2287 
2288    for (unsigned i = 0; i < pipeline->num_rts; i++) {
2289       tu_cs_emit_regs(cs,
2290                       A6XX_RB_MRT_CONTROL(i, .dword = pipeline->rb_mrt_control[i]),
2291                       A6XX_RB_MRT_BLEND_CONTROL(i, .dword = pipeline->rb_mrt_blend_control[i]));
2292    }
2293 }
2294 
2295 static uint32_t
2296 calc_pvtmem_size(struct tu_device *dev, struct tu_pvtmem_config *config,
2297                  uint32_t pvtmem_bytes)
2298 {
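   /* Private memory is allocated in 512-byte units per fiber, and each SP's
    * chunk is then rounded up to a 4 KiB (1 << 12) boundary.
    */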
2299    uint32_t per_fiber_size = ALIGN(pvtmem_bytes, 512);
2300    uint32_t per_sp_size =
2301       ALIGN(per_fiber_size * dev->physical_device->info->a6xx.fibers_per_sp, 1 << 12);
2302 
2303    if (config) {
2304       config->per_fiber_size = per_fiber_size;
2305       config->per_sp_size = per_sp_size;
2306    }
2307 
2308    return dev->physical_device->info->num_sp_cores * per_sp_size;
2309 }
2310 
2311 static VkResult
2312 tu_setup_pvtmem(struct tu_device *dev,
2313                 struct tu_pipeline *pipeline,
2314                 struct tu_pvtmem_config *config,
2315                 uint32_t pvtmem_bytes, bool per_wave)
2316 {
2317    if (!pvtmem_bytes) {
2318       memset(config, 0, sizeof(*config));
2319       return VK_SUCCESS;
2320    }
2321 
2322    uint32_t total_size = calc_pvtmem_size(dev, config, pvtmem_bytes);
2323    config->per_wave = per_wave;
2324 
2325    VkResult result =
2326       tu_bo_init_new(dev, &pipeline->pvtmem_bo, total_size,
2327                      TU_BO_ALLOC_NO_FLAGS);
2328    if (result != VK_SUCCESS)
2329       return result;
2330 
2331    config->iova = pipeline->pvtmem_bo->iova;
2332 
2333    return result;
2334 }
2335 
2336 
2337 static VkResult
2338 tu_pipeline_allocate_cs(struct tu_device *dev,
2339                         struct tu_pipeline *pipeline,
2340                         struct tu_pipeline_layout *layout,
2341                         struct tu_pipeline_builder *builder,
2342                         struct ir3_shader_variant *compute)
2343 {
2344    uint32_t size = 1024 + tu6_load_state_size(pipeline, layout, compute);
2345 
2346    /* graphics case: */
2347    if (builder) {
2348       size += 2 * TU6_EMIT_VERTEX_INPUT_MAX_DWORDS;
2349 
2350       for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
2351          if (builder->shaders->variants[i]) {
2352             size += builder->shaders->variants[i]->info.size / 4;
2353          }
2354       }
2355 
2356       size += builder->binning_variant->info.size / 4;
2357 
2358       builder->additional_cs_reserve_size = 0;
2359       for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
2360          struct ir3_shader_variant *variant = builder->shaders->variants[i];
2361          if (variant) {
2362             builder->additional_cs_reserve_size +=
2363                tu_xs_get_additional_cs_size_dwords(variant);
2364 
2365             if (variant->binning) {
2366                builder->additional_cs_reserve_size +=
2367                   tu_xs_get_additional_cs_size_dwords(variant->binning);
2368             }
2369          }
2370       }
2371 
2372       /* The additional size is used twice, once per tu6_emit_program() call. */
2373       size += builder->additional_cs_reserve_size * 2;
2374    } else {
2375       size += compute->info.size / 4;
2376 
2377       size += tu_xs_get_additional_cs_size_dwords(compute);
2378    }
2379 
2380    /* Allocate the space for the pipeline out of the device's RO suballocator.
2381     *
2382     * Sub-allocating BOs saves memory and also kernel overhead in refcounting of
2383     * BOs at exec time.
2384     *
2385     * The pipeline cache would seem like a natural place to stick the
2386     * suballocator, except that it is not guaranteed to outlive the pipelines
2387     * created from it, so you can't store any long-lived state there, and you
2388     * can't use its EXTERNALLY_SYNCHRONIZED flag to avoid atomics because
2389     * pipeline destroy isn't synchronized by the cache.
2390     */
2391    pthread_mutex_lock(&dev->pipeline_mutex);
2392    VkResult result = tu_suballoc_bo_alloc(&pipeline->bo, &dev->pipeline_suballoc,
2393                                           size * 4, 128);
2394    pthread_mutex_unlock(&dev->pipeline_mutex);
2395    if (result != VK_SUCCESS)
2396       return result;
2397 
2398    tu_cs_init_suballoc(&pipeline->cs, dev, &pipeline->bo);
2399 
2400    return VK_SUCCESS;
2401 }
2402 
2403 static void
2404 tu_pipeline_shader_key_init(struct ir3_shader_key *key,
2405                             const struct tu_pipeline *pipeline,
2406                             const VkGraphicsPipelineCreateInfo *pipeline_info)
2407 {
2408    for (uint32_t i = 0; i < pipeline_info->stageCount; i++) {
2409       if (pipeline_info->pStages[i].stage == VK_SHADER_STAGE_GEOMETRY_BIT) {
2410          key->has_gs = true;
2411          break;
2412       }
2413    }
2414 
2415    if (pipeline_info->pRasterizationState->rasterizerDiscardEnable &&
2416        !(pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD)))
2417       return;
2418 
2419    const VkPipelineMultisampleStateCreateInfo *msaa_info = pipeline_info->pMultisampleState;
2420    const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
2421       vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
2422    if (msaa_info->rasterizationSamples > 1 ||
2423        /* also set the msaa key when the sample locations are not the
2424         * default, since this affects varying interpolation */
2425        (sample_locations && sample_locations->sampleLocationsEnable)) {
2426       key->msaa = true;
2427    }
2428 
2429    /* The 1.3.215 spec says:
2430     *
2431     *    Sample shading can be used to specify a minimum number of unique
2432     *    samples to process for each fragment. If sample shading is enabled,
2433     *    an implementation must provide a minimum of
2434     *
2435     *       max(ceil(minSampleShadingFactor * totalSamples), 1)
2436     *
2437     *    unique associated data for each fragment, where
2438     *    minSampleShadingFactor is the minimum fraction of sample shading.
2439     *
2440     * The definition is pretty much the same as OpenGL's GL_SAMPLE_SHADING.
2441     * They both require unique associated data.
2442     *
2443     * There are discussions to change the definition, such that
2444     * sampleShadingEnable does not imply unique associated data.  Before the
2445     * discussions are settled and before apps (i.e., ANGLE) are fixed to
2446     * follow the new and incompatible definition, we should stick to the
2447     * current definition.
2448     *
2449     * Note that ir3_shader_key::sample_shading is not actually used by ir3,
2450     * just checked in tu6_emit_fs_inputs.  We will also copy the value to
2451     * tu_shader_key::force_sample_interp in a bit.
2452     */
2453    if (msaa_info->sampleShadingEnable &&
2454        (msaa_info->minSampleShading * msaa_info->rasterizationSamples) > 1.0f)
2455       key->sample_shading = true;
2456 
2457    /* We set this after we compile to NIR because we need the prim mode */
2458    key->tessellation = IR3_TESS_NONE;
2459 }
2460 
2461 static uint32_t
2462 tu6_get_tessmode(struct tu_shader *shader)
2463 {
2464    enum tess_primitive_mode primitive_mode = shader->ir3_shader->nir->info.tess._primitive_mode;
2465    switch (primitive_mode) {
2466    case TESS_PRIMITIVE_ISOLINES:
2467       return IR3_TESS_ISOLINES;
2468    case TESS_PRIMITIVE_TRIANGLES:
2469       return IR3_TESS_TRIANGLES;
2470    case TESS_PRIMITIVE_QUADS:
2471       return IR3_TESS_QUADS;
2472    case TESS_PRIMITIVE_UNSPECIFIED:
2473       return IR3_TESS_NONE;
2474    default:
2475       unreachable("bad tessmode");
2476    }
2477 }
2478 
2479 static uint64_t
2480 tu_upload_variant(struct tu_pipeline *pipeline,
2481                   const struct ir3_shader_variant *variant)
2482 {
2483    struct tu_cs_memory memory;
2484 
2485    if (!variant)
2486       return 0;
2487 
2488    /* This expects to get enough alignment because shaders are allocated
2489     * first and the total size is always aligned correctly.
2490     * Note: an assert in tu6_emit_xs_config validates the alignment.
2491     */
2492    tu_cs_alloc(&pipeline->cs, variant->info.size / 4, 1, &memory);
2493 
2494    memcpy(memory.map, variant->bin, variant->info.size);
2495    return memory.iova;
2496 }
2497 
2498 static void
2499 tu_append_executable(struct tu_pipeline *pipeline, struct ir3_shader_variant *variant,
2500                      char *nir_from_spirv)
2501 {
2502    struct tu_pipeline_executable exe = {
2503       .stage = variant->type,
2504       .nir_from_spirv = nir_from_spirv,
2505       .nir_final = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.nir),
2506       .disasm = ralloc_strdup(pipeline->executables_mem_ctx, variant->disasm_info.disasm),
2507       .stats = variant->info,
2508       .is_binning = variant->binning_pass,
2509    };
2510 
2511    util_dynarray_append(&pipeline->executables, struct tu_pipeline_executable, exe);
2512 }
2513 
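/* Run cross-stage NIR linking optimizations over adjacent stage pairs,
 * walking from the last active stage back to the vertex shader: dead
 * varyings are removed and the survivors compacted before compiling.
 */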
2514 static void
2515 tu_link_shaders(struct tu_pipeline_builder *builder,
2516                 nir_shader **shaders, unsigned shaders_count)
2517 {
2518    nir_shader *consumer = NULL;
2519    for (gl_shader_stage stage = shaders_count - 1;
2520         stage >= MESA_SHADER_VERTEX; stage--) {
2521       if (!shaders[stage])
2522          continue;
2523 
2524       nir_shader *producer = shaders[stage];
2525       if (!consumer) {
2526          consumer = producer;
2527          continue;
2528       }
2529 
2530       if (nir_link_opt_varyings(producer, consumer)) {
2531          NIR_PASS_V(consumer, nir_opt_constant_folding);
2532          NIR_PASS_V(consumer, nir_opt_algebraic);
2533          NIR_PASS_V(consumer, nir_opt_dce);
2534       }
2535 
2536       NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
2537       NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
2538 
2539       bool progress = nir_remove_unused_varyings(producer, consumer);
2540 
2541       nir_compact_varyings(producer, consumer, true);
2542       if (progress) {
2543          if (nir_lower_global_vars_to_local(producer)) {
2544             /* Remove dead writes, which can remove input loads */
2545             NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_temp, NULL);
2546             NIR_PASS_V(producer, nir_opt_dce);
2547          }
2548          nir_lower_global_vars_to_local(consumer);
2549       }
2550 
2551       consumer = producer;
2552    }
2553 }
2554 
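/* Note: api_wavesize is the subgroup size that must be reported to the app,
 * while real_wavesize is what the compiler is actually allowed to pick; the
 * two only differ when the app leaves the choice open (this reading of the
 * ir3_wavesize_option values follows from how they are derived below).
 */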
2555 static void
2556 tu_shader_key_init(struct tu_shader_key *key,
2557                    const VkPipelineShaderStageCreateInfo *stage_info,
2558                    struct tu_device *dev)
2559 {
2560    enum ir3_wavesize_option api_wavesize, real_wavesize;
2561 
2562    if (stage_info) {
2563       if (stage_info->flags &
2564           VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT) {
2565          api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2566       } else {
2567          const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *size_info =
2568             vk_find_struct_const(stage_info->pNext,
2569                                  PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
2570 
2571          if (size_info) {
2572             if (size_info->requiredSubgroupSize == dev->compiler->threadsize_base) {
2573                api_wavesize = IR3_SINGLE_ONLY;
2574             } else {
2575                assert(size_info->requiredSubgroupSize == dev->compiler->threadsize_base * 2);
2576                api_wavesize = IR3_DOUBLE_ONLY;
2577             }
2578          } else {
2579             /* Match the exposed subgroupSize. */
2580             api_wavesize = IR3_DOUBLE_ONLY;
2581          }
2582 
2583          if (stage_info->flags &
2584              VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT)
2585             real_wavesize = api_wavesize;
2586          else if (api_wavesize == IR3_SINGLE_ONLY)
2587             real_wavesize = IR3_SINGLE_ONLY;
2588          else
2589             real_wavesize = IR3_SINGLE_OR_DOUBLE;
2590       }
2591    } else {
2592       api_wavesize = real_wavesize = IR3_SINGLE_OR_DOUBLE;
2593    }
2594 
2595    key->api_wavesize = api_wavesize;
2596    key->real_wavesize = real_wavesize;
2597 }
2598 
2599 static void
2600 tu_hash_stage(struct mesa_sha1 *ctx,
2601               const VkPipelineShaderStageCreateInfo *stage,
2602               const struct tu_shader_key *key)
2603 {
2604    unsigned char stage_hash[SHA1_DIGEST_LENGTH];
2605 
2606    vk_pipeline_hash_shader_stage(stage, stage_hash);
2607    _mesa_sha1_update(ctx, stage_hash, sizeof(stage_hash));
2608    _mesa_sha1_update(ctx, key, sizeof(*key));
2609 }
2610 
2611 /* Hash flags which can affect ir3 shader compilation which aren't known until
2612  * logical device creation.
2613  */
2614 static void
2615 tu_hash_compiler(struct mesa_sha1 *ctx, const struct ir3_compiler *compiler)
2616 {
2617    _mesa_sha1_update(ctx, &compiler->robust_buffer_access2,
2618                      sizeof(compiler->robust_buffer_access2));
2619    _mesa_sha1_update(ctx, &ir3_shader_debug, sizeof(ir3_shader_debug));
2620 }
2621 
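/* Compute the pipeline cache key for a graphics pipeline from the layout,
 * the per-stage shader keys, the ir3 key, and the compiler state.
 */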
2622 static void
2623 tu_hash_shaders(unsigned char *hash,
2624                 const VkPipelineShaderStageCreateInfo **stages,
2625                 const struct tu_pipeline_layout *layout,
2626                 const struct tu_shader_key *keys,
2627                 const struct ir3_shader_key *ir3_key,
2628                 const struct ir3_compiler *compiler)
2629 {
2630    struct mesa_sha1 ctx;
2631 
2632    _mesa_sha1_init(&ctx);
2633 
2634    if (layout)
2635       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
2636 
2637    _mesa_sha1_update(&ctx, ir3_key, sizeof(*ir3_key));
2638 
2639    for (int i = 0; i < MESA_SHADER_STAGES; ++i) {
2640       if (stages[i]) {
2641          tu_hash_stage(&ctx, stages[i], &keys[i]);
2642       }
2643    }
2644    tu_hash_compiler(&ctx, compiler);
2645    _mesa_sha1_final(&ctx, hash);
2646 }
2647 
2648 static void
2649 tu_hash_compute(unsigned char *hash,
2650                 const VkPipelineShaderStageCreateInfo *stage,
2651                 const struct tu_pipeline_layout *layout,
2652                 const struct tu_shader_key *key,
2653                 const struct ir3_compiler *compiler)
2654 {
2655    struct mesa_sha1 ctx;
2656 
2657    _mesa_sha1_init(&ctx);
2658 
2659    if (layout)
2660       _mesa_sha1_update(&ctx, layout->sha1, sizeof(layout->sha1));
2661 
2662    tu_hash_stage(&ctx, stage, key);
2663 
2664    tu_hash_compiler(&ctx, compiler);
2665    _mesa_sha1_final(&ctx, hash);
2666 }
2667 
2668 static bool
2669 tu_shaders_serialize(struct vk_pipeline_cache_object *object,
2670                      struct blob *blob);
2671 
2672 static struct vk_pipeline_cache_object *
2673 tu_shaders_deserialize(struct vk_device *device,
2674                        const void *key_data, size_t key_size,
2675                        struct blob_reader *blob);
2676 
2677 static void
2678 tu_shaders_destroy(struct vk_pipeline_cache_object *object)
2679 {
2680    struct tu_compiled_shaders *shaders =
2681       container_of(object, struct tu_compiled_shaders, base);
2682 
2683    for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++)
2684       ralloc_free(shaders->variants[i]);
2685 
2686    vk_pipeline_cache_object_finish(&shaders->base);
2687    vk_free(&object->device->alloc, shaders);
2688 }
2689 
2690 const struct vk_pipeline_cache_object_ops tu_shaders_ops = {
2691    .serialize = tu_shaders_serialize,
2692    .deserialize = tu_shaders_deserialize,
2693    .destroy = tu_shaders_destroy,
2694 };
2695 
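/* Allocate the cache object and a copy of its key in a single allocation, so
 * the object owns its key data.
 */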
2696 static struct tu_compiled_shaders *
2697 tu_shaders_init(struct tu_device *dev, const void *key_data, size_t key_size)
2698 {
2699    VK_MULTIALLOC(ma);
2700    VK_MULTIALLOC_DECL(&ma, struct tu_compiled_shaders, shaders, 1);
2701    VK_MULTIALLOC_DECL_SIZE(&ma, void, obj_key_data, key_size);
2702 
2703    if (!vk_multialloc_zalloc(&ma, &dev->vk.alloc,
2704                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2705       return NULL;
2706 
2707    memcpy(obj_key_data, key_data, key_size);
2708    vk_pipeline_cache_object_init(&dev->vk, &shaders->base,
2709                                  &tu_shaders_ops, obj_key_data, key_size);
2710 
2711    return shaders;
2712 }
2713 
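/* The serialized layout must mirror tu_shaders_deserialize() below: the push
 * constant ranges, two flag bytes, then a presence byte plus the stored
 * variant for each stage.
 */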
2714 static bool
2715 tu_shaders_serialize(struct vk_pipeline_cache_object *object,
2716                      struct blob *blob)
2717 {
2718    struct tu_compiled_shaders *shaders =
2719       container_of(object, struct tu_compiled_shaders, base);
2720 
2721    blob_write_bytes(blob, shaders->push_consts, sizeof(shaders->push_consts));
2722    blob_write_uint8(blob, shaders->active_desc_sets);
2723    blob_write_uint8(blob, shaders->multi_pos_output);
2724 
2725    for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) {
2726       if (shaders->variants[i]) {
2727          blob_write_uint8(blob, 1);
2728          ir3_store_variant(blob, shaders->variants[i]);
2729       } else {
2730          blob_write_uint8(blob, 0);
2731       }
2732    }
2733 
2734    return true;
2735 }
2736 
2737 static struct vk_pipeline_cache_object *
2738 tu_shaders_deserialize(struct vk_device *_device,
2739                        const void *key_data, size_t key_size,
2740                        struct blob_reader *blob)
2741 {
2742    struct tu_device *dev = container_of(_device, struct tu_device, vk);
2743    struct tu_compiled_shaders *shaders =
2744       tu_shaders_init(dev, key_data, key_size);
2745 
2746    if (!shaders)
2747       return NULL;
2748 
2749    blob_copy_bytes(blob, shaders->push_consts, sizeof(shaders->push_consts));
2750    shaders->active_desc_sets = blob_read_uint8(blob);
2751    shaders->multi_pos_output = blob_read_uint8(blob);
2752 
2753    for (unsigned i = 0; i < ARRAY_SIZE(shaders->variants); i++) {
2754       bool has_shader = blob_read_uint8(blob);
2755       if (has_shader) {
2756          shaders->variants[i] = ir3_retrieve_variant(blob, dev->compiler, NULL);
2757       }
2758    }
2759 
2760    return &shaders->base;
2761 }
2762 
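/* Thin wrappers over the common vk_pipeline_cache code that cast to/from
 * tu_compiled_shaders. Note that insert may return an existing equivalent
 * object rather than the one passed in, e.g. if another thread raced us.
 */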
2763 static struct tu_compiled_shaders *
2764 tu_pipeline_cache_lookup(struct vk_pipeline_cache *cache,
2765                          const void *key_data, size_t key_size,
2766                          bool *application_cache_hit)
2767 {
2768    struct vk_pipeline_cache_object *object =
2769       vk_pipeline_cache_lookup_object(cache, key_data, key_size,
2770                                       &tu_shaders_ops, application_cache_hit);
2771    if (object)
2772       return container_of(object, struct tu_compiled_shaders, base);
2773    else
2774       return NULL;
2775 }
2776 
2777 static struct tu_compiled_shaders *
2778 tu_pipeline_cache_insert(struct vk_pipeline_cache *cache,
2779                          struct tu_compiled_shaders *shaders)
2780 {
2781    struct vk_pipeline_cache_object *object =
2782       vk_pipeline_cache_add_object(cache, &shaders->base);
2783    return container_of(object, struct tu_compiled_shaders, base);
2784 }
2785 
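/* Overall flow: hash everything that affects compilation, try the pipeline
 * cache, and on a miss translate SPIR-V to NIR, link the stages, compile ir3
 * variants (recompiling any stage that needs a trimmed constlen), then insert
 * the result into the cache.
 */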
2786 static VkResult
2787 tu_pipeline_builder_compile_shaders(struct tu_pipeline_builder *builder,
2788                                     struct tu_pipeline *pipeline)
2789 {
2790    VkResult result = VK_SUCCESS;
2791    const struct ir3_compiler *compiler = builder->device->compiler;
2792    const VkPipelineShaderStageCreateInfo *stage_infos[MESA_SHADER_STAGES] = {
2793       NULL
2794    };
2795    VkPipelineCreationFeedback pipeline_feedback = {
2796       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
2797    };
2798    VkPipelineCreationFeedback stage_feedbacks[MESA_SHADER_STAGES] = { 0 };
2799 
2800    int64_t pipeline_start = os_time_get_nano();
2801 
2802    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
2803       vk_find_struct_const(builder->create_info->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
2804 
2805    for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
2806       gl_shader_stage stage =
2807          vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
2808       stage_infos[stage] = &builder->create_info->pStages[i];
2809    }
2810 
2811    if (tu6_shared_constants_enable(builder->layout, builder->device->compiler)) {
2812       pipeline->shared_consts = (struct tu_push_constant_range) {
2813          .lo = 0,
2814          .dwords = builder->layout->push_constant_size / 4,
2815       };
2816    }
2817 
2818    struct tu_shader_key keys[ARRAY_SIZE(stage_infos)] = { };
2819    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2820         stage < ARRAY_SIZE(keys); stage++) {
2821       tu_shader_key_init(&keys[stage], stage_infos[stage], builder->device);
2822    }
2823 
2824    struct ir3_shader_key ir3_key = {};
2825    tu_pipeline_shader_key_init(&ir3_key, pipeline, builder->create_info);
2826 
2827    keys[MESA_SHADER_VERTEX].multiview_mask = builder->multiview_mask;
2828    keys[MESA_SHADER_FRAGMENT].multiview_mask = builder->multiview_mask;
2829    keys[MESA_SHADER_FRAGMENT].force_sample_interp = ir3_key.sample_shading;
2830 
2831    unsigned char pipeline_sha1[SHA1_DIGEST_LENGTH];
2832    tu_hash_shaders(pipeline_sha1, stage_infos, builder->layout, keys, &ir3_key, compiler);
2833 
2834    const bool executable_info = builder->create_info->flags &
2835       VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
2836 
2837    char *nir_initial_disasm[ARRAY_SIZE(stage_infos)] = { NULL };
2838 
2839    struct tu_compiled_shaders *compiled_shaders = NULL;
2840 
2841    if (!executable_info) {
2842       bool application_cache_hit = false;
2843 
2844       compiled_shaders =
2845          tu_pipeline_cache_lookup(builder->cache, &pipeline_sha1,
2846                                   sizeof(pipeline_sha1),
2847                                   &application_cache_hit);
2848 
2849       if (application_cache_hit && builder->cache != builder->device->mem_cache) {
2850          pipeline_feedback.flags |=
2851             VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
2852       }
2853 
2854       if (compiled_shaders)
2855          goto done;
2856    }
2857 
2858    if (builder->create_info->flags &
2859        VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
2860       return VK_PIPELINE_COMPILE_REQUIRED;
2861    }
2862 
2863    nir_shader *nir[ARRAY_SIZE(stage_infos)] = { NULL };
2864 
2865    struct tu_shader *shaders[ARRAY_SIZE(nir)] = { NULL };
2866 
2867    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2868         stage < ARRAY_SIZE(nir); stage++) {
2869       const VkPipelineShaderStageCreateInfo *stage_info = stage_infos[stage];
2870       if (!stage_info)
2871          continue;
2872 
2873       int64_t stage_start = os_time_get_nano();
2874 
2875       nir[stage] = tu_spirv_to_nir(builder->device, builder->mem_ctx, stage_info, stage);
2876       if (!nir[stage]) {
2877          result = VK_ERROR_OUT_OF_HOST_MEMORY;
2878          goto fail;
2879       }
2880 
2881       stage_feedbacks[stage].flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT;
2882       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2883    }
2884 
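   /* If no fragment shader was provided (e.g. a rasterizer-discard-only
    * pipeline), substitute an empty no-op FS so the rest of the pipeline
    * setup always has a variant to work with.
    */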
2885    if (!nir[MESA_SHADER_FRAGMENT]) {
2886       const nir_shader_compiler_options *nir_options =
2887          ir3_get_compiler_options(builder->device->compiler);
2888       nir_builder fs_b = nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT,
2889                                                         nir_options,
2890                                                         "noop_fs");
2891       nir[MESA_SHADER_FRAGMENT] = fs_b.shader;
2892    }
2893 
2894    if (executable_info) {
2895       for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2896             stage < ARRAY_SIZE(nir); stage++) {
2897          if (!nir[stage])
2898             continue;
2899 
2900          nir_initial_disasm[stage] =
2901             nir_shader_as_str(nir[stage], pipeline->executables_mem_ctx);
2902       }
2903    }
2904 
2905    tu_link_shaders(builder, nir, ARRAY_SIZE(nir));
2906 
2907    uint32_t desc_sets = 0;
2908    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2909         stage < ARRAY_SIZE(nir); stage++) {
2910       if (!nir[stage])
2911          continue;
2912 
2913       int64_t stage_start = os_time_get_nano();
2914 
2915       struct tu_shader *shader =
2916          tu_shader_create(builder->device, nir[stage], &keys[stage],
2917                           builder->layout, builder->alloc);
2918       if (!shader) {
2919          result = VK_ERROR_OUT_OF_HOST_MEMORY;
2920          goto fail;
2921       }
2922 
2923       /* In SPIR-V generated from GLSL, the primitive mode is specified in the
2924        * tessellation evaluation shader, but in SPIR-V generated from HLSL,
2925        * the mode is specified in the tessellation control shader. */
2926       if ((stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_TESS_CTRL) &&
2927           ir3_key.tessellation == IR3_TESS_NONE) {
2928          ir3_key.tessellation = tu6_get_tessmode(shader);
2929       }
2930 
2931       if (stage > MESA_SHADER_TESS_CTRL) {
2932          if (stage == MESA_SHADER_FRAGMENT) {
2933             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2934                (nir[stage]->info.inputs_read & (1ull << VARYING_SLOT_PRIMITIVE_ID));
2935          } else {
2936             ir3_key.tcs_store_primid = ir3_key.tcs_store_primid ||
2937                BITSET_TEST(nir[stage]->info.system_values_read, SYSTEM_VALUE_PRIMITIVE_ID);
2938          }
2939       }
2940 
2941       /* Keep track of the status of each shader's active descriptor sets,
2942        * which is set in tu_lower_io. */
2943       desc_sets |= shader->active_desc_sets;
2944 
2945       shaders[stage] = shader;
2946 
2947       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2948    }
2949 
2950    struct tu_shader *last_shader = shaders[MESA_SHADER_GEOMETRY];
2951    if (!last_shader)
2952       last_shader = shaders[MESA_SHADER_TESS_EVAL];
2953    if (!last_shader)
2954       last_shader = shaders[MESA_SHADER_VERTEX];
2955 
2956    uint64_t outputs_written = last_shader->ir3_shader->nir->info.outputs_written;
2957 
2958    ir3_key.layer_zero = !(outputs_written & VARYING_BIT_LAYER);
2959    ir3_key.view_zero = !(outputs_written & VARYING_BIT_VIEWPORT);
2960 
2961    compiled_shaders =
2962       tu_shaders_init(builder->device, &pipeline_sha1, sizeof(pipeline_sha1));
2963 
2964    if (!compiled_shaders) {
2965       result = VK_ERROR_OUT_OF_HOST_MEMORY;
2966       goto fail;
2967    }
2968 
2969    compiled_shaders->active_desc_sets = desc_sets;
2970    compiled_shaders->multi_pos_output =
2971       shaders[MESA_SHADER_VERTEX]->multi_pos_output;
2972 
2973    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2974         stage < ARRAY_SIZE(shaders); stage++) {
2975       if (!shaders[stage])
2976          continue;
2977 
2978       int64_t stage_start = os_time_get_nano();
2979 
2980       compiled_shaders->variants[stage] =
2981          ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key,
2982                                    executable_info);
2983       if (!compiled_shaders->variants[stage]) {
2984          result = VK_ERROR_OUT_OF_HOST_MEMORY;
              goto fail;
           }
2985 
2986       compiled_shaders->push_consts[stage] = shaders[stage]->push_consts;
2987 
2988       stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
2989    }
2990 
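   /* The combined constlen of all stages may exceed what the HW can fit;
    * ir3_trim_constlen() returns a bitmask of the stages that have to be
    * recompiled with safe_constlen to bring it back under the limit.
    */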
2991    uint32_t safe_constlens = ir3_trim_constlen(compiled_shaders->variants, compiler);
2992 
2993    ir3_key.safe_constlen = true;
2994 
2995    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
2996         stage < ARRAY_SIZE(shaders); stage++) {
2997       if (!shaders[stage])
2998          continue;
2999 
3000       if (safe_constlens & (1 << stage)) {
3001          int64_t stage_start = os_time_get_nano();
3002 
3003          ralloc_free(compiled_shaders->variants[stage]);
3004          compiled_shaders->variants[stage] =
3005             ir3_shader_create_variant(shaders[stage]->ir3_shader, &ir3_key,
3006                                       executable_info);
3007          if (!compiled_shaders->variants[stage]) {
3008             result = VK_ERROR_OUT_OF_HOST_MEMORY;
3009             goto fail;
3010          }
3011 
3012          stage_feedbacks[stage].duration += os_time_get_nano() - stage_start;
3013       }
3014    }
3015 
3016    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
3017          stage < ARRAY_SIZE(nir); stage++) {
3018       if (shaders[stage]) {
3019          tu_shader_destroy(builder->device, shaders[stage], builder->alloc);
3020       }
3021    }
3022 
3023    compiled_shaders =
3024       tu_pipeline_cache_insert(builder->cache, compiled_shaders);
3025 
3026 done:
3027    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
3028          stage < ARRAY_SIZE(nir); stage++) {
3029       if (compiled_shaders->variants[stage]) {
3030          tu_append_executable(pipeline, compiled_shaders->variants[stage],
3031             nir_initial_disasm[stage]);
3032       }
3033    }
3034 
3035    struct ir3_shader_variant *vs =
3036       compiled_shaders->variants[MESA_SHADER_VERTEX];
3037 
3038    struct ir3_shader_variant *variant;
3039    if (!vs->stream_output.num_outputs && ir3_has_binning_vs(&vs->key)) {
3040       tu_append_executable(pipeline, vs->binning, NULL);
3041       variant = vs->binning;
3042    } else {
3043       variant = vs;
3044    }
3045 
3046    builder->binning_variant = variant;
3047 
3048    builder->shaders = compiled_shaders;
3049 
3050    pipeline->active_desc_sets = compiled_shaders->active_desc_sets;
3051    if (compiled_shaders->variants[MESA_SHADER_TESS_CTRL]) {
3052       pipeline->tess.patch_type =
3053          compiled_shaders->variants[MESA_SHADER_TESS_CTRL]->key.tessellation;
3054    }
3055 
3056    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
3057    if (creation_feedback) {
3058       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
3059 
3060       assert(builder->create_info->stageCount ==
3061              creation_feedback->pipelineStageCreationFeedbackCount);
3062       for (uint32_t i = 0; i < builder->create_info->stageCount; i++) {
3063          gl_shader_stage s =
3064             vk_to_mesa_shader_stage(builder->create_info->pStages[i].stage);
3065          creation_feedback->pPipelineStageCreationFeedbacks[i] = stage_feedbacks[s];
3066       }
3067    }
3068 
3069    return VK_SUCCESS;
3070 
3071 fail:
3072    for (gl_shader_stage stage = MESA_SHADER_VERTEX;
3073          stage < ARRAY_SIZE(nir); stage++) {
3074       if (shaders[stage]) {
3075          tu_shader_destroy(builder->device, shaders[stage], builder->alloc);
3076       }
3077    }
3078 
3079    if (compiled_shaders)
3080       vk_pipeline_cache_object_unref(&compiled_shaders->base);
3081 
3082    return result;
3083 }
3084 
3085 static void
3086 tu_pipeline_builder_parse_dynamic(struct tu_pipeline_builder *builder,
3087                                   struct tu_pipeline *pipeline)
3088 {
3089    const VkPipelineDynamicStateCreateInfo *dynamic_info =
3090       builder->create_info->pDynamicState;
3091 
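   /* Each *_mask starts out owning every bit of its register; the cases below
    * clear the bits controlled by a dynamic state, so that at draw time the
    * command buffer can OR in the dynamic values without clobbering the
    * pipeline's static bits.
    */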
3092    pipeline->gras_su_cntl_mask = ~0u;
3093    pipeline->rb_depth_cntl_mask = ~0u;
3094    pipeline->rb_stencil_cntl_mask = ~0u;
3095    pipeline->pc_raster_cntl_mask = ~0u;
3096    pipeline->vpc_unknown_9107_mask = ~0u;
3097    pipeline->sp_blend_cntl_mask = ~0u;
3098    pipeline->rb_blend_cntl_mask = ~0u;
3099    pipeline->rb_mrt_control_mask = ~0u;
3100 
3101    if (!dynamic_info)
3102       return;
3103 
3104    for (uint32_t i = 0; i < dynamic_info->dynamicStateCount; i++) {
3105       VkDynamicState state = dynamic_info->pDynamicStates[i];
3106       switch (state) {
3107       case VK_DYNAMIC_STATE_VIEWPORT ... VK_DYNAMIC_STATE_STENCIL_REFERENCE:
3108          if (state == VK_DYNAMIC_STATE_LINE_WIDTH)
3109             pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_LINEHALFWIDTH__MASK;
3110          pipeline->dynamic_state_mask |= BIT(state);
3111          break;
3112       case VK_DYNAMIC_STATE_SAMPLE_LOCATIONS_EXT:
3113          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_SAMPLE_LOCATIONS);
3114          break;
3115       case VK_DYNAMIC_STATE_CULL_MODE:
3116          pipeline->gras_su_cntl_mask &=
3117             ~(A6XX_GRAS_SU_CNTL_CULL_BACK | A6XX_GRAS_SU_CNTL_CULL_FRONT);
3118          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
3119          break;
3120       case VK_DYNAMIC_STATE_FRONT_FACE:
3121          pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_FRONT_CW;
3122          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
3123          break;
3124       case VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY:
3125          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY);
3126          break;
3127       case VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE:
3128          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_VB_STRIDE);
3129          break;
3130       case VK_DYNAMIC_STATE_VIEWPORT_WITH_COUNT:
3131          pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_VIEWPORT);
3132          break;
3133       case VK_DYNAMIC_STATE_SCISSOR_WITH_COUNT:
3134          pipeline->dynamic_state_mask |= BIT(VK_DYNAMIC_STATE_SCISSOR);
3135          break;
3136       case VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE:
3137          pipeline->rb_depth_cntl_mask &=
3138             ~(A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
3139          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3140          break;
3141       case VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE:
3142          pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
3143          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3144          break;
3145       case VK_DYNAMIC_STATE_DEPTH_COMPARE_OP:
3146          pipeline->rb_depth_cntl_mask &= ~A6XX_RB_DEPTH_CNTL_ZFUNC__MASK;
3147          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3148          break;
3149       case VK_DYNAMIC_STATE_DEPTH_BOUNDS_TEST_ENABLE:
3150          pipeline->rb_depth_cntl_mask &=
3151             ~(A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE);
3152          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL);
3153          break;
3154       case VK_DYNAMIC_STATE_STENCIL_TEST_ENABLE:
3155          pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
3156                                              A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
3157                                              A6XX_RB_STENCIL_CONTROL_STENCIL_READ);
3158          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
3159          break;
3160       case VK_DYNAMIC_STATE_STENCIL_OP:
3161          pipeline->rb_stencil_cntl_mask &= ~(A6XX_RB_STENCIL_CONTROL_FUNC__MASK |
3162                                              A6XX_RB_STENCIL_CONTROL_FAIL__MASK |
3163                                              A6XX_RB_STENCIL_CONTROL_ZPASS__MASK |
3164                                              A6XX_RB_STENCIL_CONTROL_ZFAIL__MASK |
3165                                              A6XX_RB_STENCIL_CONTROL_FUNC_BF__MASK |
3166                                              A6XX_RB_STENCIL_CONTROL_FAIL_BF__MASK |
3167                                              A6XX_RB_STENCIL_CONTROL_ZPASS_BF__MASK |
3168                                              A6XX_RB_STENCIL_CONTROL_ZFAIL_BF__MASK);
3169          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RB_STENCIL_CNTL);
3170          break;
3171       case VK_DYNAMIC_STATE_DEPTH_BIAS_ENABLE:
3172          pipeline->gras_su_cntl_mask &= ~A6XX_GRAS_SU_CNTL_POLY_OFFSET;
3173          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_GRAS_SU_CNTL);
3174          break;
3175       case VK_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE:
3176          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_PRIMITIVE_RESTART_ENABLE);
3177          break;
3178       case VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE:
3179          pipeline->pc_raster_cntl_mask &= ~A6XX_PC_RASTER_CNTL_DISCARD;
3180          pipeline->vpc_unknown_9107_mask &= ~A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
3181          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_RASTERIZER_DISCARD);
3182          break;
3183       case VK_DYNAMIC_STATE_LOGIC_OP_EXT:
3184          pipeline->sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK;
3185          pipeline->rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK;
3186          pipeline->rb_mrt_control_mask &= ~A6XX_RB_MRT_CONTROL_ROP_CODE__MASK;
3187          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND);
3188          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_LOGIC_OP);
3189          break;
3190       case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
3191          pipeline->sp_blend_cntl_mask &= ~A6XX_SP_BLEND_CNTL_ENABLE_BLEND__MASK;
3192          pipeline->rb_blend_cntl_mask &= ~A6XX_RB_BLEND_CNTL_ENABLE_BLEND__MASK;
3193          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_BLEND);
3194 
3195          /* Dynamic color write enable doesn't directly change any of the
3196           * registers, but it causes us to make some of the registers 0, so we
3197           * set this dynamic state instead of making the register dynamic.
3198           */
3199          pipeline->dynamic_state_mask |= BIT(TU_DYNAMIC_STATE_COLOR_WRITE_ENABLE);
3200          break;
3201       default:
3202          assert(!"unsupported dynamic state");
3203          break;
3204       }
3205    }
3206 }
3207 
3208 static void
3209 tu_pipeline_set_linkage(struct tu_program_descriptor_linkage *link,
3210                         struct tu_push_constant_range *push_consts,
3211                         struct ir3_shader_variant *v)
3212 {
3213    link->const_state = *ir3_const_state(v);
3214    link->constlen = v->constlen;
3215    link->push_consts = *push_consts;
3216 }
3217 
3218 static void
3219 tu_pipeline_builder_parse_shader_stages(struct tu_pipeline_builder *builder,
3220                                         struct tu_pipeline *pipeline)
3221 {
3222    struct tu_cs prog_cs;
3223 
3224    /* Emit HLSQ_xS_CNTL/HLSQ_SP_xS_CONFIG *first*, before emitting anything
3225     * else that could depend on that state (like push constants)
3226     *
3227     * Note also that this always uses the full VS even in binning pass.  The
3228     * binning pass variant has the same const layout as the full VS, and
3229     * the constlen for the VS will be the same or greater than the constlen
3230     * for the binning pass variant.  It is required that the constlen state
3231     * matches between binning and draw passes, as some parts of the push
3232     * consts are emitted in state groups that are shared between the binning
3233     * and draw passes.
3234     */
3235    tu_cs_begin_sub_stream(&pipeline->cs, 512, &prog_cs);
3236    tu6_emit_program_config(&prog_cs, builder);
3237    pipeline->program.config_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3238 
3239    tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
3240    tu6_emit_program(&prog_cs, builder, false, pipeline);
3241    pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3242 
3243    tu_cs_begin_sub_stream(&pipeline->cs, 512 + builder->additional_cs_reserve_size, &prog_cs);
3244    tu6_emit_program(&prog_cs, builder, true, pipeline);
3245    pipeline->program.binning_state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
3246 
3247    VkShaderStageFlags stages = 0;
3248    for (unsigned i = 0; i < builder->create_info->stageCount; i++) {
3249       stages |= builder->create_info->pStages[i].stage;
3250    }
3251    pipeline->active_stages = stages;
3252 
3253    for (unsigned i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
3254       if (!builder->shaders->variants[i])
3255          continue;
3256 
3257       tu_pipeline_set_linkage(&pipeline->program.link[i],
3258                               &builder->shaders->push_consts[i],
3259                               builder->shaders->variants[i]);
3260    }
3261 }
3262 
3263 static void
3264 tu_pipeline_builder_parse_vertex_input(struct tu_pipeline_builder *builder,
3265                                        struct tu_pipeline *pipeline)
3266 {
3267    const VkPipelineVertexInputStateCreateInfo *vi_info =
3268       builder->create_info->pVertexInputState;
3269    const struct ir3_shader_variant *vs = builder->shaders->variants[MESA_SHADER_VERTEX];
3270    const struct ir3_shader_variant *bs = builder->binning_variant;
3271 
3272    /* Bindings may contain holes */
3273    for (unsigned i = 0; i < vi_info->vertexBindingDescriptionCount; i++) {
3274       pipeline->num_vbs =
3275          MAX2(pipeline->num_vbs, vi_info->pVertexBindingDescriptions[i].binding + 1);
3276    }
3277 
3278    tu6_emit_vertex_input(pipeline, &pipeline->vi.state, vs, vi_info);
3279    if (bs)
3280       tu6_emit_vertex_input(pipeline, &pipeline->vi.binning_state, bs, vi_info);
3281 }
3282 
3283 static void
3284 tu_pipeline_builder_parse_input_assembly(struct tu_pipeline_builder *builder,
3285                                          struct tu_pipeline *pipeline)
3286 {
3287    const VkPipelineInputAssemblyStateCreateInfo *ia_info =
3288       builder->create_info->pInputAssemblyState;
3289 
3290    pipeline->ia.primtype = tu6_primtype(ia_info->topology);
3291    pipeline->ia.primitive_restart = ia_info->primitiveRestartEnable;
3292 }
3293 
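/* If the given state is static (not in dynamic_state_mask), begin a draw
 * state of the requested size for the caller to fill and return true;
 * otherwise return false and leave it to the command buffer to emit.
 */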
3294 static bool
3295 tu_pipeline_static_state(struct tu_pipeline *pipeline, struct tu_cs *cs,
3296                          uint32_t id, uint32_t size)
3297 {
3298    assert(id < ARRAY_SIZE(pipeline->dynamic_state));
3299 
3300    if (pipeline->dynamic_state_mask & BIT(id))
3301       return false;
3302 
3303    pipeline->dynamic_state[id] = tu_cs_draw_state(&pipeline->cs, cs, size);
3304    return true;
3305 }
3306 
3307 static void
3308 tu_pipeline_builder_parse_tessellation(struct tu_pipeline_builder *builder,
3309                                        struct tu_pipeline *pipeline)
3310 {
3311    if (!(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) ||
3312        !(pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT))
3313       return;
3314 
3315    const VkPipelineTessellationStateCreateInfo *tess_info =
3316       builder->create_info->pTessellationState;
3317 
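   /* The patch size is encoded directly in the primitive type:
    * DI_PT_PATCHES0 + N selects patches with N control points (hence the
    * assert on the 32 control-point maximum below).
    */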
3318    assert(pipeline->ia.primtype == DI_PT_PATCHES0);
3319    assert(tess_info->patchControlPoints <= 32);
3320    pipeline->ia.primtype += tess_info->patchControlPoints;
3321    const VkPipelineTessellationDomainOriginStateCreateInfo *domain_info =
3322          vk_find_struct_const(tess_info->pNext, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO);
3323    pipeline->tess.upper_left_domain_origin = !domain_info ||
3324          domain_info->domainOrigin == VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
3325    const struct ir3_shader_variant *hs = builder->shaders->variants[MESA_SHADER_TESS_CTRL];
3326    pipeline->tess.param_stride = hs->output_size * 4;
3327 }
3328 
3329 static void
3330 tu_pipeline_builder_parse_viewport(struct tu_pipeline_builder *builder,
3331                                    struct tu_pipeline *pipeline)
3332 {
3333    /* The spec says:
3334     *
3335     *    pViewportState is a pointer to an instance of the
3336     *    VkPipelineViewportStateCreateInfo structure, and is ignored if the
3337     *    pipeline has rasterization disabled.
3338     *
3339     * We leave the relevant registers stale in that case.
3340     */
3341    if (builder->rasterizer_discard)
3342       return;
3343 
3344    const VkPipelineViewportStateCreateInfo *vp_info =
3345       builder->create_info->pViewportState;
3346    const VkPipelineViewportDepthClipControlCreateInfoEXT *depth_clip_info =
3347          vk_find_struct_const(vp_info->pNext, PIPELINE_VIEWPORT_DEPTH_CLIP_CONTROL_CREATE_INFO_EXT);
3348    pipeline->z_negative_one_to_one = depth_clip_info ? depth_clip_info->negativeOneToOne : false;
3349 
3350    struct tu_cs cs;
3351 
3352    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_VIEWPORT, 8 + 10 * vp_info->viewportCount))
3353       tu6_emit_viewport(&cs, vp_info->pViewports, vp_info->viewportCount, pipeline->z_negative_one_to_one);
3354 
3355    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_SCISSOR, 1 + 2 * vp_info->scissorCount))
3356       tu6_emit_scissor(&cs, vp_info->pScissors, vp_info->scissorCount);
3357 }
3358 
3359 static void
3360 tu_pipeline_builder_parse_rasterization(struct tu_pipeline_builder *builder,
3361                                         struct tu_pipeline *pipeline)
3362 {
3363    const VkPipelineRasterizationStateCreateInfo *rast_info =
3364       builder->create_info->pRasterizationState;
3365 
3366    enum a6xx_polygon_mode mode = tu6_polygon_mode(rast_info->polygonMode);
3367 
3368    builder->depth_clip_disable = rast_info->depthClampEnable;
3369 
3370    const VkPipelineRasterizationDepthClipStateCreateInfoEXT *depth_clip_state =
3371       vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT);
3372    if (depth_clip_state)
3373       builder->depth_clip_disable = !depth_clip_state->depthClipEnable;
3374 
3375    pipeline->line_mode = RECTANGULAR;
3376 
3377    if (tu6_primtype_line(pipeline->ia.primtype) ||
3378        (tu6_primtype_patches(pipeline->ia.primtype) &&
3379         pipeline->tess.patch_type == IR3_TESS_ISOLINES)) {
3380       const VkPipelineRasterizationLineStateCreateInfoEXT *rast_line_state =
3381          vk_find_struct_const(rast_info->pNext,
3382                               PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
3383 
3384       if (rast_line_state && rast_line_state->lineRasterizationMode ==
3385                VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT) {
3386          pipeline->line_mode = BRESENHAM;
3387       }
3388    }
3389 
3390    struct tu_cs cs;
3391    uint32_t cs_size = 9 +
3392       (builder->device->physical_device->info->a6xx.has_shading_rate ? 8 : 0) +
3393       (builder->emit_msaa_state ? 11 : 0);
3394    pipeline->rast_state = tu_cs_draw_state(&pipeline->cs, &cs, cs_size);
3395 
3396    tu_cs_emit_regs(&cs,
3397                    A6XX_GRAS_CL_CNTL(
3398                      .znear_clip_disable = builder->depth_clip_disable,
3399                      .zfar_clip_disable = builder->depth_clip_disable,
3400                      /* TODO should this be depth_clip_disable instead? */
3401                      .unk5 = rast_info->depthClampEnable,
3402                      .zero_gb_scale_z = pipeline->z_negative_one_to_one ? 0 : 1,
3403                      .vp_clip_code_ignore = 1));
3404 
3405    tu_cs_emit_regs(&cs,
3406                    A6XX_VPC_POLYGON_MODE(mode));
3407 
3408    tu_cs_emit_regs(&cs,
3409                    A6XX_PC_POLYGON_MODE(mode));
3410 
3411    /* move to hw ctx init? */
3412    tu_cs_emit_regs(&cs,
3413                    A6XX_GRAS_SU_POINT_MINMAX(.min = 1.0f / 16.0f, .max = 4092.0f),
3414                    A6XX_GRAS_SU_POINT_SIZE(1.0f));
3415 
3416    if (builder->device->physical_device->info->a6xx.has_shading_rate) {
3417       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A00());
3418       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A10());
3419       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A20());
3420       tu_cs_emit_regs(&cs, A6XX_RB_UNKNOWN_8A30());
3421    }
3422 
3423    /* If the sample count couldn't be determined from the subpass, emit it here.
3424     * This happens when the subpass doesn't use any color/depth attachments.
3425     */
3426    if (builder->emit_msaa_state)
3427       tu6_emit_msaa(&cs, builder->samples, pipeline->line_mode);
3428 
3429    const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
3430       vk_find_struct_const(rast_info->pNext,
3431                            PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
3432    unsigned stream = stream_info ? stream_info->rasterizationStream : 0;
3433 
3434    pipeline->pc_raster_cntl = A6XX_PC_RASTER_CNTL_STREAM(stream);
3435    pipeline->vpc_unknown_9107 = 0;
3436    if (rast_info->rasterizerDiscardEnable) {
3437       pipeline->pc_raster_cntl |= A6XX_PC_RASTER_CNTL_DISCARD;
3438       pipeline->vpc_unknown_9107 |= A6XX_VPC_UNKNOWN_9107_RASTER_DISCARD;
3439    }
3440 
3441    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RASTERIZER_DISCARD, 4)) {
3442       tu_cs_emit_regs(&cs, A6XX_PC_RASTER_CNTL(.dword = pipeline->pc_raster_cntl));
3443       tu_cs_emit_regs(&cs, A6XX_VPC_UNKNOWN_9107(.dword = pipeline->vpc_unknown_9107));
3444    }
3445 
3446    pipeline->gras_su_cntl =
3447       tu6_gras_su_cntl(rast_info, pipeline->line_mode, builder->multiview_mask != 0);
3448 
3449    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_GRAS_SU_CNTL, 2))
3450       tu_cs_emit_regs(&cs, A6XX_GRAS_SU_CNTL(.dword = pipeline->gras_su_cntl));
3451 
3452    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BIAS, 4)) {
3453       tu6_emit_depth_bias(&cs, rast_info->depthBiasConstantFactor,
3454                           rast_info->depthBiasClamp,
3455                           rast_info->depthBiasSlopeFactor);
3456    }
3457 
3458    const struct VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *provoking_vtx_state =
3459       vk_find_struct_const(rast_info->pNext, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
3460    pipeline->provoking_vertex_last = provoking_vtx_state &&
3461       provoking_vtx_state->provokingVertexMode == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT;
3462 }
3463 
3464 static void
3465 tu_pipeline_builder_parse_depth_stencil(struct tu_pipeline_builder *builder,
3466                                         struct tu_pipeline *pipeline)
3467 {
3468    /* The spec says:
3469     *
3470     *    pDepthStencilState is a pointer to an instance of the
3471     *    VkPipelineDepthStencilStateCreateInfo structure, and is ignored if
3472     *    the pipeline has rasterization disabled or if the subpass of the
3473     *    render pass the pipeline is created against does not use a
3474     *    depth/stencil attachment.
3475     */
3476    const VkPipelineDepthStencilStateCreateInfo *ds_info =
3477       builder->create_info->pDepthStencilState;
3478    const enum pipe_format pipe_format =
3479       vk_format_to_pipe_format(builder->depth_attachment_format);
3480    uint32_t rb_depth_cntl = 0, rb_stencil_cntl = 0;
3481    struct tu_cs cs;
3482 
3483    if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED &&
3484        builder->depth_attachment_format != VK_FORMAT_S8_UINT) {
3485       if (ds_info->depthTestEnable) {
3486          rb_depth_cntl |=
3487             A6XX_RB_DEPTH_CNTL_Z_TEST_ENABLE |
3488             A6XX_RB_DEPTH_CNTL_ZFUNC(tu6_compare_func(ds_info->depthCompareOp)) |
3489             A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE; /* TODO: don't set for ALWAYS/NEVER */
3490 
3491          if (builder->depth_clip_disable)
3492             rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_CLIP_DISABLE;
3493 
3494          if (ds_info->depthWriteEnable)
3495             rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_WRITE_ENABLE;
3496       }
3497 
3498       if (ds_info->depthBoundsTestEnable)
3499          rb_depth_cntl |= A6XX_RB_DEPTH_CNTL_Z_BOUNDS_ENABLE | A6XX_RB_DEPTH_CNTL_Z_READ_ENABLE;
3500 
3501       if (ds_info->depthBoundsTestEnable && !ds_info->depthTestEnable)
3502          tu6_apply_depth_bounds_workaround(builder->device, &rb_depth_cntl);
3503 
3504       pipeline->depth_cpp_per_sample = util_format_get_component_bits(
3505             pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 0) / 8;
3506    } else {
3507       /* if RB_DEPTH_CNTL is set dynamically, we need to make sure it is set
3508        * to 0 when this pipeline is used, as enabling depth test when there
3509        * is no depth attachment is a problem (at least for the S8_UINT case)
3510        */
3511       if (pipeline->dynamic_state_mask & BIT(TU_DYNAMIC_STATE_RB_DEPTH_CNTL))
3512          pipeline->rb_depth_cntl_disable = true;
3513    }
3514 
3515    if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
3516       const VkStencilOpState *front = &ds_info->front;
3517       const VkStencilOpState *back = &ds_info->back;
3518 
3519       rb_stencil_cntl |=
3520          A6XX_RB_STENCIL_CONTROL_FUNC(tu6_compare_func(front->compareOp)) |
3521          A6XX_RB_STENCIL_CONTROL_FAIL(tu6_stencil_op(front->failOp)) |
3522          A6XX_RB_STENCIL_CONTROL_ZPASS(tu6_stencil_op(front->passOp)) |
3523          A6XX_RB_STENCIL_CONTROL_ZFAIL(tu6_stencil_op(front->depthFailOp)) |
3524          A6XX_RB_STENCIL_CONTROL_FUNC_BF(tu6_compare_func(back->compareOp)) |
3525          A6XX_RB_STENCIL_CONTROL_FAIL_BF(tu6_stencil_op(back->failOp)) |
3526          A6XX_RB_STENCIL_CONTROL_ZPASS_BF(tu6_stencil_op(back->passOp)) |
3527          A6XX_RB_STENCIL_CONTROL_ZFAIL_BF(tu6_stencil_op(back->depthFailOp));
3528 
3529       if (ds_info->stencilTestEnable) {
3530          rb_stencil_cntl |=
3531             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE |
3532             A6XX_RB_STENCIL_CONTROL_STENCIL_ENABLE_BF |
3533             A6XX_RB_STENCIL_CONTROL_STENCIL_READ;
3534       }
3535 
3536       pipeline->stencil_cpp_per_sample = util_format_get_component_bits(
3537             pipe_format, UTIL_FORMAT_COLORSPACE_ZS, 1) / 8;
3538    }
3539 
3540    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_DEPTH_CNTL, 2)) {
3541       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_DEPTH_CNTL, 1);
3542       tu_cs_emit(&cs, rb_depth_cntl);
3543    }
3544    pipeline->rb_depth_cntl = rb_depth_cntl;
3545 
3546    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_RB_STENCIL_CNTL, 2)) {
3547       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_STENCIL_CONTROL, 1);
3548       tu_cs_emit(&cs, rb_stencil_cntl);
3549    }
3550    pipeline->rb_stencil_cntl = rb_stencil_cntl;
3551 
3552    /* the remaining draw states aren't used if there is no d/s, leave them empty */
3553    if (builder->depth_attachment_format == VK_FORMAT_UNDEFINED)
3554       return;
3555 
3556    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_DEPTH_BOUNDS, 3)) {
3557       tu_cs_emit_regs(&cs,
3558                       A6XX_RB_Z_BOUNDS_MIN(ds_info->minDepthBounds),
3559                       A6XX_RB_Z_BOUNDS_MAX(ds_info->maxDepthBounds));
3560    }
3561 
3562    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_COMPARE_MASK, 2)) {
3563       tu_cs_emit_regs(&cs, A6XX_RB_STENCILMASK(.mask = ds_info->front.compareMask & 0xff,
3564                                                .bfmask = ds_info->back.compareMask & 0xff));
3565    }
3566 
3567    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_WRITE_MASK, 2)) {
3568       update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_FRONT_BIT, ds_info->front.writeMask);
3569       update_stencil_mask(&pipeline->stencil_wrmask, VK_STENCIL_FACE_BACK_BIT, ds_info->back.writeMask);
3570       tu_cs_emit_regs(&cs, A6XX_RB_STENCILWRMASK(.dword = pipeline->stencil_wrmask));
3571    }
3572 
3573    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_STENCIL_REFERENCE, 2)) {
3574       tu_cs_emit_regs(&cs, A6XX_RB_STENCILREF(.ref = ds_info->front.reference & 0xff,
3575                                               .bfref = ds_info->back.reference & 0xff));
3576    }
3577 
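   /* A fragment shader that can kill fragments (discard, or effectively via
    * alpha-to-coverage) makes LRZ depth writes unreliable, so disable LRZ
    * writes; if early-z is impossible or the FS writes gl_FragDepth, LRZ has
    * to be disabled entirely.
    */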
3578    if (builder->shaders->variants[MESA_SHADER_FRAGMENT]) {
3579       const struct ir3_shader_variant *fs = builder->shaders->variants[MESA_SHADER_FRAGMENT];
3580       if (fs->has_kill || builder->alpha_to_coverage) {
3581          pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3582       }
3583       if (fs->no_earlyz || fs->writes_pos) {
3584          pipeline->lrz.force_disable_mask = TU_LRZ_FORCE_DISABLE_LRZ;
3585       }
3586    }
3587 }
3588 
3589 static void
3590 tu_pipeline_builder_parse_multisample_and_color_blend(
3591    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3592 {
3593    /* The spec says:
3594     *
3595     *    pMultisampleState is a pointer to an instance of the
3596     *    VkPipelineMultisampleStateCreateInfo, and is ignored if the pipeline
3597     *    has rasterization disabled.
3598     *
3599     * Also,
3600     *
3601     *    pColorBlendState is a pointer to an instance of the
3602     *    VkPipelineColorBlendStateCreateInfo structure, and is ignored if the
3603     *    pipeline has rasterization disabled or if the subpass of the render
3604     *    pass the pipeline is created against does not use any color
3605     *    attachments.
3606     *
3607     * We leave the relevant registers stale when rasterization is disabled.
3608     */
3609    if (builder->rasterizer_discard)
3610       return;
3611 
3612    static const VkPipelineColorBlendStateCreateInfo dummy_blend_info;
3613    const VkPipelineMultisampleStateCreateInfo *msaa_info =
3614       builder->create_info->pMultisampleState;
3615    const VkPipelineColorBlendStateCreateInfo *blend_info =
3616       builder->use_color_attachments ? builder->create_info->pColorBlendState
3617                                      : &dummy_blend_info;
3618 
3619    struct tu_cs cs;
3620    tu6_emit_rb_mrt_controls(pipeline, blend_info,
3621                             builder->color_attachment_formats,
3622                             &pipeline->rop_reads_dst,
3623                             &pipeline->color_bandwidth_per_sample);
3624 
3625    uint32_t blend_enable_mask =
3626       pipeline->rop_reads_dst ? pipeline->color_write_enable : pipeline->blend_enable;
3627    tu6_emit_blend_control(pipeline, blend_enable_mask,
3628                           builder->use_dual_src_blend, msaa_info);
3629 
3630    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_BLEND,
3631                                 blend_info->attachmentCount * 3 + 4)) {
3632       tu6_emit_blend(&cs, pipeline);
3633       assert(cs.cur == cs.end); /* validate draw state size */
3634    }
3635 
3636    /* Disable LRZ writes when blend or logic op that reads the destination is
3637     * enabled, since the resulting pixel value from the blend-draw depends on
3638     * an earlier draw, which LRZ in the draw pass could early-reject if the
3639     * previous blend-enabled draw wrote LRZ.
3640     *
3641     * TODO: We need to disable LRZ writes only for the binning pass.
3642     * Therefore, we need to emit it in a separate draw state. We keep
3643     * it disabled for sysmem path as well for the moment.
3644     */
3645    if (blend_enable_mask)
3646       pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3647 
3648    for (uint32_t i = 0; i < blend_info->attachmentCount; i++) {
3649       VkPipelineColorBlendAttachmentState blendAttachment = blend_info->pAttachments[i];
3650       /* From the PoV of LRZ, having masked color channels is
3651        * the same as having blend enabled, in that the draw will
3652        * care about the fragments from an earlier draw.
3653        */
3654       VkFormat format = builder->color_attachment_formats[i];
3655       unsigned mask = MASK(vk_format_get_nr_components(format));
3656       if (format != VK_FORMAT_UNDEFINED &&
3657           ((blendAttachment.colorWriteMask & mask) != mask ||
3658            !(pipeline->color_write_enable & BIT(i)))) {
3659          pipeline->lrz.force_disable_mask |= TU_LRZ_FORCE_DISABLE_WRITE;
3660       }
3661    }
3662 
3663    if (tu_pipeline_static_state(pipeline, &cs, VK_DYNAMIC_STATE_BLEND_CONSTANTS, 5)) {
3664       tu_cs_emit_pkt4(&cs, REG_A6XX_RB_BLEND_RED_F32, 4);
3665       tu_cs_emit_array(&cs, (const uint32_t *) blend_info->blendConstants, 4);
3666    }
3667 
3668    const struct VkPipelineSampleLocationsStateCreateInfoEXT *sample_locations =
3669       vk_find_struct_const(msaa_info->pNext, PIPELINE_SAMPLE_LOCATIONS_STATE_CREATE_INFO_EXT);
3670    const VkSampleLocationsInfoEXT *samp_loc = NULL;
3671 
3672    if (sample_locations && sample_locations->sampleLocationsEnable)
3673       samp_loc = &sample_locations->sampleLocationsInfo;
3674 
3675    if (tu_pipeline_static_state(pipeline, &cs, TU_DYNAMIC_STATE_SAMPLE_LOCATIONS,
3676                                 samp_loc ? 9 : 6)) {
3677       tu6_emit_sample_locations(&cs, samp_loc);
3678    }
3679 }
3680 
3681 static void
3682 tu_pipeline_builder_parse_rasterization_order(
3683    struct tu_pipeline_builder *builder, struct tu_pipeline *pipeline)
3684 {
3685    if (builder->rasterizer_discard)
3686       return;
3687 
3688    pipeline->subpass_feedback_loop_ds = builder->subpass_feedback_loop_ds;
3689 
3690    const VkPipelineColorBlendStateCreateInfo *blend_info =
3691       builder->create_info->pColorBlendState;
3692 
3693    const VkPipelineDepthStencilStateCreateInfo *ds_info =
3694       builder->create_info->pDepthStencilState;
3695 
3696    if (builder->use_color_attachments) {
3697       pipeline->raster_order_attachment_access =
3698          blend_info->flags &
3699          VK_PIPELINE_COLOR_BLEND_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_ACCESS_BIT_ARM;
3700    }
3701 
3702    if (builder->depth_attachment_format != VK_FORMAT_UNDEFINED) {
3703       pipeline->raster_order_attachment_access |=
3704          ds_info->flags &
3705          (VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_DEPTH_ACCESS_BIT_ARM |
3706           VK_PIPELINE_DEPTH_STENCIL_STATE_CREATE_RASTERIZATION_ORDER_ATTACHMENT_STENCIL_ACCESS_BIT_ARM);
3707    }
3708 
3709    if (unlikely(builder->device->physical_device->instance->debug_flags & TU_DEBUG_RAST_ORDER))
3710       pipeline->raster_order_attachment_access = true;
3711 
3712    /* VK_EXT_blend_operation_advanced would also require ordered access
3713     * when implemented in the future.
3714     */
3715 
3716    uint32_t sysmem_prim_mode = NO_FLUSH;
3717    uint32_t gmem_prim_mode = NO_FLUSH;
3718 
3719    if (pipeline->raster_order_attachment_access) {
3720       /* VK_ARM_rasterization_order_attachment_access:
3721        *
3722        * This extension allows access to framebuffer attachments when used as
3723        * both input and color attachments from one fragment to the next,
3724        * in rasterization order, without explicit synchronization.
3725        */
3726       sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
3727       gmem_prim_mode = FLUSH_PER_OVERLAP;
3728    } else {
3729       /* If there is a feedback loop, then the shader can read the previous value
3730        * of a pixel being written out. It can also write some components and then
3731        * read different components without a barrier in between. This is a
3732        * problem in sysmem mode with UBWC, because the main buffer and flags
3733        * buffer can get out-of-sync if only one is flushed. We fix this by
3734        * setting the SINGLE_PRIM_MODE field to the same value that the blob does
3735        * for advanced_blend in sysmem mode if a feedback loop is detected.
3736        */
3737       if (builder->subpass_feedback_loop_color ||
3738           builder->subpass_feedback_loop_ds) {
3739          sysmem_prim_mode = FLUSH_PER_OVERLAP_AND_OVERWRITE;
3740       }
3741    }
3742 
3743    struct tu_cs cs;
3744 
3745    pipeline->prim_order_state_gmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3746    tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3747                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3748                         A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(gmem_prim_mode));
3749 
3750    pipeline->prim_order_state_sysmem = tu_cs_draw_state(&pipeline->cs, &cs, 2);
3751    tu_cs_emit_write_reg(&cs, REG_A6XX_GRAS_SC_CNTL,
3752                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2) |
3753                         A6XX_GRAS_SC_CNTL_SINGLE_PRIM_MODE(sysmem_prim_mode));
3754 }
3755 
3756 static void
3757 tu_pipeline_finish(struct tu_pipeline *pipeline,
3758                    struct tu_device *dev,
3759                    const VkAllocationCallbacks *alloc)
3760 {
3761    tu_cs_finish(&pipeline->cs);
3762    pthread_mutex_lock(&dev->pipeline_mutex);
3763    tu_suballoc_bo_free(&dev->pipeline_suballoc, &pipeline->bo);
3764    pthread_mutex_unlock(&dev->pipeline_mutex);
3765 
3766    if (pipeline->pvtmem_bo)
3767       tu_bo_finish(dev, pipeline->pvtmem_bo);
3768 
3769    ralloc_free(pipeline->executables_mem_ctx);
3770 }
3771 
3772 static VkResult
3773 tu_pipeline_builder_build(struct tu_pipeline_builder *builder,
3774                           struct tu_pipeline **pipeline)
3775 {
3776    VkResult result;
3777 
3778    *pipeline = vk_object_zalloc(&builder->device->vk, builder->alloc,
3779                                 sizeof(**pipeline), VK_OBJECT_TYPE_PIPELINE);
3780    if (!*pipeline)
3781       return VK_ERROR_OUT_OF_HOST_MEMORY;
3782 
3783    (*pipeline)->executables_mem_ctx = ralloc_context(NULL);
3784    util_dynarray_init(&(*pipeline)->executables, (*pipeline)->executables_mem_ctx);
3785 
3786    /* compile and upload shaders */
3787    result = tu_pipeline_builder_compile_shaders(builder, *pipeline);
3788    if (result != VK_SUCCESS) {
3789       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3790       return result;
3791    }
3792 
3793    result = tu_pipeline_allocate_cs(builder->device, *pipeline,
3794                                     builder->layout, builder, NULL);
3795    if (result != VK_SUCCESS) {
3796       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3797       return result;
3798    }
3799 
3800    for (uint32_t i = 0; i < ARRAY_SIZE(builder->shader_iova); i++)
3801       builder->shader_iova[i] =
3802          tu_upload_variant(*pipeline, builder->shaders->variants[i]);
3803 
3804    builder->binning_vs_iova =
3805       tu_upload_variant(*pipeline, builder->binning_variant);
3806 
3807    /* Set up private memory. Note that because we're sharing the same private
3808     * memory for all stages, all stages must use the same config, or else
3809     * fibers from one stage might overwrite fibers in another.
3810     */
3811 
3812    uint32_t pvtmem_size = 0;
3813    bool per_wave = true;
3814    for (uint32_t i = 0; i < ARRAY_SIZE(builder->shaders->variants); i++) {
3815       if (builder->shaders->variants[i]) {
3816          pvtmem_size = MAX2(pvtmem_size, builder->shaders->variants[i]->pvtmem_size);
3817          if (!builder->shaders->variants[i]->pvtmem_per_wave)
3818             per_wave = false;
3819       }
3820    }
3821 
3822    if (builder->binning_variant) {
3823       pvtmem_size = MAX2(pvtmem_size, builder->binning_variant->pvtmem_size);
3824       if (!builder->binning_variant->pvtmem_per_wave)
3825          per_wave = false;
3826    }
3827 
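   /* Worked example with made-up numbers: if the VS variant needs 256 bytes
    * of private memory and the FS variant needs 1024, and only the FS uses
    * the per-fiber (not per-wave) layout, the shared allocation must be 1024
    * bytes with per_wave = false for every stage.
    */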
3828    result = tu_setup_pvtmem(builder->device, *pipeline, &builder->pvtmem,
3829                             pvtmem_size, per_wave);
3830    if (result != VK_SUCCESS) {
3831       vk_object_free(&builder->device->vk, builder->alloc, *pipeline);
3832       return result;
3833    }
3834 
3835    tu_pipeline_builder_parse_dynamic(builder, *pipeline);
3836    tu_pipeline_builder_parse_shader_stages(builder, *pipeline);
3837    tu_pipeline_builder_parse_vertex_input(builder, *pipeline);
3838    tu_pipeline_builder_parse_input_assembly(builder, *pipeline);
3839    tu_pipeline_builder_parse_tessellation(builder, *pipeline);
3840    tu_pipeline_builder_parse_viewport(builder, *pipeline);
3841    tu_pipeline_builder_parse_rasterization(builder, *pipeline);
3842    tu_pipeline_builder_parse_depth_stencil(builder, *pipeline);
3843    tu_pipeline_builder_parse_multisample_and_color_blend(builder, *pipeline);
3844    tu_pipeline_builder_parse_rasterization_order(builder, *pipeline);
3845    tu6_emit_load_state(*pipeline, builder->layout, false);
3846 
3847    return VK_SUCCESS;
3848 }
3849 
3850 static void
3851 tu_pipeline_builder_finish(struct tu_pipeline_builder *builder)
3852 {
3853    if (builder->shaders)
3854       vk_pipeline_cache_object_unref(&builder->shaders->base);
3855    ralloc_free(builder->mem_ctx);
3856 }
3857 
3858 static void
3859 tu_pipeline_builder_init_graphics(
3860    struct tu_pipeline_builder *builder,
3861    struct tu_device *dev,
3862    struct vk_pipeline_cache *cache,
3863    const VkGraphicsPipelineCreateInfo *create_info,
3864    const VkAllocationCallbacks *alloc)
3865 {
3866    TU_FROM_HANDLE(tu_pipeline_layout, layout, create_info->layout);
3867 
3868    *builder = (struct tu_pipeline_builder) {
3869       .device = dev,
3870       .mem_ctx = ralloc_context(NULL),
3871       .cache = cache,
3872       .create_info = create_info,
3873       .alloc = alloc,
3874       .layout = layout,
3875    };
3876 
3877    bool rasterizer_discard_dynamic = false;
3878    if (create_info->pDynamicState) {
3879       for (uint32_t i = 0; i < create_info->pDynamicState->dynamicStateCount; i++) {
3880          if (create_info->pDynamicState->pDynamicStates[i] ==
3881                VK_DYNAMIC_STATE_RASTERIZER_DISCARD_ENABLE) {
3882             rasterizer_discard_dynamic = true;
3883             break;
3884          }
3885       }
3886    }
3887 
3888    builder->rasterizer_discard =
3889       builder->create_info->pRasterizationState->rasterizerDiscardEnable &&
3890       !rasterizer_discard_dynamic;
3891 
3892    const VkPipelineRenderingCreateInfo *rendering_info =
3893       vk_find_struct_const(create_info->pNext, PIPELINE_RENDERING_CREATE_INFO);
3894 
3895    if (unlikely(dev->instance->debug_flags & TU_DEBUG_DYNAMIC) && !rendering_info)
3896       rendering_info = vk_get_pipeline_rendering_create_info(create_info);
3897 
3898    if (rendering_info) {
3899       builder->subpass_raster_order_attachment_access = false;
3900       builder->subpass_feedback_loop_ds = false;
3901       builder->subpass_feedback_loop_color = false;
3902 
3903       builder->multiview_mask = rendering_info->viewMask;
3904 
3905       /* With dynamic rendering we don't know whether the pipeline will be
3906        * used in a render pass with none of the attachments enabled, so we
3907        * have to emit MSAA state dynamically.
3908        *
3909        * TODO: Move MSAA state to a separate draw state and emit it
3910        * dynamically only when the sample count is different from the
3911        * subpass's sample count.
3912        */
3913       builder->emit_msaa_state = !builder->rasterizer_discard;
3914 
3915       const VkRenderingSelfDependencyInfoMESA *self_dependency =
3916          vk_find_struct_const(rendering_info->pNext, RENDERING_SELF_DEPENDENCY_INFO_MESA);
3917 
3918       if (self_dependency) {
3919          builder->subpass_feedback_loop_ds =
3920             self_dependency->depthSelfDependency ||
3921             self_dependency->stencilSelfDependency;
3922          builder->subpass_feedback_loop_color =
3923             self_dependency->colorSelfDependencies;
3924       }
3925 
3926       if (!builder->rasterizer_discard) {
3927          builder->depth_attachment_format =
3928             rendering_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ?
3929             rendering_info->stencilAttachmentFormat :
3930             rendering_info->depthAttachmentFormat;
3931 
3932          builder->color_attachment_count =
3933             rendering_info->colorAttachmentCount;
3934 
3935          for (unsigned i = 0; i < rendering_info->colorAttachmentCount; i++) {
3936             builder->color_attachment_formats[i] =
3937                rendering_info->pColorAttachmentFormats[i];
3938             if (builder->color_attachment_formats[i] != VK_FORMAT_UNDEFINED) {
3939                builder->use_color_attachments = true;
3940                builder->render_components |= 0xf << (i * 4);
3941             }
3942          }
3943       }
3944    } else {
3945       const struct tu_render_pass *pass =
3946          tu_render_pass_from_handle(create_info->renderPass);
3947       const struct tu_subpass *subpass =
3948          &pass->subpasses[create_info->subpass];
3949 
3950       builder->subpass_raster_order_attachment_access =
3951          subpass->raster_order_attachment_access;
3952       builder->subpass_feedback_loop_color = subpass->feedback_loop_color;
3953       builder->subpass_feedback_loop_ds = subpass->feedback_loop_ds;
3954 
3955       builder->multiview_mask = subpass->multiview_mask;
3956 
3957       /* variableMultisampleRate support */
3958       builder->emit_msaa_state = (subpass->samples == 0) && !builder->rasterizer_discard;
3959 
3960       if (!builder->rasterizer_discard) {
3961          const uint32_t a = subpass->depth_stencil_attachment.attachment;
3962          builder->depth_attachment_format = (a != VK_ATTACHMENT_UNUSED) ?
3963             pass->attachments[a].format : VK_FORMAT_UNDEFINED;
3964 
3965          assert(subpass->color_count == 0 ||
3966                 !create_info->pColorBlendState ||
3967                 subpass->color_count == create_info->pColorBlendState->attachmentCount);
3968          builder->color_attachment_count = subpass->color_count;
3969          for (uint32_t i = 0; i < subpass->color_count; i++) {
3970             const uint32_t a = subpass->color_attachments[i].attachment;
3971             if (a == VK_ATTACHMENT_UNUSED)
3972                continue;
3973 
3974             builder->color_attachment_formats[i] = pass->attachments[a].format;
3975             builder->use_color_attachments = true;
3976             builder->render_components |= 0xf << (i * 4);
3977          }
3978       }
3979    }
3980 
3981 
3982    if (builder->rasterizer_discard) {
3983       builder->samples = VK_SAMPLE_COUNT_1_BIT;
3984    } else {
3985       builder->samples = create_info->pMultisampleState->rasterizationSamples;
3986       builder->alpha_to_coverage = create_info->pMultisampleState->alphaToCoverageEnable;
3987 
3988       if (tu_blend_state_is_dual_src(create_info->pColorBlendState)) {
3989          builder->color_attachment_count++;
3990          builder->use_dual_src_blend = true;
3991          /* dual-source blending has an extra FS output in the second slot */
3992          if (builder->color_attachment_formats[0] != VK_FORMAT_UNDEFINED)
3993             builder->render_components |= 0xf << 4;
3994       }
3995    }
3996 }
3997 
3998 static VkResult
3999 tu_graphics_pipeline_create(VkDevice device,
4000                             VkPipelineCache pipelineCache,
4001                             const VkGraphicsPipelineCreateInfo *pCreateInfo,
4002                             const VkAllocationCallbacks *pAllocator,
4003                             VkPipeline *pPipeline)
4004 {
4005    TU_FROM_HANDLE(tu_device, dev, device);
4006    TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4007 
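   /* If the app didn't supply a pipeline cache, fall back to the device's
    * internal in-memory cache so identical shaders can still be shared.
    */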
4008    cache = cache ? cache : dev->mem_cache;
4009 
4010    struct tu_pipeline_builder builder;
4011    tu_pipeline_builder_init_graphics(&builder, dev, cache,
4012                                      pCreateInfo, pAllocator);
4013 
4014    struct tu_pipeline *pipeline = NULL;
4015    VkResult result = tu_pipeline_builder_build(&builder, &pipeline);
4016    tu_pipeline_builder_finish(&builder);
4017 
4018    if (result == VK_SUCCESS)
4019       *pPipeline = tu_pipeline_to_handle(pipeline);
4020    else
4021       *pPipeline = VK_NULL_HANDLE;
4022 
4023    return result;
4024 }
4025 
4026 VKAPI_ATTR VkResult VKAPI_CALL
4027 tu_CreateGraphicsPipelines(VkDevice device,
4028                            VkPipelineCache pipelineCache,
4029                            uint32_t count,
4030                            const VkGraphicsPipelineCreateInfo *pCreateInfos,
4031                            const VkAllocationCallbacks *pAllocator,
4032                            VkPipeline *pPipelines)
4033 {
4034    VkResult final_result = VK_SUCCESS;
4035    uint32_t i = 0;
4036 
4037    for (; i < count; i++) {
4038       VkResult result = tu_graphics_pipeline_create(device, pipelineCache,
4039                                                     &pCreateInfos[i], pAllocator,
4040                                                     &pPipelines[i]);
4041 
4042       if (result != VK_SUCCESS) {
4043          final_result = result;
4044          pPipelines[i] = VK_NULL_HANDLE;
4045 
4046          if (pCreateInfos[i].flags &
4047              VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
4048             break;
4049       }
4050    }
4051 
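   /* The spec requires every pipeline we did not successfully create,
    * including any skipped by an early return, to be VK_NULL_HANDLE.
    */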
4052    for (; i < count; i++)
4053       pPipelines[i] = VK_NULL_HANDLE;
4054 
4055    return final_result;
4056 }
4057 
4058 static VkResult
4059 tu_compute_pipeline_create(VkDevice device,
4060                            VkPipelineCache pipelineCache,
4061                            const VkComputePipelineCreateInfo *pCreateInfo,
4062                            const VkAllocationCallbacks *pAllocator,
4063                            VkPipeline *pPipeline)
4064 {
4065    TU_FROM_HANDLE(tu_device, dev, device);
4066    TU_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
4067    TU_FROM_HANDLE(tu_pipeline_layout, layout, pCreateInfo->layout);
4068    const VkPipelineShaderStageCreateInfo *stage_info = &pCreateInfo->stage;
4069    VkResult result;
4070 
4071    cache = cache ? cache : dev->mem_cache;
4072 
4073    struct tu_pipeline *pipeline;
4074 
4075    *pPipeline = VK_NULL_HANDLE;
4076 
4077    VkPipelineCreationFeedback pipeline_feedback = {
4078       .flags = VK_PIPELINE_CREATION_FEEDBACK_VALID_BIT,
4079    };
4080 
4081    const VkPipelineCreationFeedbackCreateInfo *creation_feedback =
4082       vk_find_struct_const(pCreateInfo->pNext, PIPELINE_CREATION_FEEDBACK_CREATE_INFO);
4083 
4084    int64_t pipeline_start = os_time_get_nano();
4085 
4086    pipeline = vk_object_zalloc(&dev->vk, pAllocator, sizeof(*pipeline),
4087                                VK_OBJECT_TYPE_PIPELINE);
4088    if (!pipeline)
4089       return VK_ERROR_OUT_OF_HOST_MEMORY;
4090 
4091    pipeline->executables_mem_ctx = ralloc_context(NULL);
4092    util_dynarray_init(&pipeline->executables, pipeline->executables_mem_ctx);
4093 
4094    struct tu_shader_key key = { };
4095    tu_shader_key_init(&key, stage_info, dev);
4096 
4097    void *pipeline_mem_ctx = ralloc_context(NULL);
4098 
4099    unsigned char pipeline_sha1[20];
4100    tu_hash_compute(pipeline_sha1, stage_info, layout, &key, dev->compiler);
4101 
4102    struct tu_compiled_shaders *compiled = NULL;
4103 
4104    const bool executable_info = pCreateInfo->flags &
4105       VK_PIPELINE_CREATE_CAPTURE_INTERNAL_REPRESENTATIONS_BIT_KHR;
4106 
4107    bool application_cache_hit = false;
4108 
4109    if (!executable_info) {
4110       compiled =
4111          tu_pipeline_cache_lookup(cache, pipeline_sha1, sizeof(pipeline_sha1),
4112                                   &application_cache_hit);
4113    }
4114 
4115    if (application_cache_hit && cache != dev->mem_cache) {
4116       pipeline_feedback.flags |=
4117          VK_PIPELINE_CREATION_FEEDBACK_APPLICATION_PIPELINE_CACHE_HIT_BIT;
4118    }
4119 
4120    if (tu6_shared_constants_enable(layout, dev->compiler)) {
4121       pipeline->shared_consts = (struct tu_push_constant_range) {
4122          .lo = 0,
4123          .dwords = layout->push_constant_size / 4,
4124       };
4125    }
4126 
4127    char *nir_initial_disasm = NULL;
4128 
4129    if (!compiled) {
4130       if (pCreateInfo->flags &
4131           VK_PIPELINE_CREATE_FAIL_ON_PIPELINE_COMPILE_REQUIRED_BIT) {
4132          result = VK_PIPELINE_COMPILE_REQUIRED;
4133          goto fail;
4134       }
4135 
4136       struct ir3_shader_key ir3_key = {};
4137 
4138       nir_shader *nir = tu_spirv_to_nir(dev, pipeline_mem_ctx, stage_info,
4139                                         MESA_SHADER_COMPUTE);
4140 
4141       nir_initial_disasm = executable_info ?
4142          nir_shader_as_str(nir, pipeline->executables_mem_ctx) : NULL;
4143 
4144       struct tu_shader *shader =
4145          tu_shader_create(dev, nir, &key, layout, pAllocator);
4146       if (!shader) {
4147          result = VK_ERROR_OUT_OF_HOST_MEMORY;
4148          goto fail;
4149       }
4150 
4151       compiled = tu_shaders_init(dev, &pipeline_sha1, sizeof(pipeline_sha1));
4152       if (!compiled) {
4153          tu_shader_destroy(dev, shader, pAllocator);
4154          result = VK_ERROR_OUT_OF_HOST_MEMORY;
4155          goto fail;
4156       }
4157 
4158       compiled->active_desc_sets = shader->active_desc_sets;
4159       compiled->push_consts[MESA_SHADER_COMPUTE] = shader->push_consts;
4160 
4161       struct ir3_shader_variant *v =
4162          ir3_shader_create_variant(shader->ir3_shader, &ir3_key, executable_info);
4163 
4164       tu_shader_destroy(dev, shader, pAllocator);
4165 
4166       if (!v) {
4167          result = VK_ERROR_OUT_OF_HOST_MEMORY;
4168          goto fail;
4169       }
4170 
4171       compiled->variants[MESA_SHADER_COMPUTE] = v;
4172 
4173       compiled = tu_pipeline_cache_insert(cache, compiled);
4174    }
4175 
4176    pipeline_feedback.duration = os_time_get_nano() - pipeline_start;
4177 
4178    if (creation_feedback) {
4179       *creation_feedback->pPipelineCreationFeedback = pipeline_feedback;
4180       assert(creation_feedback->pipelineStageCreationFeedbackCount == 1);
4181       creation_feedback->pPipelineStageCreationFeedbacks[0] = pipeline_feedback;
4182    }
4183 
4184    pipeline->active_desc_sets = compiled->active_desc_sets;
4185 
4186    struct ir3_shader_variant *v = compiled->variants[MESA_SHADER_COMPUTE];
4187 
4188    tu_pipeline_set_linkage(&pipeline->program.link[MESA_SHADER_COMPUTE],
4189                            &compiled->push_consts[MESA_SHADER_COMPUTE], v);
4190 
4191    result = tu_pipeline_allocate_cs(dev, pipeline, layout, NULL, v);
4192    if (result != VK_SUCCESS)
4193       goto fail;
4194 
4195    uint64_t shader_iova = tu_upload_variant(pipeline, v);
4196 
4197    struct tu_pvtmem_config pvtmem;
4198    tu_setup_pvtmem(dev, pipeline, &pvtmem, v->pvtmem_size, v->pvtmem_per_wave);
4199 
4200    for (int i = 0; i < 3; i++)
4201       pipeline->compute.local_size[i] = v->local_size[i];
4202 
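   /* ir3 waves are 64 fibers wide on a6xx by default; variants compiled
    * with double_threadsize run 128-wide waves instead.
    */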
4203    pipeline->compute.subgroup_size = v->info.double_threadsize ? 128 : 64;
4204 
4205    struct tu_cs prog_cs;
4206    uint32_t additional_reserve_size = tu_xs_get_additional_cs_size_dwords(v);
4207    tu_cs_begin_sub_stream(&pipeline->cs, 64 + additional_reserve_size, &prog_cs);
4208    tu6_emit_cs_config(&prog_cs, v, &pvtmem, shader_iova);
4209    pipeline->program.state = tu_cs_end_draw_state(&pipeline->cs, &prog_cs);
4210 
4211    tu6_emit_load_state(pipeline, layout, true);
4212 
4213    tu_append_executable(pipeline, v, nir_initial_disasm);
4214 
4215    vk_pipeline_cache_object_unref(&compiled->base);
4216    ralloc_free(pipeline_mem_ctx);
4217 
4218    *pPipeline = tu_pipeline_to_handle(pipeline);
4219 
4220    return VK_SUCCESS;
4221 
4222 fail:
4223    if (compiled)
4224       vk_pipeline_cache_object_unref(&compiled->base);
4225 
4226    ralloc_free(pipeline_mem_ctx);
4227 
4228    vk_object_free(&dev->vk, pAllocator, pipeline);
4229 
4230    return result;
4231 }
4232 
4233 VKAPI_ATTR VkResult VKAPI_CALL
4234 tu_CreateComputePipelines(VkDevice device,
4235                           VkPipelineCache pipelineCache,
4236                           uint32_t count,
4237                           const VkComputePipelineCreateInfo *pCreateInfos,
4238                           const VkAllocationCallbacks *pAllocator,
4239                           VkPipeline *pPipelines)
4240 {
4241    VkResult final_result = VK_SUCCESS;
4242    uint32_t i = 0;
4243 
4244    for (; i < count; i++) {
4245       VkResult result = tu_compute_pipeline_create(device, pipelineCache,
4246                                                    &pCreateInfos[i],
4247                                                    pAllocator, &pPipelines[i]);
4248       if (result != VK_SUCCESS) {
4249          final_result = result;
4250          pPipelines[i] = VK_NULL_HANDLE;
4251 
4252          if (pCreateInfos[i].flags &
4253              VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT)
4254             break;
4255       }
4256    }
4257 
4258    for (; i < count; i++)
4259       pPipelines[i] = VK_NULL_HANDLE;
4260 
4261    return final_result;
4262 }
4263 
4264 VKAPI_ATTR void VKAPI_CALL
4265 tu_DestroyPipeline(VkDevice _device,
4266                    VkPipeline _pipeline,
4267                    const VkAllocationCallbacks *pAllocator)
4268 {
4269    TU_FROM_HANDLE(tu_device, dev, _device);
4270    TU_FROM_HANDLE(tu_pipeline, pipeline, _pipeline);
4271 
4272    if (!_pipeline)
4273       return;
4274 
4275    tu_pipeline_finish(pipeline, dev, pAllocator);
4276    vk_object_free(&dev->vk, pAllocator, pipeline);
4277 }
4278 
4279 #define WRITE_STR(field, ...) ({                                \
4280    memset(field, 0, sizeof(field));                             \
4281    UNUSED int _i = snprintf(field, sizeof(field), __VA_ARGS__); \
4282    assert(_i > 0 && _i < sizeof(field));                        \
4283 })
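/* WRITE_STR is a GNU statement expression, so it can be used wherever an
 * expression is expected; the assert checks that the formatted string was
 * neither empty nor truncated, e.g. WRITE_STR(props->name, "%s", "FS").
 */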
4284 
4285 static const struct tu_pipeline_executable *
4286 tu_pipeline_get_executable(struct tu_pipeline *pipeline, uint32_t index)
4287 {
4288    assert(index < util_dynarray_num_elements(&pipeline->executables,
4289                                              struct tu_pipeline_executable));
4290    return util_dynarray_element(
4291       &pipeline->executables, struct tu_pipeline_executable, index);
4292 }
4293 
4294 VKAPI_ATTR VkResult VKAPI_CALL
4295 tu_GetPipelineExecutablePropertiesKHR(
4296       VkDevice _device,
4297       const VkPipelineInfoKHR* pPipelineInfo,
4298       uint32_t* pExecutableCount,
4299       VkPipelineExecutablePropertiesKHR* pProperties)
4300 {
4301    TU_FROM_HANDLE(tu_device, dev, _device);
4302    TU_FROM_HANDLE(tu_pipeline, pipeline, pPipelineInfo->pipeline);
4303    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutablePropertiesKHR, out,
4304                           pProperties, pExecutableCount);
4305 
4306    util_dynarray_foreach (&pipeline->executables, struct tu_pipeline_executable, exe) {
4307       vk_outarray_append_typed(VkPipelineExecutablePropertiesKHR, &out, props) {
4308          gl_shader_stage stage = exe->stage;
4309          props->stages = mesa_to_vk_shader_stage(stage);
4310 
4311          if (!exe->is_binning)
4312             WRITE_STR(props->name, "%s", _mesa_shader_stage_to_abbrev(stage));
4313          else
4314             WRITE_STR(props->name, "Binning VS");
4315 
4316          WRITE_STR(props->description, "%s", _mesa_shader_stage_to_string(stage));
4317 
4318          props->subgroupSize =
4319             dev->compiler->threadsize_base * (exe->stats.double_threadsize ? 2 : 1);
4320       }
4321    }
4322 
4323    return vk_outarray_status(&out);
4324 }
4325 
4326 VKAPI_ATTR VkResult VKAPI_CALL
4327 tu_GetPipelineExecutableStatisticsKHR(
4328       VkDevice _device,
4329       const VkPipelineExecutableInfoKHR* pExecutableInfo,
4330       uint32_t* pStatisticCount,
4331       VkPipelineExecutableStatisticKHR* pStatistics)
4332 {
4333    TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4334    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableStatisticKHR, out,
4335                           pStatistics, pStatisticCount);
4336 
4337    const struct tu_pipeline_executable *exe =
4338       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4339 
4340    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4341       WRITE_STR(stat->name, "Max Waves Per Core");
4342       WRITE_STR(stat->description,
4343                 "Maximum number of simultaneous waves per core.");
4344       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4345       stat->value.u64 = exe->stats.max_waves;
4346    }
4347 
4348    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4349       WRITE_STR(stat->name, "Instruction Count");
4350       WRITE_STR(stat->description,
4351                 "Total number of IR3 instructions in the final generated "
4352                 "shader executable.");
4353       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4354       stat->value.u64 = exe->stats.instrs_count;
4355    }
4356 
4357    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4358       WRITE_STR(stat->name, "Code size");
4359       WRITE_STR(stat->description,
4360                 "Total number of dwords in the final generated "
4361                 "shader executable.");
4362       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4363       stat->value.u64 = exe->stats.sizedwords;
4364    }
4365 
4366    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4367       WRITE_STR(stat->name, "NOPs Count");
4368       WRITE_STR(stat->description,
4369                 "Number of NOP instructions in the final generated "
4370                 "shader executable.");
4371       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4372       stat->value.u64 = exe->stats.nops_count;
4373    }
4374 
4375    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4376       WRITE_STR(stat->name, "MOV Count");
4377       WRITE_STR(stat->description,
4378                 "Number of MOV instructions in the final generated "
4379                 "shader executable.");
4380       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4381       stat->value.u64 = exe->stats.mov_count;
4382    }
4383 
4384    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4385       WRITE_STR(stat->name, "COV Count");
4386       WRITE_STR(stat->description,
4387                 "Number of COV instructions in the final generated "
4388                 "shader executable.");
4389       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4390       stat->value.u64 = exe->stats.cov_count;
4391    }
4392 
4393    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4394       WRITE_STR(stat->name, "Registers used");
4395       WRITE_STR(stat->description,
4396                 "Number of registers used in the final generated "
4397                 "shader executable.");
4398       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4399       stat->value.u64 = exe->stats.max_reg + 1;
4400    }
4401 
4402    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4403       WRITE_STR(stat->name, "Half-registers used");
4404       WRITE_STR(stat->description,
4405                 "Number of half-registers used in the final generated "
4406                 "shader executable.");
4407       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4408       stat->value.u64 = exe->stats.max_half_reg + 1;
4409    }
4410 
4411    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4412       WRITE_STR(stat->name, "Instructions with SS sync bit");
4413       WRITE_STR(stat->description,
4414                 "The SS bit is set on instructions that depend on the result "
4415                 "of \"long\" instructions, to prevent a RAW hazard.");
4416       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4417       stat->value.u64 = exe->stats.ss;
4418    }
4419 
4420    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4421       WRITE_STR(stat->name, "Instructions with SY sync bit");
4422       WRITE_STR(stat->description,
4423                 "The SY bit is set on instructions that depend on the result "
4424                 "of loads from global memory, to prevent a RAW hazard.");
4425       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4426       stat->value.u64 = exe->stats.sy;
4427    }
4428 
4429    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4430       WRITE_STR(stat->name, "Estimated cycles stalled on SS");
4431       WRITE_STR(stat->description,
4432                 "A better metric to estimate the impact of SS syncs.");
4433       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4434       stat->value.u64 = exe->stats.sstall;
4435    }
4436 
4437    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4438       WRITE_STR(stat->name, "Estimated cycles stalled on SY");
4439       WRITE_STR(stat->description,
4440                 "A better metric to estimate the impact of SY syncs.");
4441       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4442       stat->value.u64 = exe->stats.systall;
4443    }
4444 
4445    for (int i = 0; i < ARRAY_SIZE(exe->stats.instrs_per_cat); i++) {
4446       vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4447          WRITE_STR(stat->name, "cat%d instructions", i);
4448          WRITE_STR(stat->description,
4449                   "Number of cat%d instructions.", i);
4450          stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4451          stat->value.u64 = exe->stats.instrs_per_cat[i];
4452       }
4453    }
4454 
4455    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4456       WRITE_STR(stat->name, "STP Count");
4457       WRITE_STR(stat->description,
4458                 "Number of STore Private instructions in the final generated "
4459                 "shader executable.");
4460       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4461       stat->value.u64 = exe->stats.stp_count;
4462    }
4463 
4464    vk_outarray_append_typed(VkPipelineExecutableStatisticKHR, &out, stat) {
4465       WRITE_STR(stat->name, "LDP Count");
4466       WRITE_STR(stat->description,
4467                 "Number of LoaD Private instructions in the final generated "
4468                 "shader executable.");
4469       stat->format = VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR;
4470       stat->value.u64 = exe->stats.ldp_count;
4471    }
4472 
4473    return vk_outarray_status(&out);
4474 }
4475 
4476 static bool
write_ir_text(VkPipelineExecutableInternalRepresentationKHR * ir,const char * data)4477 write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir,
4478               const char *data)
4479 {
4480    ir->isText = VK_TRUE;
4481 
4482    size_t data_len = strlen(data) + 1;
4483 
4484    if (ir->pData == NULL) {
4485       ir->dataSize = data_len;
4486       return true;
4487    }
4488 
4489    strncpy(ir->pData, data, ir->dataSize);
4490    if (ir->dataSize < data_len)
4491       return false;
4492 
4493    ir->dataSize = data_len;
4494    return true;
4495 }
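/* Callers follow the standard Vulkan two-call idiom; an illustrative sketch
 * of the contract above:
 *
 *    VkPipelineExecutableInternalRepresentationKHR ir = { 0 };
 *    ir.pData = NULL;
 *    write_ir_text(&ir, text);          // first call fills in ir.dataSize
 *    ir.pData = malloc(ir.dataSize);
 *    write_ir_text(&ir, text);          // second call copies the text
 */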
4496 
4497 VKAPI_ATTR VkResult VKAPI_CALL
4498 tu_GetPipelineExecutableInternalRepresentationsKHR(
4499     VkDevice _device,
4500     const VkPipelineExecutableInfoKHR* pExecutableInfo,
4501     uint32_t* pInternalRepresentationCount,
4502     VkPipelineExecutableInternalRepresentationKHR* pInternalRepresentations)
4503 {
4504    TU_FROM_HANDLE(tu_pipeline, pipeline, pExecutableInfo->pipeline);
4505    VK_OUTARRAY_MAKE_TYPED(VkPipelineExecutableInternalRepresentationKHR, out,
4506                           pInternalRepresentations, pInternalRepresentationCount);
4507    bool incomplete_text = false;
4508 
4509    const struct tu_pipeline_executable *exe =
4510       tu_pipeline_get_executable(pipeline, pExecutableInfo->executableIndex);
4511 
4512    if (exe->nir_from_spirv) {
4513       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4514          WRITE_STR(ir->name, "NIR from SPIRV");
4515          WRITE_STR(ir->description,
4516                    "Initial NIR before any optimizations");
4517 
4518          if (!write_ir_text(ir, exe->nir_from_spirv))
4519             incomplete_text = true;
4520       }
4521    }
4522 
4523    if (exe->nir_final) {
4524       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4525          WRITE_STR(ir->name, "Final NIR");
4526          WRITE_STR(ir->description,
4527                    "Final NIR before going into the back-end compiler");
4528 
4529          if (!write_ir_text(ir, exe->nir_final))
4530             incomplete_text = true;
4531       }
4532    }
4533 
4534    if (exe->disasm) {
4535       vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) {
4536          WRITE_STR(ir->name, "IR3 Assembly");
4537          WRITE_STR(ir->description,
4538                    "Final IR3 assembly for the generated shader binary");
4539 
4540          if (!write_ir_text(ir, exe->disasm))
4541             incomplete_text = true;
4542       }
4543    }
4544 
4545    return incomplete_text ? VK_INCOMPLETE : vk_outarray_status(&out);
4546 }
4547