1 /*
2  * Copyright © 2024 Collabora Ltd.
3  * Copyright © 2024 Arm Ltd.
4  *
5  * Derived from tu_cmd_buffer.c which is:
6  * Copyright © 2016 Red Hat.
7  * Copyright © 2016 Bas Nieuwenhuizen
8  * Copyright © 2015 Intel Corporation
9  *
10  * SPDX-License-Identifier: MIT
11  */
12 
13 #include <stdint.h>
14 #include "genxml/gen_macros.h"
15 
16 #include "panvk_buffer.h"
17 #include "panvk_cmd_alloc.h"
18 #include "panvk_cmd_buffer.h"
19 #include "panvk_cmd_desc_state.h"
20 #include "panvk_cmd_draw.h"
21 #include "panvk_cmd_fb_preload.h"
22 #include "panvk_cmd_meta.h"
23 #include "panvk_device.h"
24 #include "panvk_entrypoints.h"
25 #include "panvk_image.h"
26 #include "panvk_image_view.h"
27 #include "panvk_instance.h"
28 #include "panvk_priv_bo.h"
29 #include "panvk_shader.h"
30 
31 #include "pan_desc.h"
32 #include "pan_earlyzs.h"
33 #include "pan_encoder.h"
34 #include "pan_format.h"
35 #include "pan_jc.h"
36 #include "pan_props.h"
37 #include "pan_samples.h"
38 #include "pan_shader.h"
39 
40 #include "util/bitscan.h"
41 #include "vk_format.h"
42 #include "vk_meta.h"
43 #include "vk_pipeline_layout.h"
44 #include "vk_render_pass.h"
45 
46 static void
47 emit_vs_attrib(const struct vk_vertex_attribute_state *attrib_info,
48                const struct vk_vertex_binding_state *buf_info,
49                const struct panvk_attrib_buf *buf, uint32_t vb_desc_offset,
50                struct mali_attribute_packed *desc)
51 {
52    bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE;
53    enum pipe_format f = vk_format_to_pipe_format(attrib_info->format);
54    unsigned buf_idx = vb_desc_offset + attrib_info->binding;
55 
56    pan_pack(desc, ATTRIBUTE, cfg) {
57       cfg.offset = attrib_info->offset;
58       cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
59       cfg.table = 0;
60       cfg.buffer_index = buf_idx;
61       cfg.stride = buf_info->stride;
62       if (!per_instance) {
63          /* Per-vertex */
64          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
65          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
66          cfg.offset_enable = true;
67       } else if (buf_info->divisor == 1) {
68          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
69          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
70       } else if (buf_info->divisor == 0) {
71          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
72          /* HW doesn't support a zero divisor, but we can achieve the same by
73           * not using a divisor and setting the stride to zero */
74          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
75          cfg.stride = 0;
76       } else if (util_is_power_of_two_or_zero(buf_info->divisor)) {
77          /* Per-instance, POT divisor */
78          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
79          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
80          cfg.divisor_r = __builtin_ctz(buf_info->divisor);
81       } else {
82          /* Per-instance, NPOT divisor */
83          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
84          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
85          cfg.divisor_d = panfrost_compute_magic_divisor(
86             buf_info->divisor, &cfg.divisor_r, &cfg.divisor_e);
87       }
88    }
89 }
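
/*
 * Illustrative sketch (not part of the driver): which element of an
 * instanced vertex buffer the descriptor emitted above makes the hardware
 * fetch for a given instance index. The hardware uses a shift for POT
 * divisors and a magic-multiplier reciprocal (divisor_d/r/e) for NPOT
 * divisors; plain division is used here only to show the intended result.
 */
static inline uint32_t
example_instanced_element(uint32_t instance_id, uint32_t divisor)
{
   if (divisor == 0) {
      /* Zero divisor: every instance reads element 0 (stride forced to 0). */
      return 0;
   } else if (util_is_power_of_two_or_zero(divisor)) {
      /* POT divisor: encoded as divisor_r = log2(divisor). */
      return instance_id >> __builtin_ctz(divisor);
   } else {
      /* NPOT divisor: the hardware evaluates this with the magic divisor. */
      return instance_id / divisor;
   }
}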
90 
91 static bool
92 vs_driver_set_is_dirty(struct panvk_cmd_buffer *cmdbuf)
93 {
94    return dyn_gfx_state_dirty(cmdbuf, VI) ||
95           dyn_gfx_state_dirty(cmdbuf, VI_BINDINGS_VALID) ||
96           dyn_gfx_state_dirty(cmdbuf, VI_BINDING_STRIDES) ||
97           gfx_state_dirty(cmdbuf, VB) || gfx_state_dirty(cmdbuf, VS) ||
98           gfx_state_dirty(cmdbuf, DESC_STATE);
99 }
100 
101 static VkResult
102 prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
103 {
104    if (!vs_driver_set_is_dirty(cmdbuf))
105       return VK_SUCCESS;
106 
107    struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
108    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
109    const struct vk_vertex_input_state *vi =
110       cmdbuf->vk.dynamic_graphics_state.vi;
111    uint32_t vb_count = 0;
112 
113    u_foreach_bit(i, vi->attributes_valid)
114       vb_count = MAX2(vi->attributes[i].binding + 1, vb_count);
115 
116    uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1;
117    uint32_t desc_count = vb_offset + vb_count;
118    const struct panvk_descriptor_state *desc_state =
119       &cmdbuf->state.gfx.desc_state;
120    struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
121       cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
122    struct panvk_opaque_desc *descs = driver_set.cpu;
123 
124    if (!driver_set.gpu)
125       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
126 
127    for (uint32_t i = 0; i < MAX_VS_ATTRIBS; i++) {
128       if (vi->attributes_valid & BITFIELD_BIT(i)) {
129          unsigned binding = vi->attributes[i].binding;
130 
131          emit_vs_attrib(&vi->attributes[i], &vi->bindings[binding],
132                         &cmdbuf->state.gfx.vb.bufs[binding], vb_offset,
133                         (struct mali_attribute_packed *)(&descs[i]));
134       } else {
135          memset(&descs[i], 0, sizeof(descs[0]));
136       }
137    }
138 
139    /* Dummy sampler always comes right after the vertex attribs. */
140    pan_cast_and_pack(&descs[MAX_VS_ATTRIBS], SAMPLER, cfg) {
141       cfg.clamp_integer_array_indices = false;
142    }
143 
144    panvk_per_arch(cmd_fill_dyn_bufs)(
145       desc_state, vs,
146       (struct mali_buffer_packed *)(&descs[MAX_VS_ATTRIBS + 1]));
147 
148    for (uint32_t i = 0; i < vb_count; i++) {
149       const struct panvk_attrib_buf *vb = &cmdbuf->state.gfx.vb.bufs[i];
150 
151       pan_cast_and_pack(&descs[vb_offset + i], BUFFER, cfg) {
152          if (vi->bindings_valid & BITFIELD_BIT(i)) {
153             cfg.address = vb->address;
154             cfg.size = vb->size;
155          } else {
156             cfg.address = 0;
157             cfg.size = 0;
158          }
159       }
160    }
161 
162    vs_desc_state->driver_set.dev_addr = driver_set.gpu;
163    vs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
164    gfx_state_set_dirty(cmdbuf, DESC_STATE);
165    return VK_SUCCESS;
166 }
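
/*
 * Illustrative summary of the internal (driver) descriptor set laid out
 * above, as index ranges into 'descs':
 *
 *   [0, MAX_VS_ATTRIBS)                 vertex attribute descriptors
 *   [MAX_VS_ATTRIBS]                    dummy sampler
 *   [MAX_VS_ATTRIBS + 1, vb_offset)     dynamic buffer descriptors
 *   [vb_offset, vb_offset + vb_count)   vertex buffer descriptors
 *
 * with vb_offset = MAX_VS_ATTRIBS + 1 + dyn_bufs.count and each entry being
 * PANVK_DESCRIPTOR_SIZE bytes. The helper below is a hypothetical example
 * (not used by the driver) computing the byte offset of vertex buffer
 * descriptor 'vb_idx' inside that set.
 */
static inline uint32_t
example_vs_vb_desc_offset(const struct panvk_shader *vs, uint32_t vb_idx)
{
   uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1;

   return (vb_offset + vb_idx) * PANVK_DESCRIPTOR_SIZE;
}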
167 
168 static uint32_t
169 get_varying_slots(const struct panvk_cmd_buffer *cmdbuf)
170 {
171    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
172    const struct panvk_shader *fs = get_fs(cmdbuf);
173    uint32_t varying_slots = 0;
174 
175    if (fs) {
176       unsigned vs_vars = vs->info.varyings.output_count;
177       unsigned fs_vars = fs->info.varyings.input_count;
178       varying_slots = MAX2(vs_vars, fs_vars);
179    }
180 
181    return varying_slots;
182 }
183 
184 static void
185 emit_varying_descs(const struct panvk_cmd_buffer *cmdbuf,
186                    struct mali_attribute_packed *descs)
187 {
188    uint32_t varying_slots = get_varying_slots(cmdbuf);
189    /* Assumes 16 byte slots. We could do better. */
190    uint32_t varying_size = varying_slots * 16;
191 
192    const struct panvk_shader *fs = get_fs(cmdbuf);
193 
194    for (uint32_t i = 0; i < varying_slots; i++) {
195       const struct pan_shader_varying *var = &fs->info.varyings.input[i];
196       /* Skip special varyings. */
197       if (var->location < VARYING_SLOT_VAR0)
198          continue;
199 
200       /* We currently always write out F32 in the vertex shaders, so the format
201        * needs to reflect this. */
202       enum pipe_format f = var->format;
203       switch (f) {
204       case PIPE_FORMAT_R16_FLOAT:
205          f = PIPE_FORMAT_R32_FLOAT;
206          break;
207       case PIPE_FORMAT_R16G16_FLOAT:
208          f = PIPE_FORMAT_R32G32_FLOAT;
209          break;
210       case PIPE_FORMAT_R16G16B16_FLOAT:
211          f = PIPE_FORMAT_R32G32B32_FLOAT;
212          break;
213       case PIPE_FORMAT_R16G16B16A16_FLOAT:
214          f = PIPE_FORMAT_R32G32B32A32_FLOAT;
215          break;
216       default:
217          break;
218       }
219 
220       uint32_t loc = var->location - VARYING_SLOT_VAR0;
221       pan_pack(&descs[i], ATTRIBUTE, cfg) {
222          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
223          cfg.offset_enable = false;
224          cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
225          cfg.table = 61;
226          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
227          cfg.offset = 1024 + (loc * 16);
228          cfg.buffer_index = 0;
229          cfg.attribute_stride = varying_size;
230          cfg.packet_stride = varying_size + 16;
231       }
232    }
233 }
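
/*
 * Illustrative helper (not driver code): byte offset programmed for a user
 * varying inside the vertex packet, matching the fixed 16-byte slot size and
 * the 1024-byte base used by emit_varying_descs() above.
 */
static inline uint32_t
example_varying_packet_offset(uint32_t location)
{
   assert(location >= VARYING_SLOT_VAR0);
   return 1024 + (location - VARYING_SLOT_VAR0) * 16;
}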
234 
235 static VkResult
236 prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
237 {
238    struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
239    const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
240    const struct panvk_descriptor_state *desc_state =
241       &cmdbuf->state.gfx.desc_state;
242    /* If the shader is using LD_VAR_BUF[_IMM], we do not have to set up
243     * Attribute Descriptors for varying loads. */
244    uint32_t num_varying_attr_descs =
245       panvk_use_ld_var_buf(fs) ? 0 : fs->desc_info.max_varying_loads;
246    uint32_t desc_count =
247       fs->desc_info.dyn_bufs.count + num_varying_attr_descs + 1;
248    struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
249       cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
250    struct panvk_opaque_desc *descs = driver_set.cpu;
251 
252    if (desc_count && !driver_set.gpu)
253       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
254 
255    if (num_varying_attr_descs > 0)
256       emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0]));
257 
258    /* Dummy sampler always comes right after the varyings. */
259    pan_cast_and_pack(&descs[num_varying_attr_descs], SAMPLER, cfg) {
260       cfg.clamp_integer_array_indices = false;
261    }
262 
263    panvk_per_arch(cmd_fill_dyn_bufs)(
264       desc_state, fs,
265       (struct mali_buffer_packed *)(&descs[num_varying_attr_descs + 1]));
266 
267    fs_desc_state->driver_set.dev_addr = driver_set.gpu;
268    fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
269    gfx_state_set_dirty(cmdbuf, DESC_STATE);
270    return VK_SUCCESS;
271 }
272 
273 static bool
274 has_depth_att(struct panvk_cmd_buffer *cmdbuf)
275 {
276    return (cmdbuf->state.gfx.render.bound_attachments &
277            MESA_VK_RP_ATTACHMENT_DEPTH_BIT) != 0;
278 }
279 
280 static bool
281 has_stencil_att(struct panvk_cmd_buffer *cmdbuf)
282 {
283    return (cmdbuf->state.gfx.render.bound_attachments &
284            MESA_VK_RP_ATTACHMENT_STENCIL_BIT) != 0;
285 }
286 
287 static bool
288 writes_depth(struct panvk_cmd_buffer *cmdbuf)
289 {
290    const struct vk_depth_stencil_state *ds =
291       &cmdbuf->vk.dynamic_graphics_state.ds;
292 
293    return has_depth_att(cmdbuf) && ds->depth.test_enable &&
294           ds->depth.write_enable && ds->depth.compare_op != VK_COMPARE_OP_NEVER;
295 }
296 
297 static bool
298 writes_stencil(struct panvk_cmd_buffer *cmdbuf)
299 {
300    const struct vk_depth_stencil_state *ds =
301       &cmdbuf->vk.dynamic_graphics_state.ds;
302 
303    return has_stencil_att(cmdbuf) && ds->stencil.test_enable &&
304           ((ds->stencil.front.write_mask &&
305             (ds->stencil.front.op.fail != VK_STENCIL_OP_KEEP ||
306              ds->stencil.front.op.pass != VK_STENCIL_OP_KEEP ||
307              ds->stencil.front.op.depth_fail != VK_STENCIL_OP_KEEP)) ||
308            (ds->stencil.back.write_mask &&
309             (ds->stencil.back.op.fail != VK_STENCIL_OP_KEEP ||
310              ds->stencil.back.op.pass != VK_STENCIL_OP_KEEP ||
311              ds->stencil.back.op.depth_fail != VK_STENCIL_OP_KEEP)));
312 }
313 
314 static bool
315 ds_test_always_passes(struct panvk_cmd_buffer *cmdbuf)
316 {
317    const struct vk_depth_stencil_state *ds =
318       &cmdbuf->vk.dynamic_graphics_state.ds;
319 
320    if (!has_depth_att(cmdbuf))
321       return true;
322 
323    if (ds->depth.test_enable && ds->depth.compare_op != VK_COMPARE_OP_ALWAYS)
324       return false;
325 
326    if (ds->stencil.test_enable &&
327        (ds->stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
328         ds->stencil.back.op.compare != VK_COMPARE_OP_ALWAYS))
329       return false;
330 
331    return true;
332 }
333 
334 static inline enum mali_func
335 translate_compare_func(VkCompareOp comp)
336 {
337    STATIC_ASSERT(VK_COMPARE_OP_NEVER == (VkCompareOp)MALI_FUNC_NEVER);
338    STATIC_ASSERT(VK_COMPARE_OP_LESS == (VkCompareOp)MALI_FUNC_LESS);
339    STATIC_ASSERT(VK_COMPARE_OP_EQUAL == (VkCompareOp)MALI_FUNC_EQUAL);
340    STATIC_ASSERT(VK_COMPARE_OP_LESS_OR_EQUAL == (VkCompareOp)MALI_FUNC_LEQUAL);
341    STATIC_ASSERT(VK_COMPARE_OP_GREATER == (VkCompareOp)MALI_FUNC_GREATER);
342    STATIC_ASSERT(VK_COMPARE_OP_NOT_EQUAL == (VkCompareOp)MALI_FUNC_NOT_EQUAL);
343    STATIC_ASSERT(VK_COMPARE_OP_GREATER_OR_EQUAL ==
344                  (VkCompareOp)MALI_FUNC_GEQUAL);
345    STATIC_ASSERT(VK_COMPARE_OP_ALWAYS == (VkCompareOp)MALI_FUNC_ALWAYS);
346 
347    return (enum mali_func)comp;
348 }
349 
350 static enum mali_stencil_op
351 translate_stencil_op(VkStencilOp in)
352 {
353    switch (in) {
354    case VK_STENCIL_OP_KEEP:
355       return MALI_STENCIL_OP_KEEP;
356    case VK_STENCIL_OP_ZERO:
357       return MALI_STENCIL_OP_ZERO;
358    case VK_STENCIL_OP_REPLACE:
359       return MALI_STENCIL_OP_REPLACE;
360    case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
361       return MALI_STENCIL_OP_INCR_SAT;
362    case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
363       return MALI_STENCIL_OP_DECR_SAT;
364    case VK_STENCIL_OP_INCREMENT_AND_WRAP:
365       return MALI_STENCIL_OP_INCR_WRAP;
366    case VK_STENCIL_OP_DECREMENT_AND_WRAP:
367       return MALI_STENCIL_OP_DECR_WRAP;
368    case VK_STENCIL_OP_INVERT:
369       return MALI_STENCIL_OP_INVERT;
370    default:
371       unreachable("Invalid stencil op");
372    }
373 }
374 
375 static enum mali_draw_mode
376 translate_prim_topology(VkPrimitiveTopology in)
377 {
378    /* Test VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA separately, as it's not
379     * part of the VkPrimitiveTopology enum.
380     */
381    if (in == VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA)
382       return MALI_DRAW_MODE_TRIANGLES;
383 
384    switch (in) {
385    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
386       return MALI_DRAW_MODE_POINTS;
387    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
388       return MALI_DRAW_MODE_LINES;
389    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
390       return MALI_DRAW_MODE_LINE_STRIP;
391    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
392       return MALI_DRAW_MODE_TRIANGLES;
393    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
394       return MALI_DRAW_MODE_TRIANGLE_STRIP;
395    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
396       return MALI_DRAW_MODE_TRIANGLE_FAN;
397    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
398    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
399    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
400    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
401    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
402    default:
403       unreachable("Invalid primitive type");
404    }
405 }
406 
407 static VkResult
408 update_tls(struct panvk_cmd_buffer *cmdbuf)
409 {
410    struct panvk_tls_state *state = &cmdbuf->state.tls;
411    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
412    const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
413    struct cs_builder *b =
414       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
415 
416    if (!cmdbuf->state.gfx.tsd) {
417       if (!state->desc.gpu) {
418          state->desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
419          if (!state->desc.gpu)
420             return VK_ERROR_OUT_OF_DEVICE_MEMORY;
421       }
422 
423       cmdbuf->state.gfx.tsd = state->desc.gpu;
424 
425       cs_update_vt_ctx(b)
426          cs_move64_to(b, cs_sr_reg64(b, 24), state->desc.gpu);
427    }
428 
429    state->info.tls.size =
430       MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, state->info.tls.size);
431    return VK_SUCCESS;
432 }
433 
434 static enum mali_index_type
435 index_size_to_index_type(uint32_t size)
436 {
437    switch (size) {
438    case 0:
439       return MALI_INDEX_TYPE_NONE;
440    case 1:
441       return MALI_INDEX_TYPE_UINT8;
442    case 2:
443       return MALI_INDEX_TYPE_UINT16;
444    case 4:
445       return MALI_INDEX_TYPE_UINT32;
446    default:
447       assert(!"Invalid index size");
448       return MALI_INDEX_TYPE_NONE;
449    }
450 }
451 
452 static VkResult
453 prepare_blend(struct panvk_cmd_buffer *cmdbuf)
454 {
455    bool dirty = dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
456                 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP_ENABLE) ||
457                 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP) ||
458                 dyn_gfx_state_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
459                 dyn_gfx_state_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
460                 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_ENABLES) ||
461                 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_EQUATIONS) ||
462                 dyn_gfx_state_dirty(cmdbuf, CB_WRITE_MASKS) ||
463                 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS) ||
464                 fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE);
465 
466    if (!dirty)
467       return VK_SUCCESS;
468 
469    const struct vk_dynamic_graphics_state *dyns =
470       &cmdbuf->vk.dynamic_graphics_state;
471    const struct vk_color_blend_state *cb = &dyns->cb;
472    unsigned bd_count = MAX2(cb->attachment_count, 1);
473    struct cs_builder *b =
474       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
475    struct panfrost_ptr ptr =
476       panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND);
477    struct mali_blend_packed *bds = ptr.cpu;
478 
479    if (bd_count && !ptr.gpu)
480       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
481 
482    panvk_per_arch(blend_emit_descs)(cmdbuf, bds);
483 
484    cs_update_vt_ctx(b)
485       cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count);
486 
487    return VK_SUCCESS;
488 }
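
/*
 * Sketch of the encoding used above (an assumption drawn from the
 * 'ptr.gpu | bd_count' expression, not a driver helper): the blend
 * descriptor count rides in the low bits of the descriptor array address
 * written to SR 50, which relies on the BLEND descriptor alignment leaving
 * those bits clear and on bd_count being small enough to fit there.
 */
static inline uint64_t
example_pack_blend_ptr(uint64_t blend_descs_addr, uint32_t bd_count)
{
   assert((blend_descs_addr & (pan_alignment(BLEND) - 1)) == 0);
   assert(bd_count < pan_alignment(BLEND));
   return blend_descs_addr | bd_count;
}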
489 
490 static void
491 prepare_vp(struct panvk_cmd_buffer *cmdbuf)
492 {
493    struct cs_builder *b =
494       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
495    const VkViewport *viewport =
496       &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0];
497    const VkRect2D *scissor = &cmdbuf->vk.dynamic_graphics_state.vp.scissors[0];
498 
499    if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
500        dyn_gfx_state_dirty(cmdbuf, VP_SCISSORS)) {
501       struct mali_scissor_packed scissor_box;
502       pan_pack(&scissor_box, SCISSOR, cfg) {
503 
504          /* The spec says "width must be greater than 0.0" */
505          assert(viewport->width >= 0);
506          int minx = (int)viewport->x;
507          int maxx = (int)(viewport->x + viewport->width);
508 
509          /* Viewport height can be negative */
510          int miny =
511             MIN2((int)viewport->y, (int)(viewport->y + viewport->height));
512          int maxy =
513             MAX2((int)viewport->y, (int)(viewport->y + viewport->height));
514 
515          assert(scissor->offset.x >= 0 && scissor->offset.y >= 0);
516          minx = MAX2(scissor->offset.x, minx);
517          miny = MAX2(scissor->offset.y, miny);
518          maxx = MIN2(scissor->offset.x + scissor->extent.width, maxx);
519          maxy = MIN2(scissor->offset.y + scissor->extent.height, maxy);
520 
521          /* Make sure we don't end up with a max < min when width/height is 0 */
522          maxx = maxx > minx ? maxx - 1 : maxx;
523          maxy = maxy > miny ? maxy - 1 : maxy;
524 
525          /* Clamp viewport scissor to valid range */
526          cfg.scissor_minimum_x = CLAMP(minx, 0, UINT16_MAX);
527          cfg.scissor_minimum_y = CLAMP(miny, 0, UINT16_MAX);
528          cfg.scissor_maximum_x = CLAMP(maxx, 0, UINT16_MAX);
529          cfg.scissor_maximum_y = CLAMP(maxy, 0, UINT16_MAX);
530       }
531 
532       struct mali_scissor_packed *scissor_box_ptr = &scissor_box;
533       cs_move64_to(b, cs_sr_reg64(b, 42), *((uint64_t*)scissor_box_ptr));
534    }
535 
536    if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
537        dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
538        dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE)) {
539       struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals;
540 
541       float z_min = sysvals->viewport.offset.z;
542       float z_max = z_min + sysvals->viewport.scale.z;
543       cs_move32_to(b, cs_sr_reg32(b, 44), fui(MIN2(z_min, z_max)));
544       cs_move32_to(b, cs_sr_reg32(b, 45), fui(MAX2(z_min, z_max)));
545    }
546 }
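
/*
 * Worked example for the scissor packing above (illustrative numbers):
 *
 *   viewport = { x = 10.5, y = 100, width = 50, height = -60 }
 *   scissor  = { offset = (16, 48), extent = (200, 200) }
 *
 * gives minx = 10, maxx = 60 and, because the negative height flips the
 * y range, miny = 40, maxy = 100. Intersecting with the scissor yields
 * (16, 48) .. (60, 100), and converting to the inclusive coordinates the
 * hardware expects gives scissor_minimum = (16, 48) and
 * scissor_maximum = (59, 99).
 */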
547 
548 static inline uint64_t
549 get_pos_spd(const struct panvk_cmd_buffer *cmdbuf)
550 {
551    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
552    assert(vs);
553    const struct vk_input_assembly_state *ia =
554       &cmdbuf->vk.dynamic_graphics_state.ia;
555    return ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST
556              ? panvk_priv_mem_dev_addr(vs->spds.pos_points)
557              : panvk_priv_mem_dev_addr(vs->spds.pos_triangles);
558 }
559 
560 static void
561 prepare_tiler_primitive_size(struct panvk_cmd_buffer *cmdbuf)
562 {
563    struct cs_builder *b =
564       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
565    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
566    const struct vk_input_assembly_state *ia =
567       &cmdbuf->vk.dynamic_graphics_state.ia;
568    float primitive_size;
569 
570    if (!dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) &&
571        !dyn_gfx_state_dirty(cmdbuf, RS_LINE_WIDTH) &&
572        !gfx_state_dirty(cmdbuf, VS))
573       return;
574 
575    switch (ia->primitive_topology) {
576    /* From the Vulkan spec 1.3.293:
577     *
578     *    "If maintenance5 is enabled and a value is not written to a variable
579     *    decorated with PointSize, a value of 1.0 is used as the size of
580     *    points."
581     *
582     * If no point size is written, ensure that the size is always 1.0f.
583     */
584    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
585       if (vs->info.vs.writes_point_size)
586          return;
587 
588       primitive_size = 1.0f;
589       break;
590    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
591    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
592       primitive_size = cmdbuf->vk.dynamic_graphics_state.rs.line.width;
593       break;
594    default:
595       return;
596    }
597 
598    cs_move32_to(b, cs_sr_reg32(b, 60), fui(primitive_size));
599 }
600 
601 static uint32_t
602 calc_enabled_layer_count(struct panvk_cmd_buffer *cmdbuf)
603 {
604    return cmdbuf->state.gfx.render.view_mask ?
605       util_bitcount(cmdbuf->state.gfx.render.view_mask) :
606       cmdbuf->state.gfx.render.layer_count;
607 }
608 
609 static uint32_t
610 calc_fbd_size(struct panvk_cmd_buffer *cmdbuf)
611 {
612    const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
613    bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
614    uint32_t rt_count = MAX2(fb->rt_count, 1);
615 
616    return get_fbd_size(has_zs_ext, rt_count);
617 }
618 
619 static uint32_t
620 calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf)
621 {
622    uint32_t fbd_count = calc_enabled_layer_count(cmdbuf) *
623       (1 + PANVK_IR_PASS_COUNT);
624    uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
625                                     MAX_LAYERS_PER_TILER_DESC);
626 
627    return (calc_fbd_size(cmdbuf) * fbd_count) +
628           (td_count * pan_size(TILER_CONTEXT));
629 }
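
/*
 * Illustrative sizing example (hypothetical numbers): with 4 enabled layers,
 * one regular pass plus PANVK_IR_PASS_COUNT incremental-rendering passes,
 * and a single tiler descriptor (layer_count <= MAX_LAYERS_PER_TILER_DESC),
 * the reservation computed above is
 *
 *   4 * (1 + PANVK_IR_PASS_COUNT) * calc_fbd_size(cmdbuf) +
 *   1 * pan_size(TILER_CONTEXT)
 *
 * which is the amount later reserved from the descriptor ring buffer when
 * the command buffer is recorded for simultaneous use.
 */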
630 
631 static void
632 cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
633 {
634    /* Make sure we don't allocate more than the ringbuf size. */
635    assert(size <= RENDER_DESC_RINGBUF_SIZE);
636 
637    /* Make sure the allocation is 64-byte aligned. */
638    assert(ALIGN_POT(size, 64) == size);
639 
640    struct cs_index ringbuf_sync = cs_scratch_reg64(b, 0);
641    struct cs_index sz_reg = cs_scratch_reg32(b, 2);
642 
643    cs_load64_to(
644       b, ringbuf_sync, cs_subqueue_ctx_reg(b),
645       offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj));
646    cs_wait_slot(b, SB_ID(LS), false);
647 
648    /* Wait for the other end to release memory. */
649    cs_move32_to(b, sz_reg, size - 1);
650    cs_sync32_wait(b, false, MALI_CS_CONDITION_GREATER, sz_reg, ringbuf_sync);
651 
652    /* Decrement the syncobj to reflect the fact we're reserving memory. */
653    cs_move32_to(b, sz_reg, -size);
654    cs_sync32_add(b, false, MALI_CS_SYNC_SCOPE_CSG, sz_reg, ringbuf_sync,
655                  cs_now());
656 }
657 
658 static void
659 cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
660                                 bool wrap_around)
661 {
662    struct cs_index scratch_reg = cs_scratch_reg32(b, 0);
663    struct cs_index ptr_lo = cs_scratch_reg32(b, 2);
664    struct cs_index pos = cs_scratch_reg32(b, 4);
665 
666    cs_load_to(
667       b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
668       BITFIELD_MASK(3),
669       offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
670    cs_wait_slot(b, SB_ID(LS), false);
671 
672    /* Update the relative position and absolute address. */
673    cs_add32(b, ptr_lo, ptr_lo, size);
674    cs_add32(b, pos, pos, size);
675 
676    /* Wrap-around. */
677    if (likely(wrap_around)) {
678       cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
679 
680       cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
681          cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
682          cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
683       }
684    }
685 
686    cs_store(
687       b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
688       BITFIELD_MASK(3),
689       offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
690    cs_wait_slot(b, SB_ID(LS), false);
691 }
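
/*
 * CPU-side analogue of the pointer update above (illustrative only): the
 * write position advances by 'size' and wraps with a conditional
 * subtraction rather than a modulo, mirroring the cs_add32()/cs_if()
 * sequence. The memory reservation itself is handled separately through the
 * ring-buffer syncobj.
 */
static inline uint32_t
example_ringbuf_advance(uint32_t pos, uint32_t size)
{
   assert(size <= RENDER_DESC_RINGBUF_SIZE);

   pos += size;
   if (pos >= RENDER_DESC_RINGBUF_SIZE)
      pos -= RENDER_DESC_RINGBUF_SIZE;

   return pos;
}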
692 
693 static VkResult
694 get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
695 {
696    assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
697           !inherits_render_ctx(cmdbuf));
698 
699    if (cmdbuf->state.gfx.render.tiler)
700       return VK_SUCCESS;
701 
702    struct cs_builder *b =
703       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
704    struct panvk_physical_device *phys_dev =
705       to_panvk_physical_device(cmdbuf->vk.base.device->physical);
706    struct panvk_instance *instance =
707       to_panvk_instance(phys_dev->vk.instance);
708    bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
709    struct panfrost_tiler_features tiler_features =
710       panfrost_query_tiler_features(&phys_dev->kmod.props);
711    bool simul_use =
712       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
713    struct panfrost_ptr tiler_desc = {0};
714    struct mali_tiler_context_packed tiler_tmpl;
715    uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
716                                     MAX_LAYERS_PER_TILER_DESC);
717 
718    if (!simul_use) {
719       tiler_desc = panvk_cmd_alloc_desc_array(cmdbuf, td_count, TILER_CONTEXT);
720       if (!tiler_desc.gpu)
721          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
722    }
723 
724    const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
725 
726    pan_pack(&tiler_tmpl, TILER_CONTEXT, cfg) {
727       unsigned max_levels = tiler_features.max_levels;
728       assert(max_levels >= 2);
729 
730       cfg.hierarchy_mask =
731          panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx);
732       cfg.fb_width = fbinfo->width;
733       cfg.fb_height = fbinfo->height;
734 
735       cfg.sample_pattern = pan_sample_pattern(fbinfo->nr_samples);
736 
737       cfg.first_provoking_vertex =
738          cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
739             VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
740 
741       /* These are overwritten below on a per-descriptor basis. */
742       cfg.layer_count = 1;
743       cfg.layer_offset = 0;
744    }
745 
746    /* When simul_use=true, the tiler descriptors are allocated from the
747     * descriptor ringbuf. We set state.gfx.render.tiler to a non-NULL
748     * value to satisfy the is_tiler_desc_allocated() tests, but we want
749     * it to point to a faulty address so that we can easily detect if it's
750     * used in the command stream/framebuffer descriptors. */
751    cmdbuf->state.gfx.render.tiler =
752       simul_use ? 0xdeadbeefdeadbeefull : tiler_desc.gpu;
753 
754    struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
755 
756    if (simul_use) {
757       uint32_t descs_sz = calc_render_descs_size(cmdbuf);
758 
759       cs_render_desc_ringbuf_reserve(b, descs_sz);
760 
761       /* Reserve ringbuf mem. */
762       cs_update_vt_ctx(b) {
763          cs_load64_to(b, tiler_ctx_addr, cs_subqueue_ctx_reg(b),
764                       offsetof(struct panvk_cs_subqueue_context,
765                                render.desc_ringbuf.ptr));
766       }
767 
768       cs_render_desc_ringbuf_move_ptr(b, descs_sz, !tracing_enabled);
769    } else {
770       cs_update_vt_ctx(b) {
771          cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
772       }
773    }
774 
775    /* Reset the polygon list. */
776    cs_move64_to(b, cs_scratch_reg64(b, 0), 0);
777 
778    /* Lay out words 2, 3 and 5, so they can be stored along with the other updates.
779     * Word 4 contains layer information and will be updated in the loop. */
780    cs_move64_to(b, cs_scratch_reg64(b, 2),
781                 tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32);
782    cs_move32_to(b, cs_scratch_reg32(b, 5), tiler_tmpl.opaque[5]);
783 
784    /* Load the tiler_heap and geom_buf from the context. */
785    cs_load_to(b, cs_scratch_reg_tuple(b, 6, 4), cs_subqueue_ctx_reg(b),
786               BITFIELD_MASK(4),
787               offsetof(struct panvk_cs_subqueue_context, render.tiler_heap));
788 
789    /* Fill extra fields with zeroes so we can reset the completed
790     * top/bottom and private states. */
791    cs_move64_to(b, cs_scratch_reg64(b, 10), 0);
792    cs_move64_to(b, cs_scratch_reg64(b, 12), 0);
793    cs_move64_to(b, cs_scratch_reg64(b, 14), 0);
794 
795    cs_wait_slot(b, SB_ID(LS), false);
796 
797    /* Take care of the tiler desc with layer_offset=0 outside of the loop. */
798    cs_move32_to(b, cs_scratch_reg32(b, 4),
799                 MIN2(cmdbuf->state.gfx.render.layer_count - 1,
800                      MAX_LAYERS_PER_TILER_DESC - 1));
801 
802    /* Replace words 0:13 and 24:31. */
803    cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
804             BITFIELD_MASK(16), 0);
805    cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
806             BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
807    cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
808             BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
809 
810    cs_wait_slot(b, SB_ID(LS), false);
811 
812    uint32_t remaining_layers =
813       td_count > 1
814          ? cmdbuf->state.gfx.render.layer_count % MAX_LAYERS_PER_TILER_DESC
815          : 0;
816    uint32_t full_td_count =
817       cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
818 
819    if (remaining_layers) {
820       int32_t layer_offset =
821          -(cmdbuf->state.gfx.render.layer_count - remaining_layers) &
822          BITFIELD_MASK(9);
823 
824       /* If the last tiler descriptor is not full, we emit it outside of the
825        * loop to pass the right layer count. All this would be a lot simpler
826        * if we had OR/AND instructions, but here we are. */
827       cs_update_vt_ctx(b)
828          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
829                   pan_size(TILER_CONTEXT) * full_td_count);
830       cs_move32_to(b, cs_scratch_reg32(b, 4),
831                    (layer_offset << 8) | (remaining_layers - 1));
832       cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
833                BITFIELD_MASK(16), 0);
834       cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
835                BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
836       cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
837                BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
838       cs_wait_slot(b, SB_ID(LS), false);
839 
840       cs_update_vt_ctx(b)
841          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
842                   -pan_size(TILER_CONTEXT));
843    } else if (full_td_count > 1) {
844       cs_update_vt_ctx(b)
845          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
846                   pan_size(TILER_CONTEXT) * (full_td_count - 1));
847    }
848 
849    if (full_td_count > 1) {
850       struct cs_index counter_reg = cs_scratch_reg32(b, 17);
851       uint32_t layer_offset =
852          (-MAX_LAYERS_PER_TILER_DESC * (full_td_count - 1)) & BITFIELD_MASK(9);
853 
854       cs_move32_to(b, counter_reg, full_td_count - 1);
855       cs_move32_to(b, cs_scratch_reg32(b, 4),
856                    (layer_offset << 8) | (MAX_LAYERS_PER_TILER_DESC - 1));
857 
858       /* We iterate the remaining full tiler descriptors in reverse order, so we
859        * can start from the smallest layer offset, and increment it by
860        * MAX_LAYERS_PER_TILER_DESC << 8 at each iteration. Again, the split is
861        * mostly due to the lack of AND instructions, and the fact layer_offset
862        * is a 9-bit signed integer inside a 32-bit word, which ADD32 can't deal
863        * with unless the number we add is positive.
864        */
865       cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
866          /* Replace words 0:13 and 24:31. */
867          cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
868                   BITFIELD_MASK(16), 0);
869          cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
870                   BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
871          cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
872                   BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
873 
874          cs_wait_slot(b, SB_ID(LS), false);
875 
876          cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4),
877                   MAX_LAYERS_PER_TILER_DESC << 8);
878 
879          cs_add32(b, counter_reg, counter_reg, -1);
880          cs_update_vt_ctx(b)
881             cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
882                      -pan_size(TILER_CONTEXT));
883       }
884    }
885 
886    /* Then we change the scoreboard slot used for iterators. */
887    panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
888 
889    cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
890    return VK_SUCCESS;
891 }
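
/*
 * Sketch of how the scratch word stored at offset 4 of each tiler context is
 * assembled above (an assumption derived from the cs_move32_to() calls, not
 * a pan_pack() helper): the layer count minus one sits in the low 8 bits and
 * the 9-bit signed layer offset is placed right above it.
 */
static inline uint32_t
example_tiler_layer_word(int32_t layer_offset, uint32_t layer_count)
{
   assert(layer_count >= 1 && layer_count <= MAX_LAYERS_PER_TILER_DESC);
   return ((layer_offset & BITFIELD_MASK(9)) << 8) | (layer_count - 1);
}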
892 
893 static uint8_t
894 prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
895                 uint32_t layer, void *fbd)
896 {
897    struct pan_tiler_context tiler_ctx = {
898       .valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
899    };
900 
901    if (!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
902       uint32_t td_idx = layer / MAX_LAYERS_PER_TILER_DESC;
903 
904       tiler_ctx.valhall.desc =
905          cmdbuf->state.gfx.render.tiler + (td_idx * pan_size(TILER_CONTEXT));
906    }
907 
908    return GENX(pan_emit_fbd)(fbinfo, layer, NULL, &tiler_ctx, fbd);
909 }
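
/*
 * Illustrative helpers (not driver code): which tiler descriptor a given
 * layer uses and the layer_offset recorded in its tiler context, matching
 * the grouping of MAX_LAYERS_PER_TILER_DESC layers per descriptor used
 * above.
 */
static inline uint32_t
example_tiler_desc_index(uint32_t layer)
{
   return layer / MAX_LAYERS_PER_TILER_DESC;
}

static inline uint32_t
example_tiler_layer_offset(uint32_t layer)
{
   return layer - (layer % MAX_LAYERS_PER_TILER_DESC);
}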
910 
911 static VkResult
912 prepare_incremental_rendering_fbinfos(
913    struct panvk_cmd_buffer *cmdbuf, const struct pan_fb_info *fbinfo,
914    struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT])
915 {
916    /* First incremental rendering pass: don't discard result */
917 
918    struct pan_fb_info *ir_fb = &ir_fbinfos[PANVK_IR_FIRST_PASS];
919 
920    memcpy(ir_fb, fbinfo, sizeof(*ir_fb));
921    for (unsigned i = 0; i < fbinfo->rt_count; i++)
922       ir_fb->rts[i].discard = false;
923    ir_fb->zs.discard.z = false;
924    ir_fb->zs.discard.s = false;
925 
926    /* Subsequent incremental rendering passes: preload old content and don't
927     * discard result */
928 
929    struct pan_fb_info *prev_ir_fb = ir_fb;
930    ir_fb = &ir_fbinfos[PANVK_IR_MIDDLE_PASS];
931    memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
932 
933    bool preload_changed = false;
934 
935    for (unsigned i = 0; i < fbinfo->rt_count; i++) {
936       if (fbinfo->rts[i].view && !fbinfo->rts[i].preload) {
937          ir_fb->rts[i].preload = true;
938          preload_changed = true;
939       }
940 
941       if (ir_fb->rts[i].clear) {
942          ir_fb->rts[i].clear = false;
943          preload_changed = true;
944       }
945    }
946    if (fbinfo->zs.view.zs && !fbinfo->zs.preload.z && !fbinfo->zs.preload.s) {
947       ir_fb->zs.preload.z = true;
948       ir_fb->zs.preload.s = true;
949       preload_changed = true;
950    } else if (fbinfo->zs.view.s && !fbinfo->zs.preload.s) {
951       ir_fb->zs.preload.s = true;
952       preload_changed = true;
953    }
954 
955    if (ir_fb->zs.clear.z || ir_fb->zs.clear.s) {
956       ir_fb->zs.clear.z = false;
957       ir_fb->zs.clear.s = false;
958       preload_changed = true;
959    }
960 
961    if (preload_changed) {
962       memset(&ir_fb->bifrost.pre_post.dcds, 0x0,
963              sizeof(ir_fb->bifrost.pre_post.dcds));
964       VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, ir_fb);
965       if (result != VK_SUCCESS)
966          return result;
967    }
968 
969    /* Last incremental rendering pass: preload previous content and deal with
970     * results as specified by user */
971 
972    prev_ir_fb = ir_fb;
973    ir_fb = &ir_fbinfos[PANVK_IR_LAST_PASS];
974    memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
975 
976    for (unsigned i = 0; i < fbinfo->rt_count; i++)
977       ir_fb->rts[i].discard = fbinfo->rts[i].discard;
978    ir_fb->zs.discard.z = fbinfo->zs.discard.z;
979    ir_fb->zs.discard.s = fbinfo->zs.discard.s;
980 
981    return VK_SUCCESS;
982 }
983 
984 static VkResult
985 get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
986 {
987    assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
988           !inherits_render_ctx(cmdbuf));
989 
990    if (cmdbuf->state.gfx.render.fbds.gpu ||
991        !cmdbuf->state.gfx.render.layer_count)
992       return VK_SUCCESS;
993 
994    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
995    uint32_t fbds_sz = fbd_sz * calc_enabled_layer_count(cmdbuf) *
996       (1 + PANVK_IR_PASS_COUNT);
997 
998    cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
999       cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
1000    if (!cmdbuf->state.gfx.render.fbds.gpu)
1001       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1002 
1003    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
1004    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1005    bool simul_use =
1006       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
1007 
1008    /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
1009     * involved (clear job) or if the update can happen in place (not
1010     * simultaneous use of the command buffer), we can avoid the
1011     * copy.
1012     *
1013     * According to VUID-VkSubmitInfo2KHR-commandBuffer-06192 and
1014     * VUID-VkSubmitInfo2KHR-commandBuffer-06010, suspend/resume operations
1015     * can't cross the vkQueueSubmit2() boundary, so no need to dynamically
1016     * allocate descriptors in that case:
1017     * "
1018     *   If any commandBuffer member of an element of pCommandBufferInfos
1019     *   contains any suspended render pass instances, they must be resumed by a
1020     *   render pass instance later in submission order within
1021     *   pCommandBufferInfos.
1022     *
1023     *   If any commandBuffer member of an element of pCommandBufferInfos
1024     *   contains any resumed render pass instances, they must be suspended by a
1025     *   render pass instance earlier in submission order within
1026     *   pCommandBufferInfos.
1027     * "
1028     */
1029    bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler;
1030    struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
1031    uint32_t fbd_flags = 0;
1032    uint32_t fbd_ir_pass_offset = fbd_sz * calc_enabled_layer_count(cmdbuf);
1033 
1034    fbinfo->sample_positions =
1035       dev->sample_positions->addr.dev +
1036       panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
1037 
1038    VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, fbinfo);
1039    if (result != VK_SUCCESS)
1040       return result;
1041 
1042    struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT];
1043    result = prepare_incremental_rendering_fbinfos(cmdbuf, fbinfo, ir_fbinfos);
1044    if (result != VK_SUCCESS)
1045       return result;
1046 
1047    /* We prepare all FB descriptors upfront. For multiview, only create FBDs
1048     * for enabled views. */
1049    uint32_t view_mask_temp = cmdbuf->state.gfx.render.view_mask;
1050    uint32_t enabled_layer_count = calc_enabled_layer_count(cmdbuf);
1051    bool multiview = cmdbuf->state.gfx.render.view_mask;
1052 
1053    for (uint32_t i = 0; i < enabled_layer_count; i++) {
1054       uint32_t layer_idx = multiview ? u_bit_scan(&view_mask_temp) : i;
1055 
1056       uint32_t layer_offset = fbd_sz * i;
1057       uint32_t new_fbd_flags =
1058          prepare_fb_desc(cmdbuf, fbinfo, layer_idx, fbds.cpu + layer_offset);
1059 
1060       /* Make sure all FBDs have the same flags. */
1061       assert(i == 0 || new_fbd_flags == fbd_flags);
1062       fbd_flags = new_fbd_flags;
1063 
1064       for (uint32_t j = 0; j < PANVK_IR_PASS_COUNT; j++) {
1065          uint32_t ir_pass_offset = (1 + j) * fbd_ir_pass_offset;
1066          new_fbd_flags =
1067             prepare_fb_desc(cmdbuf, &ir_fbinfos[j], layer_idx,
1068                             fbds.cpu + ir_pass_offset + layer_offset);
1069 
1070          /* Make sure all IR FBDs have the same flags. */
1071          assert(new_fbd_flags == fbd_flags);
1072       }
1073    }
1074 
1075    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1076 
1077    if (copy_fbds) {
1078       struct cs_index cur_tiler = cs_sr_reg64(b, 38);
1079       struct cs_index dst_fbd_ptr = cs_sr_reg64(b, 40);
1080       struct cs_index layer_count = cs_sr_reg32(b, 47);
1081       struct cs_index src_fbd_ptr = cs_sr_reg64(b, 48);
1082       struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 50);
1083       struct cs_index pass_count = cs_sr_reg32(b, 51);
1084       struct cs_index pass_src_fbd_ptr = cs_sr_reg64(b, 52);
1085       struct cs_index pass_dst_fbd_ptr = cs_sr_reg64(b, 54);
1086       uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1087                                        MAX_LAYERS_PER_TILER_DESC);
1088 
1089       cs_update_frag_ctx(b) {
1090          cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b),
1091                       offsetof(struct panvk_cs_subqueue_context,
1092                                render.desc_ringbuf.ptr));
1093          cs_wait_slot(b, SB_ID(LS), false);
1094          cs_add64(b, dst_fbd_ptr, cur_tiler,
1095                   pan_size(TILER_CONTEXT) * td_count);
1096       }
1097 
1098       cs_move64_to(b, src_fbd_ptr, fbds.gpu);
1099       cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
1100 
1101       cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
1102       cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
1103          /* Our loop copies 64 bytes at a time, so make sure the
1104           * framebuffer descriptor size is a multiple of 64 bytes. */
1105          assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
1106 
1107          cs_move32_to(b, pass_count, PANVK_IR_PASS_COUNT);
1108          cs_add64(b, pass_src_fbd_ptr, src_fbd_ptr, 0);
1109          cs_add64(b, pass_dst_fbd_ptr, dst_fbd_ptr, 0);
1110          /* Copy the FBDs for the regular pass as well as the IR passes. */
1111          cs_while(b, MALI_CS_CONDITION_GEQUAL, pass_count) {
1112             for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
1113                if (fbd_off == 0) {
1114                   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14),
1115                              pass_src_fbd_ptr, BITFIELD_MASK(14), fbd_off);
1116                   cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
1117                } else {
1118                   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16),
1119                              pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off);
1120                }
1121                cs_wait_slot(b, SB_ID(LS), false);
1122                cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr,
1123                         BITFIELD_MASK(16), fbd_off);
1124                cs_wait_slot(b, SB_ID(LS), false);
1125             }
1126             cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset);
1127             cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset);
1128             cs_add32(b, pass_count, pass_count, -1);
1129          }
1130 
1131          cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
1132          cs_update_frag_ctx(b)
1133             cs_add64(b, dst_fbd_ptr, dst_fbd_ptr, fbd_sz);
1134 
1135          cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1);
1136          cs_add32(b, layer_count, layer_count, -1);
1137          cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) {
1138             cs_update_frag_ctx(b)
1139                cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));
1140             cs_move32_to(b, remaining_layers_in_td,
1141                          MAX_LAYERS_PER_TILER_DESC);
1142          }
1143       }
1144 
1145       cs_update_frag_ctx(b) {
1146          uint32_t full_td_count =
1147             cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
1148 
1149          /* If the last tiler descriptor is not full, cur_tiler points to the
1150           * last tiler descriptor, not the FBD that follows. */
1151          if (full_td_count < td_count)
1152             cs_add64(b, dst_fbd_ptr, cur_tiler,
1153                      fbd_flags + pan_size(TILER_CONTEXT));
1154          else
1155             cs_add64(b, dst_fbd_ptr, cur_tiler, fbd_flags);
1156 
1157          cs_add64(b, cur_tiler, cur_tiler,
1158                   -(full_td_count * pan_size(TILER_CONTEXT)));
1159       }
1160    } else {
1161       cs_update_frag_ctx(b) {
1162          cs_move64_to(b, cs_sr_reg64(b, 40), fbds.gpu | fbd_flags);
1163          cs_move64_to(b, cs_sr_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
1164       }
1165    }
1166 
1167    return VK_SUCCESS;
1168 }
1169 
1170 static void
1171 set_provoking_vertex_mode(struct panvk_cmd_buffer *cmdbuf)
1172 {
1173    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1174    bool first_provoking_vertex =
1175       cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
1176          VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
1177 
1178    /* If this is not the first draw, first_provoking_vertex should match
1179     * the one from the previous draws. Unfortunately, we can't check it
1180     * when the render pass is inherited. */
1181    assert(!cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf) ||
1182           fbinfo->first_provoking_vertex == first_provoking_vertex);
1183 
1184    fbinfo->first_provoking_vertex = first_provoking_vertex;
1185 }
1186 
1187 static VkResult
1188 get_render_ctx(struct panvk_cmd_buffer *cmdbuf)
1189 {
1190    VkResult result = get_tiler_desc(cmdbuf);
1191    if (result != VK_SUCCESS)
1192       return result;
1193 
1194    return get_fb_descs(cmdbuf);
1195 }
1196 
1197 static VkResult
1198 prepare_vs(struct panvk_cmd_buffer *cmdbuf)
1199 {
1200    struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1201    struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
1202    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1203    struct cs_builder *b =
1204       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1205    bool upd_res_table = false;
1206 
1207    VkResult result = prepare_vs_driver_set(cmdbuf);
1208    if (result != VK_SUCCESS)
1209       return result;
1210 
1211    if (gfx_state_dirty(cmdbuf, VS) || gfx_state_dirty(cmdbuf, DESC_STATE) ||
1212        vs_driver_set_is_dirty(cmdbuf)) {
1213       result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1214                                                             vs, vs_desc_state);
1215       if (result != VK_SUCCESS)
1216          return result;
1217 
1218       upd_res_table = true;
1219    }
1220 
1221    cs_update_vt_ctx(b) {
1222       if (upd_res_table)
1223          cs_move64_to(b, cs_sr_reg64(b, 0), vs_desc_state->res_table);
1224 
1225       if (gfx_state_dirty(cmdbuf, VS) ||
1226           dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY))
1227          cs_move64_to(b, cs_sr_reg64(b, 16), get_pos_spd(cmdbuf));
1228 
1229       if (gfx_state_dirty(cmdbuf, VS))
1230          cs_move64_to(b, cs_sr_reg64(b, 18),
1231                       panvk_priv_mem_dev_addr(vs->spds.var));
1232    }
1233 
1234    return VK_SUCCESS;
1235 }
1236 
1237 static VkResult
1238 prepare_fs(struct panvk_cmd_buffer *cmdbuf)
1239 {
1240    const struct panvk_shader *fs = get_fs(cmdbuf);
1241    struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
1242    struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1243    struct cs_builder *b =
1244       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1245 
1246    if (fs &&
1247        (gfx_state_dirty(cmdbuf, FS) || gfx_state_dirty(cmdbuf, DESC_STATE))) {
1248       VkResult result = prepare_fs_driver_set(cmdbuf);
1249       if (result != VK_SUCCESS)
1250          return result;
1251 
1252       result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1253                                                             fs, fs_desc_state);
1254       if (result != VK_SUCCESS)
1255          return result;
1256    }
1257 
1258    cs_update_vt_ctx(b) {
1259       if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, DESC_STATE))
1260          cs_move64_to(b, cs_sr_reg64(b, 4), fs ? fs_desc_state->res_table : 0);
1261       if (fs_user_dirty(cmdbuf))
1262          cs_move64_to(b, cs_sr_reg64(b, 20),
1263                       fs ? panvk_priv_mem_dev_addr(fs->spd) : 0);
1264    }
1265 
1266    return VK_SUCCESS;
1267 }
1268 
1269 static VkResult
1270 prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
1271 {
1272    struct cs_builder *b =
1273       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1274    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1275    const struct panvk_shader *fs = get_fs(cmdbuf);
1276    VkResult result;
1277 
1278    if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
1279       result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs);
1280       if (result != VK_SUCCESS)
1281          return result;
1282 
1283       cs_update_vt_ctx(b) {
1284          cs_move64_to(b, cs_sr_reg64(b, 8),
1285                       cmdbuf->state.gfx.vs.push_uniforms |
1286                          ((uint64_t)vs->fau.total_count << 56));
1287       }
1288    }
1289 
1290    if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
1291       uint64_t fau_ptr = 0;
1292 
1293       if (fs) {
1294          result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, fs);
1295          if (result != VK_SUCCESS)
1296             return result;
1297 
1298          fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
1299                    ((uint64_t)fs->fau.total_count << 56);
1300       }
1301 
1302       cs_update_vt_ctx(b)
1303          cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr);
1304    }
1305 
1306    return VK_SUCCESS;
1307 }
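
/*
 * Sketch of the FAU pointer encoding used above (mirrors the expressions in
 * prepare_push_uniforms(), not a driver helper): the shader's FAU count is
 * stored in the top byte of the pointer written to the SR register, which
 * assumes the push-uniform buffer address fits in the low 56 bits.
 */
static inline uint64_t
example_pack_fau_ptr(uint64_t fau_addr, uint8_t fau_count)
{
   assert((fau_addr >> 56) == 0);
   return fau_addr | ((uint64_t)fau_count << 56);
}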
1308 
1309 static VkResult
1310 prepare_ds(struct panvk_cmd_buffer *cmdbuf)
1311 {
1312    bool dirty = dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1313                 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1314                 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1315                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1316                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1317                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_COMPARE_MASK) ||
1318                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1319                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_REFERENCE) ||
1320                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1321                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
1322                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_ENABLE) ||
1323                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_FACTORS) ||
1324                 fs_user_dirty(cmdbuf);
1325 
1326    if (!dirty)
1327       return VK_SUCCESS;
1328 
1329    struct cs_builder *b =
1330       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1331    const struct vk_dynamic_graphics_state *dyns =
1332       &cmdbuf->vk.dynamic_graphics_state;
1333    const struct vk_depth_stencil_state *ds = &dyns->ds;
1334    const struct vk_rasterization_state *rs = &dyns->rs;
1335    bool test_s = has_stencil_att(cmdbuf) && ds->stencil.test_enable;
1336    bool test_z = has_depth_att(cmdbuf) && ds->depth.test_enable;
1337    const struct panvk_shader *fs = get_fs(cmdbuf);
1338 
1339    struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL);
1340    if (!zsd.gpu)
1341       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1342 
1343    pan_cast_and_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
1344       cfg.stencil_test_enable = test_s;
1345       if (test_s) {
1346          cfg.front_compare_function =
1347             translate_compare_func(ds->stencil.front.op.compare);
1348          cfg.front_stencil_fail =
1349             translate_stencil_op(ds->stencil.front.op.fail);
1350          cfg.front_depth_fail =
1351             translate_stencil_op(ds->stencil.front.op.depth_fail);
1352          cfg.front_depth_pass = translate_stencil_op(ds->stencil.front.op.pass);
1353          cfg.back_compare_function =
1354             translate_compare_func(ds->stencil.back.op.compare);
1355          cfg.back_stencil_fail = translate_stencil_op(ds->stencil.back.op.fail);
1356          cfg.back_depth_fail =
1357             translate_stencil_op(ds->stencil.back.op.depth_fail);
1358          cfg.back_depth_pass = translate_stencil_op(ds->stencil.back.op.pass);
1359       }
1360 
1361       cfg.stencil_from_shader = fs ? fs->info.fs.writes_stencil : 0;
1362       cfg.front_write_mask = ds->stencil.front.write_mask;
1363       cfg.back_write_mask = ds->stencil.back.write_mask;
1364       cfg.front_value_mask = ds->stencil.front.compare_mask;
1365       cfg.back_value_mask = ds->stencil.back.compare_mask;
1366       cfg.front_reference_value = ds->stencil.front.reference;
1367       cfg.back_reference_value = ds->stencil.back.reference;
1368 
1369       cfg.depth_cull_enable = vk_rasterization_state_depth_clip_enable(rs);
1370       if (rs->depth_clamp_enable)
1371          cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS;
1372 
1373       if (fs)
1374          cfg.depth_source = pan_depth_source(&fs->info);
1375       cfg.depth_write_enable = test_z && ds->depth.write_enable;
1376       cfg.depth_bias_enable = rs->depth_bias.enable;
1377       cfg.depth_function = test_z ? translate_compare_func(ds->depth.compare_op)
1378                                   : MALI_FUNC_ALWAYS;
1379       cfg.depth_units = rs->depth_bias.constant_factor;
1380       cfg.depth_factor = rs->depth_bias.slope_factor;
1381       cfg.depth_bias_clamp = rs->depth_bias.clamp;
1382    }
1383 
1384    cs_update_vt_ctx(b)
1385       cs_move64_to(b, cs_sr_reg64(b, 52), zsd.gpu);
1386 
1387    return VK_SUCCESS;
1388 }
1389 
1390 static VkResult
1391 wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf)
1392 {
1393    uint64_t last_syncobj = cmdbuf->state.gfx.render.oq.last;
1394 
1395    if (!last_syncobj)
1396       return VK_SUCCESS;
1397 
1398    uint64_t prev_oq_node = cmdbuf->state.gfx.render.oq.chain;
1399    struct panfrost_ptr new_oq_node = panvk_cmd_alloc_dev_mem(
1400       cmdbuf, desc, sizeof(struct panvk_cs_occlusion_query), 8);
1401 
1402    if (!new_oq_node.gpu)
1403       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1404 
1405    cmdbuf->state.gfx.render.oq.chain = new_oq_node.gpu;
1406 
1407    struct panvk_cs_occlusion_query *oq = new_oq_node.cpu;
1408 
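   /* Prepend a node for the query that just ended to the OQ chain; the
    * fragment subqueue walks this list and signals each syncobj once the
    * fragment work completes. */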
1409    *oq = (struct panvk_cs_occlusion_query){
1410       .syncobj = last_syncobj,
1411       .next = prev_oq_node,
1412    };
1413 
1414    /* If we already had an OQ in the chain, we don't need to initialize the
1415     * oq_chain field in the subqueue ctx. */
1416    if (prev_oq_node)
1417       return VK_SUCCESS;
1418 
1419    /* If we're a secondary cmdbuf inside a render pass, we let the primary
1420     * cmdbuf link the OQ chain. */
1421    if (cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)
1422       return VK_SUCCESS;
1423 
1424    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1425    struct cs_index oq_node_reg = cs_scratch_reg64(b, 0);
1426 
1427    cs_move64_to(b, oq_node_reg, new_oq_node.gpu);
1428 
1429    /* If we're resuming, we need to link with the previous oq_chain, if any. */
1430    if (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT) {
1431       struct cs_index prev_oq_node_reg = cs_scratch_reg64(b, 2);
1432 
1433       cs_load64_to(
1434          b, prev_oq_node_reg, cs_subqueue_ctx_reg(b),
1435          offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1436       cs_wait_slot(b, SB_ID(LS), false);
1437       cs_store64(b, prev_oq_node_reg, oq_node_reg,
1438                  offsetof(struct panvk_cs_occlusion_query, next));
1439       cs_wait_slot(b, SB_ID(LS), false);
1440    }
1441 
1442    cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b),
1443               offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1444    cs_wait_slot(b, SB_ID(LS), false);
1445    return VK_SUCCESS;
1446 }
1447 
1448 static VkResult
1449 prepare_oq(struct panvk_cmd_buffer *cmdbuf)
1450 {
1451    if (!gfx_state_dirty(cmdbuf, OQ) ||
1452        cmdbuf->state.gfx.occlusion_query.syncobj ==
1453           cmdbuf->state.gfx.render.oq.last)
1454       return VK_SUCCESS;
1455 
1456    VkResult result = wrap_prev_oq(cmdbuf);
1457    if (result != VK_SUCCESS)
1458       return result;
1459 
1460    struct cs_builder *b =
1461       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
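   /* Load the new occlusion query buffer address into SR46 and remember which
    * syncobj it signals, so the same query isn't wrapped again on the next
    * draw. */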
1462    cs_move64_to(b, cs_sr_reg64(b, 46), cmdbuf->state.gfx.occlusion_query.ptr);
1463 
1464    cmdbuf->state.gfx.render.oq.last =
1465       cmdbuf->state.gfx.occlusion_query.syncobj;
1466    return VK_SUCCESS;
1467 }
1468 
1469 static void
1470 prepare_dcd(struct panvk_cmd_buffer *cmdbuf)
1471 {
1472    struct cs_builder *b =
1473       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1474    const struct panvk_shader *fs = get_fs(cmdbuf);
1475    bool dcd0_dirty =
1476       dyn_gfx_state_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||
1477       dyn_gfx_state_dirty(cmdbuf, RS_CULL_MODE) ||
1478       dyn_gfx_state_dirty(cmdbuf, RS_FRONT_FACE) ||
1479       dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1480       dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1481       dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) ||
1482       dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
1483       /* writes_depth() uses vk_depth_stencil_state */
1484       dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1485       dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1486       dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1487       /* writes_stencil() uses vk_depth_stencil_state */
1488       dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1489       dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1490       dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1491       fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE) ||
1492       gfx_state_dirty(cmdbuf, OQ);
1493    bool dcd1_dirty = dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1494                      dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1495                      fs_user_dirty(cmdbuf) ||
1496                      gfx_state_dirty(cmdbuf, RENDER_STATE);
1497 
1498    const struct vk_dynamic_graphics_state *dyns =
1499       &cmdbuf->vk.dynamic_graphics_state;
1500    const struct vk_rasterization_state *rs =
1501       &cmdbuf->vk.dynamic_graphics_state.rs;
1502    bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
1503    bool writes_z = writes_depth(cmdbuf);
1504    bool writes_s = writes_stencil(cmdbuf);
1505 
1506    if (dcd0_dirty) {
1507       struct mali_dcd_flags_0_packed dcd0;
1508       pan_pack(&dcd0, DCD_FLAGS_0, cfg) {
1509          if (fs) {
1510             uint8_t rt_written = fs->info.outputs_written >> FRAG_RESULT_DATA0;
1511             uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
1512                               MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
1513 
1514             cfg.allow_forward_pixel_to_kill =
1515                fs->info.fs.can_fpk && !(rt_mask & ~rt_written) &&
1516                !alpha_to_coverage && !cmdbuf->state.gfx.cb.info.any_dest_read;
1517 
1518             bool writes_zs = writes_z || writes_s;
1519             bool zs_always_passes = ds_test_always_passes(cmdbuf);
1520             bool oq = cmdbuf->state.gfx.occlusion_query.mode !=
1521                       MALI_OCCLUSION_MODE_DISABLED;
1522 
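            /* Let the common early-ZS helper pick the pixel-kill and ZS-update
             * points, based on whether the shader writes Z/S (or an occlusion
             * query is active), alpha-to-coverage, and whether the ZS test
             * trivially passes. */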
1523             struct pan_earlyzs_state earlyzs =
1524                pan_earlyzs_get(pan_earlyzs_analyze(&fs->info), writes_zs || oq,
1525                                alpha_to_coverage, zs_always_passes);
1526 
1527             cfg.pixel_kill_operation = earlyzs.kill;
1528             cfg.zs_update_operation = earlyzs.update;
1529             cfg.evaluate_per_sample = fs->info.fs.sample_shading &&
1530                                       (dyns->ms.rasterization_samples > 1);
1531 
1532             cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
1533                                            fs->info.fs.can_discard ||
1534                                            alpha_to_coverage;
1535          } else {
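            /* No fragment shader bound: forward pixel kill is always allowed
             * and both the pixel-kill and ZS-update operations are forced
             * early. */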
1536             cfg.allow_forward_pixel_to_kill = true;
1537             cfg.allow_forward_pixel_to_be_killed = true;
1538             cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1539             cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1540             cfg.overdraw_alpha0 = true;
1541             cfg.overdraw_alpha1 = true;
1542          }
1543 
1544          cfg.front_face_ccw = rs->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
1545          cfg.cull_front_face = (rs->cull_mode & VK_CULL_MODE_FRONT_BIT) != 0;
1546          cfg.cull_back_face = (rs->cull_mode & VK_CULL_MODE_BACK_BIT) != 0;
1547 
1548          cfg.multisample_enable = dyns->ms.rasterization_samples > 1;
1549          cfg.occlusion_query = cmdbuf->state.gfx.occlusion_query.mode;
1550          cfg.alpha_to_coverage = alpha_to_coverage;
1551       }
1552 
1553       cs_update_vt_ctx(b)
1554          cs_move32_to(b, cs_sr_reg32(b, 57), dcd0.opaque[0]);
1555    }
1556 
1557    if (dcd1_dirty) {
1558       struct mali_dcd_flags_1_packed dcd1;
1559       pan_pack(&dcd1, DCD_FLAGS_1, cfg) {
1560          cfg.sample_mask = dyns->ms.rasterization_samples > 1
1561                               ? dyns->ms.sample_mask
1562                               : UINT16_MAX;
1563 
1564          if (fs) {
1565             cfg.render_target_mask =
1566                (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
1567                cmdbuf->state.gfx.render.bound_attachments;
1568          }
1569       }
1570 
1571       cs_update_vt_ctx(b)
1572          cs_move32_to(b, cs_sr_reg32(b, 58), dcd1.opaque[0]);
1573    }
1574 }
1575 
1576 static void
1577 prepare_index_buffer(struct panvk_cmd_buffer *cmdbuf,
1578                      struct panvk_draw_info *draw)
1579 {
1580    struct cs_builder *b =
1581       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1582 
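   /* Only indexed draws consume the index buffer registers: SR39 holds the
    * size of the bound range and SR54 its GPU address, both taken at the
    * currently bound offset. */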
1583    if (draw->index.size && gfx_state_dirty(cmdbuf, IB)) {
1584       uint64_t ib_size =
1585          panvk_buffer_range(cmdbuf->state.gfx.ib.buffer,
1586                             cmdbuf->state.gfx.ib.offset, VK_WHOLE_SIZE);
1587       assert(ib_size <= UINT32_MAX);
1588       cs_move32_to(b, cs_sr_reg32(b, 39), ib_size);
1589 
1590       cs_move64_to(b, cs_sr_reg64(b, 54),
1591                    panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer,
1592                                         cmdbuf->state.gfx.ib.offset));
1593    }
1594 }
1595 
1596 static void
1597 set_tiler_idvs_flags(struct cs_builder *b, struct panvk_cmd_buffer *cmdbuf,
1598                      struct panvk_draw_info *draw)
1599 {
1600    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1601    const struct panvk_shader *fs = get_fs(cmdbuf);
1602    const struct vk_dynamic_graphics_state *dyns =
1603       &cmdbuf->vk.dynamic_graphics_state;
1604    const struct vk_input_assembly_state *ia = &dyns->ia;
1605    const struct vk_rasterization_state *rs = &dyns->rs;
1606    struct mali_primitive_flags_packed tiler_idvs_flags;
1607 
1608    /* When drawing non-point primitives, we use the no_psiz variant which has
1609     * point size writes patched out */
1610    bool writes_point_size =
1611       vs->info.vs.writes_point_size &&
1612       ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
1613    bool multiview = cmdbuf->state.gfx.render.view_mask;
1614    bool writes_layer = vs->info.outputs_written & VARYING_BIT_LAYER;
1615 
1616    /* Multiview shaders depend on the FIFO format for indexing per-view
1617     * output writes. We don't currently patch these offsets in the no_psiz
1618     * variant, so we still need the extended format even though the shader
1619     * does not write point size. */
1620    bool extended_fifo = writes_point_size || writes_layer ||
1621                         (vs->info.vs.writes_point_size && multiview);
1622 
1623    bool dirty = gfx_state_dirty(cmdbuf, VS) || fs_user_dirty(cmdbuf) ||
1624                 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_RESTART_ENABLE) ||
1625                 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) ||
1626                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1627                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE);
1628 
1629    if (dirty) {
1630       pan_pack(&tiler_idvs_flags, PRIMITIVE_FLAGS, cfg) {
1631          cfg.draw_mode = translate_prim_topology(ia->primitive_topology);
1632 
1633          cfg.point_size_array_format = writes_point_size
1634             ? MALI_POINT_SIZE_ARRAY_FORMAT_FP16
1635             : MALI_POINT_SIZE_ARRAY_FORMAT_NONE;
1636          cfg.layer_index_enable = writes_layer;
1637 
1638          cfg.position_fifo_format = extended_fifo
1639             ? MALI_FIFO_FORMAT_EXTENDED
1640             : MALI_FIFO_FORMAT_BASIC;
1641 
1642          cfg.low_depth_cull = cfg.high_depth_cull =
1643             vk_rasterization_state_depth_clip_enable(rs);
1644 
1645          cfg.secondary_shader = vs->info.vs.secondary_enable && fs != NULL;
1646          cfg.primitive_restart = ia->primitive_restart_enable;
1647          cfg.view_mask = cmdbuf->state.gfx.render.view_mask;
1648       }
1649 
1650       cs_move32_to(b, cs_sr_reg32(b, 56), tiler_idvs_flags.opaque[0]);
1651    }
1652 }
1653 
1654 static struct mali_primitive_flags_packed
1655 get_tiler_flags_override(struct panvk_draw_info *draw)
1656 {
1657    struct mali_primitive_flags_packed flags_override;
1658    /* Pack with nodefaults so only explicitly set override fields affect the
1659     * previously set register values */
1660    pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
1661       cfg.index_type = index_size_to_index_type(draw->index.size);
1662    };
1663 
1664    return flags_override;
1665 }
1666 
1667 static VkResult
1668 prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1669 {
1670    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1671    const struct panvk_shader *fs = get_fs(cmdbuf);
1672    struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1673    VkResult result;
1674 
1675    assert(vs);
1676    bool idvs = vs->info.vs.idvs;
1677 
1678    /* FIXME: support non-IDVS. */
1679    assert(idvs);
1680 
1681    set_provoking_vertex_mode(cmdbuf);
1682 
1683    result = update_tls(cmdbuf);
1684    if (result != VK_SUCCESS)
1685       return result;
1686 
1687    if (!inherits_render_ctx(cmdbuf)) {
1688       result = get_render_ctx(cmdbuf);
1689       if (result != VK_SUCCESS)
1690          return result;
1691    }
1692 
1693    struct cs_builder *b =
1694       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1695 
1696    uint32_t used_set_mask =
1697       vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);
1698 
1699    if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS) ||
1700        gfx_state_dirty(cmdbuf, FS)) {
1701       result = panvk_per_arch(cmd_prepare_push_descs)(cmdbuf, desc_state,
1702                                                       used_set_mask);
1703       if (result != VK_SUCCESS)
1704          return result;
1705    }
1706 
1707    result = prepare_blend(cmdbuf);
1708    if (result != VK_SUCCESS)
1709       return result;
1710 
1711    panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
1712 
1713    result = prepare_push_uniforms(cmdbuf);
1714    if (result != VK_SUCCESS)
1715       return result;
1716 
1717    result = prepare_vs(cmdbuf);
1718    if (result != VK_SUCCESS)
1719       return result;
1720 
1721    result = prepare_fs(cmdbuf);
1722    if (result != VK_SUCCESS)
1723       return result;
1724 
1725    /* Assumes 16-byte slots. We could do better. */
1726    uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
1727 
1728    cs_update_vt_ctx(b) {
1729       /* We don't use the resource dep system yet. */
1730       cs_move32_to(b, cs_sr_reg32(b, 38), 0);
1731 
1732       prepare_index_buffer(cmdbuf, draw);
1733 
1734       set_tiler_idvs_flags(b, cmdbuf, draw);
1735 
1736       cs_move32_to(b, cs_sr_reg32(b, 48), varying_size);
1737 
1738       result = prepare_ds(cmdbuf);
1739       if (result != VK_SUCCESS)
1740          return result;
1741 
1742       result = prepare_oq(cmdbuf);
1743       if (result != VK_SUCCESS)
1744          return result;
1745 
1746       prepare_dcd(cmdbuf);
1747       prepare_vp(cmdbuf);
1748       prepare_tiler_primitive_size(cmdbuf);
1749    }
1750 
1751    clear_dirty_after_draw(cmdbuf);
1752    return VK_SUCCESS;
1753 }
1754 
1755 static void
1756 panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1757 {
1758    const struct cs_tracing_ctx *tracing_ctx =
1759       &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1760    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1761    struct cs_builder *b =
1762       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1763    VkResult result;
1764 
1765    /* If there's no vertex shader, we can skip the draw. */
1766    if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1767       return;
1768 
1769    /* Needs to be done before get_fs() is called because it depends on
1770     * fs.required being initialized. */
1771    cmdbuf->state.gfx.fs.required =
1772       fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1773 
1774    if (!cmdbuf->vk.dynamic_graphics_state.rs.rasterizer_discard_enable) {
1775       struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1776       uint32_t rasterization_samples =
1777          cmdbuf->vk.dynamic_graphics_state.ms.rasterization_samples;
1778 
1779       /* If there's no attachment, we patch nr_samples to match
1780        * rasterization_samples; otherwise, we assert that the two values
1781        * match. */
1782       if (!cmdbuf->state.gfx.render.bound_attachments) {
1783          assert(rasterization_samples > 0);
1784          fbinfo->nr_samples = rasterization_samples;
1785       } else {
1786          assert(rasterization_samples == fbinfo->nr_samples);
1787       }
1788    }
1789 
1790    result = prepare_draw(cmdbuf, draw);
1791    if (result != VK_SUCCESS)
1792       return;
1793 
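   /* Fill the draw parameter registers: SR33/SR34 take the vertex and instance
    * counts, SR35 the index offset, SR36 the base vertex, and SR37 the base
    * instance (kept at zero, see the comment below). */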
1794    cs_update_vt_ctx(b) {
1795       cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1796       cs_move32_to(b, cs_sr_reg32(b, 33), draw->vertex.count);
1797       cs_move32_to(b, cs_sr_reg32(b, 34), draw->instance.count);
1798       cs_move32_to(b, cs_sr_reg32(b, 35), draw->index.offset);
1799       cs_move32_to(b, cs_sr_reg32(b, 36), draw->vertex.base);
1800       /* NIR expects a zero-based instance ID, and even if it had an intrinsic
1801        * to load the absolute instance ID, we'd want to keep it zero-based to
1802        * work around Mali's limitation on non-zero firstInstance when an
1803        * instance divisor is used. */
1804       cs_move32_to(b, cs_sr_reg32(b, 37), 0);
1805    }
1806 
1807    struct mali_primitive_flags_packed flags_override =
1808       get_tiler_flags_override(draw);
1809 
1810    uint32_t idvs_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1811                                       MAX_LAYERS_PER_TILER_DESC);
1812 
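   /* Each tiler descriptor covers at most MAX_LAYERS_PER_TILER_DESC layers, so
    * layered rendering may need multiple IDVS runs, advancing the tiler
    * context pointer between iterations and restoring it afterwards. */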
1813    cs_req_res(b, CS_IDVS_RES);
1814    if (idvs_count > 1) {
1815       struct cs_index counter_reg = cs_scratch_reg32(b, 17);
1816       struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
1817 
1818       cs_move32_to(b, counter_reg, idvs_count);
1819 
1820       cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
1821          cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1822                            flags_override.opaque[0], false, true,
1823                            cs_shader_res_sel(0, 0, 1, 0),
1824                            cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1825 
1826          cs_add32(b, counter_reg, counter_reg, -1);
1827          cs_update_vt_ctx(b) {
1828             cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1829                      pan_size(TILER_CONTEXT));
1830          }
1831       }
1832 
1833       cs_update_vt_ctx(b) {
1834          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1835                   -(idvs_count * pan_size(TILER_CONTEXT)));
1836       }
1837    } else {
1838       cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1839                         flags_override.opaque[0], false, true,
1840                         cs_shader_res_sel(0, 0, 1, 0),
1841                         cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1842    }
1843    cs_req_res(b, 0);
1844 }
1845 
1846 VkResult
1847 panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(
1848    struct panvk_cmd_buffer *primary,
1849    struct panvk_cmd_buffer *secondary)
1850 {
1851    if (!(secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1852       return VK_SUCCESS;
1853 
1854    if (!inherits_render_ctx(primary)) {
1855       VkResult result = get_render_ctx(primary);
1856       if (result != VK_SUCCESS)
1857          return result;
1858    }
1859 
1860    return prepare_oq(primary);
1861 }
1862 
1863 VKAPI_ATTR void VKAPI_CALL
1864 panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount,
1865                         uint32_t instanceCount, uint32_t firstVertex,
1866                         uint32_t firstInstance)
1867 {
1868    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1869 
1870    if (instanceCount == 0 || vertexCount == 0)
1871       return;
1872 
1873    /* gl_BaseVertexARB is a signed integer, and it should expose the value of
1874     * firstVertex in a non-indexed draw. */
1875    assert(firstVertex < INT32_MAX);
1876 
1877    /* gl_BaseInstance is a signed integer, and it should expose the value of
1878     * firstInstance. */
1879    assert(firstInstance < INT32_MAX);
1880 
1881    struct panvk_draw_info draw = {
1882       .vertex.base = firstVertex,
1883       .vertex.count = vertexCount,
1884       .instance.base = firstInstance,
1885       .instance.count = instanceCount,
1886    };
1887 
1888    panvk_cmd_draw(cmdbuf, &draw);
1889 }
1890 
1891 VKAPI_ATTR void VKAPI_CALL
1892 panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer,
1893                                uint32_t indexCount, uint32_t instanceCount,
1894                                uint32_t firstIndex, int32_t vertexOffset,
1895                                uint32_t firstInstance)
1896 {
1897    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1898 
1899    if (instanceCount == 0 || indexCount == 0)
1900       return;
1901 
1902    /* gl_BaseInstance is a signed integer, and it should expose the value of
1903     * firstInstance. */
1904    assert(firstInstance < INT32_MAX);
1905 
1906    struct panvk_draw_info draw = {
1907       .index.size = cmdbuf->state.gfx.ib.index_size,
1908       .index.offset = firstIndex,
1909       .vertex.base = vertexOffset,
1910       .vertex.count = indexCount,
1911       .instance.count = instanceCount,
1912       .instance.base = firstInstance,
1913    };
1914 
1915    panvk_cmd_draw(cmdbuf, &draw);
1916 }
1917 
1918 static void
1919 panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
1920                         struct panvk_draw_info *draw)
1921 {
1922    const struct cs_tracing_ctx *tracing_ctx =
1923       &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1924    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1925    struct cs_builder *b =
1926       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1927    VkResult result;
1928 
1929    /* If there's no vertex shader, we can skip the draw. */
1930    if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1931       return;
1932 
1933    /* Needs to be done before get_fs() is called because it depends on
1934     * fs.required being initialized. */
1935    cmdbuf->state.gfx.fs.required =
1936       fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1937 
1938    /* Layered indirect draw (VK_EXT_shader_viewport_index_layer) needs
1939     * additional changes. We allow layer_count == 0 because that happens
1940     * when mixing dynamic rendering and secondary command buffers. Once
1941     * we decide to support layered+indirect, we'll need to pass the
1942     * layer_count info through the tiler descriptor, for instance by
1943     * re-using one of the words flagged 'ignored' in the descriptor
1944     * (words 14:23).
1945     *
1946     * Multiview is limited to 8 layers, and so will always fit in one TD.
1947     * Therefore layered rendering is allowed with multiview. */
1948    assert(cmdbuf->state.gfx.render.layer_count <= 1 ||
1949           cmdbuf->state.gfx.render.view_mask);
1950 
1951    /* MultiDrawIndirect (.maxDrawIndirectCount) needs additional changes. */
1952    assert(draw->indirect.draw_count == 1);
1953 
1954    /* Force a new push uniform block to be allocated */
1955    gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
1956 
1957    result = prepare_draw(cmdbuf, draw);
1958    if (result != VK_SUCCESS)
1959       return;
1960 
1961    struct cs_index draw_params_addr = cs_scratch_reg64(b, 0);
1962    cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr);
1963 
1964    cs_update_vt_ctx(b) {
1965       cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1966       /* Load SR33-37 from indirect buffer. */
1967       unsigned reg_mask = draw->index.size ? 0b11111 : 0b11011;
1968       cs_load_to(b, cs_sr_reg_tuple(b, 33, 5), draw_params_addr, reg_mask, 0);
1969    }
1970 
1971    /* Wait for the SR33-37 indirect buffer load. */
1972    cs_wait_slot(b, SB_ID(LS), false);
1973 
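   /* If the vertex shader reads the first-vertex/base-instance sysvals, patch
    * the corresponding words of the push-uniform block with the values just
    * loaded from the indirect buffer (SR36/SR37). */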
1974    if (shader_uses_sysval(vs, graphics, vs.first_vertex) ||
1975        shader_uses_sysval(vs, graphics, vs.base_instance)) {
1976       struct cs_index fau_block_addr = cs_scratch_reg64(b, 2);
1977       cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.vs.push_uniforms);
1978 
1979       if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
1980          cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr,
1981                     shader_remapped_sysval_offset(
1982                        vs, sysval_offset(graphics, vs.first_vertex)));
1983       }
1984 
1985       if (shader_uses_sysval(vs, graphics, vs.base_instance)) {
1986          cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr,
1987                     shader_remapped_sysval_offset(
1988                        vs, sysval_offset(graphics, vs.base_instance)));
1989       }
1990 
1991       /* Wait for the store using SR-37 as src to finish, so we can overwrite
1992        * it. */
1993       cs_wait_slot(b, SB_ID(LS), false);
1994    }
1995 
1996    /* NIR expects a zero-based instance ID, and even if it had an intrinsic
1997     * to load the absolute instance ID, we'd want to keep it zero-based to
1998     * work around Mali's limitation on non-zero firstInstance when an
1999     * instance divisor is used. */
2000    cs_update_vt_ctx(b)
2001       cs_move32_to(b, cs_sr_reg32(b, 37), 0);
2002 
2003    struct mali_primitive_flags_packed flags_override =
2004       get_tiler_flags_override(draw);
2005 
2006    cs_req_res(b, CS_IDVS_RES);
2007    cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2008                      flags_override.opaque[0], false, true,
2009                      cs_shader_res_sel(0, 0, 1, 0),
2010                      cs_shader_res_sel(2, 2, 2, 0), cs_undef());
2011    cs_req_res(b, 0);
2012 }
2013 
2014 VKAPI_ATTR void VKAPI_CALL
2015 panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer,
2016                                 VkDeviceSize offset, uint32_t drawCount,
2017                                 uint32_t stride)
2018 {
2019    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2020    VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
2021 
2022    if (drawCount == 0)
2023       return;
2024 
2025    struct panvk_draw_info draw = {
2026       .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
2027       .indirect.draw_count = drawCount,
2028       .indirect.stride = stride,
2029    };
2030 
2031    panvk_cmd_draw_indirect(cmdbuf, &draw);
2032 }
2033 
2034 VKAPI_ATTR void VKAPI_CALL
2035 panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer,
2036                                        VkBuffer _buffer, VkDeviceSize offset,
2037                                        uint32_t drawCount, uint32_t stride)
2038 {
2039    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2040    VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
2041 
2042    if (drawCount == 0)
2043       return;
2044 
2045    struct panvk_draw_info draw = {
2046       .index.size = cmdbuf->state.gfx.ib.index_size,
2047       .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
2048       .indirect.draw_count = drawCount,
2049       .indirect.stride = stride,
2050    };
2051 
2052    panvk_cmd_draw_indirect(cmdbuf, &draw);
2053 }
2054 
2055 void
2056 panvk_per_arch(cmd_inherit_render_state)(
2057    struct panvk_cmd_buffer *cmdbuf,
2058    const VkCommandBufferBeginInfo *pBeginInfo)
2059 {
2060    if (cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
2061        !(pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
2062       return;
2063 
2064    assert(pBeginInfo->pInheritanceInfo);
2065    char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
2066    const VkRenderingInfo *resume_info =
2067       vk_get_command_buffer_inheritance_as_rendering_resume(cmdbuf->vk.level,
2068                                                             pBeginInfo,
2069                                                             gcbiar_data);
2070    if (resume_info) {
2071       panvk_per_arch(cmd_init_render_state)(cmdbuf, resume_info);
2072       return;
2073    }
2074 
2075    const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
2076       vk_get_command_buffer_inheritance_rendering_info(cmdbuf->vk.level,
2077                                                        pBeginInfo);
2078    assert(inheritance_info);
2079    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2080    struct panvk_physical_device *phys_dev =
2081       to_panvk_physical_device(dev->vk.physical);
2082    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2083 
2084    cmdbuf->state.gfx.render.suspended = false;
2085    cmdbuf->state.gfx.render.flags = inheritance_info->flags;
2086 
2087    gfx_state_set_dirty(cmdbuf, RENDER_STATE);
2088    memset(cmdbuf->state.gfx.render.fb.crc_valid, 0,
2089           sizeof(cmdbuf->state.gfx.render.fb.crc_valid));
2090    memset(&cmdbuf->state.gfx.render.color_attachments, 0,
2091           sizeof(cmdbuf->state.gfx.render.color_attachments));
2092    memset(&cmdbuf->state.gfx.render.z_attachment, 0,
2093           sizeof(cmdbuf->state.gfx.render.z_attachment));
2094    memset(&cmdbuf->state.gfx.render.s_attachment, 0,
2095           sizeof(cmdbuf->state.gfx.render.s_attachment));
2096    cmdbuf->state.gfx.render.bound_attachments = 0;
2097 
2098    cmdbuf->state.gfx.render.view_mask = inheritance_info->viewMask;
2099    cmdbuf->state.gfx.render.layer_count = inheritance_info->viewMask ?
2100       util_last_bit(inheritance_info->viewMask) :
2101       0;
2102    *fbinfo = (struct pan_fb_info){
2103       .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
2104       .nr_samples = inheritance_info->rasterizationSamples,
2105       .rt_count = inheritance_info->colorAttachmentCount,
2106    };
2107 
2108    assert(inheritance_info->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts));
2109 
2110    for (uint32_t i = 0; i < inheritance_info->colorAttachmentCount; i++) {
2111       cmdbuf->state.gfx.render.bound_attachments |=
2112          MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
2113       cmdbuf->state.gfx.render.color_attachments.fmts[i] =
2114          inheritance_info->pColorAttachmentFormats[i];
2115       cmdbuf->state.gfx.render.color_attachments.samples[i] =
2116          inheritance_info->rasterizationSamples;
2117    }
2118 
2119    if (inheritance_info->depthAttachmentFormat) {
2120       cmdbuf->state.gfx.render.bound_attachments |=
2121          MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2122       cmdbuf->state.gfx.render.z_attachment.fmt =
2123          inheritance_info->depthAttachmentFormat;
2124    }
2125 
2126    if (inheritance_info->stencilAttachmentFormat) {
2127       cmdbuf->state.gfx.render.bound_attachments |=
2128          MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2129       cmdbuf->state.gfx.render.s_attachment.fmt =
2130          inheritance_info->stencilAttachmentFormat;
2131    }
2132 
2133    const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
2134       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
2135       .colorAttachmentCount = inheritance_info->colorAttachmentCount,
2136    };
2137    const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
2138       vk_get_command_buffer_rendering_attachment_location_info(
2139          cmdbuf->vk.level, pBeginInfo);
2140    if (att_loc_info == NULL)
2141       att_loc_info = &att_loc_info_default;
2142 
2143    vk_cmd_set_rendering_attachment_locations(&cmdbuf->vk, att_loc_info);
2144 }
2145 
2146 VKAPI_ATTR void VKAPI_CALL
2147 panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer,
2148                                   const VkRenderingInfo *pRenderingInfo)
2149 {
2150    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2151    struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
2152    bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
2153 
2154    panvk_per_arch(cmd_init_render_state)(cmdbuf, pRenderingInfo);
2155 
2156    /* If we're not resuming, the FBD should be NULL. */
2157    assert(!state->render.fbds.gpu || resuming);
2158 
2159    if (!resuming)
2160       panvk_per_arch(cmd_preload_render_area_border)(cmdbuf, pRenderingInfo);
2161 }
2162 
2163 static void
2164 flush_tiling(struct panvk_cmd_buffer *cmdbuf)
2165 {
2166    struct cs_builder *b =
2167       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
2168 
2169    struct cs_index render_ctx = cs_scratch_reg64(b, 2);
2170 
2171    if (cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) {
2172       /* Flush the tiling operations and signal the internal sync object. */
2173       cs_req_res(b, CS_TILER_RES);
2174       cs_finish_tiling(b, false);
2175       cs_req_res(b, 0);
2176 
2177       struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2178       struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2179       struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2180       struct cs_index add_val = cs_scratch_reg64(b, 4);
2181 
2182       cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2183                  BITFIELD_MASK(3),
2184                  offsetof(struct panvk_cs_subqueue_context, syncobjs));
2185       cs_wait_slot(b, SB_ID(LS), false);
2186 
2187       /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to
2188        * skip an ADD operation on the syncobjs pointer. */
2189       STATIC_ASSERT(PANVK_SUBQUEUE_VERTEX_TILER == 0);
2190 
2191       cs_move64_to(b, add_val, 1);
2192 
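      /* The heap-completed notification and the syncobj increment are deferred
       * on the scoreboard slot tracking this iteration, so they only execute
       * once the corresponding tiling work has finished. */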
2193       cs_match(b, iter_sb, cmp_scratch) {
2194 #define CASE(x)                                                                \
2195          cs_case(b, x) {                                                       \
2196             cs_heap_operation(b,                                               \
2197                               MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED,   \
2198                               cs_defer(SB_WAIT_ITER(x),                        \
2199                                        SB_ID(DEFERRED_SYNC)));                 \
2200             cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG,                     \
2201                           add_val, sync_addr,                                  \
2202                           cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));    \
2203             cs_move32_to(b, iter_sb, next_iter_sb(x));                         \
2204          }
2205 
2206          CASE(0)
2207          CASE(1)
2208          CASE(2)
2209          CASE(3)
2210          CASE(4)
2211 #undef CASE
2212       }
2213 
2214       cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2215                  offsetof(struct panvk_cs_subqueue_context, iter_sb));
2216       cs_wait_slot(b, SB_ID(LS), false);
2217 
2218       /* Update the vertex seqno. */
2219       ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2220    } else {
2221       cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b),
2222                    offsetof(struct panvk_cs_subqueue_context, render));
2223       cs_wait_slot(b, SB_ID(LS), false);
2224    }
2225 }
2226 
2227 static void
2228 wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
2229 {
2230    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2231    struct cs_index vt_sync_addr = cs_scratch_reg64(b, 0);
2232    struct cs_index vt_sync_point = cs_scratch_reg64(b, 2);
2233    uint64_t rel_vt_sync_point =
2234       cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2235 
2236    cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b),
2237                 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2238    cs_wait_slot(b, SB_ID(LS), false);
2239 
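   /* Compute the absolute sync point (per-subqueue progress seqno plus the
    * command-buffer-relative point) and wait for the vertex/tiler subqueue to
    * reach it. */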
2240    cs_add64(b, vt_sync_point,
2241             cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER),
2242             rel_vt_sync_point);
2243    cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, vt_sync_point,
2244                   vt_sync_addr);
2245 }
2246 
2247 static uint32_t
2248 calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
2249 {
2250    const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
2251    bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
2252    uint32_t rt_count = MAX2(fb->rt_count, 1);
2253 
2254    return get_tiler_oom_handler_idx(has_zs_ext, rt_count);
2255 }
2256 
2257 static void
2258 setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
2259 {
2260    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2261 
2262    uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2263                                     MAX_LAYERS_PER_TILER_DESC);
2264    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2265    uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
2266 
2267    struct cs_index counter = cs_scratch_reg32(b, 1);
2268    cs_move32_to(b, counter, 0);
2269    cs_store32(b, counter, cs_subqueue_ctx_reg(b),
2270               TILER_OOM_CTX_FIELD_OFFSET(counter));
2271 
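   /* Record the per-pass (first/middle/last) incremental-render FBD arrays in
    * the tiler OOM context, each at its (1 + pass) * fbd_ir_pass_offset
    * offset from the FBD base held in SR40. */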
2272    struct cs_index fbd_first = cs_scratch_reg64(b, 2);
2273    cs_add64(b, fbd_first, cs_sr_reg64(b, 40),
2274             (1 + PANVK_IR_FIRST_PASS) * fbd_ir_pass_offset);
2275    cs_store64(b, fbd_first, cs_subqueue_ctx_reg(b),
2276               TILER_OOM_CTX_FBDPTR_OFFSET(FIRST));
2277    struct cs_index fbd_middle = cs_scratch_reg64(b, 4);
2278    cs_add64(b, fbd_middle, cs_sr_reg64(b, 40),
2279             (1 + PANVK_IR_MIDDLE_PASS) * fbd_ir_pass_offset);
2280    cs_store64(b, fbd_middle, cs_subqueue_ctx_reg(b),
2281               TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE));
2282    struct cs_index fbd_last = cs_scratch_reg64(b, 6);
2283    cs_add64(b, fbd_last, cs_sr_reg64(b, 40),
2284             (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2285    cs_store64(b, fbd_last, cs_subqueue_ctx_reg(b),
2286               TILER_OOM_CTX_FBDPTR_OFFSET(LAST));
2287 
2288    struct cs_index td_count_reg = cs_scratch_reg32(b, 8);
2289    cs_move32_to(b, td_count_reg, td_count);
2290    cs_store32(b, td_count_reg, cs_subqueue_ctx_reg(b),
2291               TILER_OOM_CTX_FIELD_OFFSET(td_count));
2292    struct cs_index layer_count = cs_scratch_reg32(b, 9);
2293    cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
2294    cs_store32(b, layer_count, cs_subqueue_ctx_reg(b),
2295               TILER_OOM_CTX_FIELD_OFFSET(layer_count));
2296 
2297    cs_wait_slot(b, SB_ID(LS), false);
2298 }
2299 
2300 static VkResult
2301 issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
2302 {
2303    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2304    struct panvk_instance *instance =
2305       to_panvk_instance(dev->vk.physical->instance);
2306    const struct cs_tracing_ctx *tracing_ctx =
2307       &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
2308    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2309    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2310    bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
2311 
2312    /* Reserve a scoreboard for the fragment job. */
2313    panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2314 
2315    /* Now initialize the fragment bits. */
2316    cs_update_frag_ctx(b) {
2317       cs_move32_to(b, cs_sr_reg32(b, 42),
2318                    (fbinfo->extent.miny << 16) | fbinfo->extent.minx);
2319       cs_move32_to(b, cs_sr_reg32(b, 43),
2320                    (fbinfo->extent.maxy << 16) | fbinfo->extent.maxx);
2321    }
2322 
2323    bool simul_use =
2324       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2325 
2326    /* The only bit we patch in FBDs is the tiler pointer. If the tiler is
2327     * not involved (clear job) or the update can happen in place (the
2328     * command buffer is not in simultaneous use), we can avoid the
2329     * copy. */
2330    bool needs_tiling =
2331       cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf);
2332 
2333    /* If the command buffer can run in parallel on different queues, we need
2334     * to make sure each instance has its own descriptors, unless tiling is
2335     * not needed (AKA RUN_FRAGMENT used for clears), because then the FBD
2336     * descriptors are constant (no need to patch them at runtime). */
2337    bool free_render_descs = simul_use && needs_tiling;
2338    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2339    uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
2340    uint32_t td_count = 0;
2341    if (needs_tiling) {
2342       td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2343                               MAX_LAYERS_PER_TILER_DESC);
2344    }
2345 
2346    /* Update the Tiler OOM context */
2347    setup_tiler_oom_ctx(cmdbuf);
2348 
2349    /* Enable the oom handler before waiting for the vertex/tiler work.
2350     * At this point, the tiler oom context has been set up with the correct
2351     * state for this renderpass, so it's safe to enable. */
2352    struct cs_index addr_reg = cs_scratch_reg64(b, 0);
2353    struct cs_index length_reg = cs_scratch_reg32(b, 2);
2354    uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
2355    uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
2356                            handler_idx * dev->tiler_oom.handler_stride;
2357    cs_move64_to(b, addr_reg, handler_addr);
2358    cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
2359    cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2360                             length_reg);
2361 
2362    /* Wait for the tiling to be done before submitting the fragment job. */
2363    wait_finish_tiling(cmdbuf);
2364 
2365    /* Disable the oom handler once the vertex/tiler work has finished.
2366     * We need to disable the handler at this point as the vertex/tiler subqueue
2367     * might continue on to the next renderpass and hit an out-of-memory
2368     * exception prior to the fragment subqueue setting up the tiler oom context
2369     * for the next renderpass.
2370     * By disabling the handler here, any exception will be left pending until a
2371     * new handler is registered, at which point the correct state has been set
2372     * up. */
2373    cs_move64_to(b, addr_reg, 0);
2374    cs_move32_to(b, length_reg, 0);
2375    cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2376                             length_reg);
2377 
2378    /* Pick the correct set of FBDs based on whether an incremental render
2379     * occurred. */
2380    struct cs_index counter = cs_scratch_reg32(b, 0);
2381    cs_load32_to(
2382       b, counter, cs_subqueue_ctx_reg(b),
2383       offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter));
2384    cs_wait_slot(b, SB_ID(LS), false);
2385    cs_if(b, MALI_CS_CONDITION_GREATER, counter)
2386       cs_update_frag_ctx(b)
2387          cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40),
2388                   (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2389 
2390    /* Applications tend to forget to describe subpass dependencies, especially
2391     * when it comes to write -> read dependencies on attachments. The
2392     * proprietary driver forces "others" invalidation as a workaround, and this
2393     * invalidation even became implicit (done as part of the RUN_FRAGMENT) on
2394     * v13+. We don't do that in panvk, but we provide a debug flag to help
2395     * identify those issues. */
2396    if (unlikely(instance->debug_flags & PANVK_DEBUG_IMPLICIT_OTHERS_INV)) {
2397       cs_flush_caches(b, 0, 0, true, length_reg,
2398                       cs_defer(0x0, SB_ID(IMM_FLUSH)));
2399       cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
2400    }
2401 
2402    cs_req_res(b, CS_FRAG_RES);
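   /* For layered rendering, issue one RUN_FRAGMENT per enabled layer,
    * advancing the FBD pointer (SR40) by one framebuffer descriptor per
    * iteration. */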
2403    if (cmdbuf->state.gfx.render.layer_count > 1) {
2404       struct cs_index layer_count = cs_sr_reg32(b, 47);
2405 
2406       cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
2407       cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
2408          cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2409                                false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2410 
2411          cs_add32(b, layer_count, layer_count, -1);
2412          cs_update_frag_ctx(b)
2413             cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), fbd_sz);
2414       }
2415    } else {
2416       cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2417                             false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2418    }
2419    cs_req_res(b, 0);
2420 
2421    struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2422    struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2423    struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2424    struct cs_index add_val = cs_scratch_reg64(b, 4);
2425    struct cs_index add_val_lo = cs_scratch_reg32(b, 4);
2426    struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6);
2427    struct cs_index release_sz = cs_scratch_reg32(b, 8);
2428 
2429    struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
2430    struct cs_index completed_top = cs_scratch_reg64(b, 10);
2431    struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
2432    struct cs_index cur_tiler = cs_sr_reg64(b, 38);
2433    struct cs_index tiler_count = cs_sr_reg32(b, 47);
2434    struct cs_index oq_chain = cs_scratch_reg64(b, 10);
2435    struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
2436    struct cs_index oq_chain_hi = cs_scratch_reg32(b, 11);
2437    struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);
2438 
2439    cs_move64_to(b, add_val, 1);
2440    cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2441               BITFIELD_MASK(3),
2442               offsetof(struct panvk_cs_subqueue_context, syncobjs));
2443 
2444    if (free_render_descs) {
2445       cs_move32_to(b, release_sz, calc_render_descs_size(cmdbuf));
2446       cs_load64_to(b, ringbuf_sync_addr, cs_subqueue_ctx_reg(b),
2447                    offsetof(struct panvk_cs_subqueue_context,
2448                             render.desc_ringbuf.syncobj));
2449    }
2450 
2451    cs_wait_slot(b, SB_ID(LS), false);
2452 
2453    cs_add64(b, sync_addr, sync_addr,
2454             PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
2455    cs_move32_to(b, tiler_count, td_count);
2456 
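   /* Per iteration-scoreboard slot: gather the completed ranges from each
    * tiler descriptor and finish the fragment work, release the render
    * descriptor ring-buffer space if needed, signal the occlusion query
    * syncobjs once the caches have been cleaned, and finally bump the
    * fragment syncobj. */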
2457    cs_match(b, iter_sb, cmp_scratch) {
2458 #define CASE(x)                                                                \
2459    cs_case(b, x) {                                                             \
2460       const struct cs_async_op async =                                         \
2461          cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC));                      \
2462       if (td_count == 1) {                                                     \
2463          cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40);            \
2464          cs_wait_slot(b, SB_ID(LS), false);                                    \
2465          cs_finish_fragment(b, true, completed_top, completed_bottom, async);  \
2466       } else if (td_count > 1) {                                               \
2467          cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) {                 \
2468             cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40);         \
2469             cs_wait_slot(b, SB_ID(LS), false);                                 \
2470             cs_finish_fragment(b, false, completed_top, completed_bottom,      \
2471                                async);                                         \
2472             cs_update_frag_ctx(b)                                              \
2473                cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));     \
2474             cs_add32(b, tiler_count, tiler_count, -1);                         \
2475          }                                                                     \
2476          cs_frag_end(b, async);                                                \
2477       }                                                                        \
2478       if (free_render_descs) {                                                 \
2479          cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz,            \
2480                        ringbuf_sync_addr, async);                              \
2481       }                                                                        \
2482       if (has_oq_chain) {                                                      \
2483          struct cs_index flush_id = oq_chain_lo;                               \
2484          cs_move32_to(b, flush_id, 0);                                         \
2485          cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN,                          \
2486                          MALI_CS_FLUSH_MODE_CLEAN, false, flush_id,            \
2487                          cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_FLUSH)));    \
2488          cs_load64_to(                                                         \
2489             b, oq_chain, cs_subqueue_ctx_reg(b),                               \
2490             offsetof(struct panvk_cs_subqueue_context, render.oq_chain));      \
2491          cs_wait_slot(b, SB_ID(LS), false);                                    \
2492          /* We use oq_syncobj as a placeholder to reset the oq_chain. */       \
2493          cs_move64_to(b, oq_syncobj, 0);                                       \
2494          cs_store64(                                                           \
2495             b, oq_syncobj, cs_subqueue_ctx_reg(b),                             \
2496             offsetof(struct panvk_cs_subqueue_context, render.oq_chain));      \
2497          cs_wait_slot(b, SB_ID(LS), false);                                    \
2498          cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) {                   \
2499             cs_load64_to(b, oq_syncobj, oq_chain,                              \
2500                          offsetof(struct panvk_cs_occlusion_query, syncobj));  \
2501             cs_wait_slot(b, SB_ID(LS), false);                                 \
2502             cs_load64_to(b, oq_chain, oq_chain,                                \
2503                          offsetof(struct panvk_cs_occlusion_query, next));     \
2504             cs_wait_slot(b, SB_ID(LS), false);                                 \
2505             cs_sync32_set(                                                     \
2506                b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj,        \
2507                cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC)));       \
2508             cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_lo)                    \
2509                cs_continue(b);                                                 \
2510             cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_hi)                    \
2511                cs_continue(b);                                                 \
2512             cs_break(b);                                                       \
2513          }                                                                     \
2514       }                                                                        \
2515       cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,       \
2516                     async);                                                    \
2517       cs_move32_to(b, iter_sb, next_iter_sb(x));                               \
2518    }
2519 
2520       CASE(0)
2521       CASE(1)
2522       CASE(2)
2523       CASE(3)
2524       CASE(4)
2525 #undef CASE
2526    }
2527 
2528    cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2529               offsetof(struct panvk_cs_subqueue_context, iter_sb));
2530    cs_wait_slot(b, SB_ID(LS), false);
2531 
2532    /* Update the ring buffer position. */
2533    if (free_render_descs) {
2534       cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf),
2535                                       !tracing_ctx->enabled);
2536    }
2537 
2538    /* Update the frag seqno. */
2539    ++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;
2540 
2542    return VK_SUCCESS;
2543 }
2544 
2545 void
2546 panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf)
2547 {
2548    /* If there was no draw queued, we don't need to force a preload. */
2549    if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2550       flush_tiling(cmdbuf);
2551       issue_fragment_jobs(cmdbuf);
2552       memset(&cmdbuf->state.gfx.render.fbds, 0,
2553              sizeof(cmdbuf->state.gfx.render.fbds));
2554       cmdbuf->state.gfx.render.tiler = 0;
2555 
2556       panvk_per_arch(cmd_force_fb_preload)(cmdbuf, NULL);
2557 
2558       /* We inherited the render context, and need to let the primary command
2559        * buffer know that it's changed. */
2560       cmdbuf->state.gfx.render.invalidate_inherited_ctx = true;
2561 
2562       /* Re-emit the FB/Tiler descs if we inherited them. */
2563       if (inherits_render_ctx(cmdbuf))
2564          get_render_ctx(cmdbuf);
2565    }
2566 }
2567 
2568 VKAPI_ATTR void VKAPI_CALL
2569 panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer)
2570 {
2571    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2572    bool suspending = cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT;
2573    VkResult result;
2574 
2575    if (!suspending) {
2576       struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2577       bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s;
2578       for (unsigned i = 0; i < fbinfo->rt_count; i++)
2579          clear |= fbinfo->rts[i].clear;
2580 
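      /* Even if no draw was recorded, a pending clear still needs FB
       * descriptors so the fragment job issued below can perform it. */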
2581       if (clear && !inherits_render_ctx(cmdbuf)) {
2582          result = get_fb_descs(cmdbuf);
2583          if (result != VK_SUCCESS)
2584             return;
2585       }
2586 
2587       /* Flush the last occlusion query before ending the render pass if
2588        * this query has ended while we were inside the render pass. */
2589       if (cmdbuf->state.gfx.render.oq.last !=
2590           cmdbuf->state.gfx.occlusion_query.syncobj) {
2591          result = wrap_prev_oq(cmdbuf);
2592          if (result != VK_SUCCESS)
2593             return;
2594       }
2595 
2596       if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2597          flush_tiling(cmdbuf);
2598          issue_fragment_jobs(cmdbuf);
2599       }
2600    } else if (!inherits_render_ctx(cmdbuf)) {
2601       /* If we're suspending the render pass and we didn't inherit the render
2602        * context, we need to emit it now, so it's available when the render pass
2603        * is resumed. */
2604       VkResult result = get_render_ctx(cmdbuf);
2605       if (result != VK_SUCCESS)
2606          return;
2607    }
2608 
2609    memset(&cmdbuf->state.gfx.render.fbds, 0,
2610           sizeof(cmdbuf->state.gfx.render.fbds));
2611    memset(&cmdbuf->state.gfx.render.oq, 0, sizeof(cmdbuf->state.gfx.render.oq));
2612    cmdbuf->state.gfx.render.tiler = 0;
2613 
2614    /* If we're finished with this render pass, make sure we reset the flags
2615     * so any barrier encountered after EndRendering() doesn't try to flush
2616     * draws. */
2617    cmdbuf->state.gfx.render.flags = 0;
2618    cmdbuf->state.gfx.render.suspended = suspending;
2619 
2620    /* If we're not suspending, we need to resolve attachments. */
2621    if (!suspending)
2622       panvk_per_arch(cmd_resolve_attachments)(cmdbuf);
2623 }
2624