1 /*
2  * Copyright © 2024 Collabora Ltd.
3  * Copyright © 2024 Arm Ltd.
4  *
5  * Derived from tu_cmd_buffer.c which is:
6  * Copyright © 2016 Red Hat.
7  * Copyright © 2016 Bas Nieuwenhuizen
8  * Copyright © 2015 Intel Corporation
9  *
10  * SPDX-License-Identifier: MIT
11  */
12 
13 #include <stdint.h>
14 #include "genxml/gen_macros.h"
15 
16 #include "panvk_buffer.h"
17 #include "panvk_cmd_alloc.h"
18 #include "panvk_cmd_buffer.h"
19 #include "panvk_cmd_desc_state.h"
20 #include "panvk_cmd_draw.h"
21 #include "panvk_cmd_fb_preload.h"
22 #include "panvk_cmd_meta.h"
23 #include "panvk_device.h"
24 #include "panvk_entrypoints.h"
25 #include "panvk_image.h"
26 #include "panvk_image_view.h"
27 #include "panvk_instance.h"
28 #include "panvk_priv_bo.h"
29 #include "panvk_shader.h"
30 
31 #include "pan_desc.h"
32 #include "pan_earlyzs.h"
33 #include "pan_encoder.h"
34 #include "pan_format.h"
35 #include "pan_jc.h"
36 #include "pan_props.h"
37 #include "pan_samples.h"
38 #include "pan_shader.h"
39 
40 #include "util/bitscan.h"
41 #include "vk_format.h"
42 #include "vk_meta.h"
43 #include "vk_pipeline_layout.h"
44 #include "vk_render_pass.h"
45 
46 static void
47 emit_vs_attrib(const struct vk_vertex_attribute_state *attrib_info,
48                const struct vk_vertex_binding_state *buf_info,
49                const struct panvk_attrib_buf *buf, uint32_t vb_desc_offset,
50                struct mali_attribute_packed *desc)
51 {
52    bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE;
53    enum pipe_format f = vk_format_to_pipe_format(attrib_info->format);
54    unsigned buf_idx = vb_desc_offset + attrib_info->binding;
55 
56    pan_pack(desc, ATTRIBUTE, cfg) {
57       cfg.offset = attrib_info->offset;
58       cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
59       cfg.table = 0;
60       cfg.buffer_index = buf_idx;
61       cfg.stride = buf_info->stride;
62       if (!per_instance) {
63          /* Per-vertex */
64          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
65          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
66          cfg.offset_enable = true;
67       } else if (buf_info->divisor == 1) {
68          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
69          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
70       } else if (buf_info->divisor == 0) {
71          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
72          /* HW doesn't support a zero divisor, but we can achieve the same by
73           * not using a divisor and setting the stride to zero */
74          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
75          cfg.stride = 0;
76       } else if (util_is_power_of_two_or_zero(buf_info->divisor)) {
77          /* Per-instance, POT divisor */
78          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
79          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
80          cfg.divisor_r = __builtin_ctz(buf_info->divisor);
81       } else {
82          /* Per-instance, NPOT divisor */
83          cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
84          cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
85          cfg.divisor_d = panfrost_compute_magic_divisor(
86             buf_info->divisor, &cfg.divisor_r, &cfg.divisor_e);
87       }
88    }
89 }
90 
91 static bool
92 vs_driver_set_is_dirty(struct panvk_cmd_buffer *cmdbuf)
93 {
94    return dyn_gfx_state_dirty(cmdbuf, VI) ||
95           dyn_gfx_state_dirty(cmdbuf, VI_BINDINGS_VALID) ||
96           dyn_gfx_state_dirty(cmdbuf, VI_BINDING_STRIDES) ||
97           gfx_state_dirty(cmdbuf, VB) || gfx_state_dirty(cmdbuf, VS) ||
98           gfx_state_dirty(cmdbuf, DESC_STATE);
99 }
100 
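/* The VS driver set is laid out as follows: MAX_VS_ATTRIBS attribute
 * descriptors, one dummy sampler, the dynamic buffer descriptors, and one
 * BUFFER descriptor per vertex buffer binding. */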
101 static VkResult
102 prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
103 {
104    if (!vs_driver_set_is_dirty(cmdbuf))
105       return VK_SUCCESS;
106 
107    struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
108    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
109    const struct vk_vertex_input_state *vi =
110       cmdbuf->vk.dynamic_graphics_state.vi;
111    uint32_t vb_count = 0;
112 
113    u_foreach_bit(i, vi->attributes_valid)
114       vb_count = MAX2(vi->attributes[i].binding + 1, vb_count);
115 
116    uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1;
117    uint32_t desc_count = vb_offset + vb_count;
118    const struct panvk_descriptor_state *desc_state =
119       &cmdbuf->state.gfx.desc_state;
120    struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
121       cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
122    struct panvk_opaque_desc *descs = driver_set.cpu;
123 
124    if (!driver_set.gpu)
125       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
126 
127    for (uint32_t i = 0; i < MAX_VS_ATTRIBS; i++) {
128       if (vi->attributes_valid & BITFIELD_BIT(i)) {
129          unsigned binding = vi->attributes[i].binding;
130 
131          emit_vs_attrib(&vi->attributes[i], &vi->bindings[binding],
132                         &cmdbuf->state.gfx.vb.bufs[binding], vb_offset,
133                         (struct mali_attribute_packed *)(&descs[i]));
134       } else {
135          memset(&descs[i], 0, sizeof(descs[0]));
136       }
137    }
138 
139    /* Dummy sampler always comes right after the vertex attribs. */
140    pan_cast_and_pack(&descs[MAX_VS_ATTRIBS], SAMPLER, cfg) {
141       cfg.clamp_integer_array_indices = false;
142    }
143 
144    panvk_per_arch(cmd_fill_dyn_bufs)(
145       desc_state, vs,
146       (struct mali_buffer_packed *)(&descs[MAX_VS_ATTRIBS + 1]));
147 
148    for (uint32_t i = 0; i < vb_count; i++) {
149       const struct panvk_attrib_buf *vb = &cmdbuf->state.gfx.vb.bufs[i];
150 
151       pan_cast_and_pack(&descs[vb_offset + i], BUFFER, cfg) {
152          if (vi->bindings_valid & BITFIELD_BIT(i)) {
153             cfg.address = vb->address;
154             cfg.size = vb->size;
155          } else {
156             cfg.address = 0;
157             cfg.size = 0;
158          }
159       }
160    }
161 
162    vs_desc_state->driver_set.dev_addr = driver_set.gpu;
163    vs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
164    gfx_state_set_dirty(cmdbuf, DESC_STATE);
165    return VK_SUCCESS;
166 }
167 
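/* The FS driver set only contains the dummy sampler followed by the
 * dynamic buffer descriptors. */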
168 static VkResult
169 prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
170 {
171    struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
172    const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
173    const struct panvk_descriptor_state *desc_state =
174       &cmdbuf->state.gfx.desc_state;
175    uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1;
176    struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
177       cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
178    struct panvk_opaque_desc *descs = driver_set.cpu;
179 
180    if (desc_count && !driver_set.gpu)
181       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
182 
183    /* Dummy sampler always comes first. */
184    pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
185       cfg.clamp_integer_array_indices = false;
186    }
187 
188    panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs,
189                                      (struct mali_buffer_packed *)(&descs[1]));
190 
191    fs_desc_state->driver_set.dev_addr = driver_set.gpu;
192    fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
193    gfx_state_set_dirty(cmdbuf, DESC_STATE);
194    return VK_SUCCESS;
195 }
196 
197 static bool
198 has_depth_att(struct panvk_cmd_buffer *cmdbuf)
199 {
200    return (cmdbuf->state.gfx.render.bound_attachments &
201            MESA_VK_RP_ATTACHMENT_DEPTH_BIT) != 0;
202 }
203 
204 static bool
205 has_stencil_att(struct panvk_cmd_buffer *cmdbuf)
206 {
207    return (cmdbuf->state.gfx.render.bound_attachments &
208            MESA_VK_RP_ATTACHMENT_STENCIL_BIT) != 0;
209 }
210 
211 static bool
212 writes_depth(struct panvk_cmd_buffer *cmdbuf)
213 {
214    const struct vk_depth_stencil_state *ds =
215       &cmdbuf->vk.dynamic_graphics_state.ds;
216 
217    return has_depth_att(cmdbuf) && ds->depth.test_enable &&
218           ds->depth.write_enable && ds->depth.compare_op != VK_COMPARE_OP_NEVER;
219 }
220 
221 static bool
222 writes_stencil(struct panvk_cmd_buffer *cmdbuf)
223 {
224    const struct vk_depth_stencil_state *ds =
225       &cmdbuf->vk.dynamic_graphics_state.ds;
226 
227    return has_stencil_att(cmdbuf) && ds->stencil.test_enable &&
228           ((ds->stencil.front.write_mask &&
229             (ds->stencil.front.op.fail != VK_STENCIL_OP_KEEP ||
230              ds->stencil.front.op.pass != VK_STENCIL_OP_KEEP ||
231              ds->stencil.front.op.depth_fail != VK_STENCIL_OP_KEEP)) ||
232            (ds->stencil.back.write_mask &&
233             (ds->stencil.back.op.fail != VK_STENCIL_OP_KEEP ||
234              ds->stencil.back.op.pass != VK_STENCIL_OP_KEEP ||
235              ds->stencil.back.op.depth_fail != VK_STENCIL_OP_KEEP)));
236 }
237 
238 static bool
239 ds_test_always_passes(struct panvk_cmd_buffer *cmdbuf)
240 {
241    const struct vk_depth_stencil_state *ds =
242       &cmdbuf->vk.dynamic_graphics_state.ds;
243 
244    if (!has_depth_att(cmdbuf))
245       return true;
246 
247    if (ds->depth.test_enable && ds->depth.compare_op != VK_COMPARE_OP_ALWAYS)
248       return false;
249 
250    if (ds->stencil.test_enable &&
251        (ds->stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
252         ds->stencil.back.op.compare != VK_COMPARE_OP_ALWAYS))
253       return false;
254 
255    return true;
256 }
257 
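/* VkCompareOp and mali_func share the same encoding, so the translation is
 * a plain cast; the static asserts below catch any divergence. */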
258 static inline enum mali_func
259 translate_compare_func(VkCompareOp comp)
260 {
261    STATIC_ASSERT(VK_COMPARE_OP_NEVER == (VkCompareOp)MALI_FUNC_NEVER);
262    STATIC_ASSERT(VK_COMPARE_OP_LESS == (VkCompareOp)MALI_FUNC_LESS);
263    STATIC_ASSERT(VK_COMPARE_OP_EQUAL == (VkCompareOp)MALI_FUNC_EQUAL);
264    STATIC_ASSERT(VK_COMPARE_OP_LESS_OR_EQUAL == (VkCompareOp)MALI_FUNC_LEQUAL);
265    STATIC_ASSERT(VK_COMPARE_OP_GREATER == (VkCompareOp)MALI_FUNC_GREATER);
266    STATIC_ASSERT(VK_COMPARE_OP_NOT_EQUAL == (VkCompareOp)MALI_FUNC_NOT_EQUAL);
267    STATIC_ASSERT(VK_COMPARE_OP_GREATER_OR_EQUAL ==
268                  (VkCompareOp)MALI_FUNC_GEQUAL);
269    STATIC_ASSERT(VK_COMPARE_OP_ALWAYS == (VkCompareOp)MALI_FUNC_ALWAYS);
270 
271    return (enum mali_func)comp;
272 }
273 
274 static enum mali_stencil_op
275 translate_stencil_op(VkStencilOp in)
276 {
277    switch (in) {
278    case VK_STENCIL_OP_KEEP:
279       return MALI_STENCIL_OP_KEEP;
280    case VK_STENCIL_OP_ZERO:
281       return MALI_STENCIL_OP_ZERO;
282    case VK_STENCIL_OP_REPLACE:
283       return MALI_STENCIL_OP_REPLACE;
284    case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
285       return MALI_STENCIL_OP_INCR_SAT;
286    case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
287       return MALI_STENCIL_OP_DECR_SAT;
288    case VK_STENCIL_OP_INCREMENT_AND_WRAP:
289       return MALI_STENCIL_OP_INCR_WRAP;
290    case VK_STENCIL_OP_DECREMENT_AND_WRAP:
291       return MALI_STENCIL_OP_DECR_WRAP;
292    case VK_STENCIL_OP_INVERT:
293       return MALI_STENCIL_OP_INVERT;
294    default:
295       unreachable("Invalid stencil op");
296    }
297 }
298 
299 static enum mali_draw_mode
300 translate_prim_topology(VkPrimitiveTopology in)
301 {
302    /* Test VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA separately, as it's not
303     * part of the VkPrimitiveTopology enum.
304     */
305    if (in == VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA)
306       return MALI_DRAW_MODE_TRIANGLES;
307 
308    switch (in) {
309    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
310       return MALI_DRAW_MODE_POINTS;
311    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
312       return MALI_DRAW_MODE_LINES;
313    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
314       return MALI_DRAW_MODE_LINE_STRIP;
315    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
316       return MALI_DRAW_MODE_TRIANGLES;
317    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
318       return MALI_DRAW_MODE_TRIANGLE_STRIP;
319    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
320       return MALI_DRAW_MODE_TRIANGLE_FAN;
321    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
322    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
323    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
324    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
325    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
326    default:
327       unreachable("Invalid primitive type");
328    }
329 }
330 
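/* Lazily allocate the local storage descriptor on first use and keep track
 * of the largest TLS size required by the bound shaders. */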
331 static VkResult
332 update_tls(struct panvk_cmd_buffer *cmdbuf)
333 {
334    struct panvk_tls_state *state = &cmdbuf->state.tls;
335    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
336    const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
337    struct cs_builder *b =
338       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
339 
340    if (!cmdbuf->state.gfx.tsd) {
341       if (!state->desc.gpu) {
342          state->desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
343          if (!state->desc.gpu)
344             return VK_ERROR_OUT_OF_DEVICE_MEMORY;
345       }
346 
347       cmdbuf->state.gfx.tsd = state->desc.gpu;
348 
349       cs_update_vt_ctx(b)
350          cs_move64_to(b, cs_sr_reg64(b, 24), state->desc.gpu);
351    }
352 
353    state->info.tls.size =
354       MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, state->info.tls.size);
355    return VK_SUCCESS;
356 }
357 
358 static enum mali_index_type
359 index_size_to_index_type(uint32_t size)
360 {
361    switch (size) {
362    case 0:
363       return MALI_INDEX_TYPE_NONE;
364    case 1:
365       return MALI_INDEX_TYPE_UINT8;
366    case 2:
367       return MALI_INDEX_TYPE_UINT16;
368    case 4:
369       return MALI_INDEX_TYPE_UINT32;
370    default:
371       assert(!"Invalid index size");
372       return MALI_INDEX_TYPE_NONE;
373    }
374 }
375 
376 static VkResult
377 prepare_blend(struct panvk_cmd_buffer *cmdbuf)
378 {
379    bool dirty = dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
380                 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP_ENABLE) ||
381                 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP) ||
382                 dyn_gfx_state_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
383                 dyn_gfx_state_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
384                 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_ENABLES) ||
385                 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_EQUATIONS) ||
386                 dyn_gfx_state_dirty(cmdbuf, CB_WRITE_MASKS) ||
387                 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS) ||
388                 fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE);
389 
390    if (!dirty)
391       return VK_SUCCESS;
392 
393    const struct vk_dynamic_graphics_state *dyns =
394       &cmdbuf->vk.dynamic_graphics_state;
395    const struct vk_color_blend_state *cb = &dyns->cb;
396    unsigned bd_count = MAX2(cb->attachment_count, 1);
397    struct cs_builder *b =
398       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
399    struct panfrost_ptr ptr =
400       panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND);
401    struct mali_blend_packed *bds = ptr.cpu;
402 
403    if (bd_count && !ptr.gpu)
404       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
405 
406    panvk_per_arch(blend_emit_descs)(cmdbuf, bds);
407 
408    cs_update_vt_ctx(b)
409       cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count);
410 
411    return VK_SUCCESS;
412 }
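/* Merge the viewport and scissor rectangles into the hardware scissor box,
 * and update the depth clip range (min/max Z) from the viewport sysvals. */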
413 
414 static void
415 prepare_vp(struct panvk_cmd_buffer *cmdbuf)
416 {
417    struct cs_builder *b =
418       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
419    const VkViewport *viewport =
420       &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0];
421    const VkRect2D *scissor = &cmdbuf->vk.dynamic_graphics_state.vp.scissors[0];
422 
423    if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
424        dyn_gfx_state_dirty(cmdbuf, VP_SCISSORS)) {
425       struct mali_scissor_packed scissor_box;
426       pan_pack(&scissor_box, SCISSOR, cfg) {
427 
428          /* The spec says "width must be greater than 0.0" */
429          assert(viewport->width >= 0);
430          int minx = (int)viewport->x;
431          int maxx = (int)(viewport->x + viewport->width);
432 
433          /* Viewport height can be negative */
434          int miny =
435             MIN2((int)viewport->y, (int)(viewport->y + viewport->height));
436          int maxy =
437             MAX2((int)viewport->y, (int)(viewport->y + viewport->height));
438 
439          assert(scissor->offset.x >= 0 && scissor->offset.y >= 0);
440          minx = MAX2(scissor->offset.x, minx);
441          miny = MAX2(scissor->offset.y, miny);
442          maxx = MIN2(scissor->offset.x + scissor->extent.width, maxx);
443          maxy = MIN2(scissor->offset.y + scissor->extent.height, maxy);
444 
445          /* Make sure we don't end up with a max < min when width/height is 0 */
446          maxx = maxx > minx ? maxx - 1 : maxx;
447          maxy = maxy > miny ? maxy - 1 : maxy;
448 
449          /* Clamp viewport scissor to valid range */
450          cfg.scissor_minimum_x = CLAMP(minx, 0, UINT16_MAX);
451          cfg.scissor_minimum_y = CLAMP(miny, 0, UINT16_MAX);
452          cfg.scissor_maximum_x = CLAMP(maxx, 0, UINT16_MAX);
453          cfg.scissor_maximum_y = CLAMP(maxy, 0, UINT16_MAX);
454       }
455 
456       struct mali_scissor_packed *scissor_box_ptr = &scissor_box;
457       cs_move64_to(b, cs_sr_reg64(b, 42), *((uint64_t*)scissor_box_ptr));
458    }
459 
460    if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
461        dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
462        dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE)) {
463       struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals;
464 
465       float z_min = sysvals->viewport.offset.z;
466       float z_max = z_min + sysvals->viewport.scale.z;
467       cs_move32_to(b, cs_sr_reg32(b, 44), fui(MIN2(z_min, z_max)));
468       cs_move32_to(b, cs_sr_reg32(b, 45), fui(MAX2(z_min, z_max)));
469    }
470 }
471 
472 static inline uint64_t
473 get_pos_spd(const struct panvk_cmd_buffer *cmdbuf)
474 {
475    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
476    assert(vs);
477    const struct vk_input_assembly_state *ia =
478       &cmdbuf->vk.dynamic_graphics_state.ia;
479    return ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST
480              ? panvk_priv_mem_dev_addr(vs->spds.pos_points)
481              : panvk_priv_mem_dev_addr(vs->spds.pos_triangles);
482 }
483 
484 static void
485 prepare_tiler_primitive_size(struct panvk_cmd_buffer *cmdbuf)
486 {
487    struct cs_builder *b =
488       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
489    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
490    const struct vk_input_assembly_state *ia =
491       &cmdbuf->vk.dynamic_graphics_state.ia;
492    float primitive_size;
493 
494    if (!dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) &&
495        !dyn_gfx_state_dirty(cmdbuf, RS_LINE_WIDTH) &&
496        !gfx_state_dirty(cmdbuf, VS))
497       return;
498 
499    switch (ia->primitive_topology) {
500    /* From the Vulkan spec 1.3.293:
501     *
502     *    "If maintenance5 is enabled and a value is not written to a variable
503     *    decorated with PointSize, a value of 1.0 is used as the size of
504     *    points."
505     *
506     * If no point size is written, ensure that the size is always 1.0f.
507     */
508    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
509       if (vs->info.vs.writes_point_size)
510          return;
511 
512       primitive_size = 1.0f;
513       break;
514    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
515    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
516       primitive_size = cmdbuf->vk.dynamic_graphics_state.rs.line.width;
517       break;
518    default:
519       return;
520    }
521 
522    cs_move32_to(b, cs_sr_reg32(b, 60), fui(primitive_size));
523 }
524 
525 static uint32_t
526 calc_enabled_layer_count(struct panvk_cmd_buffer *cmdbuf)
527 {
528    return cmdbuf->state.gfx.render.view_mask ?
529       util_bitcount(cmdbuf->state.gfx.render.view_mask) :
530       cmdbuf->state.gfx.render.layer_count;
531 }
532 
533 static uint32_t
534 calc_fbd_size(struct panvk_cmd_buffer *cmdbuf)
535 {
536    const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
537    bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
538    uint32_t rt_count = MAX2(fb->rt_count, 1);
539 
540    return get_fbd_size(has_zs_ext, rt_count);
541 }
542 
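/* Size of all descriptors needed for a render pass: one FBD per enabled
 * layer for the regular pass and for each incremental-rendering pass, plus
 * one tiler descriptor per MAX_LAYERS_PER_TILER_DESC layers. */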
543 static uint32_t
544 calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf)
545 {
546    uint32_t fbd_count = calc_enabled_layer_count(cmdbuf) *
547       (1 + PANVK_IR_PASS_COUNT);
548    uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
549                                     MAX_LAYERS_PER_TILER_DESC);
550 
551    return (calc_fbd_size(cmdbuf) * fbd_count) +
552           (td_count * pan_size(TILER_CONTEXT));
553 }
554 
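/* Reserve size bytes in the render descriptor ring buffer. The ring buffer
 * syncobj tracks the amount of free memory: wait until at least size bytes
 * are available, then decrement the syncobj to mark them as reserved. The
 * other end increments it back once it's done with the memory. */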
555 static void
556 cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
557 {
558    /* Make sure we don't allocate more than the ringbuf size. */
559    assert(size <= RENDER_DESC_RINGBUF_SIZE);
560 
561    /* Make sure the allocation is 64-byte aligned. */
562    assert(ALIGN_POT(size, 64) == size);
563 
564    struct cs_index ringbuf_sync = cs_scratch_reg64(b, 0);
565    struct cs_index sz_reg = cs_scratch_reg32(b, 2);
566 
567    cs_load64_to(
568       b, ringbuf_sync, cs_subqueue_ctx_reg(b),
569       offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj));
570    cs_wait_slot(b, SB_ID(LS), false);
571 
572    /* Wait for the other end to release memory. */
573    cs_move32_to(b, sz_reg, size - 1);
574    cs_sync32_wait(b, false, MALI_CS_CONDITION_GREATER, sz_reg, ringbuf_sync);
575 
576    /* Decrement the syncobj to reflect the fact we're reserving memory. */
577    cs_move32_to(b, sz_reg, -size);
578    cs_sync32_add(b, false, MALI_CS_SYNC_SCOPE_CSG, sz_reg, ringbuf_sync,
579                  cs_now());
580 }
581 
582 static void
583 cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
584                                 bool wrap_around)
585 {
586    struct cs_index scratch_reg = cs_scratch_reg32(b, 0);
587    struct cs_index ptr_lo = cs_scratch_reg32(b, 2);
588    struct cs_index pos = cs_scratch_reg32(b, 4);
589 
590    cs_load_to(
591       b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
592       BITFIELD_MASK(3),
593       offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
594    cs_wait_slot(b, SB_ID(LS), false);
595 
596    /* Update the relative position and absolute address. */
597    cs_add32(b, ptr_lo, ptr_lo, size);
598    cs_add32(b, pos, pos, size);
599 
600    /* Wrap-around. */
601    if (likely(wrap_around)) {
602       cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
603 
604       cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
605          cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
606          cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
607       }
608    }
609 
610    cs_store(
611       b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
612       BITFIELD_MASK(3),
613       offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
614    cs_wait_slot(b, SB_ID(LS), false);
615 }
616 
617 static bool
618 inherits_render_ctx(struct panvk_cmd_buffer *cmdbuf)
619 {
620    return (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
621            (cmdbuf->flags &
622             VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) ||
623           (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT);
624 }
625 
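/* Allocate and initialize the tiler descriptor(s) for the current render
 * pass and make the vertex/tiler subqueue point at them. One tiler
 * descriptor covers at most MAX_LAYERS_PER_TILER_DESC layers. */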
626 static VkResult
627 get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
628 {
629    assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
630           !inherits_render_ctx(cmdbuf));
631 
632    if (cmdbuf->state.gfx.render.tiler)
633       return VK_SUCCESS;
634 
635    struct cs_builder *b =
636       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
637    struct panvk_physical_device *phys_dev =
638       to_panvk_physical_device(cmdbuf->vk.base.device->physical);
639    struct panvk_instance *instance =
640       to_panvk_instance(phys_dev->vk.instance);
641    bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
642    struct panfrost_tiler_features tiler_features =
643       panfrost_query_tiler_features(&phys_dev->kmod.props);
644    bool simul_use =
645       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
646    struct panfrost_ptr tiler_desc = {0};
647    struct mali_tiler_context_packed tiler_tmpl;
648    uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
649                                     MAX_LAYERS_PER_TILER_DESC);
650 
651    if (!simul_use) {
652       tiler_desc = panvk_cmd_alloc_desc_array(cmdbuf, td_count, TILER_CONTEXT);
653       if (!tiler_desc.gpu)
654          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
655    }
656 
657    const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
658 
659    pan_pack(&tiler_tmpl, TILER_CONTEXT, cfg) {
660       unsigned max_levels = tiler_features.max_levels;
661       assert(max_levels >= 2);
662 
663       cfg.hierarchy_mask =
664          panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx);
665       cfg.fb_width = fbinfo->width;
666       cfg.fb_height = fbinfo->height;
667 
668       cfg.sample_pattern = pan_sample_pattern(fbinfo->nr_samples);
669 
670       cfg.first_provoking_vertex =
671          cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
672             VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
673 
674       /* These values are overridden per tiler descriptor below. */
675       cfg.layer_count = 1;
676       cfg.layer_offset = 0;
677    }
678 
679    /* When simul_use=true, the tiler descriptors are allocated from the
680     * descriptor ringbuf. We set state.gfx.render.tiler to a non-NULL
681     * value to satisfy the is_tiler_desc_allocated() tests, but we want
682     * it to point to a faulty address so that we can easily detect if it's
683     * used in the command stream/framebuffer descriptors. */
684    cmdbuf->state.gfx.render.tiler =
685       simul_use ? 0xdeadbeefdeadbeefull : tiler_desc.gpu;
686 
687    struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
688 
689    if (simul_use) {
690       uint32_t descs_sz = calc_render_descs_size(cmdbuf);
691 
692       cs_render_desc_ringbuf_reserve(b, descs_sz);
693 
694       /* Reserve ringbuf mem. */
695       cs_update_vt_ctx(b) {
696          cs_load64_to(b, tiler_ctx_addr, cs_subqueue_ctx_reg(b),
697                       offsetof(struct panvk_cs_subqueue_context,
698                                render.desc_ringbuf.ptr));
699       }
700 
701       cs_render_desc_ringbuf_move_ptr(b, descs_sz, !tracing_enabled);
702    } else {
703       cs_update_vt_ctx(b) {
704          cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
705       }
706    }
707 
708    /* Reset the polygon list. */
709    cs_move64_to(b, cs_scratch_reg64(b, 0), 0);
710 
711    /* Lay out words 2, 3 and 5 so they can be stored along with the other updates.
712     * Word 4 contains layer information and will be updated in the loop. */
713    cs_move64_to(b, cs_scratch_reg64(b, 2),
714                 tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32);
715    cs_move32_to(b, cs_scratch_reg32(b, 5), tiler_tmpl.opaque[5]);
716 
717    /* Load the tiler_heap and geom_buf from the context. */
718    cs_load_to(b, cs_scratch_reg_tuple(b, 6, 4), cs_subqueue_ctx_reg(b),
719               BITFIELD_MASK(4),
720               offsetof(struct panvk_cs_subqueue_context, render.tiler_heap));
721 
722    /* Fill extra fields with zeroes so we can reset the completed
723     * top/bottom and private states. */
724    cs_move64_to(b, cs_scratch_reg64(b, 10), 0);
725    cs_move64_to(b, cs_scratch_reg64(b, 12), 0);
726    cs_move64_to(b, cs_scratch_reg64(b, 14), 0);
727 
728    cs_wait_slot(b, SB_ID(LS), false);
729 
730    /* Take care of the tiler desc with layer_offset=0 outside of the loop. */
731    cs_move32_to(b, cs_scratch_reg32(b, 4),
732                 MIN2(cmdbuf->state.gfx.render.layer_count - 1,
733                      MAX_LAYERS_PER_TILER_DESC - 1));
734 
735    /* Replace words 0:13 and 24:31. */
736    cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
737             BITFIELD_MASK(16), 0);
738    cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
739             BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
740    cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
741             BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
742 
743    cs_wait_slot(b, SB_ID(LS), false);
744 
745    uint32_t remaining_layers =
746       td_count > 1
747          ? cmdbuf->state.gfx.render.layer_count % MAX_LAYERS_PER_TILER_DESC
748          : 0;
749    uint32_t full_td_count =
750       cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
751 
752    if (remaining_layers) {
753       int32_t layer_offset =
754          -(cmdbuf->state.gfx.render.layer_count - remaining_layers) &
755          BITFIELD_MASK(9);
756 
757       /* If the last tiler descriptor is not full, we emit it outside of the
758        * loop to pass the right layer count. All this would be a lot simpler
759        * if we had OR/AND instructions, but here we are. */
760       cs_update_vt_ctx(b)
761          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
762                   pan_size(TILER_CONTEXT) * full_td_count);
763       cs_move32_to(b, cs_scratch_reg32(b, 4),
764                    (layer_offset << 8) | (remaining_layers - 1));
765       cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
766                BITFIELD_MASK(16), 0);
767       cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
768                BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
769       cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
770                BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
771       cs_wait_slot(b, SB_ID(LS), false);
772 
773       cs_update_vt_ctx(b)
774          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
775                   -pan_size(TILER_CONTEXT));
776    } else if (full_td_count > 1) {
777       cs_update_vt_ctx(b)
778          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
779                   pan_size(TILER_CONTEXT) * (full_td_count - 1));
780    }
781 
782    if (full_td_count > 1) {
783       struct cs_index counter_reg = cs_scratch_reg32(b, 17);
784       uint32_t layer_offset =
785          (-MAX_LAYERS_PER_TILER_DESC * (full_td_count - 1)) & BITFIELD_MASK(9);
786 
787       cs_move32_to(b, counter_reg, full_td_count - 1);
788       cs_move32_to(b, cs_scratch_reg32(b, 4),
789                    (layer_offset << 8) | (MAX_LAYERS_PER_TILER_DESC - 1));
790 
791       /* We iterate the remaining full tiler descriptors in reverse order, so we
792        * can start from the smallest layer offset, and increment it by
793        * MAX_LAYERS_PER_TILER_DESC << 8 at each iteration. Again, the split is
794        * mostly due to the lack of AND instructions, and the fact that layer_offset
795        * is a 9-bit signed integer inside a 32-bit word, which ADD32 can't deal
796        * with unless the number we add is positive.
797        */
798       cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
799          /* Replace words 0:13 and 24:31. */
800          cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
801                   BITFIELD_MASK(16), 0);
802          cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
803                   BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
804          cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
805                   BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
806 
807          cs_wait_slot(b, SB_ID(LS), false);
808 
809          cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4),
810                   MAX_LAYERS_PER_TILER_DESC << 8);
811 
812          cs_add32(b, counter_reg, counter_reg, -1);
813          cs_update_vt_ctx(b)
814             cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
815                      -pan_size(TILER_CONTEXT));
816       }
817    }
818 
819    /* Then we change the scoreboard slot used for iterators. */
820    panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
821 
822    cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
823    return VK_SUCCESS;
824 }
825 
826 static uint8_t
827 prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
828                 uint32_t layer, void *fbd)
829 {
830    struct pan_tiler_context tiler_ctx = {
831       .valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
832    };
833 
834    if (!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
835       uint32_t td_idx = layer / MAX_LAYERS_PER_TILER_DESC;
836 
837       tiler_ctx.valhall.desc =
838          cmdbuf->state.gfx.render.tiler + (td_idx * pan_size(TILER_CONTEXT));
839    }
840 
841    return GENX(pan_emit_fbd)(fbinfo, layer, NULL, &tiler_ctx, fbd);
842 }
843 
844 static VkResult
845 prepare_incremental_rendering_fbinfos(
846    struct panvk_cmd_buffer *cmdbuf, const struct pan_fb_info *fbinfo,
847    struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT])
848 {
849    /* First incremental rendering pass: don't discard result */
850 
851    struct pan_fb_info *ir_fb = &ir_fbinfos[PANVK_IR_FIRST_PASS];
852 
853    memcpy(ir_fb, fbinfo, sizeof(*ir_fb));
854    for (unsigned i = 0; i < fbinfo->rt_count; i++)
855       ir_fb->rts[i].discard = false;
856    ir_fb->zs.discard.z = false;
857    ir_fb->zs.discard.s = false;
858 
859    /* Subsequent incremental rendering passes: preload old content and don't
860     * discard result */
861 
862    struct pan_fb_info *prev_ir_fb = ir_fb;
863    ir_fb = &ir_fbinfos[PANVK_IR_MIDDLE_PASS];
864    memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
865 
866    bool preload_changed = false;
867 
868    for (unsigned i = 0; i < fbinfo->rt_count; i++) {
869       if (fbinfo->rts[i].view && !fbinfo->rts[i].preload) {
870          ir_fb->rts[i].preload = true;
871          preload_changed = true;
872       }
873 
874       if (ir_fb->rts[i].clear) {
875          ir_fb->rts[i].clear = false;
876          preload_changed = true;
877       }
878    }
879    if (fbinfo->zs.view.zs && !fbinfo->zs.preload.z && !fbinfo->zs.preload.s) {
880       ir_fb->zs.preload.z = true;
881       ir_fb->zs.preload.s = true;
882       preload_changed = true;
883    } else if (fbinfo->zs.view.s && !fbinfo->zs.preload.s) {
884       ir_fb->zs.preload.s = true;
885       preload_changed = true;
886    }
887 
888    if (ir_fb->zs.clear.z || ir_fb->zs.clear.s) {
889       ir_fb->zs.clear.z = false;
890       ir_fb->zs.clear.s = false;
891       preload_changed = true;
892    }
893 
894    if (preload_changed) {
895       memset(&ir_fb->bifrost.pre_post.dcds, 0x0,
896              sizeof(ir_fb->bifrost.pre_post.dcds));
897       VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, ir_fb);
898       if (result != VK_SUCCESS)
899          return result;
900    }
901 
902    /* Last incremental rendering pass: preload previous content and deal with
903     * results as specified by user */
904 
905    prev_ir_fb = ir_fb;
906    ir_fb = &ir_fbinfos[PANVK_IR_LAST_PASS];
907    memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
908 
909    for (unsigned i = 0; i < fbinfo->rt_count; i++)
910       ir_fb->rts[i].discard = fbinfo->rts[i].discard;
911    ir_fb->zs.discard.z = fbinfo->zs.discard.z;
912    ir_fb->zs.discard.s = fbinfo->zs.discard.s;
913 
914    return VK_SUCCESS;
915 }
916 
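/* Allocate and pre-fill the framebuffer descriptors for all enabled layers,
 * including the variants used by the incremental-rendering passes, and make
 * the fragment subqueue point at them. */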
917 static VkResult
918 get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
919 {
920    assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
921           !inherits_render_ctx(cmdbuf));
922 
923    if (cmdbuf->state.gfx.render.fbds.gpu ||
924        !cmdbuf->state.gfx.render.layer_count)
925       return VK_SUCCESS;
926 
927    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
928    uint32_t fbds_sz = fbd_sz * calc_enabled_layer_count(cmdbuf) *
929       (1 + PANVK_IR_PASS_COUNT);
930 
931    cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
932       cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
933    if (!cmdbuf->state.gfx.render.fbds.gpu)
934       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
935 
936    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
937    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
938    bool simul_use =
939       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
940 
941    /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
942     * involved (clear job) or if the update can happen in place (not
943     * simultaneous use of the command buffer), we can avoid the
944     * copy.
945     *
946     * According to VUID-VkSubmitInfo2KHR-commandBuffer-06192 and
947     * VUID-VkSubmitInfo2KHR-commandBuffer-06010, suspend/resume operations
948     * can't cross the vkQueueSubmit2() boundary, so no need to dynamically
949     * allocate descriptors in that case:
950     * "
951     *   If any commandBuffer member of an element of pCommandBufferInfos
952     *   contains any suspended render pass instances, they must be resumed by a
953     *   render pass instance later in submission order within
954     *   pCommandBufferInfos.
955     *
956     *   If any commandBuffer member of an element of pCommandBufferInfos
957     *   contains any resumed render pass instances, they must be suspended by a
958     *   render pass instance earlier in submission order within
959     *   pCommandBufferInfos.
960     * "
961     */
962    bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler;
963    struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
964    uint32_t fbd_flags = 0;
965    uint32_t fbd_ir_pass_offset = fbd_sz * calc_enabled_layer_count(cmdbuf);
966 
967    fbinfo->sample_positions =
968       dev->sample_positions->addr.dev +
969       panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
970 
971    VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, fbinfo);
972    if (result != VK_SUCCESS)
973       return result;
974 
975    struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT];
976    result = prepare_incremental_rendering_fbinfos(cmdbuf, fbinfo, ir_fbinfos);
977    if (result != VK_SUCCESS)
978       return result;
979 
980    /* We prepare all FB descriptors upfront. For multiview, only create FBDs
981     * for enabled views. */
982    uint32_t view_mask_temp = cmdbuf->state.gfx.render.view_mask;
983    uint32_t enabled_layer_count = calc_enabled_layer_count(cmdbuf);
984    bool multiview = cmdbuf->state.gfx.render.view_mask;
985 
986    for (uint32_t i = 0; i < enabled_layer_count; i++) {
987       uint32_t layer_idx = multiview ? u_bit_scan(&view_mask_temp) : i;
988 
989       uint32_t layer_offset = fbd_sz * i;
990       uint32_t new_fbd_flags =
991          prepare_fb_desc(cmdbuf, fbinfo, layer_idx, fbds.cpu + layer_offset);
992 
993       /* Make sure all FBDs have the same flags. */
994       assert(i == 0 || new_fbd_flags == fbd_flags);
995       fbd_flags = new_fbd_flags;
996 
997       for (uint32_t j = 0; j < PANVK_IR_PASS_COUNT; j++) {
998          uint32_t ir_pass_offset = (1 + j) * fbd_ir_pass_offset;
999          new_fbd_flags =
1000             prepare_fb_desc(cmdbuf, &ir_fbinfos[j], layer_idx,
1001                             fbds.cpu + ir_pass_offset + layer_offset);
1002 
1003          /* Make sure all IR FBDs have the same flags. */
1004          assert(new_fbd_flags == fbd_flags);
1005       }
1006    }
1007 
1008    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1009 
1010    if (copy_fbds) {
1011       struct cs_index cur_tiler = cs_sr_reg64(b, 38);
1012       struct cs_index dst_fbd_ptr = cs_sr_reg64(b, 40);
1013       struct cs_index layer_count = cs_sr_reg32(b, 47);
1014       struct cs_index src_fbd_ptr = cs_sr_reg64(b, 48);
1015       struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 50);
1016       struct cs_index pass_count = cs_sr_reg32(b, 51);
1017       struct cs_index pass_src_fbd_ptr = cs_sr_reg64(b, 52);
1018       struct cs_index pass_dst_fbd_ptr = cs_sr_reg64(b, 54);
1019       uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1020                                        MAX_LAYERS_PER_TILER_DESC);
1021 
1022       cs_update_frag_ctx(b) {
1023          cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b),
1024                       offsetof(struct panvk_cs_subqueue_context,
1025                                render.desc_ringbuf.ptr));
1026          cs_wait_slot(b, SB_ID(LS), false);
1027          cs_add64(b, dst_fbd_ptr, cur_tiler,
1028                   pan_size(TILER_CONTEXT) * td_count);
1029       }
1030 
1031       cs_move64_to(b, src_fbd_ptr, fbds.gpu);
1032       cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
1033 
1034       cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
1035       cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
1036          /* Our loop copies 64 bytes at a time, so make sure the
1037           * framebuffer descriptor size is 64-byte aligned. */
1038          assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
1039 
1040          cs_move32_to(b, pass_count, PANVK_IR_PASS_COUNT);
1041          cs_add64(b, pass_src_fbd_ptr, src_fbd_ptr, 0);
1042          cs_add64(b, pass_dst_fbd_ptr, dst_fbd_ptr, 0);
1043          /* Copy the FBDs for the regular pass as well as the IR passes. */
1044          cs_while(b, MALI_CS_CONDITION_GEQUAL, pass_count) {
1045             for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
1046                if (fbd_off == 0) {
1047                   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14),
1048                              pass_src_fbd_ptr, BITFIELD_MASK(14), fbd_off);
1049                   cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
1050                } else {
1051                   cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16),
1052                              pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off);
1053                }
1054                cs_wait_slot(b, SB_ID(LS), false);
1055                cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr,
1056                         BITFIELD_MASK(16), fbd_off);
1057                cs_wait_slot(b, SB_ID(LS), false);
1058             }
1059             cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset);
1060             cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset);
1061             cs_add32(b, pass_count, pass_count, -1);
1062          }
1063 
1064          cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
1065          cs_update_frag_ctx(b)
1066             cs_add64(b, dst_fbd_ptr, dst_fbd_ptr, fbd_sz);
1067 
1068          cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1);
1069          cs_add32(b, layer_count, layer_count, -1);
1070          cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) {
1071             cs_update_frag_ctx(b)
1072                cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));
1073             cs_move32_to(b, remaining_layers_in_td,
1074                          MAX_LAYERS_PER_TILER_DESC);
1075          }
1076       }
1077 
1078       cs_update_frag_ctx(b) {
1079          uint32_t full_td_count =
1080             cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
1081 
1082          /* If the last tiler descriptor is not full, cur_tiler points to the
1083           * last tiler descriptor, not the FBD that follows. */
1084          if (full_td_count < td_count)
1085             cs_add64(b, dst_fbd_ptr, cur_tiler,
1086                      fbd_flags + pan_size(TILER_CONTEXT));
1087          else
1088             cs_add64(b, dst_fbd_ptr, cur_tiler, fbd_flags);
1089 
1090          cs_add64(b, cur_tiler, cur_tiler,
1091                   -(full_td_count * pan_size(TILER_CONTEXT)));
1092       }
1093    } else {
1094       cs_update_frag_ctx(b) {
1095          cs_move64_to(b, cs_sr_reg64(b, 40), fbds.gpu | fbd_flags);
1096          cs_move64_to(b, cs_sr_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
1097       }
1098    }
1099 
1100    return VK_SUCCESS;
1101 }
1102 
1103 static void
1104 set_provoking_vertex_mode(struct panvk_cmd_buffer *cmdbuf)
1105 {
1106    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1107    bool first_provoking_vertex =
1108       cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
1109          VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
1110 
1111    /* If this is not the first draw, first_provoking_vertex should match
1112     * the one from the previous draws. Unfortunately, we can't check it
1113     * when the render pass is inherited. */
1114    assert(!cmdbuf->state.gfx.render.fbds.gpu ||
1115           fbinfo->first_provoking_vertex == first_provoking_vertex);
1116 
1117    fbinfo->first_provoking_vertex = first_provoking_vertex;
1118 }
1119 
1120 static VkResult
1121 get_render_ctx(struct panvk_cmd_buffer *cmdbuf)
1122 {
1123    VkResult result = get_tiler_desc(cmdbuf);
1124    if (result != VK_SUCCESS)
1125       return result;
1126 
1127    return get_fb_descs(cmdbuf);
1128 }
1129 
1130 static VkResult
1131 prepare_vs(struct panvk_cmd_buffer *cmdbuf)
1132 {
1133    struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1134    struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
1135    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1136    struct cs_builder *b =
1137       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1138    bool upd_res_table = false;
1139 
1140    VkResult result = prepare_vs_driver_set(cmdbuf);
1141    if (result != VK_SUCCESS)
1142       return result;
1143 
1144    if (gfx_state_dirty(cmdbuf, VS) || gfx_state_dirty(cmdbuf, DESC_STATE) ||
1145        vs_driver_set_is_dirty(cmdbuf)) {
1146       result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1147                                                             vs, vs_desc_state);
1148       if (result != VK_SUCCESS)
1149          return result;
1150 
1151       upd_res_table = true;
1152    }
1153 
1154    cs_update_vt_ctx(b) {
1155       if (upd_res_table)
1156          cs_move64_to(b, cs_sr_reg64(b, 0), vs_desc_state->res_table);
1157 
1158       if (gfx_state_dirty(cmdbuf, VS) ||
1159           dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY))
1160          cs_move64_to(b, cs_sr_reg64(b, 16), get_pos_spd(cmdbuf));
1161 
1162       if (gfx_state_dirty(cmdbuf, VS))
1163          cs_move64_to(b, cs_sr_reg64(b, 18),
1164                       panvk_priv_mem_dev_addr(vs->spds.var));
1165    }
1166 
1167    return VK_SUCCESS;
1168 }
1169 
1170 static VkResult
1171 prepare_fs(struct panvk_cmd_buffer *cmdbuf)
1172 {
1173    const struct panvk_shader *fs = get_fs(cmdbuf);
1174    struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
1175    struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1176    struct cs_builder *b =
1177       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1178 
1179    if (fs &&
1180        (gfx_state_dirty(cmdbuf, FS) || gfx_state_dirty(cmdbuf, DESC_STATE))) {
1181       VkResult result = prepare_fs_driver_set(cmdbuf);
1182       if (result != VK_SUCCESS)
1183          return result;
1184 
1185       result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1186                                                             fs, fs_desc_state);
1187       if (result != VK_SUCCESS)
1188          return result;
1189    }
1190 
1191    cs_update_vt_ctx(b) {
1192       if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, DESC_STATE))
1193          cs_move64_to(b, cs_sr_reg64(b, 4), fs ? fs_desc_state->res_table : 0);
1194       if (fs_user_dirty(cmdbuf))
1195          cs_move64_to(b, cs_sr_reg64(b, 20),
1196                       fs ? panvk_priv_mem_dev_addr(fs->spd) : 0);
1197    }
1198 
1199    return VK_SUCCESS;
1200 }
1201 
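/* Upload the VS/FS push uniform (FAU) buffers when they're dirty. The FAU
 * pointer written to the shader resource registers packs the FAU count in
 * the upper 8 bits of the address. */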
1202 static VkResult
1203 prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
1204 {
1205    struct cs_builder *b =
1206       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1207    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1208    const struct panvk_shader *fs = get_fs(cmdbuf);
1209    VkResult result;
1210 
1211    if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
1212       result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs);
1213       if (result != VK_SUCCESS)
1214          return result;
1215 
1216       cs_update_vt_ctx(b) {
1217          cs_move64_to(b, cs_sr_reg64(b, 8),
1218                       cmdbuf->state.gfx.vs.push_uniforms |
1219                          ((uint64_t)vs->fau.total_count << 56));
1220       }
1221    }
1222 
1223    if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
1224       uint64_t fau_ptr = 0;
1225 
1226       if (fs) {
1227          result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, fs);
1228          if (result != VK_SUCCESS)
1229             return result;
1230 
1231          fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
1232                    ((uint64_t)fs->fau.total_count << 56);
1233       }
1234 
1235       cs_update_vt_ctx(b)
1236          cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr);
1237    }
1238 
1239    return VK_SUCCESS;
1240 }
1241 
1242 static VkResult
1243 prepare_ds(struct panvk_cmd_buffer *cmdbuf)
1244 {
1245    bool dirty = dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1246                 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1247                 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1248                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1249                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1250                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_COMPARE_MASK) ||
1251                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1252                 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_REFERENCE) ||
1253                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1254                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
1255                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_ENABLE) ||
1256                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_FACTORS) ||
1257                 fs_user_dirty(cmdbuf);
1258 
1259    if (!dirty)
1260       return VK_SUCCESS;
1261 
1262    struct cs_builder *b =
1263       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1264    const struct vk_dynamic_graphics_state *dyns =
1265       &cmdbuf->vk.dynamic_graphics_state;
1266    const struct vk_depth_stencil_state *ds = &dyns->ds;
1267    const struct vk_rasterization_state *rs = &dyns->rs;
1268    bool test_s = has_stencil_att(cmdbuf) && ds->stencil.test_enable;
1269    bool test_z = has_depth_att(cmdbuf) && ds->depth.test_enable;
1270    const struct panvk_shader *fs = get_fs(cmdbuf);
1271 
1272    struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL);
1273    if (!zsd.gpu)
1274       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1275 
1276    pan_cast_and_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
1277       cfg.stencil_test_enable = test_s;
1278       if (test_s) {
1279          cfg.front_compare_function =
1280             translate_compare_func(ds->stencil.front.op.compare);
1281          cfg.front_stencil_fail =
1282             translate_stencil_op(ds->stencil.front.op.fail);
1283          cfg.front_depth_fail =
1284             translate_stencil_op(ds->stencil.front.op.depth_fail);
1285          cfg.front_depth_pass = translate_stencil_op(ds->stencil.front.op.pass);
1286          cfg.back_compare_function =
1287             translate_compare_func(ds->stencil.back.op.compare);
1288          cfg.back_stencil_fail = translate_stencil_op(ds->stencil.back.op.fail);
1289          cfg.back_depth_fail =
1290             translate_stencil_op(ds->stencil.back.op.depth_fail);
1291          cfg.back_depth_pass = translate_stencil_op(ds->stencil.back.op.pass);
1292       }
1293 
1294       cfg.stencil_from_shader = fs ? fs->info.fs.writes_stencil : 0;
1295       cfg.front_write_mask = ds->stencil.front.write_mask;
1296       cfg.back_write_mask = ds->stencil.back.write_mask;
1297       cfg.front_value_mask = ds->stencil.front.compare_mask;
1298       cfg.back_value_mask = ds->stencil.back.compare_mask;
1299       cfg.front_reference_value = ds->stencil.front.reference;
1300       cfg.back_reference_value = ds->stencil.back.reference;
1301 
1302       cfg.depth_cull_enable = vk_rasterization_state_depth_clip_enable(rs);
1303       if (rs->depth_clamp_enable)
1304          cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS;
1305 
1306       if (fs)
1307          cfg.depth_source = pan_depth_source(&fs->info);
1308       cfg.depth_write_enable = test_z && ds->depth.write_enable;
1309       cfg.depth_bias_enable = rs->depth_bias.enable;
1310       cfg.depth_function = test_z ? translate_compare_func(ds->depth.compare_op)
1311                                   : MALI_FUNC_ALWAYS;
1312       cfg.depth_units = rs->depth_bias.constant_factor;
1313       cfg.depth_factor = rs->depth_bias.slope_factor;
1314       cfg.depth_bias_clamp = rs->depth_bias.clamp;
1315    }
1316 
1317    cs_update_vt_ctx(b)
1318       cs_move64_to(b, cs_sr_reg64(b, 52), zsd.gpu);
1319 
1320    return VK_SUCCESS;
1321 }
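/* The occlusion query bound to the command buffer changed: wrap the syncobj
 * of the previous query in a panvk_cs_occlusion_query node and link it into
 * the OQ chain whose head lives in the subqueue context. */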
1322 
1323 static VkResult
1324 wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf)
1325 {
1326    uint64_t last_syncobj = cmdbuf->state.gfx.render.oq.last;
1327 
1328    if (!last_syncobj)
1329       return VK_SUCCESS;
1330 
1331    uint64_t prev_oq_node = cmdbuf->state.gfx.render.oq.chain;
1332    struct panfrost_ptr new_oq_node = panvk_cmd_alloc_dev_mem(
1333       cmdbuf, desc, sizeof(struct panvk_cs_occlusion_query), 8);
1334 
1335    if (!new_oq_node.gpu)
1336       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1337 
1338    cmdbuf->state.gfx.render.oq.chain = new_oq_node.gpu;
1339 
1340    struct panvk_cs_occlusion_query *oq = new_oq_node.cpu;
1341 
1342    *oq = (struct panvk_cs_occlusion_query){
1343       .syncobj = last_syncobj,
1344       .next = prev_oq_node,
1345    };
1346 
1347    /* If we already had an OQ in the chain, we don't need to initialize the
1348     * oq_chain field in the subqueue ctx. */
1349    if (prev_oq_node)
1350       return VK_SUCCESS;
1351 
1352    /* If we're a secondary cmdbuf inside a render pass, we let the primary
1353     * cmdbuf link the OQ chain. */
1354    if (cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)
1355       return VK_SUCCESS;
1356 
1357    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1358    struct cs_index oq_node_reg = cs_scratch_reg64(b, 0);
1359 
1360    cs_move64_to(b, oq_node_reg, new_oq_node.gpu);
1361 
1362    /* If we're resuming, we need to link with the previous oq_chain, if any. */
1363    if (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT) {
1364       struct cs_index prev_oq_node_reg = cs_scratch_reg64(b, 2);
1365 
1366       cs_load64_to(
1367          b, prev_oq_node_reg, cs_subqueue_ctx_reg(b),
1368          offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1369       cs_wait_slot(b, SB_ID(LS), false);
1370       cs_store64(b, prev_oq_node_reg, oq_node_reg,
1371                  offsetof(struct panvk_cs_occlusion_query, next));
1372       cs_wait_slot(b, SB_ID(LS), false);
1373    }
1374 
1375    cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b),
1376               offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1377    cs_wait_slot(b, SB_ID(LS), false);
1378    return VK_SUCCESS;
1379 }
1380 
1381 static VkResult
1382 prepare_oq(struct panvk_cmd_buffer *cmdbuf)
1383 {
1384    if (!gfx_state_dirty(cmdbuf, OQ) ||
1385        cmdbuf->state.gfx.occlusion_query.syncobj ==
1386           cmdbuf->state.gfx.render.oq.last)
1387       return VK_SUCCESS;
1388 
1389    VkResult result = wrap_prev_oq(cmdbuf);
1390    if (result)
1391       return result;
1392 
1393    struct cs_builder *b =
1394       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
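   /* Draw register 46 holds the occlusion-query counter address used by the
    * subsequent draws. */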
1395    cs_move64_to(b, cs_sr_reg64(b, 46), cmdbuf->state.gfx.occlusion_query.ptr);
1396 
1397    cmdbuf->state.gfx.render.oq.last =
1398       cmdbuf->state.gfx.occlusion_query.syncobj;
1399    return VK_SUCCESS;
1400 }
1401 
1402 static void
1403 prepare_dcd(struct panvk_cmd_buffer *cmdbuf)
1404 {
1405    struct cs_builder *b =
1406       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1407    const struct panvk_shader *fs = get_fs(cmdbuf);
1408    bool dcd0_dirty =
1409       dyn_gfx_state_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||
1410       dyn_gfx_state_dirty(cmdbuf, RS_CULL_MODE) ||
1411       dyn_gfx_state_dirty(cmdbuf, RS_FRONT_FACE) ||
1412       dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1413       dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1414       dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) ||
1415       dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
1416       /* writes_depth() uses vk_depth_stencil_state */
1417       dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1418       dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1419       dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1420       /* writes_stencil() uses vk_depth_stencil_state */
1421       dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1422       dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1423       dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1424       fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE) ||
1425       gfx_state_dirty(cmdbuf, OQ);
1426    bool dcd1_dirty = dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1427                      dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1428                      fs_user_dirty(cmdbuf) ||
1429                      gfx_state_dirty(cmdbuf, RENDER_STATE);
1430 
1431    const struct vk_dynamic_graphics_state *dyns =
1432       &cmdbuf->vk.dynamic_graphics_state;
1433    const struct vk_rasterization_state *rs =
1434       &cmdbuf->vk.dynamic_graphics_state.rs;
1435    bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
1436    bool writes_z = writes_depth(cmdbuf);
1437    bool writes_s = writes_stencil(cmdbuf);
1438 
1439    if (dcd0_dirty) {
1440       struct mali_dcd_flags_0_packed dcd0;
1441       pan_pack(&dcd0, DCD_FLAGS_0, cfg) {
1442          if (fs) {
1443             uint8_t rt_written = fs->info.outputs_written >> FRAG_RESULT_DATA0;
1444             uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
1445                               MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
1446 
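            /* Forward pixel kill is only safe if the shader writes every
             * bound color target, alpha-to-coverage is disabled and no blend
             * equation reads the destination; otherwise an early-killed
             * pixel could drop data we still need. */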
1447             cfg.allow_forward_pixel_to_kill =
1448                fs->info.fs.can_fpk && !(rt_mask & ~rt_written) &&
1449                !alpha_to_coverage && !cmdbuf->state.gfx.cb.info.any_dest_read;
1450 
1451             bool writes_zs = writes_z || writes_s;
1452             bool zs_always_passes = ds_test_always_passes(cmdbuf);
1453             bool oq = cmdbuf->state.gfx.occlusion_query.mode !=
1454                       MALI_OCCLUSION_MODE_DISABLED;
1455 
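            /* Occlusion query updates are a ZS-stage side effect, so they
             * are treated like ZS writes when picking the early-ZS mode. */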
1456             struct pan_earlyzs_state earlyzs =
1457                pan_earlyzs_get(pan_earlyzs_analyze(&fs->info), writes_zs || oq,
1458                                alpha_to_coverage, zs_always_passes);
1459 
1460             cfg.pixel_kill_operation = earlyzs.kill;
1461             cfg.zs_update_operation = earlyzs.update;
1462             cfg.evaluate_per_sample = fs->info.fs.sample_shading;
1463          } else {
1464             cfg.allow_forward_pixel_to_kill = true;
1465             cfg.allow_forward_pixel_to_be_killed = true;
1466             cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1467             cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
1468             cfg.overdraw_alpha0 = true;
1469             cfg.overdraw_alpha1 = true;
1470          }
1471 
1472          cfg.front_face_ccw = rs->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
1473          cfg.cull_front_face = (rs->cull_mode & VK_CULL_MODE_FRONT_BIT) != 0;
1474          cfg.cull_back_face = (rs->cull_mode & VK_CULL_MODE_BACK_BIT) != 0;
1475 
1476          cfg.multisample_enable = dyns->ms.rasterization_samples > 1;
1477          cfg.occlusion_query = cmdbuf->state.gfx.occlusion_query.mode;
1478          cfg.alpha_to_coverage = alpha_to_coverage;
1479       }
1480 
1481       cs_update_vt_ctx(b)
1482          cs_move32_to(b, cs_sr_reg32(b, 57), dcd0.opaque[0]);
1483    }
1484 
1485    if (dcd1_dirty) {
1486       struct mali_dcd_flags_1_packed dcd1;
1487       pan_pack(&dcd1, DCD_FLAGS_1, cfg) {
1488          cfg.sample_mask = dyns->ms.rasterization_samples > 1
1489                               ? dyns->ms.sample_mask
1490                               : UINT16_MAX;
1491 
1492          if (fs) {
1493             cfg.render_target_mask =
1494                (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
1495                cmdbuf->state.gfx.render.bound_attachments;
1496          }
1497       }
1498 
1499       cs_update_vt_ctx(b)
1500          cs_move32_to(b, cs_sr_reg32(b, 58), dcd1.opaque[0]);
1501    }
1502 }
1503 
1504 static void
1505 prepare_index_buffer(struct panvk_cmd_buffer *cmdbuf,
1506                      struct panvk_draw_info *draw)
1507 {
1508    struct cs_builder *b =
1509       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1510 
1511    if (draw->index.size && gfx_state_dirty(cmdbuf, IB)) {
1512       uint64_t ib_size =
1513          panvk_buffer_range(cmdbuf->state.gfx.ib.buffer,
1514                             cmdbuf->state.gfx.ib.offset, VK_WHOLE_SIZE);
1515       assert(ib_size <= UINT32_MAX);
1516       cs_move32_to(b, cs_sr_reg32(b, 39), ib_size);
1517 
1518       cs_move64_to(b, cs_sr_reg64(b, 54),
1519                    panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer,
1520                                         cmdbuf->state.gfx.ib.offset));
1521    }
1522 }
1523 
1524 static void
1525 set_tiler_idvs_flags(struct cs_builder *b, struct panvk_cmd_buffer *cmdbuf,
1526                      struct panvk_draw_info *draw)
1527 {
1528    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1529    const struct panvk_shader *fs = get_fs(cmdbuf);
1530    const struct vk_dynamic_graphics_state *dyns =
1531       &cmdbuf->vk.dynamic_graphics_state;
1532    const struct vk_input_assembly_state *ia = &dyns->ia;
1533    const struct vk_rasterization_state *rs = &dyns->rs;
1534    struct mali_primitive_flags_packed tiler_idvs_flags;
1535 
1536    /* When drawing non-point primitives, we use the no_psiz variant which has
1537     * point size writes patched out */
1538    bool writes_point_size =
1539       vs->info.vs.writes_point_size &&
1540       ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
1541    bool multiview = cmdbuf->state.gfx.render.view_mask;
1542    bool writes_layer = vs->info.outputs_written & VARYING_BIT_LAYER;
1543 
1544    /* Multiview shaders depend on the FIFO format for indexing per-view
1545     * output writes. We don't currently patch these offsets in the no_psiz
1546     * variant, so we still need the extended format even though the shader
1547     * does not write point size. */
1548    bool extended_fifo = writes_point_size || writes_layer ||
1549                         (vs->info.vs.writes_point_size && multiview);
1550 
1551    bool dirty = gfx_state_dirty(cmdbuf, VS) || fs_user_dirty(cmdbuf) ||
1552                 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_RESTART_ENABLE) ||
1553                 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) ||
1554                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1555                 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE);
1556 
1557    if (dirty) {
1558       pan_pack(&tiler_idvs_flags, PRIMITIVE_FLAGS, cfg) {
1559          cfg.draw_mode = translate_prim_topology(ia->primitive_topology);
1560 
1561          cfg.point_size_array_format = writes_point_size
1562             ? MALI_POINT_SIZE_ARRAY_FORMAT_FP16
1563             : MALI_POINT_SIZE_ARRAY_FORMAT_NONE;
1564          cfg.layer_index_enable = writes_layer;
1565 
1566          cfg.position_fifo_format = extended_fifo
1567             ? MALI_FIFO_FORMAT_EXTENDED
1568             : MALI_FIFO_FORMAT_BASIC;
1569 
1570          cfg.low_depth_cull = cfg.high_depth_cull =
1571             vk_rasterization_state_depth_clip_enable(rs);
1572 
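         /* The secondary (varying) IDVS shader only matters when a fragment
          * shader actually consumes the varyings. */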
1573          cfg.secondary_shader = vs->info.vs.secondary_enable && fs != NULL;
1574          cfg.primitive_restart = ia->primitive_restart_enable;
1575          cfg.view_mask = cmdbuf->state.gfx.render.view_mask;
1576       }
1577 
1578       cs_move32_to(b, cs_sr_reg32(b, 56), tiler_idvs_flags.opaque[0]);
1579    }
1580 }
1581 
1582 static struct mali_primitive_flags_packed
1583 get_tiler_flags_override(struct panvk_draw_info *draw)
1584 {
1585    struct mali_primitive_flags_packed flags_override;
1586    /* Pack with nodefaults so only explicitly set override fields affect the
1587     * previously set register values */
1588    pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
1589       cfg.index_type = index_size_to_index_type(draw->index.size);
1590    };
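   /* Only the index type changes per draw; the rest of the primitive flags
    * are programmed in SR56 by set_tiler_idvs_flags(). */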
1591 
1592    return flags_override;
1593 }
1594 
1595 static VkResult
1596 prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1597 {
1598    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1599    const struct panvk_shader *fs = get_fs(cmdbuf);
1600    struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1601    VkResult result;
1602 
1603    assert(vs);
1604    bool idvs = vs->info.vs.idvs;
1605 
1606    /* FIXME: support non-IDVS. */
1607    assert(idvs);
1608 
1609    set_provoking_vertex_mode(cmdbuf);
1610 
1611    result = update_tls(cmdbuf);
1612    if (result != VK_SUCCESS)
1613       return result;
1614 
1615    if (!inherits_render_ctx(cmdbuf)) {
1616       result = get_render_ctx(cmdbuf);
1617       if (result != VK_SUCCESS)
1618          return result;
1619    }
1620 
1621    struct cs_builder *b =
1622       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1623 
1624    uint32_t used_set_mask =
1625       vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);
1626 
1627    if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS) ||
1628        gfx_state_dirty(cmdbuf, FS)) {
1629       result = panvk_per_arch(cmd_prepare_push_descs)(cmdbuf, desc_state,
1630                                                       used_set_mask);
1631       if (result != VK_SUCCESS)
1632          return result;
1633    }
1634 
1635    result = prepare_blend(cmdbuf);
1636    if (result != VK_SUCCESS)
1637       return result;
1638 
1639    panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
1640 
1641    result = prepare_push_uniforms(cmdbuf);
1642    if (result != VK_SUCCESS)
1643       return result;
1644 
1645    result = prepare_vs(cmdbuf);
1646    if (result != VK_SUCCESS)
1647       return result;
1648 
1649    result = prepare_fs(cmdbuf);
1650    if (result != VK_SUCCESS)
1651       return result;
1652 
1653    uint32_t varying_size = 0;
1654 
1655    if (fs) {
1656       unsigned vs_vars = vs->info.varyings.output_count;
1657       unsigned fs_vars = fs->info.varyings.input_count;
1658       unsigned var_slots = MAX2(vs_vars, fs_vars);
1659 
1660       /* Assumes 16 byte slots. We could do better. */
1661       varying_size = var_slots * 16;
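      /* e.g. four live varying slots -> varying_size = 64 bytes. */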
1662    }
1663 
1664    cs_update_vt_ctx(b) {
1665       /* We don't use the resource dep system yet. */
1666       cs_move32_to(b, cs_sr_reg32(b, 38), 0);
1667 
1668       prepare_index_buffer(cmdbuf, draw);
1669 
1670       set_tiler_idvs_flags(b, cmdbuf, draw);
1671 
1672       cs_move32_to(b, cs_sr_reg32(b, 48), varying_size);
1673 
1674       result = prepare_ds(cmdbuf);
1675       if (result != VK_SUCCESS)
1676          return result;
1677 
1678       result = prepare_oq(cmdbuf);
1679       if (result != VK_SUCCESS)
1680          return result;
1681 
1682       prepare_dcd(cmdbuf);
1683       prepare_vp(cmdbuf);
1684       prepare_tiler_primitive_size(cmdbuf);
1685    }
1686 
1687    clear_dirty_after_draw(cmdbuf);
1688    return VK_SUCCESS;
1689 }
1690 
1691 static void
1692 panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1693 {
1694    const struct cs_tracing_ctx *tracing_ctx =
1695       &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1696    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1697    struct cs_builder *b =
1698       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1699    VkResult result;
1700 
1701    /* If there's no vertex shader, we can skip the draw. */
1702    if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1703       return;
1704 
1705    /* Needs to be done before get_fs() is called because it depends on
1706     * fs.required being initialized. */
1707    cmdbuf->state.gfx.fs.required =
1708       fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1709 
1710    if (!cmdbuf->vk.dynamic_graphics_state.rs.rasterizer_discard_enable) {
1711       struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1712       uint32_t rasterization_samples =
1713          cmdbuf->vk.dynamic_graphics_state.ms.rasterization_samples;
1714 
1715       /* If there's no attachment, we patch nr_samples to match
1716        * rasterization_samples; otherwise, we make sure the two numbers match.
1717        */
1718       if (!cmdbuf->state.gfx.render.bound_attachments) {
1719          assert(rasterization_samples > 0);
1720          fbinfo->nr_samples = rasterization_samples;
1721       } else {
1722          assert(rasterization_samples == fbinfo->nr_samples);
1723       }
1724    }
1725 
1726    result = prepare_draw(cmdbuf, draw);
1727    if (result != VK_SUCCESS)
1728       return;
1729 
1730    cs_update_vt_ctx(b) {
1731       cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1732       cs_move32_to(b, cs_sr_reg32(b, 33), draw->vertex.count);
1733       cs_move32_to(b, cs_sr_reg32(b, 34), draw->instance.count);
1734       cs_move32_to(b, cs_sr_reg32(b, 35), draw->index.offset);
1735       cs_move32_to(b, cs_sr_reg32(b, 36), draw->vertex.base);
1736       /* NIR expects zero-based instance ID, but even if it did have an intrinsic to
1737        * load the absolute instance ID, we'd want to keep it zero-based to work around
1738        * Mali's limitation on non-zero firstInstance when an instance divisor is used.
1739        */
1740       cs_move32_to(b, cs_sr_reg32(b, 37), 0);
1741    }
1742 
1743    struct mali_primitive_flags_packed flags_override =
1744       get_tiler_flags_override(draw);
1745 
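   /* Each tiler descriptor covers at most MAX_LAYERS_PER_TILER_DESC layers,
    * so layered rendering may need several back-to-back IDVS runs, one per
    * tiler descriptor. */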
1746    uint32_t idvs_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1747                                       MAX_LAYERS_PER_TILER_DESC);
1748 
1749    cs_req_res(b, CS_IDVS_RES);
1750    if (idvs_count > 1) {
1751       struct cs_index counter_reg = cs_scratch_reg32(b, 17);
1752       struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
1753 
1754       cs_move32_to(b, counter_reg, idvs_count);
1755 
1756       cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
1757          cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1758                            flags_override.opaque[0], false, true,
1759                            cs_shader_res_sel(0, 0, 1, 0),
1760                            cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1761 
1762          cs_add32(b, counter_reg, counter_reg, -1);
1763          cs_update_vt_ctx(b) {
1764             cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1765                      pan_size(TILER_CONTEXT));
1766          }
1767       }
1768 
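      /* Rewind the tiler context pointer so the next draw starts from the
       * first tiler descriptor again. */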
1769       cs_update_vt_ctx(b) {
1770          cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1771                   -(idvs_count * pan_size(TILER_CONTEXT)));
1772       }
1773    } else {
1774       cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1775                         flags_override.opaque[0], false, true,
1776                         cs_shader_res_sel(0, 0, 1, 0),
1777                         cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1778    }
1779    cs_req_res(b, 0);
1780 }
1781 
1782 VkResult
1783 panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(
1784    struct panvk_cmd_buffer *primary,
1785    struct panvk_cmd_buffer *secondary)
1786 {
1787    if (!(secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1788       return VK_SUCCESS;
1789 
1790    if (!inherits_render_ctx(primary)) {
1791       VkResult result = get_render_ctx(primary);
1792       if (result != VK_SUCCESS)
1793          return result;
1794    }
1795 
1796    return prepare_oq(primary);
1797 }
1798 
1799 VKAPI_ATTR void VKAPI_CALL
1800 panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount,
1801                         uint32_t instanceCount, uint32_t firstVertex,
1802                         uint32_t firstInstance)
1803 {
1804    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1805 
1806    if (instanceCount == 0 || vertexCount == 0)
1807       return;
1808 
1809    /* gl_BaseVertexARB is a signed integer, and it should expose the value of
1810     * firstVertex in a non-indexed draw. */
1811    assert(firstVertex < INT32_MAX);
1812 
1813    /* gl_BaseInstance is a signed integer, and it should expose the value of
1814     * firstInstance. */
1815    assert(firstInstance < INT32_MAX);
1816 
1817    struct panvk_draw_info draw = {
1818       .vertex.base = firstVertex,
1819       .vertex.count = vertexCount,
1820       .instance.base = firstInstance,
1821       .instance.count = instanceCount,
1822    };
1823 
1824    panvk_cmd_draw(cmdbuf, &draw);
1825 }
1826 
1827 VKAPI_ATTR void VKAPI_CALL
1828 panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer,
1829                                uint32_t indexCount, uint32_t instanceCount,
1830                                uint32_t firstIndex, int32_t vertexOffset,
1831                                uint32_t firstInstance)
1832 {
1833    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1834 
1835    if (instanceCount == 0 || indexCount == 0)
1836       return;
1837 
1838    /* gl_BaseInstance is a signed integer, and it should expose the value of
1839     * firstInstance. */
1840    assert(firstInstance < INT32_MAX);
1841 
1842    struct panvk_draw_info draw = {
1843       .index.size = cmdbuf->state.gfx.ib.index_size,
1844       .index.offset = firstIndex,
1845       .vertex.base = vertexOffset,
1846       .vertex.count = indexCount,
1847       .instance.count = instanceCount,
1848       .instance.base = firstInstance,
1849    };
1850 
1851    panvk_cmd_draw(cmdbuf, &draw);
1852 }
1853 
1854 static void
1855 panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
1856                         struct panvk_draw_info *draw)
1857 {
1858    const struct cs_tracing_ctx *tracing_ctx =
1859       &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1860    const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1861    struct cs_builder *b =
1862       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1863    VkResult result;
1864 
1865    /* If there's no vertex shader, we can skip the draw. */
1866    if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1867       return;
1868 
1869    /* Needs to be done before get_fs() is called because it depends on
1870     * fs.required being initialized. */
1871    cmdbuf->state.gfx.fs.required =
1872       fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1873 
1874    /* Layered indirect draw (VK_EXT_shader_viewport_index_layer) needs
1875     * additional changes. We allow layer_count == 0 because that happens
1876     * when mixing dynamic rendering and secondary command buffers. Once
1877     * we decide to support layered+indirect, we'll need to pass the
1878     * layer_count info through the tiler descriptor, for instance by
1879     * re-using one of the words that are flagged 'ignored' in the descriptor
1880     * (word 14:23).
1881     *
1882     * Multiview is limited to 8 layers, and so will always fit in one TD.
1883     * Therefore layered rendering is allowed with multiview. */
1884    assert(cmdbuf->state.gfx.render.layer_count <= 1 ||
1885           cmdbuf->state.gfx.render.view_mask);
1886 
1887    /* MultiDrawIndirect (.maxDrawIndirectCount) needs additional changes. */
1888    assert(draw->indirect.draw_count == 1);
1889 
1890    /* Force a new push uniform block to be allocated */
1891    gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
1892 
1893    result = prepare_draw(cmdbuf, draw);
1894    if (result != VK_SUCCESS)
1895       return;
1896 
1897    struct cs_index draw_params_addr = cs_scratch_reg64(b, 0);
1898    cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr);
1899 
1900    cs_update_vt_ctx(b) {
1901       cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1902       /* Load SR33-37 from indirect buffer. */
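      /* SR35 holds the index offset and is only needed for indexed draws,
       * hence the reduced register mask otherwise. */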
1903       unsigned reg_mask = draw->index.size ? 0b11111 : 0b11011;
1904       cs_load_to(b, cs_sr_reg_tuple(b, 33, 5), draw_params_addr, reg_mask, 0);
1905    }
1906 
1907    /* Wait for the SR33-37 indirect buffer load. */
1908    cs_wait_slot(b, SB_ID(LS), false);
1909 
1910    if (shader_uses_sysval(vs, graphics, vs.first_vertex) ||
1911        shader_uses_sysval(vs, graphics, vs.base_instance)) {
1912       struct cs_index fau_block_addr = cs_scratch_reg64(b, 2);
1913       cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.vs.push_uniforms);
1914 
1915       if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
1916          cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr,
1917                     shader_remapped_sysval_offset(
1918                        vs, sysval_offset(graphics, vs.first_vertex)));
1919       }
1920 
1921       if (shader_uses_sysval(vs, graphics, vs.base_instance)) {
1922          cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr,
1923                     shader_remapped_sysval_offset(
1924                        vs, sysval_offset(graphics, vs.base_instance)));
1925       }
1926 
1927       /* Wait for the store using SR-37 as src to finish, so we can overwrite
1928        * it. */
1929       cs_wait_slot(b, SB_ID(LS), false);
1930    }
1931 
1932    /* NIR expects zero-based instance ID, but even if it did have an intrinsic to
1933     * load the absolute instance ID, we'd want to keep it zero-based to work around
1934     * Mali's limitation on non-zero firstInstance when an instance divisor is used.
1935     */
1936    cs_update_vt_ctx(b)
1937       cs_move32_to(b, cs_sr_reg32(b, 37), 0);
1938 
1939    struct mali_primitive_flags_packed flags_override =
1940       get_tiler_flags_override(draw);
1941 
1942    cs_req_res(b, CS_IDVS_RES);
1943    cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1944                      flags_override.opaque[0], false, true,
1945                      cs_shader_res_sel(0, 0, 1, 0),
1946                      cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1947    cs_req_res(b, 0);
1948 }
1949 
1950 VKAPI_ATTR void VKAPI_CALL
1951 panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer,
1952                                 VkDeviceSize offset, uint32_t drawCount,
1953                                 uint32_t stride)
1954 {
1955    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1956    VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
1957 
1958    if (drawCount == 0)
1959       return;
1960 
1961    struct panvk_draw_info draw = {
1962       .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
1963       .indirect.draw_count = drawCount,
1964       .indirect.stride = stride,
1965    };
1966 
1967    panvk_cmd_draw_indirect(cmdbuf, &draw);
1968 }
1969 
1970 VKAPI_ATTR void VKAPI_CALL
1971 panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer,
1972                                        VkBuffer _buffer, VkDeviceSize offset,
1973                                        uint32_t drawCount, uint32_t stride)
1974 {
1975    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1976    VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
1977 
1978    if (drawCount == 0)
1979       return;
1980 
1981    struct panvk_draw_info draw = {
1982       .index.size = cmdbuf->state.gfx.ib.index_size,
1983       .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
1984       .indirect.draw_count = drawCount,
1985       .indirect.stride = stride,
1986    };
1987 
1988    panvk_cmd_draw_indirect(cmdbuf, &draw);
1989 }
1990 
1991 void
1992 panvk_per_arch(cmd_inherit_render_state)(
1993    struct panvk_cmd_buffer *cmdbuf,
1994    const VkCommandBufferBeginInfo *pBeginInfo)
1995 {
1996    if (cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
1997        !(pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1998       return;
1999 
2000    assert(pBeginInfo->pInheritanceInfo);
2001    char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
2002    const VkRenderingInfo *resume_info =
2003       vk_get_command_buffer_inheritance_as_rendering_resume(cmdbuf->vk.level,
2004                                                             pBeginInfo,
2005                                                             gcbiar_data);
2006    if (resume_info) {
2007       panvk_per_arch(cmd_init_render_state)(cmdbuf, resume_info);
2008       return;
2009    }
2010 
2011    const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
2012       vk_get_command_buffer_inheritance_rendering_info(cmdbuf->vk.level,
2013                                                        pBeginInfo);
2014    assert(inheritance_info);
2015    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2016    struct panvk_physical_device *phys_dev =
2017       to_panvk_physical_device(dev->vk.physical);
2018    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2019 
2020    cmdbuf->state.gfx.render.flags = inheritance_info->flags;
2021 
2022    gfx_state_set_dirty(cmdbuf, RENDER_STATE);
2023    memset(cmdbuf->state.gfx.render.fb.crc_valid, 0,
2024           sizeof(cmdbuf->state.gfx.render.fb.crc_valid));
2025    memset(&cmdbuf->state.gfx.render.color_attachments, 0,
2026           sizeof(cmdbuf->state.gfx.render.color_attachments));
2027    memset(&cmdbuf->state.gfx.render.z_attachment, 0,
2028           sizeof(cmdbuf->state.gfx.render.z_attachment));
2029    memset(&cmdbuf->state.gfx.render.s_attachment, 0,
2030           sizeof(cmdbuf->state.gfx.render.s_attachment));
2031    cmdbuf->state.gfx.render.bound_attachments = 0;
2032 
2033    cmdbuf->state.gfx.render.view_mask = inheritance_info->viewMask;
2034    cmdbuf->state.gfx.render.layer_count = inheritance_info->viewMask ?
2035       util_last_bit(inheritance_info->viewMask) :
2036       0;
2037    *fbinfo = (struct pan_fb_info){
2038       .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
2039       .nr_samples = inheritance_info->rasterizationSamples,
2040       .rt_count = inheritance_info->colorAttachmentCount,
2041    };
2042 
2043    assert(inheritance_info->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts));
2044 
2045    for (uint32_t i = 0; i < inheritance_info->colorAttachmentCount; i++) {
2046       cmdbuf->state.gfx.render.bound_attachments |=
2047          MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
2048       cmdbuf->state.gfx.render.color_attachments.fmts[i] =
2049          inheritance_info->pColorAttachmentFormats[i];
2050       cmdbuf->state.gfx.render.color_attachments.samples[i] =
2051          inheritance_info->rasterizationSamples;
2052    }
2053 
2054    if (inheritance_info->depthAttachmentFormat) {
2055       cmdbuf->state.gfx.render.bound_attachments |=
2056          MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2057       cmdbuf->state.gfx.render.z_attachment.fmt =
2058          inheritance_info->depthAttachmentFormat;
2059    }
2060 
2061    if (inheritance_info->stencilAttachmentFormat) {
2062       cmdbuf->state.gfx.render.bound_attachments |=
2063          MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2064       cmdbuf->state.gfx.render.s_attachment.fmt =
2065          inheritance_info->stencilAttachmentFormat;
2066    }
2067 
2068    const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
2069       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
2070       .colorAttachmentCount = inheritance_info->colorAttachmentCount,
2071    };
2072    const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
2073       vk_get_command_buffer_rendering_attachment_location_info(
2074          cmdbuf->vk.level, pBeginInfo);
2075    if (att_loc_info == NULL)
2076       att_loc_info = &att_loc_info_default;
2077 
2078    vk_cmd_set_rendering_attachment_locations(&cmdbuf->vk, att_loc_info);
2079 }
2080 
2081 VKAPI_ATTR void VKAPI_CALL
2082 panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer,
2083                                   const VkRenderingInfo *pRenderingInfo)
2084 {
2085    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2086    struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
2087    bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
2088 
2089    panvk_per_arch(cmd_init_render_state)(cmdbuf, pRenderingInfo);
2090 
2091    /* If we're not resuming, the FBD should be NULL. */
2092    assert(!state->render.fbds.gpu || resuming);
2093 
2094    if (!resuming)
2095       panvk_per_arch(cmd_preload_render_area_border)(cmdbuf, pRenderingInfo);
2096 }
2097 
2098 static void
2099 flush_tiling(struct panvk_cmd_buffer *cmdbuf)
2100 {
2101    struct cs_builder *b =
2102       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
2103 
2104    struct cs_index render_ctx = cs_scratch_reg64(b, 2);
2105 
2106    if (cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) {
2107       /* Flush the tiling operations and signal the internal sync object. */
2108       cs_req_res(b, CS_TILER_RES);
2109       cs_finish_tiling(b, false);
2110       cs_req_res(b, 0);
2111 
2112       struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2113       struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2114       struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2115       struct cs_index add_val = cs_scratch_reg64(b, 4);
2116 
2117       cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2118                  BITFIELD_MASK(3),
2119                  offsetof(struct panvk_cs_subqueue_context, syncobjs));
2120       cs_wait_slot(b, SB_ID(LS), false);
2121 
2122       /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to
2123        * skip an ADD operation on the syncobjs pointer. */
2124       STATIC_ASSERT(PANVK_SUBQUEUE_VERTEX_TILER == 0);
2125 
2126       cs_move64_to(b, add_val, 1);
2127 
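      /* Signal end of tiling for whichever iteration scoreboard slot is in
       * flight: the VERTEX_TILER_COMPLETED heap operation and the syncobj
       * increment are deferred until that slot's tiling work completes,
       * then we rotate to the next slot. */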
2128       cs_match(b, iter_sb, cmp_scratch) {
2129 #define CASE(x)                                                                \
2130          cs_case(b, x) {                                                       \
2131             cs_heap_operation(b,                                               \
2132                               MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED,   \
2133                               cs_defer(SB_WAIT_ITER(x),                        \
2134                                        SB_ID(DEFERRED_SYNC)));                 \
2135             cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG,                     \
2136                           add_val, sync_addr,                                  \
2137                           cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));    \
2138             cs_move32_to(b, iter_sb, next_iter_sb(x));                         \
2139          }
2140 
2141          CASE(0)
2142          CASE(1)
2143          CASE(2)
2144          CASE(3)
2145          CASE(4)
2146 #undef CASE
2147       }
2148 
2149       cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2150                  offsetof(struct panvk_cs_subqueue_context, iter_sb));
2151       cs_wait_slot(b, SB_ID(LS), false);
2152 
2153       /* Update the vertex seqno. */
2154       ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2155    } else {
2156       cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b),
2157                    offsetof(struct panvk_cs_subqueue_context, render));
2158       cs_wait_slot(b, SB_ID(LS), false);
2159    }
2160 }
2161 
2162 static void
2163 wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
2164 {
2165    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2166    struct cs_index vt_sync_addr = cs_scratch_reg64(b, 0);
2167    struct cs_index vt_sync_point = cs_scratch_reg64(b, 2);
2168    uint64_t rel_vt_sync_point =
2169       cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2170 
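   /* Sync points are relative to the command buffer: wait until the
    * vertex/tiler syncobj passes the queue's progress seqno at submit time
    * plus the number of tiling flushes recorded so far. */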
2171    cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b),
2172                 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2173    cs_wait_slot(b, SB_ID(LS), false);
2174 
2175    cs_add64(b, vt_sync_point,
2176             cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER),
2177             rel_vt_sync_point);
2178    cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, vt_sync_point,
2179                   vt_sync_addr);
2180 }
2181 
2182 static uint32_t
2183 calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
2184 {
2185    const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
2186    bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
2187    uint32_t rt_count = MAX2(fb->rt_count, 1);
2188 
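   /* Handler variants are keyed on the FBD layout (ZS extension present,
    * render target count), since those parameters determine the FBD size. */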
2189    return get_tiler_oom_handler_idx(has_zs_ext, rt_count);
2190 }
2191 
2192 static void
2193 setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
2194 {
2195    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2196 
2197    uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2198                                     MAX_LAYERS_PER_TILER_DESC);
2199    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2200    uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
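   /* FBDs are grouped per pass: the per-layer FBDs of the primary pass come
    * first, followed by one per-layer group for each incremental-render
    * pass, hence the (1 + pass) scaling below. */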
2201 
2202    struct cs_index counter = cs_scratch_reg32(b, 1);
2203    cs_move32_to(b, counter, 0);
2204    cs_store32(b, counter, cs_subqueue_ctx_reg(b),
2205               TILER_OOM_CTX_FIELD_OFFSET(counter));
2206 
2207    struct cs_index fbd_first = cs_scratch_reg64(b, 2);
2208    cs_add64(b, fbd_first, cs_sr_reg64(b, 40),
2209             (1 + PANVK_IR_FIRST_PASS) * fbd_ir_pass_offset);
2210    cs_store64(b, fbd_first, cs_subqueue_ctx_reg(b),
2211               TILER_OOM_CTX_FBDPTR_OFFSET(FIRST));
2212    struct cs_index fbd_middle = cs_scratch_reg64(b, 4);
2213    cs_add64(b, fbd_middle, cs_sr_reg64(b, 40),
2214             (1 + PANVK_IR_MIDDLE_PASS) * fbd_ir_pass_offset);
2215    cs_store64(b, fbd_middle, cs_subqueue_ctx_reg(b),
2216               TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE));
2217    struct cs_index fbd_last = cs_scratch_reg64(b, 6);
2218    cs_add64(b, fbd_last, cs_sr_reg64(b, 40),
2219             (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2220    cs_store64(b, fbd_last, cs_subqueue_ctx_reg(b),
2221               TILER_OOM_CTX_FBDPTR_OFFSET(LAST));
2222 
2223    struct cs_index td_count_reg = cs_scratch_reg32(b, 8);
2224    cs_move32_to(b, td_count_reg, td_count);
2225    cs_store32(b, td_count_reg, cs_subqueue_ctx_reg(b),
2226               TILER_OOM_CTX_FIELD_OFFSET(td_count));
2227    struct cs_index layer_count = cs_scratch_reg32(b, 9);
2228    cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
2229    cs_store32(b, layer_count, cs_subqueue_ctx_reg(b),
2230               TILER_OOM_CTX_FIELD_OFFSET(layer_count));
2231 
2232    cs_wait_slot(b, SB_ID(LS), false);
2233 }
2234 
2235 static VkResult
2236 issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
2237 {
2238    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2239    struct panvk_instance *instance =
2240       to_panvk_instance(dev->vk.physical->instance);
2241    const struct cs_tracing_ctx *tracing_ctx =
2242       &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
2243    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2244    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2245    bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
2246 
2247    /* Reserve a scoreboard for the fragment job. */
2248    panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2249 
2250    /* Now initialize the fragment bits. */
2251    cs_update_frag_ctx(b) {
2252       cs_move32_to(b, cs_sr_reg32(b, 42),
2253                    (fbinfo->extent.miny << 16) | fbinfo->extent.minx);
2254       cs_move32_to(b, cs_sr_reg32(b, 43),
2255                    (fbinfo->extent.maxy << 16) | fbinfo->extent.maxx);
2256    }
2257 
2258    bool simul_use =
2259       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2260 
2261    /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
2262     * involved (clear job) or if the update can happen in place (not
2263     * simultaneous use of the command buffer), we can avoid the
2264     * copy. */
2265    bool needs_tiling =
2266       cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf);
2267 
2268    /* If the command buffer can run in parallel on different queues, we need
2269     * to make sure each instance has its own descriptors, unless tiling is
2270     * not needed (AKA RUN_FRAGMENT used for clears), because then the FBD
2271     * descriptors are constant (no need to patch them at runtime). */
2272    bool free_render_descs = simul_use && needs_tiling;
2273    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2274    uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
2275    uint32_t td_count = 0;
2276    if (needs_tiling) {
2277       td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2278                               MAX_LAYERS_PER_TILER_DESC);
2279    }
2280 
2281    /* Update the Tiler OOM context */
2282    setup_tiler_oom_ctx(cmdbuf);
2283 
2284    /* Enable the oom handler before waiting for the vertex/tiler work.
2285     * At this point, the tiler oom context has been set up with the correct
2286     * state for this renderpass, so it's safe to enable. */
2287    struct cs_index addr_reg = cs_scratch_reg64(b, 0);
2288    struct cs_index length_reg = cs_scratch_reg32(b, 2);
2289    uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
2290    uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
2291                            handler_idx * dev->tiler_oom.handler_stride;
2292    cs_move64_to(b, addr_reg, handler_addr);
2293    cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
2294    cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2295                             length_reg);
2296 
2297    /* Wait for the tiling to be done before submitting the fragment job. */
2298    wait_finish_tiling(cmdbuf);
2299 
2300    /* Disable the oom handler once the vertex/tiler work has finished.
2301     * We need to disable the handler at this point as the vertex/tiler subqueue
2302     * might continue on to the next renderpass and hit an out-of-memory
2303     * exception prior to the fragment subqueue setting up the tiler oom context
2304     * for the next renderpass.
2305     * By disabling the handler here, any exception will be left pending until a
2306     * new handler is registered, at which point the correct state has been set
2307     * up. */
2308    cs_move64_to(b, addr_reg, 0);
2309    cs_move32_to(b, length_reg, 0);
2310    cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2311                             length_reg);
2312 
2313    /* Pick the correct set of FBDs based on whether an incremental render
2314     * occurred. */
2315    struct cs_index counter = cs_scratch_reg32(b, 0);
2316    cs_load32_to(
2317       b, counter, cs_subqueue_ctx_reg(b),
2318       offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter));
2319    cs_wait_slot(b, SB_ID(LS), false);
2320    cs_if(b, MALI_CS_CONDITION_GREATER, counter)
2321       cs_update_frag_ctx(b)
2322          cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40),
2323                   (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2324 
2325    /* Applications tend to forget to describe subpass dependencies, especially
2326     * when it comes to write -> read dependencies on attachments. The
2327     * proprietary driver forces "others" invalidation as a workaround, and this
2328     * invalidation even became implicit (done as part of the RUN_FRAGMENT) on
2329     * v13+. We don't do that in panvk, but we provide a debug flag to help
2330     * identify those issues. */
2331    if (unlikely(instance->debug_flags & PANVK_DEBUG_IMPLICIT_OTHERS_INV)) {
2332       cs_flush_caches(b, 0, 0, true, length_reg,
2333                       cs_defer(0x0, SB_ID(IMM_FLUSH)));
2334       cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
2335    }
2336 
2337    cs_req_res(b, CS_FRAG_RES);
2338    if (cmdbuf->state.gfx.render.layer_count > 1) {
2339       struct cs_index layer_count = cs_sr_reg32(b, 47);
2340 
2341       cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
2342       cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
2343          cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2344                                false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2345 
2346          cs_add32(b, layer_count, layer_count, -1);
2347          cs_update_frag_ctx(b)
2348             cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), fbd_sz);
2349       }
2350    } else {
2351       cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2352                             false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2353    }
2354    cs_req_res(b, 0);
2355 
2356    struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2357    struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2358    struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2359    struct cs_index add_val = cs_scratch_reg64(b, 4);
2360    struct cs_index add_val_lo = cs_scratch_reg32(b, 4);
2361    struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6);
2362    struct cs_index release_sz = cs_scratch_reg32(b, 8);
2363 
2364    struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
2365    struct cs_index completed_top = cs_scratch_reg64(b, 10);
2366    struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
2367    struct cs_index cur_tiler = cs_sr_reg64(b, 38);
2368    struct cs_index tiler_count = cs_sr_reg32(b, 47);
2369    struct cs_index oq_chain = cs_scratch_reg64(b, 10);
2370    struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
2371    struct cs_index oq_chain_hi = cs_scratch_reg32(b, 11);
2372    struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);
2373 
2374    cs_move64_to(b, add_val, 1);
2375    cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2376               BITFIELD_MASK(3),
2377               offsetof(struct panvk_cs_subqueue_context, syncobjs));
2378 
2379    if (free_render_descs) {
2380       cs_move32_to(b, release_sz, calc_render_descs_size(cmdbuf));
2381       cs_load64_to(b, ringbuf_sync_addr, cs_subqueue_ctx_reg(b),
2382                    offsetof(struct panvk_cs_subqueue_context,
2383                             render.desc_ringbuf.syncobj));
2384    }
2385 
2386    cs_wait_slot(b, SB_ID(LS), false);
2387 
2388    cs_add64(b, sync_addr, sync_addr,
2389             PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
2390    cs_move32_to(b, tiler_count, td_count);
2391 
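   /* For the iteration scoreboard slot in flight: once the fragment work
    * completes, return the tiler heap chunks, release the render-descriptor
    * ringbuf space if it was copied, signal any occlusion-query syncobjs
    * after a cache clean, bump the fragment syncobj, and rotate to the next
    * slot. */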
2392    cs_match(b, iter_sb, cmp_scratch) {
2393 #define CASE(x)                                                                \
2394    cs_case(b, x) {                                                             \
2395       const struct cs_async_op async =                                         \
2396          cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC));                      \
2397       if (td_count == 1) {                                                     \
2398          cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40);            \
2399          cs_wait_slot(b, SB_ID(LS), false);                                    \
2400          cs_finish_fragment(b, true, completed_top, completed_bottom, async);  \
2401       } else if (td_count > 1) {                                               \
2402          cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) {                 \
2403             cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40);         \
2404             cs_wait_slot(b, SB_ID(LS), false);                                 \
2405             cs_finish_fragment(b, false, completed_top, completed_bottom,      \
2406                                async);                                         \
2407             cs_update_frag_ctx(b)                                              \
2408                cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));     \
2409             cs_add32(b, tiler_count, tiler_count, -1);                         \
2410          }                                                                     \
2411          cs_frag_end(b, async);                                                \
2412       }                                                                        \
2413       if (free_render_descs) {                                                 \
2414          cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz,            \
2415                        ringbuf_sync_addr, async);                              \
2416       }                                                                        \
2417       if (has_oq_chain) {                                                      \
2418          struct cs_index flush_id = oq_chain_lo;                               \
2419          cs_move32_to(b, flush_id, 0);                                         \
2420          cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN,                          \
2421                          MALI_CS_FLUSH_MODE_CLEAN, false, flush_id,            \
2422                          cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_FLUSH)));    \
2423          cs_load64_to(                                                         \
2424             b, oq_chain, cs_subqueue_ctx_reg(b),                               \
2425             offsetof(struct panvk_cs_subqueue_context, render.oq_chain));      \
2426          cs_wait_slot(b, SB_ID(LS), false);                                    \
2427          /* We use oq_syncobj as a placeholder to reset the oq_chain. */       \
2428          cs_move64_to(b, oq_syncobj, 0);                                       \
2429          cs_store64(                                                           \
2430             b, oq_syncobj, cs_subqueue_ctx_reg(b),                             \
2431             offsetof(struct panvk_cs_subqueue_context, render.oq_chain));      \
2432          cs_wait_slot(b, SB_ID(LS), false);                                    \
2433          cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) {                   \
2434             cs_load64_to(b, oq_syncobj, oq_chain,                              \
2435                          offsetof(struct panvk_cs_occlusion_query, syncobj));  \
2436             cs_wait_slot(b, SB_ID(LS), false);                                 \
2437             cs_load64_to(b, oq_chain, oq_chain,                                \
2438                          offsetof(struct panvk_cs_occlusion_query, next));     \
2439             cs_wait_slot(b, SB_ID(LS), false);                                 \
2440             cs_sync32_set(                                                     \
2441                b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj,        \
2442                cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC)));       \
2443             cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_lo)                    \
2444                cs_continue(b);                                                 \
2445             cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_hi)                    \
2446                cs_continue(b);                                                 \
2447             cs_break(b);                                                       \
2448          }                                                                     \
2449       }                                                                        \
2450       cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,       \
2451                     async);                                                    \
2452       cs_move32_to(b, iter_sb, next_iter_sb(x));                               \
2453    }
2454 
2455       CASE(0)
2456       CASE(1)
2457       CASE(2)
2458       CASE(3)
2459       CASE(4)
2460 #undef CASE
2461    }
2462 
2463    cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2464               offsetof(struct panvk_cs_subqueue_context, iter_sb));
2465    cs_wait_slot(b, SB_ID(LS), false);
2466 
2467    /* Update the ring buffer position. */
2468    if (free_render_descs) {
2469       cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf),
2470                                       !tracing_ctx->enabled);
2471    }
2472 
2473    /* Update the frag seqno. */
2474    ++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;
2475 
2476 
2477    return VK_SUCCESS;
2478 }
2479 
2480 void
2481 panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf)
2482 {
2483    /* If there was no draw queued, we don't need to force a preload. */
2484    if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2485       flush_tiling(cmdbuf);
2486       issue_fragment_jobs(cmdbuf);
2487       memset(&cmdbuf->state.gfx.render.fbds, 0,
2488              sizeof(cmdbuf->state.gfx.render.fbds));
2489       cmdbuf->state.gfx.render.tiler = 0;
2490 
2491       panvk_per_arch(cmd_force_fb_preload)(cmdbuf, NULL);
2492 
2493       /* We inherited the render context, and need to let the primary command
2494        * buffer know that it's changed. */
2495       cmdbuf->state.gfx.render.invalidate_inherited_ctx = true;
2496 
2497       /* Re-emit the FB/Tiler descs if we inherited them. */
2498       if (inherits_render_ctx(cmdbuf))
2499          get_render_ctx(cmdbuf);
2500    }
2501 }
2502 
2503 VKAPI_ATTR void VKAPI_CALL
2504 panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer)
2505 {
2506    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2507    bool suspending = cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT;
2508    VkResult result;
2509 
2510    if (!suspending) {
2511       struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2512       bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s;
2513       for (unsigned i = 0; i < fbinfo->rt_count; i++)
2514          clear |= fbinfo->rts[i].clear;
2515 
2516       if (clear && !inherits_render_ctx(cmdbuf)) {
2517          result = get_fb_descs(cmdbuf);
2518          if (result != VK_SUCCESS)
2519             return;
2520       }
2521 
2522       /* Flush the last occlusion query before ending the render pass if
2523        * this query has ended while we were inside the render pass. */
2524       if (cmdbuf->state.gfx.render.oq.last !=
2525           cmdbuf->state.gfx.occlusion_query.syncobj) {
2526          result = wrap_prev_oq(cmdbuf);
2527          if (result != VK_SUCCESS)
2528             return;
2529       }
2530 
2531       if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2532          flush_tiling(cmdbuf);
2533          issue_fragment_jobs(cmdbuf);
2534       }
2535    } else if (!inherits_render_ctx(cmdbuf)) {
2536       /* If we're suspending the render pass and we didn't inherit the render
2537        * context, we need to emit it now, so it's available when the render pass
2538        * is resumed. */
2539       VkResult result = get_render_ctx(cmdbuf);
2540       if (result != VK_SUCCESS)
2541          return;
2542    }
2543 
2544    memset(&cmdbuf->state.gfx.render.fbds, 0,
2545           sizeof(cmdbuf->state.gfx.render.fbds));
2546    memset(&cmdbuf->state.gfx.render.oq, 0, sizeof(cmdbuf->state.gfx.render.oq));
2547    cmdbuf->state.gfx.render.tiler = 0;
2548 
2549    /* If we're not suspending, we need to resolve attachments. */
2550    if (!suspending)
2551       panvk_per_arch(cmd_resolve_attachments)(cmdbuf);
2552 }
2553