1 /*
2 * Copyright © 2024 Collabora Ltd.
3 * Copyright © 2024 Arm Ltd.
4 *
5 * Derived from tu_cmd_buffer.c which is:
6 * Copyright © 2016 Red Hat.
7 * Copyright © 2016 Bas Nieuwenhuizen
8 * Copyright © 2015 Intel Corporation
9 *
10 * SPDX-License-Identifier: MIT
11 */
12
13 #include <stdint.h>
14 #include "genxml/gen_macros.h"
15
16 #include "panvk_buffer.h"
17 #include "panvk_cmd_alloc.h"
18 #include "panvk_cmd_buffer.h"
19 #include "panvk_cmd_desc_state.h"
20 #include "panvk_cmd_draw.h"
21 #include "panvk_cmd_fb_preload.h"
22 #include "panvk_cmd_meta.h"
23 #include "panvk_device.h"
24 #include "panvk_entrypoints.h"
25 #include "panvk_image.h"
26 #include "panvk_image_view.h"
27 #include "panvk_instance.h"
28 #include "panvk_priv_bo.h"
29 #include "panvk_shader.h"
30
31 #include "pan_desc.h"
32 #include "pan_earlyzs.h"
33 #include "pan_encoder.h"
34 #include "pan_format.h"
35 #include "pan_jc.h"
36 #include "pan_props.h"
37 #include "pan_samples.h"
38 #include "pan_shader.h"
39
40 #include "util/bitscan.h"
41 #include "vk_format.h"
42 #include "vk_meta.h"
43 #include "vk_pipeline_layout.h"
44 #include "vk_render_pass.h"
45
46 static void
47 emit_vs_attrib(const struct vk_vertex_attribute_state *attrib_info,
48 const struct vk_vertex_binding_state *buf_info,
49 const struct panvk_attrib_buf *buf, uint32_t vb_desc_offset,
50 struct mali_attribute_packed *desc)
51 {
52 bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE;
53 enum pipe_format f = vk_format_to_pipe_format(attrib_info->format);
54 unsigned buf_idx = vb_desc_offset + attrib_info->binding;
55
56 pan_pack(desc, ATTRIBUTE, cfg) {
57 cfg.offset = attrib_info->offset;
58 cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
59 cfg.table = 0;
60 cfg.buffer_index = buf_idx;
61 cfg.stride = buf_info->stride;
62 if (!per_instance) {
63 /* Per-vertex */
64 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
65 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
66 cfg.offset_enable = true;
67 } else if (buf_info->divisor == 1) {
68 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
69 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
70 } else if (buf_info->divisor == 0) {
71 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
72          /* HW doesn't support a zero divisor, but we can achieve the same
73           * effect by not using a divisor and setting the stride to zero. */
74 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
75 cfg.stride = 0;
76 } else if (util_is_power_of_two_or_zero(buf_info->divisor)) {
77 /* Per-instance, POT divisor */
78 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
79 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
80 cfg.divisor_r = __builtin_ctz(buf_info->divisor);
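         /* e.g. a divisor of 4 gives divisor_r = 2: the instance index is
          * effectively divided by right-shifting it by divisor_r. */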
81 } else {
82 /* Per-instance, NPOT divisor */
83 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
84 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
85 cfg.divisor_d = panfrost_compute_magic_divisor(
86 buf_info->divisor, &cfg.divisor_r, &cfg.divisor_e);
87 }
88 }
89 }
90
91 static bool
92 vs_driver_set_is_dirty(struct panvk_cmd_buffer *cmdbuf)
93 {
94 return dyn_gfx_state_dirty(cmdbuf, VI) ||
95 dyn_gfx_state_dirty(cmdbuf, VI_BINDINGS_VALID) ||
96 dyn_gfx_state_dirty(cmdbuf, VI_BINDING_STRIDES) ||
97 gfx_state_dirty(cmdbuf, VB) || gfx_state_dirty(cmdbuf, VS) ||
98 gfx_state_dirty(cmdbuf, DESC_STATE);
99 }
100
101 static VkResult
102 prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
103 {
104 if (!vs_driver_set_is_dirty(cmdbuf))
105 return VK_SUCCESS;
106
107 struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
108 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
109 const struct vk_vertex_input_state *vi =
110 cmdbuf->vk.dynamic_graphics_state.vi;
111 uint32_t vb_count = 0;
112
113 u_foreach_bit(i, vi->attributes_valid)
114 vb_count = MAX2(vi->attributes[i].binding + 1, vb_count);
115
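   /* Driver-set layout: MAX_VS_ATTRIBS attribute descriptors, one dummy
    * sampler, the shader's dynamic buffer descriptors, then one buffer
    * descriptor per vertex buffer binding. */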
116 uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1;
117 uint32_t desc_count = vb_offset + vb_count;
118 const struct panvk_descriptor_state *desc_state =
119 &cmdbuf->state.gfx.desc_state;
120 struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
121 cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
122 struct panvk_opaque_desc *descs = driver_set.cpu;
123
124 if (!driver_set.gpu)
125 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
126
127 for (uint32_t i = 0; i < MAX_VS_ATTRIBS; i++) {
128 if (vi->attributes_valid & BITFIELD_BIT(i)) {
129 unsigned binding = vi->attributes[i].binding;
130
131 emit_vs_attrib(&vi->attributes[i], &vi->bindings[binding],
132 &cmdbuf->state.gfx.vb.bufs[binding], vb_offset,
133 (struct mali_attribute_packed *)(&descs[i]));
134 } else {
135 memset(&descs[i], 0, sizeof(descs[0]));
136 }
137 }
138
139 /* Dummy sampler always comes right after the vertex attribs. */
140 pan_cast_and_pack(&descs[MAX_VS_ATTRIBS], SAMPLER, cfg) {
141 cfg.clamp_integer_array_indices = false;
142 }
143
144 panvk_per_arch(cmd_fill_dyn_bufs)(
145 desc_state, vs,
146 (struct mali_buffer_packed *)(&descs[MAX_VS_ATTRIBS + 1]));
147
148 for (uint32_t i = 0; i < vb_count; i++) {
149 const struct panvk_attrib_buf *vb = &cmdbuf->state.gfx.vb.bufs[i];
150
151 pan_cast_and_pack(&descs[vb_offset + i], BUFFER, cfg) {
152 if (vi->bindings_valid & BITFIELD_BIT(i)) {
153 cfg.address = vb->address;
154 cfg.size = vb->size;
155 } else {
156 cfg.address = 0;
157 cfg.size = 0;
158 }
159 }
160 }
161
162 vs_desc_state->driver_set.dev_addr = driver_set.gpu;
163 vs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
164 gfx_state_set_dirty(cmdbuf, DESC_STATE);
165 return VK_SUCCESS;
166 }
167
168 static VkResult
169 prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
170 {
171 struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
172 const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
173 const struct panvk_descriptor_state *desc_state =
174 &cmdbuf->state.gfx.desc_state;
175 uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1;
176 struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
177 cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
178 struct panvk_opaque_desc *descs = driver_set.cpu;
179
180 if (desc_count && !driver_set.gpu)
181 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
182
183 /* Dummy sampler always comes first. */
184 pan_cast_and_pack(&descs[0], SAMPLER, cfg) {
185 cfg.clamp_integer_array_indices = false;
186 }
187
188 panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs,
189 (struct mali_buffer_packed *)(&descs[1]));
190
191 fs_desc_state->driver_set.dev_addr = driver_set.gpu;
192 fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
193 gfx_state_set_dirty(cmdbuf, DESC_STATE);
194 return VK_SUCCESS;
195 }
196
197 static bool
198 has_depth_att(struct panvk_cmd_buffer *cmdbuf)
199 {
200 return (cmdbuf->state.gfx.render.bound_attachments &
201 MESA_VK_RP_ATTACHMENT_DEPTH_BIT) != 0;
202 }
203
204 static bool
205 has_stencil_att(struct panvk_cmd_buffer *cmdbuf)
206 {
207 return (cmdbuf->state.gfx.render.bound_attachments &
208 MESA_VK_RP_ATTACHMENT_STENCIL_BIT) != 0;
209 }
210
211 static bool
212 writes_depth(struct panvk_cmd_buffer *cmdbuf)
213 {
214 const struct vk_depth_stencil_state *ds =
215 &cmdbuf->vk.dynamic_graphics_state.ds;
216
217 return has_depth_att(cmdbuf) && ds->depth.test_enable &&
218 ds->depth.write_enable && ds->depth.compare_op != VK_COMPARE_OP_NEVER;
219 }
220
221 static bool
222 writes_stencil(struct panvk_cmd_buffer *cmdbuf)
223 {
224 const struct vk_depth_stencil_state *ds =
225 &cmdbuf->vk.dynamic_graphics_state.ds;
226
227 return has_stencil_att(cmdbuf) && ds->stencil.test_enable &&
228 ((ds->stencil.front.write_mask &&
229 (ds->stencil.front.op.fail != VK_STENCIL_OP_KEEP ||
230 ds->stencil.front.op.pass != VK_STENCIL_OP_KEEP ||
231 ds->stencil.front.op.depth_fail != VK_STENCIL_OP_KEEP)) ||
232 (ds->stencil.back.write_mask &&
233 (ds->stencil.back.op.fail != VK_STENCIL_OP_KEEP ||
234 ds->stencil.back.op.pass != VK_STENCIL_OP_KEEP ||
235 ds->stencil.back.op.depth_fail != VK_STENCIL_OP_KEEP)));
236 }
237
238 static bool
239 ds_test_always_passes(struct panvk_cmd_buffer *cmdbuf)
240 {
241 const struct vk_depth_stencil_state *ds =
242 &cmdbuf->vk.dynamic_graphics_state.ds;
243
244 if (!has_depth_att(cmdbuf))
245 return true;
246
247 if (ds->depth.test_enable && ds->depth.compare_op != VK_COMPARE_OP_ALWAYS)
248 return false;
249
250 if (ds->stencil.test_enable &&
251 (ds->stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
252 ds->stencil.back.op.compare != VK_COMPARE_OP_ALWAYS))
253 return false;
254
255 return true;
256 }
257
258 static inline enum mali_func
259 translate_compare_func(VkCompareOp comp)
260 {
261 STATIC_ASSERT(VK_COMPARE_OP_NEVER == (VkCompareOp)MALI_FUNC_NEVER);
262 STATIC_ASSERT(VK_COMPARE_OP_LESS == (VkCompareOp)MALI_FUNC_LESS);
263 STATIC_ASSERT(VK_COMPARE_OP_EQUAL == (VkCompareOp)MALI_FUNC_EQUAL);
264 STATIC_ASSERT(VK_COMPARE_OP_LESS_OR_EQUAL == (VkCompareOp)MALI_FUNC_LEQUAL);
265 STATIC_ASSERT(VK_COMPARE_OP_GREATER == (VkCompareOp)MALI_FUNC_GREATER);
266 STATIC_ASSERT(VK_COMPARE_OP_NOT_EQUAL == (VkCompareOp)MALI_FUNC_NOT_EQUAL);
267 STATIC_ASSERT(VK_COMPARE_OP_GREATER_OR_EQUAL ==
268 (VkCompareOp)MALI_FUNC_GEQUAL);
269 STATIC_ASSERT(VK_COMPARE_OP_ALWAYS == (VkCompareOp)MALI_FUNC_ALWAYS);
270
271 return (enum mali_func)comp;
272 }
273
274 static enum mali_stencil_op
275 translate_stencil_op(VkStencilOp in)
276 {
277 switch (in) {
278 case VK_STENCIL_OP_KEEP:
279 return MALI_STENCIL_OP_KEEP;
280 case VK_STENCIL_OP_ZERO:
281 return MALI_STENCIL_OP_ZERO;
282 case VK_STENCIL_OP_REPLACE:
283 return MALI_STENCIL_OP_REPLACE;
284 case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
285 return MALI_STENCIL_OP_INCR_SAT;
286 case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
287 return MALI_STENCIL_OP_DECR_SAT;
288 case VK_STENCIL_OP_INCREMENT_AND_WRAP:
289 return MALI_STENCIL_OP_INCR_WRAP;
290 case VK_STENCIL_OP_DECREMENT_AND_WRAP:
291 return MALI_STENCIL_OP_DECR_WRAP;
292 case VK_STENCIL_OP_INVERT:
293 return MALI_STENCIL_OP_INVERT;
294 default:
295 unreachable("Invalid stencil op");
296 }
297 }
298
299 static enum mali_draw_mode
300 translate_prim_topology(VkPrimitiveTopology in)
301 {
302 /* Test VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA separately, as it's not
303 * part of the VkPrimitiveTopology enum.
304 */
305 if (in == VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA)
306 return MALI_DRAW_MODE_TRIANGLES;
307
308 switch (in) {
309 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
310 return MALI_DRAW_MODE_POINTS;
311 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
312 return MALI_DRAW_MODE_LINES;
313 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
314 return MALI_DRAW_MODE_LINE_STRIP;
315 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
316 return MALI_DRAW_MODE_TRIANGLES;
317 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
318 return MALI_DRAW_MODE_TRIANGLE_STRIP;
319 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
320 return MALI_DRAW_MODE_TRIANGLE_FAN;
321 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
322 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
323 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
324 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
325 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
326 default:
327 unreachable("Invalid primitive type");
328 }
329 }
330
331 static VkResult
332 update_tls(struct panvk_cmd_buffer *cmdbuf)
333 {
334 struct panvk_tls_state *state = &cmdbuf->state.tls;
335 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
336 const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
337 struct cs_builder *b =
338 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
339
340 if (!cmdbuf->state.gfx.tsd) {
341 if (!state->desc.gpu) {
342 state->desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
343 if (!state->desc.gpu)
344 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
345 }
346
347 cmdbuf->state.gfx.tsd = state->desc.gpu;
348
349 cs_update_vt_ctx(b)
350 cs_move64_to(b, cs_sr_reg64(b, 24), state->desc.gpu);
351 }
352
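   /* Track the largest TLS requirement seen so far, so one local-storage
    * allocation can serve every draw recorded in the command buffer. */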
353 state->info.tls.size =
354 MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, state->info.tls.size);
355 return VK_SUCCESS;
356 }
357
358 static enum mali_index_type
359 index_size_to_index_type(uint32_t size)
360 {
361 switch (size) {
362 case 0:
363 return MALI_INDEX_TYPE_NONE;
364 case 1:
365 return MALI_INDEX_TYPE_UINT8;
366 case 2:
367 return MALI_INDEX_TYPE_UINT16;
368 case 4:
369 return MALI_INDEX_TYPE_UINT32;
370 default:
371 assert(!"Invalid index size");
372 return MALI_INDEX_TYPE_NONE;
373 }
374 }
375
376 static VkResult
377 prepare_blend(struct panvk_cmd_buffer *cmdbuf)
378 {
379 bool dirty = dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
380 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP_ENABLE) ||
381 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP) ||
382 dyn_gfx_state_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
383 dyn_gfx_state_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
384 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_ENABLES) ||
385 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_EQUATIONS) ||
386 dyn_gfx_state_dirty(cmdbuf, CB_WRITE_MASKS) ||
387 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS) ||
388 fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE);
389
390 if (!dirty)
391 return VK_SUCCESS;
392
393 const struct vk_dynamic_graphics_state *dyns =
394 &cmdbuf->vk.dynamic_graphics_state;
395 const struct vk_color_blend_state *cb = &dyns->cb;
396 unsigned bd_count = MAX2(cb->attachment_count, 1);
397 struct cs_builder *b =
398 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
399 struct panfrost_ptr ptr =
400 panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND);
401 struct mali_blend_packed *bds = ptr.cpu;
402
403 if (bd_count && !ptr.gpu)
404 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
405
406 panvk_per_arch(blend_emit_descs)(cmdbuf, bds);
407
408 cs_update_vt_ctx(b)
409 cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count);
410
411 return VK_SUCCESS;
412 }
413
414 static void
415 prepare_vp(struct panvk_cmd_buffer *cmdbuf)
416 {
417 struct cs_builder *b =
418 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
419 const VkViewport *viewport =
420 &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0];
421 const VkRect2D *scissor = &cmdbuf->vk.dynamic_graphics_state.vp.scissors[0];
422
423 if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
424 dyn_gfx_state_dirty(cmdbuf, VP_SCISSORS)) {
425 struct mali_scissor_packed scissor_box;
426 pan_pack(&scissor_box, SCISSOR, cfg) {
427
428 /* The spec says "width must be greater than 0.0" */
429 assert(viewport->width >= 0);
430 int minx = (int)viewport->x;
431 int maxx = (int)(viewport->x + viewport->width);
432
433 /* Viewport height can be negative */
434 int miny =
435 MIN2((int)viewport->y, (int)(viewport->y + viewport->height));
436 int maxy =
437 MAX2((int)viewport->y, (int)(viewport->y + viewport->height));
438
439 assert(scissor->offset.x >= 0 && scissor->offset.y >= 0);
440 minx = MAX2(scissor->offset.x, minx);
441 miny = MAX2(scissor->offset.y, miny);
442 maxx = MIN2(scissor->offset.x + scissor->extent.width, maxx);
443 maxy = MIN2(scissor->offset.y + scissor->extent.height, maxy);
444
445 /* Make sure we don't end up with a max < min when width/height is 0 */
446 maxx = maxx > minx ? maxx - 1 : maxx;
447 maxy = maxy > miny ? maxy - 1 : maxy;
448
449 /* Clamp viewport scissor to valid range */
450 cfg.scissor_minimum_x = CLAMP(minx, 0, UINT16_MAX);
451 cfg.scissor_minimum_y = CLAMP(miny, 0, UINT16_MAX);
452 cfg.scissor_maximum_x = CLAMP(maxx, 0, UINT16_MAX);
453 cfg.scissor_maximum_y = CLAMP(maxy, 0, UINT16_MAX);
454 }
455
456 struct mali_scissor_packed *scissor_box_ptr = &scissor_box;
457 cs_move64_to(b, cs_sr_reg64(b, 42), *((uint64_t*)scissor_box_ptr));
458 }
459
460 if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
461 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
462 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE)) {
463 struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals;
464
465 float z_min = sysvals->viewport.offset.z;
466 float z_max = z_min + sysvals->viewport.scale.z;
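      /* The viewport depth range may be reversed (minDepth > maxDepth), in
       * which case viewport.scale.z is presumably negative; the MIN2/MAX2
       * below order the clamp bounds. */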
467 cs_move32_to(b, cs_sr_reg32(b, 44), fui(MIN2(z_min, z_max)));
468 cs_move32_to(b, cs_sr_reg32(b, 45), fui(MAX2(z_min, z_max)));
469 }
470 }
471
472 static inline uint64_t
473 get_pos_spd(const struct panvk_cmd_buffer *cmdbuf)
474 {
475 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
476 assert(vs);
477 const struct vk_input_assembly_state *ia =
478 &cmdbuf->vk.dynamic_graphics_state.ia;
479 return ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST
480 ? panvk_priv_mem_dev_addr(vs->spds.pos_points)
481 : panvk_priv_mem_dev_addr(vs->spds.pos_triangles);
482 }
483
484 static void
485 prepare_tiler_primitive_size(struct panvk_cmd_buffer *cmdbuf)
486 {
487 struct cs_builder *b =
488 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
489 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
490 const struct vk_input_assembly_state *ia =
491 &cmdbuf->vk.dynamic_graphics_state.ia;
492 float primitive_size;
493
494 if (!dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) &&
495 !dyn_gfx_state_dirty(cmdbuf, RS_LINE_WIDTH) &&
496 !gfx_state_dirty(cmdbuf, VS))
497 return;
498
499 switch (ia->primitive_topology) {
500 /* From the Vulkan spec 1.3.293:
501 *
502 * "If maintenance5 is enabled and a value is not written to a variable
503 * decorated with PointSize, a value of 1.0 is used as the size of
504 * points."
505 *
506 * If no point size is written, ensure that the size is always 1.0f.
507 */
508 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
509 if (vs->info.vs.writes_point_size)
510 return;
511
512 primitive_size = 1.0f;
513 break;
514 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
515 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
516 primitive_size = cmdbuf->vk.dynamic_graphics_state.rs.line.width;
517 break;
518 default:
519 return;
520 }
521
522 cs_move32_to(b, cs_sr_reg32(b, 60), fui(primitive_size));
523 }
524
525 static uint32_t
526 calc_enabled_layer_count(struct panvk_cmd_buffer *cmdbuf)
527 {
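   /* With multiview, the effective layer count is the number of enabled
    * views; otherwise it's the plain render-area layer count. */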
528 return cmdbuf->state.gfx.render.view_mask ?
529 util_bitcount(cmdbuf->state.gfx.render.view_mask) :
530 cmdbuf->state.gfx.render.layer_count;
531 }
532
533 static uint32_t
534 calc_fbd_size(struct panvk_cmd_buffer *cmdbuf)
535 {
536 const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
537 bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
538 uint32_t rt_count = MAX2(fb->rt_count, 1);
539
540 return get_fbd_size(has_zs_ext, rt_count);
541 }
542
543 static uint32_t
544 calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf)
545 {
546 uint32_t fbd_count = calc_enabled_layer_count(cmdbuf) *
547 (1 + PANVK_IR_PASS_COUNT);
548 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
549 MAX_LAYERS_PER_TILER_DESC);
550
551 return (calc_fbd_size(cmdbuf) * fbd_count) +
552 (td_count * pan_size(TILER_CONTEXT));
553 }
554
555 static void
556 cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
557 {
558 /* Make sure we don't allocate more than the ringbuf size. */
559 assert(size <= RENDER_DESC_RINGBUF_SIZE);
560
561 /* Make sure the allocation is 64-byte aligned. */
562 assert(ALIGN_POT(size, 64) == size);
563
564 struct cs_index ringbuf_sync = cs_scratch_reg64(b, 0);
565 struct cs_index sz_reg = cs_scratch_reg32(b, 2);
566
567 cs_load64_to(
568 b, ringbuf_sync, cs_subqueue_ctx_reg(b),
569 offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj));
570 cs_wait_slot(b, SB_ID(LS), false);
571
572 /* Wait for the other end to release memory. */
573 cs_move32_to(b, sz_reg, size - 1);
574 cs_sync32_wait(b, false, MALI_CS_CONDITION_GREATER, sz_reg, ringbuf_sync);
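   /* Waiting for "greater than size - 1" means waiting until at least size
    * bytes are available (the syncobj is assumed to count free ringbuf
    * bytes). */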
575
576 /* Decrement the syncobj to reflect the fact we're reserving memory. */
577 cs_move32_to(b, sz_reg, -size);
578 cs_sync32_add(b, false, MALI_CS_SYNC_SCOPE_CSG, sz_reg, ringbuf_sync,
579 cs_now());
580 }
581
582 static void
583 cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
584 bool wrap_around)
585 {
586 struct cs_index scratch_reg = cs_scratch_reg32(b, 0);
587 struct cs_index ptr_lo = cs_scratch_reg32(b, 2);
588 struct cs_index pos = cs_scratch_reg32(b, 4);
589
590 cs_load_to(
591 b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
592 BITFIELD_MASK(3),
593 offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
594 cs_wait_slot(b, SB_ID(LS), false);
595
596 /* Update the relative position and absolute address. */
597 cs_add32(b, ptr_lo, ptr_lo, size);
598 cs_add32(b, pos, pos, size);
599
600 /* Wrap-around. */
601 if (likely(wrap_around)) {
602 cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
603
604 cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
605 cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
606 cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
607 }
608 }
609
610 cs_store(
611 b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
612 BITFIELD_MASK(3),
613 offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
614 cs_wait_slot(b, SB_ID(LS), false);
615 }
616
617 static bool
618 inherits_render_ctx(struct panvk_cmd_buffer *cmdbuf)
619 {
620 return (cmdbuf->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
621 (cmdbuf->flags &
622 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) ||
623 (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT);
624 }
625
626 static VkResult
627 get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
628 {
629 assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
630 !inherits_render_ctx(cmdbuf));
631
632 if (cmdbuf->state.gfx.render.tiler)
633 return VK_SUCCESS;
634
635 struct cs_builder *b =
636 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
637 struct panvk_physical_device *phys_dev =
638 to_panvk_physical_device(cmdbuf->vk.base.device->physical);
639 struct panvk_instance *instance =
640 to_panvk_instance(phys_dev->vk.instance);
641 bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
642 struct panfrost_tiler_features tiler_features =
643 panfrost_query_tiler_features(&phys_dev->kmod.props);
644 bool simul_use =
645 cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
646 struct panfrost_ptr tiler_desc = {0};
647 struct mali_tiler_context_packed tiler_tmpl;
648 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
649 MAX_LAYERS_PER_TILER_DESC);
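   /* A single tiler descriptor covers at most MAX_LAYERS_PER_TILER_DESC
    * layers, so layered rendering may require several of them. */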
650
651 if (!simul_use) {
652 tiler_desc = panvk_cmd_alloc_desc_array(cmdbuf, td_count, TILER_CONTEXT);
653 if (!tiler_desc.gpu)
654 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
655 }
656
657 const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
658
659 pan_pack(&tiler_tmpl, TILER_CONTEXT, cfg) {
660 unsigned max_levels = tiler_features.max_levels;
661 assert(max_levels >= 2);
662
663 cfg.hierarchy_mask =
664 panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx);
665 cfg.fb_width = fbinfo->width;
666 cfg.fb_height = fbinfo->height;
667
668 cfg.sample_pattern = pan_sample_pattern(fbinfo->nr_samples);
669
670 cfg.first_provoking_vertex =
671 cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
672 VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
673
674       /* These values are overwritten from the command stream below. */
675 cfg.layer_count = 1;
676 cfg.layer_offset = 0;
677 }
678
679 /* When simul_use=true, the tiler descriptors are allocated from the
680 * descriptor ringbuf. We set state.gfx.render.tiler to a non-NULL
681 * value to satisfy the is_tiler_desc_allocated() tests, but we want
682 * it to point to a faulty address so that we can easily detect if it's
683 * used in the command stream/framebuffer descriptors. */
684 cmdbuf->state.gfx.render.tiler =
685 simul_use ? 0xdeadbeefdeadbeefull : tiler_desc.gpu;
686
687 struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
688
689 if (simul_use) {
690 uint32_t descs_sz = calc_render_descs_size(cmdbuf);
691
692 cs_render_desc_ringbuf_reserve(b, descs_sz);
693
694 /* Reserve ringbuf mem. */
695 cs_update_vt_ctx(b) {
696 cs_load64_to(b, tiler_ctx_addr, cs_subqueue_ctx_reg(b),
697 offsetof(struct panvk_cs_subqueue_context,
698 render.desc_ringbuf.ptr));
699 }
700
701 cs_render_desc_ringbuf_move_ptr(b, descs_sz, !tracing_enabled);
702 } else {
703 cs_update_vt_ctx(b) {
704 cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
705 }
706 }
707
708 /* Reset the polygon list. */
709 cs_move64_to(b, cs_scratch_reg64(b, 0), 0);
710
711   /* Lay out words 2, 3 and 5, so they can be stored along with the other
712    * updates. Word 4 contains layer information and will be updated in the loop. */
713 cs_move64_to(b, cs_scratch_reg64(b, 2),
714 tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32);
715 cs_move32_to(b, cs_scratch_reg32(b, 5), tiler_tmpl.opaque[5]);
716
717 /* Load the tiler_heap and geom_buf from the context. */
718 cs_load_to(b, cs_scratch_reg_tuple(b, 6, 4), cs_subqueue_ctx_reg(b),
719 BITFIELD_MASK(4),
720 offsetof(struct panvk_cs_subqueue_context, render.tiler_heap));
721
722 /* Fill extra fields with zeroes so we can reset the completed
723 * top/bottom and private states. */
724 cs_move64_to(b, cs_scratch_reg64(b, 10), 0);
725 cs_move64_to(b, cs_scratch_reg64(b, 12), 0);
726 cs_move64_to(b, cs_scratch_reg64(b, 14), 0);
727
728 cs_wait_slot(b, SB_ID(LS), false);
729
730 /* Take care of the tiler desc with layer_offset=0 outside of the loop. */
731 cs_move32_to(b, cs_scratch_reg32(b, 4),
732 MIN2(cmdbuf->state.gfx.render.layer_count - 1,
733 MAX_LAYERS_PER_TILER_DESC - 1));
734
735 /* Replace words 0:13 and 24:31. */
736 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
737 BITFIELD_MASK(16), 0);
738 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
739 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
740 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
741 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
742
743 cs_wait_slot(b, SB_ID(LS), false);
744
745 uint32_t remaining_layers =
746 td_count > 1
747 ? cmdbuf->state.gfx.render.layer_count % MAX_LAYERS_PER_TILER_DESC
748 : 0;
749 uint32_t full_td_count =
750 cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
751
752 if (remaining_layers) {
753 int32_t layer_offset =
754 -(cmdbuf->state.gfx.render.layer_count - remaining_layers) &
755 BITFIELD_MASK(9);
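      /* layer_offset is a 9-bit signed field in tiler context word 4;
       * masking the negative offset with BITFIELD_MASK(9) yields its
       * two's-complement encoding. */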
756
757 /* If the last tiler descriptor is not full, we emit it outside of the
758 * loop to pass the right layer count. All this would be a lot simpler
759 * if we had OR/AND instructions, but here we are. */
760 cs_update_vt_ctx(b)
761 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
762 pan_size(TILER_CONTEXT) * full_td_count);
763 cs_move32_to(b, cs_scratch_reg32(b, 4),
764 (layer_offset << 8) | (remaining_layers - 1));
765 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
766 BITFIELD_MASK(16), 0);
767 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
768 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
769 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
770 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
771 cs_wait_slot(b, SB_ID(LS), false);
772
773 cs_update_vt_ctx(b)
774 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
775 -pan_size(TILER_CONTEXT));
776 } else if (full_td_count > 1) {
777 cs_update_vt_ctx(b)
778 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
779 pan_size(TILER_CONTEXT) * (full_td_count - 1));
780 }
781
782 if (full_td_count > 1) {
783 struct cs_index counter_reg = cs_scratch_reg32(b, 17);
784 uint32_t layer_offset =
785 (-MAX_LAYERS_PER_TILER_DESC * (full_td_count - 1)) & BITFIELD_MASK(9);
786
787 cs_move32_to(b, counter_reg, full_td_count - 1);
788 cs_move32_to(b, cs_scratch_reg32(b, 4),
789 (layer_offset << 8) | (MAX_LAYERS_PER_TILER_DESC - 1));
790
791 /* We iterate the remaining full tiler descriptors in reverse order, so we
792 * can start from the smallest layer offset, and increment it by
793 * MAX_LAYERS_PER_TILER_DESC << 8 at each iteration. Again, the split is
794        * mostly due to the lack of AND instructions, and the fact that layer_offset
795 * is a 9-bit signed integer inside a 32-bit word, which ADD32 can't deal
796 * with unless the number we add is positive.
797 */
798 cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
799 /* Replace words 0:13 and 24:31. */
800 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
801 BITFIELD_MASK(16), 0);
802 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
803 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
804 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
805 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
806
807 cs_wait_slot(b, SB_ID(LS), false);
808
809 cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4),
810 MAX_LAYERS_PER_TILER_DESC << 8);
811
812 cs_add32(b, counter_reg, counter_reg, -1);
813 cs_update_vt_ctx(b)
814 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
815 -pan_size(TILER_CONTEXT));
816 }
817 }
818
819 /* Then we change the scoreboard slot used for iterators. */
820 panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
821
822 cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
823 return VK_SUCCESS;
824 }
825
826 static uint8_t
827 prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
828 uint32_t layer, void *fbd)
829 {
830 struct pan_tiler_context tiler_ctx = {
831 .valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
832 };
833
834 if (!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
835 uint32_t td_idx = layer / MAX_LAYERS_PER_TILER_DESC;
836
837 tiler_ctx.valhall.desc =
838 cmdbuf->state.gfx.render.tiler + (td_idx * pan_size(TILER_CONTEXT));
839 }
840
841 return GENX(pan_emit_fbd)(fbinfo, layer, NULL, &tiler_ctx, fbd);
842 }
843
844 static VkResult
845 prepare_incremental_rendering_fbinfos(
846 struct panvk_cmd_buffer *cmdbuf, const struct pan_fb_info *fbinfo,
847 struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT])
848 {
849 /* First incremental rendering pass: don't discard result */
850
851 struct pan_fb_info *ir_fb = &ir_fbinfos[PANVK_IR_FIRST_PASS];
852
853 memcpy(ir_fb, fbinfo, sizeof(*ir_fb));
854 for (unsigned i = 0; i < fbinfo->rt_count; i++)
855 ir_fb->rts[i].discard = false;
856 ir_fb->zs.discard.z = false;
857 ir_fb->zs.discard.s = false;
858
859 /* Subsequent incremental rendering passes: preload old content and don't
860 * discard result */
861
862 struct pan_fb_info *prev_ir_fb = ir_fb;
863 ir_fb = &ir_fbinfos[PANVK_IR_MIDDLE_PASS];
864 memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
865
866 bool preload_changed = false;
867
868 for (unsigned i = 0; i < fbinfo->rt_count; i++) {
869 if (fbinfo->rts[i].view && !fbinfo->rts[i].preload) {
870 ir_fb->rts[i].preload = true;
871 preload_changed = true;
872 }
873
874 if (ir_fb->rts[i].clear) {
875 ir_fb->rts[i].clear = false;
876 preload_changed = true;
877 }
878 }
879 if (fbinfo->zs.view.zs && !fbinfo->zs.preload.z && !fbinfo->zs.preload.s) {
880 ir_fb->zs.preload.z = true;
881 ir_fb->zs.preload.s = true;
882 preload_changed = true;
883 } else if (fbinfo->zs.view.s && !fbinfo->zs.preload.s) {
884 ir_fb->zs.preload.s = true;
885 preload_changed = true;
886 }
887
888 if (ir_fb->zs.clear.z || ir_fb->zs.clear.s) {
889 ir_fb->zs.clear.z = false;
890 ir_fb->zs.clear.s = false;
891 preload_changed = true;
892 }
893
894 if (preload_changed) {
895 memset(&ir_fb->bifrost.pre_post.dcds, 0x0,
896 sizeof(ir_fb->bifrost.pre_post.dcds));
897 VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, ir_fb);
898 if (result != VK_SUCCESS)
899 return result;
900 }
901
902 /* Last incremental rendering pass: preload previous content and deal with
903 * results as specified by user */
904
905 prev_ir_fb = ir_fb;
906 ir_fb = &ir_fbinfos[PANVK_IR_LAST_PASS];
907 memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
908
909 for (unsigned i = 0; i < fbinfo->rt_count; i++)
910 ir_fb->rts[i].discard = fbinfo->rts[i].discard;
911 ir_fb->zs.discard.z = fbinfo->zs.discard.z;
912 ir_fb->zs.discard.s = fbinfo->zs.discard.s;
913
914 return VK_SUCCESS;
915 }
916
917 static VkResult
918 get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
919 {
920 assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
921 !inherits_render_ctx(cmdbuf));
922
923 if (cmdbuf->state.gfx.render.fbds.gpu ||
924 !cmdbuf->state.gfx.render.layer_count)
925 return VK_SUCCESS;
926
927 uint32_t fbd_sz = calc_fbd_size(cmdbuf);
928 uint32_t fbds_sz = fbd_sz * calc_enabled_layer_count(cmdbuf) *
929 (1 + PANVK_IR_PASS_COUNT);
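   /* One FBD per enabled layer for the regular pass, plus one per layer for
    * each incremental-rendering pass. */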
930
931 cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
932 cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
933 if (!cmdbuf->state.gfx.render.fbds.gpu)
934 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
935
936 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
937 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
938 bool simul_use =
939 cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
940
941 /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
942 * involved (clear job) or if the update can happen in place (not
943 * simultaneous use of the command buffer), we can avoid the
944 * copy.
945 *
946 * According to VUID-VkSubmitInfo2KHR-commandBuffer-06192 and
947 * VUID-VkSubmitInfo2KHR-commandBuffer-06010, suspend/resume operations
948 * can't cross the vkQueueSubmit2() boundary, so no need to dynamically
949 * allocate descriptors in that case:
950 * "
951 * If any commandBuffer member of an element of pCommandBufferInfos
952 * contains any suspended render pass instances, they must be resumed by a
953 * render pass instance later in submission order within
954 * pCommandBufferInfos.
955 *
956 * If any commandBuffer member of an element of pCommandBufferInfos
957 * contains any resumed render pass instances, they must be suspended by a
958 * render pass instance earlier in submission order within
959 * pCommandBufferInfos.
960 * "
961 */
962 bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler;
963 struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
964 uint32_t fbd_flags = 0;
965 uint32_t fbd_ir_pass_offset = fbd_sz * calc_enabled_layer_count(cmdbuf);
966
967 fbinfo->sample_positions =
968 dev->sample_positions->addr.dev +
969 panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
970
971 VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, fbinfo);
972 if (result != VK_SUCCESS)
973 return result;
974
975 struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT];
976 result = prepare_incremental_rendering_fbinfos(cmdbuf, fbinfo, ir_fbinfos);
977 if (result != VK_SUCCESS)
978 return result;
979
980 /* We prepare all FB descriptors upfront. For multiview, only create FBDs
981 * for enabled views. */
982 uint32_t view_mask_temp = cmdbuf->state.gfx.render.view_mask;
983 uint32_t enabled_layer_count = calc_enabled_layer_count(cmdbuf);
984 bool multiview = cmdbuf->state.gfx.render.view_mask;
985
986 for (uint32_t i = 0; i < enabled_layer_count; i++) {
987 uint32_t layer_idx = multiview ? u_bit_scan(&view_mask_temp) : i;
988
989 uint32_t layer_offset = fbd_sz * i;
990 uint32_t new_fbd_flags =
991 prepare_fb_desc(cmdbuf, fbinfo, layer_idx, fbds.cpu + layer_offset);
992
993 /* Make sure all FBDs have the same flags. */
994 assert(i == 0 || new_fbd_flags == fbd_flags);
995 fbd_flags = new_fbd_flags;
996
997 for (uint32_t j = 0; j < PANVK_IR_PASS_COUNT; j++) {
998 uint32_t ir_pass_offset = (1 + j) * fbd_ir_pass_offset;
999 new_fbd_flags =
1000 prepare_fb_desc(cmdbuf, &ir_fbinfos[j], layer_idx,
1001 fbds.cpu + ir_pass_offset + layer_offset);
1002
1003 /* Make sure all IR FBDs have the same flags. */
1004 assert(new_fbd_flags == fbd_flags);
1005 }
1006 }
1007
1008 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1009
1010 if (copy_fbds) {
1011 struct cs_index cur_tiler = cs_sr_reg64(b, 38);
1012 struct cs_index dst_fbd_ptr = cs_sr_reg64(b, 40);
1013 struct cs_index layer_count = cs_sr_reg32(b, 47);
1014 struct cs_index src_fbd_ptr = cs_sr_reg64(b, 48);
1015 struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 50);
1016 struct cs_index pass_count = cs_sr_reg32(b, 51);
1017 struct cs_index pass_src_fbd_ptr = cs_sr_reg64(b, 52);
1018 struct cs_index pass_dst_fbd_ptr = cs_sr_reg64(b, 54);
1019 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1020 MAX_LAYERS_PER_TILER_DESC);
1021
1022 cs_update_frag_ctx(b) {
1023 cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b),
1024 offsetof(struct panvk_cs_subqueue_context,
1025 render.desc_ringbuf.ptr));
1026 cs_wait_slot(b, SB_ID(LS), false);
1027 cs_add64(b, dst_fbd_ptr, cur_tiler,
1028 pan_size(TILER_CONTEXT) * td_count);
1029 }
1030
1031 cs_move64_to(b, src_fbd_ptr, fbds.gpu);
1032 cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
1033
1034 cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
1035 cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
1036       /* Our loop copies 64 bytes at a time, so make sure the framebuffer
1037        * descriptor size is 64-byte aligned. */
1038 assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
1039
1040 cs_move32_to(b, pass_count, PANVK_IR_PASS_COUNT);
1041 cs_add64(b, pass_src_fbd_ptr, src_fbd_ptr, 0);
1042 cs_add64(b, pass_dst_fbd_ptr, dst_fbd_ptr, 0);
1043       /* Copy the FBDs for the regular pass as well as the IR passes. */
1044 cs_while(b, MALI_CS_CONDITION_GEQUAL, pass_count) {
1045 for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
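            /* The first 64-byte chunk holds the FBD's tiler pointer: words
             * 0-13 come from the template FBD while words 14-15 are replaced
             * with cur_tiler, so the copy points at the ring-buffer tiler
             * contexts. The remaining chunks are copied verbatim. */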
1046 if (fbd_off == 0) {
1047 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14),
1048 pass_src_fbd_ptr, BITFIELD_MASK(14), fbd_off);
1049 cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
1050 } else {
1051 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16),
1052 pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off);
1053 }
1054 cs_wait_slot(b, SB_ID(LS), false);
1055 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr,
1056 BITFIELD_MASK(16), fbd_off);
1057 cs_wait_slot(b, SB_ID(LS), false);
1058 }
1059 cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset);
1060 cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset);
1061 cs_add32(b, pass_count, pass_count, -1);
1062 }
1063
1064 cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
1065 cs_update_frag_ctx(b)
1066 cs_add64(b, dst_fbd_ptr, dst_fbd_ptr, fbd_sz);
1067
1068 cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1);
1069 cs_add32(b, layer_count, layer_count, -1);
1070 cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) {
1071 cs_update_frag_ctx(b)
1072 cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));
1073 cs_move32_to(b, remaining_layers_in_td,
1074 MAX_LAYERS_PER_TILER_DESC);
1075 }
1076 }
1077
1078 cs_update_frag_ctx(b) {
1079 uint32_t full_td_count =
1080 cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
1081
1082 /* If the last tiler descriptor is not full, cur_tiler points to the
1083 * last tiler descriptor, not the FBD that follows. */
1084 if (full_td_count < td_count)
1085 cs_add64(b, dst_fbd_ptr, cur_tiler,
1086 fbd_flags + pan_size(TILER_CONTEXT));
1087 else
1088 cs_add64(b, dst_fbd_ptr, cur_tiler, fbd_flags);
1089
1090 cs_add64(b, cur_tiler, cur_tiler,
1091 -(full_td_count * pan_size(TILER_CONTEXT)));
1092 }
1093 } else {
1094 cs_update_frag_ctx(b) {
1095 cs_move64_to(b, cs_sr_reg64(b, 40), fbds.gpu | fbd_flags);
1096 cs_move64_to(b, cs_sr_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
1097 }
1098 }
1099
1100 return VK_SUCCESS;
1101 }
1102
1103 static void
1104 set_provoking_vertex_mode(struct panvk_cmd_buffer *cmdbuf)
1105 {
1106 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1107 bool first_provoking_vertex =
1108 cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
1109 VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
1110
1111 /* If this is not the first draw, first_provoking_vertex should match
1112 * the one from the previous draws. Unfortunately, we can't check it
1113 * when the render pass is inherited. */
1114 assert(!cmdbuf->state.gfx.render.fbds.gpu ||
1115 fbinfo->first_provoking_vertex == first_provoking_vertex);
1116
1117 fbinfo->first_provoking_vertex = first_provoking_vertex;
1118 }
1119
1120 static VkResult
1121 get_render_ctx(struct panvk_cmd_buffer *cmdbuf)
1122 {
1123 VkResult result = get_tiler_desc(cmdbuf);
1124 if (result != VK_SUCCESS)
1125 return result;
1126
1127 return get_fb_descs(cmdbuf);
1128 }
1129
1130 static VkResult
1131 prepare_vs(struct panvk_cmd_buffer *cmdbuf)
1132 {
1133 struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1134 struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
1135 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1136 struct cs_builder *b =
1137 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1138 bool upd_res_table = false;
1139
1140 VkResult result = prepare_vs_driver_set(cmdbuf);
1141 if (result != VK_SUCCESS)
1142 return result;
1143
1144 if (gfx_state_dirty(cmdbuf, VS) || gfx_state_dirty(cmdbuf, DESC_STATE) ||
1145 vs_driver_set_is_dirty(cmdbuf)) {
1146 result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1147 vs, vs_desc_state);
1148 if (result != VK_SUCCESS)
1149 return result;
1150
1151 upd_res_table = true;
1152 }
1153
1154 cs_update_vt_ctx(b) {
1155 if (upd_res_table)
1156 cs_move64_to(b, cs_sr_reg64(b, 0), vs_desc_state->res_table);
1157
1158 if (gfx_state_dirty(cmdbuf, VS) ||
1159 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY))
1160 cs_move64_to(b, cs_sr_reg64(b, 16), get_pos_spd(cmdbuf));
1161
1162 if (gfx_state_dirty(cmdbuf, VS))
1163 cs_move64_to(b, cs_sr_reg64(b, 18),
1164 panvk_priv_mem_dev_addr(vs->spds.var));
1165 }
1166
1167 return VK_SUCCESS;
1168 }
1169
1170 static VkResult
1171 prepare_fs(struct panvk_cmd_buffer *cmdbuf)
1172 {
1173 const struct panvk_shader *fs = get_fs(cmdbuf);
1174 struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
1175 struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1176 struct cs_builder *b =
1177 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1178
1179 if (fs &&
1180 (gfx_state_dirty(cmdbuf, FS) || gfx_state_dirty(cmdbuf, DESC_STATE))) {
1181 VkResult result = prepare_fs_driver_set(cmdbuf);
1182 if (result != VK_SUCCESS)
1183 return result;
1184
1185 result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1186 fs, fs_desc_state);
1187 if (result != VK_SUCCESS)
1188 return result;
1189 }
1190
1191 cs_update_vt_ctx(b) {
1192 if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, DESC_STATE))
1193 cs_move64_to(b, cs_sr_reg64(b, 4), fs ? fs_desc_state->res_table : 0);
1194 if (fs_user_dirty(cmdbuf))
1195 cs_move64_to(b, cs_sr_reg64(b, 20),
1196 fs ? panvk_priv_mem_dev_addr(fs->spd) : 0);
1197 }
1198
1199 return VK_SUCCESS;
1200 }
1201
1202 static VkResult
1203 prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
1204 {
1205 struct cs_builder *b =
1206 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1207 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1208 const struct panvk_shader *fs = get_fs(cmdbuf);
1209 VkResult result;
1210
1211 if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
1212 result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs);
1213 if (result != VK_SUCCESS)
1214 return result;
1215
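      /* The FAU count rides in the top byte of the 64-bit FAU register
       * (bits 63:56), with the push-uniform buffer address in the low bits. */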
1216 cs_update_vt_ctx(b) {
1217 cs_move64_to(b, cs_sr_reg64(b, 8),
1218 cmdbuf->state.gfx.vs.push_uniforms |
1219 ((uint64_t)vs->fau.total_count << 56));
1220 }
1221 }
1222
1223 if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
1224 uint64_t fau_ptr = 0;
1225
1226 if (fs) {
1227 result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, fs);
1228 if (result != VK_SUCCESS)
1229 return result;
1230
1231 fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
1232 ((uint64_t)fs->fau.total_count << 56);
1233 }
1234
1235 cs_update_vt_ctx(b)
1236 cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr);
1237 }
1238
1239 return VK_SUCCESS;
1240 }
1241
1242 static VkResult
1243 prepare_ds(struct panvk_cmd_buffer *cmdbuf)
1244 {
1245 bool dirty = dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1246 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1247 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1248 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1249 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1250 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_COMPARE_MASK) ||
1251 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1252 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_REFERENCE) ||
1253 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1254 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
1255 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_ENABLE) ||
1256 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_FACTORS) ||
1257 fs_user_dirty(cmdbuf);
1258
1259 if (!dirty)
1260 return VK_SUCCESS;
1261
1262 struct cs_builder *b =
1263 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1264 const struct vk_dynamic_graphics_state *dyns =
1265 &cmdbuf->vk.dynamic_graphics_state;
1266 const struct vk_depth_stencil_state *ds = &dyns->ds;
1267 const struct vk_rasterization_state *rs = &dyns->rs;
1268 bool test_s = has_stencil_att(cmdbuf) && ds->stencil.test_enable;
1269 bool test_z = has_depth_att(cmdbuf) && ds->depth.test_enable;
1270 const struct panvk_shader *fs = get_fs(cmdbuf);
1271
1272 struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL);
1273 if (!zsd.gpu)
1274 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1275
1276 pan_cast_and_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
1277 cfg.stencil_test_enable = test_s;
1278 if (test_s) {
1279 cfg.front_compare_function =
1280 translate_compare_func(ds->stencil.front.op.compare);
1281 cfg.front_stencil_fail =
1282 translate_stencil_op(ds->stencil.front.op.fail);
1283 cfg.front_depth_fail =
1284 translate_stencil_op(ds->stencil.front.op.depth_fail);
1285 cfg.front_depth_pass = translate_stencil_op(ds->stencil.front.op.pass);
1286 cfg.back_compare_function =
1287 translate_compare_func(ds->stencil.back.op.compare);
1288 cfg.back_stencil_fail = translate_stencil_op(ds->stencil.back.op.fail);
1289 cfg.back_depth_fail =
1290 translate_stencil_op(ds->stencil.back.op.depth_fail);
1291 cfg.back_depth_pass = translate_stencil_op(ds->stencil.back.op.pass);
1292 }
1293
1294 cfg.stencil_from_shader = fs ? fs->info.fs.writes_stencil : 0;
1295 cfg.front_write_mask = ds->stencil.front.write_mask;
1296 cfg.back_write_mask = ds->stencil.back.write_mask;
1297 cfg.front_value_mask = ds->stencil.front.compare_mask;
1298 cfg.back_value_mask = ds->stencil.back.compare_mask;
1299 cfg.front_reference_value = ds->stencil.front.reference;
1300 cfg.back_reference_value = ds->stencil.back.reference;
1301
1302 cfg.depth_cull_enable = vk_rasterization_state_depth_clip_enable(rs);
1303 if (rs->depth_clamp_enable)
1304 cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS;
1305
1306 if (fs)
1307 cfg.depth_source = pan_depth_source(&fs->info);
1308 cfg.depth_write_enable = test_z && ds->depth.write_enable;
1309 cfg.depth_bias_enable = rs->depth_bias.enable;
1310 cfg.depth_function = test_z ? translate_compare_func(ds->depth.compare_op)
1311 : MALI_FUNC_ALWAYS;
1312 cfg.depth_units = rs->depth_bias.constant_factor;
1313 cfg.depth_factor = rs->depth_bias.slope_factor;
1314 cfg.depth_bias_clamp = rs->depth_bias.clamp;
1315 }
1316
1317 cs_update_vt_ctx(b)
1318 cs_move64_to(b, cs_sr_reg64(b, 52), zsd.gpu);
1319
1320 return VK_SUCCESS;
1321 }
1322
1323 static VkResult
1324 wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf)
1325 {
1326 uint64_t last_syncobj = cmdbuf->state.gfx.render.oq.last;
1327
1328 if (!last_syncobj)
1329 return VK_SUCCESS;
1330
1331 uint64_t prev_oq_node = cmdbuf->state.gfx.render.oq.chain;
1332 struct panfrost_ptr new_oq_node = panvk_cmd_alloc_dev_mem(
1333 cmdbuf, desc, sizeof(struct panvk_cs_occlusion_query), 8);
1334
1335 if (!new_oq_node.gpu)
1336 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1337
1338 cmdbuf->state.gfx.render.oq.chain = new_oq_node.gpu;
1339
1340 struct panvk_cs_occlusion_query *oq = new_oq_node.cpu;
1341
1342 *oq = (struct panvk_cs_occlusion_query){
1343 .syncobj = last_syncobj,
1344 .next = prev_oq_node,
1345 };
1346
1347 /* If we already had an OQ in the chain, we don't need to initialize the
1348 * oq_chain field in the subqueue ctx. */
1349 if (prev_oq_node)
1350 return VK_SUCCESS;
1351
1352 /* If we're a secondary cmdbuf inside a render pass, we let the primary
1353 * cmdbuf link the OQ chain. */
1354 if (cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)
1355 return VK_SUCCESS;
1356
1357 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1358 struct cs_index oq_node_reg = cs_scratch_reg64(b, 0);
1359
1360 cs_move64_to(b, oq_node_reg, new_oq_node.gpu);
1361
1362 /* If we're resuming, we need to link with the previous oq_chain, if any. */
1363 if (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT) {
1364 struct cs_index prev_oq_node_reg = cs_scratch_reg64(b, 2);
1365
1366 cs_load64_to(
1367 b, prev_oq_node_reg, cs_subqueue_ctx_reg(b),
1368 offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1369 cs_wait_slot(b, SB_ID(LS), false);
1370 cs_store64(b, prev_oq_node_reg, oq_node_reg,
1371 offsetof(struct panvk_cs_occlusion_query, next));
1372 cs_wait_slot(b, SB_ID(LS), false);
1373 }
1374
1375 cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b),
1376 offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1377 cs_wait_slot(b, SB_ID(LS), false);
1378 return VK_SUCCESS;
1379 }
1380
1381 static VkResult
1382 prepare_oq(struct panvk_cmd_buffer *cmdbuf)
1383 {
1384 if (!gfx_state_dirty(cmdbuf, OQ) ||
1385 cmdbuf->state.gfx.occlusion_query.syncobj ==
1386 cmdbuf->state.gfx.render.oq.last)
1387 return VK_SUCCESS;
1388
1389 VkResult result = wrap_prev_oq(cmdbuf);
1390 if (result)
1391 return result;
1392
1393 struct cs_builder *b =
1394 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1395 cs_move64_to(b, cs_sr_reg64(b, 46), cmdbuf->state.gfx.occlusion_query.ptr);
1396
1397 cmdbuf->state.gfx.render.oq.last =
1398 cmdbuf->state.gfx.occlusion_query.syncobj;
1399 return VK_SUCCESS;
1400 }
1401
1402 static void
1403 prepare_dcd(struct panvk_cmd_buffer *cmdbuf)
1404 {
1405 struct cs_builder *b =
1406 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1407 const struct panvk_shader *fs = get_fs(cmdbuf);
1408 bool dcd0_dirty =
1409 dyn_gfx_state_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||
1410 dyn_gfx_state_dirty(cmdbuf, RS_CULL_MODE) ||
1411 dyn_gfx_state_dirty(cmdbuf, RS_FRONT_FACE) ||
1412 dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1413 dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1414 dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) ||
1415 dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
1416 /* writes_depth() uses vk_depth_stencil_state */
1417 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1418 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1419 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1420 /* writes_stencil() uses vk_depth_stencil_state */
1421 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1422 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1423 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1424 fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE) ||
1425 gfx_state_dirty(cmdbuf, OQ);
1426 bool dcd1_dirty = dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1427 dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1428 fs_user_dirty(cmdbuf) ||
1429 gfx_state_dirty(cmdbuf, RENDER_STATE);
1430
1431 const struct vk_dynamic_graphics_state *dyns =
1432 &cmdbuf->vk.dynamic_graphics_state;
1433 const struct vk_rasterization_state *rs =
1434 &cmdbuf->vk.dynamic_graphics_state.rs;
1435 bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
1436 bool writes_z = writes_depth(cmdbuf);
1437 bool writes_s = writes_stencil(cmdbuf);
1438
1439 if (dcd0_dirty) {
1440 struct mali_dcd_flags_0_packed dcd0;
1441 pan_pack(&dcd0, DCD_FLAGS_0, cfg) {
1442 if (fs) {
1443 uint8_t rt_written = fs->info.outputs_written >> FRAG_RESULT_DATA0;
1444 uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
1445 MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
1446
1447 cfg.allow_forward_pixel_to_kill =
1448 fs->info.fs.can_fpk && !(rt_mask & ~rt_written) &&
1449 !alpha_to_coverage && !cmdbuf->state.gfx.cb.info.any_dest_read;
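            /* Forward pixel kill is only allowed when the FS supports it and
             * writes every bound color target, alpha-to-coverage is off, and
             * no blend equation reads the destination. */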
1450
1451 bool writes_zs = writes_z || writes_s;
1452 bool zs_always_passes = ds_test_always_passes(cmdbuf);
1453 bool oq = cmdbuf->state.gfx.occlusion_query.mode !=
1454 MALI_OCCLUSION_MODE_DISABLED;
1455
1456 struct pan_earlyzs_state earlyzs =
1457 pan_earlyzs_get(pan_earlyzs_analyze(&fs->info), writes_zs || oq,
1458 alpha_to_coverage, zs_always_passes);
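         /* The early-ZS helper picks the pixel-kill/ZS-update operations from
          * the FS analysis; roughly, ZS/OQ writes and alpha-to-coverage push
          * the decision later, while an always-passing ZS test allows more
          * aggressive early modes. */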
1459
1460 cfg.pixel_kill_operation = earlyzs.kill;
1461 cfg.zs_update_operation = earlyzs.update;
1462 cfg.evaluate_per_sample = fs->info.fs.sample_shading;
1463 } else {
1464 cfg.allow_forward_pixel_to_kill = true;
1465 cfg.allow_forward_pixel_to_be_killed = true;
1466 cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1467 cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
1468 cfg.overdraw_alpha0 = true;
1469 cfg.overdraw_alpha1 = true;
1470 }
1471
1472 cfg.front_face_ccw = rs->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
1473 cfg.cull_front_face = (rs->cull_mode & VK_CULL_MODE_FRONT_BIT) != 0;
1474 cfg.cull_back_face = (rs->cull_mode & VK_CULL_MODE_BACK_BIT) != 0;
1475
1476 cfg.multisample_enable = dyns->ms.rasterization_samples > 1;
1477 cfg.occlusion_query = cmdbuf->state.gfx.occlusion_query.mode;
1478 cfg.alpha_to_coverage = alpha_to_coverage;
1479 }
1480
1481 cs_update_vt_ctx(b)
1482 cs_move32_to(b, cs_sr_reg32(b, 57), dcd0.opaque[0]);
1483 }
1484
1485 if (dcd1_dirty) {
1486 struct mali_dcd_flags_1_packed dcd1;
1487 pan_pack(&dcd1, DCD_FLAGS_1, cfg) {
1488 cfg.sample_mask = dyns->ms.rasterization_samples > 1
1489 ? dyns->ms.sample_mask
1490 : UINT16_MAX;
1491
1492 if (fs) {
1493 cfg.render_target_mask =
1494 (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
1495 cmdbuf->state.gfx.render.bound_attachments;
1496 }
1497 }
1498
1499 cs_update_vt_ctx(b)
1500 cs_move32_to(b, cs_sr_reg32(b, 58), dcd1.opaque[0]);
1501 }
1502 }
1503
1504 static void
1505 prepare_index_buffer(struct panvk_cmd_buffer *cmdbuf,
1506 struct panvk_draw_info *draw)
1507 {
1508 struct cs_builder *b =
1509 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1510
1511 if (draw->index.size && gfx_state_dirty(cmdbuf, IB)) {
1512 uint64_t ib_size =
1513 panvk_buffer_range(cmdbuf->state.gfx.ib.buffer,
1514 cmdbuf->state.gfx.ib.offset, VK_WHOLE_SIZE);
1515 assert(ib_size <= UINT32_MAX);
1516 cs_move32_to(b, cs_sr_reg32(b, 39), ib_size);
1517
1518 cs_move64_to(b, cs_sr_reg64(b, 54),
1519 panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer,
1520 cmdbuf->state.gfx.ib.offset));
1521 }
1522 }
1523
1524 static void
1525 set_tiler_idvs_flags(struct cs_builder *b, struct panvk_cmd_buffer *cmdbuf,
1526 struct panvk_draw_info *draw)
1527 {
1528 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1529 const struct panvk_shader *fs = get_fs(cmdbuf);
1530 const struct vk_dynamic_graphics_state *dyns =
1531 &cmdbuf->vk.dynamic_graphics_state;
1532 const struct vk_input_assembly_state *ia = &dyns->ia;
1533 const struct vk_rasterization_state *rs = &dyns->rs;
1534 struct mali_primitive_flags_packed tiler_idvs_flags;
1535
1536 /* When drawing non-point primitives, we use the no_psiz variant, which has
1537 * the point size writes patched out. */
1538 bool writes_point_size =
1539 vs->info.vs.writes_point_size &&
1540 ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
1541 bool multiview = cmdbuf->state.gfx.render.view_mask;
1542 bool writes_layer = vs->info.outputs_written & VARYING_BIT_LAYER;
1543
1544 /* Multiview shaders depend on the FIFO format for indexing per-view
1545 * output writes. We don't currently patch these offsets in the no_psiz
1546 * variant, so we still need the extended format even though the shader
1547 * does not write point size. */
1548 bool extended_fifo = writes_point_size || writes_layer ||
1549 (vs->info.vs.writes_point_size && multiview);
1550
1551 bool dirty = gfx_state_dirty(cmdbuf, VS) || fs_user_dirty(cmdbuf) ||
1552 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_RESTART_ENABLE) ||
1553 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) ||
1554 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1555 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE);
1556
1557 if (dirty) {
1558 pan_pack(&tiler_idvs_flags, PRIMITIVE_FLAGS, cfg) {
1559 cfg.draw_mode = translate_prim_topology(ia->primitive_topology);
1560
1561 cfg.point_size_array_format = writes_point_size
1562 ? MALI_POINT_SIZE_ARRAY_FORMAT_FP16
1563 : MALI_POINT_SIZE_ARRAY_FORMAT_NONE;
1564 cfg.layer_index_enable = writes_layer;
1565
1566 cfg.position_fifo_format = extended_fifo
1567 ? MALI_FIFO_FORMAT_EXTENDED
1568 : MALI_FIFO_FORMAT_BASIC;
1569
1570 cfg.low_depth_cull = cfg.high_depth_cull =
1571 vk_rasterization_state_depth_clip_enable(rs);
1572
1573 cfg.secondary_shader = vs->info.vs.secondary_enable && fs != NULL;
1574 cfg.primitive_restart = ia->primitive_restart_enable;
1575 cfg.view_mask = cmdbuf->state.gfx.render.view_mask;
1576 }
1577
1578 cs_move32_to(b, cs_sr_reg32(b, 56), tiler_idvs_flags.opaque[0]);
1579 }
1580 }
1581
1582 static struct mali_primitive_flags_packed
1583 get_tiler_flags_override(struct panvk_draw_info *draw)
1584 {
1585 struct mali_primitive_flags_packed flags_override;
1586 /* Pack with nodefaults so only explicitly set override fields affect the
1587 * previously set register values */
1588 pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
1589 cfg.index_type = index_size_to_index_type(draw->index.size);
1590 };
1591
1592 return flags_override;
1593 }
1594
1595 static VkResult
1596 prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1597 {
1598 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1599 const struct panvk_shader *fs = get_fs(cmdbuf);
1600 struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1601 bool idvs = vs->info.vs.idvs;
1602 VkResult result;
1603
1604 assert(vs);
1605
1606 /* FIXME: support non-IDVS. */
1607 assert(idvs);
1608
1609 set_provoking_vertex_mode(cmdbuf);
1610
1611 result = update_tls(cmdbuf);
1612 if (result != VK_SUCCESS)
1613 return result;
1614
1615 if (!inherits_render_ctx(cmdbuf)) {
1616 result = get_render_ctx(cmdbuf);
1617 if (result != VK_SUCCESS)
1618 return result;
1619 }
1620
1621 struct cs_builder *b =
1622 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1623
1624 uint32_t used_set_mask =
1625 vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);
1626
1627 if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS) ||
1628 gfx_state_dirty(cmdbuf, FS)) {
1629 result = panvk_per_arch(cmd_prepare_push_descs)(cmdbuf, desc_state,
1630 used_set_mask);
1631 if (result != VK_SUCCESS)
1632 return result;
1633 }
1634
1635 result = prepare_blend(cmdbuf);
1636 if (result != VK_SUCCESS)
1637 return result;
1638
1639 panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
1640
1641 result = prepare_push_uniforms(cmdbuf);
1642 if (result != VK_SUCCESS)
1643 return result;
1644
1645 result = prepare_vs(cmdbuf);
1646 if (result != VK_SUCCESS)
1647 return result;
1648
1649 result = prepare_fs(cmdbuf);
1650 if (result != VK_SUCCESS)
1651 return result;
1652
1653 uint32_t varying_size = 0;
1654
1655 if (fs) {
1656 unsigned vs_vars = vs->info.varyings.output_count;
1657 unsigned fs_vars = fs->info.varyings.input_count;
1658 unsigned var_slots = MAX2(vs_vars, fs_vars);
1659
1660 /* Assumes 16 byte slots. We could do better. */
1661 varying_size = var_slots * 16;
1662 }
1663
1664 cs_update_vt_ctx(b) {
1665 /* We don't use the resource dep system yet. */
1666 cs_move32_to(b, cs_sr_reg32(b, 38), 0);
1667
1668 prepare_index_buffer(cmdbuf, draw);
1669
1670 set_tiler_idvs_flags(b, cmdbuf, draw);
1671
1672 cs_move32_to(b, cs_sr_reg32(b, 48), varying_size);
1673
1674 result = prepare_ds(cmdbuf);
1675 if (result != VK_SUCCESS)
1676 return result;
1677
1678 result = prepare_oq(cmdbuf);
1679 if (result != VK_SUCCESS)
1680 return result;
1681
1682 prepare_dcd(cmdbuf);
1683 prepare_vp(cmdbuf);
1684 prepare_tiler_primitive_size(cmdbuf);
1685 }
1686
1687 clear_dirty_after_draw(cmdbuf);
1688 return VK_SUCCESS;
1689 }
1690
1691 static void
1692 panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1693 {
1694 const struct cs_tracing_ctx *tracing_ctx =
1695 &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1696 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1697 struct cs_builder *b =
1698 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1699 VkResult result;
1700
1701 /* If there's no vertex shader, we can skip the draw. */
1702 if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1703 return;
1704
1705 /* Needs to be done before get_fs() is called because it depends on
1706 * fs.required being initialized. */
1707 cmdbuf->state.gfx.fs.required =
1708 fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1709
1710 if (!cmdbuf->vk.dynamic_graphics_state.rs.rasterizer_discard_enable) {
1711 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1712 uint32_t rasterization_samples =
1713 cmdbuf->vk.dynamic_graphics_state.ms.rasterization_samples;
1714
1715 /* If there's no attachment, we patch nr_samples to match
1716 * rasterization_samples; otherwise we assert that the two values match.
1717 */
1718 if (!cmdbuf->state.gfx.render.bound_attachments) {
1719 assert(rasterization_samples > 0);
1720 fbinfo->nr_samples = rasterization_samples;
1721 } else {
1722 assert(rasterization_samples == fbinfo->nr_samples);
1723 }
1724 }
1725
1726 result = prepare_draw(cmdbuf, draw);
1727 if (result != VK_SUCCESS)
1728 return;
1729
1730 cs_update_vt_ctx(b) {
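/* Direct draw parameters: SR33 <- vertex/index count, SR34 <- instance
 * count, SR35 <- index offset, SR36 <- base vertex, SR37 <- base instance
 * (forced to zero, see the note below); SR32 is simply cleared. */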
1731 cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1732 cs_move32_to(b, cs_sr_reg32(b, 33), draw->vertex.count);
1733 cs_move32_to(b, cs_sr_reg32(b, 34), draw->instance.count);
1734 cs_move32_to(b, cs_sr_reg32(b, 35), draw->index.offset);
1735 cs_move32_to(b, cs_sr_reg32(b, 36), draw->vertex.base);
1736 /* NIR expects a zero-based instance ID, but even if it had an intrinsic to
1737 * load the absolute instance ID, we'd want to keep it zero-based to work around
1738 * Mali's limitation on non-zero firstInstance when an instance divisor is used.
1739 */
1740 cs_move32_to(b, cs_sr_reg32(b, 37), 0);
1741 }
1742
1743 struct mali_primitive_flags_packed flags_override =
1744 get_tiler_flags_override(draw);
1745
1746 uint32_t idvs_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1747 MAX_LAYERS_PER_TILER_DESC);
1748
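/* A tiler descriptor covers at most MAX_LAYERS_PER_TILER_DESC layers, so
 * layered rendering may need one RUN_IDVS per descriptor: the loop below
 * advances the tiler context pointer (SR40) by one TILER_CONTEXT per
 * iteration and rewinds it once every layer group has been submitted. */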
1749 cs_req_res(b, CS_IDVS_RES);
1750 if (idvs_count > 1) {
1751 struct cs_index counter_reg = cs_scratch_reg32(b, 17);
1752 struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
1753
1754 cs_move32_to(b, counter_reg, idvs_count);
1755
1756 cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
1757 cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1758 flags_override.opaque[0], false, true,
1759 cs_shader_res_sel(0, 0, 1, 0),
1760 cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1761
1762 cs_add32(b, counter_reg, counter_reg, -1);
1763 cs_update_vt_ctx(b) {
1764 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1765 pan_size(TILER_CONTEXT));
1766 }
1767 }
1768
1769 cs_update_vt_ctx(b) {
1770 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1771 -(idvs_count * pan_size(TILER_CONTEXT)));
1772 }
1773 } else {
1774 cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1775 flags_override.opaque[0], false, true,
1776 cs_shader_res_sel(0, 0, 1, 0),
1777 cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1778 }
1779 cs_req_res(b, 0);
1780 }
1781
1782 VkResult
1783 panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(
1784 struct panvk_cmd_buffer *primary,
1785 struct panvk_cmd_buffer *secondary)
1786 {
1787 if (!(secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1788 return VK_SUCCESS;
1789
1790 if (!inherits_render_ctx(primary)) {
1791 VkResult result = get_render_ctx(primary);
1792 if (result != VK_SUCCESS)
1793 return result;
1794 }
1795
1796 return prepare_oq(primary);
1797 }
1798
1799 VKAPI_ATTR void VKAPI_CALL
1800 panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount,
1801 uint32_t instanceCount, uint32_t firstVertex,
1802 uint32_t firstInstance)
1803 {
1804 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1805
1806 if (instanceCount == 0 || vertexCount == 0)
1807 return;
1808
1809 /* gl_BaseVertexARB is a signed integer, and it should expose the value of
1810 * firstVertex in a non-indexed draw. */
1811 assert(firstVertex < INT32_MAX);
1812
1813 /* gl_BaseInstance is a signed integer, and it should expose the value of
1814 * firstInstance. */
1815 assert(firstInstance < INT32_MAX);
1816
1817 struct panvk_draw_info draw = {
1818 .vertex.base = firstVertex,
1819 .vertex.count = vertexCount,
1820 .instance.base = firstInstance,
1821 .instance.count = instanceCount,
1822 };
1823
1824 panvk_cmd_draw(cmdbuf, &draw);
1825 }
1826
1827 VKAPI_ATTR void VKAPI_CALL
1828 panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer,
1829 uint32_t indexCount, uint32_t instanceCount,
1830 uint32_t firstIndex, int32_t vertexOffset,
1831 uint32_t firstInstance)
1832 {
1833 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1834
1835 if (instanceCount == 0 || indexCount == 0)
1836 return;
1837
1838 /* gl_BaseInstance is a signed integer, and it should expose the value of
1839 * firstInstance. */
1840 assert(firstInstance < INT32_MAX);
1841
1842 struct panvk_draw_info draw = {
1843 .index.size = cmdbuf->state.gfx.ib.index_size,
1844 .index.offset = firstIndex,
1845 .vertex.base = vertexOffset,
1846 .vertex.count = indexCount,
1847 .instance.count = instanceCount,
1848 .instance.base = firstInstance,
1849 };
1850
1851 panvk_cmd_draw(cmdbuf, &draw);
1852 }
1853
1854 static void
1855 panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
1856 struct panvk_draw_info *draw)
1857 {
1858 const struct cs_tracing_ctx *tracing_ctx =
1859 &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1860 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1861 struct cs_builder *b =
1862 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1863 VkResult result;
1864
1865 /* If there's no vertex shader, we can skip the draw. */
1866 if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1867 return;
1868
1869 /* Needs to be done before get_fs() is called because it depends on
1870 * fs.required being initialized. */
1871 cmdbuf->state.gfx.fs.required =
1872 fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1873
1874 /* Layered indirect draw (VK_EXT_shader_viewport_index_layer) needs
1875 * additional changes. We allow layer_count == 0 because that happens
1876 * when mixing dynamic rendering and secondary command buffers. Once
1877 * we decide to support layered+indirect, we'll need to pass the
1878 * layer_count info through the tiler descriptor, for instance by
1879 * re-using one of the words flagged 'ignored' in the descriptor
1880 * (words 14:23).
1881 *
1882 * Multiview is limited to 8 layers, and so will always fit in one TD.
1883 * Therefore layered rendering is allowed with multiview. */
1884 assert(cmdbuf->state.gfx.render.layer_count <= 1 ||
1885 cmdbuf->state.gfx.render.view_mask);
1886
1887 /* MultiDrawIndirect (.maxDrawIndirectCount) needs additional changes. */
1888 assert(draw->indirect.draw_count == 1);
1889
1890 /* Force a new push uniform block to be allocated */
1891 gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
1892
1893 result = prepare_draw(cmdbuf, draw);
1894 if (result != VK_SUCCESS)
1895 return;
1896
1897 struct cs_index draw_params_addr = cs_scratch_reg64(b, 0);
1898 cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr);
1899
1900 cs_update_vt_ctx(b) {
1901 cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1902 /* Load SR33-37 from indirect buffer. */
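/* The five VkDrawIndexedIndirectCommand words map onto SR33-37 in order
 * (index count, instance count, first index, vertex offset, first
 * instance); non-indexed VkDrawIndirectCommand has no firstIndex word,
 * hence the 0b11011 mask that leaves SR35 untouched. */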
1903 unsigned reg_mask = draw->index.size ? 0b11111 : 0b11011;
1904 cs_load_to(b, cs_sr_reg_tuple(b, 33, 5), draw_params_addr, reg_mask, 0);
1905 }
1906
1907 /* Wait for the SR33-37 indirect buffer load. */
1908 cs_wait_slot(b, SB_ID(LS), false);
1909
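/* For indirect draws, firstVertex/firstInstance are only known at CS
 * execution time, so patch them straight into the VS push-uniform (FAU)
 * block, freshly allocated thanks to the VS_PUSH_UNIFORMS dirty flag set
 * above, using the values just loaded into SR36/SR37. */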
1910 if (shader_uses_sysval(vs, graphics, vs.first_vertex) ||
1911 shader_uses_sysval(vs, graphics, vs.base_instance)) {
1912 struct cs_index fau_block_addr = cs_scratch_reg64(b, 2);
1913 cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.vs.push_uniforms);
1914
1915 if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
1916 cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr,
1917 shader_remapped_sysval_offset(
1918 vs, sysval_offset(graphics, vs.first_vertex)));
1919 }
1920
1921 if (shader_uses_sysval(vs, graphics, vs.base_instance)) {
1922 cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr,
1923 shader_remapped_sysval_offset(
1924 vs, sysval_offset(graphics, vs.base_instance)));
1925 }
1926
1927 /* Wait for the store using SR37 as a source to finish, so we can overwrite
1928 * it. */
1929 cs_wait_slot(b, SB_ID(LS), false);
1930 }
1931
1932 /* NIR expects a zero-based instance ID, but even if it had an intrinsic to
1933 * load the absolute instance ID, we'd want to keep it zero-based to work around
1934 * Mali's limitation on non-zero firstInstance when an instance divisor is used.
1935 */
1936 cs_update_vt_ctx(b)
1937 cs_move32_to(b, cs_sr_reg32(b, 37), 0);
1938
1939 struct mali_primitive_flags_packed flags_override =
1940 get_tiler_flags_override(draw);
1941
1942 cs_req_res(b, CS_IDVS_RES);
1943 cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1944 flags_override.opaque[0], false, true,
1945 cs_shader_res_sel(0, 0, 1, 0),
1946 cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1947 cs_req_res(b, 0);
1948 }
1949
1950 VKAPI_ATTR void VKAPI_CALL
1951 panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer,
1952 VkDeviceSize offset, uint32_t drawCount,
1953 uint32_t stride)
1954 {
1955 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1956 VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
1957
1958 if (drawCount == 0)
1959 return;
1960
1961 struct panvk_draw_info draw = {
1962 .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
1963 .indirect.draw_count = drawCount,
1964 .indirect.stride = stride,
1965 };
1966
1967 panvk_cmd_draw_indirect(cmdbuf, &draw);
1968 }
1969
1970 VKAPI_ATTR void VKAPI_CALL
1971 panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer,
1972 VkBuffer _buffer, VkDeviceSize offset,
1973 uint32_t drawCount, uint32_t stride)
1974 {
1975 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1976 VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
1977
1978 if (drawCount == 0)
1979 return;
1980
1981 struct panvk_draw_info draw = {
1982 .index.size = cmdbuf->state.gfx.ib.index_size,
1983 .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
1984 .indirect.draw_count = drawCount,
1985 .indirect.stride = stride,
1986 };
1987
1988 panvk_cmd_draw_indirect(cmdbuf, &draw);
1989 }
1990
1991 void
1992 panvk_per_arch(cmd_inherit_render_state)(
1993 struct panvk_cmd_buffer *cmdbuf,
1994 const VkCommandBufferBeginInfo *pBeginInfo)
1995 {
1996 if (cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
1997 !(pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1998 return;
1999
2000 assert(pBeginInfo->pInheritanceInfo);
2001 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
2002 const VkRenderingInfo *resume_info =
2003 vk_get_command_buffer_inheritance_as_rendering_resume(cmdbuf->vk.level,
2004 pBeginInfo,
2005 gcbiar_data);
2006 if (resume_info) {
2007 panvk_per_arch(cmd_init_render_state)(cmdbuf, resume_info);
2008 return;
2009 }
2010
2011 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
2012 vk_get_command_buffer_inheritance_rendering_info(cmdbuf->vk.level,
2013 pBeginInfo);
2014 assert(inheritance_info);
2015 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2016 struct panvk_physical_device *phys_dev =
2017 to_panvk_physical_device(dev->vk.physical);
2018 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2019
2020 cmdbuf->state.gfx.render.flags = inheritance_info->flags;
2021
2022 gfx_state_set_dirty(cmdbuf, RENDER_STATE);
2023 memset(cmdbuf->state.gfx.render.fb.crc_valid, 0,
2024 sizeof(cmdbuf->state.gfx.render.fb.crc_valid));
2025 memset(&cmdbuf->state.gfx.render.color_attachments, 0,
2026 sizeof(cmdbuf->state.gfx.render.color_attachments));
2027 memset(&cmdbuf->state.gfx.render.z_attachment, 0,
2028 sizeof(cmdbuf->state.gfx.render.z_attachment));
2029 memset(&cmdbuf->state.gfx.render.s_attachment, 0,
2030 sizeof(cmdbuf->state.gfx.render.s_attachment));
2031 cmdbuf->state.gfx.render.bound_attachments = 0;
2032
2033 cmdbuf->state.gfx.render.view_mask = inheritance_info->viewMask;
2034 cmdbuf->state.gfx.render.layer_count = inheritance_info->viewMask ?
2035 util_last_bit(inheritance_info->viewMask) :
2036 0;
2037 *fbinfo = (struct pan_fb_info){
2038 .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
2039 .nr_samples = inheritance_info->rasterizationSamples,
2040 .rt_count = inheritance_info->colorAttachmentCount,
2041 };
2042
2043 assert(inheritance_info->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts));
2044
2045 for (uint32_t i = 0; i < inheritance_info->colorAttachmentCount; i++) {
2046 cmdbuf->state.gfx.render.bound_attachments |=
2047 MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
2048 cmdbuf->state.gfx.render.color_attachments.fmts[i] =
2049 inheritance_info->pColorAttachmentFormats[i];
2050 cmdbuf->state.gfx.render.color_attachments.samples[i] =
2051 inheritance_info->rasterizationSamples;
2052 }
2053
2054 if (inheritance_info->depthAttachmentFormat) {
2055 cmdbuf->state.gfx.render.bound_attachments |=
2056 MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2057 cmdbuf->state.gfx.render.z_attachment.fmt =
2058 inheritance_info->depthAttachmentFormat;
2059 }
2060
2061 if (inheritance_info->stencilAttachmentFormat) {
2062 cmdbuf->state.gfx.render.bound_attachments |=
2063 MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2064 cmdbuf->state.gfx.render.s_attachment.fmt =
2065 inheritance_info->stencilAttachmentFormat;
2066 }
2067
2068 const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
2069 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
2070 .colorAttachmentCount = inheritance_info->colorAttachmentCount,
2071 };
2072 const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
2073 vk_get_command_buffer_rendering_attachment_location_info(
2074 cmdbuf->vk.level, pBeginInfo);
2075 if (att_loc_info == NULL)
2076 att_loc_info = &att_loc_info_default;
2077
2078 vk_cmd_set_rendering_attachment_locations(&cmdbuf->vk, att_loc_info);
2079 }
2080
2081 VKAPI_ATTR void VKAPI_CALL
2082 panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer,
2083 const VkRenderingInfo *pRenderingInfo)
2084 {
2085 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2086 struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
2087 bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
2088
2089 panvk_per_arch(cmd_init_render_state)(cmdbuf, pRenderingInfo);
2090
2091 /* If we're not resuming, the FBD should be NULL. */
2092 assert(!state->render.fbds.gpu || resuming);
2093
2094 if (!resuming)
2095 panvk_per_arch(cmd_preload_render_area_border)(cmdbuf, pRenderingInfo);
2096 }
2097
2098 static void
2099 flush_tiling(struct panvk_cmd_buffer *cmdbuf)
2100 {
2101 struct cs_builder *b =
2102 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
2103
2104 struct cs_index render_ctx = cs_scratch_reg64(b, 2);
2105
2106 if (cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) {
2107 /* Flush the tiling operations and signal the internal sync object. */
2108 cs_req_res(b, CS_TILER_RES);
2109 cs_finish_tiling(b, false);
2110 cs_req_res(b, 0);
2111
2112 struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2113 struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2114 struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2115 struct cs_index add_val = cs_scratch_reg64(b, 4);
2116
2117 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2118 BITFIELD_MASK(3),
2119 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2120 cs_wait_slot(b, SB_ID(LS), false);
2121
2122 /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to
2123 * skip an ADD operation on the syncobjs pointer. */
2124 STATIC_ASSERT(PANVK_SUBQUEUE_VERTEX_TILER == 0);
2125
2126 cs_move64_to(b, add_val, 1);
2127
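/* iter_sb selects which iteration scoreboard the deferred heap operation
 * and syncobj increment wait on; each case also bumps iter_sb to the next
 * slot for the following iteration. */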
2128 cs_match(b, iter_sb, cmp_scratch) {
2129 #define CASE(x) \
2130 cs_case(b, x) { \
2131 cs_heap_operation(b, \
2132 MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, \
2133 cs_defer(SB_WAIT_ITER(x), \
2134 SB_ID(DEFERRED_SYNC))); \
2135 cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \
2136 add_val, sync_addr, \
2137 cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \
2138 cs_move32_to(b, iter_sb, next_iter_sb(x)); \
2139 }
2140
2141 CASE(0)
2142 CASE(1)
2143 CASE(2)
2144 CASE(3)
2145 CASE(4)
2146 #undef CASE
2147 }
2148
2149 cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2150 offsetof(struct panvk_cs_subqueue_context, iter_sb));
2151 cs_wait_slot(b, SB_ID(LS), false);
2152
2153 /* Update the vertex seqno. */
2154 ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2155 } else {
2156 cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b),
2157 offsetof(struct panvk_cs_subqueue_context, render));
2158 cs_wait_slot(b, SB_ID(LS), false);
2159 }
2160 }
2161
2162 static void
2163 wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
2164 {
2165 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2166 struct cs_index vt_sync_addr = cs_scratch_reg64(b, 0);
2167 struct cs_index vt_sync_point = cs_scratch_reg64(b, 2);
2168 uint64_t rel_vt_sync_point =
2169 cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2170
2171 cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b),
2172 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2173 cs_wait_slot(b, SB_ID(LS), false);
2174
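/* Wait on the vertex/tiler syncobj: the absolute sequence number is the
 * queue's progress seqno register plus the relative sync point accumulated
 * while recording this command buffer. */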
2175 cs_add64(b, vt_sync_point,
2176 cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER),
2177 rel_vt_sync_point);
2178 cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, vt_sync_point,
2179 vt_sync_addr);
2180 }
2181
2182 static uint32_t
2183 calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
2184 {
2185 const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
2186 bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
2187 uint32_t rt_count = MAX2(fb->rt_count, 1);
2188
2189 return get_tiler_oom_handler_idx(has_zs_ext, rt_count);
2190 }
2191
2192 static void
2193 setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
2194 {
2195 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2196
2197 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2198 MAX_LAYERS_PER_TILER_DESC);
2199 uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2200 uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
2201
2202 struct cs_index counter = cs_scratch_reg32(b, 1);
2203 cs_move32_to(b, counter, 0);
2204 cs_store32(b, counter, cs_subqueue_ctx_reg(b),
2205 TILER_OOM_CTX_FIELD_OFFSET(counter));
2206
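/* The incremental-render FBDs follow the primary per-layer FBDs, one array
 * of layer_count descriptors per IR pass, hence the
 * (1 + pass) * fbd_ir_pass_offset offsets from the base FBD pointer in
 * SR40. */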
2207 struct cs_index fbd_first = cs_scratch_reg64(b, 2);
2208 cs_add64(b, fbd_first, cs_sr_reg64(b, 40),
2209 (1 + PANVK_IR_FIRST_PASS) * fbd_ir_pass_offset);
2210 cs_store64(b, fbd_first, cs_subqueue_ctx_reg(b),
2211 TILER_OOM_CTX_FBDPTR_OFFSET(FIRST));
2212 struct cs_index fbd_middle = cs_scratch_reg64(b, 4);
2213 cs_add64(b, fbd_middle, cs_sr_reg64(b, 40),
2214 (1 + PANVK_IR_MIDDLE_PASS) * fbd_ir_pass_offset);
2215 cs_store64(b, fbd_middle, cs_subqueue_ctx_reg(b),
2216 TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE));
2217 struct cs_index fbd_last = cs_scratch_reg64(b, 6);
2218 cs_add64(b, fbd_last, cs_sr_reg64(b, 40),
2219 (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2220 cs_store64(b, fbd_last, cs_subqueue_ctx_reg(b),
2221 TILER_OOM_CTX_FBDPTR_OFFSET(LAST));
2222
2223 struct cs_index td_count_reg = cs_scratch_reg32(b, 8);
2224 cs_move32_to(b, td_count_reg, td_count);
2225 cs_store32(b, td_count_reg, cs_subqueue_ctx_reg(b),
2226 TILER_OOM_CTX_FIELD_OFFSET(td_count));
2227 struct cs_index layer_count = cs_scratch_reg32(b, 9);
2228 cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
2229 cs_store32(b, layer_count, cs_subqueue_ctx_reg(b),
2230 TILER_OOM_CTX_FIELD_OFFSET(layer_count));
2231
2232 cs_wait_slot(b, SB_ID(LS), false);
2233 }
2234
2235 static VkResult
2236 issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
2237 {
2238 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2239 struct panvk_instance *instance =
2240 to_panvk_instance(dev->vk.physical->instance);
2241 const struct cs_tracing_ctx *tracing_ctx =
2242 &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
2243 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2244 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2245 bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
2246
2247 /* Reserve a scoreboard for the fragment job. */
2248 panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2249
2250 /* Now initialize the fragment bits. */
2251 cs_update_frag_ctx(b) {
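/* SR42/SR43 hold the render area bounding box, packed as (Y << 16) | X for
 * the min and max corners respectively. */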
2252 cs_move32_to(b, cs_sr_reg32(b, 42),
2253 (fbinfo->extent.miny << 16) | fbinfo->extent.minx);
2254 cs_move32_to(b, cs_sr_reg32(b, 43),
2255 (fbinfo->extent.maxy << 16) | fbinfo->extent.maxx);
2256 }
2257
2258 bool simul_use =
2259 cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2260
2261 /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
2262 * involved (clear job) or if the update can happen in place (not
2263 * simultaneous use of the command buffer), we can avoid the
2264 * copy. */
2265 bool needs_tiling =
2266 cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf);
2267
2268 /* If the command buffer can run in parallel on different queues, we need
2269 * to make sure each instance has its own descriptors, unless tiling is
2270 * not needed (AKA RUN_FRAGMENT used for clears), because then the FBD
2271 * descriptors are constant (no need to patch them at runtime). */
2272 bool free_render_descs = simul_use && needs_tiling;
2273 uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2274 uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
2275 uint32_t td_count = 0;
2276 if (needs_tiling) {
2277 td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2278 MAX_LAYERS_PER_TILER_DESC);
2279 }
2280
2281 /* Update the Tiler OOM context */
2282 setup_tiler_oom_ctx(cmdbuf);
2283
2284 /* Enable the oom handler before waiting for the vertex/tiler work.
2285 * At this point, the tiler oom context has been set up with the correct
2286 * state for this renderpass, so it's safe to enable. */
2287 struct cs_index addr_reg = cs_scratch_reg64(b, 0);
2288 struct cs_index length_reg = cs_scratch_reg32(b, 2);
2289 uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
2290 uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
2291 handler_idx * dev->tiler_oom.handler_stride;
2292 cs_move64_to(b, addr_reg, handler_addr);
2293 cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
2294 cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2295 length_reg);
2296
2297 /* Wait for the tiling to be done before submitting the fragment job. */
2298 wait_finish_tiling(cmdbuf);
2299
2300 /* Disable the oom handler once the vertex/tiler work has finished.
2301 * We need to disable the handler at this point as the vertex/tiler subqueue
2302 * might continue on to the next renderpass and hit an out-of-memory
2303 * exception prior to the fragment subqueue setting up the tiler oom context
2304 * for the next renderpass.
2305 * By disabling the handler here, any exception will be left pending until a
2306 * new handler is registered, at which point the correct state has been set
2307 * up. */
2308 cs_move64_to(b, addr_reg, 0);
2309 cs_move32_to(b, length_reg, 0);
2310 cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2311 length_reg);
2312
2313 /* Pick the correct set of FBDs based on whether an incremental render
2314 * occurred. */
2315 struct cs_index counter = cs_scratch_reg32(b, 0);
2316 cs_load32_to(
2317 b, counter, cs_subqueue_ctx_reg(b),
2318 offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter));
2319 cs_wait_slot(b, SB_ID(LS), false);
2320 cs_if(b, MALI_CS_CONDITION_GREATER, counter)
2321 cs_update_frag_ctx(b)
2322 cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40),
2323 (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2324
2325 /* Applications tend to forget to describe subpass dependencies, especially
2326 * when it comes to write -> read dependencies on attachments. The
2327 * proprietary driver forces "others" invalidation as a workaround, and this
2328 * invalidation even became implicit (done as part of the RUN_FRAGMENT) on
2329 * v13+. We don't do that in panvk, but we provide a debug flag to help
2330 * identify those issues. */
2331 if (unlikely(instance->debug_flags & PANVK_DEBUG_IMPLICIT_OTHERS_INV)) {
2332 cs_flush_caches(b, 0, 0, true, length_reg,
2333 cs_defer(0x0, SB_ID(IMM_FLUSH)));
2334 cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
2335 }
2336
2337 cs_req_res(b, CS_FRAG_RES);
2338 if (cmdbuf->state.gfx.render.layer_count > 1) {
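/* One RUN_FRAGMENT per enabled layer: SR47 is used as a countdown while the
 * per-layer FBD pointer in SR40 advances by fbd_sz after each run. */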
2339 struct cs_index layer_count = cs_sr_reg32(b, 47);
2340
2341 cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
2342 cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
2343 cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2344 false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2345
2346 cs_add32(b, layer_count, layer_count, -1);
2347 cs_update_frag_ctx(b)
2348 cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), fbd_sz);
2349 }
2350 } else {
2351 cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2352 false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2353 }
2354 cs_req_res(b, 0);
2355
2356 struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2357 struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2358 struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2359 struct cs_index add_val = cs_scratch_reg64(b, 4);
2360 struct cs_index add_val_lo = cs_scratch_reg32(b, 4);
2361 struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6);
2362 struct cs_index release_sz = cs_scratch_reg32(b, 8);
2363
2364 struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
2365 struct cs_index completed_top = cs_scratch_reg64(b, 10);
2366 struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
2367 struct cs_index cur_tiler = cs_sr_reg64(b, 38);
2368 struct cs_index tiler_count = cs_sr_reg32(b, 47);
2369 struct cs_index oq_chain = cs_scratch_reg64(b, 10);
2370 struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
2371 struct cs_index oq_chain_hi = cs_scratch_reg32(b, 11);
2372 struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);
2373
2374 cs_move64_to(b, add_val, 1);
2375 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2376 BITFIELD_MASK(3),
2377 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2378
2379 if (free_render_descs) {
2380 cs_move32_to(b, release_sz, calc_render_descs_size(cmdbuf));
2381 cs_load64_to(b, ringbuf_sync_addr, cs_subqueue_ctx_reg(b),
2382 offsetof(struct panvk_cs_subqueue_context,
2383 render.desc_ringbuf.syncobj));
2384 }
2385
2386 cs_wait_slot(b, SB_ID(LS), false);
2387
2388 cs_add64(b, sync_addr, sync_addr,
2389 PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
2390 cs_move32_to(b, tiler_count, td_count);
2391
2392 cs_match(b, iter_sb, cmp_scratch) {
2393 #define CASE(x) \
2394 cs_case(b, x) { \
2395 const struct cs_async_op async = \
2396 cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); \
2397 if (td_count == 1) { \
2398 cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
2399 cs_wait_slot(b, SB_ID(LS), false); \
2400 cs_finish_fragment(b, true, completed_top, completed_bottom, async); \
2401 } else if (td_count > 1) { \
2402 cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) { \
2403 cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
2404 cs_wait_slot(b, SB_ID(LS), false); \
2405 cs_finish_fragment(b, false, completed_top, completed_bottom, \
2406 async); \
2407 cs_update_frag_ctx(b) \
2408 cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT)); \
2409 cs_add32(b, tiler_count, tiler_count, -1); \
2410 } \
2411 cs_frag_end(b, async); \
2412 } \
2413 if (free_render_descs) { \
2414 cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz, \
2415 ringbuf_sync_addr, async); \
2416 } \
2417 if (has_oq_chain) { \
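/* Issue a deferred cache flush, reset the OQ chain head, then walk the  \
 * chain and signal each query's syncobj once the flush has landed. */   \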
2418 struct cs_index flush_id = oq_chain_lo; \
2419 cs_move32_to(b, flush_id, 0); \
2420 cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, \
2421 MALI_CS_FLUSH_MODE_CLEAN, false, flush_id, \
2422 cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_FLUSH))); \
2423 cs_load64_to( \
2424 b, oq_chain, cs_subqueue_ctx_reg(b), \
2425 offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
2426 cs_wait_slot(b, SB_ID(LS), false); \
2427 /* We use oq_syncobj as a placeholder to reset the oq_chain. */ \
2428 cs_move64_to(b, oq_syncobj, 0); \
2429 cs_store64( \
2430 b, oq_syncobj, cs_subqueue_ctx_reg(b), \
2431 offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
2432 cs_wait_slot(b, SB_ID(LS), false); \
2433 cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) { \
2434 cs_load64_to(b, oq_syncobj, oq_chain, \
2435 offsetof(struct panvk_cs_occlusion_query, syncobj)); \
2436 cs_wait_slot(b, SB_ID(LS), false); \
2437 cs_load64_to(b, oq_chain, oq_chain, \
2438 offsetof(struct panvk_cs_occlusion_query, next)); \
2439 cs_wait_slot(b, SB_ID(LS), false); \
2440 cs_sync32_set( \
2441 b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj, \
2442 cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); \
2443 cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_lo) \
2444 cs_continue(b); \
2445 cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_hi) \
2446 cs_continue(b); \
2447 cs_break(b); \
2448 } \
2449 } \
2450 cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, \
2451 async); \
2452 cs_move32_to(b, iter_sb, next_iter_sb(x)); \
2453 }
2454
2455 CASE(0)
2456 CASE(1)
2457 CASE(2)
2458 CASE(3)
2459 CASE(4)
2460 #undef CASE
2461 }
2462
2463 cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2464 offsetof(struct panvk_cs_subqueue_context, iter_sb));
2465 cs_wait_slot(b, SB_ID(LS), false);
2466
2467 /* Update the ring buffer position. */
2468 if (free_render_descs) {
2469 cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf),
2470 !tracing_ctx->enabled);
2471 }
2472
2473 /* Update the frag seqno. */
2474 ++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;
2475
2477 return VK_SUCCESS;
2478 }
2479
2480 void
2481 panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf)
2482 {
2483 /* If there was no draw queued, we don't need to force a preload. */
2484 if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2485 flush_tiling(cmdbuf);
2486 issue_fragment_jobs(cmdbuf);
2487 memset(&cmdbuf->state.gfx.render.fbds, 0,
2488 sizeof(cmdbuf->state.gfx.render.fbds));
2489 cmdbuf->state.gfx.render.tiler = 0;
2490
2491 panvk_per_arch(cmd_force_fb_preload)(cmdbuf, NULL);
2492
2493 /* If we inherited the render context, we need to let the primary command
2494 * buffer know that it has changed. */
2495 cmdbuf->state.gfx.render.invalidate_inherited_ctx = true;
2496
2497 /* Re-emit the FB/Tiler descs if we inherited them. */
2498 if (inherits_render_ctx(cmdbuf))
2499 get_render_ctx(cmdbuf);
2500 }
2501 }
2502
2503 VKAPI_ATTR void VKAPI_CALL
2504 panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer)
2505 {
2506 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2507 bool suspending = cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT;
2508 VkResult result;
2509
2510 if (!suspending) {
2511 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2512 bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s;
2513 for (unsigned i = 0; i < fbinfo->rt_count; i++)
2514 clear |= fbinfo->rts[i].clear;
2515
2516 if (clear && !inherits_render_ctx(cmdbuf)) {
2517 result = get_fb_descs(cmdbuf);
2518 if (result != VK_SUCCESS)
2519 return;
2520 }
2521
2522 /* Flush the last occlusion query before ending the render pass if
2523 * this query has ended while we were inside the render pass. */
2524 if (cmdbuf->state.gfx.render.oq.last !=
2525 cmdbuf->state.gfx.occlusion_query.syncobj) {
2526 result = wrap_prev_oq(cmdbuf);
2527 if (result != VK_SUCCESS)
2528 return;
2529 }
2530
2531 if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2532 flush_tiling(cmdbuf);
2533 issue_fragment_jobs(cmdbuf);
2534 }
2535 } else if (!inherits_render_ctx(cmdbuf)) {
2536 /* If we're suspending the render pass and we didn't inherit the render
2537 * context, we need to emit it now, so it's available when the render pass
2538 * is resumed. */
2539 VkResult result = get_render_ctx(cmdbuf);
2540 if (result != VK_SUCCESS)
2541 return;
2542 }
2543
2544 memset(&cmdbuf->state.gfx.render.fbds, 0,
2545 sizeof(cmdbuf->state.gfx.render.fbds));
2546 memset(&cmdbuf->state.gfx.render.oq, 0, sizeof(cmdbuf->state.gfx.render.oq));
2547 cmdbuf->state.gfx.render.tiler = 0;
2548
2549 /* If we're not suspending, we need to resolve attachments. */
2550 if (!suspending)
2551 panvk_per_arch(cmd_resolve_attachments)(cmdbuf);
2552 }
2553