1 /*
2 * Copyright © 2024 Collabora Ltd.
3 * Copyright © 2024 Arm Ltd.
4 *
5 * Derived from tu_cmd_buffer.c which is:
6 * Copyright © 2016 Red Hat.
7 * Copyright © 2016 Bas Nieuwenhuizen
8 * Copyright © 2015 Intel Corporation
9 *
10 * SPDX-License-Identifier: MIT
11 */
12
13 #include <stdint.h>
14 #include "genxml/gen_macros.h"
15
16 #include "panvk_buffer.h"
17 #include "panvk_cmd_alloc.h"
18 #include "panvk_cmd_buffer.h"
19 #include "panvk_cmd_desc_state.h"
20 #include "panvk_cmd_draw.h"
21 #include "panvk_cmd_fb_preload.h"
22 #include "panvk_cmd_meta.h"
23 #include "panvk_device.h"
24 #include "panvk_entrypoints.h"
25 #include "panvk_image.h"
26 #include "panvk_image_view.h"
27 #include "panvk_instance.h"
28 #include "panvk_priv_bo.h"
29 #include "panvk_shader.h"
30
31 #include "pan_desc.h"
32 #include "pan_earlyzs.h"
33 #include "pan_encoder.h"
34 #include "pan_format.h"
35 #include "pan_jc.h"
36 #include "pan_props.h"
37 #include "pan_samples.h"
38 #include "pan_shader.h"
39
40 #include "util/bitscan.h"
41 #include "vk_format.h"
42 #include "vk_meta.h"
43 #include "vk_pipeline_layout.h"
44 #include "vk_render_pass.h"
45
46 static void
47 emit_vs_attrib(const struct vk_vertex_attribute_state *attrib_info,
48 const struct vk_vertex_binding_state *buf_info,
49 const struct panvk_attrib_buf *buf, uint32_t vb_desc_offset,
50 struct mali_attribute_packed *desc)
51 {
52 bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE;
53 enum pipe_format f = vk_format_to_pipe_format(attrib_info->format);
54 unsigned buf_idx = vb_desc_offset + attrib_info->binding;
55
56 pan_pack(desc, ATTRIBUTE, cfg) {
57 cfg.offset = attrib_info->offset;
58 cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
59 cfg.table = 0;
60 cfg.buffer_index = buf_idx;
61 cfg.stride = buf_info->stride;
62 if (!per_instance) {
63 /* Per-vertex */
64 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
65 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
66 cfg.offset_enable = true;
67 } else if (buf_info->divisor == 1) {
68 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
69 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
70 } else if (buf_info->divisor == 0) {
71 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
72 /* HW doesn't support a zero divisor, but we can achieve the same by
73 * not using a divisor and setting the stride to zero */
74 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
75 cfg.stride = 0;
76 } else if (util_is_power_of_two_or_zero(buf_info->divisor)) {
77 /* Per-instance, POT divisor */
78 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
79 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
80 cfg.divisor_r = __builtin_ctz(buf_info->divisor);
81 } else {
82 /* Per-instance, NPOT divisor */
83 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
84 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
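         /* Sketch of the intent (an assumption based on the helper's name and
          * outputs, not a statement of the exact hardware formula):
          * panfrost_compute_magic_divisor() turns the NPOT divisor into a
          * multiply/shift pair so the hardware can evaluate roughly
          *   instance_id / divisor ~= (instance_id * divisor_d) >> (32 + divisor_r)
          * with divisor_e flagging the rounding correction some divisors need. */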
85 cfg.divisor_d = panfrost_compute_magic_divisor(
86 buf_info->divisor, &cfg.divisor_r, &cfg.divisor_e);
87 }
88 }
89 }
90
91 static bool
92 vs_driver_set_is_dirty(struct panvk_cmd_buffer *cmdbuf)
93 {
94 return dyn_gfx_state_dirty(cmdbuf, VI) ||
95 dyn_gfx_state_dirty(cmdbuf, VI_BINDINGS_VALID) ||
96 dyn_gfx_state_dirty(cmdbuf, VI_BINDING_STRIDES) ||
97 gfx_state_dirty(cmdbuf, VB) || gfx_state_dirty(cmdbuf, VS) ||
98 gfx_state_dirty(cmdbuf, DESC_STATE);
99 }
100
101 static VkResult
102 prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf)
103 {
104 if (!vs_driver_set_is_dirty(cmdbuf))
105 return VK_SUCCESS;
106
107 struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
108 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
109 const struct vk_vertex_input_state *vi =
110 cmdbuf->vk.dynamic_graphics_state.vi;
111 uint32_t vb_count = 0;
112
113 u_foreach_bit(i, vi->attributes_valid)
114 vb_count = MAX2(vi->attributes[i].binding + 1, vb_count);
115
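   /* Driver-set layout used below: [0, MAX_VS_ATTRIBS) holds one ATTRIBUTE
    * descriptor per vertex attribute, [MAX_VS_ATTRIBS] is the dummy sampler,
    * then come the dynamic buffer descriptors, and finally one BUFFER
    * descriptor per vertex buffer binding (starting at vb_offset). */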
116 uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1;
117 uint32_t desc_count = vb_offset + vb_count;
118 const struct panvk_descriptor_state *desc_state =
119 &cmdbuf->state.gfx.desc_state;
120 struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
121 cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
122 struct panvk_opaque_desc *descs = driver_set.cpu;
123
124 if (!driver_set.gpu)
125 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
126
127 for (uint32_t i = 0; i < MAX_VS_ATTRIBS; i++) {
128 if (vi->attributes_valid & BITFIELD_BIT(i)) {
129 unsigned binding = vi->attributes[i].binding;
130
131 emit_vs_attrib(&vi->attributes[i], &vi->bindings[binding],
132 &cmdbuf->state.gfx.vb.bufs[binding], vb_offset,
133 (struct mali_attribute_packed *)(&descs[i]));
134 } else {
135 memset(&descs[i], 0, sizeof(descs[0]));
136 }
137 }
138
139 /* Dummy sampler always comes right after the vertex attribs. */
140 pan_cast_and_pack(&descs[MAX_VS_ATTRIBS], SAMPLER, cfg) {
141 cfg.clamp_integer_array_indices = false;
142 }
143
144 panvk_per_arch(cmd_fill_dyn_bufs)(
145 desc_state, vs,
146 (struct mali_buffer_packed *)(&descs[MAX_VS_ATTRIBS + 1]));
147
148 for (uint32_t i = 0; i < vb_count; i++) {
149 const struct panvk_attrib_buf *vb = &cmdbuf->state.gfx.vb.bufs[i];
150
151 pan_cast_and_pack(&descs[vb_offset + i], BUFFER, cfg) {
152 if (vi->bindings_valid & BITFIELD_BIT(i)) {
153 cfg.address = vb->address;
154 cfg.size = vb->size;
155 } else {
156 cfg.address = 0;
157 cfg.size = 0;
158 }
159 }
160 }
161
162 vs_desc_state->driver_set.dev_addr = driver_set.gpu;
163 vs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
164 gfx_state_set_dirty(cmdbuf, DESC_STATE);
165 return VK_SUCCESS;
166 }
167
168 static uint32_t
169 get_varying_slots(const struct panvk_cmd_buffer *cmdbuf)
170 {
171 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
172 const struct panvk_shader *fs = get_fs(cmdbuf);
173 uint32_t varying_slots = 0;
174
175 if (fs) {
176 unsigned vs_vars = vs->info.varyings.output_count;
177 unsigned fs_vars = fs->info.varyings.input_count;
178 varying_slots = MAX2(vs_vars, fs_vars);
179 }
180
181 return varying_slots;
182 }
183
184 static void
185 emit_varying_descs(const struct panvk_cmd_buffer *cmdbuf,
186 struct mali_attribute_packed *descs)
187 {
188 uint32_t varying_slots = get_varying_slots(cmdbuf);
189 /* Assumes 16 byte slots. We could do better. */
190 uint32_t varying_size = varying_slots * 16;
191
192 const struct panvk_shader *fs = get_fs(cmdbuf);
193
194 for (uint32_t i = 0; i < varying_slots; i++) {
195 const struct pan_shader_varying *var = &fs->info.varyings.input[i];
196 /* Skip special varyings. */
197 if (var->location < VARYING_SLOT_VAR0)
198 continue;
199
200 /* We currently always write out F32 in the vertex shaders, so the format
201 * needs to reflect this. */
202 enum pipe_format f = var->format;
203 switch (f) {
204 case PIPE_FORMAT_R16_FLOAT:
205 f = PIPE_FORMAT_R32_FLOAT;
206 break;
207 case PIPE_FORMAT_R16G16_FLOAT:
208 f = PIPE_FORMAT_R32G32_FLOAT;
209 break;
210 case PIPE_FORMAT_R16G16B16_FLOAT:
211 f = PIPE_FORMAT_R32G32B32_FLOAT;
212 break;
213 case PIPE_FORMAT_R16G16B16A16_FLOAT:
214 f = PIPE_FORMAT_R32G32B32A32_FLOAT;
215 break;
216 default:
217 break;
218 }
219
220 uint32_t loc = var->location - VARYING_SLOT_VAR0;
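      /* Assumption about the vertex-packet layout (not spelled out here): the
       * first 1024 bytes of each packet seem to be reserved for fixed-function
       * outputs, with user varyings following at 16 bytes per slot, which is
       * why attribute_stride matches varying_size computed above. */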
221 pan_pack(&descs[i], ATTRIBUTE, cfg) {
222 cfg.attribute_type = MALI_ATTRIBUTE_TYPE_VERTEX_PACKET;
223 cfg.offset_enable = false;
224 cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
225 cfg.table = 61;
226 cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
227 cfg.offset = 1024 + (loc * 16);
228 cfg.buffer_index = 0;
229 cfg.attribute_stride = varying_size;
230 cfg.packet_stride = varying_size + 16;
231 }
232 }
233 }
234
235 static VkResult
236 prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
237 {
238 struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
239 const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
240 const struct panvk_descriptor_state *desc_state =
241 &cmdbuf->state.gfx.desc_state;
242 /* If the shader is using LD_VAR_BUF[_IMM], we do not have to set up
243 * Attribute Descriptors for varying loads. */
244 uint32_t num_varying_attr_descs =
245 panvk_use_ld_var_buf(fs) ? 0 : fs->desc_info.max_varying_loads;
246 uint32_t desc_count =
247 fs->desc_info.dyn_bufs.count + num_varying_attr_descs + 1;
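   /* Driver-set layout, mirroring the vertex-shader one: the varying
    * ATTRIBUTE descriptors come first (skipped entirely with LD_VAR_BUF),
    * then the dummy sampler, then the dynamic buffer descriptors. */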
248 struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
249 cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
250 struct panvk_opaque_desc *descs = driver_set.cpu;
251
252 if (desc_count && !driver_set.gpu)
253 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
254
255 if (num_varying_attr_descs > 0)
256 emit_varying_descs(cmdbuf, (struct mali_attribute_packed *)(&descs[0]));
257
258 /* Dummy sampler always comes right after the varyings. */
259 pan_cast_and_pack(&descs[num_varying_attr_descs], SAMPLER, cfg) {
260 cfg.clamp_integer_array_indices = false;
261 }
262
263 panvk_per_arch(cmd_fill_dyn_bufs)(
264 desc_state, fs,
265 (struct mali_buffer_packed *)(&descs[num_varying_attr_descs + 1]));
266
267 fs_desc_state->driver_set.dev_addr = driver_set.gpu;
268 fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
269 gfx_state_set_dirty(cmdbuf, DESC_STATE);
270 return VK_SUCCESS;
271 }
272
273 static bool
274 has_depth_att(struct panvk_cmd_buffer *cmdbuf)
275 {
276 return (cmdbuf->state.gfx.render.bound_attachments &
277 MESA_VK_RP_ATTACHMENT_DEPTH_BIT) != 0;
278 }
279
280 static bool
281 has_stencil_att(struct panvk_cmd_buffer *cmdbuf)
282 {
283 return (cmdbuf->state.gfx.render.bound_attachments &
284 MESA_VK_RP_ATTACHMENT_STENCIL_BIT) != 0;
285 }
286
287 static bool
288 writes_depth(struct panvk_cmd_buffer *cmdbuf)
289 {
290 const struct vk_depth_stencil_state *ds =
291 &cmdbuf->vk.dynamic_graphics_state.ds;
292
293 return has_depth_att(cmdbuf) && ds->depth.test_enable &&
294 ds->depth.write_enable && ds->depth.compare_op != VK_COMPARE_OP_NEVER;
295 }
296
297 static bool
298 writes_stencil(struct panvk_cmd_buffer *cmdbuf)
299 {
300 const struct vk_depth_stencil_state *ds =
301 &cmdbuf->vk.dynamic_graphics_state.ds;
302
303 return has_stencil_att(cmdbuf) && ds->stencil.test_enable &&
304 ((ds->stencil.front.write_mask &&
305 (ds->stencil.front.op.fail != VK_STENCIL_OP_KEEP ||
306 ds->stencil.front.op.pass != VK_STENCIL_OP_KEEP ||
307 ds->stencil.front.op.depth_fail != VK_STENCIL_OP_KEEP)) ||
308 (ds->stencil.back.write_mask &&
309 (ds->stencil.back.op.fail != VK_STENCIL_OP_KEEP ||
310 ds->stencil.back.op.pass != VK_STENCIL_OP_KEEP ||
311 ds->stencil.back.op.depth_fail != VK_STENCIL_OP_KEEP)));
312 }
313
314 static bool
315 ds_test_always_passes(struct panvk_cmd_buffer *cmdbuf)
316 {
317 const struct vk_depth_stencil_state *ds =
318 &cmdbuf->vk.dynamic_graphics_state.ds;
319
320 if (!has_depth_att(cmdbuf))
321 return true;
322
323 if (ds->depth.test_enable && ds->depth.compare_op != VK_COMPARE_OP_ALWAYS)
324 return false;
325
326 if (ds->stencil.test_enable &&
327 (ds->stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
328 ds->stencil.back.op.compare != VK_COMPARE_OP_ALWAYS))
329 return false;
330
331 return true;
332 }
333
334 static inline enum mali_func
335 translate_compare_func(VkCompareOp comp)
336 {
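   /* VkCompareOp and mali_func share the same numeric encoding, so the
    * translation is a plain cast; the static asserts below are only there to
    * catch a future divergence between the two enums. */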
337 STATIC_ASSERT(VK_COMPARE_OP_NEVER == (VkCompareOp)MALI_FUNC_NEVER);
338 STATIC_ASSERT(VK_COMPARE_OP_LESS == (VkCompareOp)MALI_FUNC_LESS);
339 STATIC_ASSERT(VK_COMPARE_OP_EQUAL == (VkCompareOp)MALI_FUNC_EQUAL);
340 STATIC_ASSERT(VK_COMPARE_OP_LESS_OR_EQUAL == (VkCompareOp)MALI_FUNC_LEQUAL);
341 STATIC_ASSERT(VK_COMPARE_OP_GREATER == (VkCompareOp)MALI_FUNC_GREATER);
342 STATIC_ASSERT(VK_COMPARE_OP_NOT_EQUAL == (VkCompareOp)MALI_FUNC_NOT_EQUAL);
343 STATIC_ASSERT(VK_COMPARE_OP_GREATER_OR_EQUAL ==
344 (VkCompareOp)MALI_FUNC_GEQUAL);
345 STATIC_ASSERT(VK_COMPARE_OP_ALWAYS == (VkCompareOp)MALI_FUNC_ALWAYS);
346
347 return (enum mali_func)comp;
348 }
349
350 static enum mali_stencil_op
351 translate_stencil_op(VkStencilOp in)
352 {
353 switch (in) {
354 case VK_STENCIL_OP_KEEP:
355 return MALI_STENCIL_OP_KEEP;
356 case VK_STENCIL_OP_ZERO:
357 return MALI_STENCIL_OP_ZERO;
358 case VK_STENCIL_OP_REPLACE:
359 return MALI_STENCIL_OP_REPLACE;
360 case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
361 return MALI_STENCIL_OP_INCR_SAT;
362 case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
363 return MALI_STENCIL_OP_DECR_SAT;
364 case VK_STENCIL_OP_INCREMENT_AND_WRAP:
365 return MALI_STENCIL_OP_INCR_WRAP;
366 case VK_STENCIL_OP_DECREMENT_AND_WRAP:
367 return MALI_STENCIL_OP_DECR_WRAP;
368 case VK_STENCIL_OP_INVERT:
369 return MALI_STENCIL_OP_INVERT;
370 default:
371 unreachable("Invalid stencil op");
372 }
373 }
374
375 static enum mali_draw_mode
376 translate_prim_topology(VkPrimitiveTopology in)
377 {
378 /* Test VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA separately, as it's not
379 * part of the VkPrimitiveTopology enum.
380 */
381 if (in == VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA)
382 return MALI_DRAW_MODE_TRIANGLES;
383
384 switch (in) {
385 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
386 return MALI_DRAW_MODE_POINTS;
387 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
388 return MALI_DRAW_MODE_LINES;
389 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
390 return MALI_DRAW_MODE_LINE_STRIP;
391 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
392 return MALI_DRAW_MODE_TRIANGLES;
393 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
394 return MALI_DRAW_MODE_TRIANGLE_STRIP;
395 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
396 return MALI_DRAW_MODE_TRIANGLE_FAN;
397 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
398 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
399 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
400 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
401 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
402 default:
403 unreachable("Invalid primitive type");
404 }
405 }
406
407 static VkResult
408 update_tls(struct panvk_cmd_buffer *cmdbuf)
409 {
410 struct panvk_tls_state *state = &cmdbuf->state.tls;
411 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
412 const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
413 struct cs_builder *b =
414 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
415
416 if (!cmdbuf->state.gfx.tsd) {
417 if (!state->desc.gpu) {
418 state->desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
419 if (!state->desc.gpu)
420 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
421 }
422
423 cmdbuf->state.gfx.tsd = state->desc.gpu;
424
425 cs_update_vt_ctx(b)
426 cs_move64_to(b, cs_sr_reg64(b, 24), state->desc.gpu);
427 }
428
429 state->info.tls.size =
430 MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, state->info.tls.size);
431 return VK_SUCCESS;
432 }
433
434 static enum mali_index_type
435 index_size_to_index_type(uint32_t size)
436 {
437 switch (size) {
438 case 0:
439 return MALI_INDEX_TYPE_NONE;
440 case 1:
441 return MALI_INDEX_TYPE_UINT8;
442 case 2:
443 return MALI_INDEX_TYPE_UINT16;
444 case 4:
445 return MALI_INDEX_TYPE_UINT32;
446 default:
447 assert(!"Invalid index size");
448 return MALI_INDEX_TYPE_NONE;
449 }
450 }
451
452 static VkResult
453 prepare_blend(struct panvk_cmd_buffer *cmdbuf)
454 {
455 bool dirty = dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
456 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP_ENABLE) ||
457 dyn_gfx_state_dirty(cmdbuf, CB_LOGIC_OP) ||
458 dyn_gfx_state_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
459 dyn_gfx_state_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
460 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_ENABLES) ||
461 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_EQUATIONS) ||
462 dyn_gfx_state_dirty(cmdbuf, CB_WRITE_MASKS) ||
463 dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS) ||
464 fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE);
465
466 if (!dirty)
467 return VK_SUCCESS;
468
469 const struct vk_dynamic_graphics_state *dyns =
470 &cmdbuf->vk.dynamic_graphics_state;
471 const struct vk_color_blend_state *cb = &dyns->cb;
472 unsigned bd_count = MAX2(cb->attachment_count, 1);
473 struct cs_builder *b =
474 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
475 struct panfrost_ptr ptr =
476 panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND);
477 struct mali_blend_packed *bds = ptr.cpu;
478
479 if (bd_count && !ptr.gpu)
480 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
481
482 panvk_per_arch(blend_emit_descs)(cmdbuf, bds);
483
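   /* Assumed register encoding: BLEND descriptors are aligned such that the
    * low bits of the address are zero, so the descriptor count can be OR'ed
    * into them and both values travel in a single 64-bit register. */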
484 cs_update_vt_ctx(b)
485 cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count);
486
487 return VK_SUCCESS;
488 }
489
490 static void
491 prepare_vp(struct panvk_cmd_buffer *cmdbuf)
492 {
493 struct cs_builder *b =
494 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
495 const VkViewport *viewport =
496 &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0];
497 const VkRect2D *scissor = &cmdbuf->vk.dynamic_graphics_state.vp.scissors[0];
498
499 if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
500 dyn_gfx_state_dirty(cmdbuf, VP_SCISSORS)) {
501 struct mali_scissor_packed scissor_box;
502 pan_pack(&scissor_box, SCISSOR, cfg) {
503
504 /* The spec says "width must be greater than 0.0" */
505 assert(viewport->width >= 0);
506 int minx = (int)viewport->x;
507 int maxx = (int)(viewport->x + viewport->width);
508
509 /* Viewport height can be negative */
510 int miny =
511 MIN2((int)viewport->y, (int)(viewport->y + viewport->height));
512 int maxy =
513 MAX2((int)viewport->y, (int)(viewport->y + viewport->height));
514
515 assert(scissor->offset.x >= 0 && scissor->offset.y >= 0);
516 minx = MAX2(scissor->offset.x, minx);
517 miny = MAX2(scissor->offset.y, miny);
518 maxx = MIN2(scissor->offset.x + scissor->extent.width, maxx);
519 maxy = MIN2(scissor->offset.y + scissor->extent.height, maxy);
520
521          /* The scissor bounds are inclusive, so convert the exclusive max
             * computed above, making sure we don't end up with max < min when
             * width/height is 0. */
522 maxx = maxx > minx ? maxx - 1 : maxx;
523 maxy = maxy > miny ? maxy - 1 : maxy;
524
525 /* Clamp viewport scissor to valid range */
526 cfg.scissor_minimum_x = CLAMP(minx, 0, UINT16_MAX);
527 cfg.scissor_minimum_y = CLAMP(miny, 0, UINT16_MAX);
528 cfg.scissor_maximum_x = CLAMP(maxx, 0, UINT16_MAX);
529 cfg.scissor_maximum_y = CLAMP(maxy, 0, UINT16_MAX);
530 }
531
532 struct mali_scissor_packed *scissor_box_ptr = &scissor_box;
533 cs_move64_to(b, cs_sr_reg64(b, 42), *((uint64_t*)scissor_box_ptr));
534 }
535
536 if (dyn_gfx_state_dirty(cmdbuf, VP_VIEWPORTS) ||
537 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
538 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE)) {
539 struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals;
540
541 float z_min = sysvals->viewport.offset.z;
542 float z_max = z_min + sysvals->viewport.scale.z;
543 cs_move32_to(b, cs_sr_reg32(b, 44), fui(MIN2(z_min, z_max)));
544 cs_move32_to(b, cs_sr_reg32(b, 45), fui(MAX2(z_min, z_max)));
545 }
546 }
547
548 static inline uint64_t
549 get_pos_spd(const struct panvk_cmd_buffer *cmdbuf)
550 {
551 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
552 assert(vs);
553 const struct vk_input_assembly_state *ia =
554 &cmdbuf->vk.dynamic_graphics_state.ia;
555 return ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST
556 ? panvk_priv_mem_dev_addr(vs->spds.pos_points)
557 : panvk_priv_mem_dev_addr(vs->spds.pos_triangles);
558 }
559
560 static void
561 prepare_tiler_primitive_size(struct panvk_cmd_buffer *cmdbuf)
562 {
563 struct cs_builder *b =
564 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
565 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
566 const struct vk_input_assembly_state *ia =
567 &cmdbuf->vk.dynamic_graphics_state.ia;
568 float primitive_size;
569
570 if (!dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) &&
571 !dyn_gfx_state_dirty(cmdbuf, RS_LINE_WIDTH) &&
572 !gfx_state_dirty(cmdbuf, VS))
573 return;
574
575 switch (ia->primitive_topology) {
576 /* From the Vulkan spec 1.3.293:
577 *
578 * "If maintenance5 is enabled and a value is not written to a variable
579 * decorated with PointSize, a value of 1.0 is used as the size of
580 * points."
581 *
582 * If no point size is written, ensure that the size is always 1.0f.
583 */
584 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
585 if (vs->info.vs.writes_point_size)
586 return;
587
588 primitive_size = 1.0f;
589 break;
590 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
591 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
592 primitive_size = cmdbuf->vk.dynamic_graphics_state.rs.line.width;
593 break;
594 default:
595 return;
596 }
597
598 cs_move32_to(b, cs_sr_reg32(b, 60), fui(primitive_size));
599 }
600
601 static uint32_t
602 calc_enabled_layer_count(struct panvk_cmd_buffer *cmdbuf)
603 {
604 return cmdbuf->state.gfx.render.view_mask ?
605 util_bitcount(cmdbuf->state.gfx.render.view_mask) :
606 cmdbuf->state.gfx.render.layer_count;
607 }
608
609 static uint32_t
610 calc_fbd_size(struct panvk_cmd_buffer *cmdbuf)
611 {
612 const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
613 bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
614 uint32_t rt_count = MAX2(fb->rt_count, 1);
615
616 return get_fbd_size(has_zs_ext, rt_count);
617 }
618
619 static uint32_t
620 calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf)
621 {
622 uint32_t fbd_count = calc_enabled_layer_count(cmdbuf) *
623 (1 + PANVK_IR_PASS_COUNT);
624 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
625 MAX_LAYERS_PER_TILER_DESC);
626
627 return (calc_fbd_size(cmdbuf) * fbd_count) +
628 (td_count * pan_size(TILER_CONTEXT));
629 }
630
631 static void
632 cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
633 {
634 /* Make sure we don't allocate more than the ringbuf size. */
635 assert(size <= RENDER_DESC_RINGBUF_SIZE);
636
637 /* Make sure the allocation is 64-byte aligned. */
638 assert(ALIGN_POT(size, 64) == size);
639
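   /* The ring-buffer syncobj effectively acts as a free-space counter:
    * waiting until its value is strictly greater than size - 1 guarantees
    * there is room, and adding -size reserves it. The consumer side is
    * assumed to add the size back once the descriptors are retired. */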
640 struct cs_index ringbuf_sync = cs_scratch_reg64(b, 0);
641 struct cs_index sz_reg = cs_scratch_reg32(b, 2);
642
643 cs_load64_to(
644 b, ringbuf_sync, cs_subqueue_ctx_reg(b),
645 offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj));
646 cs_wait_slot(b, SB_ID(LS), false);
647
648 /* Wait for the other end to release memory. */
649 cs_move32_to(b, sz_reg, size - 1);
650 cs_sync32_wait(b, false, MALI_CS_CONDITION_GREATER, sz_reg, ringbuf_sync);
651
652 /* Decrement the syncobj to reflect the fact we're reserving memory. */
653 cs_move32_to(b, sz_reg, -size);
654 cs_sync32_add(b, false, MALI_CS_SYNC_SCOPE_CSG, sz_reg, ringbuf_sync,
655 cs_now());
656 }
657
658 static void
659 cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size,
660 bool wrap_around)
661 {
662 struct cs_index scratch_reg = cs_scratch_reg32(b, 0);
663 struct cs_index ptr_lo = cs_scratch_reg32(b, 2);
664 struct cs_index pos = cs_scratch_reg32(b, 4);
665
666 cs_load_to(
667 b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
668 BITFIELD_MASK(3),
669 offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
670 cs_wait_slot(b, SB_ID(LS), false);
671
672 /* Update the relative position and absolute address. */
673 cs_add32(b, ptr_lo, ptr_lo, size);
674 cs_add32(b, pos, pos, size);
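   /* Note that only the low 32 bits of the pointer are adjusted, which
    * presumably relies on the ring buffer never straddling a 4 GB boundary. */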
675
676 /* Wrap-around. */
677 if (likely(wrap_around)) {
678 cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);
679
680 cs_if(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
681 cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
682 cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
683 }
684 }
685
686 cs_store(
687 b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
688 BITFIELD_MASK(3),
689 offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
690 cs_wait_slot(b, SB_ID(LS), false);
691 }
692
693 static VkResult
694 get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
695 {
696 assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
697 !inherits_render_ctx(cmdbuf));
698
699 if (cmdbuf->state.gfx.render.tiler)
700 return VK_SUCCESS;
701
702 struct cs_builder *b =
703 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
704 struct panvk_physical_device *phys_dev =
705 to_panvk_physical_device(cmdbuf->vk.base.device->physical);
706 struct panvk_instance *instance =
707 to_panvk_instance(phys_dev->vk.instance);
708 bool tracing_enabled = instance->debug_flags & PANVK_DEBUG_TRACE;
709 struct panfrost_tiler_features tiler_features =
710 panfrost_query_tiler_features(&phys_dev->kmod.props);
711 bool simul_use =
712 cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
713 struct panfrost_ptr tiler_desc = {0};
714 struct mali_tiler_context_packed tiler_tmpl;
715 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
716 MAX_LAYERS_PER_TILER_DESC);
717
718 if (!simul_use) {
719 tiler_desc = panvk_cmd_alloc_desc_array(cmdbuf, td_count, TILER_CONTEXT);
720 if (!tiler_desc.gpu)
721 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
722 }
723
724 const struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
725
726 pan_pack(&tiler_tmpl, TILER_CONTEXT, cfg) {
727 unsigned max_levels = tiler_features.max_levels;
728 assert(max_levels >= 2);
729
730 cfg.hierarchy_mask =
731 panvk_select_tiler_hierarchy_mask(phys_dev, &cmdbuf->state.gfx);
732 cfg.fb_width = fbinfo->width;
733 cfg.fb_height = fbinfo->height;
734
735 cfg.sample_pattern = pan_sample_pattern(fbinfo->nr_samples);
736
737 cfg.first_provoking_vertex =
738 cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
739 VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
740
741       /* Dummy values; the real layer count/offset are patched in per
          * tiler descriptor below. */
742 cfg.layer_count = 1;
743 cfg.layer_offset = 0;
744 }
745
746 /* When simul_use=true, the tiler descriptors are allocated from the
747 * descriptor ringbuf. We set state.gfx.render.tiler to a non-NULL
748 * value to satisfy the is_tiler_desc_allocated() tests, but we want
749 * it to point to a faulty address so that we can easily detect if it's
750 * used in the command stream/framebuffer descriptors. */
751 cmdbuf->state.gfx.render.tiler =
752 simul_use ? 0xdeadbeefdeadbeefull : tiler_desc.gpu;
753
754 struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
755
756 if (simul_use) {
757 uint32_t descs_sz = calc_render_descs_size(cmdbuf);
758
759 cs_render_desc_ringbuf_reserve(b, descs_sz);
760
761 /* Reserve ringbuf mem. */
762 cs_update_vt_ctx(b) {
763 cs_load64_to(b, tiler_ctx_addr, cs_subqueue_ctx_reg(b),
764 offsetof(struct panvk_cs_subqueue_context,
765 render.desc_ringbuf.ptr));
766 }
767
768 cs_render_desc_ringbuf_move_ptr(b, descs_sz, !tracing_enabled);
769 } else {
770 cs_update_vt_ctx(b) {
771 cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
772 }
773 }
774
775 /* Reset the polygon list. */
776 cs_move64_to(b, cs_scratch_reg64(b, 0), 0);
777
778    /* Lay out words 2, 3 and 5 so they can be stored along with the other updates.
779     * Word 4 contains the layer information and is updated in the loop below. */
780 cs_move64_to(b, cs_scratch_reg64(b, 2),
781 tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32);
782 cs_move32_to(b, cs_scratch_reg32(b, 5), tiler_tmpl.opaque[5]);
783
784 /* Load the tiler_heap and geom_buf from the context. */
785 cs_load_to(b, cs_scratch_reg_tuple(b, 6, 4), cs_subqueue_ctx_reg(b),
786 BITFIELD_MASK(4),
787 offsetof(struct panvk_cs_subqueue_context, render.tiler_heap));
788
789 /* Fill extra fields with zeroes so we can reset the completed
790 * top/bottom and private states. */
791 cs_move64_to(b, cs_scratch_reg64(b, 10), 0);
792 cs_move64_to(b, cs_scratch_reg64(b, 12), 0);
793 cs_move64_to(b, cs_scratch_reg64(b, 14), 0);
794
795 cs_wait_slot(b, SB_ID(LS), false);
796
797 /* Take care of the tiler desc with layer_offset=0 outside of the loop. */
798 cs_move32_to(b, cs_scratch_reg32(b, 4),
799 MIN2(cmdbuf->state.gfx.render.layer_count - 1,
800 MAX_LAYERS_PER_TILER_DESC - 1));
801
802 /* Replace words 0:13 and 24:31. */
803 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
804 BITFIELD_MASK(16), 0);
805 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
806 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
807 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
808 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
809
810 cs_wait_slot(b, SB_ID(LS), false);
811
812 uint32_t remaining_layers =
813 td_count > 1
814 ? cmdbuf->state.gfx.render.layer_count % MAX_LAYERS_PER_TILER_DESC
815 : 0;
816 uint32_t full_td_count =
817 cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
818
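   /* layer_offset is a 9-bit signed field packed next to the 8-bit layer
    * count in word 4: negating the base layer and masking with
    * BITFIELD_MASK(9) produces its two's-complement encoding, which is then
    * shifted into place with the << 8 below. */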
819 if (remaining_layers) {
820 int32_t layer_offset =
821 -(cmdbuf->state.gfx.render.layer_count - remaining_layers) &
822 BITFIELD_MASK(9);
823
824 /* If the last tiler descriptor is not full, we emit it outside of the
825 * loop to pass the right layer count. All this would be a lot simpler
826 * if we had OR/AND instructions, but here we are. */
827 cs_update_vt_ctx(b)
828 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
829 pan_size(TILER_CONTEXT) * full_td_count);
830 cs_move32_to(b, cs_scratch_reg32(b, 4),
831 (layer_offset << 8) | (remaining_layers - 1));
832 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
833 BITFIELD_MASK(16), 0);
834 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
835 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
836 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
837 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
838 cs_wait_slot(b, SB_ID(LS), false);
839
840 cs_update_vt_ctx(b)
841 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
842 -pan_size(TILER_CONTEXT));
843 } else if (full_td_count > 1) {
844 cs_update_vt_ctx(b)
845 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
846 pan_size(TILER_CONTEXT) * (full_td_count - 1));
847 }
848
849 if (full_td_count > 1) {
850 struct cs_index counter_reg = cs_scratch_reg32(b, 17);
851 uint32_t layer_offset =
852 (-MAX_LAYERS_PER_TILER_DESC * (full_td_count - 1)) & BITFIELD_MASK(9);
853
854 cs_move32_to(b, counter_reg, full_td_count - 1);
855 cs_move32_to(b, cs_scratch_reg32(b, 4),
856 (layer_offset << 8) | (MAX_LAYERS_PER_TILER_DESC - 1));
857
858 /* We iterate the remaining full tiler descriptors in reverse order, so we
859 * can start from the smallest layer offset, and increment it by
860 * MAX_LAYERS_PER_TILER_DESC << 8 at each iteration. Again, the split is
861 * mostly due to the lack of AND instructions, and the fact layer_offset
862 * is a 9-bit signed integer inside a 32-bit word, which ADD32 can't deal
863 * with unless the number we add is positive.
864 */
865 cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
866 /* Replace words 0:13 and 24:31. */
867 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
868 BITFIELD_MASK(16), 0);
869 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
870 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 64);
871 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), tiler_ctx_addr,
872 BITFIELD_RANGE(0, 2) | BITFIELD_RANGE(10, 6), 96);
873
874 cs_wait_slot(b, SB_ID(LS), false);
875
876 cs_add32(b, cs_scratch_reg32(b, 4), cs_scratch_reg32(b, 4),
877 MAX_LAYERS_PER_TILER_DESC << 8);
878
879 cs_add32(b, counter_reg, counter_reg, -1);
880 cs_update_vt_ctx(b)
881 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
882 -pan_size(TILER_CONTEXT));
883 }
884 }
885
886 /* Then we change the scoreboard slot used for iterators. */
887 panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
888
889 cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
890 return VK_SUCCESS;
891 }
892
893 static uint8_t
894 prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo,
895 uint32_t layer, void *fbd)
896 {
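   /* Each tiler descriptor covers up to MAX_LAYERS_PER_TILER_DESC layers, so
    * the FBD is given the index of the first layer covered by its tiler
    * descriptor (layer_offset) and, unless the descriptors live in the ring
    * buffer (simultaneous use), the address of that descriptor. */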
897 struct pan_tiler_context tiler_ctx = {
898 .valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC),
899 };
900
901 if (!(cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT)) {
902 uint32_t td_idx = layer / MAX_LAYERS_PER_TILER_DESC;
903
904 tiler_ctx.valhall.desc =
905 cmdbuf->state.gfx.render.tiler + (td_idx * pan_size(TILER_CONTEXT));
906 }
907
908 return GENX(pan_emit_fbd)(fbinfo, layer, NULL, &tiler_ctx, fbd);
909 }
910
911 static VkResult
912 prepare_incremental_rendering_fbinfos(
913 struct panvk_cmd_buffer *cmdbuf, const struct pan_fb_info *fbinfo,
914 struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT])
915 {
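   /* Incremental rendering splits a render pass into several flushes when
    * the tiler runs out of memory. The fbinfos prepared here encode that
    * policy: the first pass keeps (doesn't discard) its results, middle
    * passes additionally preload what the previous flush wrote and drop the
    * clears, and the last pass restores the user-requested discard state. */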
916 /* First incremental rendering pass: don't discard result */
917
918 struct pan_fb_info *ir_fb = &ir_fbinfos[PANVK_IR_FIRST_PASS];
919
920 memcpy(ir_fb, fbinfo, sizeof(*ir_fb));
921 for (unsigned i = 0; i < fbinfo->rt_count; i++)
922 ir_fb->rts[i].discard = false;
923 ir_fb->zs.discard.z = false;
924 ir_fb->zs.discard.s = false;
925
926 /* Subsequent incremental rendering passes: preload old content and don't
927 * discard result */
928
929 struct pan_fb_info *prev_ir_fb = ir_fb;
930 ir_fb = &ir_fbinfos[PANVK_IR_MIDDLE_PASS];
931 memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
932
933 bool preload_changed = false;
934
935 for (unsigned i = 0; i < fbinfo->rt_count; i++) {
936 if (fbinfo->rts[i].view && !fbinfo->rts[i].preload) {
937 ir_fb->rts[i].preload = true;
938 preload_changed = true;
939 }
940
941 if (ir_fb->rts[i].clear) {
942 ir_fb->rts[i].clear = false;
943 preload_changed = true;
944 }
945 }
946 if (fbinfo->zs.view.zs && !fbinfo->zs.preload.z && !fbinfo->zs.preload.s) {
947 ir_fb->zs.preload.z = true;
948 ir_fb->zs.preload.s = true;
949 preload_changed = true;
950 } else if (fbinfo->zs.view.s && !fbinfo->zs.preload.s) {
951 ir_fb->zs.preload.s = true;
952 preload_changed = true;
953 }
954
955 if (ir_fb->zs.clear.z || ir_fb->zs.clear.s) {
956 ir_fb->zs.clear.z = false;
957 ir_fb->zs.clear.s = false;
958 preload_changed = true;
959 }
960
961 if (preload_changed) {
962 memset(&ir_fb->bifrost.pre_post.dcds, 0x0,
963 sizeof(ir_fb->bifrost.pre_post.dcds));
964 VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, ir_fb);
965 if (result != VK_SUCCESS)
966 return result;
967 }
968
969 /* Last incremental rendering pass: preload previous content and deal with
970 * results as specified by user */
971
972 prev_ir_fb = ir_fb;
973 ir_fb = &ir_fbinfos[PANVK_IR_LAST_PASS];
974 memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb));
975
976 for (unsigned i = 0; i < fbinfo->rt_count; i++)
977 ir_fb->rts[i].discard = fbinfo->rts[i].discard;
978 ir_fb->zs.discard.z = fbinfo->zs.discard.z;
979 ir_fb->zs.discard.s = fbinfo->zs.discard.s;
980
981 return VK_SUCCESS;
982 }
983
984 static VkResult
985 get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
986 {
987 assert(cmdbuf->state.gfx.render.invalidate_inherited_ctx ||
988 !inherits_render_ctx(cmdbuf));
989
990 if (cmdbuf->state.gfx.render.fbds.gpu ||
991 !cmdbuf->state.gfx.render.layer_count)
992 return VK_SUCCESS;
993
994 uint32_t fbd_sz = calc_fbd_size(cmdbuf);
995 uint32_t fbds_sz = fbd_sz * calc_enabled_layer_count(cmdbuf) *
996 (1 + PANVK_IR_PASS_COUNT);
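   /* FBD memory layout: one FBD per enabled layer for the regular pass,
    * followed by one group of per-layer FBDs for each incremental-rendering
    * pass, each group fbd_ir_pass_offset bytes apart (see below). */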
997
998 cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
999 cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
1000 if (!cmdbuf->state.gfx.render.fbds.gpu)
1001 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1002
1003 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
1004 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1005 bool simul_use =
1006 cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
1007
1008 /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
1009 * involved (clear job) or if the update can happen in place (not
1010 * simultaneous use of the command buffer), we can avoid the
1011 * copy.
1012 *
1013 * According to VUID-VkSubmitInfo2KHR-commandBuffer-06192 and
1014 * VUID-VkSubmitInfo2KHR-commandBuffer-06010, suspend/resume operations
1015 * can't cross the vkQueueSubmit2() boundary, so no need to dynamically
1016 * allocate descriptors in that case:
1017 * "
1018 * If any commandBuffer member of an element of pCommandBufferInfos
1019 * contains any suspended render pass instances, they must be resumed by a
1020 * render pass instance later in submission order within
1021 * pCommandBufferInfos.
1022 *
1023 * If any commandBuffer member of an element of pCommandBufferInfos
1024 * contains any resumed render pass instances, they must be suspended by a
1025 * render pass instance earlier in submission order within
1026 * pCommandBufferInfos.
1027 * "
1028 */
1029 bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler;
1030 struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
1031 uint32_t fbd_flags = 0;
1032 uint32_t fbd_ir_pass_offset = fbd_sz * calc_enabled_layer_count(cmdbuf);
1033
1034 fbinfo->sample_positions =
1035 dev->sample_positions->addr.dev +
1036 panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
1037
1038 VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, fbinfo);
1039 if (result != VK_SUCCESS)
1040 return result;
1041
1042 struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT];
1043 result = prepare_incremental_rendering_fbinfos(cmdbuf, fbinfo, ir_fbinfos);
1044 if (result != VK_SUCCESS)
1045 return result;
1046
1047 /* We prepare all FB descriptors upfront. For multiview, only create FBDs
1048 * for enabled views. */
1049 uint32_t view_mask_temp = cmdbuf->state.gfx.render.view_mask;
1050 uint32_t enabled_layer_count = calc_enabled_layer_count(cmdbuf);
1051 bool multiview = cmdbuf->state.gfx.render.view_mask;
1052
1053 for (uint32_t i = 0; i < enabled_layer_count; i++) {
1054 uint32_t layer_idx = multiview ? u_bit_scan(&view_mask_temp) : i;
1055
1056 uint32_t layer_offset = fbd_sz * i;
1057 uint32_t new_fbd_flags =
1058 prepare_fb_desc(cmdbuf, fbinfo, layer_idx, fbds.cpu + layer_offset);
1059
1060 /* Make sure all FBDs have the same flags. */
1061 assert(i == 0 || new_fbd_flags == fbd_flags);
1062 fbd_flags = new_fbd_flags;
1063
1064 for (uint32_t j = 0; j < PANVK_IR_PASS_COUNT; j++) {
1065 uint32_t ir_pass_offset = (1 + j) * fbd_ir_pass_offset;
1066 new_fbd_flags =
1067 prepare_fb_desc(cmdbuf, &ir_fbinfos[j], layer_idx,
1068 fbds.cpu + ir_pass_offset + layer_offset);
1069
1070 /* Make sure all IR FBDs have the same flags. */
1071 assert(new_fbd_flags == fbd_flags);
1072 }
1073 }
1074
1075 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1076
1077 if (copy_fbds) {
1078 struct cs_index cur_tiler = cs_sr_reg64(b, 38);
1079 struct cs_index dst_fbd_ptr = cs_sr_reg64(b, 40);
1080 struct cs_index layer_count = cs_sr_reg32(b, 47);
1081 struct cs_index src_fbd_ptr = cs_sr_reg64(b, 48);
1082 struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 50);
1083 struct cs_index pass_count = cs_sr_reg32(b, 51);
1084 struct cs_index pass_src_fbd_ptr = cs_sr_reg64(b, 52);
1085 struct cs_index pass_dst_fbd_ptr = cs_sr_reg64(b, 54);
1086 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1087 MAX_LAYERS_PER_TILER_DESC);
1088
1089 cs_update_frag_ctx(b) {
1090 cs_load64_to(b, cur_tiler, cs_subqueue_ctx_reg(b),
1091 offsetof(struct panvk_cs_subqueue_context,
1092 render.desc_ringbuf.ptr));
1093 cs_wait_slot(b, SB_ID(LS), false);
1094 cs_add64(b, dst_fbd_ptr, cur_tiler,
1095 pan_size(TILER_CONTEXT) * td_count);
1096 }
1097
1098 cs_move64_to(b, src_fbd_ptr, fbds.gpu);
1099 cs_move32_to(b, remaining_layers_in_td, MAX_LAYERS_PER_TILER_DESC);
1100
1101 cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
1102 cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
1103          /* Our loop copies 64 bytes at a time, so make sure the
1104           * framebuffer descriptor size is 64-byte aligned. */
1105 assert(fbd_sz == ALIGN_POT(fbd_sz, 64));
1106
1107 cs_move32_to(b, pass_count, PANVK_IR_PASS_COUNT);
1108 cs_add64(b, pass_src_fbd_ptr, src_fbd_ptr, 0);
1109 cs_add64(b, pass_dst_fbd_ptr, dst_fbd_ptr, 0);
1110          /* Copy the FBDs for the regular pass as well as the IR passes. */
1111 cs_while(b, MALI_CS_CONDITION_GEQUAL, pass_count) {
1112 for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
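            /* In the first 64-byte chunk, words 14:15 hold the tiler context
             * pointer, so instead of copying them from the template we
             * substitute the per-layer tiler address (cur_tiler). */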
1113 if (fbd_off == 0) {
1114 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14),
1115 pass_src_fbd_ptr, BITFIELD_MASK(14), fbd_off);
1116 cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0);
1117 } else {
1118 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16),
1119 pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off);
1120 }
1121 cs_wait_slot(b, SB_ID(LS), false);
1122 cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr,
1123 BITFIELD_MASK(16), fbd_off);
1124 cs_wait_slot(b, SB_ID(LS), false);
1125 }
1126 cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset);
1127 cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset);
1128 cs_add32(b, pass_count, pass_count, -1);
1129 }
1130
1131 cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
1132 cs_update_frag_ctx(b)
1133 cs_add64(b, dst_fbd_ptr, dst_fbd_ptr, fbd_sz);
1134
1135 cs_add32(b, remaining_layers_in_td, remaining_layers_in_td, -1);
1136 cs_add32(b, layer_count, layer_count, -1);
1137 cs_if(b, MALI_CS_CONDITION_LEQUAL, remaining_layers_in_td) {
1138 cs_update_frag_ctx(b)
1139 cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT));
1140 cs_move32_to(b, remaining_layers_in_td,
1141 MAX_LAYERS_PER_TILER_DESC);
1142 }
1143 }
1144
1145 cs_update_frag_ctx(b) {
1146 uint32_t full_td_count =
1147 cmdbuf->state.gfx.render.layer_count / MAX_LAYERS_PER_TILER_DESC;
1148
1149 /* If the last tiler descriptor is not full, cur_tiler points to the
1150 * last tiler descriptor, not the FBD that follows. */
1151 if (full_td_count < td_count)
1152 cs_add64(b, dst_fbd_ptr, cur_tiler,
1153 fbd_flags + pan_size(TILER_CONTEXT));
1154 else
1155 cs_add64(b, dst_fbd_ptr, cur_tiler, fbd_flags);
1156
1157 cs_add64(b, cur_tiler, cur_tiler,
1158 -(full_td_count * pan_size(TILER_CONTEXT)));
1159 }
1160 } else {
1161 cs_update_frag_ctx(b) {
1162 cs_move64_to(b, cs_sr_reg64(b, 40), fbds.gpu | fbd_flags);
1163 cs_move64_to(b, cs_sr_reg64(b, 38), cmdbuf->state.gfx.render.tiler);
1164 }
1165 }
1166
1167 return VK_SUCCESS;
1168 }
1169
1170 static void
1171 set_provoking_vertex_mode(struct panvk_cmd_buffer *cmdbuf)
1172 {
1173 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1174 bool first_provoking_vertex =
1175 cmdbuf->vk.dynamic_graphics_state.rs.provoking_vertex ==
1176 VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT;
1177
1178 /* If this is not the first draw, first_provoking_vertex should match
1179 * the one from the previous draws. Unfortunately, we can't check it
1180 * when the render pass is inherited. */
1181 assert(!cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf) ||
1182 fbinfo->first_provoking_vertex == first_provoking_vertex);
1183
1184 fbinfo->first_provoking_vertex = first_provoking_vertex;
1185 }
1186
1187 static VkResult
1188 get_render_ctx(struct panvk_cmd_buffer *cmdbuf)
1189 {
1190 VkResult result = get_tiler_desc(cmdbuf);
1191 if (result != VK_SUCCESS)
1192 return result;
1193
1194 return get_fb_descs(cmdbuf);
1195 }
1196
1197 static VkResult
1198 prepare_vs(struct panvk_cmd_buffer *cmdbuf)
1199 {
1200 struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1201 struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
1202 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1203 struct cs_builder *b =
1204 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1205 bool upd_res_table = false;
1206
1207 VkResult result = prepare_vs_driver_set(cmdbuf);
1208 if (result != VK_SUCCESS)
1209 return result;
1210
1211 if (gfx_state_dirty(cmdbuf, VS) || gfx_state_dirty(cmdbuf, DESC_STATE) ||
1212 vs_driver_set_is_dirty(cmdbuf)) {
1213 result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1214 vs, vs_desc_state);
1215 if (result != VK_SUCCESS)
1216 return result;
1217
1218 upd_res_table = true;
1219 }
1220
1221 cs_update_vt_ctx(b) {
1222 if (upd_res_table)
1223 cs_move64_to(b, cs_sr_reg64(b, 0), vs_desc_state->res_table);
1224
1225 if (gfx_state_dirty(cmdbuf, VS) ||
1226 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY))
1227 cs_move64_to(b, cs_sr_reg64(b, 16), get_pos_spd(cmdbuf));
1228
1229 if (gfx_state_dirty(cmdbuf, VS))
1230 cs_move64_to(b, cs_sr_reg64(b, 18),
1231 panvk_priv_mem_dev_addr(vs->spds.var));
1232 }
1233
1234 return VK_SUCCESS;
1235 }
1236
1237 static VkResult
1238 prepare_fs(struct panvk_cmd_buffer *cmdbuf)
1239 {
1240 const struct panvk_shader *fs = get_fs(cmdbuf);
1241 struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
1242 struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1243 struct cs_builder *b =
1244 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1245
1246 if (fs &&
1247 (gfx_state_dirty(cmdbuf, FS) || gfx_state_dirty(cmdbuf, DESC_STATE))) {
1248 VkResult result = prepare_fs_driver_set(cmdbuf);
1249 if (result != VK_SUCCESS)
1250 return result;
1251
1252 result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
1253 fs, fs_desc_state);
1254 if (result != VK_SUCCESS)
1255 return result;
1256 }
1257
1258 cs_update_vt_ctx(b) {
1259 if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, DESC_STATE))
1260 cs_move64_to(b, cs_sr_reg64(b, 4), fs ? fs_desc_state->res_table : 0);
1261 if (fs_user_dirty(cmdbuf))
1262 cs_move64_to(b, cs_sr_reg64(b, 20),
1263 fs ? panvk_priv_mem_dev_addr(fs->spd) : 0);
1264 }
1265
1266 return VK_SUCCESS;
1267 }
1268
1269 static VkResult
1270 prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
1271 {
1272 struct cs_builder *b =
1273 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1274 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1275 const struct panvk_shader *fs = get_fs(cmdbuf);
1276 VkResult result;
1277
1278 if (gfx_state_dirty(cmdbuf, VS_PUSH_UNIFORMS)) {
1279 result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, vs);
1280 if (result != VK_SUCCESS)
1281 return result;
1282
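      /* The FAU buffer address and the FAU count are packed into a single
       * 64-bit register, with the count in the top 8 bits (assumed to be the
       * encoding expected for the push-uniform setup). */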
1283 cs_update_vt_ctx(b) {
1284 cs_move64_to(b, cs_sr_reg64(b, 8),
1285 cmdbuf->state.gfx.vs.push_uniforms |
1286 ((uint64_t)vs->fau.total_count << 56));
1287 }
1288 }
1289
1290 if (fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, FS_PUSH_UNIFORMS)) {
1291 uint64_t fau_ptr = 0;
1292
1293 if (fs) {
1294 result = panvk_per_arch(cmd_prepare_push_uniforms)(cmdbuf, fs);
1295 if (result != VK_SUCCESS)
1296 return result;
1297
1298 fau_ptr = cmdbuf->state.gfx.fs.push_uniforms |
1299 ((uint64_t)fs->fau.total_count << 56);
1300 }
1301
1302 cs_update_vt_ctx(b)
1303 cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr);
1304 }
1305
1306 return VK_SUCCESS;
1307 }
1308
1309 static VkResult
1310 prepare_ds(struct panvk_cmd_buffer *cmdbuf)
1311 {
1312 bool dirty = dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1313 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1314 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1315 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1316 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1317 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_COMPARE_MASK) ||
1318 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1319 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_REFERENCE) ||
1320 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1321 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE) ||
1322 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_ENABLE) ||
1323 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_BIAS_FACTORS) ||
1324 fs_user_dirty(cmdbuf);
1325
1326 if (!dirty)
1327 return VK_SUCCESS;
1328
1329 struct cs_builder *b =
1330 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1331 const struct vk_dynamic_graphics_state *dyns =
1332 &cmdbuf->vk.dynamic_graphics_state;
1333 const struct vk_depth_stencil_state *ds = &dyns->ds;
1334 const struct vk_rasterization_state *rs = &dyns->rs;
1335 bool test_s = has_stencil_att(cmdbuf) && ds->stencil.test_enable;
1336 bool test_z = has_depth_att(cmdbuf) && ds->depth.test_enable;
1337 const struct panvk_shader *fs = get_fs(cmdbuf);
1338
1339 struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL);
1340 if (!zsd.gpu)
1341 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1342
1343 pan_cast_and_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
1344 cfg.stencil_test_enable = test_s;
1345 if (test_s) {
1346 cfg.front_compare_function =
1347 translate_compare_func(ds->stencil.front.op.compare);
1348 cfg.front_stencil_fail =
1349 translate_stencil_op(ds->stencil.front.op.fail);
1350 cfg.front_depth_fail =
1351 translate_stencil_op(ds->stencil.front.op.depth_fail);
1352 cfg.front_depth_pass = translate_stencil_op(ds->stencil.front.op.pass);
1353 cfg.back_compare_function =
1354 translate_compare_func(ds->stencil.back.op.compare);
1355 cfg.back_stencil_fail = translate_stencil_op(ds->stencil.back.op.fail);
1356 cfg.back_depth_fail =
1357 translate_stencil_op(ds->stencil.back.op.depth_fail);
1358 cfg.back_depth_pass = translate_stencil_op(ds->stencil.back.op.pass);
1359 }
1360
1361 cfg.stencil_from_shader = fs ? fs->info.fs.writes_stencil : 0;
1362 cfg.front_write_mask = ds->stencil.front.write_mask;
1363 cfg.back_write_mask = ds->stencil.back.write_mask;
1364 cfg.front_value_mask = ds->stencil.front.compare_mask;
1365 cfg.back_value_mask = ds->stencil.back.compare_mask;
1366 cfg.front_reference_value = ds->stencil.front.reference;
1367 cfg.back_reference_value = ds->stencil.back.reference;
1368
1369 cfg.depth_cull_enable = vk_rasterization_state_depth_clip_enable(rs);
1370 if (rs->depth_clamp_enable)
1371 cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS;
1372
1373 if (fs)
1374 cfg.depth_source = pan_depth_source(&fs->info);
1375 cfg.depth_write_enable = test_z && ds->depth.write_enable;
1376 cfg.depth_bias_enable = rs->depth_bias.enable;
1377 cfg.depth_function = test_z ? translate_compare_func(ds->depth.compare_op)
1378 : MALI_FUNC_ALWAYS;
1379 cfg.depth_units = rs->depth_bias.constant_factor;
1380 cfg.depth_factor = rs->depth_bias.slope_factor;
1381 cfg.depth_bias_clamp = rs->depth_bias.clamp;
1382 }
1383
1384 cs_update_vt_ctx(b)
1385 cs_move64_to(b, cs_sr_reg64(b, 52), zsd.gpu);
1386
1387 return VK_SUCCESS;
1388 }
1389
1390 static VkResult
1391 wrap_prev_oq(struct panvk_cmd_buffer *cmdbuf)
1392 {
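   /* Occlusion queries used in the render pass are chained into a linked
    * list of panvk_cs_occlusion_query nodes whose head is published in the
    * subqueue context, presumably so their syncobjs can be signalled once
    * the fragment work producing the results has completed. */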
1393 uint64_t last_syncobj = cmdbuf->state.gfx.render.oq.last;
1394
1395 if (!last_syncobj)
1396 return VK_SUCCESS;
1397
1398 uint64_t prev_oq_node = cmdbuf->state.gfx.render.oq.chain;
1399 struct panfrost_ptr new_oq_node = panvk_cmd_alloc_dev_mem(
1400 cmdbuf, desc, sizeof(struct panvk_cs_occlusion_query), 8);
1401
1402 if (!new_oq_node.gpu)
1403 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1404
1405 cmdbuf->state.gfx.render.oq.chain = new_oq_node.gpu;
1406
1407 struct panvk_cs_occlusion_query *oq = new_oq_node.cpu;
1408
1409 *oq = (struct panvk_cs_occlusion_query){
1410 .syncobj = last_syncobj,
1411 .next = prev_oq_node,
1412 };
1413
1414 /* If we already had an OQ in the chain, we don't need to initialize the
1415 * oq_chain field in the subqueue ctx. */
1416 if (prev_oq_node)
1417 return VK_SUCCESS;
1418
1419 /* If we're a secondary cmdbuf inside a render pass, we let the primary
1420 * cmdbuf link the OQ chain. */
1421 if (cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)
1422 return VK_SUCCESS;
1423
1424 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1425 struct cs_index oq_node_reg = cs_scratch_reg64(b, 0);
1426
1427 cs_move64_to(b, oq_node_reg, new_oq_node.gpu);
1428
1429 /* If we're resuming, we need to link with the previous oq_chain, if any. */
1430 if (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT) {
1431 struct cs_index prev_oq_node_reg = cs_scratch_reg64(b, 2);
1432
1433 cs_load64_to(
1434 b, prev_oq_node_reg, cs_subqueue_ctx_reg(b),
1435 offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1436 cs_wait_slot(b, SB_ID(LS), false);
1437 cs_store64(b, prev_oq_node_reg, oq_node_reg,
1438 offsetof(struct panvk_cs_occlusion_query, next));
1439 cs_wait_slot(b, SB_ID(LS), false);
1440 }
1441
1442 cs_store64(b, oq_node_reg, cs_subqueue_ctx_reg(b),
1443 offsetof(struct panvk_cs_subqueue_context, render.oq_chain));
1444 cs_wait_slot(b, SB_ID(LS), false);
1445 return VK_SUCCESS;
1446 }
1447
1448 static VkResult
1449 prepare_oq(struct panvk_cmd_buffer *cmdbuf)
1450 {
1451 if (!gfx_state_dirty(cmdbuf, OQ) ||
1452 cmdbuf->state.gfx.occlusion_query.syncobj ==
1453 cmdbuf->state.gfx.render.oq.last)
1454 return VK_SUCCESS;
1455
1456 VkResult result = wrap_prev_oq(cmdbuf);
1457 if (result)
1458 return result;
1459
1460 struct cs_builder *b =
1461 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1462 cs_move64_to(b, cs_sr_reg64(b, 46), cmdbuf->state.gfx.occlusion_query.ptr);
1463
1464 cmdbuf->state.gfx.render.oq.last =
1465 cmdbuf->state.gfx.occlusion_query.syncobj;
1466 return VK_SUCCESS;
1467 }
1468
1469 static void
1470 prepare_dcd(struct panvk_cmd_buffer *cmdbuf)
1471 {
1472 struct cs_builder *b =
1473 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1474 const struct panvk_shader *fs = get_fs(cmdbuf);
1475 bool dcd0_dirty =
1476 dyn_gfx_state_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||
1477 dyn_gfx_state_dirty(cmdbuf, RS_CULL_MODE) ||
1478 dyn_gfx_state_dirty(cmdbuf, RS_FRONT_FACE) ||
1479 dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1480 dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1481 dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) ||
1482 dyn_gfx_state_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
1483 /* writes_depth() uses vk_depth_stencil_state */
1484 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
1485 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
1486 dyn_gfx_state_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
1487 /* writes_stencil() uses vk_depth_stencil_state */
1488 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
1489 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_OP) ||
1490 dyn_gfx_state_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
1491 fs_user_dirty(cmdbuf) || gfx_state_dirty(cmdbuf, RENDER_STATE) ||
1492 gfx_state_dirty(cmdbuf, OQ);
1493 bool dcd1_dirty = dyn_gfx_state_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
1494 dyn_gfx_state_dirty(cmdbuf, MS_SAMPLE_MASK) ||
1495 fs_user_dirty(cmdbuf) ||
1496 gfx_state_dirty(cmdbuf, RENDER_STATE);
1497
1498 const struct vk_dynamic_graphics_state *dyns =
1499 &cmdbuf->vk.dynamic_graphics_state;
1500 const struct vk_rasterization_state *rs =
1501 &cmdbuf->vk.dynamic_graphics_state.rs;
1502 bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
1503 bool writes_z = writes_depth(cmdbuf);
1504 bool writes_s = writes_stencil(cmdbuf);
1505
1506 if (dcd0_dirty) {
1507 struct mali_dcd_flags_0_packed dcd0;
1508 pan_pack(&dcd0, DCD_FLAGS_0, cfg) {
1509 if (fs) {
1510 uint8_t rt_written = fs->info.outputs_written >> FRAG_RESULT_DATA0;
1511 uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
1512 MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;
1513
1514 cfg.allow_forward_pixel_to_kill =
1515 fs->info.fs.can_fpk && !(rt_mask & ~rt_written) &&
1516 !alpha_to_coverage && !cmdbuf->state.gfx.cb.info.any_dest_read;
1517
1518 bool writes_zs = writes_z || writes_s;
1519 bool zs_always_passes = ds_test_always_passes(cmdbuf);
1520 bool oq = cmdbuf->state.gfx.occlusion_query.mode !=
1521 MALI_OCCLUSION_MODE_DISABLED;
1522
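         /* Early-ZS decision: pan_earlyzs picks when the ZS update and pixel
          * kill may run relative to the shader. Anything that makes shader
          * side effects observable (ZS writes, an active occlusion query,
          * alpha-to-coverage) pushes it towards a more conservative choice. */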
1523 struct pan_earlyzs_state earlyzs =
1524 pan_earlyzs_get(pan_earlyzs_analyze(&fs->info), writes_zs || oq,
1525 alpha_to_coverage, zs_always_passes);
1526
1527 cfg.pixel_kill_operation = earlyzs.kill;
1528 cfg.zs_update_operation = earlyzs.update;
1529 cfg.evaluate_per_sample = fs->info.fs.sample_shading &&
1530 (dyns->ms.rasterization_samples > 1);
1531
1532 cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
1533 fs->info.fs.can_discard ||
1534 alpha_to_coverage;
1535 } else {
1536 cfg.allow_forward_pixel_to_kill = true;
1537 cfg.allow_forward_pixel_to_be_killed = true;
1538 cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1539 cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;
1540 cfg.overdraw_alpha0 = true;
1541 cfg.overdraw_alpha1 = true;
1542 }
1543
1544 cfg.front_face_ccw = rs->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
1545 cfg.cull_front_face = (rs->cull_mode & VK_CULL_MODE_FRONT_BIT) != 0;
1546 cfg.cull_back_face = (rs->cull_mode & VK_CULL_MODE_BACK_BIT) != 0;
1547
1548 cfg.multisample_enable = dyns->ms.rasterization_samples > 1;
1549 cfg.occlusion_query = cmdbuf->state.gfx.occlusion_query.mode;
1550 cfg.alpha_to_coverage = alpha_to_coverage;
1551 }
1552
1553 cs_update_vt_ctx(b)
1554 cs_move32_to(b, cs_sr_reg32(b, 57), dcd0.opaque[0]);
1555 }
1556
1557 if (dcd1_dirty) {
1558 struct mali_dcd_flags_1_packed dcd1;
1559 pan_pack(&dcd1, DCD_FLAGS_1, cfg) {
1560 cfg.sample_mask = dyns->ms.rasterization_samples > 1
1561 ? dyns->ms.sample_mask
1562 : UINT16_MAX;
1563
1564 if (fs) {
1565 cfg.render_target_mask =
1566 (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
1567 cmdbuf->state.gfx.render.bound_attachments;
1568 }
1569 }
1570
1571 cs_update_vt_ctx(b)
1572 cs_move32_to(b, cs_sr_reg32(b, 58), dcd1.opaque[0]);
1573 }
1574 }
1575
1576 static void
1577 prepare_index_buffer(struct panvk_cmd_buffer *cmdbuf,
1578 struct panvk_draw_info *draw)
1579 {
1580 struct cs_builder *b =
1581 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1582
1583 if (draw->index.size && gfx_state_dirty(cmdbuf, IB)) {
1584 uint64_t ib_size =
1585 panvk_buffer_range(cmdbuf->state.gfx.ib.buffer,
1586 cmdbuf->state.gfx.ib.offset, VK_WHOLE_SIZE);
1587 assert(ib_size <= UINT32_MAX);
1588 cs_move32_to(b, cs_sr_reg32(b, 39), ib_size);
1589
1590 cs_move64_to(b, cs_sr_reg64(b, 54),
1591 panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer,
1592 cmdbuf->state.gfx.ib.offset));
1593 }
1594 }
1595
1596 static void
1597 set_tiler_idvs_flags(struct cs_builder *b, struct panvk_cmd_buffer *cmdbuf,
1598 struct panvk_draw_info *draw)
1599 {
1600 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1601 const struct panvk_shader *fs = get_fs(cmdbuf);
1602 const struct vk_dynamic_graphics_state *dyns =
1603 &cmdbuf->vk.dynamic_graphics_state;
1604 const struct vk_input_assembly_state *ia = &dyns->ia;
1605 const struct vk_rasterization_state *rs = &dyns->rs;
1606 struct mali_primitive_flags_packed tiler_idvs_flags;
1607
1608 /* When drawing non-point primitives, we use the no_psiz variant which has
1609 * point size writes patched out */
1610 bool writes_point_size =
1611 vs->info.vs.writes_point_size &&
1612 ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
1613 bool multiview = cmdbuf->state.gfx.render.view_mask;
1614 bool writes_layer = vs->info.outputs_written & VARYING_BIT_LAYER;
1615
1616 /* Multiview shaders depend on the FIFO format for indexing per-view
1617 * output writes. We don't currently patch these offsets in the no_psiz
1618 * variant, so we still need the extended format even though the shader
1619 * does not write point size. */
1620 bool extended_fifo = writes_point_size || writes_layer ||
1621 (vs->info.vs.writes_point_size && multiview);
1622
1623 bool dirty = gfx_state_dirty(cmdbuf, VS) || fs_user_dirty(cmdbuf) ||
1624 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_RESTART_ENABLE) ||
1625 dyn_gfx_state_dirty(cmdbuf, IA_PRIMITIVE_TOPOLOGY) ||
1626 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
1627 dyn_gfx_state_dirty(cmdbuf, RS_DEPTH_CLIP_ENABLE);
1628
1629 if (dirty) {
1630 pan_pack(&tiler_idvs_flags, PRIMITIVE_FLAGS, cfg) {
1631 cfg.draw_mode = translate_prim_topology(ia->primitive_topology);
1632
1633 cfg.point_size_array_format = writes_point_size
1634 ? MALI_POINT_SIZE_ARRAY_FORMAT_FP16
1635 : MALI_POINT_SIZE_ARRAY_FORMAT_NONE;
1636 cfg.layer_index_enable = writes_layer;
1637
1638 cfg.position_fifo_format = extended_fifo
1639 ? MALI_FIFO_FORMAT_EXTENDED
1640 : MALI_FIFO_FORMAT_BASIC;
1641
1642 cfg.low_depth_cull = cfg.high_depth_cull =
1643 vk_rasterization_state_depth_clip_enable(rs);
1644
1645 cfg.secondary_shader = vs->info.vs.secondary_enable && fs != NULL;
1646 cfg.primitive_restart = ia->primitive_restart_enable;
1647 cfg.view_mask = cmdbuf->state.gfx.render.view_mask;
1648 }
1649
1650 cs_move32_to(b, cs_sr_reg32(b, 56), tiler_idvs_flags.opaque[0]);
1651 }
1652 }
1653
1654 static struct mali_primitive_flags_packed
1655 get_tiler_flags_override(struct panvk_draw_info *draw)
1656 {
1657 struct mali_primitive_flags_packed flags_override;
1658 /* Pack with nodefaults so only explicitly set override fields affect the
1659 * previously set register values */
1660 pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
1661 cfg.index_type = index_size_to_index_type(draw->index.size);
1662 };
1663
1664 return flags_override;
1665 }
1666
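/* Emit all the state a draw depends on: TLS, render context, push
 * descriptors, blend, draw sysvals and push uniforms, VS/FS state, index
 * buffer, IDVS primitive flags, depth/stencil, occlusion query, DCD words,
 * viewport and primitive size. Dirty flags are cleared at the end so the
 * next draw only re-emits what actually changed. */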
1667 static VkResult
1668 prepare_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1669 {
1670 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1671 const struct panvk_shader *fs = get_fs(cmdbuf);
1672 struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
1673 bool idvs = vs->info.vs.idvs;
1674 VkResult result;
1675
1676 assert(vs);
1677
1678 /* FIXME: support non-IDVS. */
1679 assert(idvs);
1680
1681 set_provoking_vertex_mode(cmdbuf);
1682
1683 result = update_tls(cmdbuf);
1684 if (result != VK_SUCCESS)
1685 return result;
1686
1687 if (!inherits_render_ctx(cmdbuf)) {
1688 result = get_render_ctx(cmdbuf);
1689 if (result != VK_SUCCESS)
1690 return result;
1691 }
1692
1693 struct cs_builder *b =
1694 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1695
1696 uint32_t used_set_mask =
1697 vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);
1698
1699 if (gfx_state_dirty(cmdbuf, DESC_STATE) || gfx_state_dirty(cmdbuf, VS) ||
1700 gfx_state_dirty(cmdbuf, FS)) {
1701 result = panvk_per_arch(cmd_prepare_push_descs)(cmdbuf, desc_state,
1702 used_set_mask);
1703 if (result != VK_SUCCESS)
1704 return result;
1705 }
1706
1707 result = prepare_blend(cmdbuf);
1708 if (result != VK_SUCCESS)
1709 return result;
1710
1711 panvk_per_arch(cmd_prepare_draw_sysvals)(cmdbuf, draw);
1712
1713 result = prepare_push_uniforms(cmdbuf);
1714 if (result != VK_SUCCESS)
1715 return result;
1716
1717 result = prepare_vs(cmdbuf);
1718 if (result != VK_SUCCESS)
1719 return result;
1720
1721 result = prepare_fs(cmdbuf);
1722 if (result != VK_SUCCESS)
1723 return result;
1724
1725 /* Assumes 16 byte slots. We could do better. */
1726 uint32_t varying_size = get_varying_slots(cmdbuf) * 16;
1727
1728 cs_update_vt_ctx(b) {
1729 /* We don't use the resource dep system yet. */
1730 cs_move32_to(b, cs_sr_reg32(b, 38), 0);
1731
1732 prepare_index_buffer(cmdbuf, draw);
1733
1734 set_tiler_idvs_flags(b, cmdbuf, draw);
1735
1736 cs_move32_to(b, cs_sr_reg32(b, 48), varying_size);
1737
1738 result = prepare_ds(cmdbuf);
1739 if (result != VK_SUCCESS)
1740 return result;
1741
1742 result = prepare_oq(cmdbuf);
1743 if (result != VK_SUCCESS)
1744 return result;
1745
1746 prepare_dcd(cmdbuf);
1747 prepare_vp(cmdbuf);
1748 prepare_tiler_primitive_size(cmdbuf);
1749 }
1750
1751 clear_dirty_after_draw(cmdbuf);
1752 return VK_SUCCESS;
1753 }
1754
1755 static void
1756 panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
1757 {
1758 const struct cs_tracing_ctx *tracing_ctx =
1759 &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1760 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1761 struct cs_builder *b =
1762 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1763 VkResult result;
1764
1765 /* If there's no vertex shader, we can skip the draw. */
1766 if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1767 return;
1768
1769 /* Needs to be done before get_fs() is called because it depends on
1770 * fs.required being initialized. */
1771 cmdbuf->state.gfx.fs.required =
1772 fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1773
1774 if (!cmdbuf->vk.dynamic_graphics_state.rs.rasterizer_discard_enable) {
1775 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1776 uint32_t rasterization_samples =
1777 cmdbuf->vk.dynamic_graphics_state.ms.rasterization_samples;
1778
1779 /* If there's no attachment, we patch nr_samples to match
1780 * rasterization_samples; otherwise, we make sure the two values match.
1781 */
1782 if (!cmdbuf->state.gfx.render.bound_attachments) {
1783 assert(rasterization_samples > 0);
1784 fbinfo->nr_samples = rasterization_samples;
1785 } else {
1786 assert(rasterization_samples == fbinfo->nr_samples);
1787 }
1788 }
1789
1790 result = prepare_draw(cmdbuf, draw);
1791 if (result != VK_SUCCESS)
1792 return;
1793
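/* Fill the draw registers for a direct draw: r33 = vertex/index count,
 * r34 = instance count, r35 = index offset, r36 = base vertex. r32 is
 * cleared, and r37 (base instance) is forced to zero, see the comment
 * below. */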
1794 cs_update_vt_ctx(b) {
1795 cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1796 cs_move32_to(b, cs_sr_reg32(b, 33), draw->vertex.count);
1797 cs_move32_to(b, cs_sr_reg32(b, 34), draw->instance.count);
1798 cs_move32_to(b, cs_sr_reg32(b, 35), draw->index.offset);
1799 cs_move32_to(b, cs_sr_reg32(b, 36), draw->vertex.base);
1800 /* NIR expects zero-based instance ID, but even if it did have an intrinsic to
1801 * load the absolute instance ID, we'd want to keep it zero-based to work around
1802 * Mali's limitation on non-zero firstInstance when an instance divisor is used.
1803 */
1804 cs_move32_to(b, cs_sr_reg32(b, 37), 0);
1805 }
1806
1807 struct mali_primitive_flags_packed flags_override =
1808 get_tiler_flags_override(draw);
1809
1810 uint32_t idvs_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
1811 MAX_LAYERS_PER_TILER_DESC);
1812
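/* A single tiler descriptor only covers MAX_LAYERS_PER_TILER_DESC layers, so
 * layered rendering issues one RUN_IDVS per tiler descriptor: the loop below
 * bumps the tiler context pointer by one descriptor per iteration and
 * restores it to the first descriptor once all layers have been walked. */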
1813 cs_req_res(b, CS_IDVS_RES);
1814 if (idvs_count > 1) {
1815 struct cs_index counter_reg = cs_scratch_reg32(b, 17);
1816 struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);
1817
1818 cs_move32_to(b, counter_reg, idvs_count);
1819
1820 cs_while(b, MALI_CS_CONDITION_GREATER, counter_reg) {
1821 cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1822 flags_override.opaque[0], false, true,
1823 cs_shader_res_sel(0, 0, 1, 0),
1824 cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1825
1826 cs_add32(b, counter_reg, counter_reg, -1);
1827 cs_update_vt_ctx(b) {
1828 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1829 pan_size(TILER_CONTEXT));
1830 }
1831 }
1832
1833 cs_update_vt_ctx(b) {
1834 cs_add64(b, tiler_ctx_addr, tiler_ctx_addr,
1835 -(idvs_count * pan_size(TILER_CONTEXT)));
1836 }
1837 } else {
1838 cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
1839 flags_override.opaque[0], false, true,
1840 cs_shader_res_sel(0, 0, 1, 0),
1841 cs_shader_res_sel(2, 2, 2, 0), cs_undef());
1842 }
1843 cs_req_res(b, 0);
1844 }
1845
1846 VkResult
1847 panvk_per_arch(cmd_prepare_exec_cmd_for_draws)(
1848 struct panvk_cmd_buffer *primary,
1849 struct panvk_cmd_buffer *secondary)
1850 {
1851 if (!(secondary->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1852 return VK_SUCCESS;
1853
1854 if (!inherits_render_ctx(primary)) {
1855 VkResult result = get_render_ctx(primary);
1856 if (result != VK_SUCCESS)
1857 return result;
1858 }
1859
1860 return prepare_oq(primary);
1861 }
1862
1863 VKAPI_ATTR void VKAPI_CALL
1864 panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount,
1865 uint32_t instanceCount, uint32_t firstVertex,
1866 uint32_t firstInstance)
1867 {
1868 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1869
1870 if (instanceCount == 0 || vertexCount == 0)
1871 return;
1872
1873 /* gl_BaseVertexARB is a signed integer, and it should expose the value of
1874 * firstVertex in a non-indexed draw. */
1875 assert(firstVertex < INT32_MAX);
1876
1877 /* gl_BaseInstance is a signed integer, and it should expose the value of
1878 * firstInstance. */
1879 assert(firstInstance < INT32_MAX);
1880
1881 struct panvk_draw_info draw = {
1882 .vertex.base = firstVertex,
1883 .vertex.count = vertexCount,
1884 .instance.base = firstInstance,
1885 .instance.count = instanceCount,
1886 };
1887
1888 panvk_cmd_draw(cmdbuf, &draw);
1889 }
1890
1891 VKAPI_ATTR void VKAPI_CALL
1892 panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer,
1893 uint32_t indexCount, uint32_t instanceCount,
1894 uint32_t firstIndex, int32_t vertexOffset,
1895 uint32_t firstInstance)
1896 {
1897 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1898
1899 if (instanceCount == 0 || indexCount == 0)
1900 return;
1901
1902 /* gl_BaseInstance is a signed integer, and it should expose the value of
1903 * firstInstance. */
1904 assert(firstInstance < INT32_MAX);
1905
1906 struct panvk_draw_info draw = {
1907 .index.size = cmdbuf->state.gfx.ib.index_size,
1908 .index.offset = firstIndex,
1909 .vertex.base = vertexOffset,
1910 .vertex.count = indexCount,
1911 .instance.count = instanceCount,
1912 .instance.base = firstInstance,
1913 };
1914
1915 panvk_cmd_draw(cmdbuf, &draw);
1916 }
1917
1918 static void
1919 panvk_cmd_draw_indirect(struct panvk_cmd_buffer *cmdbuf,
1920 struct panvk_draw_info *draw)
1921 {
1922 const struct cs_tracing_ctx *tracing_ctx =
1923 &cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].tracing;
1924 const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
1925 struct cs_builder *b =
1926 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1927 VkResult result;
1928
1929 /* If there's no vertex shader, we can skip the draw. */
1930 if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
1931 return;
1932
1933 /* Needs to be done before get_fs() is called because it depends on
1934 * fs.required being initialized. */
1935 cmdbuf->state.gfx.fs.required =
1936 fs_required(&cmdbuf->state.gfx, &cmdbuf->vk.dynamic_graphics_state);
1937
1938 /* Layered indirect draw (VK_EXT_shader_viewport_index_layer) needs
1939 * additional changes. We allow layer_count == 0 because that happens
1940 * when mixing dynamic rendering and secondary command buffers. Once
1941 * we decide to support layered+indirect, we'll need to pass the
1942 * layer_count info through the tiler descriptor, for instance by
1943 * re-using one of the words flagged 'ignored' in the descriptor
1944 * (word 14:23).
1945 *
1946 * Multiview is limited to 8 layers, and so will always fit in one TD.
1947 * Therefore layered rendering is allowed with multiview. */
1948 assert(cmdbuf->state.gfx.render.layer_count <= 1 ||
1949 cmdbuf->state.gfx.render.view_mask);
1950
1951 /* MultiDrawIndirect (.maxDrawIndirectCount) needs additional changes. */
1952 assert(draw->indirect.draw_count == 1);
1953
1954 /* Force a new push uniform block to be allocated */
1955 gfx_state_set_dirty(cmdbuf, VS_PUSH_UNIFORMS);
1956
1957 result = prepare_draw(cmdbuf, draw);
1958 if (result != VK_SUCCESS)
1959 return;
1960
1961 struct cs_index draw_params_addr = cs_scratch_reg64(b, 0);
1962 cs_move64_to(b, draw_params_addr, draw->indirect.buffer_dev_addr);
1963
1964 cs_update_vt_ctx(b) {
1965 cs_move32_to(b, cs_sr_reg32(b, 32), 0);
1966 /* Load SR33-37 from indirect buffer. */
1967 unsigned reg_mask = draw->index.size ? 0b11111 : 0b11011;
1968 cs_load_to(b, cs_sr_reg_tuple(b, 33, 5), draw_params_addr, reg_mask, 0);
1969 }
1970
1971 /* Wait for the SR33-37 indirect buffer load. */
1972 cs_wait_slot(b, SB_ID(LS), false);
1973
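/* first_vertex/base_instance sysvals can't be known at command-recording
 * time for indirect draws, so when the vertex shader uses them, the values
 * just loaded from the indirect command (r36/r37) are patched into the VS
 * push-uniform block at their remapped sysval offsets. This is why a new
 * push-uniform block was forced to be allocated above. */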
1974 if (shader_uses_sysval(vs, graphics, vs.first_vertex) ||
1975 shader_uses_sysval(vs, graphics, vs.base_instance)) {
1976 struct cs_index fau_block_addr = cs_scratch_reg64(b, 2);
1977 cs_move64_to(b, fau_block_addr, cmdbuf->state.gfx.vs.push_uniforms);
1978
1979 if (shader_uses_sysval(vs, graphics, vs.first_vertex)) {
1980 cs_store32(b, cs_sr_reg32(b, 36), fau_block_addr,
1981 shader_remapped_sysval_offset(
1982 vs, sysval_offset(graphics, vs.first_vertex)));
1983 }
1984
1985 if (shader_uses_sysval(vs, graphics, vs.base_instance)) {
1986 cs_store32(b, cs_sr_reg32(b, 37), fau_block_addr,
1987 shader_remapped_sysval_offset(
1988 vs, sysval_offset(graphics, vs.base_instance)));
1989 }
1990
1991 /* Wait for the store using SR-37 as src to finish, so we can overwrite
1992 * it. */
1993 cs_wait_slot(b, SB_ID(LS), false);
1994 }
1995
1996 /* NIR expects zero-based instance ID, but even if it did have an intrinsic to
1997 * load the absolute instance ID, we'd want to keep it zero-based to work around
1998 * Mali's limitation on non-zero firstInstance when an instance divisor is used.
1999 */
2000 cs_update_vt_ctx(b)
2001 cs_move32_to(b, cs_sr_reg32(b, 37), 0);
2002
2003 struct mali_primitive_flags_packed flags_override =
2004 get_tiler_flags_override(draw);
2005
2006 cs_req_res(b, CS_IDVS_RES);
2007 cs_trace_run_idvs(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2008 flags_override.opaque[0], false, true,
2009 cs_shader_res_sel(0, 0, 1, 0),
2010 cs_shader_res_sel(2, 2, 2, 0), cs_undef());
2011 cs_req_res(b, 0);
2012 }
2013
2014 VKAPI_ATTR void VKAPI_CALL
2015 panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer,
2016 VkDeviceSize offset, uint32_t drawCount,
2017 uint32_t stride)
2018 {
2019 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2020 VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
2021
2022 if (drawCount == 0)
2023 return;
2024
2025 struct panvk_draw_info draw = {
2026 .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
2027 .indirect.draw_count = drawCount,
2028 .indirect.stride = stride,
2029 };
2030
2031 panvk_cmd_draw_indirect(cmdbuf, &draw);
2032 }
2033
2034 VKAPI_ATTR void VKAPI_CALL
2035 panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer,
2036 VkBuffer _buffer, VkDeviceSize offset,
2037 uint32_t drawCount, uint32_t stride)
2038 {
2039 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2040 VK_FROM_HANDLE(panvk_buffer, buffer, _buffer);
2041
2042 if (drawCount == 0)
2043 return;
2044
2045 struct panvk_draw_info draw = {
2046 .index.size = cmdbuf->state.gfx.ib.index_size,
2047 .indirect.buffer_dev_addr = panvk_buffer_gpu_ptr(buffer, offset),
2048 .indirect.draw_count = drawCount,
2049 .indirect.stride = stride,
2050 };
2051
2052 panvk_cmd_draw_indirect(cmdbuf, &draw);
2053 }
2054
2055 void
2056 panvk_per_arch(cmd_inherit_render_state)(
2057 struct panvk_cmd_buffer *cmdbuf,
2058 const VkCommandBufferBeginInfo *pBeginInfo)
2059 {
2060 if (cmdbuf->vk.level != VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
2061 !(pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
2062 return;
2063
2064 assert(pBeginInfo->pInheritanceInfo);
2065 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
2066 const VkRenderingInfo *resume_info =
2067 vk_get_command_buffer_inheritance_as_rendering_resume(cmdbuf->vk.level,
2068 pBeginInfo,
2069 gcbiar_data);
2070 if (resume_info) {
2071 panvk_per_arch(cmd_init_render_state)(cmdbuf, resume_info);
2072 return;
2073 }
2074
2075 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
2076 vk_get_command_buffer_inheritance_rendering_info(cmdbuf->vk.level,
2077 pBeginInfo);
2078 assert(inheritance_info);
2079 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2080 struct panvk_physical_device *phys_dev =
2081 to_panvk_physical_device(dev->vk.physical);
2082 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2083
2084 cmdbuf->state.gfx.render.suspended = false;
2085 cmdbuf->state.gfx.render.flags = inheritance_info->flags;
2086
2087 gfx_state_set_dirty(cmdbuf, RENDER_STATE);
2088 memset(cmdbuf->state.gfx.render.fb.crc_valid, 0,
2089 sizeof(cmdbuf->state.gfx.render.fb.crc_valid));
2090 memset(&cmdbuf->state.gfx.render.color_attachments, 0,
2091 sizeof(cmdbuf->state.gfx.render.color_attachments));
2092 memset(&cmdbuf->state.gfx.render.z_attachment, 0,
2093 sizeof(cmdbuf->state.gfx.render.z_attachment));
2094 memset(&cmdbuf->state.gfx.render.s_attachment, 0,
2095 sizeof(cmdbuf->state.gfx.render.s_attachment));
2096 cmdbuf->state.gfx.render.bound_attachments = 0;
2097
2098 cmdbuf->state.gfx.render.view_mask = inheritance_info->viewMask;
2099 cmdbuf->state.gfx.render.layer_count = inheritance_info->viewMask ?
2100 util_last_bit(inheritance_info->viewMask) :
2101 0;
2102 *fbinfo = (struct pan_fb_info){
2103 .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
2104 .nr_samples = inheritance_info->rasterizationSamples,
2105 .rt_count = inheritance_info->colorAttachmentCount,
2106 };
2107
2108 assert(inheritance_info->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts));
2109
2110 for (uint32_t i = 0; i < inheritance_info->colorAttachmentCount; i++) {
2111 cmdbuf->state.gfx.render.bound_attachments |=
2112 MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
2113 cmdbuf->state.gfx.render.color_attachments.fmts[i] =
2114 inheritance_info->pColorAttachmentFormats[i];
2115 cmdbuf->state.gfx.render.color_attachments.samples[i] =
2116 inheritance_info->rasterizationSamples;
2117 }
2118
2119 if (inheritance_info->depthAttachmentFormat) {
2120 cmdbuf->state.gfx.render.bound_attachments |=
2121 MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2122 cmdbuf->state.gfx.render.z_attachment.fmt =
2123 inheritance_info->depthAttachmentFormat;
2124 }
2125
2126 if (inheritance_info->stencilAttachmentFormat) {
2127 cmdbuf->state.gfx.render.bound_attachments |=
2128 MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2129 cmdbuf->state.gfx.render.s_attachment.fmt =
2130 inheritance_info->stencilAttachmentFormat;
2131 }
2132
2133 const VkRenderingAttachmentLocationInfoKHR att_loc_info_default = {
2134 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_LOCATION_INFO_KHR,
2135 .colorAttachmentCount = inheritance_info->colorAttachmentCount,
2136 };
2137 const VkRenderingAttachmentLocationInfoKHR *att_loc_info =
2138 vk_get_command_buffer_rendering_attachment_location_info(
2139 cmdbuf->vk.level, pBeginInfo);
2140 if (att_loc_info == NULL)
2141 att_loc_info = &att_loc_info_default;
2142
2143 vk_cmd_set_rendering_attachment_locations(&cmdbuf->vk, att_loc_info);
2144 }
2145
2146 VKAPI_ATTR void VKAPI_CALL
2147 panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer,
2148 const VkRenderingInfo *pRenderingInfo)
2149 {
2150 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2151 struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
2152 bool resuming = pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT;
2153
2154 panvk_per_arch(cmd_init_render_state)(cmdbuf, pRenderingInfo);
2155
2156 /* If we're not resuming, the FBD should be NULL. */
2157 assert(!state->render.fbds.gpu || resuming);
2158
2159 if (!resuming)
2160 panvk_per_arch(cmd_preload_render_area_border)(cmdbuf, pRenderingInfo);
2161 }
2162
2163 static void
2164 flush_tiling(struct panvk_cmd_buffer *cmdbuf)
2165 {
2166 struct cs_builder *b =
2167 panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
2168
2169 struct cs_index render_ctx = cs_scratch_reg64(b, 2);
2170
2171 if (cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf)) {
2172 /* Flush the tiling operations and signal the internal sync object. */
2173 cs_req_res(b, CS_TILER_RES);
2174 cs_finish_tiling(b, false);
2175 cs_req_res(b, 0);
2176
2177 struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2178 struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2179 struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2180 struct cs_index add_val = cs_scratch_reg64(b, 4);
2181
2182 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2183 BITFIELD_MASK(3),
2184 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2185 cs_wait_slot(b, SB_ID(LS), false);
2186
2187 /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to
2188 * skip an ADD operation on the syncobjs pointer. */
2189 STATIC_ASSERT(PANVK_SUBQUEUE_VERTEX_TILER == 0);
2190
2191 cs_move64_to(b, add_val, 1);
2192
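/* Completion is deferred on the scoreboard slot tracking the current
 * iteration: the VERTEX_TILER_COMPLETED heap operation and the syncobj
 * increment only execute once the tiling work tracked by that slot is done,
 * then the iteration scoreboard is advanced. */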
2193 cs_match(b, iter_sb, cmp_scratch) {
2194 #define CASE(x) \
2195 cs_case(b, x) { \
2196 cs_heap_operation(b, \
2197 MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, \
2198 cs_defer(SB_WAIT_ITER(x), \
2199 SB_ID(DEFERRED_SYNC))); \
2200 cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, \
2201 add_val, sync_addr, \
2202 cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC))); \
2203 cs_move32_to(b, iter_sb, next_iter_sb(x)); \
2204 }
2205
2206 CASE(0)
2207 CASE(1)
2208 CASE(2)
2209 CASE(3)
2210 CASE(4)
2211 #undef CASE
2212 }
2213
2214 cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2215 offsetof(struct panvk_cs_subqueue_context, iter_sb));
2216 cs_wait_slot(b, SB_ID(LS), false);
2217
2218 /* Update the vertex seqno. */
2219 ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2220 } else {
2221 cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b),
2222 offsetof(struct panvk_cs_subqueue_context, render));
2223 cs_wait_slot(b, SB_ID(LS), false);
2224 }
2225 }
2226
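/* Make the fragment subqueue wait for the vertex/tiler subqueue to reach the
 * sync point of the last flushed tiling work. The absolute sync point is the
 * vertex/tiler progress seqno register plus the relative sync point tracked
 * on the CPU side. */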
2227 static void
2228 wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
2229 {
2230 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2231 struct cs_index vt_sync_addr = cs_scratch_reg64(b, 0);
2232 struct cs_index vt_sync_point = cs_scratch_reg64(b, 2);
2233 uint64_t rel_vt_sync_point =
2234 cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
2235
2236 cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b),
2237 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2238 cs_wait_slot(b, SB_ID(LS), false);
2239
2240 cs_add64(b, vt_sync_point,
2241 cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER),
2242 rel_vt_sync_point);
2243 cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, vt_sync_point,
2244 vt_sync_addr);
2245 }
2246
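/* The tiler OOM exception handlers live in a device BO, one variant per
 * framebuffer configuration (with/without a ZS extension, per render-target
 * count); pick the variant matching the current framebuffer. */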
2247 static uint32_t
2248 calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf)
2249 {
2250 const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
2251 bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
2252 uint32_t rt_count = MAX2(fb->rt_count, 1);
2253
2254 return get_tiler_oom_handler_idx(has_zs_ext, rt_count);
2255 }
2256
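/* Seed the tiler OOM context consumed by the exception handler: reset the
 * incremental-render counter, record the per-IR-pass FBD arrays, each located
 * (1 + pass) * calc_fbd_size() * layer_count bytes past the base FBD pointer
 * held in r40, and store the tiler descriptor and layer counts. */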
2257 static void
2258 setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf)
2259 {
2260 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2261
2262 uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2263 MAX_LAYERS_PER_TILER_DESC);
2264 uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2265 uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
2266
2267 struct cs_index counter = cs_scratch_reg32(b, 1);
2268 cs_move32_to(b, counter, 0);
2269 cs_store32(b, counter, cs_subqueue_ctx_reg(b),
2270 TILER_OOM_CTX_FIELD_OFFSET(counter));
2271
2272 struct cs_index fbd_first = cs_scratch_reg64(b, 2);
2273 cs_add64(b, fbd_first, cs_sr_reg64(b, 40),
2274 (1 + PANVK_IR_FIRST_PASS) * fbd_ir_pass_offset);
2275 cs_store64(b, fbd_first, cs_subqueue_ctx_reg(b),
2276 TILER_OOM_CTX_FBDPTR_OFFSET(FIRST));
2277 struct cs_index fbd_middle = cs_scratch_reg64(b, 4);
2278 cs_add64(b, fbd_middle, cs_sr_reg64(b, 40),
2279 (1 + PANVK_IR_MIDDLE_PASS) * fbd_ir_pass_offset);
2280 cs_store64(b, fbd_middle, cs_subqueue_ctx_reg(b),
2281 TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE));
2282 struct cs_index fbd_last = cs_scratch_reg64(b, 6);
2283 cs_add64(b, fbd_last, cs_sr_reg64(b, 40),
2284 (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2285 cs_store64(b, fbd_last, cs_subqueue_ctx_reg(b),
2286 TILER_OOM_CTX_FBDPTR_OFFSET(LAST));
2287
2288 struct cs_index td_count_reg = cs_scratch_reg32(b, 8);
2289 cs_move32_to(b, td_count_reg, td_count);
2290 cs_store32(b, td_count_reg, cs_subqueue_ctx_reg(b),
2291 TILER_OOM_CTX_FIELD_OFFSET(td_count));
2292 struct cs_index layer_count = cs_scratch_reg32(b, 9);
2293 cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
2294 cs_store32(b, layer_count, cs_subqueue_ctx_reg(b),
2295 TILER_OOM_CTX_FIELD_OFFSET(layer_count));
2296
2297 cs_wait_slot(b, SB_ID(LS), false);
2298 }
2299
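/* Emit the fragment side of the render pass on the fragment subqueue:
 * program the render area, install the tiler OOM handler while waiting for
 * the vertex/tiler work, select the right FBD set depending on whether an
 * incremental render happened, run one RUN_FRAGMENT per enabled layer, and
 * finally defer completion (heap cleanup, descriptor ring-buffer release,
 * occlusion query signalling, syncobj increment) on the iteration
 * scoreboard. */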
2300 static VkResult
2301 issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
2302 {
2303 struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
2304 struct panvk_instance *instance =
2305 to_panvk_instance(dev->vk.physical->instance);
2306 const struct cs_tracing_ctx *tracing_ctx =
2307 &cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].tracing;
2308 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2309 struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2310 bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
2311
2312 /* Reserve a scoreboard for the fragment job. */
2313 panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
2314
2315 /* Now initialize the fragment bits. */
2316 cs_update_frag_ctx(b) {
2317 cs_move32_to(b, cs_sr_reg32(b, 42),
2318 (fbinfo->extent.miny << 16) | fbinfo->extent.minx);
2319 cs_move32_to(b, cs_sr_reg32(b, 43),
2320 (fbinfo->extent.maxy << 16) | fbinfo->extent.maxx);
2321 }
2322
2323 bool simul_use =
2324 cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2325
2326 /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
2327 * involved (clear job) or if the update can happen in place (no
2328 * simultaneous use of the command buffer), we can avoid the
2329 * copy. */
2330 bool needs_tiling =
2331 cmdbuf->state.gfx.render.tiler || inherits_render_ctx(cmdbuf);
2332
2333 /* If the command buffer can run in parallel on different queues, we need
2334 * to make sure each instance has its own descriptors, unless tiling is
2335 * not needed (AKA RUN_FRAGMENT used for clears), because then the FBD
2336 * descriptors are constant (no need to patch them at runtime). */
2337 bool free_render_descs = simul_use && needs_tiling;
2338 uint32_t fbd_sz = calc_fbd_size(cmdbuf);
2339 uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count;
2340 uint32_t td_count = 0;
2341 if (needs_tiling) {
2342 td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count,
2343 MAX_LAYERS_PER_TILER_DESC);
2344 }
2345
2346 /* Update the Tiler OOM context */
2347 setup_tiler_oom_ctx(cmdbuf);
2348
2349 /* Enable the oom handler before waiting for the vertex/tiler work.
2350 * At this point, the tiler oom context has been set up with the correct
2351 * state for this renderpass, so it's safe to enable. */
2352 struct cs_index addr_reg = cs_scratch_reg64(b, 0);
2353 struct cs_index length_reg = cs_scratch_reg32(b, 2);
2354 uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf);
2355 uint64_t handler_addr = dev->tiler_oom.handlers_bo->addr.dev +
2356 handler_idx * dev->tiler_oom.handler_stride;
2357 cs_move64_to(b, addr_reg, handler_addr);
2358 cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride);
2359 cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2360 length_reg);
2361
2362 /* Wait for the tiling to be done before submitting the fragment job. */
2363 wait_finish_tiling(cmdbuf);
2364
2365 /* Disable the oom handler once the vertex/tiler work has finished.
2366 * We need to disable the handler at this point as the vertex/tiler subqueue
2367 * might continue on to the next renderpass and hit an out-of-memory
2368 * exception prior to the fragment subqueue setting up the tiler oom context
2369 * for the next renderpass.
2370 * By disabling the handler here, any exception will be left pending until a
2371 * new handler is registered, at which point the correct state has been set
2372 * up. */
2373 cs_move64_to(b, addr_reg, 0);
2374 cs_move32_to(b, length_reg, 0);
2375 cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg,
2376 length_reg);
2377
2378 /* Pick the correct set of FBDs based on whether an incremental render
2379 * occurred. */
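/* The FBDs for this render pass are laid out as consecutive blocks of
 * calc_fbd_size() * layer_count bytes: the primary-pass FBDs first, then one
 * block per incremental-render pass (FIRST, MIDDLE, LAST). If the OOM handler
 * bumped the counter, at least one incremental render happened and the final
 * RUN_FRAGMENTs must use the LAST-pass FBDs, hence the
 * (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset adjustment below. */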
2380 struct cs_index counter = cs_scratch_reg32(b, 0);
2381 cs_load32_to(
2382 b, counter, cs_subqueue_ctx_reg(b),
2383 offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter));
2384 cs_wait_slot(b, SB_ID(LS), false);
2385 cs_if(b, MALI_CS_CONDITION_GREATER, counter)
2386 cs_update_frag_ctx(b)
2387 cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40),
2388 (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset);
2389
2390 /* Applications tend to forget to describe subpass dependencies, especially
2391 * when it comes to write -> read dependencies on attachments. The
2392 * proprietary driver forces "others" invalidation as a workaround, and this
2393 * invalidation even became implicit (done as part of the RUN_FRAGMENT) on
2394 * v13+. We don't do that in panvk, but we provide a debug flag to help
2395 * identify those issues. */
2396 if (unlikely(instance->debug_flags & PANVK_DEBUG_IMPLICIT_OTHERS_INV)) {
2397 cs_flush_caches(b, 0, 0, true, length_reg,
2398 cs_defer(0x0, SB_ID(IMM_FLUSH)));
2399 cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
2400 }
2401
2402 cs_req_res(b, CS_FRAG_RES);
2403 if (cmdbuf->state.gfx.render.layer_count > 1) {
2404 struct cs_index layer_count = cs_sr_reg32(b, 47);
2405
2406 cs_move32_to(b, layer_count, calc_enabled_layer_count(cmdbuf));
2407 cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
2408 cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2409 false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2410
2411 cs_add32(b, layer_count, layer_count, -1);
2412 cs_update_frag_ctx(b)
2413 cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), fbd_sz);
2414 }
2415 } else {
2416 cs_trace_run_fragment(b, tracing_ctx, cs_scratch_reg_tuple(b, 0, 4),
2417 false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
2418 }
2419 cs_req_res(b, 0);
2420
2421 struct cs_index sync_addr = cs_scratch_reg64(b, 0);
2422 struct cs_index iter_sb = cs_scratch_reg32(b, 2);
2423 struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
2424 struct cs_index add_val = cs_scratch_reg64(b, 4);
2425 struct cs_index add_val_lo = cs_scratch_reg32(b, 4);
2426 struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6);
2427 struct cs_index release_sz = cs_scratch_reg32(b, 8);
2428
2429 struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
2430 struct cs_index completed_top = cs_scratch_reg64(b, 10);
2431 struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
2432 struct cs_index cur_tiler = cs_sr_reg64(b, 38);
2433 struct cs_index tiler_count = cs_sr_reg32(b, 47);
2434 struct cs_index oq_chain = cs_scratch_reg64(b, 10);
2435 struct cs_index oq_chain_lo = cs_scratch_reg32(b, 10);
2436 struct cs_index oq_chain_hi = cs_scratch_reg32(b, 11);
2437 struct cs_index oq_syncobj = cs_scratch_reg64(b, 12);
2438
2439 cs_move64_to(b, add_val, 1);
2440 cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
2441 BITFIELD_MASK(3),
2442 offsetof(struct panvk_cs_subqueue_context, syncobjs));
2443
2444 if (free_render_descs) {
2445 cs_move32_to(b, release_sz, calc_render_descs_size(cmdbuf));
2446 cs_load64_to(b, ringbuf_sync_addr, cs_subqueue_ctx_reg(b),
2447 offsetof(struct panvk_cs_subqueue_context,
2448 render.desc_ringbuf.syncobj));
2449 }
2450
2451 cs_wait_slot(b, SB_ID(LS), false);
2452
2453 cs_add64(b, sync_addr, sync_addr,
2454 PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
2455 cs_move32_to(b, tiler_count, td_count);
2456
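/* As on the vertex/tiler side, completion is deferred on the scoreboard slot
 * tracking this iteration: the completed heap-chunk range read back from each
 * tiler descriptor is passed to FINISH_FRAGMENT, the descriptor ring-buffer
 * space is released when simultaneous use forced per-submit copies, every
 * occlusion query in the chain is signalled once the deferred cache flush is
 * done, and the fragment syncobj is incremented before the iteration
 * scoreboard advances. */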
2457 cs_match(b, iter_sb, cmp_scratch) {
2458 #define CASE(x) \
2459 cs_case(b, x) { \
2460 const struct cs_async_op async = \
2461 cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); \
2462 if (td_count == 1) { \
2463 cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
2464 cs_wait_slot(b, SB_ID(LS), false); \
2465 cs_finish_fragment(b, true, completed_top, completed_bottom, async); \
2466 } else if (td_count > 1) { \
2467 cs_while(b, MALI_CS_CONDITION_GREATER, tiler_count) { \
2468 cs_load_to(b, completed, cur_tiler, BITFIELD_MASK(4), 40); \
2469 cs_wait_slot(b, SB_ID(LS), false); \
2470 cs_finish_fragment(b, false, completed_top, completed_bottom, \
2471 async); \
2472 cs_update_frag_ctx(b) \
2473 cs_add64(b, cur_tiler, cur_tiler, pan_size(TILER_CONTEXT)); \
2474 cs_add32(b, tiler_count, tiler_count, -1); \
2475 } \
2476 cs_frag_end(b, async); \
2477 } \
2478 if (free_render_descs) { \
2479 cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz, \
2480 ringbuf_sync_addr, async); \
2481 } \
2482 if (has_oq_chain) { \
2483 struct cs_index flush_id = oq_chain_lo; \
2484 cs_move32_to(b, flush_id, 0); \
2485 cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, \
2486 MALI_CS_FLUSH_MODE_CLEAN, false, flush_id, \
2487 cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_FLUSH))); \
2488 cs_load64_to( \
2489 b, oq_chain, cs_subqueue_ctx_reg(b), \
2490 offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
2491 cs_wait_slot(b, SB_ID(LS), false); \
2492 /* We use oq_syncobj as a placeholder to reset the oq_chain. */ \
2493 cs_move64_to(b, oq_syncobj, 0); \
2494 cs_store64( \
2495 b, oq_syncobj, cs_subqueue_ctx_reg(b), \
2496 offsetof(struct panvk_cs_subqueue_context, render.oq_chain)); \
2497 cs_wait_slot(b, SB_ID(LS), false); \
2498 cs_while(b, MALI_CS_CONDITION_ALWAYS, cs_undef()) { \
2499 cs_load64_to(b, oq_syncobj, oq_chain, \
2500 offsetof(struct panvk_cs_occlusion_query, syncobj)); \
2501 cs_wait_slot(b, SB_ID(LS), false); \
2502 cs_load64_to(b, oq_chain, oq_chain, \
2503 offsetof(struct panvk_cs_occlusion_query, next)); \
2504 cs_wait_slot(b, SB_ID(LS), false); \
2505 cs_sync32_set( \
2506 b, true, MALI_CS_SYNC_SCOPE_CSG, add_val_lo, oq_syncobj, \
2507 cs_defer(SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC))); \
2508 cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_lo) \
2509 cs_continue(b); \
2510 cs_if(b, MALI_CS_CONDITION_NEQUAL, oq_chain_hi) \
2511 cs_continue(b); \
2512 cs_break(b); \
2513 } \
2514 } \
2515 cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr, \
2516 async); \
2517 cs_move32_to(b, iter_sb, next_iter_sb(x)); \
2518 }
2519
2520 CASE(0)
2521 CASE(1)
2522 CASE(2)
2523 CASE(3)
2524 CASE(4)
2525 #undef CASE
2526 }
2527
2528 cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2529 offsetof(struct panvk_cs_subqueue_context, iter_sb));
2530 cs_wait_slot(b, SB_ID(LS), false);
2531
2532 /* Update the ring buffer position. */
2533 if (free_render_descs) {
2534 cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf),
2535 !tracing_ctx->enabled);
2536 }
2537
2538 /* Update the frag seqno. */
2539 ++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;
2540
2542 return VK_SUCCESS;
2543 }
2544
2545 void
2546 panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf)
2547 {
2548 /* If there was no draw queued, we don't need to force a preload. */
2549 if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2550 flush_tiling(cmdbuf);
2551 issue_fragment_jobs(cmdbuf);
2552 memset(&cmdbuf->state.gfx.render.fbds, 0,
2553 sizeof(cmdbuf->state.gfx.render.fbds));
2554 cmdbuf->state.gfx.render.tiler = 0;
2555
2556 panvk_per_arch(cmd_force_fb_preload)(cmdbuf, NULL);
2557
2558 /* We inherited the render context, and need to let the primary command
2559 * buffer know that it's changed. */
2560 cmdbuf->state.gfx.render.invalidate_inherited_ctx = true;
2561
2562 /* Re-emit the FB/Tiler descs if we inherited them. */
2563 if (inherits_render_ctx(cmdbuf))
2564 get_render_ctx(cmdbuf);
2565 }
2566 }
2567
2568 VKAPI_ATTR void VKAPI_CALL
2569 panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer)
2570 {
2571 VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2572 bool suspending = cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT;
2573 VkResult result;
2574
2575 if (!suspending) {
2576 struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
2577 bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s;
2578 for (unsigned i = 0; i < fbinfo->rt_count; i++)
2579 clear |= fbinfo->rts[i].clear;
2580
2581 if (clear && !inherits_render_ctx(cmdbuf)) {
2582 result = get_fb_descs(cmdbuf);
2583 if (result != VK_SUCCESS)
2584 return;
2585 }
2586
2587 /* Flush the last occlusion query before ending the render pass if
2588 * this query has ended while we were inside the render pass. */
2589 if (cmdbuf->state.gfx.render.oq.last !=
2590 cmdbuf->state.gfx.occlusion_query.syncobj) {
2591 result = wrap_prev_oq(cmdbuf);
2592 if (result != VK_SUCCESS)
2593 return;
2594 }
2595
2596 if (cmdbuf->state.gfx.render.fbds.gpu || inherits_render_ctx(cmdbuf)) {
2597 flush_tiling(cmdbuf);
2598 issue_fragment_jobs(cmdbuf);
2599 }
2600 } else if (!inherits_render_ctx(cmdbuf)) {
2601 /* If we're suspending the render pass and we didn't inherit the render
2602 * context, we need to emit it now, so it's available when the render pass
2603 * is resumed. */
2604 VkResult result = get_render_ctx(cmdbuf);
2605 if (result != VK_SUCCESS)
2606 return;
2607 }
2608
2609 memset(&cmdbuf->state.gfx.render.fbds, 0,
2610 sizeof(cmdbuf->state.gfx.render.fbds));
2611 memset(&cmdbuf->state.gfx.render.oq, 0, sizeof(cmdbuf->state.gfx.render.oq));
2612 cmdbuf->state.gfx.render.tiler = 0;
2613
2614 /* If we're finished with this render pass, make sure we reset the flags
2615 * so any barrier encountered after EndRendering() doesn't try to flush
2616 * draws. */
2617 cmdbuf->state.gfx.render.flags = 0;
2618 cmdbuf->state.gfx.render.suspended = suspending;
2619
2620 /* If we're not suspending, we need to resolve attachments. */
2621 if (!suspending)
2622 panvk_per_arch(cmd_resolve_attachments)(cmdbuf);
2623 }
2624