1 /*
2 * Copyright © 2021 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "v3dv_private.h"
25 #include "broadcom/common/v3d_macros.h"
26 #include "broadcom/common/v3d_util.h"
27 #include "broadcom/cle/v3dx_pack.h"
28 #include "broadcom/compiler/v3d_compiler.h"
29
30 #include "util/half_float.h"
31 #include "vulkan/util/vk_format.h"
32 #include "util/u_pack_color.h"
33
34 void
v3dX(job_emit_binning_flush)(struct v3dv_job *job)
36 {
37 assert(job);
38
39 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
40 v3dv_return_if_oom(NULL, job);
41
42 cl_emit(&job->bcl, FLUSH, flush);
43 }
44
45 void
v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
47 {
48 assert(job->can_use_double_buffer);
49 assert(job->frame_tiling.double_buffer);
50 assert(!job->frame_tiling.msaa);
51 assert(job->bcl_tile_binning_mode_ptr);
52
53 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
54 struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = {
55 cl_packet_header(TILE_BINNING_MODE_CFG),
56 };
57 config.width_in_pixels = tiling->width;
58 config.height_in_pixels = tiling->height;
59 #if V3D_VERSION == 42
60 config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
61 config.multisample_mode_4x = tiling->msaa;
62 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
63 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
64 #endif
65 #if V3D_VERSION >= 71
66 unreachable("HW generation 71 not supported yet.");
67 #endif
68
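   /* Rewrite, in place, the TILE_BINNING_MODE_CFG packet that was emitted in
    * the binning prolog (see job_emit_binning_prolog below), this time with
    * double buffering enabled.
    */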
69 uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
70 cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
71 }
72
73 void
v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
                              const struct v3dv_frame_tiling *tiling,
                              uint32_t layers)
77 {
78 /* This must go before the binning mode configuration. It is
79 * required for layered framebuffers to work.
80 */
81 cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
82 config.number_of_layers = layers;
83 }
84
85 assert(!tiling->double_buffer || !tiling->msaa);
86 job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
87 cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
88 config.width_in_pixels = tiling->width;
89 config.height_in_pixels = tiling->height;
90 #if V3D_VERSION == 42
91 config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
92 config.multisample_mode_4x = tiling->msaa;
93 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
94 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
95 #endif
96 #if V3D_VERSION >= 71
97 config.log2_tile_width = log2_tile_size(tiling->tile_width);
98 config.log2_tile_height = log2_tile_size(tiling->tile_height);
      /* FIXME: ideally we would like this assert to be on the packet header
       * (as it is generic, so it also applies to GL). We would need to
       * expand gen_pack_header for that.
       */
103 assert(config.log2_tile_width == config.log2_tile_height ||
104 config.log2_tile_width == config.log2_tile_height + 1);
105 #endif
106 }
107
108 /* There's definitely nothing in the VCD cache we want. */
109 cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
110
111 /* "Binning mode lists must have a Start Tile Binning item (6) after
112 * any prefix state data before the binning list proper starts."
113 */
114 cl_emit(&job->bcl, START_TILE_BINNING, bin);
115 }
116
117 void
v3dX(cmd_buffer_end_render_pass_secondary)(struct v3dv_cmd_buffer *cmd_buffer)
119 {
120 assert(cmd_buffer->state.job);
121 v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
122 cl_packet_length(RETURN_FROM_SUB_LIST));
123 v3dv_return_if_oom(cmd_buffer, NULL);
124 cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
125 }
126
127 void
v3dX(job_emit_clip_window)(struct v3dv_job *job, const VkRect2D *rect)
129 {
130 assert(job);
131
132 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW));
133 v3dv_return_if_oom(NULL, job);
134
135 cl_emit(&job->bcl, CLIP_WINDOW, clip) {
136 clip.clip_window_left_pixel_coordinate = rect->offset.x;
137 clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
138 clip.clip_window_width_in_pixels = rect->extent.width;
139 clip.clip_window_height_in_pixels = rect->extent.height;
140 }
141 }
142
143 static void
cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
                                 struct v3dv_cl *cl,
                                 struct v3dv_image_view *iview,
                                 uint32_t layer,
                                 uint32_t buffer)
149 {
150 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
151
   /* We don't support rendering to ycbcr images, so the image view should be
    * single-plane and use a single-plane format. But note that the underlying
    * image can be a ycbcr format, as we support rendering to a specific plane
    * of an image. This is used, for example, on some meta_copy code paths, in
    * order to copy from/to a plane of a ycbcr image.
    */
158 assert(iview->plane_count == 1);
159 assert(iview->format->plane_count == 1);
160
161 uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
162 const struct v3d_resource_slice *slice =
163 &image->planes[image_plane].slices[iview->vk.base_mip_level];
164
165 uint32_t layer_offset =
166 v3dv_layer_offset(image, iview->vk.base_mip_level,
167 iview->vk.base_array_layer + layer, image_plane);
168
169 cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
170 load.buffer_to_load = buffer;
171 load.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
172
173 load.input_image_format = iview->format->planes[0].rt_type;
174
      /* If we create an image view with only the stencil format, we
       * re-interpret the format as RGBA8_UINT, as it is what we want in
       * general (see CreateImageView).
       *
       * However, when we are loading/storing tiles from the ZSTENCIL tile
       * buffer, we need to use the underlying DS format.
       */
182 if (buffer == ZSTENCIL &&
183 iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
184 assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
185 load.input_image_format = image->format->planes[image_plane].rt_type;
186 }
187
188 load.r_b_swap = iview->planes[0].swap_rb;
189 load.channel_reverse = iview->planes[0].channel_reverse;
190 load.memory_format = slice->tiling;
191
192 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
193 slice->tiling == V3D_TILING_UIF_XOR) {
194 load.height_in_ub_or_stride =
195 slice->padded_height_of_output_image_in_uif_blocks;
196 } else if (slice->tiling == V3D_TILING_RASTER) {
197 load.height_in_ub_or_stride = slice->stride;
198 }
199
200 if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
201 load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
202 else
203 load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
204 }
205 }
206
207 static inline uint32_t
v3dv_zs_buffer(bool depth, bool stencil)
209 {
210 if (depth && stencil)
211 return ZSTENCIL;
212 else if (depth)
213 return Z;
214 else if (stencil)
215 return STENCIL;
216 return NONE;
217 }
218
219 static void
cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_cl *cl,
                                  uint32_t layer)
223 {
224 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
225 const struct v3dv_render_pass *pass = state->pass;
226 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
227
228 assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);
229
230 for (uint32_t i = 0; i < subpass->color_count; i++) {
231 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
232
233 if (attachment_idx == VK_ATTACHMENT_UNUSED)
234 continue;
235
236 const struct v3dv_render_pass_attachment *attachment =
237 &state->pass->attachments[attachment_idx];
238
239 /* According to the Vulkan spec:
240 *
241 * "The load operation for each sample in an attachment happens before
242 * any recorded command which accesses the sample in the first subpass
243 * where the attachment is used."
244 *
245 * If the load operation is CLEAR, we must only clear once on the first
246 * subpass that uses the attachment (and in that case we don't LOAD).
247 * After that, we always want to load so we don't lose any rendering done
248 * by a previous subpass to the same attachment. We also want to load
249 * if the current job is continuing subpass work started by a previous
250 * job, for the same reason.
251 *
252 * If the render area is not aligned to tile boundaries then we have
253 * tiles which are partially covered by it. In this case, we need to
254 * load the tiles so we can preserve the pixels that are outside the
255 * render area for any such tiles.
256 */
257 uint32_t first_subpass = !pass->multiview_enabled ?
258 attachment->first_subpass :
259 attachment->views[layer].first_subpass;
260
261 uint32_t last_subpass = !pass->multiview_enabled ?
262 attachment->last_subpass :
263 attachment->views[layer].last_subpass;
264
265 bool needs_load =
266 v3dv_cmd_buffer_check_needs_load(state,
267 VK_IMAGE_ASPECT_COLOR_BIT,
268 first_subpass,
269 attachment->desc.loadOp,
270 last_subpass,
271 attachment->desc.storeOp);
272 if (needs_load) {
273 struct v3dv_image_view *iview =
274 state->attachments[attachment_idx].image_view;
275 cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview,
276 layer, RENDER_TARGET_0 + i);
277 }
278 }
279
280 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
281 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
282 const struct v3dv_render_pass_attachment *ds_attachment =
283 &state->pass->attachments[ds_attachment_idx];
284
285 const VkImageAspectFlags ds_aspects =
286 vk_format_aspects(ds_attachment->desc.format);
287
288 uint32_t ds_first_subpass = !pass->multiview_enabled ?
289 ds_attachment->first_subpass :
290 ds_attachment->views[layer].first_subpass;
291
292 uint32_t ds_last_subpass = !pass->multiview_enabled ?
293 ds_attachment->last_subpass :
294 ds_attachment->views[layer].last_subpass;
295
296 const bool needs_depth_load =
297 v3dv_cmd_buffer_check_needs_load(state,
298 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
299 ds_first_subpass,
300 ds_attachment->desc.loadOp,
301 ds_last_subpass,
302 ds_attachment->desc.storeOp);
303
304 const bool needs_stencil_load =
305 v3dv_cmd_buffer_check_needs_load(state,
306 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
307 ds_first_subpass,
308 ds_attachment->desc.stencilLoadOp,
309 ds_last_subpass,
310 ds_attachment->desc.stencilStoreOp);
311
312 if (needs_depth_load || needs_stencil_load) {
313 struct v3dv_image_view *iview =
314 state->attachments[ds_attachment_idx].image_view;
         /* From the Vulkan spec:
          *
          *   "When an image view of a depth/stencil image is used as a
          *    depth/stencil framebuffer attachment, the aspectMask is ignored
          *    and both depth and stencil image subresources are used."
          *
          * So we ignore the aspects from the subresource range of the image
          * view for the depth/stencil attachment, but we still need to
          * restrict to the aspects compatible with the render pass and the
          * image.
          */
325 const uint32_t zs_buffer =
326 v3dv_zs_buffer(needs_depth_load, needs_stencil_load);
327 cmd_buffer_render_pass_emit_load(cmd_buffer, cl,
328 iview, layer, zs_buffer);
329 }
330 }
331
332 cl_emit(cl, END_OF_LOADS, end);
333 }
334
335 static void
cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_cl *cl,
                                  uint32_t attachment_idx,
                                  uint32_t layer,
                                  uint32_t buffer,
                                  bool clear,
                                  bool is_multisample_resolve)
343 {
344 const struct v3dv_image_view *iview =
345 cmd_buffer->state.attachments[attachment_idx].image_view;
346 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
347
   /* We don't support rendering to ycbcr images, so the image view should be
    * single-plane and use a single-plane format. But note that the underlying
    * image can be a ycbcr format, as we support rendering to a specific plane
    * of an image. This is used, for example, on some meta_copy code paths, in
    * order to copy from/to a plane of a ycbcr image.
    */
354 assert(iview->plane_count == 1);
355 assert(iview->format->plane_count == 1);
356
357 uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
358 const struct v3d_resource_slice *slice =
359 &image->planes[image_plane].slices[iview->vk.base_mip_level];
360 uint32_t layer_offset = v3dv_layer_offset(image,
361 iview->vk.base_mip_level,
362 iview->vk.base_array_layer + layer,
363 image_plane);
364
365 /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
366 * is broken in earlier V3D versions.
367 */
368 assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
369
370 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
371 store.buffer_to_store = buffer;
372 store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
373 store.clear_buffer_being_stored = clear;
374
375 store.output_image_format = iview->format->planes[0].rt_type;
376
      /* If we create an image view with only the stencil format, we
       * re-interpret the format as RGBA8_UINT, as it is what we want in
       * general (see CreateImageView).
       *
       * However, when we are loading/storing tiles from the ZSTENCIL tile
       * buffer, we need to use the underlying DS format.
       */
384 if (buffer == ZSTENCIL &&
385 iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
386 assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
387 store.output_image_format = image->format->planes[image_plane].rt_type;
388 }
389
390 store.r_b_swap = iview->planes[0].swap_rb;
391 store.channel_reverse = iview->planes[0].channel_reverse;
392 store.memory_format = slice->tiling;
393
394 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
395 slice->tiling == V3D_TILING_UIF_XOR) {
396 store.height_in_ub_or_stride =
397 slice->padded_height_of_output_image_in_uif_blocks;
398 } else if (slice->tiling == V3D_TILING_RASTER) {
399 store.height_in_ub_or_stride = slice->stride;
400 }
401
402 if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
403 store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
404 else if (is_multisample_resolve)
405 store.decimate_mode = V3D_DECIMATE_MODE_4X;
406 else
407 store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
408 }
409 }
410
411 static bool
check_needs_clear(const struct v3dv_cmd_buffer_state *state,
                  VkImageAspectFlags aspect,
                  uint32_t first_subpass_idx,
                  VkAttachmentLoadOp load_op,
                  bool do_clear_with_draw)
417 {
418 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
419 * testing does not exist in the image.
420 */
421 if (!aspect)
422 return false;
423
424 /* If the aspect needs to be cleared with a draw call then we won't emit
425 * the clear here.
426 */
427 if (do_clear_with_draw)
428 return false;
429
430 /* If this is resuming a subpass started with another job, then attachment
431 * load operations don't apply.
432 */
433 if (state->job->is_subpass_continue)
434 return false;
435
436 /* If the render area is not aligned to tile boundaries we can't use the
437 * TLB for a clear.
438 */
439 if (!state->tile_aligned_render_area)
440 return false;
441
442 /* If this job is running in a subpass other than the first subpass in
443 * which this attachment (or view) is used then attachment load operations
444 * don't apply.
445 */
446 if (state->job->first_subpass != first_subpass_idx)
447 return false;
448
449 /* The attachment load operation must be CLEAR */
450 return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR;
451 }
452
453 static void
cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_cl *cl,
                                   uint32_t layer)
457 {
458 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
459 struct v3dv_render_pass *pass = state->pass;
460 const struct v3dv_subpass *subpass =
461 &pass->subpasses[state->subpass_idx];
462
463 bool has_stores = false;
464 bool use_global_zs_clear = false;
465 bool use_global_rt_clear = false;
466
467 assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);
468
469 /* FIXME: separate stencil */
470 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
471 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
472 const struct v3dv_render_pass_attachment *ds_attachment =
473 &state->pass->attachments[ds_attachment_idx];
474
475 assert(state->job->first_subpass >= ds_attachment->first_subpass);
476 assert(state->subpass_idx >= ds_attachment->first_subpass);
477 assert(state->subpass_idx <= ds_attachment->last_subpass);
478
      /* From the Vulkan spec, VkImageSubresourceRange:
       *
       *   "When an image view of a depth/stencil image is used as a
       *    depth/stencil framebuffer attachment, the aspectMask is ignored
       *    and both depth and stencil image subresources are used."
       *
       * So we ignore the aspects from the subresource range of the image
       * view for the depth/stencil attachment, but we still need to restrict
       * to the aspects compatible with the render pass and the image.
       */
489 const VkImageAspectFlags aspects =
490 vk_format_aspects(ds_attachment->desc.format);
491
492 #if V3D_VERSION <= 42
493 /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
494 * for depth/stencil.
495 *
496 * There used to be some confusion regarding the Clear Tile Buffers
497 * Z/S bit also being broken, but we confirmed with Broadcom that this
498 * is not the case, it was just that some other hardware bugs (that we
499 * need to work around, such as GFXH-1461) could cause this bit to behave
500 * incorrectly.
501 *
502 * There used to be another issue where the RTs bit in the Clear Tile
503 * Buffers packet also cleared Z/S, but Broadcom confirmed this is
504 * fixed since V3D 4.1.
505 *
       * So if we have to emit a clear of depth or stencil we don't use
       * the per-buffer store clear bit, even if we need to store the buffers;
       * instead, we always have to use the Clear Tile Buffers Z/S bit.
509 * If we have configured the job to do early Z/S clearing, then we
510 * don't want to emit any Clear Tile Buffers command at all here.
511 *
512 * Note that GFXH-1689 is not reproduced in the simulator, where
513 * using the clear buffer bit in depth/stencil stores works fine.
514 */
515
516 /* Only clear once on the first subpass that uses the attachment */
517 uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
518 ds_attachment->first_subpass :
519 ds_attachment->views[layer].first_subpass;
520
521 bool needs_depth_clear =
522 check_needs_clear(state,
523 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
524 ds_first_subpass,
525 ds_attachment->desc.loadOp,
526 subpass->do_depth_clear_with_draw);
527
528 bool needs_stencil_clear =
529 check_needs_clear(state,
530 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
531 ds_first_subpass,
532 ds_attachment->desc.stencilLoadOp,
533 subpass->do_stencil_clear_with_draw);
534
535 use_global_zs_clear = !state->job->early_zs_clear &&
536 (needs_depth_clear || needs_stencil_clear);
537 #endif
538 #if V3D_VERSION >= 71
      /* The store command's clear buffer bit cannot be used for Z/S. Since
       * V3D 4.5.6, Z/S buffers are automatically cleared between tiles
       * anyway, so we don't want to emit redundant clears here.
       */
543 use_global_zs_clear = false;
544 #endif
545
546 /* Skip the last store if it is not required */
547 uint32_t ds_last_subpass = !pass->multiview_enabled ?
548 ds_attachment->last_subpass :
549 ds_attachment->views[layer].last_subpass;
550
551 bool needs_depth_store =
552 v3dv_cmd_buffer_check_needs_store(state,
553 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
554 ds_last_subpass,
555 ds_attachment->desc.storeOp);
556
557 bool needs_stencil_store =
558 v3dv_cmd_buffer_check_needs_store(state,
559 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
560 ds_last_subpass,
561 ds_attachment->desc.stencilStoreOp);
562
563 /* If we have a resolve, handle it before storing the tile */
564 const struct v3dv_cmd_buffer_attachment_state *ds_att_state =
565 &state->attachments[ds_attachment_idx];
566 if (ds_att_state->use_tlb_resolve) {
567 assert(ds_att_state->has_resolve);
568 assert(subpass->resolve_depth || subpass->resolve_stencil);
569 const uint32_t resolve_attachment_idx =
570 subpass->ds_resolve_attachment.attachment;
571 assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED);
572
573 const uint32_t zs_buffer =
574 v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil);
575 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
576 resolve_attachment_idx, layer,
577 zs_buffer,
578 false, false);
579 has_stores = true;
580 } else if (ds_att_state->has_resolve) {
581 /* If we can't use the TLB to implement the resolve we will need to
582 * store the attachment so we can implement it later using a blit.
583 */
584 needs_depth_store = subpass->resolve_depth;
585 needs_stencil_store = subpass->resolve_stencil;
586 }
587
588 if (needs_depth_store || needs_stencil_store) {
589 const uint32_t zs_buffer =
590 v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
591 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
592 ds_attachment_idx, layer,
593 zs_buffer, false, false);
594 has_stores = true;
595 }
596 }
597
598 for (uint32_t i = 0; i < subpass->color_count; i++) {
599 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
600
601 if (attachment_idx == VK_ATTACHMENT_UNUSED)
602 continue;
603
604 const struct v3dv_render_pass_attachment *attachment =
605 &state->pass->attachments[attachment_idx];
606
607 assert(state->job->first_subpass >= attachment->first_subpass);
608 assert(state->subpass_idx >= attachment->first_subpass);
609 assert(state->subpass_idx <= attachment->last_subpass);
610
611 /* Only clear once on the first subpass that uses the attachment */
612 uint32_t first_subpass = !pass->multiview_enabled ?
613 attachment->first_subpass :
614 attachment->views[layer].first_subpass;
615
616 bool needs_clear =
617 check_needs_clear(state,
618 VK_IMAGE_ASPECT_COLOR_BIT,
619 first_subpass,
620 attachment->desc.loadOp,
621 false);
622
623 /* Skip the last store if it is not required */
624 uint32_t last_subpass = !pass->multiview_enabled ?
625 attachment->last_subpass :
626 attachment->views[layer].last_subpass;
627
628 bool needs_store =
629 v3dv_cmd_buffer_check_needs_store(state,
630 VK_IMAGE_ASPECT_COLOR_BIT,
631 last_subpass,
632 attachment->desc.storeOp);
633
634 /* If we need to resolve this attachment emit that store first. Notice
635 * that we must not request a tile buffer clear here in that case, since
636 * that would clear the tile buffer before we get to emit the actual
637 * color attachment store below, since the clear happens after the
638 * store is completed.
639 *
640 * If the attachment doesn't support TLB resolves (or the render area
       * is not aligned to tile boundaries) then we will have to fall back to
642 * doing the resolve in a shader separately after this job, so we will
643 * need to store the multisampled attachment even if that wasn't
644 * requested by the client.
645 */
646 const struct v3dv_cmd_buffer_attachment_state *att_state =
647 &state->attachments[attachment_idx];
648 if (att_state->use_tlb_resolve) {
649 assert(att_state->has_resolve);
650 const uint32_t resolve_attachment_idx =
651 subpass->resolve_attachments[i].attachment;
652 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
653 resolve_attachment_idx, layer,
654 RENDER_TARGET_0 + i,
655 false, true);
656 has_stores = true;
657 } else if (att_state->has_resolve) {
658 needs_store = true;
659 }
660
661 /* Emit the color attachment store if needed */
662 if (needs_store) {
663 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
664 attachment_idx, layer,
665 RENDER_TARGET_0 + i,
666 needs_clear && !use_global_rt_clear,
667 false);
668 has_stores = true;
669 } else if (needs_clear) {
670 use_global_rt_clear = true;
671 }
672 }
673
674 /* We always need to emit at least one dummy store */
675 if (!has_stores) {
676 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
677 store.buffer_to_store = NONE;
678 }
679 }
680
681 /* If we have any depth/stencil clears we can't use the per-buffer clear
682 * bit and instead we have to emit a single clear of all tile buffers.
683 */
684 if (use_global_zs_clear || use_global_rt_clear) {
685 #if V3D_VERSION == 42
686 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
687 clear.clear_z_stencil_buffer = use_global_zs_clear;
688 clear.clear_all_render_targets = use_global_rt_clear;
689 }
690 #endif
691 #if V3D_VERSION >= 71
692 cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
693 #endif
694 }
695 }
696
697 static void
cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer,
                                         uint32_t layer)
700 {
701 struct v3dv_job *job = cmd_buffer->state.job;
702 assert(job);
703
704 /* Emit the generic list in our indirect state -- the rcl will just
705 * have pointers into it.
706 */
707 struct v3dv_cl *cl = &job->indirect;
708 v3dv_cl_ensure_space(cl, 200, 1);
709 v3dv_return_if_oom(cmd_buffer, NULL);
710
711 struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
712
713 cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
714
715 cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer);
716
717 /* The binner starts out writing tiles assuming that the initial mode
718 * is triangles, so make sure that's the case.
719 */
720 cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
721 fmt.primitive_type = LIST_TRIANGLES;
722 }
723
   /* The PTB assumes this value to be 0, but the HW will not set it. */
725 cl_emit(cl, SET_INSTANCEID, set) {
726 set.instance_id = 0;
727 }
728
729 cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
730
731 cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer);
732
733 cl_emit(cl, END_OF_TILE_MARKER, end);
734
735 cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
736
737 cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
738 branch.start = tile_list_start;
739 branch.end = v3dv_cl_get_address(cl);
740 }
741 }
742
743 static void
cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
                                      uint32_t layer)
746 {
747 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
748
749 struct v3dv_job *job = cmd_buffer->state.job;
750 struct v3dv_cl *rcl = &job->rcl;
751
752 /* If doing multicore binning, we would need to initialize each
753 * core's tile list here.
754 */
755 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
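   /* This assumes 64 bytes of tile allocation memory per tile and layer,
    * which matches the 64-byte initial tile list block size configured with
    * TILE_LIST_INITIAL_BLOCK_SIZE in the RCL below.
    */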
756 const uint32_t tile_alloc_offset =
757 64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
758 cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
759 list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
760 }
761
762 cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);
763
764 uint32_t supertile_w_in_pixels =
765 tiling->tile_width * tiling->supertile_width;
766 uint32_t supertile_h_in_pixels =
767 tiling->tile_height * tiling->supertile_height;
768 const uint32_t min_x_supertile =
769 state->render_area.offset.x / supertile_w_in_pixels;
770 const uint32_t min_y_supertile =
771 state->render_area.offset.y / supertile_h_in_pixels;
772
773 uint32_t max_render_x = state->render_area.offset.x;
774 if (state->render_area.extent.width > 0)
775 max_render_x += state->render_area.extent.width - 1;
776 uint32_t max_render_y = state->render_area.offset.y;
777 if (state->render_area.extent.height > 0)
778 max_render_y += state->render_area.extent.height - 1;
779 const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
780 const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
781
782 for (int y = min_y_supertile; y <= max_y_supertile; y++) {
783 for (int x = min_x_supertile; x <= max_x_supertile; x++) {
784 cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
785 coords.column_number_in_supertiles = x;
786 coords.row_number_in_supertiles = y;
787 }
788 }
789 }
790 }
791
792 static void
set_rcl_early_z_config(struct v3dv_job *job,
                       bool *early_z_disable,
                       uint32_t *early_z_test_and_update_direction)
796 {
797 /* Disable if none of the draw calls in this job enabled EZ */
798 if (!job->has_ez_draws) {
799 *early_z_disable = true;
800 return;
801 }
802
803 switch (job->first_ez_state) {
804 case V3D_EZ_UNDECIDED:
805 case V3D_EZ_LT_LE:
806 *early_z_disable = false;
807 *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
808 break;
809 case V3D_EZ_GT_GE:
810 *early_z_disable = false;
811 *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
812 break;
813 case V3D_EZ_DISABLED:
814 *early_z_disable = true;
815 break;
816 }
817 }
818
/* Note that for v71, the render target cfg packets have just one field that
 * combines the internal type and the clamp mode. For simplicity we keep just
 * one helper.
 *
 * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
 *
 * FIXME: for v71 we are not returning all the possible combinations of
 * render target internal type and clamp. For example, for int types we are
 * always using clamp int, and for 16f we are using clamp none or pos (that
 * seems to be the equivalent of no-clamp on 4.2), but not pq or hlg. In
 * summary, right now we are just porting what we were doing on 4.2.
 */
831 uint32_t
v3dX(clamp_for_format_and_type)(uint32_t rt_type,
                                VkFormat vk_format)
834 {
835 #if V3D_VERSION == 42
836 if (vk_format_is_int(vk_format))
837 return V3D_RENDER_TARGET_CLAMP_INT;
838 else if (vk_format_is_srgb(vk_format))
839 return V3D_RENDER_TARGET_CLAMP_NORM;
840 else
841 return V3D_RENDER_TARGET_CLAMP_NONE;
842 #endif
843 #if V3D_VERSION >= 71
844 switch (rt_type) {
845 case V3D_INTERNAL_TYPE_8I:
846 return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
847 case V3D_INTERNAL_TYPE_8UI:
848 return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
849 case V3D_INTERNAL_TYPE_8:
850 return V3D_RENDER_TARGET_TYPE_CLAMP_8;
851 case V3D_INTERNAL_TYPE_16I:
852 return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
853 case V3D_INTERNAL_TYPE_16UI:
854 return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
855 case V3D_INTERNAL_TYPE_16F:
856 return vk_format_is_srgb(vk_format) ?
857 V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
858 V3D_RENDER_TARGET_TYPE_CLAMP_16F;
859 case V3D_INTERNAL_TYPE_32I:
860 return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
861 case V3D_INTERNAL_TYPE_32UI:
862 return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
863 case V3D_INTERNAL_TYPE_32F:
864 return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
865 default:
866 unreachable("Unknown internal render target type");
867 }
868
869 return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
870 #endif
871 }
872
873 static void
cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
                                           int rt,
                                           uint32_t *rt_bpp,
#if V3D_VERSION == 42
                                           uint32_t *rt_type,
                                           uint32_t *rt_clamp)
#else
                                           uint32_t *rt_type_clamp)
#endif
883 {
884 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
885
886 assert(state->subpass_idx < state->pass->subpass_count);
887 const struct v3dv_subpass *subpass =
888 &state->pass->subpasses[state->subpass_idx];
889
890 if (rt >= subpass->color_count)
891 return;
892
893 struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
894 const uint32_t attachment_idx = attachment->attachment;
895 if (attachment_idx == VK_ATTACHMENT_UNUSED)
896 return;
897
898 assert(attachment_idx < state->framebuffer->attachment_count &&
899 attachment_idx < state->attachment_alloc_count);
900 struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
901 assert(vk_format_is_color(iview->vk.format));
902
903 assert(iview->plane_count == 1);
904 *rt_bpp = iview->planes[0].internal_bpp;
905 #if V3D_VERSION == 42
906 *rt_type = iview->planes[0].internal_type;
907 *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
908 iview->vk.format);
909 #endif
910 #if V3D_VERSION >= 71
911 *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
912 iview->vk.format);
913 #endif
914 }
915
916 void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
918 {
919 struct v3dv_job *job = cmd_buffer->state.job;
920 assert(job);
921
922 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
923 const struct v3dv_framebuffer *framebuffer = state->framebuffer;
924
925 /* We can't emit the RCL until we have a framebuffer, which we may not have
926 * if we are recording a secondary command buffer. In that case, we will
927 * have to wait until vkCmdExecuteCommands is called from a primary command
928 * buffer.
929 */
930 if (!framebuffer) {
931 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
932 return;
933 }
934
935 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
936
937 const uint32_t fb_layers = job->frame_tiling.layers;
938
939 v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
940 MAX2(fb_layers, 1) * 256 *
941 cl_packet_length(SUPERTILE_COORDINATES));
942 v3dv_return_if_oom(cmd_buffer, NULL);
943
944 assert(state->subpass_idx < state->pass->subpass_count);
945 const struct v3dv_render_pass *pass = state->pass;
946 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
947 struct v3dv_cl *rcl = &job->rcl;
948
949 /* Common config must be the first TILE_RENDERING_MODE_CFG and
950 * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
951 * updates to the previous HW state.
952 */
953 bool do_early_zs_clear = false;
954 const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
955 assert(!tiling->msaa || !tiling->double_buffer);
956 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
957 config.image_width_pixels = framebuffer->width;
958 config.image_height_pixels = framebuffer->height;
959 config.number_of_render_targets = MAX2(subpass->color_count, 1);
960 config.multisample_mode_4x = tiling->msaa;
961 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
962 #if V3D_VERSION == 42
963 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
964 #endif
965 #if V3D_VERSION >= 71
966 config.log2_tile_width = log2_tile_size(tiling->tile_width);
967 config.log2_tile_height = log2_tile_size(tiling->tile_height);
      /* FIXME: ideally we would like this assert to be on the packet header
       * (as it is generic, so it also applies to GL). We would need to
       * expand gen_pack_header for that.
       */
972 assert(config.log2_tile_width == config.log2_tile_height ||
973 config.log2_tile_width == config.log2_tile_height + 1);
974 #endif
975
976 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
977 const struct v3dv_image_view *iview =
978 state->attachments[ds_attachment_idx].image_view;
979
         /* At this point the image view should be single-plane. But note that
          * the underlying image can be multi-plane, and the image view refers
          * to one specific plane.
          */
984 assert(iview->plane_count == 1);
985 assert(iview->format->plane_count == 1);
986 config.internal_depth_type = iview->planes[0].internal_type;
987
988 set_rcl_early_z_config(job,
989 &config.early_z_disable,
990 &config.early_z_test_and_update_direction);
991
992 /* Early-Z/S clear can be enabled if the job is clearing and not
993 * storing (or loading) depth. If a stencil aspect is also present
994 * we have the same requirements for it, however, in this case we
995 * can accept stencil loadOp DONT_CARE as well, so instead of
          * checking that stencil is cleared we check that it is not loaded.
997 *
998 * Early-Z/S clearing is independent of Early Z/S testing, so it is
999 * possible to enable one but not the other so long as their
1000 * respective requirements are met.
1001 *
1002 * From V3D 4.5.6, Z/S buffers are always cleared automatically
1003 * between tiles, but we still want to enable early ZS clears
1004 * when Z/S are not loaded or stored.
1005 */
1006 struct v3dv_render_pass_attachment *ds_attachment =
1007 &pass->attachments[ds_attachment_idx];
1008
1009 const VkImageAspectFlags ds_aspects =
1010 vk_format_aspects(ds_attachment->desc.format);
1011
1012 bool needs_depth_store =
1013 v3dv_cmd_buffer_check_needs_store(state,
1014 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1015 ds_attachment->last_subpass,
1016 ds_attachment->desc.storeOp) ||
1017 subpass->resolve_depth;
1018 #if V3D_VERSION <= 42
1019 bool needs_depth_clear =
1020 check_needs_clear(state,
1021 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1022 ds_attachment->first_subpass,
1023 ds_attachment->desc.loadOp,
1024 subpass->do_depth_clear_with_draw);
1025
1026 do_early_zs_clear = needs_depth_clear && !needs_depth_store;
1027 #endif
1028 #if V3D_VERSION >= 71
1029 bool needs_depth_load =
1030 v3dv_cmd_buffer_check_needs_load(state,
1031 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1032 ds_attachment->first_subpass,
1033 ds_attachment->desc.loadOp,
1034 ds_attachment->last_subpass,
1035 ds_attachment->desc.storeOp);
1036 do_early_zs_clear = !needs_depth_load && !needs_depth_store;
1037 #endif
1038
1039 if (do_early_zs_clear &&
1040 vk_format_has_stencil(ds_attachment->desc.format)) {
1041 bool needs_stencil_load =
1042 v3dv_cmd_buffer_check_needs_load(state,
1043 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1044 ds_attachment->first_subpass,
1045 ds_attachment->desc.stencilLoadOp,
1046 ds_attachment->last_subpass,
1047 ds_attachment->desc.stencilStoreOp);
1048
1049 bool needs_stencil_store =
1050 v3dv_cmd_buffer_check_needs_store(state,
1051 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1052 ds_attachment->last_subpass,
1053 ds_attachment->desc.stencilStoreOp) ||
1054 subpass->resolve_stencil;
1055
1056 do_early_zs_clear = !needs_stencil_load && !needs_stencil_store;
1057 }
1058
1059 config.early_depth_stencil_clear = do_early_zs_clear;
1060 } else {
1061 config.early_z_disable = true;
1062 }
1063 }
1064
1065 /* If we enabled early Z/S clear, then we can't emit any "Clear Tile Buffers"
1066 * commands with the Z/S bit set, so keep track of whether we enabled this
1067 * in the job so we can skip these later.
1068 */
1069 job->early_zs_clear = do_early_zs_clear;
1070
1071 #if V3D_VERSION >= 71
1072 uint32_t base_addr = 0;
1073 #endif
1074 for (uint32_t i = 0; i < subpass->color_count; i++) {
1075 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1076 if (attachment_idx == VK_ATTACHMENT_UNUSED) {
1077 #if V3D_VERSION >= 71
1078 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1079 rt.render_target_number = i;
1080 rt.stride = 1; /* Unused */
1081 }
1082 #endif
1083 continue;
1084 }
1085
1086 struct v3dv_image_view *iview =
1087 state->attachments[attachment_idx].image_view;
1088 assert(iview->plane_count == 1);
1089
1090 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
1091
1092 uint8_t plane = v3dv_plane_from_aspect(iview->vk.aspects);
1093 const struct v3d_resource_slice *slice =
1094 &image->planes[plane].slices[iview->vk.base_mip_level];
1095
1096 UNUSED const uint32_t *clear_color =
1097 &state->attachments[attachment_idx].clear_value.color[0];
1098
1099 UNUSED uint32_t clear_pad = 0;
1100 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
1101 slice->tiling == V3D_TILING_UIF_XOR) {
1102 int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
1103
1104 uint32_t implicit_padded_height =
1105 align(framebuffer->height, uif_block_height) / uif_block_height;
1106
1107 if (slice->padded_height_of_output_image_in_uif_blocks -
1108 implicit_padded_height >= 15) {
1109 clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
1110 }
1111 }
1112
1113 #if V3D_VERSION == 42
1114 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
1115 clear.clear_color_low_32_bits = clear_color[0];
1116 clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
1117 clear.render_target_number = i;
1118 };
1119
1120 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
1121 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
1122 clear.clear_color_mid_low_32_bits =
1123 ((clear_color[1] >> 24) | (clear_color[2] << 8));
1124 clear.clear_color_mid_high_24_bits =
1125 ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
1126 clear.render_target_number = i;
1127 };
1128 }
1129
1130 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
1131 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
1132 clear.uif_padded_height_in_uif_blocks = clear_pad;
1133 clear.clear_color_high_16_bits = clear_color[3] >> 16;
1134 clear.render_target_number = i;
1135 };
1136 }
1137 #endif
1138
1139 #if V3D_VERSION >= 71
1140 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1141 rt.clear_color_low_bits = clear_color[0];
1142 cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
1143 &rt.internal_type_and_clamping);
1144 rt.stride =
1145 v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
1146 v3d_internal_bpp_words(rt.internal_bpp));
1147 rt.base_address = base_addr;
1148 rt.render_target_number = i;
1149
1150 /* base_addr in multiples of 512 bits. We divide by 8 because stride
1151 * is in 128-bit units, but it is packing 2 rows worth of data, so we
1152 * need to divide it by 2 so it is only 1 row, and then again by 4 so
1153 * it is in 512-bit units.
1154 */
1155 base_addr += (tiling->tile_height * rt.stride) / 8;
1156 }
1157
1158 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
1159 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
1160 rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
1161 ((uint64_t) clear_color[1]) |
1162 (((uint64_t) (clear_color[2] & 0xff)) << 32);
1163 rt.render_target_number = i;
1164 }
1165 }
1166
1167 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
1168 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
1169 rt.clear_color_top_bits = /* 56 bits (24 + 32) */
1170 (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
1171 (((uint64_t) (clear_color[3])) << 24);
1172 rt.render_target_number = i;
1173 }
1174 }
1175 #endif
1176 }
1177
1178 #if V3D_VERSION >= 71
   /* If we don't have any color RTs, we still need to emit one and flag it
    * as unused by setting stride = 1.
    */
1182 if (subpass->color_count == 0) {
1183 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1184 rt.stride = 1;
1185 }
1186 }
1187 #endif
1188
1189 #if V3D_VERSION == 42
1190 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
1191 cmd_buffer_render_pass_setup_render_target
1192 (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
1193 &rt.render_target_0_internal_type, &rt.render_target_0_clamp);
1194 cmd_buffer_render_pass_setup_render_target
1195 (cmd_buffer, 1, &rt.render_target_1_internal_bpp,
1196 &rt.render_target_1_internal_type, &rt.render_target_1_clamp);
1197 cmd_buffer_render_pass_setup_render_target
1198 (cmd_buffer, 2, &rt.render_target_2_internal_bpp,
1199 &rt.render_target_2_internal_type, &rt.render_target_2_clamp);
1200 cmd_buffer_render_pass_setup_render_target
1201 (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
1202 &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
1203 }
1204 #endif
1205
1206 /* Ends rendering mode config. */
1207 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
1208 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
1209 clear.z_clear_value =
1210 state->attachments[ds_attachment_idx].clear_value.z;
1211 clear.stencil_clear_value =
1212 state->attachments[ds_attachment_idx].clear_value.s;
1213 };
1214 } else {
1215 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
1216 clear.z_clear_value = 1.0f;
1217 clear.stencil_clear_value = 0;
1218 };
1219 }
1220
1221 /* Always set initial block size before the first branch, which needs
1222 * to match the value from binning mode config.
1223 */
1224 cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
1225 init.use_auto_chained_tile_lists = true;
1226 init.size_of_first_block_in_chained_tile_lists =
1227 TILE_ALLOCATION_BLOCK_SIZE_64B;
1228 }
1229
1230 cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
1231 config.number_of_bin_tile_lists = 1;
1232 config.total_frame_width_in_tiles = tiling->draw_tiles_x;
1233 config.total_frame_height_in_tiles = tiling->draw_tiles_y;
1234
1235 config.supertile_width_in_tiles = tiling->supertile_width;
1236 config.supertile_height_in_tiles = tiling->supertile_height;
1237
1238 config.total_frame_width_in_supertiles =
1239 tiling->frame_width_in_supertiles;
1240 config.total_frame_height_in_supertiles =
1241 tiling->frame_height_in_supertiles;
1242 }
1243
1244 /* Emit an initial clear of the tile buffers. This is necessary
1245 * for any buffers that should be cleared (since clearing
1246 * normally happens at the *end* of the generic tile list), but
1247 * it's also nice to clear everything so the first tile doesn't
1248 * inherit any contents from some previous frame.
1249 *
1250 * Also, implement the GFXH-1742 workaround. There's a race in
1251 * the HW between the RCL updating the TLB's internal type/size
1252 * and the spawning of the QPU instances using the TLB's current
1253 * internal type/size. To make sure the QPUs get the right
1254 * state, we need 1 dummy store in between internal type/size
1255 * changes on V3D 3.x, and 2 dummy stores on 4.x.
1256 */
1257 for (int i = 0; i < 2; i++) {
1258 cl_emit(rcl, TILE_COORDINATES, coords);
1259 cl_emit(rcl, END_OF_LOADS, end);
1260 cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
1261 store.buffer_to_store = NONE;
1262 }
1263 if (cmd_buffer->state.tile_aligned_render_area &&
1264 (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
1265 #if V3D_VERSION == 42
1266 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
1267 clear.clear_z_stencil_buffer = !job->early_zs_clear;
1268 clear.clear_all_render_targets = true;
1269 }
1270 #endif
1271 #if V3D_VERSION >= 71
1272 cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
1273 #endif
1274 }
1275 cl_emit(rcl, END_OF_TILE_MARKER, end);
1276 }
1277
1278 cl_emit(rcl, FLUSH_VCD_CACHE, flush);
1279
1280 for (int layer = 0; layer < MAX2(1, fb_layers); layer++) {
1281 if (subpass->view_mask == 0 || (subpass->view_mask & (1u << layer)))
1282 cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer);
1283 }
1284
1285 cl_emit(rcl, END_OF_RENDERING, end);
1286 }
1287
1288 void
v3dX(viewport_compute_xform)(const VkViewport *viewport,
                             float scale[3],
                             float translate[3])
1292 {
1293 float x = viewport->x;
1294 float y = viewport->y;
1295 float half_width = 0.5f * viewport->width;
1296 float half_height = 0.5f * viewport->height;
1297 double n = viewport->minDepth;
1298 double f = viewport->maxDepth;
1299
1300 scale[0] = half_width;
1301 translate[0] = half_width + x;
1302 scale[1] = half_height;
1303 translate[1] = half_height + y;
1304
1305 scale[2] = (f - n);
1306 translate[2] = n;
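   /* With these values an NDC Z in [0, 1] maps to
    * zs = z * scale[2] + translate[2] = z * (f - n) + n,
    * i.e. the viewport's [minDepth, maxDepth] range.
    */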
1307
   /* It seems that if the scale is small enough the hardware won't clip
    * correctly, so we work around this by choosing the smallest scale that
    * seems to work.
1311 *
1312 * This case is exercised by CTS:
1313 * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
1314 *
1315 * V3D 7.x fixes this by using the new
1316 * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
1317 */
1318 #if V3D_VERSION <= 42
1319 const float min_abs_scale = 0.0005f;
1320 if (fabs(scale[2]) < min_abs_scale)
1321 scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
1322 #endif
1323 }
1324
1325 void
v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
1327 {
1328 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1329 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1330 assert(pipeline);
1331
   /* FIXME: right now we only support one viewport. viewports[0] would work
    * for now, but this would need to change if we allow multiple viewports.
    */
1335 float *vptranslate = dynamic->viewport.translate[0];
1336 float *vpscale = dynamic->viewport.scale[0];
1337
1338 struct v3dv_job *job = cmd_buffer->state.job;
1339 assert(job);
1340
1341 const uint32_t required_cl_size =
1342 cl_packet_length(CLIPPER_XY_SCALING) +
1343 cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
1344 cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
1345 cl_packet_length(VIEWPORT_OFFSET);
1346 v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
1347 v3dv_return_if_oom(cmd_buffer, NULL);
1348
1349 #if V3D_VERSION == 42
1350 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1351 clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
1352 clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
1353 }
1354 #endif
1355 #if V3D_VERSION >= 71
1356 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1357 clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
1358 clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
1359 }
1360 #endif
1361
1362 float translate_z, scale_z;
1363 v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
1364 &translate_z, &scale_z);
1365
1366 #if V3D_VERSION == 42
1367 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1368 clip.viewport_z_offset_zc_to_zs = translate_z;
1369 clip.viewport_z_scale_zc_to_zs = scale_z;
1370 }
1371 #endif
1372
1373 #if V3D_VERSION >= 71
1374 /* If the Z scale is too small guardband clipping may not clip correctly */
1375 if (fabsf(scale_z) < 0.01f) {
1376 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
1377 clip.viewport_z_offset_zc_to_zs = translate_z;
1378 clip.viewport_z_scale_zc_to_zs = scale_z;
1379 }
1380 } else {
1381 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1382 clip.viewport_z_offset_zc_to_zs = translate_z;
1383 clip.viewport_z_scale_zc_to_zs = scale_z;
1384 }
1385 }
1386 #endif
1387
1388 cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
1389 /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
1390 * we are using OpenGL's [-1, 1] instead.
1391 */
1392 float z1 = pipeline->negative_one_to_one ? translate_z - scale_z :
1393 translate_z;
1394 float z2 = translate_z + scale_z;
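      /* scale_z may be negative (e.g. with inverted depth ranges), so order
       * the limits explicitly.
       */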
1395 clip.minimum_zw = MIN2(z1, z2);
1396 clip.maximum_zw = MAX2(z1, z2);
1397 }
1398
1399 cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
1400 float vp_fine_x = vptranslate[0];
1401 float vp_fine_y = vptranslate[1];
1402 int32_t vp_coarse_x = 0;
1403 int32_t vp_coarse_y = 0;
1404
1405 /* The fine coordinates must be unsigned, but coarse can be signed */
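      /* For example, a viewport X translate of -70.0 becomes fine_x = 58.0
       * and coarse_x = -2, since -2 * 64 + 58 = -70.
       */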
1406 if (unlikely(vp_fine_x < 0)) {
1407 int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64);
1408 vp_fine_x += 64.0f * blocks_64;
1409 vp_coarse_x -= blocks_64;
1410 }
1411
1412 if (unlikely(vp_fine_y < 0)) {
1413 int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64);
1414 vp_fine_y += 64.0f * blocks_64;
1415 vp_coarse_y -= blocks_64;
1416 }
1417
1418 vp.fine_x = vp_fine_x;
1419 vp.fine_y = vp_fine_y;
1420 vp.coarse_x = vp_coarse_x;
1421 vp.coarse_y = vp_coarse_y;
1422 }
1423
1424 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEWPORT;
1425 }
1426
1427 void
v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer)
1429 {
1430 struct v3dv_job *job = cmd_buffer->state.job;
1431 assert(job);
1432
1433 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1434 struct v3dv_dynamic_state *dynamic_state = &cmd_buffer->state.dynamic;
1435
1436 const uint32_t dynamic_stencil_states = V3DV_DYNAMIC_STENCIL_COMPARE_MASK |
1437 V3DV_DYNAMIC_STENCIL_WRITE_MASK |
1438 V3DV_DYNAMIC_STENCIL_REFERENCE;
1439
1440 v3dv_cl_ensure_space_with_branch(&job->bcl,
1441 2 * cl_packet_length(STENCIL_CFG));
1442 v3dv_return_if_oom(cmd_buffer, NULL);
1443
1444 bool emitted_stencil = false;
1445 for (uint32_t i = 0; i < 2; i++) {
1446 if (pipeline->emit_stencil_cfg[i]) {
1447 if (dynamic_state->mask & dynamic_stencil_states) {
1448 cl_emit_with_prepacked(&job->bcl, STENCIL_CFG,
1449 pipeline->stencil_cfg[i], config) {
1450 if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_COMPARE_MASK) {
1451 config.stencil_test_mask =
1452 i == 0 ? dynamic_state->stencil_compare_mask.front :
1453 dynamic_state->stencil_compare_mask.back;
1454 }
1455 if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_WRITE_MASK) {
1456 config.stencil_write_mask =
1457 i == 0 ? dynamic_state->stencil_write_mask.front :
1458 dynamic_state->stencil_write_mask.back;
1459 }
1460 if (dynamic_state->mask & V3DV_DYNAMIC_STENCIL_REFERENCE) {
1461 config.stencil_ref_value =
1462 i == 0 ? dynamic_state->stencil_reference.front :
1463 dynamic_state->stencil_reference.back;
1464 }
1465 }
1466 } else {
1467 cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]);
1468 }
1469
1470 emitted_stencil = true;
1471 }
1472 }
1473
1474 if (emitted_stencil) {
1475 const uint32_t dynamic_stencil_dirty_flags =
1476 V3DV_CMD_DIRTY_STENCIL_COMPARE_MASK |
1477 V3DV_CMD_DIRTY_STENCIL_WRITE_MASK |
1478 V3DV_CMD_DIRTY_STENCIL_REFERENCE;
1479 cmd_buffer->state.dirty &= ~dynamic_stencil_dirty_flags;
1480 }
1481 }
1482
1483 void
v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
1485 {
1486 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1487 assert(pipeline);
1488
1489 if (!pipeline->depth_bias.enabled)
1490 return;
1491
1492 struct v3dv_job *job = cmd_buffer->state.job;
1493 assert(job);
1494
1495 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET));
1496 v3dv_return_if_oom(cmd_buffer, NULL);
1497
1498 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1499 cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
1500 bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
1501 bias.depth_offset_units = dynamic->depth_bias.constant_factor;
1502 #if V3D_VERSION <= 42
1503 if (pipeline->depth_bias.is_z16)
1504 bias.depth_offset_units *= 256.0f;
1505 #endif
1506 bias.limit = dynamic->depth_bias.depth_bias_clamp;
1507 }
1508
1509 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
1510 }
1511
1512 void
v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
1514 {
   /* No depthBounds support for v42, so this method is empty in that case.
    *
    * Note that this method still gets called because v3dv_job_init flags all
    * state as dirty. See the FIXME note in v3dv_job_init.
    */
1520
1521 #if V3D_VERSION >= 71
1522 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1523 assert(pipeline);
1524
1525 if (!pipeline->depth_bounds_test_enabled)
1526 return;
1527
1528 struct v3dv_job *job = cmd_buffer->state.job;
1529 assert(job);
1530
1531 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
1532 v3dv_return_if_oom(cmd_buffer, NULL);
1533
1534 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1535 cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
1536 bounds.lower_test_limit = dynamic->depth_bounds.min;
1537 bounds.upper_test_limit = dynamic->depth_bounds.max;
1538 }
1539
1540 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS;
1541 #endif
1542 }
1543
1544 void
1545 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
1546 {
1547 struct v3dv_job *job = cmd_buffer->state.job;
1548 assert(job);
1549
1550 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH));
1551 v3dv_return_if_oom(cmd_buffer, NULL);
1552
1553 cl_emit(&job->bcl, LINE_WIDTH, line) {
1554 line.line_width = cmd_buffer->state.dynamic.line_width;
1555 }
1556
1557 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_LINE_WIDTH;
1558 }
1559
1560 void
1561 v3dX(cmd_buffer_emit_sample_state)(struct v3dv_cmd_buffer *cmd_buffer)
1562 {
1563 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1564 assert(pipeline);
1565
1566 struct v3dv_job *job = cmd_buffer->state.job;
1567 assert(job);
1568
1569 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE));
1570 v3dv_return_if_oom(cmd_buffer, NULL);
1571
1572 cl_emit(&job->bcl, SAMPLE_STATE, state) {
1573 state.coverage = 1.0f;
1574 state.mask = pipeline->sample_mask;
1575 }
1576 }
1577
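/* Emits blend state: BLEND_ENABLES and the per-render-target BLEND_CFG
 * packets when the pipeline is dirty, plus BLEND_CONSTANT_COLOR when the
 * pipeline needs blend constants and they have changed.
 */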
1578 void
1579 v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
1580 {
1581 struct v3dv_job *job = cmd_buffer->state.job;
1582 assert(job);
1583
1584 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1585 assert(pipeline);
1586
1587 const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
1588 const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
1589
1590 const uint32_t blend_packets_size =
1591 cl_packet_length(BLEND_ENABLES) +
1592 cl_packet_length(BLEND_CONSTANT_COLOR) +
1593 cl_packet_length(BLEND_CFG) * max_color_rts;
1594
1595 v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
1596 v3dv_return_if_oom(cmd_buffer, NULL);
1597
1598 if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
1599 if (pipeline->blend.enables) {
1600 cl_emit(&job->bcl, BLEND_ENABLES, enables) {
1601 enables.mask = pipeline->blend.enables;
1602 }
1603 }
1604
1605 for (uint32_t i = 0; i < max_color_rts; i++) {
1606 if (pipeline->blend.enables & (1 << i))
1607 cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
1608 }
1609 }
1610
1611 if (pipeline->blend.needs_color_constants &&
1612 cmd_buffer->state.dirty & V3DV_CMD_DIRTY_BLEND_CONSTANTS) {
1613 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1614 cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
1615 color.red_f16 = _mesa_float_to_half(dynamic->blend_constants[0]);
1616 color.green_f16 = _mesa_float_to_half(dynamic->blend_constants[1]);
1617 color.blue_f16 = _mesa_float_to_half(dynamic->blend_constants[2]);
1618 color.alpha_f16 = _mesa_float_to_half(dynamic->blend_constants[3]);
1619 }
1620 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_BLEND_CONSTANTS;
1621 }
1622 }
1623
1624 void
1625 v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
1626 {
1627 struct v3dv_job *job = cmd_buffer->state.job;
1628 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS));
     v3dv_return_if_oom(cmd_buffer, NULL);
1629
1630 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1631 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1632 uint32_t color_write_mask = ~dynamic->color_write_enable |
1633 pipeline->blend.color_write_masks;
1634 #if V3D_VERSION <= 42
1635 /* Only 4 RTs */
1636 color_write_mask &= 0xffff;
1637 #endif
1638
1639 cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
1640 mask.mask = color_write_mask;
1641 }
1642
1643 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
1644 }
1645
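/* The three helpers below emit the FLAT_SHADE_FLAGS, NON_PERSPECTIVE_FLAGS
 * and CENTROID_FLAGS packets for the group of varyings selected by
 * varying_offset, specifying how the flags of lower and higher numbered
 * varyings should be handled.
 */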
1646 static void
1647 emit_flat_shade_flags(struct v3dv_job *job,
1648 int varying_offset,
1649 uint32_t varyings,
1650 enum V3DX(Varying_Flags_Action) lower,
1651 enum V3DX(Varying_Flags_Action) higher)
1652 {
1653 v3dv_cl_ensure_space_with_branch(&job->bcl,
1654 cl_packet_length(FLAT_SHADE_FLAGS));
1655 v3dv_return_if_oom(NULL, job);
1656
1657 cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
1658 flags.varying_offset_v0 = varying_offset;
1659 flags.flat_shade_flags_for_varyings_v024 = varyings;
1660 flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower;
1661 flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher;
1662 }
1663 }
1664
1665 static void
1666 emit_noperspective_flags(struct v3dv_job *job,
1667 int varying_offset,
1668 uint32_t varyings,
1669 enum V3DX(Varying_Flags_Action) lower,
1670 enum V3DX(Varying_Flags_Action) higher)
1671 {
1672 v3dv_cl_ensure_space_with_branch(&job->bcl,
1673 cl_packet_length(NON_PERSPECTIVE_FLAGS));
1674 v3dv_return_if_oom(NULL, job);
1675
1676 cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) {
1677 flags.varying_offset_v0 = varying_offset;
1678 flags.non_perspective_flags_for_varyings_v024 = varyings;
1679 flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower;
1680 flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher;
1681 }
1682 }
1683
1684 static void
1685 emit_centroid_flags(struct v3dv_job *job,
1686 int varying_offset,
1687 uint32_t varyings,
1688 enum V3DX(Varying_Flags_Action) lower,
1689 enum V3DX(Varying_Flags_Action) higher)
1690 {
1691 v3dv_cl_ensure_space_with_branch(&job->bcl,
1692 cl_packet_length(CENTROID_FLAGS));
1693 v3dv_return_if_oom(NULL, job);
1694
1695 cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
1696 flags.varying_offset_v0 = varying_offset;
1697 flags.centroid_flags_for_varyings_v024 = varyings;
1698 flags.action_for_centroid_flags_of_lower_numbered_varyings = lower;
1699 flags.action_for_centroid_flags_of_higher_numbered_varyings = higher;
1700 }
1701 }
1702
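/* Walks the per-varying flag words and emits one packet (via the provided
 * callback) for each non-zero word: the first emitted packet zeroes the
 * flags of the remaining varyings and subsequent packets leave them
 * unchanged. Returns false if no flags were set at all, in which case the
 * caller emits the corresponding ZERO_ALL_* packet instead.
 */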
1703 static bool
1704 emit_varying_flags(struct v3dv_job *job,
1705 uint32_t num_flags,
1706 const uint32_t *flags,
1707 void (*flag_emit_callback)(struct v3dv_job *job,
1708 int varying_offset,
1709 uint32_t flags,
1710 enum V3DX(Varying_Flags_Action) lower,
1711 enum V3DX(Varying_Flags_Action) higher))
1712 {
1713 bool emitted_any = false;
1714 for (int i = 0; i < num_flags; i++) {
1715 if (!flags[i])
1716 continue;
1717
1718 if (emitted_any) {
1719 flag_emit_callback(job, i, flags[i],
1720 V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1721 V3D_VARYING_FLAGS_ACTION_UNCHANGED);
1722 } else if (i == 0) {
1723 flag_emit_callback(job, i, flags[i],
1724 V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1725 V3D_VARYING_FLAGS_ACTION_ZEROED);
1726 } else {
1727 flag_emit_callback(job, i, flags[i],
1728 V3D_VARYING_FLAGS_ACTION_ZEROED,
1729 V3D_VARYING_FLAGS_ACTION_ZEROED);
1730 }
1731
1732 emitted_any = true;
1733 }
1734
1735 return emitted_any;
1736 }
1737
1738 void
1739 v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer)
1740 {
1741 struct v3dv_job *job = cmd_buffer->state.job;
1742 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1743
1744 struct v3d_fs_prog_data *prog_data_fs =
1745 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
1746
1747 const uint32_t num_flags =
1748 ARRAY_SIZE(prog_data_fs->flat_shade_flags);
1749 const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
1750 const uint32_t *noperspective_flags = prog_data_fs->noperspective_flags;
1751 const uint32_t *centroid_flags = prog_data_fs->centroid_flags;
1752
1753 if (!emit_varying_flags(job, num_flags, flat_shade_flags,
1754 emit_flat_shade_flags)) {
1755 v3dv_cl_ensure_space_with_branch(
1756 &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
1757 v3dv_return_if_oom(cmd_buffer, NULL);
1758
1759 cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
1760 }
1761
1762 if (!emit_varying_flags(job, num_flags, noperspective_flags,
1763 emit_noperspective_flags)) {
1764 v3dv_cl_ensure_space_with_branch(
1765 &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
1766 v3dv_return_if_oom(cmd_buffer, NULL);
1767
1768 cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
1769 }
1770
1771 if (!emit_varying_flags(job, num_flags, centroid_flags,
1772 emit_centroid_flags)) {
1773 v3dv_cl_ensure_space_with_branch(
1774 &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
1775 v3dv_return_if_oom(cmd_buffer, NULL);
1776
1777 cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
1778 }
1779 }
1780
1781 /* Updates job early Z state tracking. Returns false if EZ must be disabled
1782 * for the current draw call.
1783 */
1784 static bool
1785 job_update_ez_state(struct v3dv_job *job,
1786 struct v3dv_pipeline *pipeline,
1787 struct v3dv_cmd_buffer *cmd_buffer)
1788 {
1789 /* If first_ez_state is V3D_EZ_DISABLED it means that we have already
1790 * determined that we should disable EZ completely for all draw calls in
1791 * this job. This will cause us to disable EZ for the entire job in the
1792 * Tile Rendering Mode RCL packet and when we do that we need to make sure
1793 * we never emit a draw call in the job with EZ enabled in the CFG_BITS
1794  * packet, so ez_state must also be V3D_EZ_DISABLED.
1795 */
1796 if (job->first_ez_state == V3D_EZ_DISABLED) {
1797 assert(job->ez_state == V3D_EZ_DISABLED);
1798 return false;
1799 }
1800
1801 /* If ez_state is V3D_EZ_DISABLED it means that we have already decided
1802  * that EZ must be disabled for the remainder of the frame.
1803 */
1804 if (job->ez_state == V3D_EZ_DISABLED)
1805 return false;
1806
1807 /* This is part of the pre-draw-call handling, so we should be inside a
1808 * render pass.
1809 */
1810 assert(cmd_buffer->state.pass);
1811
1812 /* If this is the first time we update EZ state for this job we first check
1813 * if there is anything that requires disabling it completely for the entire
1814 * job (based on state that is not related to the current draw call and
1815 * pipeline state).
1816 */
1817 if (!job->decided_global_ez_enable) {
1818 job->decided_global_ez_enable = true;
1819
1820 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1821 assert(state->subpass_idx < state->pass->subpass_count);
1822 struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
1823 if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
1824 job->first_ez_state = V3D_EZ_DISABLED;
1825 job->ez_state = V3D_EZ_DISABLED;
1826 return false;
1827 }
1828
1829 /* GFXH-1918: the early-z buffer may load incorrect depth values
1830 * if the frame has odd width or height.
1831 *
1832 * So we need to disable EZ in this case.
1833 */
1834 const struct v3dv_render_pass_attachment *ds_attachment =
1835 &state->pass->attachments[subpass->ds_attachment.attachment];
1836
1837 const VkImageAspectFlags ds_aspects =
1838 vk_format_aspects(ds_attachment->desc.format);
1839
1840 bool needs_depth_load =
1841 v3dv_cmd_buffer_check_needs_load(state,
1842 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1843 ds_attachment->first_subpass,
1844 ds_attachment->desc.loadOp,
1845 ds_attachment->last_subpass,
1846 ds_attachment->desc.storeOp);
1847
1848 if (needs_depth_load) {
1849 struct v3dv_framebuffer *fb = state->framebuffer;
1850
1851 if (!fb) {
1852 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1853 perf_debug("Loading depth aspect in a secondary command buffer "
1854 "without framebuffer info disables early-z tests.\n");
1855 job->first_ez_state = V3D_EZ_DISABLED;
1856 job->ez_state = V3D_EZ_DISABLED;
1857 return false;
1858 }
1859
1860 if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) {
1861 perf_debug("Loading depth aspect for framebuffer with odd width "
1862 "or height disables early-Z tests.\n");
1863 job->first_ez_state = V3D_EZ_DISABLED;
1864 job->ez_state = V3D_EZ_DISABLED;
1865 return false;
1866 }
1867 }
1868 }
1869
1870 /* Otherwise, we can decide to selectively enable or disable EZ for draw
1871 * calls using the CFG_BITS packet based on the bound pipeline state.
1872 */
1873 bool disable_ez = false;
1874 bool incompatible_test = false;
1875 switch (pipeline->ez_state) {
1876 case V3D_EZ_UNDECIDED:
1877 /* If the pipeline didn't pick a direction but didn't disable, then go
1878 * along with the current EZ state. This allows EZ optimization for Z
1879 * func == EQUAL or NEVER.
1880 */
1881 break;
1882
1883 case V3D_EZ_LT_LE:
1884 case V3D_EZ_GT_GE:
1885 /* If the pipeline picked a direction, then it needs to match the current
1886 * direction if we've decided on one.
1887 */
1888 if (job->ez_state == V3D_EZ_UNDECIDED) {
1889 job->ez_state = pipeline->ez_state;
1890 } else if (job->ez_state != pipeline->ez_state) {
1891 disable_ez = true;
1892 incompatible_test = true;
1893 }
1894 break;
1895
1896 case V3D_EZ_DISABLED:
1897 disable_ez = true;
1898 incompatible_test = pipeline->incompatible_ez_test;
1899 break;
1900 }
1901
1902 if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) {
1903 assert(job->ez_state != V3D_EZ_DISABLED);
1904 job->first_ez_state = job->ez_state;
1905 }
1906
1907 /* If we had to disable EZ because of an incompatible test direction and
1908  * the pipeline writes depth, then we need to disable EZ for the rest of
1909 * the frame.
1910 */
1911 if (incompatible_test && pipeline->z_updates_enable) {
1912 assert(disable_ez);
1913 job->ez_state = V3D_EZ_DISABLED;
1914 }
1915
1916 if (!disable_ez)
1917 job->has_ez_draws = true;
1918
1919 return !disable_ez;
1920 }
1921
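/* Emits CFG_BITS from the pre-packed pipeline configuration. On V3D 4.2 the
 * early-Z enable/update bits are resolved here per draw call based on the
 * job's early-Z state tracking.
 */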
1922 void
1923 v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
1924 {
1925 struct v3dv_job *job = cmd_buffer->state.job;
1926 assert(job);
1927
1928 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1929 assert(pipeline);
1930
1931 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
1932 v3dv_return_if_oom(cmd_buffer, NULL);
1933
1934 cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
1935 #if V3D_VERSION == 42
1936 bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
1937 config.early_z_enable = enable_ez;
1938 config.early_z_updates_enable = config.early_z_enable &&
1939 pipeline->z_updates_enable;
1940 #endif
1941 }
1942 }
1943
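/* Emits OCCLUSION_QUERY_COUNTER pointing at the currently active query
 * counter, or with a null address to stop occlusion counting when no query
 * is active.
 */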
1944 void
1945 v3dX(cmd_buffer_emit_occlusion_query)(struct v3dv_cmd_buffer *cmd_buffer)
1946 {
1947 struct v3dv_job *job = cmd_buffer->state.job;
1948 assert(job);
1949
1950 v3dv_cl_ensure_space_with_branch(&job->bcl,
1951 cl_packet_length(OCCLUSION_QUERY_COUNTER));
1952 v3dv_return_if_oom(cmd_buffer, NULL);
1953
1954 cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
1955 if (cmd_buffer->state.query.active_query.bo) {
1956 counter.address =
1957 v3dv_cl_address(cmd_buffer->state.query.active_query.bo,
1958 cmd_buffer->state.query.active_query.offset);
1959 }
1960 }
1961
1962 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
1963 }
1964
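/* Finishes the current job and resumes the subpass in a new job that is
 * serialized against all previous work, optionally requiring a BCL sync for
 * barriers that affect the binning stage.
 */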
1965 static struct v3dv_job *
1966 cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer,
1967 bool is_bcl_barrier)
1968 {
1969 assert(cmd_buffer->state.subpass_idx != -1);
1970 v3dv_cmd_buffer_finish_job(cmd_buffer);
1971 struct v3dv_job *job =
1972 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
1973 cmd_buffer->state.subpass_idx);
1974 if (!job)
1975 return NULL;
1976
1977 /* FIXME: we can do better than all barriers */
1978 job->serialize = V3DV_BARRIER_ALL;
1979 job->needs_bcl_sync = is_bcl_barrier;
1980 return job;
1981 }
1982
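/* Appends the secondary's pending vkCmdEndQuery state to the primary's list
 * so those queries are processed when the current primary job is finished.
 */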
1983 static void
1984 cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
1985 struct v3dv_cmd_buffer *secondary)
1986 {
1987 struct v3dv_cmd_buffer_state *p_state = &primary->state;
1988 struct v3dv_cmd_buffer_state *s_state = &secondary->state;
1989
1990 const uint32_t total_state_count =
1991 p_state->query.end.used_count + s_state->query.end.used_count;
1992 v3dv_cmd_buffer_ensure_array_state(primary,
1993 sizeof(struct v3dv_end_query_info),
1994 total_state_count,
1995 &p_state->query.end.alloc_count,
1996 (void **) &p_state->query.end.states);
1997 v3dv_return_if_oom(primary, NULL);
1998
1999 for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
2000 const struct v3dv_end_query_info *s_qstate =
2001 &secondary->state.query.end.states[i];
2002
2003 struct v3dv_end_query_info *p_qstate =
2004 &p_state->query.end.states[p_state->query.end.used_count++];
2005
2006 p_qstate->pool = s_qstate->pool;
2007 p_qstate->query = s_qstate->query;
2008 }
2009 }
2010
2011 void
2012 v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
2013 uint32_t cmd_buffer_count,
2014 const VkCommandBuffer *cmd_buffers)
2015 {
2016 assert(primary->state.job);
2017
2018 /* Typically we postpone applying binning syncs until we see a draw call
2019  * that may actually access protected resources in the binning stage. However,
2020 * if the draw calls are recorded in a secondary command buffer and the
2021 * barriers were recorded in a primary command buffer, that won't work
2022 * and we will have to check if we need a binning sync when executing the
2023 * secondary.
2024 */
2025 struct v3dv_job *primary_job = primary->state.job;
2026 if (primary_job->serialize &&
2027 (primary->state.barrier.bcl_buffer_access ||
2028 primary->state.barrier.bcl_image_access)) {
2029 v3dv_cmd_buffer_consume_bcl_sync(primary, primary_job);
2030 }
2031
2032 /* Emit occlusion query state if needed so the draw calls inside our
2033 * secondaries update the counters.
2034 */
2035 bool has_occlusion_query =
2036 primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2037 if (has_occlusion_query)
2038 v3dX(cmd_buffer_emit_occlusion_query)(primary);
2039
2040 /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
2041 * pipelines used by the secondaries do, we need to re-start the primary
2042 * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
2043 */
2044 struct v3dv_barrier_state pending_barrier = { 0 };
2045 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2046 V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2047
2048 assert(secondary->usage_flags &
2049 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
2050
2051 list_for_each_entry(struct v3dv_job, secondary_job,
2052 &secondary->jobs, list_link) {
2053 if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_SECONDARY) {
2054 /* If the job is a CL, then we branch to it from the primary BCL.
2055 * In this case the secondary's BCL is finished with a
2056 * RETURN_FROM_SUB_LIST command to return back to the primary BCL
2057 * once we are done executing it.
2058 */
2059 assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
2060 assert(secondary_job->bcl.bo);
2061
2062 /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
2063 STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
2064 assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
2065 assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
2066 V3DX(RETURN_FROM_SUB_LIST_opcode));
2067
2068 /* If this secondary has any barriers (or we had any pending barrier
2069  * to apply), then we can't just branch to it from the primary; we
2070 * need to split the primary to create a new job that can consume
2071 * the barriers first.
2072 *
2073 * FIXME: in this case, maybe just copy the secondary BCL without
2074 * the RETURN_FROM_SUB_LIST into the primary job to skip the
2075 * branch?
2076 */
2077 primary_job = primary->state.job;
2078 if (!primary_job || secondary_job->serialize ||
2079 pending_barrier.dst_mask) {
2080 const bool needs_bcl_barrier =
2081 secondary_job->needs_bcl_sync ||
2082 pending_barrier.bcl_buffer_access ||
2083 pending_barrier.bcl_image_access;
2084
2085 primary_job =
2086 cmd_buffer_subpass_split_for_barrier(primary,
2087 needs_bcl_barrier);
2088 v3dv_return_if_oom(primary, NULL);
2089
2090 /* Since we have created a new primary we need to re-emit
2091 * occlusion query state.
2092 */
2093 if (has_occlusion_query)
2094 v3dX(cmd_buffer_emit_occlusion_query)(primary);
2095 }
2096
2097 /* Make sure our primary job has all required BO references */
2098 set_foreach(secondary_job->bos, entry) {
2099 struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
2100 v3dv_job_add_bo(primary_job, bo);
2101 }
2102
2103 /* Emit required branch instructions. We expect each of these
2104 * to end with a corresponding 'return from sub list' item.
2105 */
2106 list_for_each_entry(struct v3dv_bo, bcl_bo,
2107 &secondary_job->bcl.bo_list, list_link) {
2108 v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
2109 cl_packet_length(BRANCH_TO_SUB_LIST));
2110 v3dv_return_if_oom(primary, NULL);
2111 cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
2112 branch.address = v3dv_cl_address(bcl_bo, 0);
2113 }
2114 }
2115
2116 if (!secondary_job->can_use_double_buffer) {
2117 primary_job->can_use_double_buffer = false;
2118 } else {
2119 primary_job->double_buffer_score.geom +=
2120 secondary_job->double_buffer_score.geom;
2121 primary_job->double_buffer_score.render +=
2122 secondary_job->double_buffer_score.render;
2123 }
2124 primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
2125 } else {
2126 /* This is a regular job (CPU or GPU), so just finish the current
2127 * primary job (if any) and then add the secondary job to the
2128 * primary's job list right after it.
2129 */
2130 v3dv_cmd_buffer_finish_job(primary);
2131 v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
2132 if (pending_barrier.dst_mask) {
2133 /* FIXME: do the same we do for primaries and only choose the
2134 * relevant src masks.
2135 */
2136 secondary_job->serialize = pending_barrier.src_mask_graphics |
2137 pending_barrier.src_mask_transfer |
2138 pending_barrier.src_mask_compute;
2139 if (pending_barrier.bcl_buffer_access ||
2140 pending_barrier.bcl_image_access) {
2141 secondary_job->needs_bcl_sync = true;
2142 }
2143 }
2144 }
2145
2146 memset(&pending_barrier, 0, sizeof(pending_barrier));
2147 }
2148
2149 /* If the secondary has recorded any vkCmdEndQuery commands, we need to
2150 * copy this state to the primary so it is processed properly when the
2151 * current primary job is finished.
2152 */
2153 cmd_buffer_copy_secondary_end_query_state(primary, secondary);
2154
2155 /* If this secondary had any pending barrier state we will need that
2156 * barrier state consumed with whatever comes next in the primary.
2157 */
2158 assert(secondary->state.barrier.dst_mask ||
2159 (!secondary->state.barrier.bcl_buffer_access &&
2160 !secondary->state.barrier.bcl_image_access));
2161
2162 pending_barrier = secondary->state.barrier;
2163 }
2164
2165 if (pending_barrier.dst_mask) {
2166 v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
2167 &pending_barrier);
2168 }
2169 }
2170
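/* Emits the GEOMETRY_SHADER_STATE_RECORD into the job's indirect CL with the
 * code and uniform addresses for both the binning and render mode geometry
 * shader variants.
 */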
2171 static void
2172 emit_gs_shader_state_record(struct v3dv_job *job,
2173 struct v3dv_bo *assembly_bo,
2174 struct v3dv_shader_variant *gs_bin,
2175 struct v3dv_cl_reloc gs_bin_uniforms,
2176 struct v3dv_shader_variant *gs,
2177 struct v3dv_cl_reloc gs_render_uniforms)
2178 {
2179 cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) {
2180 shader.geometry_bin_mode_shader_code_address =
2181 v3dv_cl_address(assembly_bo, gs_bin->assembly_offset);
2182 shader.geometry_bin_mode_shader_4_way_threadable =
2183 gs_bin->prog_data.gs->base.threads == 4;
2184 shader.geometry_bin_mode_shader_start_in_final_thread_section =
2185 gs_bin->prog_data.gs->base.single_seg;
2186 #if V3D_VERSION <= 42
2187 shader.geometry_bin_mode_shader_propagate_nans = true;
2188 #endif
2189 shader.geometry_bin_mode_shader_uniforms_address =
2190 gs_bin_uniforms;
2191
2192 shader.geometry_render_mode_shader_code_address =
2193 v3dv_cl_address(assembly_bo, gs->assembly_offset);
2194 shader.geometry_render_mode_shader_4_way_threadable =
2195 gs->prog_data.gs->base.threads == 4;
2196 shader.geometry_render_mode_shader_start_in_final_thread_section =
2197 gs->prog_data.gs->base.single_seg;
2198 #if V3D_VERSION <= 42
2199 shader.geometry_render_mode_shader_propagate_nans = true;
2200 #endif
2201 shader.geometry_render_mode_shader_uniforms_address =
2202 gs_render_uniforms;
2203 }
2204 }
2205
2206 static uint8_t
2207 v3d_gs_output_primitive(enum mesa_prim prim_type)
2208 {
2209 switch (prim_type) {
2210 case MESA_PRIM_POINTS:
2211 return GEOMETRY_SHADER_POINTS;
2212 case MESA_PRIM_LINE_STRIP:
2213 return GEOMETRY_SHADER_LINE_STRIP;
2214 case MESA_PRIM_TRIANGLE_STRIP:
2215 return GEOMETRY_SHADER_TRI_STRIP;
2216 default:
2217 unreachable("Unsupported primitive type");
2218 }
2219 }
2220
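/* Emits TESSELLATION_GEOMETRY_COMMON_PARAMS. The tessellation fields are
 * hard-coded since no tessellation stage is used; only the GS output
 * primitive format and invocation count come from the geometry shader.
 */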
2221 static void
2222 emit_tes_gs_common_params(struct v3dv_job *job,
2223 uint8_t gs_out_prim_type,
2224 uint8_t gs_num_invocations)
2225 {
2226 cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
2227 shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
2228 shader.tessellation_point_mode = false;
2229 shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
2230 shader.tessellation_clockwise = true;
2231 shader.tessellation_invocations = 1;
2232
2233 shader.geometry_shader_output_format =
2234 v3d_gs_output_primitive(gs_out_prim_type);
2235 shader.geometry_shader_instances = gs_num_invocations & 0x1F;
2236 }
2237 }
2238
2239 static uint8_t
2240 simd_width_to_gs_pack_mode(uint32_t width)
2241 {
2242 switch (width) {
2243 case 16:
2244 return V3D_PACK_MODE_16_WAY;
2245 case 8:
2246 return V3D_PACK_MODE_8_WAY;
2247 case 4:
2248 return V3D_PACK_MODE_4_WAY;
2249 case 1:
2250 return V3D_PACK_MODE_1_WAY;
2251 default:
2252 unreachable("Invalid SIMD width");
2253 }
2254 }
2255
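/* Emits TESSELLATION_GEOMETRY_SHADER_PARAMS. The TCS/TES fields use minimal
 * fixed values, while the GS output segment size, pack mode and maximum
 * input vertex segments per batch come from the pipeline's VPM
 * configuration.
 */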
2256 static void
2257 emit_tes_gs_shader_params(struct v3dv_job *job,
2258 uint32_t gs_simd,
2259 uint32_t gs_vpm_output_size,
2260 uint32_t gs_max_vpm_input_size_per_batch)
2261 {
2262 cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
2263 shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
2264 shader.per_patch_data_column_depth = 1;
2265 shader.tcs_output_segment_size_in_sectors = 1;
2266 shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2267 shader.tes_output_segment_size_in_sectors = 1;
2268 shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2269 shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
2270 shader.gs_output_segment_pack_mode =
2271 simd_width_to_gs_pack_mode(gs_simd);
2272 shader.tbg_max_patches_per_tcs_batch = 1;
2273 shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
2274 shader.tbg_min_tcs_output_segments_required_in_play = 1;
2275 shader.tbg_min_per_patch_data_segments_required_in_play = 1;
2276 shader.tpg_max_patches_per_tes_batch = 1;
2277 shader.tpg_max_vertex_segments_per_tes_batch = 0;
2278 shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
2279 shader.tpg_min_tes_output_segments_required_in_play = 1;
2280 shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
2281 gs_max_vpm_input_size_per_batch;
2282 shader.gbg_min_gs_output_segments_required_in_play = 1;
2283 }
2284 }
2285
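/* Emits the GL shader state for the bound graphics pipeline: the shader
 * state record (plus the GS records when a geometry shader is present) and
 * the vertex attribute records are written to the indirect CL, and then
 * GL_SHADER_STATE (or GL_SHADER_STATE_INCLUDING_GS) is emitted in the BCL
 * pointing at them.
 */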
2286 void
2287 v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
2288 {
2289 struct v3dv_job *job = cmd_buffer->state.job;
2290 assert(job);
2291
2292 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2293 struct v3dv_pipeline *pipeline = state->gfx.pipeline;
2294 assert(pipeline);
2295
2296 struct v3dv_shader_variant *vs_variant =
2297 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2298 struct v3d_vs_prog_data *prog_data_vs = vs_variant->prog_data.vs;
2299
2300 struct v3dv_shader_variant *vs_bin_variant =
2301 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2302 struct v3d_vs_prog_data *prog_data_vs_bin = vs_bin_variant->prog_data.vs;
2303
2304 struct v3dv_shader_variant *fs_variant =
2305 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2306 struct v3d_fs_prog_data *prog_data_fs = fs_variant->prog_data.fs;
2307
2308 struct v3dv_shader_variant *gs_variant = NULL;
2309 struct v3dv_shader_variant *gs_bin_variant = NULL;
2310 struct v3d_gs_prog_data *prog_data_gs = NULL;
2311 struct v3d_gs_prog_data *prog_data_gs_bin = NULL;
2312 if (pipeline->has_gs) {
2313 gs_variant =
2314 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2315 prog_data_gs = gs_variant->prog_data.gs;
2316
2317 gs_bin_variant =
2318 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2319 prog_data_gs_bin = gs_bin_variant->prog_data.gs;
2320 }
2321
2322 /* Update the cache dirty flag based on the shader progs data */
2323 job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl;
2324 job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl;
2325 job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl;
2326 if (pipeline->has_gs) {
2327 job->tmu_dirty_rcl |= prog_data_gs_bin->base.tmu_dirty_rcl;
2328 job->tmu_dirty_rcl |= prog_data_gs->base.tmu_dirty_rcl;
2329 }
2330
2331 /* See GFXH-930 workaround below */
2332 uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
2333
2334 uint32_t shader_state_record_length =
2335 cl_packet_length(GL_SHADER_STATE_RECORD);
2336 if (pipeline->has_gs) {
2337 shader_state_record_length +=
2338 cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) +
2339 cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) +
2340 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS);
2341 }
2342
2343 uint32_t shader_rec_offset =
2344 v3dv_cl_ensure_space(&job->indirect,
2345 shader_state_record_length +
2346 num_elements_to_emit *
2347 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
2348 32);
2349 v3dv_return_if_oom(cmd_buffer, NULL);
2350
2351 struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo;
2352
2353 if (pipeline->has_gs) {
2354 emit_gs_shader_state_record(job,
2355 assembly_bo,
2356 gs_bin_variant,
2357 cmd_buffer->state.uniforms.gs_bin,
2358 gs_variant,
2359 cmd_buffer->state.uniforms.gs);
2360
2361 emit_tes_gs_common_params(job,
2362 prog_data_gs->out_prim_type,
2363 prog_data_gs->num_invocations);
2364
2365 emit_tes_gs_shader_params(job,
2366 pipeline->vpm_cfg_bin.gs_width,
2367 pipeline->vpm_cfg_bin.Gd,
2368 pipeline->vpm_cfg_bin.Gv);
2369
2370 emit_tes_gs_shader_params(job,
2371 pipeline->vpm_cfg.gs_width,
2372 pipeline->vpm_cfg.Gd,
2373 pipeline->vpm_cfg.Gv);
2374 }
2375
2376 #if V3D_VERSION == 42
2377 struct v3dv_bo *default_attribute_values =
2378 pipeline->default_attribute_values != NULL ?
2379 pipeline->default_attribute_values :
2380 pipeline->device->default_attribute_float;
2381 #endif
2382
2383 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
2384 pipeline->shader_state_record, shader) {
2385
2386 /* FIXME: we are setting these values here and during the
2387  * prepacking. This is because both cl_emit_with_prepacked and v3dvx_pack
2388  * assert on minimum values for these. It would be good to get
2389  * v3dvx_pack to assert on the final value if possible.
2390 */
2391 shader.min_coord_shader_input_segments_required_in_play =
2392 pipeline->vpm_cfg_bin.As;
2393 shader.min_vertex_shader_input_segments_required_in_play =
2394 pipeline->vpm_cfg.As;
2395
2396 shader.coordinate_shader_code_address =
2397 v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
2398 shader.vertex_shader_code_address =
2399 v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
2400 shader.fragment_shader_code_address =
2401 v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
2402
2403 shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
2404 shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
2405 shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
2406
2407 #if V3D_VERSION == 42
2408 shader.address_of_default_attribute_values =
2409 v3dv_cl_address(default_attribute_values, 0);
2410 #endif
2411
2412 shader.any_shader_reads_hardware_written_primitive_id =
2413 (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
2414 shader.insert_primitive_id_as_first_varying_to_fragment_shader =
2415 !pipeline->has_gs && prog_data_fs->uses_pid;
2416 }
2417
2418 /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
2419 bool cs_loaded_any = false;
2420 const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
2421 prog_data_vs_bin->uses_biid ||
2422 prog_data_vs_bin->uses_vid;
2423 const uint32_t packet_length =
2424 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
2425
2426 uint32_t emitted_va_count = 0;
2427 for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
2428 assert(i < MAX_VERTEX_ATTRIBS);
2429
2430 if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
2431 continue;
2432
2433 const uint32_t binding = pipeline->va[i].binding;
2434
2435 /* We store each vertex attribute in the array using its driver location
2436 * as index.
2437 */
2438 const uint32_t location = i;
2439
2440 struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];
2441
2442 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
2443 &pipeline->vertex_attrs[i * packet_length], attr) {
2444
2445 assert(c_vb->buffer->mem->bo);
2446 attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
2447 c_vb->buffer->mem_offset +
2448 pipeline->va[i].offset +
2449 c_vb->offset);
2450
2451 attr.number_of_values_read_by_coordinate_shader =
2452 prog_data_vs_bin->vattr_sizes[location];
2453 attr.number_of_values_read_by_vertex_shader =
2454 prog_data_vs->vattr_sizes[location];
2455
2456 /* GFXH-930: At least one attribute must be enabled and read by CS
2457 * and VS. If we have attributes being consumed by the VS but not
2458 * the CS, then set up a dummy load of the last attribute into the
2459 * CS's VPM inputs. (Since CS is just dead-code-elimination compared
2460 * to VS, we can't have CS loading but not VS).
2461 *
2462 * GFXH-1602: first attribute must be active if using builtins.
2463 */
2464 if (prog_data_vs_bin->vattr_sizes[location])
2465 cs_loaded_any = true;
2466
2467 if (i == 0 && cs_uses_builtins && !cs_loaded_any) {
2468 attr.number_of_values_read_by_coordinate_shader = 1;
2469 cs_loaded_any = true;
2470 } else if (i == pipeline->va_count - 1 && !cs_loaded_any) {
2471 attr.number_of_values_read_by_coordinate_shader = 1;
2472 cs_loaded_any = true;
2473 }
2474
2475 attr.maximum_index = 0xffffff;
2476 }
2477
2478 emitted_va_count++;
2479 }
2480
2481 if (pipeline->va_count == 0) {
2482 /* GFXH-930: At least one attribute must be enabled and read
2483 * by CS and VS. If we have no attributes being consumed by
2484 * the shader, set up a dummy to be loaded into the VPM.
2485 */
2486 cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
2487 /* Valid address of data whose value will be unused. */
2488 attr.address = v3dv_cl_address(job->indirect.bo, 0);
2489
2490 attr.type = ATTRIBUTE_FLOAT;
2491 attr.stride = 0;
2492 attr.vec_size = 1;
2493
2494 attr.number_of_values_read_by_coordinate_shader = 1;
2495 attr.number_of_values_read_by_vertex_shader = 1;
2496 }
2497 }
2498
2499 if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
2500 v3dv_cl_ensure_space_with_branch(&job->bcl,
2501 sizeof(pipeline->vcm_cache_size));
2502 v3dv_return_if_oom(cmd_buffer, NULL);
2503
2504 cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size);
2505 }
2506
2507 v3dv_cl_ensure_space_with_branch(&job->bcl,
2508 cl_packet_length(GL_SHADER_STATE));
2509 v3dv_return_if_oom(cmd_buffer, NULL);
2510
2511 if (pipeline->has_gs) {
2512 cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) {
2513 state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2514 state.number_of_attribute_arrays = num_elements_to_emit;
2515 }
2516 } else {
2517 cl_emit(&job->bcl, GL_SHADER_STATE, state) {
2518 state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2519 state.number_of_attribute_arrays = num_elements_to_emit;
2520 }
2521 }
2522
2523 /* Clearing push constants and descriptor sets for all stages is not quite
2524 * correct (some shader stages may not be used at all or they may not be
2525  * consuming push constants); however, this is not a problem because if we
2526 * bind a different pipeline we always have to rebuild the uniform streams.
2527 */
2528 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
2529 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2530 V3DV_CMD_DIRTY_PUSH_CONSTANTS);
2531 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2532 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2533 }
2534
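/* Emits a non-indexed draw: BASE_VERTEX_BASE_INSTANCE when a non-zero first
 * instance is used, then either VERTEX_ARRAY_PRIMS or
 * VERTEX_ARRAY_INSTANCED_PRIMS depending on the instance count.
 */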
2535 void
2536 v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer,
2537 struct v3dv_draw_info *info)
2538 {
2539 struct v3dv_job *job = cmd_buffer->state.job;
2540 assert(job);
2541
2542 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2543 struct v3dv_pipeline *pipeline = state->gfx.pipeline;
2544
2545 assert(pipeline);
2546
2547 uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
2548
2549 if (info->first_instance > 0) {
2550 v3dv_cl_ensure_space_with_branch(
2551 &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2552 v3dv_return_if_oom(cmd_buffer, NULL);
2553
2554 cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2555 base.base_instance = info->first_instance;
2556 base.base_vertex = 0;
2557 }
2558 }
2559
2560 if (info->instance_count > 1) {
2561 v3dv_cl_ensure_space_with_branch(
2562 &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS));
2563 v3dv_return_if_oom(cmd_buffer, NULL);
2564
2565 cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2566 prim.mode = hw_prim_type;
2567 prim.index_of_first_vertex = info->first_vertex;
2568 prim.number_of_instances = info->instance_count;
2569 prim.instance_length = info->vertex_count;
2570 }
2571 } else {
2572 v3dv_cl_ensure_space_with_branch(
2573 &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS));
2574 v3dv_return_if_oom(cmd_buffer, NULL);
2575 cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
2576 prim.mode = hw_prim_type;
2577 prim.length = info->vertex_count;
2578 prim.index_of_first_vertex = info->first_vertex;
2579 }
2580 }
2581 }
2582
2583 void
2584 v3dX(cmd_buffer_emit_index_buffer)(struct v3dv_cmd_buffer *cmd_buffer)
2585 {
2586 struct v3dv_job *job = cmd_buffer->state.job;
2587 assert(job);
2588
2589 /* We flag all state as dirty when we create a new job so make sure we
2590 * have a valid index buffer before attempting to emit state for it.
2591 */
2592 struct v3dv_buffer *ibuffer =
2593 v3dv_buffer_from_handle(cmd_buffer->state.index_buffer.buffer);
2594 if (ibuffer) {
2595 v3dv_cl_ensure_space_with_branch(
2596 &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP));
2597 v3dv_return_if_oom(cmd_buffer, NULL);
2598
2599 const uint32_t offset = cmd_buffer->state.index_buffer.offset;
2600 cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
2601 ib.address = v3dv_cl_address(ibuffer->mem->bo,
2602 ibuffer->mem_offset + offset);
2603 ib.size = ibuffer->mem->bo->size;
2604 }
2605 }
2606
2607 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER;
2608 }
2609
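/* Emits an indexed draw. The hardware index type is derived from the index
 * size (ffs(size) - 1 maps 1/2/4 bytes to 0/1/2) and the index offset is
 * computed from firstIndex. Emits INDEXED_PRIM_LIST for single-instance
 * draws and INDEXED_INSTANCED_PRIM_LIST for instanced draws.
 */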
2610 void
2611 v3dX(cmd_buffer_emit_draw_indexed)(struct v3dv_cmd_buffer *cmd_buffer,
2612 uint32_t indexCount,
2613 uint32_t instanceCount,
2614 uint32_t firstIndex,
2615 int32_t vertexOffset,
2616 uint32_t firstInstance)
2617 {
2618 struct v3dv_job *job = cmd_buffer->state.job;
2619 assert(job);
2620
2621 const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2622 uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
2623 uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2624 uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size;
2625
2626 if (vertexOffset != 0 || firstInstance != 0) {
2627 v3dv_cl_ensure_space_with_branch(
2628 &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2629 v3dv_return_if_oom(cmd_buffer, NULL);
2630
2631 cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2632 base.base_instance = firstInstance;
2633 base.base_vertex = vertexOffset;
2634 }
2635 }
2636
2637 if (instanceCount == 1) {
2638 v3dv_cl_ensure_space_with_branch(
2639 &job->bcl, cl_packet_length(INDEXED_PRIM_LIST));
2640 v3dv_return_if_oom(cmd_buffer, NULL);
2641
2642 cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
2643 prim.index_type = index_type;
2644 prim.length = indexCount;
2645 prim.index_offset = index_offset;
2646 prim.mode = hw_prim_type;
2647 prim.enable_primitive_restarts = pipeline->primitive_restart;
2648 }
2649 } else if (instanceCount > 1) {
2650 v3dv_cl_ensure_space_with_branch(
2651 &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST));
2652 v3dv_return_if_oom(cmd_buffer, NULL);
2653
2654 cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
2655 prim.index_type = index_type;
2656 prim.index_offset = index_offset;
2657 prim.mode = hw_prim_type;
2658 prim.enable_primitive_restarts = pipeline->primitive_restart;
2659 prim.number_of_instances = instanceCount;
2660 prim.instance_length = indexCount;
2661 }
2662 }
2663 }
2664
2665 void
2666 v3dX(cmd_buffer_emit_draw_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2667 struct v3dv_buffer *buffer,
2668 VkDeviceSize offset,
2669 uint32_t drawCount,
2670 uint32_t stride)
2671 {
2672 struct v3dv_job *job = cmd_buffer->state.job;
2673 assert(job);
2674
2675 const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2676 uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
2677
2678 v3dv_cl_ensure_space_with_branch(
2679 &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS));
2680 v3dv_return_if_oom(cmd_buffer, NULL);
2681
2682 cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2683 prim.mode = hw_prim_type;
2684 prim.number_of_draw_indirect_array_records = drawCount;
2685 prim.stride_in_multiples_of_4_bytes = stride >> 2;
2686 prim.address = v3dv_cl_address(buffer->mem->bo,
2687 buffer->mem_offset + offset);
2688 }
2689 }
2690
2691 void
2692 v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2693 struct v3dv_buffer *buffer,
2694 VkDeviceSize offset,
2695 uint32_t drawCount,
2696 uint32_t stride)
2697 {
2698 struct v3dv_job *job = cmd_buffer->state.job;
2699 assert(job);
2700
2701 const struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2702 uint32_t hw_prim_type = v3d_hw_prim_type(pipeline->topology);
2703 uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2704
2705 v3dv_cl_ensure_space_with_branch(
2706 &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST));
2707 v3dv_return_if_oom(cmd_buffer, NULL);
2708
2709 cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
2710 prim.index_type = index_type;
2711 prim.mode = hw_prim_type;
2712 prim.enable_primitive_restarts = pipeline->primitive_restart;
2713 prim.number_of_draw_indirect_indexed_records = drawCount;
2714 prim.stride_in_multiples_of_4_bytes = stride >> 2;
2715 prim.address = v3dv_cl_address(buffer->mem->bo,
2716 buffer->mem_offset + offset);
2717 }
2718 }
2719