1 /*
2 * Copyright © 2021 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "v3dv_private.h"
25 #include "broadcom/common/v3d_macros.h"
26 #include "broadcom/common/v3d_util.h"
27 #include "broadcom/cle/v3dx_pack.h"
28 #include "broadcom/compiler/v3d_compiler.h"
29
30 #include "util/half_float.h"
31 #include "util/u_pack_color.h"
32 #include "vk_format.h"
33
34 void
35 v3dX(job_emit_binning_flush)(struct v3dv_job *job)
36 {
37 assert(job);
38
39 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
40 v3dv_return_if_oom(NULL, job);
41
42 cl_emit(&job->bcl, FLUSH, flush);
43 }
44
45 void
46 v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
47 {
48 assert(job->can_use_double_buffer);
49 assert(job->frame_tiling.double_buffer);
50 assert(!job->frame_tiling.msaa);
51 assert(job->bcl_tile_binning_mode_ptr);
52
53 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
54 struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = {
55 cl_packet_header(TILE_BINNING_MODE_CFG),
56 };
57 config.width_in_pixels = tiling->width;
58 config.height_in_pixels = tiling->height;
59 #if V3D_VERSION == 42
60 config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
61 config.multisample_mode_4x = tiling->msaa;
62 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
63 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
64 #endif
65 #if V3D_VERSION >= 71
66 config.log2_tile_width = log2_tile_size(tiling->tile_width);
67 config.log2_tile_height = log2_tile_size(tiling->tile_height);
68 #endif
69
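   /* Rewrite the TILE_BINNING_MODE_CFG packet that was emitted in the binning
    * prolog in place, now with double-buffer enabled.
    */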
70 uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
71 cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
72 }
73
74 void
75 v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
76 const struct v3dv_frame_tiling *tiling,
77 uint32_t layers)
78 {
79 /* This must go before the binning mode configuration. It is
80 * required for layered framebuffers to work.
81 */
82 cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
83 config.number_of_layers = layers;
84 }
85
86 assert(!tiling->double_buffer || !tiling->msaa);
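   /* Remember where this packet starts so that job_emit_enable_double_buffer()
    * can later rewrite it in place if we decide to enable double-buffer mode
    * for this job.
    */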
87 job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
88 cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
89 config.width_in_pixels = tiling->width;
90 config.height_in_pixels = tiling->height;
91 #if V3D_VERSION == 42
92 config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
93 config.multisample_mode_4x = tiling->msaa;
94 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
95 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
96 #endif
97 #if V3D_VERSION >= 71
98 config.log2_tile_width = log2_tile_size(tiling->tile_width);
99 config.log2_tile_height = log2_tile_size(tiling->tile_height);
100 /* FIXME: ideally we would like the next assert to be on the packet header
101 * (as it is generic, so it also applies to GL). We would need to expand
102 * gen_pack_header for that.
103 */
104 assert(config.log2_tile_width == config.log2_tile_height ||
105 config.log2_tile_width == config.log2_tile_height + 1);
106 #endif
107 }
108
109 /* There's definitely nothing in the VCD cache we want. */
110 cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
111
112 /* "Binning mode lists must have a Start Tile Binning item (6) after
113 * any prefix state data before the binning list proper starts."
114 */
115 cl_emit(&job->bcl, START_TILE_BINNING, bin);
116 }
117
118 void
119 v3dX(cmd_buffer_end_render_pass_secondary)(struct v3dv_cmd_buffer *cmd_buffer)
120 {
121 assert(cmd_buffer->state.job);
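   /* A secondary command buffer's BCL is expected to be executed as a sub-list
    * branched into from a primary command buffer, so it has to end by
    * returning to the caller's list.
    */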
122 v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
123 cl_packet_length(RETURN_FROM_SUB_LIST));
124 v3dv_return_if_oom(cmd_buffer, NULL);
125 cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
126 }
127
128 void
129 v3dX(job_emit_clip_window)(struct v3dv_job *job, const VkRect2D *rect)
130 {
131 assert(job);
132
133 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW));
134 v3dv_return_if_oom(NULL, job);
135
136 cl_emit(&job->bcl, CLIP_WINDOW, clip) {
137 clip.clip_window_left_pixel_coordinate = rect->offset.x;
138 clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
139 clip.clip_window_width_in_pixels = rect->extent.width;
140 clip.clip_window_height_in_pixels = rect->extent.height;
141 }
142 }
143
144 static void
145 cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
146 struct v3dv_cl *cl,
147 struct v3dv_image_view *iview,
148 uint32_t layer,
149 uint32_t buffer)
150 {
151 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
152
153 /* We don't support rendering to ycbcr images, so the image view should be
154 * single-plane, and using a single-plane format. But note that the underlying
155 * image can be a ycbcr format, as we support rendering to a specific plane
156 * of an image. This is used for example on some meta_copy code paths, in
157 * order to copy from/to a plane of a ycbcr image.
158 */
159 assert(iview->plane_count == 1);
160 assert(iview->format->plane_count == 1);
161
162 uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
163 const struct v3d_resource_slice *slice =
164 &image->planes[image_plane].slices[iview->vk.base_mip_level];
165
166 uint32_t layer_offset =
167 v3dv_layer_offset(image, iview->vk.base_mip_level,
168 iview->vk.base_array_layer + layer, image_plane);
169
170 cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
171 load.buffer_to_load = buffer;
172 load.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
173
174 load.input_image_format = iview->format->planes[0].rt_type;
175
176 /* If we create an image view with only the stencil format, we
177 * re-interpret the format as RGBA8_UINT, as that is what we want in
178 * general (see CreateImageView).
179 *
180 * However, when we are loading/storing tiles from the ZSTENCIL tile
181 * buffer, we need to use the underlying DS format.
182 */
183 if (buffer == ZSTENCIL &&
184 iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
185 assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
186 load.input_image_format = image->format->planes[image_plane].rt_type;
187 }
188
189 load.r_b_swap = iview->planes[0].swap_rb;
190 load.channel_reverse = iview->planes[0].channel_reverse;
191 load.memory_format = slice->tiling;
192
193 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
194 slice->tiling == V3D_TILING_UIF_XOR) {
195 load.height_in_ub_or_stride =
196 slice->padded_height_of_output_image_in_uif_blocks;
197 } else if (slice->tiling == V3D_TILING_RASTER) {
198 load.height_in_ub_or_stride = slice->stride;
199 }
200
201 if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
202 load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
203 else
204 load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
205 }
206 }
207
208 static inline uint32_t
209 v3dv_zs_buffer(bool depth, bool stencil)
210 {
211 if (depth && stencil)
212 return ZSTENCIL;
213 else if (depth)
214 return Z;
215 else if (stencil)
216 return STENCIL;
217 return NONE;
218 }
219
220 static void
221 cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
222 struct v3dv_cl *cl,
223 uint32_t layer)
224 {
225 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
226 const struct v3dv_render_pass *pass = state->pass;
227 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
228
229 assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);
230
231 for (uint32_t i = 0; i < subpass->color_count; i++) {
232 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
233
234 if (attachment_idx == VK_ATTACHMENT_UNUSED)
235 continue;
236
237 const struct v3dv_render_pass_attachment *attachment =
238 &state->pass->attachments[attachment_idx];
239
240 /* According to the Vulkan spec:
241 *
242 * "The load operation for each sample in an attachment happens before
243 * any recorded command which accesses the sample in the first subpass
244 * where the attachment is used."
245 *
246 * If the load operation is CLEAR, we must only clear once on the first
247 * subpass that uses the attachment (and in that case we don't LOAD).
248 * After that, we always want to load so we don't lose any rendering done
249 * by a previous subpass to the same attachment. We also want to load
250 * if the current job is continuing subpass work started by a previous
251 * job, for the same reason.
252 *
253 * If the render area is not aligned to tile boundaries then we have
254 * tiles which are partially covered by it. In this case, we need to
255 * load the tiles so we can preserve the pixels that are outside the
256 * render area for any such tiles.
257 */
258 uint32_t first_subpass = !pass->multiview_enabled ?
259 attachment->first_subpass :
260 attachment->views[layer].first_subpass;
261
262 uint32_t last_subpass = !pass->multiview_enabled ?
263 attachment->last_subpass :
264 attachment->views[layer].last_subpass;
265
266 bool needs_load =
267 v3dv_cmd_buffer_check_needs_load(state,
268 VK_IMAGE_ASPECT_COLOR_BIT,
269 first_subpass,
270 attachment->desc.loadOp,
271 last_subpass,
272 attachment->desc.storeOp);
273 if (needs_load) {
274 struct v3dv_image_view *iview =
275 state->attachments[attachment_idx].image_view;
276 cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview,
277 layer, RENDER_TARGET_0 + i);
278 }
279 }
280
281 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
282 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
283 const struct v3dv_render_pass_attachment *ds_attachment =
284 &state->pass->attachments[ds_attachment_idx];
285
286 const VkImageAspectFlags ds_aspects =
287 vk_format_aspects(ds_attachment->desc.format);
288
289 uint32_t ds_first_subpass = !pass->multiview_enabled ?
290 ds_attachment->first_subpass :
291 ds_attachment->views[layer].first_subpass;
292
293 uint32_t ds_last_subpass = !pass->multiview_enabled ?
294 ds_attachment->last_subpass :
295 ds_attachment->views[layer].last_subpass;
296
297 const bool needs_depth_load =
298 v3dv_cmd_buffer_check_needs_load(state,
299 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
300 ds_first_subpass,
301 ds_attachment->desc.loadOp,
302 ds_last_subpass,
303 ds_attachment->desc.storeOp);
304
305 const bool needs_stencil_load =
306 v3dv_cmd_buffer_check_needs_load(state,
307 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
308 ds_first_subpass,
309 ds_attachment->desc.stencilLoadOp,
310 ds_last_subpass,
311 ds_attachment->desc.stencilStoreOp);
312
313 if (needs_depth_load || needs_stencil_load) {
314 struct v3dv_image_view *iview =
315 state->attachments[ds_attachment_idx].image_view;
316 /* From the Vulkan spec:
317 *
318 * "When an image view of a depth/stencil image is used as a
319 * depth/stencil framebuffer attachment, the aspectMask is ignored
320 * and both depth and stencil image subresources are used."
321 *
322 * So we ignore the aspects from the subresource range of the image
323 * view for the depth/stencil attachment, but we still need to restrict
324 * the load to the aspects compatible with the render pass and the image.
325 */
326 const uint32_t zs_buffer =
327 v3dv_zs_buffer(needs_depth_load, needs_stencil_load);
328 cmd_buffer_render_pass_emit_load(cmd_buffer, cl,
329 iview, layer, zs_buffer);
330 }
331 }
332
333 cl_emit(cl, END_OF_LOADS, end);
334 }
335
336 static void
337 cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
338 struct v3dv_cl *cl,
339 uint32_t attachment_idx,
340 uint32_t layer,
341 uint32_t buffer,
342 bool clear,
343 bool is_multisample_resolve)
344 {
345 const struct v3dv_image_view *iview =
346 cmd_buffer->state.attachments[attachment_idx].image_view;
347 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
348
349 /* We don't support rendering to ycbcr images, so the image view should be
350 * single-plane, and using a single-plane format. But note that the underlying
351 * image can be a ycbcr format, as we support rendering to a specific plane
352 * of an image. This is used for example on some meta_copy code paths, in
353 * order to copy from/to a plane of a ycbcr image.
354 */
355 assert(iview->plane_count == 1);
356 assert(iview->format->plane_count == 1);
357
358 uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
359 const struct v3d_resource_slice *slice =
360 &image->planes[image_plane].slices[iview->vk.base_mip_level];
361 uint32_t layer_offset = v3dv_layer_offset(image,
362 iview->vk.base_mip_level,
363 iview->vk.base_array_layer + layer,
364 image_plane);
365
366 /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
367 * is broken in earlier V3D versions.
368 */
369 assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
370
371 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
372 store.buffer_to_store = buffer;
373 store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
374 store.clear_buffer_being_stored = clear;
375
376 store.output_image_format = iview->format->planes[0].rt_type;
377
378 /* If we create an image view with only the stencil format, we
379 * re-interpret the format as RGBA8_UINT, as that is what we want in
380 * general (see CreateImageView).
381 *
382 * However, when we are loading/storing tiles from the ZSTENCIL tile
383 * buffer, we need to use the underlying DS format.
384 */
385 if (buffer == ZSTENCIL &&
386 iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
387 assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
388 store.output_image_format = image->format->planes[image_plane].rt_type;
389 }
390
391 store.r_b_swap = iview->planes[0].swap_rb;
392 store.channel_reverse = iview->planes[0].channel_reverse;
393 store.memory_format = slice->tiling;
394
395 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
396 slice->tiling == V3D_TILING_UIF_XOR) {
397 store.height_in_ub_or_stride =
398 slice->padded_height_of_output_image_in_uif_blocks;
399 } else if (slice->tiling == V3D_TILING_RASTER) {
400 store.height_in_ub_or_stride = slice->stride;
401 }
402
403 if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
404 store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
405 else if (is_multisample_resolve)
406 store.decimate_mode = V3D_DECIMATE_MODE_4X;
407 else
408 store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
409 }
410 }
411
412 static bool
413 check_needs_clear(const struct v3dv_cmd_buffer_state *state,
414 VkImageAspectFlags aspect,
415 uint32_t first_subpass_idx,
416 VkAttachmentLoadOp load_op,
417 bool do_clear_with_draw)
418 {
419 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
420 * testing does not exist in the image.
421 */
422 if (!aspect)
423 return false;
424
425 /* If the aspect needs to be cleared with a draw call then we won't emit
426 * the clear here.
427 */
428 if (do_clear_with_draw)
429 return false;
430
431 /* If this is resuming a subpass started with another job, then attachment
432 * load operations don't apply.
433 */
434 if (state->job->is_subpass_continue)
435 return false;
436
437 /* If the render area is not aligned to tile boundaries we can't use the
438 * TLB for a clear.
439 */
440 if (!state->tile_aligned_render_area)
441 return false;
442
443 /* If this job is running in a subpass other than the first subpass in
444 * which this attachment (or view) is used then attachment load operations
445 * don't apply.
446 */
447 if (state->job->first_subpass != first_subpass_idx)
448 return false;
449
450 /* The attachment load operation must be CLEAR */
451 return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR;
452 }
453
454 static void
455 cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
456 struct v3dv_cl *cl,
457 uint32_t layer)
458 {
459 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
460 struct v3dv_render_pass *pass = state->pass;
461 const struct v3dv_subpass *subpass =
462 &pass->subpasses[state->subpass_idx];
463
464 bool has_stores = false;
465 bool use_global_zs_clear = false;
466 bool use_global_rt_clear = false;
467
468 assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);
469
470 /* FIXME: separate stencil */
471 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
472 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
473 const struct v3dv_render_pass_attachment *ds_attachment =
474 &state->pass->attachments[ds_attachment_idx];
475
476 assert(state->job->first_subpass >= ds_attachment->first_subpass);
477 assert(state->subpass_idx >= ds_attachment->first_subpass);
478 assert(state->subpass_idx <= ds_attachment->last_subpass);
479
480 /* From the Vulkan spec, VkImageSubresourceRange:
481 *
482 * "When an image view of a depth/stencil image is used as a
483 * depth/stencil framebuffer attachment, the aspectMask is ignored
484 * and both depth and stencil image subresources are used."
485 *
486 * So we ignore the aspects from the subresource range of the image
487 * view for the depth/stencil attachment, but we still need to restrict
488 * the store to the aspects compatible with the render pass and the image.
489 */
490 const VkImageAspectFlags aspects =
491 vk_format_aspects(ds_attachment->desc.format);
492
493 #if V3D_VERSION <= 42
494 /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
495 * for depth/stencil.
496 *
497 * There used to be some confusion regarding the Clear Tile Buffers
498 * Z/S bit also being broken, but we confirmed with Broadcom that this
499 * is not the case, it was just that some other hardware bugs (that we
500 * need to work around, such as GFXH-1461) could cause this bit to behave
501 * incorrectly.
502 *
503 * There used to be another issue where the RTs bit in the Clear Tile
504 * Buffers packet also cleared Z/S, but Broadcom confirmed this is
505 * fixed since V3D 4.1.
506 *
507 * So if we have to emit a clear of depth or stencil we don't use
508 * the per-buffer store clear bit, even if we need to store the buffers,
509 * instead we always have to use the Clear Tile Buffers Z/S bit.
510 * If we have configured the job to do early Z/S clearing, then we
511 * don't want to emit any Clear Tile Buffers command at all here.
512 *
513 * Note that GFXH-1689 is not reproduced in the simulator, where
514 * using the clear buffer bit in depth/stencil stores works fine.
515 */
516
517 /* Only clear once on the first subpass that uses the attachment */
518 uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
519 ds_attachment->first_subpass :
520 ds_attachment->views[layer].first_subpass;
521
522 bool needs_depth_clear =
523 check_needs_clear(state,
524 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
525 ds_first_subpass,
526 ds_attachment->desc.loadOp,
527 subpass->do_depth_clear_with_draw);
528
529 bool needs_stencil_clear =
530 check_needs_clear(state,
531 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
532 ds_first_subpass,
533 ds_attachment->desc.stencilLoadOp,
534 subpass->do_stencil_clear_with_draw);
535
536 use_global_zs_clear = !state->job->early_zs_clear &&
537 (needs_depth_clear || needs_stencil_clear);
538 #endif
539 #if V3D_VERSION >= 71
540 /* The store command's clear buffer bit cannot be used for Z/S:
541 * since V3D 4.5.6, Z/S buffers are automatically cleared between tiles,
542 * so we don't want to emit redundant clears here.
543 */
544 use_global_zs_clear = false;
545 #endif
546
547 /* Skip the last store if it is not required */
548 uint32_t ds_last_subpass = !pass->multiview_enabled ?
549 ds_attachment->last_subpass :
550 ds_attachment->views[layer].last_subpass;
551
552 bool needs_depth_store =
553 v3dv_cmd_buffer_check_needs_store(state,
554 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
555 ds_last_subpass,
556 ds_attachment->desc.storeOp);
557
558 bool needs_stencil_store =
559 v3dv_cmd_buffer_check_needs_store(state,
560 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
561 ds_last_subpass,
562 ds_attachment->desc.stencilStoreOp);
563
564 /* If we have a resolve, handle it before storing the tile */
565 const struct v3dv_cmd_buffer_attachment_state *ds_att_state =
566 &state->attachments[ds_attachment_idx];
567 if (ds_att_state->use_tlb_resolve) {
568 assert(ds_att_state->has_resolve);
569 assert(subpass->resolve_depth || subpass->resolve_stencil);
570 const uint32_t resolve_attachment_idx =
571 subpass->ds_resolve_attachment.attachment;
572 assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED);
573
574 const uint32_t zs_buffer =
575 v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil);
576 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
577 resolve_attachment_idx, layer,
578 zs_buffer,
579 false, false);
580 has_stores = true;
581 } else if (ds_att_state->has_resolve) {
582 /* If we can't use the TLB to implement the resolve we will need to
583 * store the attachment so we can implement it later using a blit.
584 */
585 needs_depth_store = subpass->resolve_depth;
586 needs_stencil_store = subpass->resolve_stencil;
587 }
588
589 if (needs_depth_store || needs_stencil_store) {
590 const uint32_t zs_buffer =
591 v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
592 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
593 ds_attachment_idx, layer,
594 zs_buffer, false, false);
595 has_stores = true;
596 }
597 }
598
599 for (uint32_t i = 0; i < subpass->color_count; i++) {
600 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
601
602 if (attachment_idx == VK_ATTACHMENT_UNUSED)
603 continue;
604
605 const struct v3dv_render_pass_attachment *attachment =
606 &state->pass->attachments[attachment_idx];
607
608 assert(state->job->first_subpass >= attachment->first_subpass);
609 assert(state->subpass_idx >= attachment->first_subpass);
610 assert(state->subpass_idx <= attachment->last_subpass);
611
612 /* Only clear once on the first subpass that uses the attachment */
613 uint32_t first_subpass = !pass->multiview_enabled ?
614 attachment->first_subpass :
615 attachment->views[layer].first_subpass;
616
617 bool needs_clear =
618 check_needs_clear(state,
619 VK_IMAGE_ASPECT_COLOR_BIT,
620 first_subpass,
621 attachment->desc.loadOp,
622 false);
623
624 /* Skip the last store if it is not required */
625 uint32_t last_subpass = !pass->multiview_enabled ?
626 attachment->last_subpass :
627 attachment->views[layer].last_subpass;
628
629 bool needs_store =
630 v3dv_cmd_buffer_check_needs_store(state,
631 VK_IMAGE_ASPECT_COLOR_BIT,
632 last_subpass,
633 attachment->desc.storeOp);
634
635 /* If we need to resolve this attachment emit that store first. Notice
636 * that we must not request a tile buffer clear here in that case, since
637 * that would clear the tile buffer before we get to emit the actual
638 * color attachment store below, since the clear happens after the
639 * store is completed.
640 *
641 * If the attachment doesn't support TLB resolves (or the render area
642 * is not aligned to tile boundaries) then we will have to fallback to
643 * doing the resolve in a shader separately after this job, so we will
644 * need to store the multisampled attachment even if that wasn't
645 * requested by the client.
646 */
647 const struct v3dv_cmd_buffer_attachment_state *att_state =
648 &state->attachments[attachment_idx];
649 if (att_state->use_tlb_resolve) {
650 assert(att_state->has_resolve);
651 const uint32_t resolve_attachment_idx =
652 subpass->resolve_attachments[i].attachment;
653 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
654 resolve_attachment_idx, layer,
655 RENDER_TARGET_0 + i,
656 false, true);
657 has_stores = true;
658 } else if (att_state->has_resolve) {
659 needs_store = true;
660 }
661
662 /* Emit the color attachment store if needed */
663 if (needs_store) {
664 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
665 attachment_idx, layer,
666 RENDER_TARGET_0 + i,
667 needs_clear && !use_global_rt_clear,
668 false);
669 has_stores = true;
670 } else if (needs_clear) {
671 use_global_rt_clear = true;
672 }
673 }
674
675 /* We always need to emit at least one dummy store */
676 if (!has_stores) {
677 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
678 store.buffer_to_store = NONE;
679 }
680 }
681
682 /* If we have any depth/stencil clears we can't use the per-buffer clear
683 * bit and instead we have to emit a single clear of all tile buffers.
684 */
685 if (use_global_zs_clear || use_global_rt_clear) {
686 #if V3D_VERSION == 42
687 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
688 clear.clear_z_stencil_buffer = use_global_zs_clear;
689 clear.clear_all_render_targets = use_global_rt_clear;
690 }
691 #endif
692 #if V3D_VERSION >= 71
693 cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
694 #endif
695 }
696 }
697
698 static void
699 cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer,
700 uint32_t layer)
701 {
702 struct v3dv_job *job = cmd_buffer->state.job;
703 assert(job);
704
705 /* Emit the generic list in our indirect state -- the rcl will just
706 * have pointers into it.
707 */
708 struct v3dv_cl *cl = &job->indirect;
709 v3dv_cl_ensure_space(cl, 200, 1);
710 v3dv_return_if_oom(cmd_buffer, NULL);
711
712 struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
713
714 cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
715
716 cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer);
717
718 /* The binner starts out writing tiles assuming that the initial mode
719 * is triangles, so make sure that's the case.
720 */
721 cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
722 fmt.primitive_type = LIST_TRIANGLES;
723 }
724
725 /* The PTB assumes this value is 0, but the HW will not set it. */
726 cl_emit(cl, SET_INSTANCEID, set) {
727 set.instance_id = 0;
728 }
729
730 cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
731
732 cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer);
733
734 cl_emit(cl, END_OF_TILE_MARKER, end);
735
736 cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
737
738 cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
739 branch.start = tile_list_start;
740 branch.end = v3dv_cl_get_address(cl);
741 }
742 }
743
744 static void
745 cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
746 uint32_t layer)
747 {
748 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
749
750 struct v3dv_job *job = cmd_buffer->state.job;
751 struct v3dv_cl *rcl = &job->rcl;
752
753 /* If doing multicore binning, we would need to initialize each
754 * core's tile list here.
755 */
756 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
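   /* Each layer gets its own region of the tile allocation BO. The 64 bytes
    * per tile here presumably matches the initial tile list block size
    * (TILE_ALLOCATION_BLOCK_SIZE_64B) programmed in the RCL.
    */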
757 const uint32_t tile_alloc_offset =
758 64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
759 cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
760 list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
761 }
762
763 cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);
764
765 uint32_t supertile_w_in_pixels =
766 tiling->tile_width * tiling->supertile_width;
767 uint32_t supertile_h_in_pixels =
768 tiling->tile_height * tiling->supertile_height;
769 const uint32_t min_x_supertile =
770 state->render_area.offset.x / supertile_w_in_pixels;
771 const uint32_t min_y_supertile =
772 state->render_area.offset.y / supertile_h_in_pixels;
773
774 uint32_t max_render_x = state->render_area.offset.x;
775 if (state->render_area.extent.width > 0)
776 max_render_x += state->render_area.extent.width - 1;
777 uint32_t max_render_y = state->render_area.offset.y;
778 if (state->render_area.extent.height > 0)
779 max_render_y += state->render_area.extent.height - 1;
780 const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
781 const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
782
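   /* Emit a SUPERTILE_COORDINATES item for every supertile that overlaps the
    * render area; only those supertiles will be rendered.
    */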
783 for (int y = min_y_supertile; y <= max_y_supertile; y++) {
784 for (int x = min_x_supertile; x <= max_x_supertile; x++) {
785 cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
786 coords.column_number_in_supertiles = x;
787 coords.row_number_in_supertiles = y;
788 }
789 }
790 }
791 }
792
793 static void
794 set_rcl_early_z_config(struct v3dv_job *job,
795 bool *early_z_disable,
796 uint32_t *early_z_test_and_update_direction)
797 {
798 /* Disable if none of the draw calls in this job enabled EZ */
799 if (!job->has_ez_draws) {
800 *early_z_disable = true;
801 return;
802 }
803
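   /* first_ez_state records the EZ direction established by the job's draws;
    * UNDECIDED means none of them constrained it, in which case defaulting to
    * LT_LE is presumably safe.
    */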
804 switch (job->first_ez_state) {
805 case V3D_EZ_UNDECIDED:
806 case V3D_EZ_LT_LE:
807 *early_z_disable = false;
808 *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
809 break;
810 case V3D_EZ_GT_GE:
811 *early_z_disable = false;
812 *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
813 break;
814 case V3D_EZ_DISABLED:
815 *early_z_disable = true;
816 break;
817 }
818 }
819
820 /* Note that for v71, render target cfg packets have just one field that
821 * combines the internal type and clamp mode. For simplicity we keep just one
822 * helper.
823 *
824 * Note: rt_type is in fact an "enum V3DX(Internal_Type)".
825 *
826 * FIXME: for v71 we are not returning all the possible combinations of
827 * render target internal type and clamp. For example, for int types we are
828 * always using clamp int, and for 16f we are using clamp none or pos (which
829 * seems to be the equivalent of no-clamp on 4.2), but not pq or hlg. In
830 * summary, right now we are just porting what we were doing on 4.2.
831 */
832 uint32_t
833 v3dX(clamp_for_format_and_type)(uint32_t rt_type,
834 VkFormat vk_format)
835 {
836 #if V3D_VERSION == 42
837 if (vk_format_is_int(vk_format))
838 return V3D_RENDER_TARGET_CLAMP_INT;
839 else if (vk_format_is_srgb(vk_format))
840 return V3D_RENDER_TARGET_CLAMP_NORM;
841 else
842 return V3D_RENDER_TARGET_CLAMP_NONE;
843 #endif
844 #if V3D_VERSION >= 71
845 switch (rt_type) {
846 case V3D_INTERNAL_TYPE_8I:
847 return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
848 case V3D_INTERNAL_TYPE_8UI:
849 return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
850 case V3D_INTERNAL_TYPE_8:
851 return V3D_RENDER_TARGET_TYPE_CLAMP_8;
852 case V3D_INTERNAL_TYPE_16I:
853 return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
854 case V3D_INTERNAL_TYPE_16UI:
855 return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
856 case V3D_INTERNAL_TYPE_16F:
857 return vk_format_is_srgb(vk_format) ?
858 V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
859 V3D_RENDER_TARGET_TYPE_CLAMP_16F;
860 case V3D_INTERNAL_TYPE_32I:
861 return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
862 case V3D_INTERNAL_TYPE_32UI:
863 return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
864 case V3D_INTERNAL_TYPE_32F:
865 return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
866 default:
867 unreachable("Unknown internal render target type");
868 }
869
870 return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
871 #endif
872 }
873
874 static void
875 cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
876 int rt,
877 uint32_t *rt_bpp,
878 #if V3D_VERSION == 42
879 uint32_t *rt_type,
880 uint32_t *rt_clamp)
881 #else
882 uint32_t *rt_type_clamp)
883 #endif
884 {
885 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
886
887 assert(state->subpass_idx < state->pass->subpass_count);
888 const struct v3dv_subpass *subpass =
889 &state->pass->subpasses[state->subpass_idx];
890
891 if (rt >= subpass->color_count)
892 return;
893
894 struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
895 const uint32_t attachment_idx = attachment->attachment;
896 if (attachment_idx == VK_ATTACHMENT_UNUSED)
897 return;
898
899 assert(attachment_idx < state->framebuffer->attachment_count &&
900 attachment_idx < state->attachment_alloc_count);
901 struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
902 assert(vk_format_is_color(iview->vk.format));
903
904 assert(iview->plane_count == 1);
905 *rt_bpp = iview->planes[0].internal_bpp;
906 #if V3D_VERSION == 42
907 *rt_type = iview->planes[0].internal_type;
908 *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
909 iview->vk.format);
910 #endif
911 #if V3D_VERSION >= 71
912 *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
913 iview->vk.format);
914 #endif
915 }
916
917 void
918 v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
919 {
920 struct v3dv_job *job = cmd_buffer->state.job;
921 assert(job);
922
923 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
924 const struct v3dv_framebuffer *framebuffer = state->framebuffer;
925
926 /* We can't emit the RCL until we have a framebuffer, which we may not have
927 * if we are recording a secondary command buffer. In that case, we will
928 * have to wait until vkCmdExecuteCommands is called from a primary command
929 * buffer.
930 */
931 if (!framebuffer) {
932 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
933 return;
934 }
935
936 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
937
938 const uint32_t fb_layers = job->frame_tiling.layers;
939
940 v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
941 MAX2(fb_layers, 1) * 256 *
942 cl_packet_length(SUPERTILE_COORDINATES));
943 v3dv_return_if_oom(cmd_buffer, NULL);
944
945 assert(state->subpass_idx < state->pass->subpass_count);
946 const struct v3dv_render_pass *pass = state->pass;
947 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
948 struct v3dv_cl *rcl = &job->rcl;
949
950 /* Common config must be the first TILE_RENDERING_MODE_CFG and
951 * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
952 * updates to the previous HW state.
953 */
954 bool do_early_zs_clear = false;
955 const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
956 assert(!tiling->msaa || !tiling->double_buffer);
957 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
958 config.image_width_pixels = framebuffer->width;
959 config.image_height_pixels = framebuffer->height;
960 config.number_of_render_targets = MAX2(subpass->color_count, 1);
961 config.multisample_mode_4x = tiling->msaa;
962 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
963 #if V3D_VERSION == 42
964 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
965 #endif
966 #if V3D_VERSION >= 71
967 config.log2_tile_width = log2_tile_size(tiling->tile_width);
968 config.log2_tile_height = log2_tile_size(tiling->tile_height);
969 /* FIXME: ideally we would like the next assert to be on the packet header
970 * (as it is generic, so it also applies to GL). We would need to expand
971 * gen_pack_header for that.
972 */
973 assert(config.log2_tile_width == config.log2_tile_height ||
974 config.log2_tile_width == config.log2_tile_height + 1);
975 #endif
976
977 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
978 const struct v3dv_image_view *iview =
979 state->attachments[ds_attachment_idx].image_view;
980
981 /* At this point the image view should be single-plane. But note that
982 * the underlying image can be multi-plane, and the image view refers
983 * to one specific plane.
984 */
985 assert(iview->plane_count == 1);
986 assert(iview->format->plane_count == 1);
987 config.internal_depth_type = iview->planes[0].internal_type;
988
989 set_rcl_early_z_config(job,
990 &config.early_z_disable,
991 &config.early_z_test_and_update_direction);
992
993 /* Early-Z/S clear can be enabled if the job is clearing and not
994 * storing (or loading) depth. If a stencil aspect is also present
995 * we have the same requirements for it, however, in this case we
996 * can accept stencil loadOp DONT_CARE as well, so instead of
997 * checking that stencil is cleared we check that is not loaded.
998 *
999 * Early-Z/S clearing is independent of Early Z/S testing, so it is
1000 * possible to enable one but not the other so long as their
1001 * respective requirements are met.
1002 *
1003 * From V3D 4.5.6, Z/S buffers are always cleared automatically
1004 * between tiles, but we still want to enable early ZS clears
1005 * when Z/S are not loaded or stored.
1006 */
1007 struct v3dv_render_pass_attachment *ds_attachment =
1008 &pass->attachments[ds_attachment_idx];
1009
1010 const VkImageAspectFlags ds_aspects =
1011 vk_format_aspects(ds_attachment->desc.format);
1012
1013 bool needs_depth_store =
1014 v3dv_cmd_buffer_check_needs_store(state,
1015 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1016 ds_attachment->last_subpass,
1017 ds_attachment->desc.storeOp) ||
1018 subpass->resolve_depth;
1019 #if V3D_VERSION <= 42
1020 bool needs_depth_clear =
1021 check_needs_clear(state,
1022 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1023 ds_attachment->first_subpass,
1024 ds_attachment->desc.loadOp,
1025 subpass->do_depth_clear_with_draw);
1026
1027 do_early_zs_clear = needs_depth_clear && !needs_depth_store;
1028 #endif
1029 #if V3D_VERSION >= 71
1030 bool needs_depth_load =
1031 v3dv_cmd_buffer_check_needs_load(state,
1032 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1033 ds_attachment->first_subpass,
1034 ds_attachment->desc.loadOp,
1035 ds_attachment->last_subpass,
1036 ds_attachment->desc.storeOp);
1037 do_early_zs_clear = !needs_depth_load && !needs_depth_store;
1038 #endif
1039
1040 if (do_early_zs_clear &&
1041 vk_format_has_stencil(ds_attachment->desc.format)) {
1042 bool needs_stencil_load =
1043 v3dv_cmd_buffer_check_needs_load(state,
1044 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1045 ds_attachment->first_subpass,
1046 ds_attachment->desc.stencilLoadOp,
1047 ds_attachment->last_subpass,
1048 ds_attachment->desc.stencilStoreOp);
1049
1050 bool needs_stencil_store =
1051 v3dv_cmd_buffer_check_needs_store(state,
1052 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1053 ds_attachment->last_subpass,
1054 ds_attachment->desc.stencilStoreOp) ||
1055 subpass->resolve_stencil;
1056
1057 do_early_zs_clear = !needs_stencil_load && !needs_stencil_store;
1058 }
1059
1060 config.early_depth_stencil_clear = do_early_zs_clear;
1061 } else {
1062 config.early_z_disable = true;
1063 }
1064 }
1065
1066 /* If we enabled early Z/S clear, then we can't emit any "Clear Tile Buffers"
1067 * commands with the Z/S bit set, so keep track of whether we enabled this
1068 * in the job so we can skip these later.
1069 */
1070 job->early_zs_clear = do_early_zs_clear;
1071
1072 #if V3D_VERSION >= 71
1073 uint32_t base_addr = 0;
1074 #endif
1075 for (uint32_t i = 0; i < subpass->color_count; i++) {
1076 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1077 if (attachment_idx == VK_ATTACHMENT_UNUSED) {
1078 #if V3D_VERSION >= 71
1079 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1080 rt.render_target_number = i;
1081 rt.stride = 1; /* Unused */
1082 }
1083 #endif
1084 continue;
1085 }
1086
1087 struct v3dv_image_view *iview =
1088 state->attachments[attachment_idx].image_view;
1089 assert(iview->plane_count == 1);
1090
1091 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
1092
1093 uint8_t plane = v3dv_plane_from_aspect(iview->vk.aspects);
1094 const struct v3d_resource_slice *slice =
1095 &image->planes[plane].slices[iview->vk.base_mip_level];
1096
1097 UNUSED const uint32_t *clear_color =
1098 &state->attachments[attachment_idx].clear_value.color[0];
1099
1100 UNUSED uint32_t clear_pad = 0;
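      /* If the slice's UIF padding differs enough from the implicit padding
       * the HW would derive from the image height, we presumably need to pass
       * the padded height explicitly along with the clear configuration.
       */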
1101 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
1102 slice->tiling == V3D_TILING_UIF_XOR) {
1103 int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
1104
1105 uint32_t implicit_padded_height =
1106 align(framebuffer->height, uif_block_height) / uif_block_height;
1107
1108 if (slice->padded_height_of_output_image_in_uif_blocks -
1109 implicit_padded_height >= 15) {
1110 clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
1111 }
1112 }
1113
1114 #if V3D_VERSION == 42
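      /* The packed 128-bit clear color is split across up to three packets:
       * PART1 carries bits 0..55, PART2 bits 56..111 and PART3 bits 112..127
       * (plus the UIF padding), with the higher parts only needed for wider
       * internal formats.
       */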
1115 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
1116 clear.clear_color_low_32_bits = clear_color[0];
1117 clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
1118 clear.render_target_number = i;
1119 };
1120
1121 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
1122 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
1123 clear.clear_color_mid_low_32_bits =
1124 ((clear_color[1] >> 24) | (clear_color[2] << 8));
1125 clear.clear_color_mid_high_24_bits =
1126 ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
1127 clear.render_target_number = i;
1128 };
1129 }
1130
1131 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
1132 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
1133 clear.uif_padded_height_in_uif_blocks = clear_pad;
1134 clear.clear_color_high_16_bits = clear_color[3] >> 16;
1135 clear.render_target_number = i;
1136 };
1137 }
1138 #endif
1139
1140 #if V3D_VERSION >= 71
1141 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1142 rt.clear_color_low_bits = clear_color[0];
1143 cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
1144 &rt.internal_type_and_clamping);
1145 rt.stride =
1146 v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
1147 v3d_internal_bpp_words(rt.internal_bpp));
1148 rt.base_address = base_addr;
1149 rt.render_target_number = i;
1150
1151 /* base_addr in multiples of 512 bits. We divide by 8 because stride
1152 * is in 128-bit units, but it is packing 2 rows worth of data, so we
1153 * need to divide it by 2 so it is only 1 row, and then again by 4 so
1154 * it is in 512-bit units.
1155 */
1156 base_addr += (tiling->tile_height * rt.stride) / 8;
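         /* Worked example (assuming a 64x64 tile and a 32bpp render target):
          * stride is 32 (128-bit units, each covering 2 rows), so the
          * increment is 64 * 32 / 8 = 256 multiples of 512 bits, i.e. 16KB,
          * exactly one 64x64x4-byte tile.
          */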
1157 }
1158
1159 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
1160 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
1161 rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
1162 ((uint64_t) clear_color[1]) |
1163 (((uint64_t) (clear_color[2] & 0xff)) << 32);
1164 rt.render_target_number = i;
1165 }
1166 }
1167
1168 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
1169 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
1170 rt.clear_color_top_bits = /* 56 bits (24 + 32) */
1171 (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
1172 (((uint64_t) (clear_color[3])) << 24);
1173 rt.render_target_number = i;
1174 }
1175 }
1176 #endif
1177 }
1178
1179 #if V3D_VERSION >= 71
1180 /* If we don't have any color RTs, we still need to emit one and flag
1181 * it as not used using stride = 1.
1182 */
1183 if (subpass->color_count == 0) {
1184 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1185 rt.stride = 1;
1186 }
1187 }
1188 #endif
1189
1190 #if V3D_VERSION == 42
1191 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
1192 cmd_buffer_render_pass_setup_render_target
1193 (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
1194 &rt.render_target_0_internal_type, &rt.render_target_0_clamp);
1195 cmd_buffer_render_pass_setup_render_target
1196 (cmd_buffer, 1, &rt.render_target_1_internal_bpp,
1197 &rt.render_target_1_internal_type, &rt.render_target_1_clamp);
1198 cmd_buffer_render_pass_setup_render_target
1199 (cmd_buffer, 2, &rt.render_target_2_internal_bpp,
1200 &rt.render_target_2_internal_type, &rt.render_target_2_clamp);
1201 cmd_buffer_render_pass_setup_render_target
1202 (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
1203 &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
1204 }
1205 #endif
1206
1207 /* Ends rendering mode config. */
1208 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
1209 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
1210 clear.z_clear_value =
1211 state->attachments[ds_attachment_idx].clear_value.z;
1212 clear.stencil_clear_value =
1213 state->attachments[ds_attachment_idx].clear_value.s;
1214 };
1215 } else {
1216 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
1217 clear.z_clear_value = 1.0f;
1218 clear.stencil_clear_value = 0;
1219 };
1220 }
1221
1222 /* Always set initial block size before the first branch, which needs
1223 * to match the value from binning mode config.
1224 */
1225 cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
1226 init.use_auto_chained_tile_lists = true;
1227 init.size_of_first_block_in_chained_tile_lists =
1228 TILE_ALLOCATION_BLOCK_SIZE_64B;
1229 }
1230
1231 cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
1232 config.number_of_bin_tile_lists = 1;
1233 config.total_frame_width_in_tiles = tiling->draw_tiles_x;
1234 config.total_frame_height_in_tiles = tiling->draw_tiles_y;
1235
1236 config.supertile_width_in_tiles = tiling->supertile_width;
1237 config.supertile_height_in_tiles = tiling->supertile_height;
1238
1239 config.total_frame_width_in_supertiles =
1240 tiling->frame_width_in_supertiles;
1241 config.total_frame_height_in_supertiles =
1242 tiling->frame_height_in_supertiles;
1243 }
1244
1245 /* Emit an initial clear of the tile buffers. This is necessary
1246 * for any buffers that should be cleared (since clearing
1247 * normally happens at the *end* of the generic tile list), but
1248 * it's also nice to clear everything so the first tile doesn't
1249 * inherit any contents from some previous frame.
1250 *
1251 * Also, implement the GFXH-1742 workaround. There's a race in
1252 * the HW between the RCL updating the TLB's internal type/size
1253 * and the spawning of the QPU instances using the TLB's current
1254 * internal type/size. To make sure the QPUs get the right
1255 * state, we need 1 dummy store in between internal type/size
1256 * changes on V3D 3.x, and 2 dummy stores on 4.x.
1257 */
1258 for (int i = 0; i < 2; i++) {
1259 cl_emit(rcl, TILE_COORDINATES, coords);
1260 cl_emit(rcl, END_OF_LOADS, end);
1261 cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
1262 store.buffer_to_store = NONE;
1263 }
1264 if (cmd_buffer->state.tile_aligned_render_area &&
1265 (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
1266 #if V3D_VERSION == 42
1267 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
1268 clear.clear_z_stencil_buffer = !job->early_zs_clear;
1269 clear.clear_all_render_targets = true;
1270 }
1271 #endif
1272 #if V3D_VERSION >= 71
1273 cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
1274 #endif
1275 }
1276 cl_emit(rcl, END_OF_TILE_MARKER, end);
1277 }
1278
1279 cl_emit(rcl, FLUSH_VCD_CACHE, flush);
1280
1281 for (int layer = 0; layer < MAX2(1, fb_layers); layer++) {
1282 if (subpass->view_mask == 0 || (subpass->view_mask & (1u << layer)))
1283 cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer);
1284 }
1285
1286 cl_emit(rcl, END_OF_RENDERING, end);
1287 }
1288
1289 void
1290 v3dX(viewport_compute_xform)(const VkViewport *viewport,
1291 float scale[3],
1292 float translate[3])
1293 {
1294 float x = viewport->x;
1295 float y = viewport->y;
1296 float half_width = 0.5f * viewport->width;
1297 float half_height = 0.5f * viewport->height;
1298 double n = viewport->minDepth;
1299 double f = viewport->maxDepth;
1300
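   /* This maps NDC x/y in [-1, 1] to window coordinates and NDC z in [0, 1]
    * (the Vulkan convention) to the [minDepth, maxDepth] range.
    */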
1301 scale[0] = half_width;
1302 translate[0] = half_width + x;
1303 scale[1] = half_height;
1304 translate[1] = half_height + y;
1305
1306 scale[2] = (f - n);
1307 translate[2] = n;
1308
1309 /* It seems that if the scale is small enough the hardware won't clip
1310 * correctly, so we work around this by choosing the smallest scale that
1311 * seems to work.
1312 *
1313 * This case is exercised by CTS:
1314 * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
1315 *
1316 * V3D 7.x fixes this by using the new
1317 * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
1318 */
1319 #if V3D_VERSION <= 42
1320 const float min_abs_scale = 0.0005f;
1321 if (fabs(scale[2]) < min_abs_scale)
1322 scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
1323 #endif
1324 }
1325
1326 void
1327 v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
1328 {
1329 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1330 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1331 assert(pipeline);
1332
1333 /* FIXME: right now we don't support multiViewport, so using viewports[0]
1334 * works, but this would need to change if we allow multiple viewports.
1335 */
1336 float *vptranslate = dynamic->viewport.translate[0];
1337 float *vpscale = dynamic->viewport.scale[0];
1338
1339 struct v3dv_job *job = cmd_buffer->state.job;
1340 assert(job);
1341
1342 const uint32_t required_cl_size =
1343 cl_packet_length(CLIPPER_XY_SCALING) +
1344 cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
1345 cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
1346 cl_packet_length(VIEWPORT_OFFSET);
1347 v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
1348 v3dv_return_if_oom(cmd_buffer, NULL);
1349
1350 #if V3D_VERSION == 42
1351 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1352 clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
1353 clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
1354 }
1355 #endif
1356 #if V3D_VERSION >= 71
1357 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1358 clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
1359 clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
1360 }
1361 #endif
1362
1363 float translate_z, scale_z;
1364 v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0,
1365 &translate_z, &scale_z);
1366
1367 #if V3D_VERSION == 42
1368 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1369 clip.viewport_z_offset_zc_to_zs = translate_z;
1370 clip.viewport_z_scale_zc_to_zs = scale_z;
1371 }
1372 #endif
1373
1374 #if V3D_VERSION >= 71
1375 /* If the Z scale is too small guardband clipping may not clip correctly */
1376 if (fabsf(scale_z) < 0.01f) {
1377 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
1378 clip.viewport_z_offset_zc_to_zs = translate_z;
1379 clip.viewport_z_scale_zc_to_zs = scale_z;
1380 }
1381 } else {
1382 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1383 clip.viewport_z_offset_zc_to_zs = translate_z;
1384 clip.viewport_z_scale_zc_to_zs = scale_z;
1385 }
1386 }
1387 #endif
1388
1389 cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
1390 /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
1391 * we are using OpenGL's [-1, 1] instead.
1392 */
1393 float z1 = pipeline->negative_one_to_one ? translate_z - scale_z :
1394 translate_z;
1395 float z2 = translate_z + scale_z;
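      /* In both conventions z1 and z2 end up being the viewport's minDepth
       * and maxDepth (in some order), assuming the z xform helper has already
       * folded the [-1, 1] adjustment into scale_z/translate_z.
       */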
1396 clip.minimum_zw = MIN2(z1, z2);
1397 clip.maximum_zw = MAX2(z1, z2);
1398 }
1399
1400 cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
1401 float vp_fine_x = vptranslate[0];
1402 float vp_fine_y = vptranslate[1];
1403 int32_t vp_coarse_x = 0;
1404 int32_t vp_coarse_y = 0;
1405
1406 /* The fine coordinates must be unsigned, but coarse can be signed */
1407 if (unlikely(vp_fine_x < 0)) {
1408 int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64);
1409 vp_fine_x += 64.0f * blocks_64;
1410 vp_coarse_x -= blocks_64;
1411 }
1412
1413 if (unlikely(vp_fine_y < 0)) {
1414 int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64);
1415 vp_fine_y += 64.0f * blocks_64;
1416 vp_coarse_y -= blocks_64;
1417 }
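      /* Example: vptranslate[0] == -70.5 gives blocks_64 = 2, so we end up
       * with fine_x = 57.5 and coarse_x = -2 (two 64-pixel blocks).
       */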
1418
1419 vp.fine_x = vp_fine_x;
1420 vp.fine_y = vp_fine_y;
1421 vp.coarse_x = vp_coarse_x;
1422 vp.coarse_y = vp_coarse_y;
1423 }
1424
1425 BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1426 MESA_VK_DYNAMIC_VP_VIEWPORTS);
1427 }
1428
1429 void
1430 v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer)
1431 {
1432 struct v3dv_job *job = cmd_buffer->state.job;
1433 assert(job);
1434
1435 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1436 struct vk_dynamic_graphics_state *dyn =
1437 &cmd_buffer->vk.dynamic_graphics_state;
1438 bool has_stencil =
1439 pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED;
1440
1441 if (!(dyn->ds.stencil.test_enable && has_stencil))
1442 return;
1443
1444 v3dv_cl_ensure_space_with_branch(&job->bcl,
1445 2 * cl_packet_length(STENCIL_CFG));
1446 v3dv_return_if_oom(cmd_buffer, NULL);
1447
1448 bool any_dynamic_stencil_state =
1449 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
1450 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
1451 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
1452 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP);
1453
1454 bool emitted_stencil = false;
1455 const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
1456 const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
1457
1458 const bool needs_front_and_back = any_dynamic_stencil_state ?
1459 memcmp(front, back, sizeof(*front)) != 0 :
1460 pipeline->emit_stencil_cfg[1] == true;
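   /* If the front and back configurations are identical we can emit a single
    * STENCIL_CFG packet flagged as applying to both faces.
    */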
1461
1462 for (uint32_t i = 0; i < 2; i++) {
1463 if (any_dynamic_stencil_state) {
1464 const struct vk_stencil_test_face_state *stencil_state =
1465 i == 0 ? front : back;
1466 /* If we have any dynamic stencil state we just emit the entire
1467 * packet for simplicity.
1468 */
1469 cl_emit(&job->bcl, STENCIL_CFG, config) {
1470 config.front_config = !needs_front_and_back || i == 0;
1471 config.back_config = !needs_front_and_back || i == 1;
1472 config.stencil_test_mask = stencil_state->compare_mask & 0xff;
1473 config.stencil_write_mask = stencil_state->write_mask & 0xff;
1474 config.stencil_ref_value = stencil_state->reference & 0xff;
1475 config.stencil_test_function = stencil_state->op.compare;
1476 config.stencil_pass_op =
1477 v3dX(translate_stencil_op)(stencil_state->op.pass);
1478 config.depth_test_fail_op =
1479 v3dX(translate_stencil_op)(stencil_state->op.depth_fail);
1480 config.stencil_test_fail_op =
1481 v3dX(translate_stencil_op)(stencil_state->op.fail);
1482 }
1483 } else {
1484 assert(pipeline->emit_stencil_cfg[i]);
1485 cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]);
1486 }
1487 emitted_stencil = true;
1488
1489 if (!needs_front_and_back)
1490 break;
1491 }
1492 if (emitted_stencil) {
1493 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK);
1494 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE);
1495 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
1496 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP);
1497 }
1498 }
1499
1500 void
1501 v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
1502 {
1503 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1504 assert(pipeline);
1505 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1506
1507 if (!dyn->rs.depth_bias.enable)
1508 return;
1509
1510 struct v3dv_job *job = cmd_buffer->state.job;
1511 assert(job);
1512
1513 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET));
1514 v3dv_return_if_oom(cmd_buffer, NULL);
1515
1516 cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
1517 bias.depth_offset_factor = dyn->rs.depth_bias.slope_factor;
1518 bias.depth_offset_units = dyn->rs.depth_bias.constant_factor;
1519 #if V3D_VERSION <= 42
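      /* Presumably the hardware applies the offset units at a higher depth
       * precision than 16 bits, so for D16_UNORM we scale them up by 2^8 to
       * get an equivalent bias.
       */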
1520 if (pipeline->rendering_info.depth_attachment_format == VK_FORMAT_D16_UNORM)
1521 bias.depth_offset_units *= 256.0f;
1522 #endif
1523 bias.limit = dyn->rs.depth_bias.clamp;
1524 }
1525
1526 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
1527 }
1528
1529 void
1530 v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
1531 {
1532 /* No depthBounds support for v42, so this method is empty in that case.
1533 *
1534     * Note that this method is still called because v3dv_job_init flags all state
1535 * as dirty. See FIXME note in v3dv_job_init.
1536 */
1537 #if V3D_VERSION >= 71
1538 struct vk_dynamic_graphics_state *dyn =
1539 &cmd_buffer->vk.dynamic_graphics_state;
1540
1541 if (!dyn->ds.depth.bounds_test.enable)
1542 return;
1543
1544 struct v3dv_job *job = cmd_buffer->state.job;
1545 assert(job);
1546
1547 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
1548 v3dv_return_if_oom(cmd_buffer, NULL);
1549
1550 cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
1551 bounds.lower_test_limit = dyn->ds.depth.bounds_test.min;
1552 bounds.upper_test_limit = dyn->ds.depth.bounds_test.max;
1553 }
1554 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS);
1555 #endif
1556 }
1557
1558 void
1559 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
1560 {
1561 struct v3dv_job *job = cmd_buffer->state.job;
1562 assert(job);
1563
1564 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1565
1566 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH));
1567 v3dv_return_if_oom(cmd_buffer, NULL);
1568
1569 cl_emit(&job->bcl, LINE_WIDTH, line) {
1570 line.line_width = v3dv_get_aa_line_width(cmd_buffer->state.gfx.pipeline,
1571 cmd_buffer);
1572 }
1573
1574 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH);
1575 }
1576
1577 void
1578 v3dX(cmd_buffer_emit_default_point_size)(struct v3dv_cmd_buffer *cmd_buffer)
1579 {
1580 struct v3dv_job *job = cmd_buffer->state.job;
1581 assert(job);
1582
1583 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(POINT_SIZE));
1584 v3dv_return_if_oom(cmd_buffer, NULL);
1585
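   /* Emit a default point size of 1.0, presumably for draws where the shaders
    * do not produce a point size themselves; callers track this through
    * job->emitted_default_point_size.
    */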
1586 cl_emit(&job->bcl, POINT_SIZE, point) {
1587 point.point_size = 1.0f;
1588 }
1589
1590 job->emitted_default_point_size = true;
1591 }
1592
1593 void
1594 v3dX(cmd_buffer_emit_sample_state)(struct v3dv_cmd_buffer *cmd_buffer)
1595 {
1596 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1597 assert(pipeline);
1598
1599 struct v3dv_job *job = cmd_buffer->state.job;
1600 assert(job);
1601
1602 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE));
1603 v3dv_return_if_oom(cmd_buffer, NULL);
1604
1605 cl_emit(&job->bcl, SAMPLE_STATE, state) {
1606 state.coverage = 1.0f;
1607 state.mask = pipeline->sample_mask;
1608 }
1609 }
1610
1611 void
1612 v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
1613 {
1614 struct v3dv_job *job = cmd_buffer->state.job;
1615 assert(job);
1616
1617 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1618 assert(pipeline);
1619
1620 const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
1621 const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
1622
1623 const uint32_t blend_packets_size =
1624 cl_packet_length(BLEND_ENABLES) +
1625 cl_packet_length(BLEND_CONSTANT_COLOR) +
1626 cl_packet_length(BLEND_CFG) * max_color_rts;
1627
1628 v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
1629 v3dv_return_if_oom(cmd_buffer, NULL);
1630
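   /* Blend enables and the per-RT blend configs are prepacked in the
    * pipeline, so they only need to be re-emitted when the pipeline changes.
    * Blend constants are dynamic state and are handled separately below.
    */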
1631 if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
1632 if (pipeline->blend.enables) {
1633 cl_emit(&job->bcl, BLEND_ENABLES, enables) {
1634 enables.mask = pipeline->blend.enables;
1635 }
1636 }
1637
1638 for (uint32_t i = 0; i < max_color_rts; i++) {
1639 if (pipeline->blend.enables & (1 << i))
1640 cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
1641 }
1642 }
1643
1644 if (pipeline->blend.needs_color_constants) {
1645 const struct vk_dynamic_graphics_state *dyn =
1646 &cmd_buffer->vk.dynamic_graphics_state;
1647
1648 cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
1649 color.red_f16 = _mesa_float_to_half(dyn->cb.blend_constants[0]);
1650 color.green_f16 = _mesa_float_to_half(dyn->cb.blend_constants[1]);
1651 color.blue_f16 = _mesa_float_to_half(dyn->cb.blend_constants[2]);
1652 color.alpha_f16 = _mesa_float_to_half(dyn->cb.blend_constants[3]);
1653 }
1654 }
1655
1656 BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1657 MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
1658 }
1659
1660 void
1661 v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
1662 {
1663 struct v3dv_job *job = cmd_buffer->state.job;
1664 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS));
1665
1666 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1667 struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
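   /* The packet mask bits disable writes, so the dynamic color write enable
    * mask is inverted and combined with the per-channel masks prepacked in
    * the pipeline.
    */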
1668 uint32_t color_write_mask = ~v3dv_dyn->color_write_enable |
1669 pipeline->blend.color_write_masks;
1670
1671 #if V3D_VERSION <= 42
1672 /* Only 4 RTs */
1673 color_write_mask &= 0xffff;
1674 #endif
1675
1676 cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
1677 mask.mask = color_write_mask;
1678 }
1679
1680 BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1681 MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
1682 }
1683
1684 static void
1685 emit_flat_shade_flags(struct v3dv_job *job,
1686 int varying_offset,
1687 uint32_t varyings,
1688 enum V3DX(Varying_Flags_Action) lower,
1689 enum V3DX(Varying_Flags_Action) higher)
1690 {
1691 v3dv_cl_ensure_space_with_branch(&job->bcl,
1692 cl_packet_length(FLAT_SHADE_FLAGS));
1693 v3dv_return_if_oom(NULL, job);
1694
1695 cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
1696 flags.varying_offset_v0 = varying_offset;
1697 flags.flat_shade_flags_for_varyings_v024 = varyings;
1698 flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower;
1699 flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher;
1700 }
1701 }
1702
1703 static void
1704 emit_noperspective_flags(struct v3dv_job *job,
1705 int varying_offset,
1706 uint32_t varyings,
1707 enum V3DX(Varying_Flags_Action) lower,
1708 enum V3DX(Varying_Flags_Action) higher)
1709 {
1710 v3dv_cl_ensure_space_with_branch(&job->bcl,
1711 cl_packet_length(NON_PERSPECTIVE_FLAGS));
1712 v3dv_return_if_oom(NULL, job);
1713
1714 cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) {
1715 flags.varying_offset_v0 = varying_offset;
1716 flags.non_perspective_flags_for_varyings_v024 = varyings;
1717 flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower;
1718 flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher;
1719 }
1720 }
1721
1722 static void
1723 emit_centroid_flags(struct v3dv_job *job,
1724 int varying_offset,
1725 uint32_t varyings,
1726 enum V3DX(Varying_Flags_Action) lower,
1727 enum V3DX(Varying_Flags_Action) higher)
1728 {
1729 v3dv_cl_ensure_space_with_branch(&job->bcl,
1730 cl_packet_length(CENTROID_FLAGS));
1731 v3dv_return_if_oom(NULL, job);
1732
1733 cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
1734 flags.varying_offset_v0 = varying_offset;
1735 flags.centroid_flags_for_varyings_v024 = varyings;
1736 flags.action_for_centroid_flags_of_lower_numbered_varyings = lower;
1737 flags.action_for_centroid_flags_of_higher_numbered_varyings = higher;
1738 }
1739 }
1740
1741 static bool
1742 emit_varying_flags(struct v3dv_job *job,
1743 uint32_t num_flags,
1744 const uint32_t *flags,
1745 void (*flag_emit_callback)(struct v3dv_job *job,
1746 int varying_offset,
1747 uint32_t flags,
1748 enum V3DX(Varying_Flags_Action) lower,
1749 enum V3DX(Varying_Flags_Action) higher))
1750 {
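   /* Emit one flags packet per group of varyings that has any bit set. The
    * first packet emitted also zeroes the flags of every other group so we
    * don't inherit stale state; later packets leave the rest unchanged.
    */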
1751 bool emitted_any = false;
1752 for (int i = 0; i < num_flags; i++) {
1753 if (!flags[i])
1754 continue;
1755
1756 if (emitted_any) {
1757 flag_emit_callback(job, i, flags[i],
1758 V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1759 V3D_VARYING_FLAGS_ACTION_UNCHANGED);
1760 } else if (i == 0) {
1761 flag_emit_callback(job, i, flags[i],
1762 V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1763 V3D_VARYING_FLAGS_ACTION_ZEROED);
1764 } else {
1765 flag_emit_callback(job, i, flags[i],
1766 V3D_VARYING_FLAGS_ACTION_ZEROED,
1767 V3D_VARYING_FLAGS_ACTION_ZEROED);
1768 }
1769
1770 emitted_any = true;
1771 }
1772
1773 return emitted_any;
1774 }
1775
1776 void
1777 v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer)
1778 {
1779 struct v3dv_job *job = cmd_buffer->state.job;
1780 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1781
1782 struct v3d_fs_prog_data *prog_data_fs =
1783 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
1784
1785 const uint32_t num_flags =
1786 ARRAY_SIZE(prog_data_fs->flat_shade_flags);
1787 const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
1788 const uint32_t *noperspective_flags = prog_data_fs->noperspective_flags;
1789 const uint32_t *centroid_flags = prog_data_fs->centroid_flags;
1790
1791 if (!emit_varying_flags(job, num_flags, flat_shade_flags,
1792 emit_flat_shade_flags)) {
1793 v3dv_cl_ensure_space_with_branch(
1794 &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
1795 v3dv_return_if_oom(cmd_buffer, NULL);
1796
1797 cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
1798 }
1799
1800 if (!emit_varying_flags(job, num_flags, noperspective_flags,
1801 emit_noperspective_flags)) {
1802 v3dv_cl_ensure_space_with_branch(
1803 &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
1804 v3dv_return_if_oom(cmd_buffer, NULL);
1805
1806 cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
1807 }
1808
1809 if (!emit_varying_flags(job, num_flags, centroid_flags,
1810 emit_centroid_flags)) {
1811 v3dv_cl_ensure_space_with_branch(
1812 &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
1813 v3dv_return_if_oom(cmd_buffer, NULL);
1814
1815 cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
1816 }
1817 }
1818
1819 #if V3D_VERSION == 42
1820 /* Updates the cmd_buffer's, and its job's, early-z state tracking. Returns
1821  * false if EZ must be disabled for the current draw call.
1822 */
1823 static bool
1824 cmd_buffer_update_ez_state(struct v3dv_cmd_buffer *cmd_buffer,
1825 struct v3dv_pipeline *pipeline)
1826 {
1827 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1828    /* First, update the cmd_buffer's ez_state tracking. If possible we reuse
1829     * the values from the pipeline.
1830     */
1831 if (!BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP) &&
1832 !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) &&
1833 !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) &&
1834 !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
1835 cmd_buffer->state.ez_state = pipeline->ez_state;
1836 cmd_buffer->state.incompatible_ez_test =
1837 pipeline->incompatible_ez_test;
1838 } else {
1839 v3dv_compute_ez_state(dyn, pipeline,
1840 &cmd_buffer->state.ez_state,
1841 &cmd_buffer->state.incompatible_ez_test);
1842 }
1843
1844 struct v3dv_job *job = cmd_buffer->state.job;
1845 assert(job);
1846 /* If first_ez_state is V3D_EZ_DISABLED it means that we have already
1847 * determined that we should disable EZ completely for all draw calls in
1848 * this job. This will cause us to disable EZ for the entire job in the
1849 * Tile Rendering Mode RCL packet and when we do that we need to make sure
1850 * we never emit a draw call in the job with EZ enabled in the CFG_BITS
1851     * packet, so ez_state must also be V3D_EZ_DISABLED.
1852 */
1853 if (job->first_ez_state == V3D_EZ_DISABLED) {
1854 assert(job->ez_state == V3D_EZ_DISABLED);
1855 return false;
1856 }
1857
1858 /* If ez_state is V3D_EZ_DISABLED it means that we have already decided
1859     * that EZ must be disabled for the remainder of the frame.
1860 */
1861 if (job->ez_state == V3D_EZ_DISABLED)
1862 return false;
1863
1864 /* This is part of the pre draw call handling, so we should be inside a
1865 * render pass.
1866 */
1867 assert(cmd_buffer->state.pass);
1868
1869 /* If this is the first time we update EZ state for this job we first check
1870 * if there is anything that requires disabling it completely for the entire
1871 * job (based on state that is not related to the current draw call and
1872 * pipeline/cmd_buffer state).
1873 */
1874 if (!job->decided_global_ez_enable) {
1875 job->decided_global_ez_enable = true;
1876
1877 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1878 assert(state->subpass_idx < state->pass->subpass_count);
1879 struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
1880 if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
1881 job->first_ez_state = V3D_EZ_DISABLED;
1882 job->ez_state = V3D_EZ_DISABLED;
1883 return false;
1884 }
1885
1886 /* GFXH-1918: the early-z buffer may load incorrect depth values if the
1887 * frame has odd width or height, or if the buffer is 16-bit and
1888 * multisampled.
1889 *
1890 * So we need to disable EZ in these cases.
1891 */
1892 const struct v3dv_render_pass_attachment *ds_attachment =
1893 &state->pass->attachments[subpass->ds_attachment.attachment];
1894
1895 const VkImageAspectFlags ds_aspects =
1896 vk_format_aspects(ds_attachment->desc.format);
1897
1898 bool needs_depth_load =
1899 v3dv_cmd_buffer_check_needs_load(state,
1900 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1901 ds_attachment->first_subpass,
1902 ds_attachment->desc.loadOp,
1903 ds_attachment->last_subpass,
1904 ds_attachment->desc.storeOp);
1905
1906 if (needs_depth_load) {
1907 if (ds_attachment->desc.format == VK_FORMAT_D16_UNORM &&
1908 ds_attachment->desc.samples != VK_SAMPLE_COUNT_1_BIT) {
1909 perf_debug("Loading depth aspect from a multisampled 16-bit "
1910 "depth buffer disables early-Z tests.\n");
1911 job->first_ez_state = V3D_EZ_DISABLED;
1912 job->ez_state = V3D_EZ_DISABLED;
1913 return false;
1914 }
1915
1916 struct v3dv_framebuffer *fb = state->framebuffer;
1917
1918 if (!fb) {
1919 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1920 perf_debug("Loading depth aspect in a secondary command buffer "
1921 "without framebuffer info disables early-z tests.\n");
1922 job->first_ez_state = V3D_EZ_DISABLED;
1923 job->ez_state = V3D_EZ_DISABLED;
1924 return false;
1925 }
1926
1927 if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) {
1928 perf_debug("Loading depth aspect for framebuffer with odd width "
1929 "or height disables early-Z tests.\n");
1930 job->first_ez_state = V3D_EZ_DISABLED;
1931 job->ez_state = V3D_EZ_DISABLED;
1932 return false;
1933 }
1934 }
1935 }
1936
1937 /* Otherwise, we can decide to selectively enable or disable EZ for draw
1938 * calls using the CFG_BITS packet based on the bound pipeline state, or
1939 * cmd_buffer state if some stencil/depth flags were dynamic.
1940 */
1941 bool disable_ez = false;
1942 bool incompatible_test = false;
1943 switch (cmd_buffer->state.ez_state) {
1944 case V3D_EZ_UNDECIDED:
1945 /* If the pipeline didn't pick a direction but didn't disable, then go
1946 * along with the current EZ state. This allows EZ optimization for Z
1947 * func == EQUAL or NEVER.
1948 */
1949 break;
1950
1951 case V3D_EZ_LT_LE:
1952 case V3D_EZ_GT_GE:
1953 /* If the pipeline picked a direction, then it needs to match the current
1954 * direction if we've decided on one.
1955 */
1956 if (job->ez_state == V3D_EZ_UNDECIDED) {
1957 job->ez_state = cmd_buffer->state.ez_state;
1958 } else if (job->ez_state != pipeline->ez_state) {
1959 disable_ez = true;
1960 incompatible_test = true;
1961 }
1962 break;
1963
1964 case V3D_EZ_DISABLED:
1965 disable_ez = true;
1966 incompatible_test = cmd_buffer->state.incompatible_ez_test;
1967 break;
1968 }
1969
1970 if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) {
1971 assert(job->ez_state != V3D_EZ_DISABLED);
1972 job->first_ez_state = job->ez_state;
1973 }
1974
1975    /* If we had to disable EZ because of an incompatible test direction
1976     * and the cmd buffer writes depth then we need to disable EZ for the rest
1977 * of the frame.
1978 */
1979 if (incompatible_test && cmd_buffer->state.z_updates_enable) {
1980 assert(disable_ez);
1981 job->ez_state = V3D_EZ_DISABLED;
1982 }
1983
1984 if (!disable_ez)
1985 job->has_ez_draws = true;
1986
1987 return !disable_ez;
1988 }
1989 #endif
1990
1991 void
1992 v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
1993 {
1994 struct v3dv_job *job = cmd_buffer->state.job;
1995 assert(job);
1996
1997 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1998 assert(pipeline);
1999
2000 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
2001 v3dv_return_if_oom(cmd_buffer, NULL);
2002
2003 struct vk_dynamic_graphics_state *dyn =
2004 &cmd_buffer->vk.dynamic_graphics_state;
2005
2006 /* Disable depth/stencil if we don't have a D/S attachment */
2007 bool has_depth =
2008 pipeline->rendering_info.depth_attachment_format != VK_FORMAT_UNDEFINED;
2009 bool has_stencil =
2010 pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED;
2011
2012 cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
2013 if (dyn->ds.depth.test_enable && has_depth) {
2014 config.z_updates_enable = dyn->ds.depth.write_enable;
2015 config.depth_test_function = dyn->ds.depth.compare_op;
2016 } else {
2017 config.depth_test_function = VK_COMPARE_OP_ALWAYS;
2018 }
2019
2020 config.stencil_enable = dyn->ds.stencil.test_enable && has_stencil;
2021
2022 cmd_buffer->state.z_updates_enable = config.z_updates_enable;
2023 #if V3D_VERSION == 42
2024 bool enable_ez = cmd_buffer_update_ez_state(cmd_buffer, pipeline);
2025 config.early_z_enable = enable_ez;
2026 config.early_z_updates_enable = config.early_z_enable &&
2027 cmd_buffer->state.z_updates_enable;
2028 #endif
2029
2030 if (!dyn->rs.rasterizer_discard_enable) {
2031 assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_CULL_MODE));
2032 assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_FRONT_FACE));
2033 config.enable_forward_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT);
2034 config.enable_reverse_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT);
2035 /* Seems like the hardware is backwards regarding this setting... */
2036 config.clockwise_primitives = dyn->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
2037 }
2038
2039 /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
2040 * feature and it shouldn't be used by any pipeline.
2041 */
2042 assert(cmd_buffer->device->devinfo.ver >= 71 ||
2043 !dyn->ds.depth.bounds_test.enable);
2044 #if V3D_VERSION >= 71
2045 config.depth_bounds_test_enable =
2046 dyn->ds.depth.bounds_test.enable && has_depth;
2047 #endif
2048
2049 config.enable_depth_offset = dyn->rs.depth_bias.enable;
2050 }
2051
2052 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE);
2053 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE);
2054 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
2055 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
2056 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE);
2057 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
2058 }
2059
2060 void
2061 v3dX(cmd_buffer_emit_occlusion_query)(struct v3dv_cmd_buffer *cmd_buffer)
2062 {
2063 struct v3dv_job *job = cmd_buffer->state.job;
2064 assert(job);
2065
2066 v3dv_cl_ensure_space_with_branch(&job->bcl,
2067 cl_packet_length(OCCLUSION_QUERY_COUNTER));
2068 v3dv_return_if_oom(cmd_buffer, NULL);
2069
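   /* If there is no active query the packet is emitted with a zero address,
    * which effectively disables occlusion counting for subsequent draws.
    */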
2070 cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
2071 if (cmd_buffer->state.query.active_query.bo) {
2072 counter.address =
2073 v3dv_cl_address(cmd_buffer->state.query.active_query.bo,
2074 cmd_buffer->state.query.active_query.offset);
2075 }
2076 }
2077
2078 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2079 }
2080
2081 static struct v3dv_job *
2082 cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer,
2083 bool is_bcl_barrier)
2084 {
2085 assert(cmd_buffer->state.subpass_idx != -1);
2086 v3dv_cmd_buffer_finish_job(cmd_buffer);
2087 struct v3dv_job *job =
2088 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2089 cmd_buffer->state.subpass_idx);
2090 if (!job)
2091 return NULL;
2092
2093 /* FIXME: we can do better than all barriers */
2094 job->serialize = V3DV_BARRIER_ALL;
2095 job->needs_bcl_sync = is_bcl_barrier;
2096 return job;
2097 }
2098
2099 static void
2100 cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
2101 struct v3dv_cmd_buffer *secondary)
2102 {
2103 struct v3dv_cmd_buffer_state *p_state = &primary->state;
2104 struct v3dv_cmd_buffer_state *s_state = &secondary->state;
2105
2106 const uint32_t total_state_count =
2107 p_state->query.end.used_count + s_state->query.end.used_count;
2108 v3dv_cmd_buffer_ensure_array_state(primary,
2109 sizeof(struct v3dv_end_query_info),
2110 total_state_count,
2111 &p_state->query.end.alloc_count,
2112 (void **) &p_state->query.end.states);
2113 v3dv_return_if_oom(primary, NULL);
2114
2115 for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
2116 const struct v3dv_end_query_info *s_qstate =
2117 &secondary->state.query.end.states[i];
2118
2119 struct v3dv_end_query_info *p_qstate =
2120 &p_state->query.end.states[p_state->query.end.used_count++];
2121
2122 memcpy(p_qstate, s_qstate, sizeof(struct v3dv_end_query_info));
2123 }
2124 }
2125
2126 void
2127 v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
2128 uint32_t cmd_buffer_count,
2129 const VkCommandBuffer *cmd_buffers)
2130 {
2131 assert(primary->state.job);
2132
2133 /* Typically we postpone applying binning syncs until we see a draw call
2134     * that may actually access protected resources in the binning stage. However,
2135 * if the draw calls are recorded in a secondary command buffer and the
2136 * barriers were recorded in a primary command buffer, that won't work
2137 * and we will have to check if we need a binning sync when executing the
2138 * secondary.
2139 */
2140 struct v3dv_job *primary_job = primary->state.job;
2141 if (primary_job->serialize &&
2142 (primary->state.barrier.bcl_buffer_access ||
2143 primary->state.barrier.bcl_image_access)) {
2144 v3dv_cmd_buffer_consume_bcl_sync(primary, primary_job);
2145 }
2146
2147 /* Emit occlusion query state if needed so the draw calls inside our
2148 * secondaries update the counters.
2149 */
2150 bool has_occlusion_query =
2151 primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2152 if (has_occlusion_query)
2153 v3dX(cmd_buffer_emit_occlusion_query)(primary);
2154
2155    /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
2156 * pipelines used by the secondaries do, we need to re-start the primary
2157 * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
2158 */
2159 struct v3dv_barrier_state pending_barrier = { 0 };
2160 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2161 V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2162
2163 assert(secondary->usage_flags &
2164 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
2165
2166 list_for_each_entry(struct v3dv_job, secondary_job,
2167 &secondary->jobs, list_link) {
2168 if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) {
2169 /* If the job is a CL, then we branch to it from the primary BCL.
2170 * In this case the secondary's BCL is finished with a
2171 * RETURN_FROM_SUB_LIST command to return back to the primary BCL
2172 * once we are done executing it.
2173 */
2174 assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
2175 assert(secondary_job->bcl.bo);
2176
2177 /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
2178 STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
2179 assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
2180 assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
2181 V3DX(RETURN_FROM_SUB_LIST_opcode));
2182
2183 /* If this secondary has any barriers (or we had any pending barrier
2184 * to apply), then we can't just branch to it from the primary, we
2185 * need to split the primary to create a new job that can consume
2186 * the barriers first.
2187 *
2188 * FIXME: in this case, maybe just copy the secondary BCL without
2189 * the RETURN_FROM_SUB_LIST into the primary job to skip the
2190 * branch?
2191 */
2192 primary_job = primary->state.job;
2193 if (!primary_job || secondary_job->serialize ||
2194 pending_barrier.dst_mask) {
2195 const bool needs_bcl_barrier =
2196 secondary_job->needs_bcl_sync ||
2197 pending_barrier.bcl_buffer_access ||
2198 pending_barrier.bcl_image_access;
2199
2200 primary_job =
2201 cmd_buffer_subpass_split_for_barrier(primary,
2202 needs_bcl_barrier);
2203 v3dv_return_if_oom(primary, NULL);
2204
2205 /* Since we have created a new primary we need to re-emit
2206 * occlusion query state.
2207 */
2208 if (has_occlusion_query)
2209 v3dX(cmd_buffer_emit_occlusion_query)(primary);
2210 }
2211
2212 /* Make sure our primary job has all required BO references */
2213 set_foreach(secondary_job->bos, entry) {
2214 struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
2215 v3dv_job_add_bo(primary_job, bo);
2216 }
2217
2218 /* Emit required branch instructions. We expect each of these
2219 * to end with a corresponding 'return from sub list' item.
2220 */
2221 list_for_each_entry(struct v3dv_bo, bcl_bo,
2222 &secondary_job->bcl.bo_list, list_link) {
2223 v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
2224 cl_packet_length(BRANCH_TO_SUB_LIST));
2225 v3dv_return_if_oom(primary, NULL);
2226 cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
2227 branch.address = v3dv_cl_address(bcl_bo, 0);
2228 }
2229 }
2230
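            /* Propagate double-buffer eligibility and scores from the
             * secondary job into the primary job.
             */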
2231 if (!secondary_job->can_use_double_buffer) {
2232 primary_job->can_use_double_buffer = false;
2233 } else {
2234 primary_job->double_buffer_score.geom +=
2235 secondary_job->double_buffer_score.geom;
2236 primary_job->double_buffer_score.render +=
2237 secondary_job->double_buffer_score.render;
2238 }
2239 primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
2240 } else {
2241 /* This is a regular job (CPU or GPU), so just finish the current
2242 * primary job (if any) and then add the secondary job to the
2243 * primary's job list right after it.
2244 */
2245 v3dv_cmd_buffer_finish_job(primary);
2246 v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
2247 if (pending_barrier.dst_mask) {
2248 /* FIXME: do the same we do for primaries and only choose the
2249 * relevant src masks.
2250 */
2251 secondary_job->serialize = pending_barrier.src_mask_graphics |
2252 pending_barrier.src_mask_transfer |
2253 pending_barrier.src_mask_compute;
2254 if (pending_barrier.bcl_buffer_access ||
2255 pending_barrier.bcl_image_access) {
2256 secondary_job->needs_bcl_sync = true;
2257 }
2258 }
2259 }
2260
2261 memset(&pending_barrier, 0, sizeof(pending_barrier));
2262 }
2263
2264 /* If the secondary has recorded any vkCmdEndQuery commands, we need to
2265 * copy this state to the primary so it is processed properly when the
2266 * current primary job is finished.
2267 */
2268 cmd_buffer_copy_secondary_end_query_state(primary, secondary);
2269
2270 /* If this secondary had any pending barrier state we will need that
2271 * barrier state consumed with whatever comes next in the primary.
2272 */
2273 assert(secondary->state.barrier.dst_mask ||
2274 (!secondary->state.barrier.bcl_buffer_access &&
2275 !secondary->state.barrier.bcl_image_access));
2276
2277 pending_barrier = secondary->state.barrier;
2278 }
2279
2280 if (pending_barrier.dst_mask) {
2281 v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
2282 &pending_barrier);
2283 }
2284 }
2285
2286 static void
2287 emit_gs_shader_state_record(struct v3dv_job *job,
2288 struct v3dv_bo *assembly_bo,
2289 struct v3dv_shader_variant *gs_bin,
2290 struct v3dv_cl_reloc gs_bin_uniforms,
2291 struct v3dv_shader_variant *gs,
2292 struct v3dv_cl_reloc gs_render_uniforms)
2293 {
2294 cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) {
2295 shader.geometry_bin_mode_shader_code_address =
2296 v3dv_cl_address(assembly_bo, gs_bin->assembly_offset);
2297 shader.geometry_bin_mode_shader_4_way_threadable =
2298 gs_bin->prog_data.gs->base.threads == 4;
2299 shader.geometry_bin_mode_shader_start_in_final_thread_section =
2300 gs_bin->prog_data.gs->base.single_seg;
2301 #if V3D_VERSION <= 42
2302 shader.geometry_bin_mode_shader_propagate_nans = true;
2303 #endif
2304 shader.geometry_bin_mode_shader_uniforms_address =
2305 gs_bin_uniforms;
2306
2307 shader.geometry_render_mode_shader_code_address =
2308 v3dv_cl_address(assembly_bo, gs->assembly_offset);
2309 shader.geometry_render_mode_shader_4_way_threadable =
2310 gs->prog_data.gs->base.threads == 4;
2311 shader.geometry_render_mode_shader_start_in_final_thread_section =
2312 gs->prog_data.gs->base.single_seg;
2313 #if V3D_VERSION <= 42
2314 shader.geometry_render_mode_shader_propagate_nans = true;
2315 #endif
2316 shader.geometry_render_mode_shader_uniforms_address =
2317 gs_render_uniforms;
2318 }
2319 }
2320
2321 static uint8_t
2322 v3d_gs_output_primitive(enum mesa_prim prim_type)
2323 {
2324 switch (prim_type) {
2325 case MESA_PRIM_POINTS:
2326 return GEOMETRY_SHADER_POINTS;
2327 case MESA_PRIM_LINE_STRIP:
2328 return GEOMETRY_SHADER_LINE_STRIP;
2329 case MESA_PRIM_TRIANGLE_STRIP:
2330 return GEOMETRY_SHADER_TRI_STRIP;
2331 default:
2332 unreachable("Unsupported primitive type");
2333 }
2334 }
2335
2336 static void
2337 emit_tes_gs_common_params(struct v3dv_job *job,
2338 uint8_t gs_out_prim_type,
2339 uint8_t gs_num_invocations)
2340 {
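   /* Tessellation shaders are not supported, so the tessellation fields are
    * fixed defaults here; only the geometry shader output primitive type and
    * invocation count come from the pipeline.
    */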
2341 cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
2342 shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
2343 shader.tessellation_point_mode = false;
2344 shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
2345 shader.tessellation_clockwise = true;
2346 shader.tessellation_invocations = 1;
2347
2348 shader.geometry_shader_output_format =
2349 v3d_gs_output_primitive(gs_out_prim_type);
2350 shader.geometry_shader_instances = gs_num_invocations & 0x1F;
2351 }
2352 }
2353
2354 static uint8_t
2355 simd_width_to_gs_pack_mode(uint32_t width)
2356 {
2357 switch (width) {
2358 case 16:
2359 return V3D_PACK_MODE_16_WAY;
2360 case 8:
2361 return V3D_PACK_MODE_8_WAY;
2362 case 4:
2363 return V3D_PACK_MODE_4_WAY;
2364 case 1:
2365 return V3D_PACK_MODE_1_WAY;
2366 default:
2367 unreachable("Invalid SIMD width");
2368 };
2369 }
2370
2371 static void
2372 emit_tes_gs_shader_params(struct v3dv_job *job,
2373 uint32_t gs_simd,
2374 uint32_t gs_vpm_output_size,
2375 uint32_t gs_max_vpm_input_size_per_batch)
2376 {
2377 cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
2378 shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
2379 shader.per_patch_data_column_depth = 1;
2380 shader.tcs_output_segment_size_in_sectors = 1;
2381 shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2382 shader.tes_output_segment_size_in_sectors = 1;
2383 shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2384 shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
2385 shader.gs_output_segment_pack_mode =
2386 simd_width_to_gs_pack_mode(gs_simd);
2387 shader.tbg_max_patches_per_tcs_batch = 1;
2388 shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
2389 shader.tbg_min_tcs_output_segments_required_in_play = 1;
2390 shader.tbg_min_per_patch_data_segments_required_in_play = 1;
2391 shader.tpg_max_patches_per_tes_batch = 1;
2392 shader.tpg_max_vertex_segments_per_tes_batch = 0;
2393 shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
2394 shader.tpg_min_tes_output_segments_required_in_play = 1;
2395 shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
2396 gs_max_vpm_input_size_per_batch;
2397 shader.gbg_min_gs_output_segments_required_in_play = 1;
2398 }
2399 }
2400
2401 void
2402 v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
2403 {
2404 struct v3dv_job *job = cmd_buffer->state.job;
2405 assert(job);
2406
2407 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2408 struct v3dv_pipeline *pipeline = state->gfx.pipeline;
2409 assert(pipeline);
2410
2411 struct v3dv_shader_variant *vs_variant =
2412 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2413 struct v3d_vs_prog_data *prog_data_vs = vs_variant->prog_data.vs;
2414
2415 struct v3dv_shader_variant *vs_bin_variant =
2416 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2417 struct v3d_vs_prog_data *prog_data_vs_bin = vs_bin_variant->prog_data.vs;
2418
2419 struct v3dv_shader_variant *fs_variant =
2420 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2421 struct v3d_fs_prog_data *prog_data_fs = fs_variant->prog_data.fs;
2422
2423 struct v3dv_shader_variant *gs_variant = NULL;
2424 struct v3dv_shader_variant *gs_bin_variant = NULL;
2425 struct v3d_gs_prog_data *prog_data_gs = NULL;
2426 struct v3d_gs_prog_data *prog_data_gs_bin = NULL;
2427 if (pipeline->has_gs) {
2428 gs_variant =
2429 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2430 prog_data_gs = gs_variant->prog_data.gs;
2431
2432 gs_bin_variant =
2433 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2434 prog_data_gs_bin = gs_bin_variant->prog_data.gs;
2435 }
2436
2437 /* Update the cache dirty flag based on the shader progs data */
2438 job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl;
2439 job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl;
2440 job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl;
2441 if (pipeline->has_gs) {
2442 job->tmu_dirty_rcl |= prog_data_gs_bin->base.tmu_dirty_rcl;
2443 job->tmu_dirty_rcl |= prog_data_gs->base.tmu_dirty_rcl;
2444 }
2445
2446 /* See GFXH-930 workaround below */
2447 uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
2448
2449 uint32_t shader_state_record_length =
2450 cl_packet_length(GL_SHADER_STATE_RECORD);
2451 #if V3D_VERSION >= 71
2452 if (v3d_device_has_draw_index(&pipeline->device->devinfo)) {
2453 shader_state_record_length =
2454 cl_packet_length(GL_SHADER_STATE_RECORD_DRAW_INDEX);
2455 }
2456 #endif
2457
2458 if (pipeline->has_gs) {
2459 shader_state_record_length +=
2460 cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) +
2461 cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) +
2462 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS);
2463 }
2464
2465 uint32_t shader_rec_offset =
2466 v3dv_cl_ensure_space(&job->indirect,
2467 shader_state_record_length +
2468 num_elements_to_emit *
2469 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
2470 32);
2471 v3dv_return_if_oom(cmd_buffer, NULL);
2472
2473 struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo;
2474
2475 if (pipeline->has_gs) {
2476 emit_gs_shader_state_record(job,
2477 assembly_bo,
2478 gs_bin_variant,
2479 cmd_buffer->state.uniforms.gs_bin,
2480 gs_variant,
2481 cmd_buffer->state.uniforms.gs);
2482
2483 emit_tes_gs_common_params(job,
2484 prog_data_gs->out_prim_type,
2485 prog_data_gs->num_invocations);
2486
2487 emit_tes_gs_shader_params(job,
2488 pipeline->vpm_cfg_bin.gs_width,
2489 pipeline->vpm_cfg_bin.Gd,
2490 pipeline->vpm_cfg_bin.Gv);
2491
2492 emit_tes_gs_shader_params(job,
2493 pipeline->vpm_cfg.gs_width,
2494 pipeline->vpm_cfg.Gd,
2495 pipeline->vpm_cfg.Gv);
2496 }
2497
2498 #if V3D_VERSION == 42
2499 struct v3dv_bo *default_attribute_values =
2500 pipeline->default_attribute_values != NULL ?
2501 pipeline->default_attribute_values :
2502 pipeline->device->default_attribute_float;
2503 #endif
2504
2505 #if V3D_VERSION >= 71
2506 if (v3d_device_has_draw_index(&pipeline->device->devinfo)) {
2507 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD_DRAW_INDEX,
2508 pipeline->shader_state_record, shader) {
2509 shader.min_coord_shader_input_segments_required_in_play =
2510 pipeline->vpm_cfg_bin.As;
2511 shader.min_vertex_shader_input_segments_required_in_play =
2512 pipeline->vpm_cfg.As;
2513 shader.coordinate_shader_code_address =
2514 v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
2515 shader.vertex_shader_code_address =
2516 v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
2517 shader.fragment_shader_code_address =
2518 v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
2519 shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
2520 shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
2521 shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
2522 shader.any_shader_reads_hardware_written_primitive_id =
2523 (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
2524 shader.insert_primitive_id_as_first_varying_to_fragment_shader =
2525 !pipeline->has_gs && prog_data_fs->uses_pid;
2526 }
2527 } else
2528 #endif
2529 {
2530 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
2531 pipeline->shader_state_record, shader) {
2532       /* FIXME: we are setting these values here and during the
2533        * prepacking. This is because both cl_emit_with_prepacked and v3dvx_pack
2534        * assert on minimum values of these. It would be good to get
2535        * v3dvx_pack to assert on the final value if possible.
2536 */
2537 shader.min_coord_shader_input_segments_required_in_play =
2538 pipeline->vpm_cfg_bin.As;
2539 shader.min_vertex_shader_input_segments_required_in_play =
2540 pipeline->vpm_cfg.As;
2541
2542 shader.coordinate_shader_code_address =
2543 v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
2544 shader.vertex_shader_code_address =
2545 v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
2546 shader.fragment_shader_code_address =
2547 v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
2548
2549 shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
2550 shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
2551 shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
2552
2553 #if V3D_VERSION == 42
2554 shader.address_of_default_attribute_values =
2555 v3dv_cl_address(default_attribute_values, 0);
2556 #endif
2557
2558 shader.any_shader_reads_hardware_written_primitive_id =
2559 (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
2560 shader.insert_primitive_id_as_first_varying_to_fragment_shader =
2561 !pipeline->has_gs && prog_data_fs->uses_pid;
2562 }
2563 }
2564
2565 /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
2566 bool cs_loaded_any = false;
2567 const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
2568 prog_data_vs_bin->uses_biid ||
2569 prog_data_vs_bin->uses_vid;
2570 const uint32_t packet_length =
2571 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
2572
2573 uint32_t emitted_va_count = 0;
2574 for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
2575 assert(i < MAX_VERTEX_ATTRIBS);
2576
2577 if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
2578 continue;
2579
2580 const uint32_t binding = pipeline->va[i].binding;
2581
2582 /* We store each vertex attribute in the array using its driver location
2583 * as index.
2584 */
2585 const uint32_t location = i;
2586
2587 struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];
2588
2589 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
2590 &pipeline->vertex_attrs[i * packet_length], attr) {
2591
2592 assert(c_vb->buffer->mem->bo);
2593 attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
2594 c_vb->buffer->mem_offset +
2595 pipeline->va[i].offset +
2596 c_vb->offset);
2597
2598 attr.number_of_values_read_by_coordinate_shader =
2599 prog_data_vs_bin->vattr_sizes[location];
2600 attr.number_of_values_read_by_vertex_shader =
2601 prog_data_vs->vattr_sizes[location];
2602
2603 /* GFXH-930: At least one attribute must be enabled and read by CS
2604 * and VS. If we have attributes being consumed by the VS but not
2605 * the CS, then set up a dummy load of the last attribute into the
2606 * CS's VPM inputs. (Since CS is just dead-code-elimination compared
2607 * to VS, we can't have CS loading but not VS).
2608 *
2609 * GFXH-1602: first attribute must be active if using builtins.
2610 */
2611 if (prog_data_vs_bin->vattr_sizes[location])
2612 cs_loaded_any = true;
2613
2614 if (i == 0 && cs_uses_builtins && !cs_loaded_any) {
2615 attr.number_of_values_read_by_coordinate_shader = 1;
2616 cs_loaded_any = true;
2617 } else if (i == pipeline->va_count - 1 && !cs_loaded_any) {
2618 attr.number_of_values_read_by_coordinate_shader = 1;
2619 cs_loaded_any = true;
2620 }
2621
2622 attr.stride =
2623 cmd_buffer->vk.dynamic_graphics_state.vi_binding_strides[binding];
2624
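         /* With a zero stride only index 0 is addressable; otherwise clamp
          * to the number of whole elements that fit in the bound vertex
          * buffer (and to 0xffffff, the largest value the field can hold).
          */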
2625 attr.maximum_index = attr.stride == 0 ?
2626 1u : MIN2(0xffffffu, c_vb->size / attr.stride);
2627 }
2628
2629 emitted_va_count++;
2630 }
2631
2632 if (pipeline->va_count == 0) {
2633 /* GFXH-930: At least one attribute must be enabled and read
2634 * by CS and VS. If we have no attributes being consumed by
2635 * the shader, set up a dummy to be loaded into the VPM.
2636 */
2637 cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
2638 /* Valid address of data whose value will be unused. */
2639 attr.address = v3dv_cl_address(job->indirect.bo, 0);
2640
2641 attr.type = ATTRIBUTE_FLOAT;
2642 attr.stride = 0;
2643 attr.vec_size = 1;
2644
2645 attr.number_of_values_read_by_coordinate_shader = 1;
2646 attr.number_of_values_read_by_vertex_shader = 1;
2647 }
2648 }
2649
2650 if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
2651 v3dv_cl_ensure_space_with_branch(&job->bcl,
2652 sizeof(pipeline->vcm_cache_size));
2653 v3dv_return_if_oom(cmd_buffer, NULL);
2654
2655 cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size);
2656 }
2657
2658 v3dv_cl_ensure_space_with_branch(&job->bcl,
2659 cl_packet_length(GL_SHADER_STATE));
2660 v3dv_return_if_oom(cmd_buffer, NULL);
2661
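   /* Point the hardware at the shader record we just wrote to the indirect
    * list; the INCLUDING_GS variant of the packet is used when a geometry
    * shader is present.
    */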
2662 if (pipeline->has_gs) {
2663 cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) {
2664 state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2665 state.number_of_attribute_arrays = num_elements_to_emit;
2666 }
2667 } else {
2668 cl_emit(&job->bcl, GL_SHADER_STATE, state) {
2669 state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2670 state.number_of_attribute_arrays = num_elements_to_emit;
2671 }
2672 }
2673
2674 /* Clearing push constants and descriptor sets for all stages is not quite
2675 * correct (some shader stages may not be used at all or they may not be
2676 * consuming push constants), however this is not relevant because if we
2677 * bind a different pipeline we always have to rebuild the uniform streams.
2678 */
2679 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
2680 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2681 V3DV_CMD_DIRTY_PUSH_CONSTANTS);
2682 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2683 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2684 }
2685
2686 void
2687 v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer,
2688 struct v3dv_draw_info *info)
2689 {
2690 struct v3dv_job *job = cmd_buffer->state.job;
2691 assert(job);
2692 const struct vk_dynamic_graphics_state *dyn =
2693 &cmd_buffer->vk.dynamic_graphics_state;
2694 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2695
2696 if (info->first_instance > 0) {
2697 v3dv_cl_ensure_space_with_branch(
2698 &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2699 v3dv_return_if_oom(cmd_buffer, NULL);
2700
2701 cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2702 base.base_instance = info->first_instance;
2703 base.base_vertex = 0;
2704 }
2705 }
2706
2707 if (info->instance_count > 1) {
2708 v3dv_cl_ensure_space_with_branch(
2709 &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS));
2710 v3dv_return_if_oom(cmd_buffer, NULL);
2711
2712 cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2713 prim.mode = hw_prim_type;
2714 prim.index_of_first_vertex = info->first_vertex;
2715 prim.number_of_instances = info->instance_count;
2716 prim.instance_length = info->vertex_count;
2717 }
2718 } else {
2719 v3dv_cl_ensure_space_with_branch(
2720 &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS));
2721 v3dv_return_if_oom(cmd_buffer, NULL);
2722 cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
2723 prim.mode = hw_prim_type;
2724 prim.length = info->vertex_count;
2725 prim.index_of_first_vertex = info->first_vertex;
2726 }
2727 }
2728 }
2729
2730 void
2731 v3dX(cmd_buffer_emit_index_buffer)(struct v3dv_cmd_buffer *cmd_buffer)
2732 {
2733 struct v3dv_job *job = cmd_buffer->state.job;
2734 assert(job);
2735
2736 /* We flag all state as dirty when we create a new job so make sure we
2737 * have a valid index buffer before attempting to emit state for it.
2738 */
2739 struct v3dv_buffer *ibuffer =
2740 v3dv_buffer_from_handle(cmd_buffer->state.index_buffer.buffer);
2741 if (ibuffer) {
2742 v3dv_cl_ensure_space_with_branch(
2743 &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP));
2744 v3dv_return_if_oom(cmd_buffer, NULL);
2745
2746 const uint32_t offset = ibuffer->mem_offset +
2747 cmd_buffer->state.index_buffer.offset;
2748 assert(ibuffer->mem->bo->size >= offset);
2749 cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
2750 ib.address = v3dv_cl_address(ibuffer->mem->bo, offset);
2751 ib.size = cmd_buffer->state.index_buffer.size;
2752 }
2753 }
2754
2755 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER;
2756 }
2757
2758 void
2759 v3dX(cmd_buffer_emit_draw_indexed)(struct v3dv_cmd_buffer *cmd_buffer,
2760 uint32_t indexCount,
2761 uint32_t instanceCount,
2762 uint32_t firstIndex,
2763 int32_t vertexOffset,
2764 uint32_t firstInstance)
2765 {
2766 struct v3dv_job *job = cmd_buffer->state.job;
2767 assert(job);
2768
2769 const struct vk_dynamic_graphics_state *dyn =
2770 &cmd_buffer->vk.dynamic_graphics_state;
2771 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
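   /* Map the index size in bytes (1, 2 or 4) to the packet's index type
    * field (0, 1 or 2).
    */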
2772 uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2773 uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size;
2774
2775 if (vertexOffset != 0 || firstInstance != 0) {
2776 v3dv_cl_ensure_space_with_branch(
2777 &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2778 v3dv_return_if_oom(cmd_buffer, NULL);
2779
2780 cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2781 base.base_instance = firstInstance;
2782 base.base_vertex = vertexOffset;
2783 }
2784 }
2785
2786 if (instanceCount == 1) {
2787 v3dv_cl_ensure_space_with_branch(
2788 &job->bcl, cl_packet_length(INDEXED_PRIM_LIST));
2789 v3dv_return_if_oom(cmd_buffer, NULL);
2790
2791 cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
2792 prim.index_type = index_type;
2793 prim.length = indexCount;
2794 prim.index_offset = index_offset;
2795 prim.mode = hw_prim_type;
2796 prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2797 }
2798 } else if (instanceCount > 1) {
2799 v3dv_cl_ensure_space_with_branch(
2800 &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST));
2801 v3dv_return_if_oom(cmd_buffer, NULL);
2802
2803 cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
2804 prim.index_type = index_type;
2805 prim.index_offset = index_offset;
2806 prim.mode = hw_prim_type;
2807 prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2808 prim.number_of_instances = instanceCount;
2809 prim.instance_length = indexCount;
2810 }
2811 }
2812 }
2813
2814 void
2815 v3dX(cmd_buffer_emit_draw_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2816 struct v3dv_buffer *buffer,
2817 VkDeviceSize offset,
2818 uint32_t drawCount,
2819 uint32_t stride)
2820 {
2821 struct v3dv_job *job = cmd_buffer->state.job;
2822 assert(job);
2823
2824 const struct vk_dynamic_graphics_state *dyn =
2825 &cmd_buffer->vk.dynamic_graphics_state;
2826 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2827
2828 v3dv_cl_ensure_space_with_branch(
2829 &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS));
2830 v3dv_return_if_oom(cmd_buffer, NULL);
2831
2832 cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2833 prim.mode = hw_prim_type;
2834 prim.number_of_draw_indirect_array_records = drawCount;
2835 prim.stride_in_multiples_of_4_bytes = stride >> 2;
2836 prim.address = v3dv_cl_address(buffer->mem->bo,
2837 buffer->mem_offset + offset);
2838 }
2839 }
2840
2841 void
2842 v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2843 struct v3dv_buffer *buffer,
2844 VkDeviceSize offset,
2845 uint32_t drawCount,
2846 uint32_t stride)
2847 {
2848 struct v3dv_job *job = cmd_buffer->state.job;
2849 assert(job);
2850
2851 const struct vk_dynamic_graphics_state *dyn =
2852 &cmd_buffer->vk.dynamic_graphics_state;
2853 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2854 uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2855
2856 v3dv_cl_ensure_space_with_branch(
2857 &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST));
2858 v3dv_return_if_oom(cmd_buffer, NULL);
2859
2860 cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
2861 prim.index_type = index_type;
2862 prim.mode = hw_prim_type;
2863 prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2864 prim.number_of_draw_indirect_indexed_records = drawCount;
2865 prim.stride_in_multiples_of_4_bytes = stride >> 2;
2866 prim.address = v3dv_cl_address(buffer->mem->bo,
2867 buffer->mem_offset + offset);
2868 }
2869 }
2870
2871 void
2872 v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer)
2873 {
2874 struct v3dv_job *job = cmd_buffer->state.job;
2875 assert(job);
2876
2877 job->suspending = true;
2878
2879 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(BRANCH));
2880
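   /* Emit a BRANCH with a NULL address as a placeholder; the actual address
    * of the job we resume into is patched in later by
    * v3dX(job_patch_resume_address).
    */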
2881 job->suspend_branch_inst_ptr = cl_start(&job->bcl);
2882 cl_emit(&job->bcl, BRANCH, branch) {
2883 branch.address = v3dv_cl_address(NULL, 0);
2884 }
2885
2886 /* The sim complains if the command list ends with a branch */
2887 cl_emit(&job->bcl, NOP, nop);
2888 }
2889
2890 void
2891 v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend,
2892 struct v3dv_job *suspend,
2893 struct v3dv_job *resume)
2894 {
2895 assert(resume && resume->resuming);
2896 assert(first_suspend && first_suspend->suspending);
2897 assert(suspend && suspend->suspending);
2898 assert(suspend->suspend_branch_inst_ptr != NULL);
2899
2900 struct v3dv_bo *resume_bo =
2901 list_first_entry(&resume->bcl.bo_list, struct v3dv_bo, list_link);
2902 struct cl_packet_struct(BRANCH) branch = {
2903 cl_packet_header(BRANCH),
2904 };
2905 branch.address = v3dv_cl_address(NULL, resume_bo->offset);
2906
2907 uint8_t *rewrite_addr = (uint8_t *) suspend->suspend_branch_inst_ptr;
2908 cl_packet_pack(BRANCH)(NULL, rewrite_addr, &branch);
2909
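   /* The first suspended job in the chain is the one that gets submitted, so
    * it needs to hold references to the BOs used by the job we resume into.
    */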
2910 if (resume != first_suspend) {
2911 set_foreach(resume->bos, entry) {
2912 struct v3dv_bo *bo = (void *)entry->key;
2913 v3dv_job_add_bo(first_suspend, bo);
2914 }
2915 }
2916
2917 first_suspend->suspended_bcl_end = resume->bcl.bo->offset +
2918 v3dv_cl_offset(&resume->bcl);
2919 }
2920
2921 static void
2922 job_destroy_cb(VkDevice device, uint64_t pobj, VkAllocationCallbacks *allocb)
2923 {
2924 struct v3dv_job *clone = (struct v3dv_job *) (uintptr_t) pobj;
2925 v3dv_job_destroy(clone);
2926 }
2927
2928 /**
2929 * This checks if the command buffer has been created with
2930 * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, in which case we won't be
2931 * able to safely patch the resume address into the job (since we could have
2932 * another instance of this job running in the GPU, potentially resuming in a
2933 * different address). In that case, we clone the job and make the clone have
2934 * its own BCL copied from the original job so we can later patch the resume
2935 * address into it safely.
2936 */
2937 struct v3dv_job *
2938 v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job)
2939 {
2940 assert(job->suspending);
2941 assert(job->cmd_buffer);
2942 assert(job->type == V3DV_JOB_TYPE_GPU_CL);
2943
2944 if (!(job->cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
2945 return job;
2946
2947 /* Create the clone job, but skip the BCL since we are going to create
2948 * our own below.
2949 */
2950 struct v3dv_job *clone = v3dv_job_clone(job, true);
2951 if (!clone)
2952 return NULL;
2953
2954 /* Compute total size of BCL we need to copy */
2955 uint32_t bcl_size = 0;
2956 list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link)
2957 bcl_size += bo->size;
2958
2959 /* Prepare the BCL for the cloned job. For this we go over the BOs in the
2960 * BCL of the original job and we copy their contents into the single BO
2961 * in the BCL of the cloned job.
2962 */
2963 clone->clone_owns_bcl = true;
2964 v3dv_cl_init(clone, &clone->bcl);
2965 v3dv_cl_ensure_space(&clone->bcl, bcl_size, 4);
2966 if (!clone->bcl.bo)
2967 return NULL;
2968
2969 assert(clone->bcl.base);
2970 assert(clone->bcl.base == clone->bcl.next);
2971
2972 /* Unlink this job from the command buffer's execution list */
2973 list_inithead(&clone->list_link);
2974
2975 /* Copy the contents of each BO in the original job's BCL into the single
2976 * BO we have in the clone's BCL.
2977 *
2978 * If the BO is the last in the BCL (which we can tell because it wouldn't
2979 * have emitted a BRANCH instruction to link to another BO) we need to copy
2980 * up to the current BCL offset, otherwise we need to copy up to the BRANCH
2981 * instruction (excluded, since we are putting everything together into a
2982 * single BO here).
2983 */
2984 list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
2985 assert(bo->map);
2986 uint32_t copy_size;
2987 if (bo->cl_branch_offset == 0xffffffff) { /* Last BO in BCL */
2988 assert(bo == list_last_entry(&job->bcl.bo_list, struct v3dv_bo, list_link));
2989 copy_size = v3dv_cl_offset(&job->bcl);
2990 } else {
2991 assert(bo->cl_branch_offset >= cl_packet_length(BRANCH));
2992 copy_size = bo->cl_branch_offset - cl_packet_length(BRANCH);
2993 }
2994
2995 assert(v3dv_cl_offset(&job->bcl) + copy_size < bcl_size);
2996 memcpy(cl_start(&clone->bcl), bo->map, copy_size);
2997 cl_advance_and_end(&clone->bcl, copy_size);
2998 }
2999
3000 /* Now we need to fixup the pointer to the suspend BRANCH instruction at the
3001 * end of the BCL so it points to the address in the new BCL. We know that
3002 * to suspend a command buffer we always emit a BRANCH+NOP combo, so we just
3003     * need to go back that many bytes into the BCL to find the instruction.
3004 */
3005 uint32_t suspend_terminator_size =
3006 cl_packet_length(BRANCH) + cl_packet_length(NOP);
3007 clone->suspend_branch_inst_ptr = (struct v3dv_cl_out *)
3008 (((uint8_t *)cl_start(&clone->bcl)) - suspend_terminator_size);
3009 assert(*(((uint8_t *)clone->suspend_branch_inst_ptr)) == V3DX(BRANCH_opcode));
3010
3011 /* This job is not in the execution list of the command buffer so it
3012 * won't be destroyed with it; add it as a private object to get it freed.
3013 *
3014 * FIXME: every time this job is submitted we clone the job and we only
3015 * destroy it when the command buffer is destroyed. If the user keeps the
3016 * command buffer for the entire lifetime of the application, this command
3017 * buffer could grow significantly, so maybe we want to do something smarter
3018 * like having a syncobj bound to these jobs and every time we submit the
3019     * command buffer again we first check these syncobjs to see if we can free
3020 * some of these clones so we avoid blowing up memory.
3021 */
3022 v3dv_cmd_buffer_add_private_obj(
3023 job->cmd_buffer, (uintptr_t)clone,
3024 (v3dv_cmd_buffer_private_obj_destroy_cb)job_destroy_cb);
3025
3026 return clone;
3027 }
3028