/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "broadcom/common/v3d_macros.h"
#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"

#include "util/half_float.h"
#include "util/u_pack_color.h"
#include "vk_format.h"

void
v3dX(job_emit_binning_flush)(struct v3dv_job *job)
{
   assert(job);

   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
   v3dv_return_if_oom(NULL, job);

   cl_emit(&job->bcl, FLUSH, flush);
}

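/* Rewrites the TILE_BINNING_MODE_CFG packet that was emitted by the binning
 * prolog (pointed to by job->bcl_tile_binning_mode_ptr) so the job is
 * reconfigured for double-buffer mode. Only valid for non-MSAA jobs.
 */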
void
v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
{
   assert(job->can_use_double_buffer);
   assert(job->frame_tiling.double_buffer);
   assert(!job->frame_tiling.msaa);
   assert(job->bcl_tile_binning_mode_ptr);

   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
   struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = {
      cl_packet_header(TILE_BINNING_MODE_CFG),
   };
   config.width_in_pixels = tiling->width;
   config.height_in_pixels = tiling->height;
#if V3D_VERSION == 42
   config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
   config.multisample_mode_4x = tiling->msaa;
   config.double_buffer_in_non_ms_mode = tiling->double_buffer;
   config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
#endif
#if V3D_VERSION >= 71
   config.log2_tile_width = log2_tile_size(tiling->tile_width);
   config.log2_tile_height = log2_tile_size(tiling->tile_height);
#endif

   uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
   cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
}

void
v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
                              const struct v3dv_frame_tiling *tiling,
                              uint32_t layers)
{
   /* This must go before the binning mode configuration. It is
    * required for layered framebuffers to work.
    */
   cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
      config.number_of_layers = layers;
   }

   assert(!tiling->double_buffer || !tiling->msaa);
   job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
   cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
      config.width_in_pixels = tiling->width;
      config.height_in_pixels = tiling->height;
#if V3D_VERSION == 42
      config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
      config.multisample_mode_4x = tiling->msaa;
      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
#endif
#if V3D_VERSION >= 71
      config.log2_tile_width = log2_tile_size(tiling->tile_width);
      config.log2_tile_height = log2_tile_size(tiling->tile_height);
      /* FIXME: ideally we would have this assert on the packet header (as it
       * is generic, so it also applies to GL). We would need to expand
       * gen_pack_header for that.
       */
      assert(config.log2_tile_width == config.log2_tile_height ||
             config.log2_tile_width == config.log2_tile_height + 1);
#endif
   }

   /* There's definitely nothing in the VCD cache we want. */
   cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);

   /* "Binning mode lists must have a Start Tile Binning item (6) after
    *  any prefix state data before the binning list proper starts."
    */
   cl_emit(&job->bcl, START_TILE_BINNING, bin);
}

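/* Terminates the BCL of a secondary command buffer recorded inside a render
 * pass with a RETURN_FROM_SUB_LIST, so it can be executed as a sub-list from
 * a primary command buffer.
 */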
void
v3dX(cmd_buffer_end_render_pass_secondary)(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.job);
   v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
                                    cl_packet_length(RETURN_FROM_SUB_LIST));
   v3dv_return_if_oom(cmd_buffer, NULL);
   cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
}

void
v3dX(job_emit_clip_window)(struct v3dv_job *job, const VkRect2D *rect)
{
   assert(job);

   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW));
   v3dv_return_if_oom(NULL, job);

   cl_emit(&job->bcl, CLIP_WINDOW, clip) {
      clip.clip_window_left_pixel_coordinate = rect->offset.x;
      clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
      clip.clip_window_width_in_pixels = rect->extent.width;
      clip.clip_window_height_in_pixels = rect->extent.height;
   }
}

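/* Emits a LOAD_TILE_BUFFER_GENERAL packet that loads one layer of the given
 * image view from memory into the selected tile buffer (a color render
 * target, Z, STENCIL or ZSTENCIL).
 */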
static void
cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
                                 struct v3dv_cl *cl,
                                 struct v3dv_image_view *iview,
                                 uint32_t layer,
                                 uint32_t buffer)
{
   const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;

   /* We don't support rendering to ycbcr images, so the image view should be
    * single-plane, and using a single-plane format. But note that the underlying
    * image can be a ycbcr format, as we support rendering to a specific plane
    * of an image. This is used for example on some meta_copy code paths, in
    * order to copy from/to a plane of a ycbcr image.
    */
   assert(iview->plane_count == 1);
   assert(iview->format->plane_count == 1);

   uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
   const struct v3d_resource_slice *slice =
      &image->planes[image_plane].slices[iview->vk.base_mip_level];

   uint32_t layer_offset =
      v3dv_layer_offset(image, iview->vk.base_mip_level,
                        iview->vk.base_array_layer + layer, image_plane);

   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = buffer;
      load.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);

      load.input_image_format = iview->format->planes[0].rt_type;

      /* If we create an image view with only the stencil format, we
       * re-interpret the format as RGBA8_UINT, as it is what we want in
       * general (see CreateImageView).
       *
       * However, when we are loading/storing tiles from the ZSTENCIL tile
       * buffer, we need to use the underlying DS format.
       */
      if (buffer == ZSTENCIL &&
          iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
         assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
         load.input_image_format = image->format->planes[image_plane].rt_type;
      }

      load.r_b_swap = iview->planes[0].swap_rb;
      load.channel_reverse = iview->planes[0].channel_reverse;
      load.memory_format = slice->tiling;

      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         load.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         load.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else
         load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

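/* Maps which depth/stencil aspects are needed to the TLB buffer selector
 * used by the load/store packets.
 */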
static inline uint32_t
v3dv_zs_buffer(bool depth, bool stencil)
{
   if (depth && stencil)
      return ZSTENCIL;
   else if (depth)
      return Z;
   else if (stencil)
      return STENCIL;
   return NONE;
}

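/* Emits the tile buffer loads for all attachments of the current subpass
 * that need their previous contents, followed by the END_OF_LOADS marker.
 */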
static void
cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_cl *cl,
                                  uint32_t layer)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];

   assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      uint32_t attachment_idx = subpass->color_attachments[i].attachment;

      if (attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      const struct v3dv_render_pass_attachment *attachment =
         &state->pass->attachments[attachment_idx];

      /* According to the Vulkan spec:
       *
       *    "The load operation for each sample in an attachment happens before
       *     any recorded command which accesses the sample in the first subpass
       *     where the attachment is used."
       *
       * If the load operation is CLEAR, we must only clear once on the first
       * subpass that uses the attachment (and in that case we don't LOAD).
       * After that, we always want to load so we don't lose any rendering done
       * by a previous subpass to the same attachment. We also want to load
       * if the current job is continuing subpass work started by a previous
       * job, for the same reason.
       *
       * If the render area is not aligned to tile boundaries then we have
       * tiles which are partially covered by it. In this case, we need to
       * load the tiles so we can preserve the pixels that are outside the
       * render area for any such tiles.
       */
      uint32_t first_subpass = !pass->multiview_enabled ?
         attachment->first_subpass :
         attachment->views[layer].first_subpass;

      uint32_t last_subpass = !pass->multiview_enabled ?
         attachment->last_subpass :
         attachment->views[layer].last_subpass;

      bool needs_load =
         v3dv_cmd_buffer_check_needs_load(state,
                                          VK_IMAGE_ASPECT_COLOR_BIT,
                                          first_subpass,
                                          attachment->desc.loadOp,
                                          last_subpass,
                                          attachment->desc.storeOp);
      if (needs_load) {
         struct v3dv_image_view *iview =
            state->attachments[attachment_idx].image_view;
         cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview,
                                          layer, RENDER_TARGET_0 + i);
      }
   }

   uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
      const struct v3dv_render_pass_attachment *ds_attachment =
         &state->pass->attachments[ds_attachment_idx];

      const VkImageAspectFlags ds_aspects =
         vk_format_aspects(ds_attachment->desc.format);

      uint32_t ds_first_subpass = !pass->multiview_enabled ?
         ds_attachment->first_subpass :
         ds_attachment->views[layer].first_subpass;

      uint32_t ds_last_subpass = !pass->multiview_enabled ?
         ds_attachment->last_subpass :
         ds_attachment->views[layer].last_subpass;

      const bool needs_depth_load =
         v3dv_cmd_buffer_check_needs_load(state,
                                          ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                                          ds_first_subpass,
                                          ds_attachment->desc.loadOp,
                                          ds_last_subpass,
                                          ds_attachment->desc.storeOp);

      const bool needs_stencil_load =
         v3dv_cmd_buffer_check_needs_load(state,
                                          ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
                                          ds_first_subpass,
                                          ds_attachment->desc.stencilLoadOp,
                                          ds_last_subpass,
                                          ds_attachment->desc.stencilStoreOp);

      if (needs_depth_load || needs_stencil_load) {
         struct v3dv_image_view *iview =
            state->attachments[ds_attachment_idx].image_view;
         /* From the Vulkan spec:
          *
          *   "When an image view of a depth/stencil image is used as a
          *   depth/stencil framebuffer attachment, the aspectMask is ignored
          *   and both depth and stencil image subresources are used."
          *
          * So we ignore the aspects from the subresource range of the image
          * view for the depth/stencil attachment, but we still need to restrict
          * to the aspects compatible with the render pass and the image.
          */
         const uint32_t zs_buffer =
            v3dv_zs_buffer(needs_depth_load, needs_stencil_load);
         cmd_buffer_render_pass_emit_load(cmd_buffer, cl,
                                          iview, layer, zs_buffer);
      }
   }

   cl_emit(cl, END_OF_LOADS, end);
}

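/* Emits a STORE_TILE_BUFFER_GENERAL packet that stores one layer of the
 * given attachment from the tile buffer to memory, optionally clearing the
 * tile buffer after the store and/or resolving multisampled contents.
 */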
static void
cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
                                  struct v3dv_cl *cl,
                                  uint32_t attachment_idx,
                                  uint32_t layer,
                                  uint32_t buffer,
                                  bool clear,
                                  bool is_multisample_resolve)
{
   const struct v3dv_image_view *iview =
      cmd_buffer->state.attachments[attachment_idx].image_view;
   const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;

   /* We don't support rendering to ycbcr images, so the image view should be
    * single-plane, and using a single-plane format. But note that the underlying
    * image can be a ycbcr format, as we support rendering to a specific plane
    * of an image. This is used for example on some meta_copy code paths, in
    * order to copy from/to a plane of a ycbcr image.
    */
   assert(iview->plane_count == 1);
   assert(iview->format->plane_count == 1);

   uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
   const struct v3d_resource_slice *slice =
      &image->planes[image_plane].slices[iview->vk.base_mip_level];
   uint32_t layer_offset = v3dv_layer_offset(image,
                                             iview->vk.base_mip_level,
                                             iview->vk.base_array_layer + layer,
                                             image_plane);

   /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
    * is broken in earlier V3D versions.
    */
   assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);

   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
      store.buffer_to_store = buffer;
      store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
      store.clear_buffer_being_stored = clear;

      store.output_image_format = iview->format->planes[0].rt_type;

      /* If we create an image view with only the stencil format, we
       * re-interpret the format as RGBA8_UINT, as it is what we want in
       * general (see CreateImageView).
       *
       * However, when we are loading/storing tiles from the ZSTENCIL tile
       * buffer, we need to use the underlying DS format.
       */
      if (buffer == ZSTENCIL &&
          iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
         assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
         store.output_image_format = image->format->planes[image_plane].rt_type;
      }

      store.r_b_swap = iview->planes[0].swap_rb;
      store.channel_reverse = iview->planes[0].channel_reverse;
      store.memory_format = slice->tiling;

      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         store.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         store.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else if (is_multisample_resolve)
         store.decimate_mode = V3D_DECIMATE_MODE_4X;
      else
         store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

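/* Returns true if the attachment aspect must be cleared via the TLB in this
 * job: the aspect exists, it is not cleared with a draw call, this job starts
 * the first subpass that uses the attachment, the render area is tile-aligned
 * and the load operation is CLEAR.
 */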
static bool
check_needs_clear(const struct v3dv_cmd_buffer_state *state,
                  VkImageAspectFlags aspect,
                  uint32_t first_subpass_idx,
                  VkAttachmentLoadOp load_op,
                  bool do_clear_with_draw)
{
   /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
    * testing does not exist in the image.
    */
   if (!aspect)
      return false;

   /* If the aspect needs to be cleared with a draw call then we won't emit
    * the clear here.
    */
   if (do_clear_with_draw)
      return false;

   /* If this is resuming a subpass started with another job, then attachment
    * load operations don't apply.
    */
   if (state->job->is_subpass_continue)
      return false;

   /* If the render area is not aligned to tile boundaries we can't use the
    * TLB for a clear.
    */
   if (!state->tile_aligned_render_area)
      return false;

   /* If this job is running in a subpass other than the first subpass in
    * which this attachment (or view) is used then attachment load operations
    * don't apply.
    */
   if (state->job->first_subpass != first_subpass_idx)
      return false;

   /* The attachment load operation must be CLEAR */
   return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR;
}

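/* Emits the tile buffer stores (and TLB clears, where applicable) for all
 * attachments of the current subpass, including any resolve attachments. At
 * least one store packet is always emitted, even if it is a dummy store.
 */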
static void
cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_cl *cl,
                                   uint32_t layer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass =
      &pass->subpasses[state->subpass_idx];

   bool has_stores = false;
   bool use_global_zs_clear = false;
   bool use_global_rt_clear = false;

   assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);

   /* FIXME: separate stencil */
   uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
      const struct v3dv_render_pass_attachment *ds_attachment =
         &state->pass->attachments[ds_attachment_idx];

      assert(state->job->first_subpass >= ds_attachment->first_subpass);
      assert(state->subpass_idx >= ds_attachment->first_subpass);
      assert(state->subpass_idx <= ds_attachment->last_subpass);

      /* From the Vulkan spec, VkImageSubresourceRange:
       *
       *   "When an image view of a depth/stencil image is used as a
       *   depth/stencil framebuffer attachment, the aspectMask is ignored
       *   and both depth and stencil image subresources are used."
       *
       * So we ignore the aspects from the subresource range of the image
       * view for the depth/stencil attachment, but we still need to restrict
       * to the aspects compatible with the render pass and the image.
       */
      const VkImageAspectFlags aspects =
         vk_format_aspects(ds_attachment->desc.format);

#if V3D_VERSION <= 42
      /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
       * for depth/stencil.
       *
       * There used to be some confusion regarding the Clear Tile Buffers
       * Z/S bit also being broken, but we confirmed with Broadcom that this
       * is not the case, it was just that some other hardware bugs (that we
       * need to work around, such as GFXH-1461) could cause this bit to behave
       * incorrectly.
       *
       * There used to be another issue where the RTs bit in the Clear Tile
       * Buffers packet also cleared Z/S, but Broadcom confirmed this is
       * fixed since V3D 4.1.
       *
       * So if we have to emit a clear of depth or stencil we don't use
       * the per-buffer store clear bit, even if we need to store the buffers,
       * instead we always have to use the Clear Tile Buffers Z/S bit.
       * If we have configured the job to do early Z/S clearing, then we
       * don't want to emit any Clear Tile Buffers command at all here.
       *
       * Note that GFXH-1689 is not reproduced in the simulator, where
       * using the clear buffer bit in depth/stencil stores works fine.
       */

      /* Only clear once on the first subpass that uses the attachment */
      uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
         ds_attachment->first_subpass :
         ds_attachment->views[layer].first_subpass;

      bool needs_depth_clear =
         check_needs_clear(state,
                           aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                           ds_first_subpass,
                           ds_attachment->desc.loadOp,
                           subpass->do_depth_clear_with_draw);

      bool needs_stencil_clear =
         check_needs_clear(state,
                           aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
                           ds_first_subpass,
                           ds_attachment->desc.stencilLoadOp,
                           subpass->do_stencil_clear_with_draw);

      use_global_zs_clear = !state->job->early_zs_clear &&
         (needs_depth_clear || needs_stencil_clear);
#endif
#if V3D_VERSION >= 71
      /* The store command's clear buffer bit cannot be used for Z/S on 7.x.
       * Since V3D 4.5.6, Z/S buffers are automatically cleared between tiles
       * anyway, so we don't want to emit redundant clears here.
       */
      use_global_zs_clear = false;
#endif

      /* Skip the last store if it is not required */
      uint32_t ds_last_subpass = !pass->multiview_enabled ?
         ds_attachment->last_subpass :
         ds_attachment->views[layer].last_subpass;

      bool needs_depth_store =
         v3dv_cmd_buffer_check_needs_store(state,
                                           aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                                           ds_last_subpass,
                                           ds_attachment->desc.storeOp);

      bool needs_stencil_store =
         v3dv_cmd_buffer_check_needs_store(state,
                                           aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
                                           ds_last_subpass,
                                           ds_attachment->desc.stencilStoreOp);

      /* If we have a resolve, handle it before storing the tile */
      const struct v3dv_cmd_buffer_attachment_state *ds_att_state =
         &state->attachments[ds_attachment_idx];
      if (ds_att_state->use_tlb_resolve) {
         assert(ds_att_state->has_resolve);
         assert(subpass->resolve_depth || subpass->resolve_stencil);
         const uint32_t resolve_attachment_idx =
            subpass->ds_resolve_attachment.attachment;
         assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED);

         const uint32_t zs_buffer =
            v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil);
         cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
                                           resolve_attachment_idx, layer,
                                           zs_buffer,
                                           false, false);
         has_stores = true;
      } else if (ds_att_state->has_resolve) {
         /* If we can't use the TLB to implement the resolve we will need to
          * store the attachment so we can implement it later using a blit.
          */
         needs_depth_store = subpass->resolve_depth;
         needs_stencil_store = subpass->resolve_stencil;
      }

      if (needs_depth_store || needs_stencil_store) {
         const uint32_t zs_buffer =
            v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
         cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
                                           ds_attachment_idx, layer,
                                           zs_buffer, false, false);
         has_stores = true;
      }
   }

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      uint32_t attachment_idx = subpass->color_attachments[i].attachment;

      if (attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      const struct v3dv_render_pass_attachment *attachment =
         &state->pass->attachments[attachment_idx];

      assert(state->job->first_subpass >= attachment->first_subpass);
      assert(state->subpass_idx >= attachment->first_subpass);
      assert(state->subpass_idx <= attachment->last_subpass);

      /* Only clear once on the first subpass that uses the attachment */
      uint32_t first_subpass = !pass->multiview_enabled ?
         attachment->first_subpass :
         attachment->views[layer].first_subpass;

      bool needs_clear =
         check_needs_clear(state,
                           VK_IMAGE_ASPECT_COLOR_BIT,
                           first_subpass,
                           attachment->desc.loadOp,
                           false);

      /* Skip the last store if it is not required */
      uint32_t last_subpass = !pass->multiview_enabled ?
         attachment->last_subpass :
         attachment->views[layer].last_subpass;

      bool needs_store =
         v3dv_cmd_buffer_check_needs_store(state,
                                           VK_IMAGE_ASPECT_COLOR_BIT,
                                           last_subpass,
                                           attachment->desc.storeOp);

      /* If we need to resolve this attachment emit that store first. Notice
       * that we must not request a tile buffer clear here in that case, since
       * that would clear the tile buffer before we get to emit the actual
       * color attachment store below, since the clear happens after the
       * store is completed.
       *
       * If the attachment doesn't support TLB resolves (or the render area
       * is not aligned to tile boundaries) then we will have to fallback to
       * doing the resolve in a shader separately after this job, so we will
       * need to store the multisampled attachment even if that wasn't
       * requested by the client.
       */
      const struct v3dv_cmd_buffer_attachment_state *att_state =
         &state->attachments[attachment_idx];
      if (att_state->use_tlb_resolve) {
         assert(att_state->has_resolve);
         const uint32_t resolve_attachment_idx =
            subpass->resolve_attachments[i].attachment;
         cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
                                           resolve_attachment_idx, layer,
                                           RENDER_TARGET_0 + i,
                                           false, true);
         has_stores = true;
      } else if (att_state->has_resolve) {
         needs_store = true;
      }

      /* Emit the color attachment store if needed */
      if (needs_store) {
         cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
                                           attachment_idx, layer,
                                           RENDER_TARGET_0 + i,
                                           needs_clear && !use_global_rt_clear,
                                           false);
         has_stores = true;
      } else if (needs_clear) {
         use_global_rt_clear = true;
      }
   }

   /* We always need to emit at least one dummy store */
   if (!has_stores) {
      cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
         store.buffer_to_store = NONE;
      }
   }

   /* If we have any depth/stencil clears we can't use the per-buffer clear
    * bit and instead we have to emit a single clear of all tile buffers.
    */
   if (use_global_zs_clear || use_global_rt_clear) {
#if V3D_VERSION == 42
      cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
         clear.clear_z_stencil_buffer = use_global_zs_clear;
         clear.clear_all_render_targets = use_global_rt_clear;
      }
#endif
#if V3D_VERSION >= 71
      cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
#endif
   }
}

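/* Emits the generic (per-tile) list for the current subpass into the job's
 * indirect CL: implicit tile coordinates, the attachment loads, a branch to
 * the binned tile list, the attachment stores and the end-of-tile marker.
 * The RCL references this list via START_ADDRESS_OF_GENERIC_TILE_LIST and
 * runs it for each tile.
 */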
static void
cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer,
                                         uint32_t layer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   /* Emit the generic list in our indirect state -- the rcl will just
    * have pointers into it.
    */
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(cmd_buffer, NULL);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer);

   /* The binner starts out writing tiles assuming that the initial mode
    * is triangles, so make sure that's the case.
    */
   cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
      fmt.primitive_type = LIST_TRIANGLES;
   }

   /* The PTB assumes this value is 0, but the HW will not set it. */
   cl_emit(cl, SET_INSTANCEID, set) {
      set.instance_id = 0;
   }

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

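/* Emits the RCL commands for one layer of the framebuffer: the tile list
 * base address for the layer, the per-tile generic list and the supertile
 * coordinates that cover the render area.
 */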
static void
cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
                                      uint32_t layer)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   struct v3dv_job *job = cmd_buffer->state.job;
   struct v3dv_cl *rcl = &job->rcl;

   /* If doing multicore binning, we would need to initialize each
    * core's tile list here.
    */
   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
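   /* Each layer gets its own region of the tile allocation BO: 64 bytes of
    * initial tile list space per tile, matching the 64B initial block size
    * programmed in TILE_LIST_INITIAL_BLOCK_SIZE.
    */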
   const uint32_t tile_alloc_offset =
      64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
   cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
   }

   cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);

   uint32_t supertile_w_in_pixels =
      tiling->tile_width * tiling->supertile_width;
   uint32_t supertile_h_in_pixels =
      tiling->tile_height * tiling->supertile_height;
   const uint32_t min_x_supertile =
      state->render_area.offset.x / supertile_w_in_pixels;
   const uint32_t min_y_supertile =
      state->render_area.offset.y / supertile_h_in_pixels;

   uint32_t max_render_x = state->render_area.offset.x;
   if (state->render_area.extent.width > 0)
      max_render_x += state->render_area.extent.width - 1;
   uint32_t max_render_y = state->render_area.offset.y;
   if (state->render_area.extent.height > 0)
      max_render_y += state->render_area.extent.height - 1;
   const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
   const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;

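   /* Emit coordinates for every supertile that overlaps the render area; the
    * hardware renders the tiles of each supertile listed here.
    */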
   for (int y = min_y_supertile; y <= max_y_supertile; y++) {
      for (int x = min_x_supertile; x <= max_x_supertile; x++) {
         cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
            coords.column_number_in_supertiles = x;
            coords.row_number_in_supertiles = y;
         }
      }
   }
}

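/* Chooses the early-Z configuration for the RCL from the EZ state gathered
 * while recording the draw calls in this job.
 */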
static void
set_rcl_early_z_config(struct v3dv_job *job,
                       bool *early_z_disable,
                       uint32_t *early_z_test_and_update_direction)
{
   /* Disable if none of the draw calls in this job enabled EZ */
   if (!job->has_ez_draws) {
      *early_z_disable = true;
      return;
   }

   switch (job->first_ez_state) {
   case V3D_EZ_UNDECIDED:
   case V3D_EZ_LT_LE:
      *early_z_disable = false;
      *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
      break;
   case V3D_EZ_GT_GE:
      *early_z_disable = false;
      *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
      break;
   case V3D_EZ_DISABLED:
      *early_z_disable = true;
      break;
   }
}

/* Note that for v71, render target cfg packets have just one field that
 * combines the internal type and clamp mode. For simplicity we keep just one
 * helper.
 *
 * Note: rt_type is in fact a "enum V3DX(Internal_Type)".
 *
 * FIXME: for v71 we are not returning all the possible combinations for
 * render target internal type and clamp. For example for int types we are
 * always using clamp int, and for 16f we are using clamp none or pos (that
 * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. In
 * summary right now we are just porting what we were doing on 4.2.
 */
uint32_t
v3dX(clamp_for_format_and_type)(uint32_t rt_type,
                                VkFormat vk_format)
{
#if V3D_VERSION == 42
   if (vk_format_is_int(vk_format))
      return V3D_RENDER_TARGET_CLAMP_INT;
   else if (vk_format_is_srgb(vk_format))
      return V3D_RENDER_TARGET_CLAMP_NORM;
   else
      return V3D_RENDER_TARGET_CLAMP_NONE;
#endif
#if V3D_VERSION >= 71
   switch (rt_type) {
   case V3D_INTERNAL_TYPE_8I:
      return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
   case V3D_INTERNAL_TYPE_8UI:
      return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
   case V3D_INTERNAL_TYPE_8:
      return V3D_RENDER_TARGET_TYPE_CLAMP_8;
   case V3D_INTERNAL_TYPE_16I:
      return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
   case V3D_INTERNAL_TYPE_16UI:
      return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
   case V3D_INTERNAL_TYPE_16F:
      return vk_format_is_srgb(vk_format) ?
         V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
         V3D_RENDER_TARGET_TYPE_CLAMP_16F;
   case V3D_INTERNAL_TYPE_32I:
      return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
   case V3D_INTERNAL_TYPE_32UI:
      return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
   case V3D_INTERNAL_TYPE_32F:
      return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
   default:
      unreachable("Unknown internal render target type");
   }

   return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
#endif
}

static void
cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
                                           int rt,
                                           uint32_t *rt_bpp,
#if V3D_VERSION == 42
                                           uint32_t *rt_type,
                                           uint32_t *rt_clamp)
#else
                                           uint32_t *rt_type_clamp)
#endif
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   assert(state->subpass_idx < state->pass->subpass_count);
   const struct v3dv_subpass *subpass =
      &state->pass->subpasses[state->subpass_idx];

   if (rt >= subpass->color_count)
      return;

   struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
   const uint32_t attachment_idx = attachment->attachment;
   if (attachment_idx == VK_ATTACHMENT_UNUSED)
      return;

   assert(attachment_idx < state->framebuffer->attachment_count &&
          attachment_idx < state->attachment_alloc_count);
   struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
   assert(vk_format_is_color(iview->vk.format));

   assert(iview->plane_count == 1);
   *rt_bpp = iview->planes[0].internal_bpp;
#if V3D_VERSION == 42
   *rt_type = iview->planes[0].internal_type;
   *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
                                               iview->vk.format);
#endif
#if V3D_VERSION >= 71
   *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
                                                    iview->vk.format);
#endif
}

void
v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_framebuffer *framebuffer = state->framebuffer;

   /* We can't emit the RCL until we have a framebuffer, which we may not have
    * if we are recording a secondary command buffer. In that case, we will
    * have to wait until vkCmdExecuteCommands is called from a primary command
    * buffer.
    */
   if (!framebuffer) {
      assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
      return;
   }

   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   const uint32_t fb_layers = job->frame_tiling.layers;

   v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
                                    MAX2(fb_layers, 1) * 256 *
                                    cl_packet_length(SUPERTILE_COORDINATES));
   v3dv_return_if_oom(cmd_buffer, NULL);

   assert(state->subpass_idx < state->pass->subpass_count);
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
   struct v3dv_cl *rcl = &job->rcl;

   /* Common config must be the first TILE_RENDERING_MODE_CFG and
    * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
    * updates to the previous HW state.
    */
   bool do_early_zs_clear = false;
   const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
   assert(!tiling->msaa || !tiling->double_buffer);
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
      config.image_width_pixels = framebuffer->width;
      config.image_height_pixels = framebuffer->height;
      config.number_of_render_targets = MAX2(subpass->color_count, 1);
      config.multisample_mode_4x = tiling->msaa;
      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
#if V3D_VERSION == 42
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
#endif
#if V3D_VERSION >= 71
      config.log2_tile_width = log2_tile_size(tiling->tile_width);
      config.log2_tile_height = log2_tile_size(tiling->tile_height);
      /* FIXME: ideally we would have this assert on the packet header (as it
       * is generic, so it also applies to GL). We would need to expand
       * gen_pack_header for that.
       */
      assert(config.log2_tile_width == config.log2_tile_height ||
             config.log2_tile_width == config.log2_tile_height + 1);
#endif

      if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
         const struct v3dv_image_view *iview =
            state->attachments[ds_attachment_idx].image_view;

         /* At this point the image view should be single-plane. But note that
          * the underlying image can be multi-plane, and the image view refers
          * to one specific plane.
          */
         assert(iview->plane_count == 1);
         assert(iview->format->plane_count == 1);
         config.internal_depth_type = iview->planes[0].internal_type;

         set_rcl_early_z_config(job,
                                &config.early_z_disable,
                                &config.early_z_test_and_update_direction);

         /* Early-Z/S clear can be enabled if the job is clearing and not
          * storing (or loading) depth. If a stencil aspect is also present
          * we have the same requirements for it, however, in this case we
          * can accept stencil loadOp DONT_CARE as well, so instead of
          * checking that stencil is cleared we check that it is not loaded.
          *
          * Early-Z/S clearing is independent of Early Z/S testing, so it is
          * possible to enable one but not the other so long as their
          * respective requirements are met.
          *
          * From V3D 4.5.6, Z/S buffers are always cleared automatically
          * between tiles, but we still want to enable early ZS clears
          * when Z/S are not loaded or stored.
          */
         struct v3dv_render_pass_attachment *ds_attachment =
            &pass->attachments[ds_attachment_idx];

         const VkImageAspectFlags ds_aspects =
            vk_format_aspects(ds_attachment->desc.format);

         bool needs_depth_store =
            v3dv_cmd_buffer_check_needs_store(state,
                                              ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                                              ds_attachment->last_subpass,
                                              ds_attachment->desc.storeOp) ||
                                              subpass->resolve_depth;
#if V3D_VERSION <= 42
         bool needs_depth_clear =
            check_needs_clear(state,
                              ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                              ds_attachment->first_subpass,
                              ds_attachment->desc.loadOp,
                              subpass->do_depth_clear_with_draw);

         do_early_zs_clear = needs_depth_clear && !needs_depth_store;
#endif
#if V3D_VERSION >= 71
         bool needs_depth_load =
            v3dv_cmd_buffer_check_needs_load(state,
                                             ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
                                             ds_attachment->first_subpass,
                                             ds_attachment->desc.loadOp,
                                             ds_attachment->last_subpass,
                                             ds_attachment->desc.storeOp);
         do_early_zs_clear = !needs_depth_load && !needs_depth_store;
#endif

         if (do_early_zs_clear &&
             vk_format_has_stencil(ds_attachment->desc.format)) {
            bool needs_stencil_load =
               v3dv_cmd_buffer_check_needs_load(state,
                                                ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
                                                ds_attachment->first_subpass,
                                                ds_attachment->desc.stencilLoadOp,
                                                ds_attachment->last_subpass,
                                                ds_attachment->desc.stencilStoreOp);

            bool needs_stencil_store =
               v3dv_cmd_buffer_check_needs_store(state,
                                                 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
                                                 ds_attachment->last_subpass,
                                                 ds_attachment->desc.stencilStoreOp) ||
               subpass->resolve_stencil;

            do_early_zs_clear = !needs_stencil_load && !needs_stencil_store;
         }

         config.early_depth_stencil_clear = do_early_zs_clear;
      } else {
         config.early_z_disable = true;
      }
   }

   /* If we enabled early Z/S clear, then we can't emit any "Clear Tile Buffers"
    * commands with the Z/S bit set, so keep track of whether we enabled this
    * in the job so we can skip these later.
    */
   job->early_zs_clear = do_early_zs_clear;

#if V3D_VERSION >= 71
   uint32_t base_addr = 0;
#endif
   for (uint32_t i = 0; i < subpass->color_count; i++) {
      uint32_t attachment_idx = subpass->color_attachments[i].attachment;
      if (attachment_idx == VK_ATTACHMENT_UNUSED) {
#if V3D_VERSION >= 71
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
            rt.render_target_number = i;
            rt.stride = 1; /* Unused */
         }
#endif
         continue;
      }

      struct v3dv_image_view *iview =
         state->attachments[attachment_idx].image_view;
      assert(iview->plane_count == 1);

      const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;

      uint8_t plane = v3dv_plane_from_aspect(iview->vk.aspects);
      const struct v3d_resource_slice *slice =
         &image->planes[plane].slices[iview->vk.base_mip_level];

      UNUSED const uint32_t *clear_color =
         &state->attachments[attachment_idx].clear_value.color[0];

      UNUSED uint32_t clear_pad = 0;
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;

         uint32_t implicit_padded_height =
            align(framebuffer->height, uif_block_height) / uif_block_height;

         if (slice->padded_height_of_output_image_in_uif_blocks -
             implicit_padded_height >= 15) {
            clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
         }
      }

#if V3D_VERSION == 42
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
         clear.clear_color_low_32_bits = clear_color[0];
         clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
         clear.render_target_number = i;
      };

      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
            clear.clear_color_mid_low_32_bits =
               ((clear_color[1] >> 24) | (clear_color[2] << 8));
            clear.clear_color_mid_high_24_bits =
               ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
            clear.render_target_number = i;
         };
      }

      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
            clear.uif_padded_height_in_uif_blocks = clear_pad;
            clear.clear_color_high_16_bits = clear_color[3] >> 16;
            clear.render_target_number = i;
         };
      }
#endif

#if V3D_VERSION >= 71
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
         rt.clear_color_low_bits = clear_color[0];
         cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
                                                    &rt.internal_type_and_clamping);
         rt.stride =
            v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
                                                   v3d_internal_bpp_words(rt.internal_bpp));
         rt.base_address = base_addr;
         rt.render_target_number = i;

         /* base_addr in multiples of 512 bits. We divide by 8 because stride
          * is in 128-bit units, but it is packing 2 rows worth of data, so we
          * need to divide it by 2 so it is only 1 row, and then again by 4 so
          * it is in 512-bit units.
          */
         base_addr += (tiling->tile_height * rt.stride) / 8;
      }

      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
            rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
               ((uint64_t) clear_color[1]) |
               (((uint64_t) (clear_color[2] & 0xff)) << 32);
            rt.render_target_number = i;
         }
      }

      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
            rt.clear_color_top_bits = /* 56 bits (24 + 32) */
               (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
               (((uint64_t) (clear_color[3])) << 24);
            rt.render_target_number = i;
         }
      }
#endif
   }

#if V3D_VERSION >= 71
   /* If we don't have any color RTs, we still need to emit one and flag
    * it as not used using stride = 1.
    */
   if (subpass->color_count == 0) {
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
         rt.stride = 1;
      }
   }
#endif

#if V3D_VERSION == 42
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
          &rt.render_target_0_internal_type, &rt.render_target_0_clamp);
      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 1, &rt.render_target_1_internal_bpp,
          &rt.render_target_1_internal_type, &rt.render_target_1_clamp);
      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 2, &rt.render_target_2_internal_bpp,
          &rt.render_target_2_internal_type, &rt.render_target_2_clamp);
      cmd_buffer_render_pass_setup_render_target
         (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
          &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
   }
#endif

   /* Ends rendering mode config. */
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
         clear.z_clear_value =
            state->attachments[ds_attachment_idx].clear_value.z;
         clear.stencil_clear_value =
            state->attachments[ds_attachment_idx].clear_value.s;
      };
   } else {
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
         clear.z_clear_value = 1.0f;
         clear.stencil_clear_value = 0;
      };
   }

   /* Always set initial block size before the first branch, which needs
    * to match the value from binning mode config.
    */
   cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
      init.use_auto_chained_tile_lists = true;
      init.size_of_first_block_in_chained_tile_lists =
         TILE_ALLOCATION_BLOCK_SIZE_64B;
   }

   cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
      config.number_of_bin_tile_lists = 1;
      config.total_frame_width_in_tiles = tiling->draw_tiles_x;
      config.total_frame_height_in_tiles = tiling->draw_tiles_y;

      config.supertile_width_in_tiles = tiling->supertile_width;
      config.supertile_height_in_tiles = tiling->supertile_height;

      config.total_frame_width_in_supertiles =
         tiling->frame_width_in_supertiles;
      config.total_frame_height_in_supertiles =
         tiling->frame_height_in_supertiles;
   }

   /* Emit an initial clear of the tile buffers. This is necessary
    * for any buffers that should be cleared (since clearing
    * normally happens at the *end* of the generic tile list), but
    * it's also nice to clear everything so the first tile doesn't
    * inherit any contents from some previous frame.
    *
    * Also, implement the GFXH-1742 workaround. There's a race in
    * the HW between the RCL updating the TLB's internal type/size
    * and the spawning of the QPU instances using the TLB's current
    * internal type/size. To make sure the QPUs get the right
    * state, we need 1 dummy store in between internal type/size
    * changes on V3D 3.x, and 2 dummy stores on 4.x.
    */
   for (int i = 0; i < 2; i++) {
      cl_emit(rcl, TILE_COORDINATES, coords);
      cl_emit(rcl, END_OF_LOADS, end);
      cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
         store.buffer_to_store = NONE;
      }
      if (cmd_buffer->state.tile_aligned_render_area &&
          (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
#if V3D_VERSION == 42
         cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
            clear.clear_z_stencil_buffer = !job->early_zs_clear;
            clear.clear_all_render_targets = true;
         }
#endif
#if V3D_VERSION >= 71
         cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
#endif
      }
      cl_emit(rcl, END_OF_TILE_MARKER, end);
   }

   cl_emit(rcl, FLUSH_VCD_CACHE, flush);

   for (int layer = 0; layer < MAX2(1, fb_layers); layer++) {
      if (subpass->view_mask == 0 || (subpass->view_mask & (1u << layer)))
         cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer);
   }

   cl_emit(rcl, END_OF_RENDERING, end);
}

1289 void
1290 v3dX(viewport_compute_xform)(const VkViewport *viewport,
1291                             float scale[3],
1292                             float translate[3])
1293 {
1294    float x = viewport->x;
1295    float y = viewport->y;
1296    float half_width = 0.5f * viewport->width;
1297    float half_height = 0.5f * viewport->height;
1298    double n = viewport->minDepth;
1299    double f = viewport->maxDepth;
1300 
1301    scale[0] = half_width;
1302    translate[0] = half_width + x;
1303    scale[1] = half_height;
1304    translate[1] = half_height + y;
1305 
1306    scale[2] = (f - n);
1307    translate[2] = n;
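
   /* For reference (assuming the usual viewport mapping), framebuffer
    * coordinates resolve as fb = translate + ndc * scale, e.g. for a
    * 1920x1080 viewport at offset (0, 0), ndc_x = -1 maps to 0 and
    * ndc_x = 1 maps to 1920, while depth maps from [0, 1] to [n, f].
    */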
1308 
1309    /* It seems that if the scale is small enough the hardware won't clip
1310     * correctly so we work around this by choosing the smallest scale that
1311     * seems to work.
1312     *
1313     * This case is exercised by CTS:
1314     * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
1315     *
1316     * V3D 7.x fixes this by using the new
1317     * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
1318     */
1319 #if V3D_VERSION <= 42
1320    const float min_abs_scale = 0.0005f;
1321    if (fabs(scale[2]) < min_abs_scale)
1322       scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
1323 #endif
1324 }
1325 
1326 void
1327 v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
1328 {
1329    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1330    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1331    assert(pipeline);
1332 
1333    /* FIXME: right now we don't support multiViewport, so viewports[0] works
1334     * for now, but this would need to change if we allow multiple viewports.
1335     */
1336    float *vptranslate = dynamic->viewport.translate[0];
1337    float *vpscale = dynamic->viewport.scale[0];
1338 
1339    struct v3dv_job *job = cmd_buffer->state.job;
1340    assert(job);
1341 
1342    const uint32_t required_cl_size =
1343       cl_packet_length(CLIPPER_XY_SCALING) +
1344       cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
1345       cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
1346       cl_packet_length(VIEWPORT_OFFSET);
1347    v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
1348    v3dv_return_if_oom(cmd_buffer, NULL);
1349 
1350 #if V3D_VERSION == 42
1351    cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1352       clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
1353       clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
1354    }
1355 #endif
1356 #if V3D_VERSION >= 71
1357    cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1358       clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
1359       clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
1360    }
1361 #endif
1362 
1363    float translate_z, scale_z;
1364    v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0,
1365                                               &translate_z, &scale_z);
1366 
1367 #if V3D_VERSION == 42
1368    cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1369       clip.viewport_z_offset_zc_to_zs = translate_z;
1370       clip.viewport_z_scale_zc_to_zs = scale_z;
1371    }
1372 #endif
1373 
1374 #if V3D_VERSION >= 71
1375    /* If the Z scale is too small guardband clipping may not clip correctly */
1376    if (fabsf(scale_z) < 0.01f) {
1377       cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
1378          clip.viewport_z_offset_zc_to_zs = translate_z;
1379          clip.viewport_z_scale_zc_to_zs = scale_z;
1380       }
1381    } else {
1382       cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1383          clip.viewport_z_offset_zc_to_zs = translate_z;
1384          clip.viewport_z_scale_zc_to_zs = scale_z;
1385       }
1386    }
1387 #endif
1388 
1389    cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
1390       /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
1391        * we are using OpenGL's [-1, 1] instead.
1392        */
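      /* Example, assuming translate_z/scale_z match the values computed in
       * viewport_compute_xform() above: minDepth = 0.2 and maxDepth = 0.8
       * without negative_one_to_one give translate_z = 0.2 and scale_z = 0.6,
       * so minimum_zw = 0.2 and maximum_zw = 0.8.
       */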
1393       float z1 = pipeline->negative_one_to_one ? translate_z - scale_z :
1394                                                  translate_z;
1395       float z2 = translate_z + scale_z;
1396       clip.minimum_zw = MIN2(z1, z2);
1397       clip.maximum_zw = MAX2(z1, z2);
1398    }
1399 
1400    cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
1401       float vp_fine_x = vptranslate[0];
1402       float vp_fine_y = vptranslate[1];
1403       int32_t vp_coarse_x = 0;
1404       int32_t vp_coarse_y = 0;
1405 
1406       /* The fine coordinates must be unsigned, but coarse can be signed */
1407       if (unlikely(vp_fine_x < 0)) {
1408          int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64);
1409          vp_fine_x += 64.0f * blocks_64;
1410          vp_coarse_x -= blocks_64;
1411       }
1412 
1413       if (unlikely(vp_fine_y < 0)) {
1414          int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64);
1415          vp_fine_y += 64.0f * blocks_64;
1416          vp_coarse_y -= blocks_64;
1417       }
1418 
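      /* Example: vptranslate[0] = -70.0f yields blocks_64 = 2, so fine_x
       * becomes 58.0f and coarse_x becomes -2; the hardware presumably
       * recombines them as coarse * 64 + fine = -128 + 58 = -70.
       */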
1419       vp.fine_x = vp_fine_x;
1420       vp.fine_y = vp_fine_y;
1421       vp.coarse_x = vp_coarse_x;
1422       vp.coarse_y = vp_coarse_y;
1423    }
1424 
1425    BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1426                 MESA_VK_DYNAMIC_VP_VIEWPORTS);
1427 }
1428 
1429 void
1430 v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer)
1431 {
1432    struct v3dv_job *job = cmd_buffer->state.job;
1433    assert(job);
1434 
1435    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1436    struct vk_dynamic_graphics_state *dyn =
1437       &cmd_buffer->vk.dynamic_graphics_state;
1438    bool has_stencil =
1439       pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED;
1440 
1441    if (!(dyn->ds.stencil.test_enable && has_stencil))
1442       return;
1443 
1444    v3dv_cl_ensure_space_with_branch(&job->bcl,
1445                                     2 * cl_packet_length(STENCIL_CFG));
1446    v3dv_return_if_oom(cmd_buffer, NULL);
1447 
1448    bool any_dynamic_stencil_state =
1449       BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
1450       BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
1451       BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
1452       BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP);
1453 
1454    bool emitted_stencil = false;
1455    const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
1456    const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
1457 
1458    const bool needs_front_and_back = any_dynamic_stencil_state ?
1459       memcmp(front, back, sizeof(*front)) != 0 :
1460       pipeline->emit_stencil_cfg[1] == true;
1461 
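   /* When the front and back face state match we only need a single
    * STENCIL_CFG packet flagged for both faces, which is why the loop below
    * breaks after the first iteration in that case.
    */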
1462    for (uint32_t i = 0; i < 2; i++) {
1463       if (any_dynamic_stencil_state) {
1464          const struct vk_stencil_test_face_state *stencil_state =
1465             i == 0 ? front : back;
1466          /* If we have any dynamic stencil state we just emit the entire
1467           * packet, for simplicity.
1468           */
1469          cl_emit(&job->bcl, STENCIL_CFG, config) {
1470             config.front_config = !needs_front_and_back || i == 0;
1471             config.back_config = !needs_front_and_back || i == 1;
1472             config.stencil_test_mask = stencil_state->compare_mask & 0xff;
1473             config.stencil_write_mask = stencil_state->write_mask & 0xff;
1474             config.stencil_ref_value = stencil_state->reference & 0xff;
1475             config.stencil_test_function = stencil_state->op.compare;
1476             config.stencil_pass_op =
1477                v3dX(translate_stencil_op)(stencil_state->op.pass);
1478             config.depth_test_fail_op =
1479                v3dX(translate_stencil_op)(stencil_state->op.depth_fail);
1480             config.stencil_test_fail_op =
1481                v3dX(translate_stencil_op)(stencil_state->op.fail);
1482          }
1483       } else {
1484          assert(pipeline->emit_stencil_cfg[i]);
1485          cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]);
1486       }
1487       emitted_stencil = true;
1488 
1489       if (!needs_front_and_back)
1490          break;
1491    }
1492    if (emitted_stencil) {
1493       BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK);
1494       BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE);
1495       BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
1496       BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP);
1497    }
1498 }
1499 
1500 void
1501 v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
1502 {
1503    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1504    assert(pipeline);
1505    struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1506 
1507    if (!dyn->rs.depth_bias.enable)
1508       return;
1509 
1510    struct v3dv_job *job = cmd_buffer->state.job;
1511    assert(job);
1512 
1513    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET));
1514    v3dv_return_if_oom(cmd_buffer, NULL);
1515 
1516    cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
1517       bias.depth_offset_factor = dyn->rs.depth_bias.slope_factor;
1518       bias.depth_offset_units = dyn->rs.depth_bias.constant_factor;
1519 #if V3D_VERSION <= 42
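      /* The offset units appear to be expressed relative to 24-bit depth
       * precision, so for a 16-bit depth attachment we scale them by
       * 2^(24 - 16) = 256 (an assumption based on this workaround).
       */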
1520       if (pipeline->rendering_info.depth_attachment_format == VK_FORMAT_D16_UNORM)
1521          bias.depth_offset_units *= 256.0f;
1522 #endif
1523       bias.limit = dyn->rs.depth_bias.clamp;
1524    }
1525 
1526    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
1527 }
1528 
1529 void
1530 v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
1531 {
1532    /* No depthBounds support on v42, so this function is a no-op in that case.
1533     *
1534     * Note that this function is still called because v3dv_job_init flags all state
1535     * as dirty. See FIXME note in v3dv_job_init.
1536     */
1537 #if V3D_VERSION >= 71
1538    struct vk_dynamic_graphics_state *dyn =
1539       &cmd_buffer->vk.dynamic_graphics_state;
1540 
1541    if (!dyn->ds.depth.bounds_test.enable)
1542       return;
1543 
1544    struct v3dv_job *job = cmd_buffer->state.job;
1545    assert(job);
1546 
1547    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
1548    v3dv_return_if_oom(cmd_buffer, NULL);
1549 
1550    cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
1551       bounds.lower_test_limit = dyn->ds.depth.bounds_test.min;
1552       bounds.upper_test_limit = dyn->ds.depth.bounds_test.max;
1553    }
1554    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS);
1555 #endif
1556 }
1557 
1558 void
1559 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
1560 {
1561    struct v3dv_job *job = cmd_buffer->state.job;
1562    assert(job);
1563 
1564    struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1565 
1566    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH));
1567    v3dv_return_if_oom(cmd_buffer, NULL);
1568 
1569    cl_emit(&job->bcl, LINE_WIDTH, line) {
1570       line.line_width = v3dv_get_aa_line_width(cmd_buffer->state.gfx.pipeline,
1571                                                cmd_buffer);
1572    }
1573 
1574    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH);
1575 }
1576 
1577 void
1578 v3dX(cmd_buffer_emit_default_point_size)(struct v3dv_cmd_buffer *cmd_buffer)
1579 {
1580    struct v3dv_job *job = cmd_buffer->state.job;
1581    assert(job);
1582 
1583    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(POINT_SIZE));
1584    v3dv_return_if_oom(cmd_buffer, NULL);
1585 
1586    cl_emit(&job->bcl, POINT_SIZE, point) {
1587      point.point_size = 1.0f;
1588    }
1589 
1590    job->emitted_default_point_size = true;
1591 }
1592 
1593 void
1594 v3dX(cmd_buffer_emit_sample_state)(struct v3dv_cmd_buffer *cmd_buffer)
1595 {
1596    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1597    assert(pipeline);
1598 
1599    struct v3dv_job *job = cmd_buffer->state.job;
1600    assert(job);
1601 
1602    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE));
1603    v3dv_return_if_oom(cmd_buffer, NULL);
1604 
1605    cl_emit(&job->bcl, SAMPLE_STATE, state) {
1606       state.coverage = 1.0f;
1607       state.mask = pipeline->sample_mask;
1608    }
1609 }
1610 
1611 void
1612 v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
1613 {
1614    struct v3dv_job *job = cmd_buffer->state.job;
1615    assert(job);
1616 
1617    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1618    assert(pipeline);
1619 
1620    const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
1621    const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
1622 
1623    const uint32_t blend_packets_size =
1624       cl_packet_length(BLEND_ENABLES) +
1625       cl_packet_length(BLEND_CONSTANT_COLOR) +
1626       cl_packet_length(BLEND_CFG) * max_color_rts;
1627 
1628    v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
1629    v3dv_return_if_oom(cmd_buffer, NULL);
1630 
1631    if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
1632       if (pipeline->blend.enables) {
1633          cl_emit(&job->bcl, BLEND_ENABLES, enables) {
1634             enables.mask = pipeline->blend.enables;
1635          }
1636       }
1637 
1638       for (uint32_t i = 0; i < max_color_rts; i++) {
1639          if (pipeline->blend.enables & (1 << i))
1640             cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
1641       }
1642    }
1643 
1644    if (pipeline->blend.needs_color_constants) {
1645       const struct vk_dynamic_graphics_state *dyn =
1646          &cmd_buffer->vk.dynamic_graphics_state;
1647 
1648       cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
1649          color.red_f16 = _mesa_float_to_half(dyn->cb.blend_constants[0]);
1650          color.green_f16 = _mesa_float_to_half(dyn->cb.blend_constants[1]);
1651          color.blue_f16 = _mesa_float_to_half(dyn->cb.blend_constants[2]);
1652          color.alpha_f16 = _mesa_float_to_half(dyn->cb.blend_constants[3]);
1653       }
1654    }
1655 
1656    BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1657                 MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
1658 }
1659 
1660 void
1661 v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
1662 {
1663    struct v3dv_job *job = cmd_buffer->state.job;
1664    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS));
1665 
1666    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1667    struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
1668    uint32_t color_write_mask = ~v3dv_dyn->color_write_enable |
1669                                pipeline->blend.color_write_masks;
1670 
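   /* Note: both terms above appear to be in the hardware's write-disable
    * convention for COLOR_WRITE_MASKS (a set bit masks a channel off); the
    * <= 4.2 clamp below suggests 4 mask bits per render target.
    */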
1671 #if V3D_VERSION <= 42
1672    /* Only 4 RTs */
1673    color_write_mask &= 0xffff;
1674 #endif
1675 
1676    cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
1677       mask.mask = color_write_mask;
1678    }
1679 
1680    BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1681                 MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
1682 }
1683 
1684 static void
1685 emit_flat_shade_flags(struct v3dv_job *job,
1686                       int varying_offset,
1687                       uint32_t varyings,
1688                       enum V3DX(Varying_Flags_Action) lower,
1689                       enum V3DX(Varying_Flags_Action) higher)
1690 {
1691    v3dv_cl_ensure_space_with_branch(&job->bcl,
1692                                     cl_packet_length(FLAT_SHADE_FLAGS));
1693    v3dv_return_if_oom(NULL, job);
1694 
1695    cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
1696       flags.varying_offset_v0 = varying_offset;
1697       flags.flat_shade_flags_for_varyings_v024 = varyings;
1698       flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower;
1699       flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher;
1700    }
1701 }
1702 
1703 static void
1704 emit_noperspective_flags(struct v3dv_job *job,
1705                          int varying_offset,
1706                          uint32_t varyings,
1707                          enum V3DX(Varying_Flags_Action) lower,
1708                          enum V3DX(Varying_Flags_Action) higher)
1709 {
1710    v3dv_cl_ensure_space_with_branch(&job->bcl,
1711                                     cl_packet_length(NON_PERSPECTIVE_FLAGS));
1712    v3dv_return_if_oom(NULL, job);
1713 
1714    cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) {
1715       flags.varying_offset_v0 = varying_offset;
1716       flags.non_perspective_flags_for_varyings_v024 = varyings;
1717       flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower;
1718       flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher;
1719    }
1720 }
1721 
1722 static void
1723 emit_centroid_flags(struct v3dv_job *job,
1724                     int varying_offset,
1725                     uint32_t varyings,
1726                     enum V3DX(Varying_Flags_Action) lower,
1727                     enum V3DX(Varying_Flags_Action) higher)
1728 {
1729    v3dv_cl_ensure_space_with_branch(&job->bcl,
1730                                     cl_packet_length(CENTROID_FLAGS));
1731    v3dv_return_if_oom(NULL, job);
1732 
1733    cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
1734       flags.varying_offset_v0 = varying_offset;
1735       flags.centroid_flags_for_varyings_v024 = varyings;
1736       flags.action_for_centroid_flags_of_lower_numbered_varyings = lower;
1737       flags.action_for_centroid_flags_of_higher_numbered_varyings = higher;
1738    }
1739 }
1740 
1741 static bool
1742 emit_varying_flags(struct v3dv_job *job,
1743                    uint32_t num_flags,
1744                    const uint32_t *flags,
1745                    void (*flag_emit_callback)(struct v3dv_job *job,
1746                                               int varying_offset,
1747                                               uint32_t flags,
1748                                               enum V3DX(Varying_Flags_Action) lower,
1749                                               enum V3DX(Varying_Flags_Action) higher))
1750 {
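   /* Only non-zero flag words are emitted: the first packet zeroes the flags
    * of every other varying group (lower and/or higher numbered as needed)
    * and later packets leave previously written flags unchanged.
    */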
1751    bool emitted_any = false;
1752    for (int i = 0; i < num_flags; i++) {
1753       if (!flags[i])
1754          continue;
1755 
1756       if (emitted_any) {
1757          flag_emit_callback(job, i, flags[i],
1758                             V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1759                             V3D_VARYING_FLAGS_ACTION_UNCHANGED);
1760       } else if (i == 0) {
1761          flag_emit_callback(job, i, flags[i],
1762                             V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1763                             V3D_VARYING_FLAGS_ACTION_ZEROED);
1764       } else {
1765          flag_emit_callback(job, i, flags[i],
1766                             V3D_VARYING_FLAGS_ACTION_ZEROED,
1767                             V3D_VARYING_FLAGS_ACTION_ZEROED);
1768       }
1769 
1770       emitted_any = true;
1771    }
1772 
1773    return emitted_any;
1774 }
1775 
1776 void
1777 v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer)
1778 {
1779    struct v3dv_job *job = cmd_buffer->state.job;
1780    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1781 
1782    struct v3d_fs_prog_data *prog_data_fs =
1783       pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
1784 
1785    const uint32_t num_flags =
1786       ARRAY_SIZE(prog_data_fs->flat_shade_flags);
1787    const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
1788    const uint32_t *noperspective_flags =  prog_data_fs->noperspective_flags;
1789    const uint32_t *centroid_flags = prog_data_fs->centroid_flags;
1790 
1791    if (!emit_varying_flags(job, num_flags, flat_shade_flags,
1792                            emit_flat_shade_flags)) {
1793       v3dv_cl_ensure_space_with_branch(
1794          &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
1795       v3dv_return_if_oom(cmd_buffer, NULL);
1796 
1797       cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
1798    }
1799 
1800    if (!emit_varying_flags(job, num_flags, noperspective_flags,
1801                            emit_noperspective_flags)) {
1802       v3dv_cl_ensure_space_with_branch(
1803          &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
1804       v3dv_return_if_oom(cmd_buffer, NULL);
1805 
1806       cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
1807    }
1808 
1809    if (!emit_varying_flags(job, num_flags, centroid_flags,
1810                            emit_centroid_flags)) {
1811       v3dv_cl_ensure_space_with_branch(
1812          &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
1813       v3dv_return_if_oom(cmd_buffer, NULL);
1814 
1815       cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
1816    }
1817 }
1818 
1819 #if V3D_VERSION == 42
1820 /* Updates the cmd_buffer's, and its job's, early Z state tracking. Returns false if
1821  * EZ must be disabled for the current draw call.
1822  */
1823 static bool
1824 cmd_buffer_update_ez_state(struct v3dv_cmd_buffer *cmd_buffer,
1825                            struct v3dv_pipeline *pipeline)
1826 {
1827    struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1828    /* First, update the cmd_buffer's ez_state tracking. If possible we reuse
1829     * the values from the pipeline.
1830     */
1831    if (!BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP) &&
1832        !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) &&
1833        !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) &&
1834        !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
1835       cmd_buffer->state.ez_state = pipeline->ez_state;
1836       cmd_buffer->state.incompatible_ez_test =
1837          pipeline->incompatible_ez_test;
1838    } else {
1839       v3dv_compute_ez_state(dyn, pipeline,
1840                             &cmd_buffer->state.ez_state,
1841                             &cmd_buffer->state.incompatible_ez_test);
1842    }
1843 
1844    struct v3dv_job *job = cmd_buffer->state.job;
1845    assert(job);
1846    /* If first_ez_state is V3D_EZ_DISABLED it means that we have already
1847     * determined that we should disable EZ completely for all draw calls in
1848     * this job. This will cause us to disable EZ for the entire job in the
1849     * Tile Rendering Mode RCL packet and when we do that we need to make sure
1850     * we never emit a draw call in the job with EZ enabled in the CFG_BITS
1851     * packet, so ez_state must also be V3D_EZ_DISABLED.
1852     */
1853    if (job->first_ez_state == V3D_EZ_DISABLED) {
1854       assert(job->ez_state == V3D_EZ_DISABLED);
1855       return false;
1856    }
1857 
1858    /* If ez_state is V3D_EZ_DISABLED it means that we have already decided
1859     * that EZ must be disabled for the remainder of the frame.
1860     */
1861    if (job->ez_state == V3D_EZ_DISABLED)
1862       return false;
1863 
1864    /* This is part of the pre draw call handling, so we should be inside a
1865     * render pass.
1866     */
1867    assert(cmd_buffer->state.pass);
1868 
1869    /* If this is the first time we update EZ state for this job we first check
1870     * if there is anything that requires disabling it completely for the entire
1871     * job (based on state that is not related to the current draw call and
1872     * pipeline/cmd_buffer state).
1873     */
1874    if (!job->decided_global_ez_enable) {
1875       job->decided_global_ez_enable = true;
1876 
1877       struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1878       assert(state->subpass_idx < state->pass->subpass_count);
1879       struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
1880       if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
1881          job->first_ez_state = V3D_EZ_DISABLED;
1882          job->ez_state = V3D_EZ_DISABLED;
1883          return false;
1884       }
1885 
1886       /* GFXH-1918: the early-z buffer may load incorrect depth values if the
1887        * frame has odd width or height, or if the buffer is 16-bit and
1888        * multisampled.
1889        *
1890        * So we need to disable EZ in these cases.
1891        */
1892       const struct v3dv_render_pass_attachment *ds_attachment =
1893          &state->pass->attachments[subpass->ds_attachment.attachment];
1894 
1895       const VkImageAspectFlags ds_aspects =
1896          vk_format_aspects(ds_attachment->desc.format);
1897 
1898       bool needs_depth_load =
1899          v3dv_cmd_buffer_check_needs_load(state,
1900                                           ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1901                                           ds_attachment->first_subpass,
1902                                           ds_attachment->desc.loadOp,
1903                                           ds_attachment->last_subpass,
1904                                           ds_attachment->desc.storeOp);
1905 
1906       if (needs_depth_load) {
1907          if (ds_attachment->desc.format == VK_FORMAT_D16_UNORM &&
1908              ds_attachment->desc.samples != VK_SAMPLE_COUNT_1_BIT) {
1909             perf_debug("Loading depth aspect from a multisampled 16-bit "
1910                        "depth buffer disables early-Z tests.\n");
1911             job->first_ez_state = V3D_EZ_DISABLED;
1912             job->ez_state = V3D_EZ_DISABLED;
1913             return false;
1914          }
1915 
1916          struct v3dv_framebuffer *fb = state->framebuffer;
1917 
1918          if (!fb) {
1919             assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1920             perf_debug("Loading depth aspect in a secondary command buffer "
1921                        "without framebuffer info disables early-z tests.\n");
1922             job->first_ez_state = V3D_EZ_DISABLED;
1923             job->ez_state = V3D_EZ_DISABLED;
1924             return false;
1925          }
1926 
1927          if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) {
1928             perf_debug("Loading depth aspect for framebuffer with odd width "
1929                        "or height disables early-Z tests.\n");
1930             job->first_ez_state = V3D_EZ_DISABLED;
1931             job->ez_state = V3D_EZ_DISABLED;
1932             return false;
1933          }
1934       }
1935    }
1936 
1937    /* Otherwise, we can decide to selectively enable or disable EZ for draw
1938     * calls using the CFG_BITS packet based on the bound pipeline state, or
1939     * cmd_buffer state if some stencil/depth flags were dynamic.
1940     */
1941    bool disable_ez = false;
1942    bool incompatible_test = false;
1943    switch (cmd_buffer->state.ez_state) {
1944    case V3D_EZ_UNDECIDED:
1945       /* If the pipeline didn't pick a direction but didn't disable, then go
1946        * along with the current EZ state. This allows EZ optimization for Z
1947        * func == EQUAL or NEVER.
1948        */
1949       break;
1950 
1951    case V3D_EZ_LT_LE:
1952    case V3D_EZ_GT_GE:
1953       /* If the pipeline picked a direction, then it needs to match the current
1954        * direction if we've decided on one.
1955        */
1956       if (job->ez_state == V3D_EZ_UNDECIDED) {
1957          job->ez_state = cmd_buffer->state.ez_state;
1958       } else if (job->ez_state != pipeline->ez_state) {
1959          disable_ez = true;
1960          incompatible_test = true;
1961       }
1962       break;
1963 
1964    case V3D_EZ_DISABLED:
1965       disable_ez = true;
1966       incompatible_test = cmd_buffer->state.incompatible_ez_test;
1967       break;
1968    }
1969 
1970    if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) {
1971       assert(job->ez_state != V3D_EZ_DISABLED);
1972       job->first_ez_state = job->ez_state;
1973    }
1974 
1975    /* If we had to disable EZ because of an incompatible test direction and
1976     * the cmd buffer writes depth then we need to disable EZ for the rest
1977     * of the frame.
1978     */
1979    if (incompatible_test && cmd_buffer->state.z_updates_enable) {
1980       assert(disable_ez);
1981       job->ez_state = V3D_EZ_DISABLED;
1982    }
1983 
1984    if (!disable_ez)
1985       job->has_ez_draws = true;
1986 
1987    return !disable_ez;
1988 }
1989 #endif
1990 
1991 void
1992 v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
1993 {
1994    struct v3dv_job *job = cmd_buffer->state.job;
1995    assert(job);
1996 
1997    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1998    assert(pipeline);
1999 
2000    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
2001    v3dv_return_if_oom(cmd_buffer, NULL);
2002 
2003    struct vk_dynamic_graphics_state *dyn =
2004       &cmd_buffer->vk.dynamic_graphics_state;
2005 
2006    /* Disable depth/stencil if we don't have a D/S attachment */
2007    bool has_depth =
2008       pipeline->rendering_info.depth_attachment_format != VK_FORMAT_UNDEFINED;
2009    bool has_stencil =
2010       pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED;
2011 
2012    cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
2013       if (dyn->ds.depth.test_enable && has_depth) {
2014          config.z_updates_enable = dyn->ds.depth.write_enable;
2015          config.depth_test_function = dyn->ds.depth.compare_op;
2016       } else {
2017          config.depth_test_function = VK_COMPARE_OP_ALWAYS;
2018       }
2019 
2020       config.stencil_enable = dyn->ds.stencil.test_enable && has_stencil;
2021 
2022       cmd_buffer->state.z_updates_enable = config.z_updates_enable;
2023 #if V3D_VERSION == 42
2024       bool enable_ez = cmd_buffer_update_ez_state(cmd_buffer, pipeline);
2025       config.early_z_enable = enable_ez;
2026       config.early_z_updates_enable = config.early_z_enable &&
2027          cmd_buffer->state.z_updates_enable;
2028 #endif
2029 
2030       if (!dyn->rs.rasterizer_discard_enable) {
2031          assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_CULL_MODE));
2032          assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_FRONT_FACE));
2033          config.enable_forward_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT);
2034          config.enable_reverse_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT);
2035          /* Seems like the hardware is backwards regarding this setting... */
2036          config.clockwise_primitives = dyn->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
2037       }
2038 
2039       /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
2040        * feature and it shouldn't be used by any pipeline.
2041        */
2042       assert(cmd_buffer->device->devinfo.ver >= 71 ||
2043              !dyn->ds.depth.bounds_test.enable);
2044 #if V3D_VERSION >= 71
2045       config.depth_bounds_test_enable =
2046          dyn->ds.depth.bounds_test.enable && has_depth;
2047 #endif
2048 
2049       config.enable_depth_offset = dyn->rs.depth_bias.enable;
2050    }
2051 
2052    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE);
2053    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE);
2054    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
2055    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
2056    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE);
2057    BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
2058 }
2059 
2060 void
2061 v3dX(cmd_buffer_emit_occlusion_query)(struct v3dv_cmd_buffer *cmd_buffer)
2062 {
2063    struct v3dv_job *job = cmd_buffer->state.job;
2064    assert(job);
2065 
2066    v3dv_cl_ensure_space_with_branch(&job->bcl,
2067                                     cl_packet_length(OCCLUSION_QUERY_COUNTER));
2068    v3dv_return_if_oom(cmd_buffer, NULL);
2069 
2070    cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
2071       if (cmd_buffer->state.query.active_query.bo) {
2072          counter.address =
2073             v3dv_cl_address(cmd_buffer->state.query.active_query.bo,
2074                             cmd_buffer->state.query.active_query.offset);
2075       }
2076    }
2077 
2078    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2079 }
2080 
2081 static struct v3dv_job *
2082 cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer,
2083                                      bool is_bcl_barrier)
2084 {
2085    assert(cmd_buffer->state.subpass_idx != -1);
2086    v3dv_cmd_buffer_finish_job(cmd_buffer);
2087    struct v3dv_job *job =
2088       v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2089                                      cmd_buffer->state.subpass_idx);
2090    if (!job)
2091       return NULL;
2092 
2093    /* FIXME: we can do better than all barriers */
2094    job->serialize = V3DV_BARRIER_ALL;
2095    job->needs_bcl_sync = is_bcl_barrier;
2096    return job;
2097 }
2098 
2099 static void
2100 cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
2101                                           struct v3dv_cmd_buffer *secondary)
2102 {
2103    struct v3dv_cmd_buffer_state *p_state = &primary->state;
2104    struct v3dv_cmd_buffer_state *s_state = &secondary->state;
2105 
2106    const uint32_t total_state_count =
2107       p_state->query.end.used_count + s_state->query.end.used_count;
2108    v3dv_cmd_buffer_ensure_array_state(primary,
2109                                       sizeof(struct v3dv_end_query_info),
2110                                       total_state_count,
2111                                       &p_state->query.end.alloc_count,
2112                                       (void **) &p_state->query.end.states);
2113    v3dv_return_if_oom(primary, NULL);
2114 
2115    for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
2116       const struct v3dv_end_query_info *s_qstate =
2117          &secondary->state.query.end.states[i];
2118 
2119       struct v3dv_end_query_info *p_qstate =
2120          &p_state->query.end.states[p_state->query.end.used_count++];
2121 
2122       memcpy(p_qstate, s_qstate, sizeof(struct v3dv_end_query_info));
2123    }
2124 }
2125 
2126 void
2127 v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
2128                                      uint32_t cmd_buffer_count,
2129                                      const VkCommandBuffer *cmd_buffers)
2130 {
2131    assert(primary->state.job);
2132 
2133    /* Typically we postpone applying binning syncs until we see a draw call
2134     * that may actually access protected resources in the binning stage. However,
2135     * if the draw calls are recorded in a secondary command buffer and the
2136     * barriers were recorded in a primary command buffer, that won't work
2137     * and we will have to check if we need a binning sync when executing the
2138     * secondary.
2139     */
2140    struct v3dv_job *primary_job = primary->state.job;
2141    if (primary_job->serialize &&
2142        (primary->state.barrier.bcl_buffer_access ||
2143         primary->state.barrier.bcl_image_access)) {
2144       v3dv_cmd_buffer_consume_bcl_sync(primary, primary_job);
2145    }
2146 
2147    /* Emit occlusion query state if needed so the draw calls inside our
2148     * secondaries update the counters.
2149     */
2150    bool has_occlusion_query =
2151       primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2152    if (has_occlusion_query)
2153       v3dX(cmd_buffer_emit_occlusion_query)(primary);
2154 
2155    /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
2156     * pipelines used by the secondaries do, we need to re-start the primary
2157     * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
2158     */
2159    struct v3dv_barrier_state pending_barrier = { 0 };
2160    for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2161       V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2162 
2163       assert(secondary->usage_flags &
2164              VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
2165 
2166       list_for_each_entry(struct v3dv_job, secondary_job,
2167                           &secondary->jobs, list_link) {
2168          if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) {
2169             /* If the job is a CL, then we branch to it from the primary BCL.
2170              * In this case the secondary's BCL is finished with a
2171              * RETURN_FROM_SUB_LIST command to return back to the primary BCL
2172              * once we are done executing it.
2173              */
2174             assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
2175             assert(secondary_job->bcl.bo);
2176 
2177             /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
2178             STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
2179             assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
2180             assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
2181                    V3DX(RETURN_FROM_SUB_LIST_opcode));
2182 
2183             /* If this secondary has any barriers (or we had any pending barrier
2184              * to apply), then we can't just branch to it from the primary, we
2185              * need to split the primary to create a new job that can consume
2186              * the barriers first.
2187              *
2188              * FIXME: in this case, maybe just copy the secondary BCL without
2189              * the RETURN_FROM_SUB_LIST into the primary job to skip the
2190              * branch?
2191              */
2192             primary_job = primary->state.job;
2193             if (!primary_job || secondary_job->serialize ||
2194                 pending_barrier.dst_mask) {
2195                const bool needs_bcl_barrier =
2196                   secondary_job->needs_bcl_sync ||
2197                   pending_barrier.bcl_buffer_access ||
2198                   pending_barrier.bcl_image_access;
2199 
2200                primary_job =
2201                   cmd_buffer_subpass_split_for_barrier(primary,
2202                                                        needs_bcl_barrier);
2203                v3dv_return_if_oom(primary, NULL);
2204 
2205                /* Since we have created a new primary we need to re-emit
2206                 * occlusion query state.
2207                 */
2208                if (has_occlusion_query)
2209                   v3dX(cmd_buffer_emit_occlusion_query)(primary);
2210             }
2211 
2212             /* Make sure our primary job has all required BO references */
2213             set_foreach(secondary_job->bos, entry) {
2214                struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
2215                v3dv_job_add_bo(primary_job, bo);
2216             }
2217 
2218             /* Emit required branch instructions. We expect each of these
2219              * to end with a corresponding 'return from sub list' item.
2220              */
2221             list_for_each_entry(struct v3dv_bo, bcl_bo,
2222                                 &secondary_job->bcl.bo_list, list_link) {
2223                v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
2224                                                 cl_packet_length(BRANCH_TO_SUB_LIST));
2225                v3dv_return_if_oom(primary, NULL);
2226                cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
2227                   branch.address = v3dv_cl_address(bcl_bo, 0);
2228                }
2229             }
2230 
2231             if (!secondary_job->can_use_double_buffer) {
2232                primary_job->can_use_double_buffer = false;
2233             } else {
2234                primary_job->double_buffer_score.geom +=
2235                   secondary_job->double_buffer_score.geom;
2236                primary_job->double_buffer_score.render +=
2237                   secondary_job->double_buffer_score.render;
2238             }
2239             primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
2240          } else {
2241             /* This is a regular job (CPU or GPU), so just finish the current
2242              * primary job (if any) and then add the secondary job to the
2243              * primary's job list right after it.
2244              */
2245             v3dv_cmd_buffer_finish_job(primary);
2246             v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
2247             if (pending_barrier.dst_mask) {
2248                /* FIXME: do the same we do for primaries and only choose the
2249                 * relevant src masks.
2250                 */
2251                secondary_job->serialize = pending_barrier.src_mask_graphics |
2252                                           pending_barrier.src_mask_transfer |
2253                                           pending_barrier.src_mask_compute;
2254                if (pending_barrier.bcl_buffer_access ||
2255                    pending_barrier.bcl_image_access) {
2256                   secondary_job->needs_bcl_sync = true;
2257                }
2258             }
2259          }
2260 
2261          memset(&pending_barrier, 0, sizeof(pending_barrier));
2262       }
2263 
2264       /* If the secondary has recorded any vkCmdEndQuery commands, we need to
2265        * copy this state to the primary so it is processed properly when the
2266        * current primary job is finished.
2267        */
2268       cmd_buffer_copy_secondary_end_query_state(primary, secondary);
2269 
2270       /* If this secondary had any pending barrier state we will need that
2271        * barrier state consumed with whatever comes next in the primary.
2272        */
2273       assert(secondary->state.barrier.dst_mask ||
2274              (!secondary->state.barrier.bcl_buffer_access &&
2275               !secondary->state.barrier.bcl_image_access));
2276 
2277       pending_barrier = secondary->state.barrier;
2278    }
2279 
2280    if (pending_barrier.dst_mask) {
2281       v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
2282                                           &pending_barrier);
2283    }
2284 }
2285 
2286 static void
2287 emit_gs_shader_state_record(struct v3dv_job *job,
2288                             struct v3dv_bo *assembly_bo,
2289                             struct v3dv_shader_variant *gs_bin,
2290                             struct v3dv_cl_reloc gs_bin_uniforms,
2291                             struct v3dv_shader_variant *gs,
2292                             struct v3dv_cl_reloc gs_render_uniforms)
2293 {
2294    cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) {
2295       shader.geometry_bin_mode_shader_code_address =
2296          v3dv_cl_address(assembly_bo, gs_bin->assembly_offset);
2297       shader.geometry_bin_mode_shader_4_way_threadable =
2298          gs_bin->prog_data.gs->base.threads == 4;
2299       shader.geometry_bin_mode_shader_start_in_final_thread_section =
2300          gs_bin->prog_data.gs->base.single_seg;
2301 #if V3D_VERSION <= 42
2302       shader.geometry_bin_mode_shader_propagate_nans = true;
2303 #endif
2304       shader.geometry_bin_mode_shader_uniforms_address =
2305          gs_bin_uniforms;
2306 
2307       shader.geometry_render_mode_shader_code_address =
2308          v3dv_cl_address(assembly_bo, gs->assembly_offset);
2309       shader.geometry_render_mode_shader_4_way_threadable =
2310          gs->prog_data.gs->base.threads == 4;
2311       shader.geometry_render_mode_shader_start_in_final_thread_section =
2312          gs->prog_data.gs->base.single_seg;
2313 #if V3D_VERSION <= 42
2314       shader.geometry_render_mode_shader_propagate_nans = true;
2315 #endif
2316       shader.geometry_render_mode_shader_uniforms_address =
2317          gs_render_uniforms;
2318    }
2319 }
2320 
2321 static uint8_t
2322 v3d_gs_output_primitive(enum mesa_prim prim_type)
2323 {
2324     switch (prim_type) {
2325     case MESA_PRIM_POINTS:
2326         return GEOMETRY_SHADER_POINTS;
2327     case MESA_PRIM_LINE_STRIP:
2328         return GEOMETRY_SHADER_LINE_STRIP;
2329     case MESA_PRIM_TRIANGLE_STRIP:
2330         return GEOMETRY_SHADER_TRI_STRIP;
2331     default:
2332         unreachable("Unsupported primitive type");
2333     }
2334 }
2335 
2336 static void
2337 emit_tes_gs_common_params(struct v3dv_job *job,
2338                           uint8_t gs_out_prim_type,
2339                           uint8_t gs_num_invocations)
2340 {
2341    cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
2342       shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
2343       shader.tessellation_point_mode = false;
2344       shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
2345       shader.tessellation_clockwise = true;
2346       shader.tessellation_invocations = 1;
2347 
2348       shader.geometry_shader_output_format =
2349          v3d_gs_output_primitive(gs_out_prim_type);
2350       shader.geometry_shader_instances = gs_num_invocations & 0x1F;
2351    }
2352 }
2353 
2354 static uint8_t
2355 simd_width_to_gs_pack_mode(uint32_t width)
2356 {
2357    switch (width) {
2358    case 16:
2359       return V3D_PACK_MODE_16_WAY;
2360    case 8:
2361       return V3D_PACK_MODE_8_WAY;
2362    case 4:
2363       return V3D_PACK_MODE_4_WAY;
2364    case 1:
2365       return V3D_PACK_MODE_1_WAY;
2366    default:
2367       unreachable("Invalid SIMD width");
2368    };
2369 }
2370 
2371 static void
2372 emit_tes_gs_shader_params(struct v3dv_job *job,
2373                           uint32_t gs_simd,
2374                           uint32_t gs_vpm_output_size,
2375                           uint32_t gs_max_vpm_input_size_per_batch)
2376 {
2377    cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
2378       shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
2379       shader.per_patch_data_column_depth = 1;
2380       shader.tcs_output_segment_size_in_sectors = 1;
2381       shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2382       shader.tes_output_segment_size_in_sectors = 1;
2383       shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2384       shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
2385       shader.gs_output_segment_pack_mode =
2386          simd_width_to_gs_pack_mode(gs_simd);
2387       shader.tbg_max_patches_per_tcs_batch = 1;
2388       shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
2389       shader.tbg_min_tcs_output_segments_required_in_play = 1;
2390       shader.tbg_min_per_patch_data_segments_required_in_play = 1;
2391       shader.tpg_max_patches_per_tes_batch = 1;
2392       shader.tpg_max_vertex_segments_per_tes_batch = 0;
2393       shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
2394       shader.tpg_min_tes_output_segments_required_in_play = 1;
2395       shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
2396          gs_max_vpm_input_size_per_batch;
2397       shader.gbg_min_gs_output_segments_required_in_play = 1;
2398    }
2399 }
2400 
2401 void
2402 v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
2403 {
2404    struct v3dv_job *job = cmd_buffer->state.job;
2405    assert(job);
2406 
2407    struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2408    struct v3dv_pipeline *pipeline = state->gfx.pipeline;
2409    assert(pipeline);
2410 
2411    struct v3dv_shader_variant *vs_variant =
2412       pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2413    struct v3d_vs_prog_data *prog_data_vs = vs_variant->prog_data.vs;
2414 
2415    struct v3dv_shader_variant *vs_bin_variant =
2416       pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2417    struct v3d_vs_prog_data *prog_data_vs_bin = vs_bin_variant->prog_data.vs;
2418 
2419    struct v3dv_shader_variant *fs_variant =
2420       pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2421    struct v3d_fs_prog_data *prog_data_fs = fs_variant->prog_data.fs;
2422 
2423    struct v3dv_shader_variant *gs_variant = NULL;
2424    struct v3dv_shader_variant *gs_bin_variant = NULL;
2425    struct v3d_gs_prog_data *prog_data_gs = NULL;
2426    struct v3d_gs_prog_data *prog_data_gs_bin = NULL;
2427    if (pipeline->has_gs) {
2428       gs_variant =
2429          pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2430       prog_data_gs = gs_variant->prog_data.gs;
2431 
2432       gs_bin_variant =
2433          pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2434       prog_data_gs_bin = gs_bin_variant->prog_data.gs;
2435    }
2436 
2437    /* Update the cache dirty flag based on the shader progs data */
2438    job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl;
2439    job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl;
2440    job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl;
2441    if (pipeline->has_gs) {
2442       job->tmu_dirty_rcl |= prog_data_gs_bin->base.tmu_dirty_rcl;
2443       job->tmu_dirty_rcl |= prog_data_gs->base.tmu_dirty_rcl;
2444    }
2445 
2446    /* See GFXH-930 workaround below */
2447    uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
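   /* (GFXH-930 requires at least one attribute record to be emitted even if
    * the pipeline reads no vertex attributes, hence the MAX2 with 1.)
    */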
2448 
2449    uint32_t shader_state_record_length =
2450       cl_packet_length(GL_SHADER_STATE_RECORD);
2451 #if V3D_VERSION >= 71
2452    if (v3d_device_has_draw_index(&pipeline->device->devinfo)) {
2453       shader_state_record_length =
2454          cl_packet_length(GL_SHADER_STATE_RECORD_DRAW_INDEX);
2455    }
2456 #endif
2457 
2458    if (pipeline->has_gs) {
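      /* The tessellation/geometry shader params packet is emitted twice
       * below (once with the binning VPM config and once with the render
       * VPM config), hence the 2x factor here.
       */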
2459       shader_state_record_length +=
2460          cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) +
2461          cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) +
2462          2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS);
2463    }
2464 
2465    uint32_t shader_rec_offset =
2466       v3dv_cl_ensure_space(&job->indirect,
2467                            shader_state_record_length +
2468                            num_elements_to_emit *
2469                            cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
2470                            32);
2471    v3dv_return_if_oom(cmd_buffer, NULL);
2472 
2473    struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo;
2474 
2475    if (pipeline->has_gs) {
2476       emit_gs_shader_state_record(job,
2477                                   assembly_bo,
2478                                   gs_bin_variant,
2479                                   cmd_buffer->state.uniforms.gs_bin,
2480                                   gs_variant,
2481                                   cmd_buffer->state.uniforms.gs);
2482 
2483       emit_tes_gs_common_params(job,
2484                                 prog_data_gs->out_prim_type,
2485                                 prog_data_gs->num_invocations);
2486 
2487       emit_tes_gs_shader_params(job,
2488                                 pipeline->vpm_cfg_bin.gs_width,
2489                                 pipeline->vpm_cfg_bin.Gd,
2490                                 pipeline->vpm_cfg_bin.Gv);
2491 
2492       emit_tes_gs_shader_params(job,
2493                                 pipeline->vpm_cfg.gs_width,
2494                                 pipeline->vpm_cfg.Gd,
2495                                 pipeline->vpm_cfg.Gv);
2496    }
2497 
2498 #if V3D_VERSION == 42
2499    struct v3dv_bo *default_attribute_values =
2500       pipeline->default_attribute_values != NULL ?
2501       pipeline->default_attribute_values :
2502       pipeline->device->default_attribute_float;
2503 #endif
2504 
2505 #if V3D_VERSION >= 71
2506    if (v3d_device_has_draw_index(&pipeline->device->devinfo)) {
2507       cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD_DRAW_INDEX,
2508                              pipeline->shader_state_record, shader) {
2509          shader.min_coord_shader_input_segments_required_in_play =
2510             pipeline->vpm_cfg_bin.As;
2511          shader.min_vertex_shader_input_segments_required_in_play =
2512             pipeline->vpm_cfg.As;
2513          shader.coordinate_shader_code_address =
2514             v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
2515          shader.vertex_shader_code_address =
2516             v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
2517          shader.fragment_shader_code_address =
2518             v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
2519          shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
2520          shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
2521          shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
2522          shader.any_shader_reads_hardware_written_primitive_id =
2523             (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
2524          shader.insert_primitive_id_as_first_varying_to_fragment_shader =
2525             !pipeline->has_gs && prog_data_fs->uses_pid;
2526       }
2527    } else
2528 #endif
2529    {
2530       cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
2531                              pipeline->shader_state_record, shader) {
2532          /* FIXME: we are setting these values both here and during
2533           * prepacking, because cl_emit_with_prepacked and v3dvx_pack both
2534           * assert on minimum values for them. It would be good to have
2535           * v3dvx_pack assert on the final value instead, if possible.
2536           */
2537          shader.min_coord_shader_input_segments_required_in_play =
2538             pipeline->vpm_cfg_bin.As;
2539          shader.min_vertex_shader_input_segments_required_in_play =
2540             pipeline->vpm_cfg.As;
2541 
2542          shader.coordinate_shader_code_address =
2543             v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
2544          shader.vertex_shader_code_address =
2545             v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
2546          shader.fragment_shader_code_address =
2547             v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
2548 
2549          shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
2550          shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
2551          shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
2552 
2553    #if V3D_VERSION == 42
2554          shader.address_of_default_attribute_values =
2555             v3dv_cl_address(default_attribute_values, 0);
2556    #endif
2557 
2558          shader.any_shader_reads_hardware_written_primitive_id =
2559             (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
2560          shader.insert_primitive_id_as_first_varying_to_fragment_shader =
2561             !pipeline->has_gs && prog_data_fs->uses_pid;
2562       }
2563    }
2564 
2565    /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
2566    bool cs_loaded_any = false;
2567    const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
2568                                  prog_data_vs_bin->uses_biid ||
2569                                  prog_data_vs_bin->uses_vid;
2570    const uint32_t packet_length =
2571       cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
2572 
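   /* Walk the attribute array by driver location, skipping unused slots,
    * until we have emitted a record for every enabled vertex attribute.
    */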
2573    uint32_t emitted_va_count = 0;
2574    for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
2575       assert(i < MAX_VERTEX_ATTRIBS);
2576 
2577       if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
2578          continue;
2579 
2580       const uint32_t binding = pipeline->va[i].binding;
2581 
2582       /* We store each vertex attribute in the array using its driver location
2583        * as index.
2584        */
2585       const uint32_t location = i;
2586 
2587       struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];
2588 
2589       cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
2590                              &pipeline->vertex_attrs[i * packet_length], attr) {
2591 
2592          assert(c_vb->buffer->mem->bo);
2593          attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
2594                                         c_vb->buffer->mem_offset +
2595                                         pipeline->va[i].offset +
2596                                         c_vb->offset);
2597 
2598          attr.number_of_values_read_by_coordinate_shader =
2599             prog_data_vs_bin->vattr_sizes[location];
2600          attr.number_of_values_read_by_vertex_shader =
2601             prog_data_vs->vattr_sizes[location];
2602 
2603          /* GFXH-930: At least one attribute must be enabled and read by CS
2604           * and VS.  If we have attributes being consumed by the VS but not
2605           * the CS, then set up a dummy load of the last attribute into the
2606           * CS's VPM inputs.  (Since CS is just dead-code-elimination compared
2607           * to VS, we can't have CS loading but not VS).
2608           *
2609           * GFXH-1602: first attribute must be active if using builtins.
2610           */
2611          if (prog_data_vs_bin->vattr_sizes[location])
2612             cs_loaded_any = true;
2613 
2614          if (i == 0 && cs_uses_builtins && !cs_loaded_any) {
2615             attr.number_of_values_read_by_coordinate_shader = 1;
2616             cs_loaded_any = true;
2617          } else if (i == pipeline->va_count - 1 && !cs_loaded_any) {
2618             attr.number_of_values_read_by_coordinate_shader = 1;
2619             cs_loaded_any = true;
2620          }
2621 
2622          attr.stride =
2623             cmd_buffer->vk.dynamic_graphics_state.vi_binding_strides[binding];
2624 
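         /* Clamp the maximum index to what the bound vertex buffer can hold;
          * with a zero stride every vertex reads the same data, so a single
          * element is enough.
          */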
2625          attr.maximum_index = attr.stride == 0 ?
2626                               1u : MIN2(0xffffffu, c_vb->size / attr.stride);
2627       }
2628 
2629       emitted_va_count++;
2630    }
2631 
2632    if (pipeline->va_count == 0) {
2633       /* GFXH-930: At least one attribute must be enabled and read
2634        * by CS and VS.  If we have no attributes being consumed by
2635        * the shader, set up a dummy to be loaded into the VPM.
2636        */
2637       cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
2638          /* Valid address of data whose value will be unused. */
2639          attr.address = v3dv_cl_address(job->indirect.bo, 0);
2640 
2641          attr.type = ATTRIBUTE_FLOAT;
2642          attr.stride = 0;
2643          attr.vec_size = 1;
2644 
2645          attr.number_of_values_read_by_coordinate_shader = 1;
2646          attr.number_of_values_read_by_vertex_shader = 1;
2647       }
2648    }
2649 
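   /* The VCM cache size packet is prepacked in the pipeline, so we only need
    * to re-emit it when the bound pipeline changes.
    */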
2650    if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
2651       v3dv_cl_ensure_space_with_branch(&job->bcl,
2652                                        sizeof(pipeline->vcm_cache_size));
2653       v3dv_return_if_oom(cmd_buffer, NULL);
2654 
2655       cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size);
2656    }
2657 
2658    v3dv_cl_ensure_space_with_branch(&job->bcl,
2659                                     cl_packet_length(GL_SHADER_STATE));
2660    v3dv_return_if_oom(cmd_buffer, NULL);
2661 
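   /* Point the CL at the shader record we just built in the indirect buffer;
    * the INCLUDING_GS variant of the packet is used when the record carries
    * the extra geometry shader state.
    */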
2662    if (pipeline->has_gs) {
2663       cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) {
2664          state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2665          state.number_of_attribute_arrays = num_elements_to_emit;
2666       }
2667    } else {
2668       cl_emit(&job->bcl, GL_SHADER_STATE, state) {
2669          state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2670          state.number_of_attribute_arrays = num_elements_to_emit;
2671       }
2672    }
2673 
2674    /* Clearing push constants and descriptor sets for all stages is not quite
2675     * correct (some shader stages may not be used at all, or may not consume
2676     * push constants), however this is not a problem because binding a
2677     * different pipeline always requires us to rebuild the uniform streams.
2678     */
2679    cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
2680                                 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2681                                 V3DV_CMD_DIRTY_PUSH_CONSTANTS);
2682    cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2683    cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2684 }
2685 
2686 void
2687 v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer,
2688                            struct v3dv_draw_info *info)
2689 {
2690    struct v3dv_job *job = cmd_buffer->state.job;
2691    assert(job);
2692    const struct vk_dynamic_graphics_state *dyn =
2693       &cmd_buffer->vk.dynamic_graphics_state;
2694    uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2695 
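   /* A non-zero firstInstance requires emitting the base instance before the
    * primitive list; non-indexed draws always use a base vertex of 0.
    */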
2696    if (info->first_instance > 0) {
2697       v3dv_cl_ensure_space_with_branch(
2698          &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2699       v3dv_return_if_oom(cmd_buffer, NULL);
2700 
2701       cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2702          base.base_instance = info->first_instance;
2703          base.base_vertex = 0;
2704       }
2705    }
2706 
2707    if (info->instance_count > 1) {
2708       v3dv_cl_ensure_space_with_branch(
2709          &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS));
2710       v3dv_return_if_oom(cmd_buffer, NULL);
2711 
2712       cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2713          prim.mode = hw_prim_type;
2714          prim.index_of_first_vertex = info->first_vertex;
2715          prim.number_of_instances = info->instance_count;
2716          prim.instance_length = info->vertex_count;
2717       }
2718    } else {
2719       v3dv_cl_ensure_space_with_branch(
2720          &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS));
2721       v3dv_return_if_oom(cmd_buffer, NULL);
2722       cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
2723          prim.mode = hw_prim_type;
2724          prim.length = info->vertex_count;
2725          prim.index_of_first_vertex = info->first_vertex;
2726       }
2727    }
2728 }
2729 
2730 void
2731 v3dX(cmd_buffer_emit_index_buffer)(struct v3dv_cmd_buffer *cmd_buffer)
2732 {
2733    struct v3dv_job *job = cmd_buffer->state.job;
2734    assert(job);
2735 
2736    /* We flag all state as dirty when we create a new job so make sure we
2737     * have a valid index buffer before attempting to emit state for it.
2738     */
2739    struct v3dv_buffer *ibuffer =
2740       v3dv_buffer_from_handle(cmd_buffer->state.index_buffer.buffer);
2741    if (ibuffer) {
2742       v3dv_cl_ensure_space_with_branch(
2743          &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP));
2744       v3dv_return_if_oom(cmd_buffer, NULL);
2745 
2746       const uint32_t offset = ibuffer->mem_offset +
2747                               cmd_buffer->state.index_buffer.offset;
2748       assert(ibuffer->mem->bo->size >= offset);
2749       cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
2750          ib.address = v3dv_cl_address(ibuffer->mem->bo, offset);
2751          ib.size = cmd_buffer->state.index_buffer.size;
2752       }
2753    }
2754 
2755    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER;
2756 }
2757 
2758 void
2759 v3dX(cmd_buffer_emit_draw_indexed)(struct v3dv_cmd_buffer *cmd_buffer,
2760                                    uint32_t indexCount,
2761                                    uint32_t instanceCount,
2762                                    uint32_t firstIndex,
2763                                    int32_t vertexOffset,
2764                                    uint32_t firstInstance)
2765 {
2766    struct v3dv_job *job = cmd_buffer->state.job;
2767    assert(job);
2768 
2769    const struct vk_dynamic_graphics_state *dyn =
2770       &cmd_buffer->vk.dynamic_graphics_state;
2771    uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
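   /* Convert the index size in bytes (1, 2 or 4) into the log2 encoding
    * (0, 1 or 2) expected by the index_type field.
    */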
2772    uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2773    uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size;
2774 
2775    if (vertexOffset != 0 || firstInstance != 0) {
2776       v3dv_cl_ensure_space_with_branch(
2777          &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2778       v3dv_return_if_oom(cmd_buffer, NULL);
2779 
2780       cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2781          base.base_instance = firstInstance;
2782          base.base_vertex = vertexOffset;
2783       }
2784    }
2785 
2786    if (instanceCount == 1) {
2787       v3dv_cl_ensure_space_with_branch(
2788          &job->bcl, cl_packet_length(INDEXED_PRIM_LIST));
2789       v3dv_return_if_oom(cmd_buffer, NULL);
2790 
2791       cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
2792          prim.index_type = index_type;
2793          prim.length = indexCount;
2794          prim.index_offset = index_offset;
2795          prim.mode = hw_prim_type;
2796          prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2797       }
2798    } else if (instanceCount > 1) {
2799       v3dv_cl_ensure_space_with_branch(
2800          &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST));
2801       v3dv_return_if_oom(cmd_buffer, NULL);
2802 
2803       cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
2804          prim.index_type = index_type;
2805          prim.index_offset = index_offset;
2806          prim.mode = hw_prim_type;
2807          prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2808          prim.number_of_instances = instanceCount;
2809          prim.instance_length = indexCount;
2810       }
2811    }
2812 }
2813 
2814 void
2815 v3dX(cmd_buffer_emit_draw_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2816                                     struct v3dv_buffer *buffer,
2817                                     VkDeviceSize offset,
2818                                     uint32_t drawCount,
2819                                     uint32_t stride)
2820 {
2821    struct v3dv_job *job = cmd_buffer->state.job;
2822    assert(job);
2823 
2824    const struct vk_dynamic_graphics_state *dyn =
2825       &cmd_buffer->vk.dynamic_graphics_state;
2826    uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2827 
2828    v3dv_cl_ensure_space_with_branch(
2829       &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS));
2830    v3dv_return_if_oom(cmd_buffer, NULL);
2831 
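   /* A single packet covers all drawCount records: the hardware walks the
    * indirect buffer itself, with the stride expressed in 4-byte units.
    */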
2832    cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2833       prim.mode = hw_prim_type;
2834       prim.number_of_draw_indirect_array_records = drawCount;
2835       prim.stride_in_multiples_of_4_bytes = stride >> 2;
2836       prim.address = v3dv_cl_address(buffer->mem->bo,
2837                                      buffer->mem_offset + offset);
2838    }
2839 }
2840 
2841 void
2842 v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2843                                        struct v3dv_buffer *buffer,
2844                                        VkDeviceSize offset,
2845                                        uint32_t drawCount,
2846                                        uint32_t stride)
2847 {
2848    struct v3dv_job *job = cmd_buffer->state.job;
2849    assert(job);
2850 
2851    const struct vk_dynamic_graphics_state *dyn =
2852       &cmd_buffer->vk.dynamic_graphics_state;
2853    uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2854    uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2855 
2856    v3dv_cl_ensure_space_with_branch(
2857       &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST));
2858    v3dv_return_if_oom(cmd_buffer, NULL);
2859 
2860    cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
2861       prim.index_type = index_type;
2862       prim.mode = hw_prim_type;
2863       prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2864       prim.number_of_draw_indirect_indexed_records = drawCount;
2865       prim.stride_in_multiples_of_4_bytes = stride >> 2;
2866       prim.address = v3dv_cl_address(buffer->mem->bo,
2867                                      buffer->mem_offset + offset);
2868    }
2869 }
2870 
2871 void
2872 v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer)
2873 {
2874    struct v3dv_job *job = cmd_buffer->state.job;
2875    assert(job);
2876 
2877    job->suspending = true;
2878 
2879    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(BRANCH));
2880 
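   /* Emit a BRANCH with a null address as a placeholder; the actual resume
    * address is patched in later by v3dX(job_patch_resume_address).
    */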
2881    job->suspend_branch_inst_ptr = cl_start(&job->bcl);
2882    cl_emit(&job->bcl, BRANCH, branch) {
2883       branch.address = v3dv_cl_address(NULL, 0);
2884    }
2885 
2886    /* The sim complains if the command list ends with a branch */
2887    cl_emit(&job->bcl, NOP, nop);
2888 }
2889 
2890 void
2891 v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend,
2892                                struct v3dv_job *suspend,
2893                                struct v3dv_job *resume)
2894 {
2895    assert(resume && resume->resuming);
2896    assert(first_suspend && first_suspend->suspending);
2897    assert(suspend && suspend->suspending);
2898    assert(suspend->suspend_branch_inst_ptr != NULL);
2899 
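   /* Pack a new BRANCH pointing at the start of the resume job's BCL and
    * write it over the placeholder emitted when the job was suspended.
    */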
2900    struct v3dv_bo *resume_bo =
2901       list_first_entry(&resume->bcl.bo_list, struct v3dv_bo, list_link);
2902    struct cl_packet_struct(BRANCH) branch = {
2903       cl_packet_header(BRANCH),
2904    };
2905    branch.address = v3dv_cl_address(NULL, resume_bo->offset);
2906 
2907    uint8_t *rewrite_addr = (uint8_t *) suspend->suspend_branch_inst_ptr;
2908    cl_packet_pack(BRANCH)(NULL, rewrite_addr, &branch);
2909 
2910    if (resume != first_suspend) {
2911       set_foreach(resume->bos, entry) {
2912          struct v3dv_bo *bo = (void *)entry->key;
2913          v3dv_job_add_bo(first_suspend, bo);
2914       }
2915    }
2916 
2917    first_suspend->suspended_bcl_end = resume->bcl.bo->offset +
2918                                       v3dv_cl_offset(&resume->bcl);
2919 }
2920 
2921 static void
2922 job_destroy_cb(VkDevice device, uint64_t pobj, VkAllocationCallbacks *allocb)
2923 {
2924    struct v3dv_job *clone = (struct v3dv_job *) (uintptr_t) pobj;
2925    v3dv_job_destroy(clone);
2926 }
2927 
2928 /**
2929  * This checks if the command buffer has been created with
2930  * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, in which case we won't be
2931  * able to safely patch the resume address into the job (since we could have
2932  * another instance of this job running in the GPU, potentially resuming in a
2933  * different address). In that case, we clone the job and make the clone have
2934  * its own BCL copied from the original job so we can later patch the resume
2935  * address into it safely.
2936  */
2937 struct v3dv_job *
2938 v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job)
2939 {
2940    assert(job->suspending);
2941    assert(job->cmd_buffer);
2942    assert(job->type == V3DV_JOB_TYPE_GPU_CL);
2943 
2944    if (!(job->cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
2945       return job;
2946 
2947    /* Create the clone job, but skip the BCL since we are going to create
2948     * our own below.
2949     */
2950    struct v3dv_job *clone = v3dv_job_clone(job, true);
2951    if (!clone)
2952       return NULL;
2953 
2954    /* Compute total size of BCL we need to copy */
2955    uint32_t bcl_size = 0;
2956    list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link)
2957       bcl_size += bo->size;
2958 
2959    /* Prepare the BCL for the cloned job. For this we go over the BOs in the
2960     * BCL of the original job and we copy their contents into the single BO
2961     * in the BCL of the cloned job.
2962     */
2963    clone->clone_owns_bcl = true;
2964    v3dv_cl_init(clone, &clone->bcl);
2965    v3dv_cl_ensure_space(&clone->bcl, bcl_size, 4);
2966    if (!clone->bcl.bo)
2967       return NULL;
2968 
2969    assert(clone->bcl.base);
2970    assert(clone->bcl.base == clone->bcl.next);
2971 
2972    /* Unlink this job from the command buffer's execution list */
2973    list_inithead(&clone->list_link);
2974 
2975    /* Copy the contents of each BO in the original job's BCL into the single
2976     * BO we have in the clone's BCL.
2977     *
2978     * If the BO is the last in the BCL (which we can tell because it wouldn't
2979     * have emitted a BRANCH instruction to link to another BO) we need to copy
2980     * up to the current BCL offset, otherwise we need to copy up to the BRANCH
2981     * instruction (excluded, since we are putting everything together into a
2982     * single BO here).
2983     */
2984    list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
2985       assert(bo->map);
2986       uint32_t copy_size;
2987       if (bo->cl_branch_offset == 0xffffffff) { /* Last BO in BCL */
2988          assert(bo == list_last_entry(&job->bcl.bo_list, struct v3dv_bo, list_link));
2989          copy_size = v3dv_cl_offset(&job->bcl);
2990       } else {
2991          assert(bo->cl_branch_offset >= cl_packet_length(BRANCH));
2992          copy_size = bo->cl_branch_offset - cl_packet_length(BRANCH);
2993       }
2994 
2995       assert(v3dv_cl_offset(&job->bcl) + copy_size < bcl_size);
2996       memcpy(cl_start(&clone->bcl), bo->map, copy_size);
2997       cl_advance_and_end(&clone->bcl, copy_size);
2998    }
2999 
3000    /* Now we need to fix up the pointer to the suspend BRANCH instruction at
3001     * the end of the BCL so it points to the address in the new BCL. We know
3002     * that to suspend a command buffer we always emit a BRANCH+NOP combo, so
3003     * we just need to go back that many bytes into the BCL to find it.
3004     */
3005    uint32_t suspend_terminator_size =
3006       cl_packet_length(BRANCH) + cl_packet_length(NOP);
3007    clone->suspend_branch_inst_ptr = (struct v3dv_cl_out *)
3008       (((uint8_t *)cl_start(&clone->bcl)) - suspend_terminator_size);
3009    assert(*(((uint8_t *)clone->suspend_branch_inst_ptr)) == V3DX(BRANCH_opcode));
3010 
3011    /* This job is not in the execution list of the command buffer so it
3012     * won't be destroyed with it; add it as a private object to get it freed.
3013     *
3014     * FIXME: every time this job is submitted we clone it, and we only
3015     * destroy the clone when the command buffer is destroyed. If the user
3016     * keeps the command buffer around for the entire lifetime of the
3017     * application, its memory usage could grow significantly, so maybe we
3018     * want to do something smarter, like binding a syncobj to these jobs and,
3019     * every time the command buffer is submitted again, first checking these
3020     * syncobjs to see if we can free some of the clones and avoid blowing up memory.
3021     */
3022    v3dv_cmd_buffer_add_private_obj(
3023       job->cmd_buffer, (uintptr_t)clone,
3024       (v3dv_cmd_buffer_private_obj_destroy_cb)job_destroy_cb);
3025 
3026    return clone;
3027 }
3028