1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_format.h"
30 #include "vk_render_pass.h"
31 #include "vk_util.h"
32 #include "util/fast_idiv_by_const.h"
33 
34 #include "common/intel_l3_config.h"
35 #include "genxml/gen_macros.h"
36 #include "genxml/genX_pack.h"
37 #include "common/intel_guardband.h"
38 #include "compiler/elk/elk_prim.h"
39 
40 #include "nir/nir_xfb_info.h"
41 
42 #include "ds/intel_tracepoints.h"
43 
44 /* We reserve:
45  *    - GPR 14 for secondary command buffer returns
46  *    - GPR 15 for conditional rendering
47  */
48 #define MI_BUILDER_NUM_ALLOC_GPRS 14
49 #define __gen_get_batch_dwords anv_batch_emit_dwords
50 #define __gen_address_offset anv_address_add
51 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
52 #include "common/mi_builder.h"
53 
54 static void genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
55                                         uint32_t pipeline);
56 
57 static enum anv_pipe_bits
58 convert_pc_to_bits(struct GENX(PIPE_CONTROL) *pc) {
59    enum anv_pipe_bits bits = 0;
60    bits |= (pc->DepthCacheFlushEnable) ?  ANV_PIPE_DEPTH_CACHE_FLUSH_BIT : 0;
61    bits |= (pc->DCFlushEnable) ?  ANV_PIPE_DATA_CACHE_FLUSH_BIT : 0;
62    bits |= (pc->RenderTargetCacheFlushEnable) ?  ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT : 0;
63    bits |= (pc->VFCacheInvalidationEnable) ?  ANV_PIPE_VF_CACHE_INVALIDATE_BIT : 0;
64    bits |= (pc->StateCacheInvalidationEnable) ?  ANV_PIPE_STATE_CACHE_INVALIDATE_BIT : 0;
65    bits |= (pc->ConstantCacheInvalidationEnable) ?  ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT : 0;
66    bits |= (pc->TextureCacheInvalidationEnable) ?  ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT : 0;
67    bits |= (pc->InstructionCacheInvalidateEnable) ?  ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT : 0;
68    bits |= (pc->StallAtPixelScoreboard) ?  ANV_PIPE_STALL_AT_SCOREBOARD_BIT : 0;
69    bits |= (pc->DepthStallEnable) ?  ANV_PIPE_DEPTH_STALL_BIT : 0;
70    bits |= (pc->CommandStreamerStallEnable) ?  ANV_PIPE_CS_STALL_BIT : 0;
71    return bits;
72 }
73 
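/* With INTEL_DEBUG(DEBUG_PIPE_CONTROL) enabled, log the flush/invalidate bits
 * implied by each PIPE_CONTROL we emit, together with the emitting function.
 */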
74 #define anv_debug_dump_pc(pc) \
75    if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) { \
76       fputs("pc: emit PC=( ", stderr); \
77       anv_dump_pipe_bits(convert_pc_to_bits(&(pc))); \
78       fprintf(stderr, ") reason: %s\n", __func__); \
79    }
80 
81 static bool
82 is_render_queue_cmd_buffer(const struct anv_cmd_buffer *cmd_buffer)
83 {
84    struct anv_queue_family *queue_family = cmd_buffer->queue_family;
85    return (queue_family->queueFlags & VK_QUEUE_GRAPHICS_BIT) != 0;
86 }
87 
88 void
89 genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
90 {
91    struct anv_device *device = cmd_buffer->device;
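   /* Default MOCS (no special usage flags, non-external surface); used for
    * every base address programmed below.
    */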
92    uint32_t mocs = isl_mocs(&device->isl_dev, 0, false);
93 
94    /* If we are emitting a new state base address we probably need to re-emit
95     * binding tables.
96     */
97    cmd_buffer->state.descriptors_dirty |= ~0;
98 
99    /* Emit a render target cache flush.
100     *
101     * This isn't documented anywhere in the PRM.  However, it seems to be
102     * necessary prior to changing the surface state base address.  Without
103     * this, we get GPU hangs when using multi-level command buffers which
104     * clear depth, reset state base address, and then go render stuff.
105     */
106    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
107       pc.DCFlushEnable = true;
108       pc.RenderTargetCacheFlushEnable = true;
109       pc.CommandStreamerStallEnable = true;
110       anv_debug_dump_pc(pc);
111    }
112 
113    anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BASE_ADDRESS), sba) {
114       sba.GeneralStateBaseAddress = (struct anv_address) { NULL, 0 };
115       sba.GeneralStateMOCS = mocs;
116       sba.GeneralStateBaseAddressModifyEnable = true;
117 
118       sba.StatelessDataPortAccessMOCS = mocs;
119 
120       sba.SurfaceStateBaseAddress =
121          anv_cmd_buffer_surface_base_address(cmd_buffer);
122       sba.SurfaceStateMOCS = mocs;
123       sba.SurfaceStateBaseAddressModifyEnable = true;
124 
125       sba.DynamicStateBaseAddress =
126          (struct anv_address) { device->dynamic_state_pool.block_pool.bo, 0 };
127       sba.DynamicStateMOCS = mocs;
128       sba.DynamicStateBaseAddressModifyEnable = true;
129 
130       sba.IndirectObjectBaseAddress = (struct anv_address) { NULL, 0 };
131       sba.IndirectObjectMOCS = mocs;
132       sba.IndirectObjectBaseAddressModifyEnable = true;
133 
134       sba.InstructionBaseAddress =
135          (struct anv_address) { device->instruction_state_pool.block_pool.bo, 0 };
136       sba.InstructionMOCS = mocs;
137       sba.InstructionBaseAddressModifyEnable = true;
138 
139 #  if (GFX_VER >= 8)
140       /* Broadwell requires that we specify a buffer size for a bunch of
141        * these fields.  However, since we will be growing the BOs live, we
142        * just set them all to the maximum.
143        */
144       sba.GeneralStateBufferSize       = 0xfffff;
145       sba.IndirectObjectBufferSize     = 0xfffff;
146       if (anv_use_relocations(device->physical)) {
147          sba.DynamicStateBufferSize    = 0xfffff;
148          sba.InstructionBufferSize     = 0xfffff;
149       } else {
150          /* With softpin, we use fixed addresses so we actually know how big
151           * our base addresses are.
152           */
153          sba.DynamicStateBufferSize    = DYNAMIC_STATE_POOL_SIZE / 4096;
154          sba.InstructionBufferSize     = INSTRUCTION_STATE_POOL_SIZE / 4096;
155       }
156       sba.GeneralStateBufferSizeModifyEnable    = true;
157       sba.IndirectObjectBufferSizeModifyEnable  = true;
158       sba.DynamicStateBufferSizeModifyEnable    = true;
159       sba.InstructionBuffersizeModifyEnable     = true;
160 #  else
161       /* On gfx7, we have upper bounds instead.  According to the docs,
162        * setting an upper bound of zero means that no bounds checking is
163        * performed so, in theory, we should be able to leave them zero.
164        * However, border color is broken and the GPU bounds-checks anyway.
165        * To avoid this and other potential problems, we may as well set it
166        * for everything.
167        */
168       sba.GeneralStateAccessUpperBound =
169          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
170       sba.GeneralStateAccessUpperBoundModifyEnable = true;
171       sba.DynamicStateAccessUpperBound =
172          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
173       sba.DynamicStateAccessUpperBoundModifyEnable = true;
174       sba.InstructionAccessUpperBound =
175          (struct anv_address) { .bo = NULL, .offset = 0xfffff000 };
176       sba.InstructionAccessUpperBoundModifyEnable = true;
177 #  endif
178    }
179 
180    /* After re-setting the surface state base address, we have to do some
181     * cache flushing so that the sampler engine will pick up the new
182     * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
183     * Shared Function > 3D Sampler > State > State Caching (page 96):
184     *
185     *    Coherency with system memory in the state cache, like the texture
186     *    cache is handled partially by software. It is expected that the
187     *    command stream or shader will issue Cache Flush operation or
188     *    Cache_Flush sampler message to ensure that the L1 cache remains
189     *    coherent with system memory.
190     *
191     *    [...]
192     *
193     *    Whenever the value of the Dynamic_State_Base_Addr,
194     *    Surface_State_Base_Addr are altered, the L1 state cache must be
195     *    invalidated to ensure the new surface or sampler state is fetched
196     *    from system memory.
197     *
198     * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
199     * which, according to the PIPE_CONTROL instruction documentation in the
200     * Broadwell PRM:
201     *
202     *    Setting this bit is independent of any other bit in this packet.
203     *    This bit controls the invalidation of the L1 and L2 state caches
204     *    at the top of the pipe i.e. at the parsing time.
205     *
206     * Unfortunately, experimentation seems to indicate that state cache
207     * invalidation through a PIPE_CONTROL does nothing whatsoever in
208     * regards to surface state and binding tables.  Instead, it seems that
209     * invalidating the texture cache is what is actually needed.
210     *
211     * XXX:  As far as we have been able to determine through
212     * experimentation, flushing the texture cache appears to be
213     * sufficient.  The theory here is that all of the sampling/rendering
214     * units cache the binding table in the texture cache.  However, we have
215     * yet to be able to actually confirm this.
216     *
217     * Wa_14013910100:
218     *
219     *  "DG2 128/256/512-A/B: S/W must program STATE_BASE_ADDRESS command twice
220     *   or program pipe control with Instruction cache invalidate post
221     *   STATE_BASE_ADDRESS command"
222     */
223    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
224       pc.TextureCacheInvalidationEnable = true;
225       pc.ConstantCacheInvalidationEnable = true;
226       pc.StateCacheInvalidationEnable = true;
227       anv_debug_dump_pc(pc);
228    }
229 }
230 
231 static void
232 add_surface_reloc(struct anv_cmd_buffer *cmd_buffer,
233                   struct anv_state state, struct anv_address addr)
234 {
235    VkResult result;
236 
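   /* With relocations, we patch the address field inside the surface state
    * object; with softpin, the address is already final and we only need to
    * record the BO so it ends up in the execbuf's BO list.
    */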
237    if (anv_use_relocations(cmd_buffer->device->physical)) {
238       const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
239       result = anv_reloc_list_add(&cmd_buffer->surface_relocs,
240                                   &cmd_buffer->vk.pool->alloc,
241                                   state.offset + isl_dev->ss.addr_offset,
242                                   addr.bo, addr.offset, NULL);
243    } else {
244       result = anv_reloc_list_add_bo(&cmd_buffer->surface_relocs,
245                                      &cmd_buffer->vk.pool->alloc,
246                                      addr.bo);
247    }
248 
249    if (unlikely(result != VK_SUCCESS))
250       anv_batch_set_error(&cmd_buffer->batch, result);
251 }
252 
253 static void
254 add_surface_state_relocs(struct anv_cmd_buffer *cmd_buffer,
255                          struct anv_surface_state state)
256 {
257    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
258 
259    assert(!anv_address_is_null(state.address));
260    add_surface_reloc(cmd_buffer, state.state, state.address);
261 
262    if (!anv_address_is_null(state.aux_address)) {
263       VkResult result =
264          anv_reloc_list_add(&cmd_buffer->surface_relocs,
265                             &cmd_buffer->vk.pool->alloc,
266                             state.state.offset + isl_dev->ss.aux_addr_offset,
267                             state.aux_address.bo,
268                             state.aux_address.offset,
269                             NULL);
270       if (result != VK_SUCCESS)
271          anv_batch_set_error(&cmd_buffer->batch, result);
272    }
273 
274    if (!anv_address_is_null(state.clear_address)) {
275       VkResult result =
276          anv_reloc_list_add(&cmd_buffer->surface_relocs,
277                             &cmd_buffer->vk.pool->alloc,
278                             state.state.offset +
279                             isl_dev->ss.clear_color_state_offset,
280                             state.clear_address.bo,
281                             state.clear_address.offset,
282                             NULL);
283       if (result != VK_SUCCESS)
284          anv_batch_set_error(&cmd_buffer->batch, result);
285    }
286 }
287 
288 static bool
289 isl_color_value_requires_conversion(union isl_color_value color,
290                                     const struct isl_surf *surf,
291                                     const struct isl_view *view)
292 {
293    if (surf->format == view->format && isl_swizzle_is_identity(view->swizzle))
294       return false;
295 
296    uint32_t surf_pack[4] = { 0, 0, 0, 0 };
297    isl_color_value_pack(&color, surf->format, surf_pack);
298 
299    uint32_t view_pack[4] = { 0, 0, 0, 0 };
300    union isl_color_value swiz_color =
301       isl_color_value_swizzle_inv(color, view->swizzle);
302    isl_color_value_pack(&swiz_color, view->format, view_pack);
303 
304    return memcmp(surf_pack, view_pack, sizeof(surf_pack)) != 0;
305 }
306 
307 static bool
308 anv_can_fast_clear_color_view(struct anv_device * device,
309                               struct anv_image_view *iview,
310                               VkImageLayout layout,
311                               union isl_color_value clear_color,
312                               uint32_t num_layers,
313                               VkRect2D render_area)
314 {
315    if (iview->planes[0].isl.base_array_layer >=
316        anv_image_aux_layers(iview->image, VK_IMAGE_ASPECT_COLOR_BIT,
317                             iview->planes[0].isl.base_level))
318       return false;
319 
320    /* Start by getting the fast clear type.  We use the first subpass
321     * layout here because we don't want to fast-clear if the first subpass
322     * to use the attachment can't handle fast-clears.
323     */
324    enum anv_fast_clear_type fast_clear_type =
325       anv_layout_to_fast_clear_type(device->info, iview->image,
326                                     VK_IMAGE_ASPECT_COLOR_BIT,
327                                     layout);
328    switch (fast_clear_type) {
329    case ANV_FAST_CLEAR_NONE:
330       return false;
331    case ANV_FAST_CLEAR_DEFAULT_VALUE:
332       if (!isl_color_value_is_zero(clear_color, iview->planes[0].isl.format))
333          return false;
334       break;
335    case ANV_FAST_CLEAR_ANY:
336       break;
337    }
338 
339    /* Potentially, we could do partial fast-clears but doing so has crazy
340     * alignment restrictions.  It's easier to just restrict to full size
341     * fast clears for now.
342     */
343    if (render_area.offset.x != 0 ||
344        render_area.offset.y != 0 ||
345        render_area.extent.width != iview->vk.extent.width ||
346        render_area.extent.height != iview->vk.extent.height)
347       return false;
348 
349    /* On Broadwell and earlier, we can only handle 0/1 clear colors */
350    if (!isl_color_value_is_zero_one(clear_color, iview->planes[0].isl.format))
351       return false;
352 
353    /* If the clear color is one that would require non-trivial format
354     * conversion on resolve, we don't bother with the fast clear.  This
355     * shouldn't be common as most clear colors are 0/1 and the most common
356     * format re-interpretation is for sRGB.
357     */
358    if (isl_color_value_requires_conversion(clear_color,
359                                            &iview->image->planes[0].primary_surface.isl,
360                                            &iview->planes[0].isl)) {
361       anv_perf_warn(VK_LOG_OBJS(&iview->vk.base),
362                     "Cannot fast-clear to colors which would require "
363                     "format conversion on resolve");
364       return false;
365    }
366 
367    /* We only allow fast clears to the first slice of an image (level 0,
368     * layer 0) and only for the entire slice.  This guarantees us that, at
369     * any given time, there is only one clear color on any given image.
370     * At the time of our testing (Jan 17, 2018), there
371     * were no known applications which would benefit from fast-clearing
372     * more than just the first slice.
373     */
374    if (iview->planes[0].isl.base_level > 0 ||
375        iview->planes[0].isl.base_array_layer > 0) {
376       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
377                     "Rendering with multi-lod or multi-layer framebuffer "
378                     "with LOAD_OP_LOAD and baseMipLevel > 0 or "
379                     "baseArrayLayer > 0.  Not fast clearing.");
380       return false;
381    }
382 
383    if (num_layers > 1) {
384       anv_perf_warn(VK_LOG_OBJS(&iview->image->vk.base),
385                     "Rendering to a multi-layer framebuffer with "
386                     "LOAD_OP_CLEAR.  Only fast-clearing the first slice");
387    }
388 
389    return true;
390 }
391 
392 static bool
393 anv_can_hiz_clear_ds_view(struct anv_device *device,
394                           const struct anv_image_view *iview,
395                           VkImageLayout layout,
396                           VkImageAspectFlags clear_aspects,
397                           float depth_clear_value,
398                           VkRect2D render_area)
399 {
400    /* We don't do any HiZ or depth fast-clears on gfx7 yet */
401    if (GFX_VER == 7)
402       return false;
403 
404    /* If we're just clearing stencil, we can always HiZ clear */
405    if (!(clear_aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
406       return true;
407 
408    /* We must have depth in order to have HiZ */
409    if (!(iview->image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT))
410       return false;
411 
412    const enum isl_aux_usage clear_aux_usage =
413       anv_layout_to_aux_usage(device->info, iview->image,
414                               VK_IMAGE_ASPECT_DEPTH_BIT,
415                               VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
416                               layout);
417    if (!blorp_can_hiz_clear_depth(device->info,
418                                   &iview->image->planes[0].primary_surface.isl,
419                                   clear_aux_usage,
420                                   iview->planes[0].isl.base_level,
421                                   iview->planes[0].isl.base_array_layer,
422                                   render_area.offset.x,
423                                   render_area.offset.y,
424                                   render_area.offset.x +
425                                   render_area.extent.width,
426                                   render_area.offset.y +
427                                   render_area.extent.height))
428       return false;
429 
430    if (depth_clear_value != ANV_HZ_FC_VAL)
431       return false;
432 
433    /* Only gfx9+ supports returning ANV_HZ_FC_VAL when sampling a fast-cleared
434     * portion of a HiZ buffer. Testing has revealed that Gfx8 only supports
435     * returning 0.0f. Gens prior to gfx8 do not support this feature at all.
436     */
437    if (GFX_VER == 8 && anv_can_sample_with_hiz(device->info, iview->image))
438       return false;
439 
440    /* If we got here, then we can fast clear */
441    return true;
442 }
443 
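/* Force a single, fresh load of x; the volatile cast keeps the compiler from
 * eliding or duplicating the read.
 */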
444 #define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
445 
446 /* Transitions a HiZ-enabled depth buffer from one layout to another. Unless
447  * the initial layout is undefined, the HiZ buffer and depth buffer will
448  * represent the same data at the end of this operation.
449  */
450 static void
451 transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
452                         const struct anv_image *image,
453                         uint32_t base_layer, uint32_t layer_count,
454                         VkImageLayout initial_layout,
455                         VkImageLayout final_layout,
456                         bool will_full_fast_clear)
457 {
458    const uint32_t depth_plane =
459       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
460    if (image->planes[depth_plane].aux_usage == ISL_AUX_USAGE_NONE)
461       return;
462 
463    /* If will_full_fast_clear is set, the caller promises to fast-clear the
464     * largest portion of the specified range as it can.  For depth images,
465     * that means the entire image because we don't support multi-LOD HiZ.
466     */
467    assert(image->planes[0].primary_surface.isl.levels == 1);
468    if (will_full_fast_clear)
469       return;
470 
471    const enum isl_aux_state initial_state =
472       anv_layout_to_aux_state(cmd_buffer->device->info, image,
473                               VK_IMAGE_ASPECT_DEPTH_BIT,
474                               initial_layout);
475    const enum isl_aux_state final_state =
476       anv_layout_to_aux_state(cmd_buffer->device->info, image,
477                               VK_IMAGE_ASPECT_DEPTH_BIT,
478                               final_layout);
479 
480    const bool initial_depth_valid =
481       isl_aux_state_has_valid_primary(initial_state);
482    const bool initial_hiz_valid =
483       isl_aux_state_has_valid_aux(initial_state);
484    const bool final_needs_depth =
485       isl_aux_state_has_valid_primary(final_state);
486    const bool final_needs_hiz =
487       isl_aux_state_has_valid_aux(final_state);
488 
489    /* Getting into the pass-through state for Depth is tricky and involves
490     * both a resolve and an ambiguate.  We don't handle that state right now
491     * as anv_layout_to_aux_state never returns it.
492     */
493    assert(final_state != ISL_AUX_STATE_PASS_THROUGH);
494 
495    if (final_needs_depth && !initial_depth_valid) {
496       assert(initial_hiz_valid);
497       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
498                        0, base_layer, layer_count, ISL_AUX_OP_FULL_RESOLVE);
499    } else if (final_needs_hiz && !initial_hiz_valid) {
500       assert(initial_depth_valid);
501       anv_image_hiz_op(cmd_buffer, image, VK_IMAGE_ASPECT_DEPTH_BIT,
502                        0, base_layer, layer_count, ISL_AUX_OP_AMBIGUATE);
503    }
504 }
505 
506 #if GFX_VER == 7
507 static inline bool
508 vk_image_layout_stencil_write_optimal(VkImageLayout layout)
509 {
510    return layout == VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL ||
511           layout == VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL ||
512           layout == VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL ||
513           layout == VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL;
514 }
515 #endif
516 
517 /* Transitions a stencil buffer from one layout to another.  On gfx7, this
518  * may require copying the stencil data to its texturable shadow surface so
519  * that both copies represent the same data at the end of this operation.
520  */
521 static void
522 transition_stencil_buffer(struct anv_cmd_buffer *cmd_buffer,
523                           const struct anv_image *image,
524                           uint32_t base_level, uint32_t level_count,
525                           uint32_t base_layer, uint32_t layer_count,
526                           VkImageLayout initial_layout,
527                           VkImageLayout final_layout,
528                           bool will_full_fast_clear)
529 {
530 #if GFX_VER == 7
531    const uint32_t plane =
532       anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
533 
534    /* On gfx7, we have to store a texturable version of the stencil buffer in
535     * a shadow whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back and
536     * forth at strategic points. Stencil writes are only allowed in the following
537     * layouts:
538     *
539     *  - VK_IMAGE_LAYOUT_GENERAL
540     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
541     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
542     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
543     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
544     *  - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
545     *
546     * For general, we have no nice opportunity to transition so we do the copy
547     * to the shadow unconditionally at the end of the subpass. For transfer
548     * destinations, we can update it as part of the transfer op. For the other
549     * layouts, we delay the copy until a transition into some other layout.
550     */
551    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
552        vk_image_layout_stencil_write_optimal(initial_layout) &&
553        !vk_image_layout_stencil_write_optimal(final_layout)) {
554       anv_image_copy_to_shadow(cmd_buffer, image,
555                                VK_IMAGE_ASPECT_STENCIL_BIT,
556                                base_level, level_count,
557                                base_layer, layer_count);
558    }
559 #endif
560 }
561 
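/* MMIO offsets of the command streamer's MI_PREDICATE source and result
 * registers, used below to drive predicated resolve operations.
 */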
562 #define MI_PREDICATE_SRC0    0x2400
563 #define MI_PREDICATE_SRC1    0x2408
564 #define MI_PREDICATE_RESULT  0x2418
565 
566 static void
567 set_image_fast_clear_state(struct anv_cmd_buffer *cmd_buffer,
568                            const struct anv_image *image,
569                            VkImageAspectFlagBits aspect,
570                            enum anv_fast_clear_type fast_clear)
571 {
572    anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
573       sdi.Address = anv_image_get_fast_clear_type_addr(cmd_buffer->device,
574                                                        image, aspect);
575       sdi.ImmediateData = fast_clear;
576    }
577 }
578 
579 /* This is only really practical on Haswell and above because it requires
580  * MI math in order to get it correct.
581  */
582 #if GFX_VERx10 >= 75
583 static void
584 anv_cmd_compute_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
585                                   const struct anv_image *image,
586                                   VkImageAspectFlagBits aspect,
587                                   uint32_t level, uint32_t array_layer,
588                                   enum isl_aux_op resolve_op,
589                                   enum anv_fast_clear_type fast_clear_supported)
590 {
591    struct mi_builder b;
592    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
593 
594    const struct mi_value fast_clear_type =
595       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
596                                                   image, aspect));
597 
598    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
599    if (level == 0 && array_layer == 0) {
600       /* In this case, we are doing a partial resolve to get rid of fast-clear
601        * colors.  We don't care about the compression state but we do care
602        * about how much fast clear is allowed by the final layout.
603        */
604       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
605       assert(fast_clear_supported < ANV_FAST_CLEAR_ANY);
606 
607       /* We need to compute (fast_clear_supported < image->fast_clear) */
608       struct mi_value pred =
609          mi_ult(&b, mi_imm(fast_clear_supported), fast_clear_type);
610       mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), mi_value_ref(&b, pred));
611 
612       /* If the predicate is true, we want to write 0 to the fast clear type
613        * and, if it's false, leave it alone.  We can do this by writing
614        *
615        * clear_type = clear_type & ~predicate;
616        */
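      /* Concretely (given the ordering ANV_FAST_CLEAR_NONE <
       * ANV_FAST_CLEAR_DEFAULT_VALUE < ANV_FAST_CLEAR_ANY that the comparison
       * relies on): if the stored fast clear type is ANY but the final layout
       * only supports DEFAULT_VALUE, pred is all ones (mi_ult yields ~0 when
       * true) and the iand below resets the stored type to NONE; otherwise
       * pred is zero and the stored value is left untouched.
       */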
617       struct mi_value new_fast_clear_type =
618          mi_iand(&b, fast_clear_type, mi_inot(&b, pred));
619       mi_store(&b, fast_clear_type, new_fast_clear_type);
620    } else {
621       /* In this case, we're trying to do a partial resolve on a slice that
622        * doesn't have clear color.  There's nothing to do.
623        */
624       assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
625       return;
626    }
627 
628    /* Set src1 to 0 and use a != condition */
629    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
630 
631    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
632       mip.LoadOperation    = LOAD_LOADINV;
633       mip.CombineOperation = COMBINE_SET;
634       mip.CompareOperation = COMPARE_SRCS_EQUAL;
635    }
636 }
637 #endif /* GFX_VERx10 >= 75 */
638 
639 static void
640 anv_cmd_simple_resolve_predicate(struct anv_cmd_buffer *cmd_buffer,
641                                  const struct anv_image *image,
642                                  VkImageAspectFlagBits aspect,
643                                  uint32_t level, uint32_t array_layer,
644                                  enum isl_aux_op resolve_op,
645                                  enum anv_fast_clear_type fast_clear_supported)
646 {
647    struct mi_builder b;
648    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
649 
650    struct mi_value fast_clear_type_mem =
651       mi_mem32(anv_image_get_fast_clear_type_addr(cmd_buffer->device,
652                                                       image, aspect));
653 
654    /* This only works for partial resolves and only when the clear color is
655     * all or nothing.  On the upside, this emits less command streamer code
656     * and works on Ivybridge and Bay Trail.
657     */
658    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
659    assert(fast_clear_supported != ANV_FAST_CLEAR_ANY);
660 
661    /* We don't support fast clears on anything other than the first slice. */
662    if (level > 0 || array_layer > 0)
663       return;
664 
665    /* On gfx8, we don't have a concept of default clear colors because we
666     * can't sample from CCS surfaces.  It's enough to just load the fast clear
667     * state into the predicate register.
668     */
669    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), fast_clear_type_mem);
670    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
671    mi_store(&b, fast_clear_type_mem, mi_imm(0));
672 
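   /* LOADINV + COMPARE_SRCS_EQUAL leaves the predicate set when SRC0 != SRC1,
    * i.e. when some fast clear is recorded for this slice, so the predicated
    * resolve issued by the caller only executes in that case.
    */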
673    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
674       mip.LoadOperation    = LOAD_LOADINV;
675       mip.CombineOperation = COMBINE_SET;
676       mip.CompareOperation = COMPARE_SRCS_EQUAL;
677    }
678 }
679 
680 static void
681 anv_cmd_predicated_ccs_resolve(struct anv_cmd_buffer *cmd_buffer,
682                                const struct anv_image *image,
683                                enum isl_format format,
684                                struct isl_swizzle swizzle,
685                                VkImageAspectFlagBits aspect,
686                                uint32_t level, uint32_t array_layer,
687                                enum isl_aux_op resolve_op,
688                                enum anv_fast_clear_type fast_clear_supported)
689 {
690    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
691 
692    anv_cmd_simple_resolve_predicate(cmd_buffer, image,
693                                     aspect, level, array_layer,
694                                     resolve_op, fast_clear_supported);
695 
696    /* CCS_D only supports full resolves and BLORP will assert on us if we try
697     * to do a partial resolve on a CCS_D surface.
698     */
699    if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
700        image->planes[plane].aux_usage == ISL_AUX_USAGE_CCS_D)
701       resolve_op = ISL_AUX_OP_FULL_RESOLVE;
702 
703    anv_image_ccs_op(cmd_buffer, image, format, swizzle, aspect,
704                     level, array_layer, 1, resolve_op, NULL, true);
705 }
706 
707 static void
708 anv_cmd_predicated_mcs_resolve(struct anv_cmd_buffer *cmd_buffer,
709                                const struct anv_image *image,
710                                enum isl_format format,
711                                struct isl_swizzle swizzle,
712                                VkImageAspectFlagBits aspect,
713                                uint32_t array_layer,
714                                enum isl_aux_op resolve_op,
715                                enum anv_fast_clear_type fast_clear_supported)
716 {
717    assert(aspect == VK_IMAGE_ASPECT_COLOR_BIT);
718    assert(resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE);
719 
720 #if GFX_VERx10 >= 75
721    anv_cmd_compute_resolve_predicate(cmd_buffer, image,
722                                      aspect, 0, array_layer,
723                                      resolve_op, fast_clear_supported);
724 
725    anv_image_mcs_op(cmd_buffer, image, format, swizzle, aspect,
726                     array_layer, 1, resolve_op, NULL, true);
727 #else
728    unreachable("MCS resolves are unsupported on Ivybridge and Bay Trail");
729 #endif
730 }
731 
732 void
733 genX(cmd_buffer_mark_image_written)(struct anv_cmd_buffer *cmd_buffer,
734                                     const struct anv_image *image,
735                                     VkImageAspectFlagBits aspect,
736                                     enum isl_aux_usage aux_usage,
737                                     uint32_t level,
738                                     uint32_t base_layer,
739                                     uint32_t layer_count)
740 {
741    /* The aspect must be exactly one of the image aspects. */
742    assert(util_bitcount(aspect) == 1 && (aspect & image->vk.aspects));
743 }
744 
745 static void
746 init_fast_clear_color(struct anv_cmd_buffer *cmd_buffer,
747                       const struct anv_image *image,
748                       VkImageAspectFlagBits aspect)
749 {
750    assert(cmd_buffer && image);
751    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
752 
753    set_image_fast_clear_state(cmd_buffer, image, aspect,
754                               ANV_FAST_CLEAR_NONE);
755 
756    /* Initialize the struct fields that are accessed for fast-clears so that
757     * the HW restrictions on the field values are satisfied.
758     */
759    struct anv_address addr =
760       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
761 
762    anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdi) {
763       sdi.Address = addr;
764       if (GFX_VERx10 >= 75) {
765          /* Pre-SKL, the dword containing the clear values also contains
766           * other fields, so we need to initialize those fields to match the
767           * values that would be in a color attachment.
768           */
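         /* Those other fields are the shader channel selects that share this
          * dword with the clear color bits; program the identity swizzle and
          * leave the clear bits themselves at zero.
          */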
769          sdi.ImmediateData = ISL_CHANNEL_SELECT_RED   << 25 |
770                              ISL_CHANNEL_SELECT_GREEN << 22 |
771                              ISL_CHANNEL_SELECT_BLUE  << 19 |
772                              ISL_CHANNEL_SELECT_ALPHA << 16;
773       } else if (GFX_VER == 7) {
774          /* On IVB, the dword containing the clear values also contains
775           * other fields that must be zero or can be zero.
776           */
777          sdi.ImmediateData = 0;
778       }
779    }
780 }
781 
782 /* Copy the fast-clear value dword(s) between a surface state object and an
783  * image's fast clear state buffer.
784  */
785 static void
786 genX(copy_fast_clear_dwords)(struct anv_cmd_buffer *cmd_buffer,
787                              struct anv_state surface_state,
788                              const struct anv_image *image,
789                              VkImageAspectFlagBits aspect,
790                              bool copy_from_surface_state)
791 {
792    assert(cmd_buffer && image);
793    assert(image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
794 
795    struct anv_address ss_clear_addr = {
796       .bo = cmd_buffer->device->surface_state_pool.block_pool.bo,
797       .offset = surface_state.offset +
798                 cmd_buffer->device->isl_dev.ss.clear_value_offset,
799    };
800    const struct anv_address entry_addr =
801       anv_image_get_clear_color_addr(cmd_buffer->device, image, aspect);
802    unsigned copy_size = cmd_buffer->device->isl_dev.ss.clear_value_size;
803 
804 #if GFX_VER == 7
805    /* On gfx7, the combination of commands used here (MI_LOAD_REGISTER_MEM
806     * and MI_STORE_REGISTER_MEM) can cause GPU hangs if any rendering is
807     * in-flight when they are issued even if the memory touched is not
808     * currently active for rendering.  The weird bit is that it is not the
809     * MI_LOAD/STORE_REGISTER_MEM commands which hang but rather the in-flight
810     * rendering hangs such that the next stalling command after the
811     * MI_LOAD/STORE_REGISTER_MEM commands will catch the hang.
812     *
813     * It is unclear exactly why this hang occurs.  Both MI commands come with
814     * warnings about the 3D pipeline but that doesn't seem to fully explain
815     * it.  My (Faith's) best theory is that it has something to do with the
816     * fact that we're using a GPU state register as our temporary and that
817     * something with reading/writing it is causing problems.
818     *
819     * In order to work around this issue, we emit a PIPE_CONTROL with the
820     * command streamer stall bit set.
821     */
822    anv_add_pending_pipe_bits(cmd_buffer,
823                              ANV_PIPE_CS_STALL_BIT,
824                              "after copy_fast_clear_dwords. Avoid potential hang");
825    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
826 #endif
827 
828    struct mi_builder b;
829    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
830 
831    if (copy_from_surface_state) {
832       mi_memcpy(&b, entry_addr, ss_clear_addr, copy_size);
833    } else {
834       mi_memcpy(&b, ss_clear_addr, entry_addr, copy_size);
835 
836       /* Updating a surface state object may require that the state cache be
837        * invalidated. From the SKL PRM, Shared Functions -> State -> State
838        * Caching:
839        *
840        *    Whenever the RENDER_SURFACE_STATE object in memory pointed to by
841        *    the Binding Table Pointer (BTP) and Binding Table Index (BTI) is
842        *    modified [...], the L1 state cache must be invalidated to ensure
843        *    the new surface or sampler state is fetched from system memory.
844        *
845        * In testing, SKL doesn't actually seem to need this, but HSW does.
846        */
847       anv_add_pending_pipe_bits(cmd_buffer,
848                                 ANV_PIPE_STATE_CACHE_INVALIDATE_BIT,
849                                 "after copy_fast_clear_dwords surface state update");
850    }
851 }
852 
853 /**
854  * @brief Transitions a color buffer from one layout to another.
855  *
856  * See section 6.1.1. Image Layout Transitions of the Vulkan 1.0.50 spec for
857  * more information.
858  *
859  * @param level_count VK_REMAINING_MIP_LEVELS isn't supported.
860  * @param layer_count VK_REMAINING_ARRAY_LAYERS isn't supported. For 3D images,
861  *                    this represents the maximum layers to transition at each
862  *                    specified miplevel.
863  */
864 static void
865 transition_color_buffer(struct anv_cmd_buffer *cmd_buffer,
866                         const struct anv_image *image,
867                         VkImageAspectFlagBits aspect,
868                         const uint32_t base_level, uint32_t level_count,
869                         uint32_t base_layer, uint32_t layer_count,
870                         VkImageLayout initial_layout,
871                         VkImageLayout final_layout,
872                         uint32_t src_queue_family,
873                         uint32_t dst_queue_family,
874                         bool will_full_fast_clear)
875 {
876    struct anv_device *device = cmd_buffer->device;
877    const struct intel_device_info *devinfo = device->info;
878    /* Validate the inputs. */
879    assert(cmd_buffer);
880    assert(image && image->vk.aspects & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV);
881    /* These values aren't supported for simplicity's sake. */
882    assert(level_count != VK_REMAINING_MIP_LEVELS &&
883           layer_count != VK_REMAINING_ARRAY_LAYERS);
884    /* Ensure the subresource range is valid. */
885    UNUSED uint64_t last_level_num = base_level + level_count;
886    const uint32_t max_depth = u_minify(image->vk.extent.depth, base_level);
887    UNUSED const uint32_t image_layers = MAX2(image->vk.array_layers, max_depth);
888    assert((uint64_t)base_layer + layer_count  <= image_layers);
889    assert(last_level_num <= image->vk.mip_levels);
890    /* If there is a layout transition, the final layout cannot be undefined or
891     * preinitialized (VUID-VkImageMemoryBarrier-newLayout-01198).
892     */
893    assert(initial_layout == final_layout ||
894           (final_layout != VK_IMAGE_LAYOUT_UNDEFINED &&
895            final_layout != VK_IMAGE_LAYOUT_PREINITIALIZED));
896    const struct isl_drm_modifier_info *isl_mod_info =
897       image->vk.tiling == VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT
898       ? isl_drm_modifier_get_info(image->vk.drm_format_mod)
899       : NULL;
900 
901    const bool src_queue_external =
902       src_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
903       src_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
904 
905    const bool dst_queue_external =
906       dst_queue_family == VK_QUEUE_FAMILY_FOREIGN_EXT ||
907       dst_queue_family == VK_QUEUE_FAMILY_EXTERNAL;
908 
909    /* Simultaneous acquire and release on external queues is illegal. */
910    assert(!src_queue_external || !dst_queue_external);
911 
912    /* Ownership transition on an external queue requires special action if the
913     * image has a DRM format modifier because we store image data in
914     * a driver-private bo which is inaccessible to the external queue.
915     */
916    const bool private_binding_acquire =
917       src_queue_external &&
918       anv_image_is_externally_shared(image) &&
919       anv_image_has_private_binding(image);
920 
921    const bool private_binding_release =
922       dst_queue_external &&
923       anv_image_is_externally_shared(image) &&
924       anv_image_has_private_binding(image);
925 
926    if (initial_layout == final_layout &&
927        !private_binding_acquire && !private_binding_release) {
928       /* No work is needed. */
929        return;
930    }
931 
932    const uint32_t plane = anv_image_aspect_to_plane(image, aspect);
933 
934    if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
935        final_layout == VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL) {
936       /* This surface is a linear compressed image with a tiled shadow surface
937        * for texturing.  The client is about to use it in READ_ONLY_OPTIMAL so
938        * we need to ensure the shadow copy is up-to-date.
939        */
940       assert(image->vk.tiling != VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT);
941       assert(image->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT);
942       assert(image->planes[plane].primary_surface.isl.tiling == ISL_TILING_LINEAR);
943       assert(image->planes[plane].shadow_surface.isl.tiling != ISL_TILING_LINEAR);
944       assert(isl_format_is_compressed(image->planes[plane].primary_surface.isl.format));
945       assert(plane == 0);
946       anv_image_copy_to_shadow(cmd_buffer, image,
947                                VK_IMAGE_ASPECT_COLOR_BIT,
948                                base_level, level_count,
949                                base_layer, layer_count);
950    }
951 
952    if (base_layer >= anv_image_aux_layers(image, aspect, base_level))
953       return;
954 
955    assert(image->planes[plane].primary_surface.isl.tiling != ISL_TILING_LINEAR);
956 
957    /* The following layouts are equivalent for non-linear images. */
958    const bool initial_layout_undefined =
959       initial_layout == VK_IMAGE_LAYOUT_UNDEFINED ||
960       initial_layout == VK_IMAGE_LAYOUT_PREINITIALIZED;
961 
962    bool must_init_fast_clear_state = false;
963    bool must_init_aux_surface = false;
964 
965    if (initial_layout_undefined) {
966       /* The subresource may have been aliased and populated with arbitrary
967        * data.
968        */
969       must_init_fast_clear_state = true;
970       must_init_aux_surface = true;
971    } else if (private_binding_acquire) {
972       /* The fast clear state lives in a driver-private bo, and therefore the
973        * external/foreign queue is unaware of it.
974        *
975        * If this is the first time we are accessing the image, then the fast
976        * clear state is uninitialized.
977        *
978        * If this is NOT the first time we are accessing the image, then the fast
979        * clear state may still be valid and correct due to the resolve during
980        * our most recent ownership release.  However, we do not track the aux
981        * state with MI stores, and therefore must assume the worst-case: that
982        * this is the first time we are accessing the image.
983        */
984       assert(image->planes[plane].fast_clear_memory_range.binding ==
985               ANV_IMAGE_MEMORY_BINDING_PRIVATE);
986       must_init_fast_clear_state = true;
987 
988       /* The aux surface, like the fast clear state, lives in
989        * a driver-private bo.  We must initialize the aux surface for the
990        * same reasons we must initialize the fast clear state.
991        */
992       assert(image->planes[plane].aux_surface.memory_range.binding ==
993              ANV_IMAGE_MEMORY_BINDING_PRIVATE);
994       must_init_aux_surface = true;
995    }
996 
997    if (must_init_fast_clear_state) {
998       if (base_level == 0 && base_layer == 0)
999          init_fast_clear_color(cmd_buffer, image, aspect);
1000    }
1001 
1002    if (must_init_aux_surface) {
1003       assert(must_init_fast_clear_state);
1004 
1005       /* Initialize the aux buffers to enable correct rendering.  In order to
1006        * ensure that things such as storage images work correctly, aux buffers
1007        * need to be initialized to valid data.
1008        *
1009        * Having an aux buffer with invalid data is a problem for two reasons:
1010        *
1011        *  1) Having an invalid value in the buffer can confuse the hardware.
1012        *     For instance, with CCS_E on SKL, a two-bit CCS value of 2 is
1013        *     invalid and leads to the hardware doing strange things.  It
1014        *     doesn't hang as far as we can tell but rendering corruption can
1015        *     occur.
1016        *
1017        *  2) If this transition is into the GENERAL layout and we then use the
1018        *     image as a storage image, then we must have the aux buffer in the
1019        *     pass-through state so that, if we then go to texture from the
1020        *     image, we get the results of our storage image writes and not the
1021        *     fast clear color or other random data.
1022        *
1023        * For CCS both of the problems above are real demonstrable issues.  In
1024        * that case, the only thing we can do is to perform an ambiguate to
1025        * transition the aux surface into the pass-through state.
1026        *
1027        * For MCS, (2) is never an issue because we don't support multisampled
1028        * storage images.  In theory, issue (1) is a problem with MCS but we've
1029        * never seen it in the wild.  For 4x and 16x, all bit patterns could, in
1030        * theory, be interpreted as something but we don't know that all bit
1031        * patterns are actually valid.  For 2x and 8x, you could easily end up
1032        * with the MCS referring to an invalid plane because not all bits of
1033        * the MCS value are actually used.  Even though we've never seen issues
1034        * in the wild, it's best to play it safe and initialize the MCS.  We
1035        * can use a fast-clear for MCS because we only ever touch from render
1036        * and texture (no image load store).
1037        */
1038       if (image->vk.samples == 1) {
1039          for (uint32_t l = 0; l < level_count; l++) {
1040             const uint32_t level = base_level + l;
1041 
1042             uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1043             if (base_layer >= aux_layers)
1044                break; /* We will only get fewer layers as level increases */
1045             uint32_t level_layer_count =
1046                MIN2(layer_count, aux_layers - base_layer);
1047 
1048             /* If will_full_fast_clear is set, the caller promises to
1049              * fast-clear the largest portion of the specified range as it can.
1050              * For color images, that means only the first LOD and array slice.
1051              */
1052             if (level == 0 && base_layer == 0 && will_full_fast_clear) {
1053                base_layer++;
1054                level_layer_count--;
1055                if (level_layer_count == 0)
1056                   continue;
1057             }
1058 
1059             anv_image_ccs_op(cmd_buffer, image,
1060                              image->planes[plane].primary_surface.isl.format,
1061                              ISL_SWIZZLE_IDENTITY,
1062                              aspect, level, base_layer, level_layer_count,
1063                              ISL_AUX_OP_AMBIGUATE, NULL, false);
1064          }
1065       } else {
1066          if (image->vk.samples == 4 || image->vk.samples == 16) {
1067             anv_perf_warn(VK_LOG_OBJS(&image->vk.base),
1068                           "Doing a potentially unnecessary fast-clear to "
1069                           "define an MCS buffer.");
1070          }
1071 
1072          /* If will_full_fast_clear is set, the caller promises to fast-clear
1073           * the largest portion of the specified range as it can.
1074           */
1075          if (will_full_fast_clear)
1076             return;
1077 
1078          assert(base_level == 0 && level_count == 1);
1079          anv_image_mcs_op(cmd_buffer, image,
1080                           image->planes[plane].primary_surface.isl.format,
1081                           ISL_SWIZZLE_IDENTITY,
1082                           aspect, base_layer, layer_count,
1083                           ISL_AUX_OP_FAST_CLEAR, NULL, false);
1084       }
1085       return;
1086    }
1087 
1088    enum isl_aux_usage initial_aux_usage =
1089       anv_layout_to_aux_usage(devinfo, image, aspect, 0, initial_layout);
1090    enum isl_aux_usage final_aux_usage =
1091       anv_layout_to_aux_usage(devinfo, image, aspect, 0, final_layout);
1092    enum anv_fast_clear_type initial_fast_clear =
1093       anv_layout_to_fast_clear_type(devinfo, image, aspect, initial_layout);
1094    enum anv_fast_clear_type final_fast_clear =
1095       anv_layout_to_fast_clear_type(devinfo, image, aspect, final_layout);
1096 
1097    /* We must override the anv_layout_to_* functions because they are unaware of
1098     * acquire/release direction.
1099     */
1100    if (private_binding_acquire) {
1101       assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
1102       initial_aux_usage = ISL_AUX_USAGE_NONE;
1103       initial_fast_clear = ANV_FAST_CLEAR_NONE;
1104    } else if (private_binding_release) {
1105       assert(!isl_drm_modifier_has_aux(isl_mod_info->modifier));
1106       final_aux_usage = ISL_AUX_USAGE_NONE;
1107       final_fast_clear = ANV_FAST_CLEAR_NONE;
1108    }
1109 
1110    /* The current code assumes that there is no mixing of CCS_E and CCS_D.
1111     * We can handle transitions between CCS_D/E to and from NONE.  What we
1112     * don't yet handle is switching between CCS_E and CCS_D within a given
1113     * image.  Doing so in a performant way requires more detailed aux state
1114     * tracking such as what is done in i965.  For now, just assume that we
1115     * only have one type of compression.
1116     */
1117    assert(initial_aux_usage == ISL_AUX_USAGE_NONE ||
1118           final_aux_usage == ISL_AUX_USAGE_NONE ||
1119           initial_aux_usage == final_aux_usage);
1120 
1121    /* If initial aux usage is NONE, there is nothing to resolve */
1122    if (initial_aux_usage == ISL_AUX_USAGE_NONE)
1123       return;
1124 
1125    enum isl_aux_op resolve_op = ISL_AUX_OP_NONE;
1126 
1127    /* If the initial layout supports more fast clear than the final layout
1128     * then we need at least a partial resolve.
1129     */
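   /* For example, an initial layout that tolerates an arbitrary clear color
    * (ANV_FAST_CLEAR_ANY) transitioning to a final layout that only tolerates
    * the default 0/1 color requires the clear color to be resolved into the
    * primary surface first.
    */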
1130    if (final_fast_clear < initial_fast_clear)
1131       resolve_op = ISL_AUX_OP_PARTIAL_RESOLVE;
1132 
1133    if (resolve_op == ISL_AUX_OP_NONE)
1134       return;
1135 
1136    /* Perform a resolve to synchronize data between the main and aux buffer.
1137     * Before we begin, we must satisfy the cache flushing requirement specified
1138     * in the Sky Lake PRM Vol. 7, "MCS Buffer for Render Target(s)":
1139     *
1140     *    Any transition from any value in {Clear, Render, Resolve} to a
1141     *    different value in {Clear, Render, Resolve} requires end of pipe
1142     *    synchronization.
1143     *
1144     * We perform a flush of the write cache before and after the clear and
1145     * resolve operations to meet this requirement.
1146     *
1147     * Unlike other drawing, fast clear operations are not properly
1148     * synchronized. The first PIPE_CONTROL here likely ensures that the
1149     * contents of the previous render or clear hit the render target before we
1150     * resolve and the second likely ensures that the resolve is complete before
1151     * we do any more rendering or clearing.
1152     */
1153    anv_add_pending_pipe_bits(cmd_buffer,
1154                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1155                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1156                              "after transition RT");
1157 
1158    for (uint32_t l = 0; l < level_count; l++) {
1159       uint32_t level = base_level + l;
1160 
1161       uint32_t aux_layers = anv_image_aux_layers(image, aspect, level);
1162       if (base_layer >= aux_layers)
1163          break; /* We will only get fewer layers as level increases */
1164       uint32_t level_layer_count =
1165          MIN2(layer_count, aux_layers - base_layer);
1166 
1167       for (uint32_t a = 0; a < level_layer_count; a++) {
1168          uint32_t array_layer = base_layer + a;
1169 
1170          /* If will_full_fast_clear is set, the caller promises to fast-clear
1171           * the largest portion of the specified range that it can.  For color
1172           * images, that means only the first LOD and array slice.
1173           */
1174          if (level == 0 && array_layer == 0 && will_full_fast_clear)
1175             continue;
1176 
1177          if (image->vk.samples == 1) {
1178             anv_cmd_predicated_ccs_resolve(cmd_buffer, image,
1179                                            image->planes[plane].primary_surface.isl.format,
1180                                            ISL_SWIZZLE_IDENTITY,
1181                                            aspect, level, array_layer, resolve_op,
1182                                            final_fast_clear);
1183          } else {
1184             /* We only support fast-clear on the first layer, so partial
1185              * resolves should not be used on other layers; they would use
1186              * the clear color stored in memory, which is only valid for layer 0.
1187              */
1188             if (resolve_op == ISL_AUX_OP_PARTIAL_RESOLVE &&
1189                 array_layer != 0)
1190                continue;
1191 
1192             anv_cmd_predicated_mcs_resolve(cmd_buffer, image,
1193                                            image->planes[plane].primary_surface.isl.format,
1194                                            ISL_SWIZZLE_IDENTITY,
1195                                            aspect, array_layer, resolve_op,
1196                                            final_fast_clear);
1197          }
1198       }
1199    }
1200 
1201    anv_add_pending_pipe_bits(cmd_buffer,
1202                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
1203                              ANV_PIPE_END_OF_PIPE_SYNC_BIT,
1204                              "after transition RT");
1205 }
1206 
1207 static MUST_CHECK VkResult
1208 anv_cmd_buffer_init_attachments(struct anv_cmd_buffer *cmd_buffer,
1209                                 uint32_t color_att_count)
1210 {
1211    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1212 
1213    /* Reserve one for the NULL state. */
1214    unsigned num_states = 1 + color_att_count;
1215    const struct isl_device *isl_dev = &cmd_buffer->device->isl_dev;
1216    const uint32_t ss_stride = align(isl_dev->ss.size, isl_dev->ss.align);
1217    gfx->att_states =
1218       anv_state_stream_alloc(&cmd_buffer->surface_state_stream,
1219                              num_states * ss_stride, isl_dev->ss.align);
1220    if (gfx->att_states.map == NULL) {
1221       return anv_batch_set_error(&cmd_buffer->batch,
1222                                  VK_ERROR_OUT_OF_DEVICE_MEMORY);
1223    }
1224 
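   /* Carve the single stream allocation into fixed-stride slots: slot 0
    * holds the NULL surface state, followed by one slot per color
    * attachment.
    */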
1225    struct anv_state next_state = gfx->att_states;
1226    next_state.alloc_size = isl_dev->ss.size;
1227 
1228    gfx->null_surface_state = next_state;
1229    next_state.offset += ss_stride;
1230    next_state.map += ss_stride;
1231 
1232    gfx->color_att_count = color_att_count;
1233    for (uint32_t i = 0; i < color_att_count; i++) {
1234       gfx->color_att[i] = (struct anv_attachment) {
1235          .surface_state.state = next_state,
1236       };
1237       next_state.offset += ss_stride;
1238       next_state.map += ss_stride;
1239    }
1240    gfx->depth_att = (struct anv_attachment) { };
1241    gfx->stencil_att = (struct anv_attachment) { };
1242 
1243    return VK_SUCCESS;
1244 }
1245 
1246 static void
1247 anv_cmd_buffer_reset_rendering(struct anv_cmd_buffer *cmd_buffer)
1248 {
1249    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1250 
1251    gfx->render_area = (VkRect2D) { };
1252    gfx->layer_count = 0;
1253    gfx->samples = 0;
1254 
1255    gfx->color_att_count = 0;
1256    gfx->depth_att = (struct anv_attachment) { };
1257    gfx->stencil_att = (struct anv_attachment) { };
1258    gfx->null_surface_state = ANV_STATE_NULL;
1259 }
1260 
1261 VkResult
1262 genX(BeginCommandBuffer)(
1263     VkCommandBuffer                             commandBuffer,
1264     const VkCommandBufferBeginInfo*             pBeginInfo)
1265 {
1266    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1267    VkResult result;
1268 
1269    /* If this is the first vkBeginCommandBuffer, we must *initialize* the
1270     * command buffer's state. Otherwise, we must *reset* its state. In both
1271     * cases we reset it.
1272     *
1273     * From the Vulkan 1.0 spec:
1274     *
1275     *    If a command buffer is in the executable state and the command buffer
1276     *    was allocated from a command pool with the
1277     *    VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT flag set, then
1278     *    vkBeginCommandBuffer implicitly resets the command buffer, behaving
1279     *    as if vkResetCommandBuffer had been called with
1280     *    VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT not set. It then puts
1281     *    the command buffer in the recording state.
1282     */
1283    anv_cmd_buffer_reset(&cmd_buffer->vk, 0);
1284    anv_cmd_buffer_reset_rendering(cmd_buffer);
1285 
1286    cmd_buffer->usage_flags = pBeginInfo->flags;
1287 
1288    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
1289     * primary level command buffers.
1290     *
1291     * From the Vulkan 1.0 spec:
1292     *
1293     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
1294     *    secondary command buffer is considered to be entirely inside a render
1295     *    pass. If this is a primary command buffer, then this bit is ignored.
1296     */
1297    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
1298       cmd_buffer->usage_flags &= ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
1299 
1300    trace_intel_begin_cmd_buffer(&cmd_buffer->trace);
1301 
1302    genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
1303 
1304    /* We sometimes store vertex data in the dynamic state buffer for blorp
1305     * operations and our dynamic state stream may re-use data from previous
1306     * command buffers.  In order to prevent stale cache data, we flush the VF
1307     * cache.  We could do this on every blorp call but that's not really
1308     * needed as all of the data will get written by the CPU prior to the GPU
1309     * executing anything.  The chances are fairly high that they will use
1310     * blorp at least once per primary command buffer so it shouldn't be
1311     * wasted.
1312     *
1313     * There is also a workaround on gfx8 which requires us to invalidate the
1314     * VF cache occasionally.  It's easier if we can assume we start with a
1315     * fresh cache (See also genX(cmd_buffer_set_binding_for_gfx8_vb_flush).)
1316     */
1317    anv_add_pending_pipe_bits(cmd_buffer,
1318                              ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1319                              "new cmd buffer");
1320 
1321    /* We send an "Indirect State Pointers Disable" packet at
1322     * EndCommandBuffer, so all push constant packets are ignored during a
1323     * context restore. Documentation says after that command, we need to
1324     * emit push constants again before any rendering operation. So we
1325     * flag them dirty here to make sure they get emitted.
1326     */
1327    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
1328 
1329    if (cmd_buffer->usage_flags &
1330        VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1331       struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
1332 
1333       char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
1334       const VkRenderingInfo *resume_info =
1335          vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level,
1336                                                                pBeginInfo,
1337                                                                gcbiar_data);
1338       if (resume_info != NULL) {
1339          genX(CmdBeginRendering)(commandBuffer, resume_info);
1340       } else {
1341          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
1342             vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level,
1343                                                              pBeginInfo);
1344          assert(inheritance_info);
1345 
1346          gfx->rendering_flags = inheritance_info->flags;
1347          gfx->render_area = (VkRect2D) { };
1348          gfx->layer_count = 0;
1349          gfx->samples = inheritance_info->rasterizationSamples;
1350          gfx->view_mask = inheritance_info->viewMask;
1351 
1352          uint32_t color_att_count = inheritance_info->colorAttachmentCount;
1353          result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
1354          if (result != VK_SUCCESS)
1355             return result;
1356 
1357          for (uint32_t i = 0; i < color_att_count; i++) {
1358             gfx->color_att[i].vk_format =
1359                inheritance_info->pColorAttachmentFormats[i];
1360          }
1361          gfx->depth_att.vk_format =
1362             inheritance_info->depthAttachmentFormat;
1363          gfx->stencil_att.vk_format =
1364             inheritance_info->stencilAttachmentFormat;
1365 
1366          cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
1367 
1368          anv_cmd_graphic_state_update_has_uint_rt(gfx);
1369       }
1370    }
1371 
1372 #if GFX_VER >= 8
1373    /* Emit the sample pattern at the beginning of the batch because the
1374     * default locations emitted at the device initialization might have been
1375     * changed by a previous command buffer.
1376     *
1377     * Do not change that when we're continuing a previous renderpass.
1378     */
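   /* Passing NULL here asks for the default sample pattern rather than
    * app-provided sample locations.
    */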
1379    if (cmd_buffer->device->vk.enabled_extensions.EXT_sample_locations &&
1380        !(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT))
1381       genX(emit_sample_pattern)(&cmd_buffer->batch, NULL);
1382 #endif
1383 
1384 #if GFX_VERx10 >= 75
1385    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
1386       const VkCommandBufferInheritanceConditionalRenderingInfoEXT *conditional_rendering_info =
1387          vk_find_struct_const(pBeginInfo->pInheritanceInfo->pNext, COMMAND_BUFFER_INHERITANCE_CONDITIONAL_RENDERING_INFO_EXT);
1388 
1389       /* If secondary buffer supports conditional rendering
1390        * we should emit commands as if conditional rendering is enabled.
1391        */
1392       cmd_buffer->state.conditional_render_enabled =
1393          conditional_rendering_info && conditional_rendering_info->conditionalRenderingEnable;
1394    }
1395 #endif
1396 
1397    return VK_SUCCESS;
1398 }
1399 
1400 /* From the PRM, Volume 2a:
1401  *
1402  *    "Indirect State Pointers Disable
1403  *
1404  *    At the completion of the post-sync operation associated with this pipe
1405  *    control packet, the indirect state pointers in the hardware are
1406  *    considered invalid; the indirect pointers are not saved in the context.
1407  *    If any new indirect state commands are executed in the command stream
1408  *    while the pipe control is pending, the new indirect state commands are
1409  *    preserved.
1410  *
1411  *    [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
1412  *    restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
1413  *    commands are only considered as Indirect State Pointers. Once ISP is
1414  *    issued in a context, SW must initialize by programming push constant
1415  *    commands for all the shaders (at least to zero length) before attempting
1416  *    any rendering operation for the same context."
1417  *
1418  * 3DSTATE_CONSTANT_* packets are restored during a context restore,
1419  * even though they point to a BO that has been already unreferenced at
1420  * the end of the previous batch buffer. This has been fine so far since
1421  * we are protected by these scratch page (every address not covered by
1422  * a BO should be pointing to the scratch page). But on CNL, it is
1423  * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
1424  * instruction.
1425  *
1426  * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
1427  * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
1428  * context restore, so the mentioned hang doesn't happen. However,
1429  * software must program push constant commands for all stages prior to
1430  * rendering anything. So we flag them dirty in BeginCommandBuffer.
1431  *
1432  * Finally, we also stall at the pixel scoreboard to make sure the constants
1433  * have been loaded into the EUs before we disable the push constants, so
1434  * that a previous 3DPRIMITIVE does not hang.
1435  */
1436 static void
1437 emit_isp_disable(struct anv_cmd_buffer *cmd_buffer)
1438 {
1439    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1440          pc.StallAtPixelScoreboard = true;
1441          pc.CommandStreamerStallEnable = true;
1442          anv_debug_dump_pc(pc);
1443    }
1444    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1445          pc.IndirectStatePointersDisable = true;
1446          pc.CommandStreamerStallEnable = true;
1447          anv_debug_dump_pc(pc);
1448    }
1449 }
1450 
1451 VkResult
1452 genX(EndCommandBuffer)(
1453     VkCommandBuffer                             commandBuffer)
1454 {
1455    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1456 
1457    if (anv_batch_has_error(&cmd_buffer->batch))
1458       return cmd_buffer->batch.status;
1459 
1460    anv_measure_endcommandbuffer(cmd_buffer);
1461 
1462    /* We want every command buffer to start with the PMA fix in a known state,
1463     * so we disable it at the end of the command buffer.
1464     */
1465    genX(cmd_buffer_enable_pma_fix)(cmd_buffer, false);
1466 
1467    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1468 
1469    emit_isp_disable(cmd_buffer);
1470 
1471    trace_intel_end_cmd_buffer(&cmd_buffer->trace, cmd_buffer->vk.level);
1472 
1473    anv_cmd_buffer_end_batch_buffer(cmd_buffer);
1474 
1475    return VK_SUCCESS;
1476 }
1477 
1478 void
1479 genX(CmdExecuteCommands)(
1480     VkCommandBuffer                             commandBuffer,
1481     uint32_t                                    commandBufferCount,
1482     const VkCommandBuffer*                      pCmdBuffers)
1483 {
1484    ANV_FROM_HANDLE(anv_cmd_buffer, primary, commandBuffer);
1485 
1486    assert(primary->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
1487 
1488    if (anv_batch_has_error(&primary->batch))
1489       return;
1490 
1491    /* The secondary command buffers will assume that the PMA fix is disabled
1492     * when they begin executing.  Make sure this is true.
1493     */
1494    genX(cmd_buffer_enable_pma_fix)(primary, false);
1495 
1496    /* The secondary command buffer doesn't know which textures etc. have been
1497     * flushed prior to their execution.  Apply those flushes now.
1498     */
1499    genX(cmd_buffer_apply_pipe_flushes)(primary);
1500 
1501    for (uint32_t i = 0; i < commandBufferCount; i++) {
1502       ANV_FROM_HANDLE(anv_cmd_buffer, secondary, pCmdBuffers[i]);
1503 
1504       assert(secondary->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1505       assert(!anv_batch_has_error(&secondary->batch));
1506 
1507 #if GFX_VERx10 >= 75
1508       if (secondary->state.conditional_render_enabled) {
1509          if (!primary->state.conditional_render_enabled) {
1510             /* Secondary buffer is constructed as if it will be executed
1511              * with conditional rendering, so we should satisfy this dependency
1512              * regardless of conditional rendering being enabled in primary.
1513              */
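            /* Writing all ones into ANV_PREDICATE_RESULT_REG makes the
             * predicate evaluate as "passed", so the secondary's predicated
             * commands execute unconditionally.
             */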
1514             struct mi_builder b;
1515             mi_builder_init(&b, primary->device->info, &primary->batch);
1516             mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
1517                          mi_imm(UINT64_MAX));
1518          }
1519       }
1520 #endif
1521 
1522       if (secondary->usage_flags &
1523           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
1524          /* If we're continuing a render pass from the primary, we need to
1525           * copy the surface states for the current subpass into the storage
1526           * we allocated for them in BeginCommandBuffer.
1527           */
1528          struct anv_bo *ss_bo =
1529             primary->device->surface_state_pool.block_pool.bo;
1530          struct anv_state src_state = primary->state.gfx.att_states;
1531          struct anv_state dst_state = secondary->state.gfx.att_states;
1532          assert(src_state.alloc_size == dst_state.alloc_size);
1533 
1534          genX(cmd_buffer_so_memcpy)(primary,
1535                                     (struct anv_address) {
1536                                        .bo = ss_bo,
1537                                        .offset = dst_state.offset,
1538                                     },
1539                                     (struct anv_address) {
1540                                        .bo = ss_bo,
1541                                        .offset = src_state.offset,
1542                                     },
1543                                     src_state.alloc_size);
1544       }
1545 
1546       anv_cmd_buffer_add_secondary(primary, secondary);
1547 
1548       assert(secondary->perf_query_pool == NULL || primary->perf_query_pool == NULL ||
1549              secondary->perf_query_pool == primary->perf_query_pool);
1550       if (secondary->perf_query_pool)
1551          primary->perf_query_pool = secondary->perf_query_pool;
1552    }
1553 
1554    /* The secondary isn't counted in our VF cache tracking so we need to
1555     * invalidate the whole thing.
1556     */
1557    if (GFX_VER == 8) {
1558       anv_add_pending_pipe_bits(primary,
1559                                 ANV_PIPE_CS_STALL_BIT | ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
1560                                 "Secondary cmd buffer not tracked in VF cache");
1561    }
1562 
1563    /* The secondary may have selected a different pipeline (3D or compute) and
1564     * may have changed the current L3$ configuration.  Reset our tracking
1565     * variables to invalid values to ensure that we re-emit these in the case
1566     * where we do any draws or compute dispatches from the primary after the
1567     * secondary has returned.
1568     */
1569    primary->state.current_pipeline = UINT32_MAX;
1570    primary->state.current_l3_config = NULL;
1571    primary->state.current_hash_scale = 0;
1572    primary->state.gfx.push_constant_stages = 0;
1573    vk_dynamic_graphics_state_dirty_all(&primary->vk.dynamic_graphics_state);
1574 
1575    /* Each of the secondary command buffers will use its own state base
1576     * address.  We need to re-emit state base address for the primary after
1577     * all of the secondaries are done.
1578     *
1579     * TODO: Maybe we want to make this a dirty bit to avoid extra state base
1580     * address calls?
1581     */
1582    genX(cmd_buffer_emit_state_base_address)(primary);
1583 }
1584 
1585 /**
1586  * Program the hardware to use the specified L3 configuration.
1587  */
1588 void
1589 genX(cmd_buffer_config_l3)(struct anv_cmd_buffer *cmd_buffer,
1590                            const struct intel_l3_config *cfg)
1591 {
1592    assert(cfg);
1593    if (cfg == cmd_buffer->state.current_l3_config)
1594       return;
1595 
1596    if (INTEL_DEBUG(DEBUG_L3)) {
1597       mesa_logd("L3 config transition: ");
1598       intel_dump_l3_config(cfg, stderr);
1599    }
1600 
1601    /* According to the hardware docs, the L3 partitioning can only be changed
1602     * while the pipeline is completely drained and the caches are flushed,
1603     * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1604     */
1605    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1606       pc.DCFlushEnable = true;
1607       pc.PostSyncOperation = NoWrite;
1608       pc.CommandStreamerStallEnable = true;
1609       anv_debug_dump_pc(pc);
1610    }
1611 
1612    /* ...followed by a second pipelined PIPE_CONTROL that initiates
1613     * invalidation of the relevant caches.  Note that because RO invalidation
1614     * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1615     * command is processed by the CS) we cannot combine it with the previous
1616     * stalling flush as the hardware documentation suggests, because that
1617     * would cause the CS to stall on previous rendering *after* RO
1618     * invalidation and wouldn't prevent the RO caches from being polluted by
1619     * concurrent rendering before the stall completes.  This intentionally
1620     * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1621     * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1622     * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1623     * already guarantee that there is no concurrent GPGPU kernel execution
1624     * (see SKL HSD 2132585).
1625     */
1626    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1627       pc.TextureCacheInvalidationEnable = true;
1628       pc.ConstantCacheInvalidationEnable = true;
1629       pc.InstructionCacheInvalidateEnable = true;
1630       pc.StateCacheInvalidationEnable = true;
1631       pc.PostSyncOperation = NoWrite;
1632       anv_debug_dump_pc(pc);
1633    }
1634 
1635    /* Now send a third stalling flush to make sure that invalidation is
1636     * complete when the L3 configuration registers are modified.
1637     */
1638    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
1639       pc.DCFlushEnable = true;
1640       pc.PostSyncOperation = NoWrite;
1641       pc.CommandStreamerStallEnable = true;
1642       anv_debug_dump_pc(pc);
1643    }
1644 
1645    genX(emit_l3_config)(&cmd_buffer->batch, cmd_buffer->device, cfg);
1646    cmd_buffer->state.current_l3_config = cfg;
1647 }
1648 
1649 ALWAYS_INLINE enum anv_pipe_bits
1650 genX(emit_apply_pipe_flushes)(struct anv_batch *batch,
1651                               struct anv_device *device,
1652                               uint32_t current_pipeline,
1653                               enum anv_pipe_bits bits)
1654 {
1655    /*
1656     * From Sandybridge PRM, volume 2, "1.7.2 End-of-Pipe Synchronization":
1657     *
1658     *    Write synchronization is a special case of end-of-pipe
1659     *    synchronization that requires that the render cache and/or depth
1660     *    related caches are flushed to memory, where the data will become
1661     *    globally visible. This type of synchronization is required prior to
1662     *    SW (CPU) actually reading the result data from memory, or initiating
1663     *    an operation that will use as a read surface (such as a texture
1664     *    surface) a previous render target and/or depth/stencil buffer
1665     *
1666     *
1667     * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1668     *
1669     *    Exercising the write cache flush bits (Render Target Cache Flush
1670     *    Enable, Depth Cache Flush Enable, DC Flush) in PIPE_CONTROL only
1671     *    ensures the write caches are flushed and doesn't guarantee the data
1672     *    is globally visible.
1673     *
1674     *    SW can track the completion of the end-of-pipe-synchronization by
1675     *    using "Notify Enable" and "PostSync Operation - Write Immediate
1676     *    Data" in the PIPE_CONTROL command.
1677     *
1678     * In other words, flushes are pipelined while invalidations are handled
1679     * immediately.  Therefore, if we're flushing anything then we need to
1680     * schedule an end-of-pipe sync before any invalidations can happen.
1681     */
1682    if (bits & ANV_PIPE_FLUSH_BITS)
1683       bits |= ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1684 
1685    /* If we're going to do an invalidate and we have a pending end-of-pipe
1686     * sync that has yet to be resolved, we do the end-of-pipe sync now.
1687     */
1688    if ((bits & ANV_PIPE_INVALIDATE_BITS) &&
1689        (bits & ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT)) {
1690       bits |= ANV_PIPE_END_OF_PIPE_SYNC_BIT;
1691       bits &= ~ANV_PIPE_NEEDS_END_OF_PIPE_SYNC_BIT;
1692    }
1693 
1694    /* Project: SKL / Argument: LRI Post Sync Operation [23]
1695     *
1696     * "PIPECONTROL command with “Command Streamer Stall Enable” must be
1697     *  programmed prior to programming a PIPECONTROL command with "LRI
1698     *  Post Sync Operation" in GPGPU mode of operation (i.e when
1699     *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
1700     *
1701     * The same text exists a few rows below for Post Sync Op.
1702     */
1703    if (bits & ANV_PIPE_POST_SYNC_BIT)
1704       bits &= ~ANV_PIPE_POST_SYNC_BIT;
1705 
1706    if (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1707                ANV_PIPE_END_OF_PIPE_SYNC_BIT)) {
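      /* A single PIPE_CONTROL below performs every requested flush and
       * stall; read-only cache invalidations are deferred to the second
       * PIPE_CONTROL emitted further down.
       */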
1708       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1709          /* Flushing HDC pipeline requires DC Flush on earlier HW. */
1710          pipe.DCFlushEnable |= bits & ANV_PIPE_HDC_PIPELINE_FLUSH_BIT;
1711          pipe.DepthCacheFlushEnable = bits & ANV_PIPE_DEPTH_CACHE_FLUSH_BIT;
1712          pipe.DCFlushEnable |= bits & ANV_PIPE_DATA_CACHE_FLUSH_BIT;
1713          pipe.RenderTargetCacheFlushEnable =
1714             bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
1715 
1716          pipe.CommandStreamerStallEnable = bits & ANV_PIPE_CS_STALL_BIT;
1717 #if GFX_VER == 8
1718          /* From Broadwell PRM, volume 2a:
1719           *    PIPE_CONTROL: Command Streamer Stall Enable:
1720           *
1721           *    "This bit must be always set when PIPE_CONTROL command is
1722           *     programmed by GPGPU and MEDIA workloads, except for the cases
1723           *     when only Read Only Cache Invalidation bits are set (State
1724           *     Cache Invalidation Enable, Instruction cache Invalidation
1725           *     Enable, Texture Cache Invalidation Enable, Constant Cache
1726           *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
1727           *     need not implemented when FF_DOP_CG is disabled."
1728           *
1729           *    Since we do all the invalidation in the following PIPE_CONTROL,
1730           *    if we got here, we need a stall.
1731           */
1732          pipe.CommandStreamerStallEnable |= current_pipeline == GPGPU;
1733 #endif
1734 
1735          pipe.StallAtPixelScoreboard = bits & ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
1736 
1737          /* From Sandybridge PRM, volume 2, "1.7.3.1 Writing a Value to Memory":
1738           *
1739           *    "The most common action to perform upon reaching a
1740           *    synchronization point is to write a value out to memory. An
1741           *    immediate value (included with the synchronization command) may
1742           *    be written."
1743           *
1744           *
1745           * From Broadwell PRM, volume 7, "End-of-Pipe Synchronization":
1746           *
1747           *    "In case the data flushed out by the render engine is to be
1748           *    read back in to the render engine in coherent manner, then the
1749           *    render engine has to wait for the fence completion before
1750           *    accessing the flushed data. This can be achieved by following
1751           *    means on various products: PIPE_CONTROL command with CS Stall
1752           *    and the required write caches flushed with Post-Sync-Operation
1753           *    as Write Immediate Data.
1754           *
1755           *    Example:
1756           *       - Workload-1 (3D/GPGPU/MEDIA)
1757           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1758           *         Immediate Data, Required Write Cache Flush bits set)
1759           *       - Workload-2 (Can use the data produce or output by
1760           *         Workload-1)
1761           */
1762          if (bits & ANV_PIPE_END_OF_PIPE_SYNC_BIT) {
1763             pipe.CommandStreamerStallEnable = true;
1764             pipe.PostSyncOperation = WriteImmediateData;
1765             pipe.Address = device->workaround_address;
1766          }
1767 
1768          /*
1769           * According to the Broadwell documentation, any PIPE_CONTROL with the
1770           * "Command Streamer Stall" bit set must also have another bit set,
1771           * chosen from the following options:
1772           *
1773           *  - Render Target Cache Flush
1774           *  - Depth Cache Flush
1775           *  - Stall at Pixel Scoreboard
1776           *  - Post-Sync Operation
1777           *  - Depth Stall
1778           *  - DC Flush Enable
1779           *
1780           * I chose "Stall at Pixel Scoreboard" since that's what we use in
1781           * mesa and it seems to work fine. The choice is fairly arbitrary.
1782           */
1783          if (pipe.CommandStreamerStallEnable &&
1784              !pipe.RenderTargetCacheFlushEnable &&
1785              !pipe.DepthCacheFlushEnable &&
1786              !pipe.StallAtPixelScoreboard &&
1787              !pipe.PostSyncOperation &&
1788              !pipe.DepthStallEnable &&
1789              !pipe.DCFlushEnable)
1790             pipe.StallAtPixelScoreboard = true;
1791          anv_debug_dump_pc(pipe);
1792       }
1793 
1794       /* If a render target flush was emitted, then we can toggle off the bit
1795        * saying that render target writes are ongoing.
1796        */
1797       if (bits & ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT)
1798          bits &= ~(ANV_PIPE_RENDER_TARGET_BUFFER_WRITES);
1799 
1800       if (GFX_VERx10 == 75) {
1801          /* Haswell needs additional workarounds:
1802           *
1803           * From Haswell PRM, volume 2, part 1, "End-of-Pipe Synchronization":
1804           *
1805           *    Option 1:
1806           *    PIPE_CONTROL command with the CS Stall and the required write
1807           *    caches flushed with Post-SyncOperation as Write Immediate Data
1808           *    followed by eight dummy MI_STORE_DATA_IMM (write to scratch
1809           *    space) commands.
1810           *
1811           *    Example:
1812           *       - Workload-1
1813           *       - PIPE_CONTROL (CS Stall, Post-Sync-Operation Write
1814           *         Immediate Data, Required Write Cache Flush bits set)
1815           *       - MI_STORE_DATA_IMM (8 times) (Dummy data, Scratch Address)
1816           *       - Workload-2 (Can use the data produce or output by
1817           *         Workload-1)
1818           *
1819           * Unfortunately, both the PRMs and the internal docs are a bit
1820           * out-of-date in this regard.  What the Windows driver does (and
1821           * this appears to actually work) is to emit a register read from the
1822           * memory address written by the pipe control above.
1823           *
1824           * What register we load into doesn't matter.  We choose an indirect
1825           * rendering register because we know it always exists and it's one
1826           * of the first registers the command parser allows us to write.  If
1827           * you don't have command parser support in your kernel (pre-4.2),
1828           * this will get turned into MI_NOOP and you won't get the
1829           * workaround.  Unfortunately, there's just not much we can do in
1830           * that case.  This register is perfectly safe to write since we
1831           * always re-load all of the indirect draw registers right before
1832           * 3DPRIMITIVE when needed anyway.
1833           */
1834          anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
1835             lrm.RegisterAddress  = 0x243C; /* GFX7_3DPRIM_START_INSTANCE */
1836             lrm.MemoryAddress = device->workaround_address;
1837          }
1838       }
1839 
1840       bits &= ~(ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS |
1841                 ANV_PIPE_END_OF_PIPE_SYNC_BIT);
1842    }
1843 
1844    if (bits & ANV_PIPE_INVALIDATE_BITS) {
1845       anv_batch_emit(batch, GENX(PIPE_CONTROL), pipe) {
1846          pipe.StateCacheInvalidationEnable =
1847             bits & ANV_PIPE_STATE_CACHE_INVALIDATE_BIT;
1848          pipe.ConstantCacheInvalidationEnable =
1849             bits & ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT;
1850          pipe.VFCacheInvalidationEnable =
1851             bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT;
1852          pipe.TextureCacheInvalidationEnable =
1853             bits & ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT;
1854          pipe.InstructionCacheInvalidateEnable =
1855             bits & ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT;
1856 
1857          anv_debug_dump_pc(pipe);
1858       }
1859 
1860       bits &= ~ANV_PIPE_INVALIDATE_BITS;
1861    }
1862 
1863    return bits;
1864 }
1865 
1866 ALWAYS_INLINE void
1867 genX(cmd_buffer_apply_pipe_flushes)(struct anv_cmd_buffer *cmd_buffer)
1868 {
1869    enum anv_pipe_bits bits = cmd_buffer->state.pending_pipe_bits;
1870 
1871    if (unlikely(cmd_buffer->device->physical->always_flush_cache))
1872       bits |= ANV_PIPE_FLUSH_BITS | ANV_PIPE_INVALIDATE_BITS;
1873    else if (bits == 0)
1874       return;
1875 
1876    bool trace_flush =
1877       (bits & (ANV_PIPE_FLUSH_BITS | ANV_PIPE_STALL_BITS | ANV_PIPE_INVALIDATE_BITS)) != 0;
1878    if (trace_flush)
1879       trace_intel_begin_stall(&cmd_buffer->trace);
1880 
1881    if (GFX_VER == 8 &&
1882        (bits & ANV_PIPE_CS_STALL_BIT) &&
1883        (bits & ANV_PIPE_VF_CACHE_INVALIDATE_BIT)) {
1884       /* If we are doing a VF cache invalidate AND a CS stall (it must be
1885        * both) then we can reset our vertex cache tracking.
1886        */
1887       memset(cmd_buffer->state.gfx.vb_dirty_ranges, 0,
1888              sizeof(cmd_buffer->state.gfx.vb_dirty_ranges));
1889       memset(&cmd_buffer->state.gfx.ib_dirty_range, 0,
1890              sizeof(cmd_buffer->state.gfx.ib_dirty_range));
1891    }
1892 
1893    cmd_buffer->state.pending_pipe_bits =
1894       genX(emit_apply_pipe_flushes)(&cmd_buffer->batch,
1895                                     cmd_buffer->device,
1896                                     cmd_buffer->state.current_pipeline,
1897                                     bits);
1898 
1899    if (trace_flush) {
1900       trace_intel_end_stall(&cmd_buffer->trace, bits,
1901                             anv_pipe_flush_bit_to_ds_stall_flag, NULL);
1902    }
1903 }
1904 
1905 static void
1906 cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
1907                    const VkDependencyInfo *dep_info,
1908                    const char *reason)
1909 {
1910    /* XXX: Right now, we're really dumb and just flush whatever categories
1911     * the app asks for.  One of these days we may make this a bit better
1912     * but right now that's all the hardware allows for in most areas.
1913     */
1914    VkAccessFlags2 src_flags = 0;
1915    VkAccessFlags2 dst_flags = 0;
1916 
1917    for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
1918       src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
1919       dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
1920    }
1921 
1922    for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
1923       src_flags |= dep_info->pBufferMemoryBarriers[i].srcAccessMask;
1924       dst_flags |= dep_info->pBufferMemoryBarriers[i].dstAccessMask;
1925    }
1926 
1927    for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
1928       const VkImageMemoryBarrier2 *img_barrier =
1929          &dep_info->pImageMemoryBarriers[i];
1930 
1931       src_flags |= img_barrier->srcAccessMask;
1932       dst_flags |= img_barrier->dstAccessMask;
1933 
1934       ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
1935       const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
1936 
1937       uint32_t base_layer, layer_count;
1938       if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
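         /* 3D images have no array layers; treat every depth slice at the
          * range's base mip level as a layer.
          */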
1939          base_layer = 0;
1940          layer_count = u_minify(image->vk.extent.depth, range->baseMipLevel);
1941       } else {
1942          base_layer = range->baseArrayLayer;
1943          layer_count = vk_image_subresource_layer_count(&image->vk, range);
1944       }
1945       const uint32_t level_count =
1946          vk_image_subresource_level_count(&image->vk, range);
1947 
1948       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1949          transition_depth_buffer(cmd_buffer, image,
1950                                  base_layer, layer_count,
1951                                  img_barrier->oldLayout,
1952                                  img_barrier->newLayout,
1953                                  false /* will_full_fast_clear */);
1954       }
1955 
1956       if (range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
1957          transition_stencil_buffer(cmd_buffer, image,
1958                                    range->baseMipLevel, level_count,
1959                                    base_layer, layer_count,
1960                                    img_barrier->oldLayout,
1961                                    img_barrier->newLayout,
1962                                    false /* will_full_fast_clear */);
1963 
1964          /* If we are in a renderpass, the gfx7 stencil shadow may need to be
1965           * updated even if the layout doesn't change.
1966           */
1967          if (cmd_buffer->state.gfx.samples &&
1968               (img_barrier->dstAccessMask & (VK_ACCESS_2_SHADER_READ_BIT |
1969                                              VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
1970                                              VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT))) {
1971             const uint32_t plane =
1972                anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
1973             if (anv_surface_is_valid(&image->planes[plane].shadow_surface))
1974                anv_image_copy_to_shadow(cmd_buffer, image,
1975                                         VK_IMAGE_ASPECT_STENCIL_BIT,
1976                                         range->baseMipLevel, level_count,
1977                                         base_layer, layer_count);
1978          }
1979       }
1980 
1981       if (range->aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
1982          VkImageAspectFlags color_aspects =
1983             vk_image_expand_aspect_mask(&image->vk, range->aspectMask);
1984          anv_foreach_image_aspect_bit(aspect_bit, image, color_aspects) {
1985             transition_color_buffer(cmd_buffer, image, 1UL << aspect_bit,
1986                                     range->baseMipLevel, level_count,
1987                                     base_layer, layer_count,
1988                                     img_barrier->oldLayout,
1989                                     img_barrier->newLayout,
1990                                     img_barrier->srcQueueFamilyIndex,
1991                                     img_barrier->dstQueueFamilyIndex,
1992                                     false /* will_full_fast_clear */);
1993          }
1994       }
1995    }
1996 
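   /* Source access flags determine which write caches must be flushed;
    * destination access flags determine which read caches must be
    * invalidated before the destination access happens.
    */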
1997    enum anv_pipe_bits bits =
1998       anv_pipe_flush_bits_for_access_flags(cmd_buffer->device, src_flags) |
1999       anv_pipe_invalidate_bits_for_access_flags(cmd_buffer->device, dst_flags);
2000 
2001    anv_add_pending_pipe_bits(cmd_buffer, bits, reason);
2002 }
2003 
2004 void genX(CmdPipelineBarrier2)(
2005     VkCommandBuffer                             commandBuffer,
2006     const VkDependencyInfo*                     pDependencyInfo)
2007 {
2008    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2009 
2010    cmd_buffer_barrier(cmd_buffer, pDependencyInfo, "pipe barrier");
2011 }
2012 
2013 static void
2014 cmd_buffer_alloc_push_constants(struct anv_cmd_buffer *cmd_buffer)
2015 {
2016    VkShaderStageFlags stages =
2017       cmd_buffer->state.gfx.pipeline->active_stages;
2018 
2019    /* In order to avoid thrash, we assume that vertex and fragment stages
2020     * always exist.  In the rare case where one is missing *and* the other
2021     * uses push constants, this may be suboptimal.  However, avoiding stalls
2022     * seems more important.
2023     */
2024    stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
2025    if (anv_pipeline_is_primitive(cmd_buffer->state.gfx.pipeline))
2026       stages |= VK_SHADER_STAGE_VERTEX_BIT;
2027 
2028    if (stages == cmd_buffer->state.gfx.push_constant_stages)
2029       return;
2030 
2031    const unsigned push_constant_kb =
2032       cmd_buffer->device->info->max_constant_urb_size_kb;
2033 
2034    const unsigned num_stages =
2035       util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
2036    unsigned size_per_stage = push_constant_kb / num_stages;
2037 
2038    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
2039     * units of 2KB.  Incidentally, these are the same platforms that have
2040     * 32KB worth of push constant space.
2041     */
2042    if (push_constant_kb == 32)
2043       size_per_stage &= ~1u;
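   /* Worked example: with 32KB of push constant space and VS, GS and FS
    * active, size_per_stage = 32 / 3 = 10KB (already a multiple of 2KB), so
    * VS and GS each get 10KB below and the PS allocation receives the
    * remaining 12KB.
    */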
2044 
2045    uint32_t kb_used = 0;
2046    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
2047       unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
2048       anv_batch_emit(&cmd_buffer->batch,
2049                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
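         /* The push constant ALLOC packets for VS, HS, DS and GS use
          * consecutive sub-opcodes starting at 18 (the VS value), so
          * 18 + i selects the packet matching stage i.
          */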
2050          alloc._3DCommandSubOpcode  = 18 + i;
2051          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
2052          alloc.ConstantBufferSize   = push_size;
2053       }
2054       kb_used += push_size;
2055    }
2056 
2057    anv_batch_emit(&cmd_buffer->batch,
2058                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
2059       alloc.ConstantBufferOffset = kb_used;
2060       alloc.ConstantBufferSize = push_constant_kb - kb_used;
2061    }
2062 
2063    cmd_buffer->state.gfx.push_constant_stages = stages;
2064 
2065    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
2066     *
2067     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
2068     *    the next 3DPRIMITIVE command after programming the
2069     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
2070     *
2071     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
2072     * pipeline setup, we need to dirty push constants.
2073     */
2074    cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_ALL_GRAPHICS;
2075 }
2076 
2077 static VkResult
2078 emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
2079                    struct anv_cmd_pipeline_state *pipe_state,
2080                    struct anv_shader_bin *shader,
2081                    struct anv_state *bt_state)
2082 {
2083    uint32_t state_offset;
2084 
2085    struct anv_pipeline_bind_map *map = &shader->bind_map;
2086    if (map->surface_count == 0) {
2087       *bt_state = (struct anv_state) { 0, };
2088       return VK_SUCCESS;
2089    }
2090 
2091    *bt_state = anv_cmd_buffer_alloc_binding_table(cmd_buffer,
2092                                                   map->surface_count,
2093                                                   &state_offset);
2094    uint32_t *bt_map = bt_state->map;
2095 
2096    if (bt_state->map == NULL)
2097       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2098 
2099    /* We only need to emit relocs if we're not using softpin.  If we are using
2100     * softpin then we always keep all user-allocated memory objects resident.
2101     */
2102    const bool need_client_mem_relocs =
2103       anv_use_relocations(cmd_buffer->device->physical);
2104    struct anv_push_constants *push = &pipe_state->push_constants;
2105 
2106    for (uint32_t s = 0; s < map->surface_count; s++) {
2107       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[s];
2108 
2109       struct anv_state surface_state;
2110 
2111       switch (binding->set) {
2112       case ANV_DESCRIPTOR_SET_NULL:
2113          bt_map[s] = 0;
2114          break;
2115 
2116       case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
2117          /* Color attachment binding */
2118          assert(shader->stage == MESA_SHADER_FRAGMENT);
2119          if (binding->index < cmd_buffer->state.gfx.color_att_count) {
2120             const struct anv_attachment *att =
2121                &cmd_buffer->state.gfx.color_att[binding->index];
2122             surface_state = att->surface_state.state;
2123          } else {
2124             surface_state = cmd_buffer->state.gfx.null_surface_state;
2125          }
2126          assert(surface_state.map);
2127          bt_map[s] = surface_state.offset + state_offset;
2128          break;
2129 
2130       case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS: {
2131          struct anv_state surface_state =
2132             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2133 
2134          struct anv_address constant_data = {
2135             .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2136             .offset = shader->kernel.offset +
2137                       shader->prog_data->const_data_offset,
2138          };
2139          unsigned constant_data_size = shader->prog_data->const_data_size;
2140 
2141          const enum isl_format format =
2142             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2143                                                VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
2144          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2145                                        format, ISL_SWIZZLE_IDENTITY,
2146                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2147                                        constant_data, constant_data_size, 1);
2148 
2149          assert(surface_state.map);
2150          bt_map[s] = surface_state.offset + state_offset;
2151          add_surface_reloc(cmd_buffer, surface_state, constant_data);
2152          break;
2153       }
2154 
2155       case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
2156          /* This is always the first binding for compute shaders */
2157          assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
2158 
2159          struct anv_state surface_state =
2160             anv_cmd_buffer_alloc_surface_state(cmd_buffer);
2161 
2162          const enum isl_format format =
2163             anv_isl_format_for_descriptor_type(cmd_buffer->device,
2164                                                VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
2165          anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2166                                        format, ISL_SWIZZLE_IDENTITY,
2167                                        ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
2168                                        cmd_buffer->state.compute.num_workgroups,
2169                                        12, 1);
2170 
2171          assert(surface_state.map);
2172          bt_map[s] = surface_state.offset + state_offset;
2173          if (need_client_mem_relocs) {
2174             add_surface_reloc(cmd_buffer, surface_state,
2175                               cmd_buffer->state.compute.num_workgroups);
2176          }
2177          break;
2178       }
2179 
2180       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2181          /* This is a descriptor set buffer so the set index is actually
2182           * given by binding->index.  (Yes, that's confusing.)
2183           */
2184          struct anv_descriptor_set *set =
2185             pipe_state->descriptors[binding->index];
2186          assert(set->desc_mem.alloc_size);
2187          assert(set->desc_surface_state.alloc_size);
2188          bt_map[s] = set->desc_surface_state.offset + state_offset;
2189          add_surface_reloc(cmd_buffer, set->desc_surface_state,
2190                            anv_descriptor_set_address(set));
2191          break;
2192       }
2193 
2194       default: {
2195          assert(binding->set < MAX_SETS);
2196          const struct anv_descriptor_set *set =
2197             pipe_state->descriptors[binding->set];
2198          if (binding->index >= set->descriptor_count) {
2199             /* From the Vulkan spec section entitled "DescriptorSet and
2200              * Binding Assignment":
2201              *
2202              *    "If the array is runtime-sized, then array elements greater
2203              *    than or equal to the size of that binding in the bound
2204              *    descriptor set must not be used."
2205              *
2206              * Unfortunately, the compiler isn't smart enough to figure out
2207              * when a dynamic binding isn't used so it may grab the whole
2208              * array and stick it in the binding table.  In this case, it's
2209              * safe to just skip those bindings that are OOB.
2210              */
2211             assert(binding->index < set->layout->descriptor_count);
2212             continue;
2213          }
2214          const struct anv_descriptor *desc = &set->descriptors[binding->index];
2215 
2216          switch (desc->type) {
2217          case VK_DESCRIPTOR_TYPE_SAMPLER:
2218             /* Nothing for us to do here */
2219             continue;
2220 
2221          case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER:
2222          case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE:
2223          case VK_DESCRIPTOR_TYPE_INPUT_ATTACHMENT: {
2224             if (desc->image_view) {
2225                struct anv_surface_state sstate =
2226                   (desc->layout == VK_IMAGE_LAYOUT_GENERAL) ?
2227                   desc->image_view->planes[binding->plane].general_sampler_surface_state :
2228                   desc->image_view->planes[binding->plane].optimal_sampler_surface_state;
2229                surface_state = sstate.state;
2230                assert(surface_state.alloc_size);
2231                if (need_client_mem_relocs)
2232                   add_surface_state_relocs(cmd_buffer, sstate);
2233             } else {
2234                surface_state = cmd_buffer->device->null_surface_state;
2235             }
2236             break;
2237          }
2238 
2239          case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: {
2240             if (desc->image_view) {
2241                struct anv_surface_state sstate =
2242                   binding->lowered_storage_surface
2243                   ? desc->image_view->planes[binding->plane].lowered_storage_surface_state
2244                   : desc->image_view->planes[binding->plane].storage_surface_state;
2245                surface_state = sstate.state;
2246                assert(surface_state.alloc_size);
2247                if (surface_state.offset == 0) {
2248                   mesa_loge("Bound a image to a descriptor where the "
2249                             "descriptor does not have NonReadable "
2250                             "set and the image does not have a "
2251                             "corresponding SPIR-V format enum.");
2252                   vk_debug_report(&cmd_buffer->device->physical->instance->vk,
2253                                   VK_DEBUG_REPORT_ERROR_BIT_EXT,
2254                                   &desc->image_view->vk.base,
2255                                   __LINE__, 0, "anv",
2256                                   "Bound a image to a descriptor where the "
2257                                   "descriptor does not have NonReadable "
2258                                   "set and the image does not have a "
2259                                   "corresponding SPIR-V format enum.");
2260                }
2261                if (surface_state.offset && need_client_mem_relocs)
2262                   add_surface_state_relocs(cmd_buffer, sstate);
2263             } else {
2264                surface_state = cmd_buffer->device->null_surface_state;
2265             }
2266             break;
2267          }
2268 
2269          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER:
2270          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER:
2271             if (desc->set_buffer_view) {
2272                surface_state = desc->set_buffer_view->surface_state;
2273                assert(surface_state.alloc_size);
2274                if (need_client_mem_relocs) {
2275                   add_surface_reloc(cmd_buffer, surface_state,
2276                                     desc->set_buffer_view->address);
2277                }
2278             } else {
2279                surface_state = cmd_buffer->device->null_surface_state;
2280             }
2281             break;
2282 
2283          case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER:
2284             if (desc->buffer_view) {
2285                surface_state = desc->buffer_view->surface_state;
2286                assert(surface_state.alloc_size);
2287                if (need_client_mem_relocs) {
2288                   add_surface_reloc(cmd_buffer, surface_state,
2289                                     desc->buffer_view->address);
2290                }
2291             } else {
2292                surface_state = cmd_buffer->device->null_surface_state;
2293             }
2294             break;
2295 
2296          case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC:
2297          case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: {
2298             if (desc->buffer) {
2299                /* Compute the offset within the buffer */
2300                uint32_t dynamic_offset =
2301                   push->dynamic_offsets[binding->dynamic_offset_index];
2302                uint64_t offset = desc->offset + dynamic_offset;
2303                /* Clamp to the buffer size */
2304                offset = MIN2(offset, desc->buffer->vk.size);
2305                /* Clamp the range to the buffer size */
2306                uint32_t range = MIN2(desc->range, desc->buffer->vk.size - offset);
2307 
2308                /* Align the range for consistency */
2309                if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
2310                   range = align(range, ANV_UBO_ALIGNMENT);
2311 
2312                struct anv_address address =
2313                   anv_address_add(desc->buffer->address, offset);
2314 
2315                surface_state =
2316                   anv_state_stream_alloc(&cmd_buffer->surface_state_stream, 64, 64);
2317                enum isl_format format =
2318                   anv_isl_format_for_descriptor_type(cmd_buffer->device,
2319                                                      desc->type);
2320 
2321                isl_surf_usage_flags_t usage =
2322                   desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ?
2323                   ISL_SURF_USAGE_CONSTANT_BUFFER_BIT :
2324                   ISL_SURF_USAGE_STORAGE_BIT;
2325 
2326                anv_fill_buffer_surface_state(cmd_buffer->device, surface_state,
2327                                              format, ISL_SWIZZLE_IDENTITY,
2328                                              usage, address, range, 1);
2329                if (need_client_mem_relocs)
2330                   add_surface_reloc(cmd_buffer, surface_state, address);
2331             } else {
2332                surface_state = cmd_buffer->device->null_surface_state;
2333             }
2334             break;
2335          }
2336 
2337          case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER:
2338             if (desc->buffer_view) {
2339                surface_state = binding->lowered_storage_surface
2340                   ? desc->buffer_view->lowered_storage_surface_state
2341                   : desc->buffer_view->storage_surface_state;
2342                assert(surface_state.alloc_size);
2343                if (need_client_mem_relocs) {
2344                   add_surface_reloc(cmd_buffer, surface_state,
2345                                     desc->buffer_view->address);
2346                }
2347             } else {
2348                surface_state = cmd_buffer->device->null_surface_state;
2349             }
2350             break;
2351 
2352          default:
2353             assert(!"Invalid descriptor type");
2354             continue;
2355          }
2356          assert(surface_state.map);
2357          bt_map[s] = surface_state.offset + state_offset;
2358          break;
2359       }
2360       }
2361    }
2362 
2363    return VK_SUCCESS;
2364 }
2365 
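/* Emit the sampler state table for one shader stage.  Each entry is a
 * packed SAMPLER_STATE structure (4 dwords, 16 bytes), so the table is
 * sampler_count * 16 bytes and entry s is copied from the plane of the
 * anv_sampler selected by the bind map.
 */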
2366 static VkResult
2367 emit_samplers(struct anv_cmd_buffer *cmd_buffer,
2368               struct anv_cmd_pipeline_state *pipe_state,
2369               struct anv_shader_bin *shader,
2370               struct anv_state *state)
2371 {
2372    struct anv_pipeline_bind_map *map = &shader->bind_map;
2373    if (map->sampler_count == 0) {
2374       *state = (struct anv_state) { 0, };
2375       return VK_SUCCESS;
2376    }
2377 
2378    uint32_t size = map->sampler_count * 16;
2379    *state = anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, size, 32);
2380 
2381    if (state->map == NULL)
2382       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
2383 
2384    for (uint32_t s = 0; s < map->sampler_count; s++) {
2385       struct anv_pipeline_binding *binding = &map->sampler_to_descriptor[s];
2386       const struct anv_descriptor *desc =
2387          &pipe_state->descriptors[binding->set]->descriptors[binding->index];
2388 
2389       if (desc->type != VK_DESCRIPTOR_TYPE_SAMPLER &&
2390           desc->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
2391          continue;
2392 
2393       struct anv_sampler *sampler = desc->sampler;
2394 
2395       /* This can happen if we have an unfilled slot since TYPE_SAMPLER
2396        * happens to be zero.
2397        */
2398       if (sampler == NULL)
2399          continue;
2400 
2401       memcpy(state->map + (s * 16),
2402              sampler->state[binding->plane], sizeof(sampler->state[0]));
2403    }
2404 
2405    return VK_SUCCESS;
2406 }
2407 
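/* Emit sampler and binding tables for every shader in 'shaders' whose stage
 * is in 'dirty'.  If the current binding table block runs out of space, a
 * new block is allocated, STATE_BASE_ADDRESS is re-emitted and all tables
 * are emitted again from scratch.  Returns the mask of stages that were
 * actually (re-)emitted so the caller knows which pointers to flush.
 */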
2408 static uint32_t
2409 flush_descriptor_sets(struct anv_cmd_buffer *cmd_buffer,
2410                       struct anv_cmd_pipeline_state *pipe_state,
2411                       const VkShaderStageFlags dirty,
2412                       struct anv_shader_bin **shaders,
2413                       uint32_t num_shaders)
2414 {
2415    VkShaderStageFlags flushed = 0;
2416 
2417    VkResult result = VK_SUCCESS;
2418    for (uint32_t i = 0; i < num_shaders; i++) {
2419       if (!shaders[i])
2420          continue;
2421 
2422       gl_shader_stage stage = shaders[i]->stage;
2423       VkShaderStageFlags vk_stage = mesa_to_vk_shader_stage(stage);
2424       if ((vk_stage & dirty) == 0)
2425          continue;
2426 
2427       assert(stage < ARRAY_SIZE(cmd_buffer->state.samplers));
2428       result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2429                              &cmd_buffer->state.samplers[stage]);
2430       if (result != VK_SUCCESS)
2431          break;
2432 
2433       assert(stage < ARRAY_SIZE(cmd_buffer->state.binding_tables));
2434       result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2435                                   &cmd_buffer->state.binding_tables[stage]);
2436       if (result != VK_SUCCESS)
2437          break;
2438 
2439       flushed |= vk_stage;
2440    }
2441 
2442    if (result != VK_SUCCESS) {
2443       assert(result == VK_ERROR_OUT_OF_DEVICE_MEMORY);
2444 
2445       result = anv_cmd_buffer_new_binding_table_block(cmd_buffer);
2446       if (result != VK_SUCCESS)
2447          return 0;
2448 
2449       /* Re-emit state base addresses so we get the new surface state base
2450        * address before we start emitting binding tables etc.
2451        */
2452       genX(cmd_buffer_emit_state_base_address)(cmd_buffer);
2453 
2454       /* Re-emit all active binding tables */
2455       flushed = 0;
2456 
2457       for (uint32_t i = 0; i < num_shaders; i++) {
2458          if (!shaders[i])
2459             continue;
2460 
2461          gl_shader_stage stage = shaders[i]->stage;
2462 
2463          result = emit_samplers(cmd_buffer, pipe_state, shaders[i],
2464                                 &cmd_buffer->state.samplers[stage]);
2465          if (result != VK_SUCCESS) {
2466             anv_batch_set_error(&cmd_buffer->batch, result);
2467             return 0;
2468          }
2469          result = emit_binding_table(cmd_buffer, pipe_state, shaders[i],
2470                                      &cmd_buffer->state.binding_tables[stage]);
2471          if (result != VK_SUCCESS) {
2472             anv_batch_set_error(&cmd_buffer->batch, result);
2473             return 0;
2474          }
2475 
2476          flushed |= mesa_to_vk_shader_stage(stage);
2477       }
2478    }
2479 
2480    return flushed;
2481 }
2482 
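/* Write 3DSTATE_SAMPLER_STATE_POINTERS_* and 3DSTATE_BINDING_TABLE_POINTERS_*
 * for the requested stages.  Both packets share the layout of their VS
 * variant, so the VS template is emitted with the per-stage
 * _3DCommandSubOpcode patched in from the tables below.
 */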
2483 static void
2484 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
2485                                     uint32_t stages)
2486 {
2487    static const uint32_t sampler_state_opcodes[] = {
2488       [MESA_SHADER_VERTEX]                      = 43,
2489       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
2490       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
2491       [MESA_SHADER_GEOMETRY]                    = 46,
2492       [MESA_SHADER_FRAGMENT]                    = 47,
2493    };
2494 
2495    static const uint32_t binding_table_opcodes[] = {
2496       [MESA_SHADER_VERTEX]                      = 38,
2497       [MESA_SHADER_TESS_CTRL]                   = 39,
2498       [MESA_SHADER_TESS_EVAL]                   = 40,
2499       [MESA_SHADER_GEOMETRY]                    = 41,
2500       [MESA_SHADER_FRAGMENT]                    = 42,
2501    };
2502 
2503    anv_foreach_stage(s, stages) {
2504       assert(s < ARRAY_SIZE(binding_table_opcodes));
2505 
2506       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
2507          anv_batch_emit(&cmd_buffer->batch,
2508                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
2509             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
2510             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
2511          }
2512       }
2513 
2514       /* Always emit binding table pointers if we're asked to, since on SKL
2515        * this is what flushes push constants. */
2516       anv_batch_emit(&cmd_buffer->batch,
2517                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
2518          btp._3DCommandSubOpcode = binding_table_opcodes[s];
2519          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
2520       }
2521    }
2522 }
2523 
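/* Resolve a push range to the GPU address it should be pushed from: the
 * descriptor set's buffer, the push constant allocation in the dynamic
 * state pool, the shader's embedded constant data, or a bound (possibly
 * dynamic) uniform buffer.  NULL UBOs fall back to an address in the
 * workaround BO.
 */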
2524 static struct anv_address
2525 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
2526                        const struct anv_shader_bin *shader,
2527                        const struct anv_push_range *range)
2528 {
2529    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2530    switch (range->set) {
2531    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2532       /* This is a descriptor set buffer so the set index is
2533        * actually given by binding->binding.  (Yes, that's
2534        * confusing.)
2535        */
2536       struct anv_descriptor_set *set =
2537          gfx_state->base.descriptors[range->index];
2538       return anv_descriptor_set_address(set);
2539    }
2540 
2541    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
2542       if (gfx_state->base.push_constants_state.alloc_size == 0) {
2543          gfx_state->base.push_constants_state =
2544             anv_cmd_buffer_gfx_push_constants(cmd_buffer);
2545       }
2546       return (struct anv_address) {
2547          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
2548          .offset = gfx_state->base.push_constants_state.offset,
2549       };
2550    }
2551 
2552    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2553       return (struct anv_address) {
2554          .bo = cmd_buffer->device->instruction_state_pool.block_pool.bo,
2555          .offset = shader->kernel.offset +
2556                    shader->prog_data->const_data_offset,
2557       };
2558 
2559    default: {
2560       assert(range->set < MAX_SETS);
2561       struct anv_descriptor_set *set =
2562          gfx_state->base.descriptors[range->set];
2563       const struct anv_descriptor *desc =
2564          &set->descriptors[range->index];
2565 
2566       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2567          if (desc->buffer_view)
2568             return desc->buffer_view->address;
2569       } else {
2570          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2571          if (desc->buffer) {
2572             const struct anv_push_constants *push =
2573                &gfx_state->base.push_constants;
2574             uint32_t dynamic_offset =
2575                push->dynamic_offsets[range->dynamic_offset_index];
2576             return anv_address_add(desc->buffer->address,
2577                                    desc->offset + dynamic_offset);
2578          }
2579       }
2580 
2581       /* For NULL UBOs, we just return an address in the workaround BO.  We do
2582        * writes to it for workarounds but always at the bottom.  The higher
2583        * bytes should be all zeros.
2584        */
2585       assert(range->length * 32 <= 2048);
2586       return (struct anv_address) {
2587          .bo = cmd_buffer->device->workaround_bo,
2588          .offset = 1024,
2589       };
2590    }
2591    }
2592 }
2593 
2594 
2595 /** Returns the size in bytes of the bound buffer
2596  *
2597  * The range is relative to the start of the buffer, not the start of the
2598  * range.  The returned range may be smaller than
2599  *
2600  *    (range->start + range->length) * 32;
2601  */
2602 static uint32_t
2603 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
2604                           const struct anv_shader_bin *shader,
2605                           const struct anv_push_range *range)
2606 {
2607    assert(shader->stage != MESA_SHADER_COMPUTE);
2608    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2609    switch (range->set) {
2610    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
2611       struct anv_descriptor_set *set =
2612          gfx_state->base.descriptors[range->index];
2613       assert(range->start * 32 < set->desc_mem.alloc_size);
2614       assert((range->start + range->length) * 32 <= set->desc_mem.alloc_size);
2615       return set->desc_mem.alloc_size;
2616    }
2617 
2618    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
2619       return (range->start + range->length) * 32;
2620 
2621    case ANV_DESCRIPTOR_SET_SHADER_CONSTANTS:
2622       return ALIGN(shader->prog_data->const_data_size, ANV_UBO_ALIGNMENT);
2623 
2624    default: {
2625       assert(range->set < MAX_SETS);
2626       struct anv_descriptor_set *set =
2627          gfx_state->base.descriptors[range->set];
2628       const struct anv_descriptor *desc =
2629          &set->descriptors[range->index];
2630 
2631       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
2632          /* Here we promote a UBO to a binding table entry so that we can avoid
2633           * a layer of indirection.  We use the descriptor set's internally
2634           * allocated surface state to fill the binding table entry. */
2635          if (!desc->set_buffer_view)
2636             return 0;
2637 
2638          if (range->start * 32 > desc->set_buffer_view->range)
2639             return 0;
2640 
2641          return desc->set_buffer_view->range;
2642       } else {
2643          if (!desc->buffer)
2644             return 0;
2645 
2646          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
2647          /* Compute the offset within the buffer */
2648          const struct anv_push_constants *push =
2649             &gfx_state->base.push_constants;
2650          uint32_t dynamic_offset =
2651             push->dynamic_offsets[range->dynamic_offset_index];
2652          uint64_t offset = desc->offset + dynamic_offset;
2653          /* Clamp to the buffer size */
2654          offset = MIN2(offset, desc->buffer->vk.size);
2655          /* Clamp the range to the buffer size */
2656          uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
2657 
2658          /* Align the range for consistency */
2659          bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
2660 
2661          return bound_range;
2662       }
2663    }
2664    }
2665 }
2666 
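/* Emit a single 3DSTATE_CONSTANT_* packet for one stage.  As with the
 * descriptor pointers above, the VS packet layout is reused and the
 * per-stage _3DCommandSubOpcode is patched in.  'buffers' holds the
 * addresses previously resolved with get_push_range_address(), one per
 * non-empty push range.
 */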
2667 static void
2668 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
2669                               gl_shader_stage stage,
2670                               struct anv_address *buffers,
2671                               unsigned buffer_count)
2672 {
2673    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2674    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2675 
2676    static const uint32_t push_constant_opcodes[] = {
2677       [MESA_SHADER_VERTEX]                      = 21,
2678       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
2679       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
2680       [MESA_SHADER_GEOMETRY]                    = 22,
2681       [MESA_SHADER_FRAGMENT]                    = 23,
2682    };
2683 
2684    assert(stage < ARRAY_SIZE(push_constant_opcodes));
2685 
2686    UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
2687 
2688    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
2689       c._3DCommandSubOpcode = push_constant_opcodes[stage];
2690 
2691       /* Set MOCS, except on Gfx8, because the Broadwell PRM says:
2692        *
2693        *    "Constant Buffer Object Control State must be always
2694        *     programmed to zero."
2695        *
2696        * This restriction does not exist on any newer platforms.
2697        *
2698        * We only have one MOCS field for the whole packet, not one per
2699        * buffer.  We could go out of our way here to walk over all of
2700        * the buffers and see if any of them are used externally and use
2701        * the external MOCS.  However, the notion that someone would use
2702        * the same bit of memory for both scanout and a UBO is nuts.
2703        *
2704        * Let's not bother and assume it's all internal.
2705        */
2706 #if GFX_VER != 8
2707       c.ConstantBody.MOCS = mocs;
2708 #endif
2709 
2710       if (anv_pipeline_has_stage(pipeline, stage)) {
2711          const struct anv_pipeline_bind_map *bind_map =
2712             &pipeline->shaders[stage]->bind_map;
2713 
2714 #if GFX_VERx10 >= 75
2715          /* The Skylake PRM contains the following restriction:
2716           *
2717           *    "The driver must ensure The following case does not occur
2718           *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
2719           *     buffer 3 read length equal to zero committed followed by a
2720           *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
2721           *     zero committed."
2722           *
2723           * To avoid this, we program the buffers in the highest slots.
2724           * This way, slot 0 is only used if slot 3 is also used.
2725           */
2726          assert(buffer_count <= 4);
2727          const unsigned shift = 4 - buffer_count;
2728          for (unsigned i = 0; i < buffer_count; i++) {
2729             const struct anv_push_range *range = &bind_map->push_ranges[i];
2730 
2731             /* At this point we only have non-empty ranges */
2732             assert(range->length > 0);
2733 
2734             /* For Ivy Bridge, make sure we only set the first range (actual
2735              * push constants)
2736              */
2737             assert((GFX_VERx10 >= 75) || i == 0);
2738 
2739             c.ConstantBody.ReadLength[i + shift] = range->length;
2740             c.ConstantBody.Buffer[i + shift] =
2741                anv_address_add(buffers[i], range->start * 32);
2742          }
2743 #else
2744          /* For Ivy Bridge, push constants are relative to dynamic state
2745           * base address and we only ever push actual push constants.
2746           */
2747          if (bind_map->push_ranges[0].length > 0) {
2748             assert(buffer_count == 1);
2749             assert(bind_map->push_ranges[0].set ==
2750                    ANV_DESCRIPTOR_SET_PUSH_CONSTANTS);
2751             assert(buffers[0].bo ==
2752                    cmd_buffer->device->dynamic_state_pool.block_pool.bo);
2753             c.ConstantBody.ReadLength[0] = bind_map->push_ranges[0].length;
2754             c.ConstantBody.Buffer[0].bo = NULL;
2755             c.ConstantBody.Buffer[0].offset = buffers[0].offset;
2756          }
2757          assert(bind_map->push_ranges[1].length == 0);
2758          assert(bind_map->push_ranges[2].length == 0);
2759          assert(bind_map->push_ranges[3].length == 0);
2760 #endif
2761       }
2762    }
2763 }
2764 
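/* Re-emit push constants for every dirty graphics stage.  With
 * robustBufferAccess enabled we first record, per stage, which pushed
 * registers are actually backed by a bound range (push_reg_mask); the
 * buffer addresses are then gathered in a second pass (see the comment in
 * the loop below) before 3DSTATE_CONSTANT_* is emitted for each stage.
 */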
2765 static void
2766 cmd_buffer_flush_push_constants(struct anv_cmd_buffer *cmd_buffer,
2767                                 VkShaderStageFlags dirty_stages)
2768 {
2769    VkShaderStageFlags flushed = 0;
2770    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
2771    const struct anv_graphics_pipeline *pipeline = gfx_state->pipeline;
2772 
2773    /* Compute robust pushed register access mask for each stage. */
2774    if (cmd_buffer->device->vk.enabled_features.robustBufferAccess) {
2775       anv_foreach_stage(stage, dirty_stages) {
2776          if (!anv_pipeline_has_stage(pipeline, stage))
2777             continue;
2778 
2779          const struct anv_shader_bin *shader = pipeline->shaders[stage];
2780          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2781          struct anv_push_constants *push = &gfx_state->base.push_constants;
2782 
2783          push->push_reg_mask[stage] = 0;
2784          /* Start of the current range in the shader, relative to the start of
2785           * push constants in the shader.
2786           */
2787          unsigned range_start_reg = 0;
2788          for (unsigned i = 0; i < 4; i++) {
2789             const struct anv_push_range *range = &bind_map->push_ranges[i];
2790             if (range->length == 0)
2791                continue;
2792 
2793             unsigned bound_size =
2794                get_push_range_bound_size(cmd_buffer, shader, range);
2795             if (bound_size >= range->start * 32) {
2796                unsigned bound_regs =
2797                   MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
2798                        range->length);
2799                assert(range_start_reg + bound_regs <= 64);
2800                push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
2801                                                               bound_regs);
2802             }
2803 
2804             cmd_buffer->state.push_constants_dirty |=
2805                mesa_to_vk_shader_stage(stage);
2806 
2807             range_start_reg += range->length;
2808          }
2809       }
2810    }
2811 
2812    /* Resets the push constant state so that we allocate a new one if
2813     * needed.
2814     */
2815    gfx_state->base.push_constants_state = ANV_STATE_NULL;
2816 
2817    anv_foreach_stage(stage, dirty_stages) {
2818       unsigned buffer_count = 0;
2819       flushed |= mesa_to_vk_shader_stage(stage);
2820       UNUSED uint32_t max_push_range = 0;
2821 
2822       struct anv_address buffers[4] = {};
2823       if (anv_pipeline_has_stage(pipeline, stage)) {
2824          const struct anv_shader_bin *shader = pipeline->shaders[stage];
2825          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
2826 
2827          /* We have to gather buffer addresses as a second step because the
2828           * loop above puts data into the push constant area and the call to
2829           * get_push_range_address is what locks our push constants and copies
2830           * them into the actual GPU buffer.  If we did the two loops at the
2831           * same time, we'd risk only having some of the sizes in the push
2832           * constant buffer when we did the copy.
2833           */
2834          for (unsigned i = 0; i < 4; i++) {
2835             const struct anv_push_range *range = &bind_map->push_ranges[i];
2836             if (range->length == 0)
2837                break;
2838 
2839             buffers[i] = get_push_range_address(cmd_buffer, shader, range);
2840             max_push_range = MAX2(max_push_range, range->length);
2841             buffer_count++;
2842          }
2843 
2844          /* We have at most 4 buffers but they should be tightly packed */
2845          for (unsigned i = buffer_count; i < 4; i++)
2846             assert(bind_map->push_ranges[i].length == 0);
2847       }
2848 
2849       cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
2850    }
2851 
2852    cmd_buffer->state.push_constants_dirty &= ~flushed;
2853 }
2854 
2855 static void
2856 cmd_buffer_emit_clip(struct anv_cmd_buffer *cmd_buffer)
2857 {
2858    const struct vk_dynamic_graphics_state *dyn =
2859       &cmd_buffer->vk.dynamic_graphics_state;
2860 
2861    if (!(cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) &&
2862        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY) &&
2863 #if GFX_VER <= 7
2864        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) &&
2865        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) &&
2866 #endif
2867        !BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
2868       return;
2869 
2870    /* Take dynamic primitive topology into account with
2871     *    3DSTATE_CLIP::ViewportXYClipTestEnable
2872     */
2873    VkPolygonMode dynamic_raster_mode =
2874       genX(raster_polygon_mode)(cmd_buffer->state.gfx.pipeline,
2875                                 dyn->ia.primitive_topology);
2876    bool xy_clip_test_enable = (dynamic_raster_mode == VK_POLYGON_MODE_FILL);
2877 
2878    struct GENX(3DSTATE_CLIP) clip = {
2879       GENX(3DSTATE_CLIP_header),
2880 #if GFX_VER <= 7
2881       .FrontWinding = genX(vk_to_intel_front_face)[dyn->rs.front_face],
2882       .CullMode     = genX(vk_to_intel_cullmode)[dyn->rs.cull_mode],
2883 #endif
2884       .ViewportXYClipTestEnable = xy_clip_test_enable,
2885    };
2886    uint32_t dwords[GENX(3DSTATE_CLIP_length)];
2887 
2888    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2889    if (anv_pipeline_is_primitive(pipeline)) {
2890       const struct elk_vue_prog_data *last =
2891          anv_pipeline_get_last_vue_prog_data(pipeline);
2892       if (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2893          clip.MaximumVPIndex = dyn->vp.viewport_count > 0 ?
2894                                dyn->vp.viewport_count - 1 : 0;
2895       }
2896    }
2897 
2898    GENX(3DSTATE_CLIP_pack)(NULL, dwords, &clip);
2899    anv_batch_emit_merge(&cmd_buffer->batch, dwords,
2900                         pipeline->gfx7.clip);
2901 }
2902 
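/* Emit SF_CLIP_VIEWPORT entries for every dynamic viewport.  The matrix
 * elements encode the usual viewport transform, roughly:
 *
 *    screen.x = ndc.x * m00 + m30      m00 = width / 2,  m30 = x + width / 2
 *    screen.y = ndc.y * m11 + m31      m11 = height / 2, m31 = y + height / 2
 *    screen.z = ndc.z * m22 + m32
 *
 * When the pipeline uses a [-1, 1] clip-space depth (negative_one_to_one),
 * the depth scale and offset are halved and recentered accordingly.  The XY
 * guardband is then shrunk to the known render area / scissor so the
 * hardware can skip full clipping for primitives that stay inside it.
 */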
2903 static void
2904 cmd_buffer_emit_viewport(struct anv_cmd_buffer *cmd_buffer)
2905 {
2906    struct anv_instance *instance = cmd_buffer->device->physical->instance;
2907    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
2908    const struct vk_dynamic_graphics_state *dyn =
2909       &cmd_buffer->vk.dynamic_graphics_state;
2910    uint32_t count = dyn->vp.viewport_count;
2911    const VkViewport *viewports = dyn->vp.viewports;
2912    struct anv_state sf_clip_state =
2913       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 64, 64);
2914 
2915    bool negative_one_to_one =
2916       cmd_buffer->state.gfx.pipeline->negative_one_to_one;
2917 
2918    float scale = negative_one_to_one ? 0.5f : 1.0f;
2919 
2920    for (uint32_t i = 0; i < count; i++) {
2921       const VkViewport *vp = &viewports[i];
2922 
2923       /* The gfx7 state struct has just the matrix and guardband fields, the
2924        * gfx8 struct adds the min/max viewport fields. */
2925       struct GENX(SF_CLIP_VIEWPORT) sfv = {
2926          .ViewportMatrixElementm00 = vp->width / 2,
2927          .ViewportMatrixElementm11 = vp->height / 2,
2928          .ViewportMatrixElementm22 = (vp->maxDepth - vp->minDepth) * scale,
2929          .ViewportMatrixElementm30 = vp->x + vp->width / 2,
2930          .ViewportMatrixElementm31 = vp->y + vp->height / 2,
2931          .ViewportMatrixElementm32 = negative_one_to_one ?
2932             (vp->minDepth + vp->maxDepth) * scale : vp->minDepth,
2933          .XMinClipGuardband = -1.0f,
2934          .XMaxClipGuardband = 1.0f,
2935          .YMinClipGuardband = -1.0f,
2936          .YMaxClipGuardband = 1.0f,
2937 #if GFX_VER >= 8
2938          .XMinViewPort = vp->x,
2939          .XMaxViewPort = vp->x + vp->width - 1,
2940          .YMinViewPort = MIN2(vp->y, vp->y + vp->height),
2941          .YMaxViewPort = MAX2(vp->y, vp->y + vp->height) - 1,
2942 #endif
2943       };
2944 
2945       /* Fix depth test misrenderings by lowering translated depth range */
2946       if (instance->lower_depth_range_rate != 1.0f)
2947          sfv.ViewportMatrixElementm32 *= instance->lower_depth_range_rate;
2948 
2949       const uint32_t fb_size_max = 1 << 14;
2950       uint32_t x_min = 0, x_max = fb_size_max;
2951       uint32_t y_min = 0, y_max = fb_size_max;
2952 
2953       /* If we have a valid renderArea, include that */
2954       if (gfx->render_area.extent.width > 0 &&
2955           gfx->render_area.extent.height > 0) {
2956          x_min = MAX2(x_min, gfx->render_area.offset.x);
2957          x_max = MIN2(x_max, gfx->render_area.offset.x +
2958                              gfx->render_area.extent.width);
2959          y_min = MAX2(y_min, gfx->render_area.offset.y);
2960          y_max = MIN2(y_max, gfx->render_area.offset.y +
2961                              gfx->render_area.extent.height);
2962       }
2963 
2964       /* The client is required to have enough scissors for whatever it sets
2965        * as ViewportIndex but it's possible that they've got more viewports
2966        * set from a previous command.  Also, from the Vulkan 1.3.207:
2967        *
2968        *    "The application must ensure (using scissor if necessary) that
2969        *    all rendering is contained within the render area."
2970        *
2971        * If the client doesn't set a scissor, that basically means it
2972        * guarantees everything is in-bounds already.  If we end up using a
2973        * guardband of [-1, 1] in that case, there shouldn't be much loss.
2974        * It's theoretically possible that they could do all their clipping
2975        * with clip planes but that'd be a bit odd.
2976        */
2977       if (i < dyn->vp.scissor_count) {
2978          const VkRect2D *scissor = &dyn->vp.scissors[i];
2979          x_min = MAX2(x_min, scissor->offset.x);
2980          x_max = MIN2(x_max, scissor->offset.x + scissor->extent.width);
2981          y_min = MAX2(y_min, scissor->offset.y);
2982          y_max = MIN2(y_max, scissor->offset.y + scissor->extent.height);
2983       }
2984 
2985       /* Only bother calculating the guardband if our known render area is
2986        * less than the maximum size.  Otherwise, it will calculate [-1, 1]
2987        * anyway but possibly with precision loss.
2988        */
2989       if (x_min > 0 || x_max < fb_size_max ||
2990           y_min > 0 || y_max < fb_size_max) {
2991          intel_calculate_guardband_size(x_min, x_max, y_min, y_max,
2992                                         sfv.ViewportMatrixElementm00,
2993                                         sfv.ViewportMatrixElementm11,
2994                                         sfv.ViewportMatrixElementm30,
2995                                         sfv.ViewportMatrixElementm31,
2996                                         &sfv.XMinClipGuardband,
2997                                         &sfv.XMaxClipGuardband,
2998                                         &sfv.YMinClipGuardband,
2999                                         &sfv.YMaxClipGuardband);
3000       }
3001 
3002       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_state.map + i * 64, &sfv);
3003    }
3004 
3005    anv_batch_emit(&cmd_buffer->batch,
3006                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), clip) {
3007       clip.SFClipViewportPointer = sf_clip_state.offset;
3008    }
3009 }
3010 
3011 static void
3012 cmd_buffer_emit_depth_viewport(struct anv_cmd_buffer *cmd_buffer,
3013                                bool depth_clamp_enable)
3014 {
3015    const struct vk_dynamic_graphics_state *dyn =
3016       &cmd_buffer->vk.dynamic_graphics_state;
3017    uint32_t count = dyn->vp.viewport_count;
3018    const VkViewport *viewports = dyn->vp.viewports;
3019    struct anv_state cc_state =
3020       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, 32);
3021 
3022    for (uint32_t i = 0; i < count; i++) {
3023       const VkViewport *vp = &viewports[i];
3024 
3025       /* From the Vulkan spec:
3026        *
3027        *    "It is valid for minDepth to be greater than or equal to
3028        *    maxDepth."
3029        */
3030       float min_depth = MIN2(vp->minDepth, vp->maxDepth);
3031       float max_depth = MAX2(vp->minDepth, vp->maxDepth);
3032 
3033       struct GENX(CC_VIEWPORT) cc_viewport = {
3034          .MinimumDepth = depth_clamp_enable ? min_depth : 0.0f,
3035          .MaximumDepth = depth_clamp_enable ? max_depth : 1.0f,
3036       };
3037 
3038       GENX(CC_VIEWPORT_pack)(NULL, cc_state.map + i * 8, &cc_viewport);
3039    }
3040 
3041    anv_batch_emit(&cmd_buffer->batch,
3042                   GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), cc) {
3043       cc.CCViewportPointer = cc_state.offset;
3044    }
3045 }
3046 
3047 static void
3048 cmd_buffer_emit_scissor(struct anv_cmd_buffer *cmd_buffer)
3049 {
3050    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
3051    const struct vk_dynamic_graphics_state *dyn =
3052       &cmd_buffer->vk.dynamic_graphics_state;
3053    uint32_t count = dyn->vp.scissor_count;
3054    const VkRect2D *scissors = dyn->vp.scissors;
3055    const VkViewport *viewports = dyn->vp.viewports;
3056 
3057    /* Wa_1409725701:
3058     *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
3059     *    stored as an array of up to 16 elements. The location of first
3060     *    element of the array, as specified by Pointer to SCISSOR_RECT, should
3061     *    be aligned to a 64-byte boundary."
3062     */
3063    uint32_t alignment = 64;
3064    struct anv_state scissor_state =
3065       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, count * 8, alignment);
3066 
3067    for (uint32_t i = 0; i < count; i++) {
3068       const VkRect2D *s = &scissors[i];
3069       const VkViewport *vp = &viewports[i];
3070 
3071       /* Since xmax and ymax are inclusive, we have to have xmax < xmin or
3072        * ymax < ymin for empty clips.  In case clip x, y, width height are all
3073        * 0, the clamps below produce 0 for xmin, ymin, xmax, ymax, which isn't
3074        * what we want. Just special case empty clips and produce a canonical
3075        * empty clip. */
3076       static const struct GENX(SCISSOR_RECT) empty_scissor = {
3077          .ScissorRectangleYMin = 1,
3078          .ScissorRectangleXMin = 1,
3079          .ScissorRectangleYMax = 0,
3080          .ScissorRectangleXMax = 0
3081       };
3082 
3083       const int max = 0xffff;
3084 
3085       uint32_t y_min = MAX2(s->offset.y, MIN2(vp->y, vp->y + vp->height));
3086       uint32_t x_min = MAX2(s->offset.x, vp->x);
3087       int64_t y_max = MIN2(s->offset.y + s->extent.height - 1,
3088                        MAX2(vp->y, vp->y + vp->height) - 1);
3089       int64_t x_max = MIN2(s->offset.x + s->extent.width - 1,
3090                        vp->x + vp->width - 1);
3091 
3092       y_max = CLAMP(y_max, 0, INT16_MAX >> 1);
3093       x_max = CLAMP(x_max, 0, INT16_MAX >> 1);
3094 
3095       /* Do this math using int64_t so overflow gets clamped correctly. */
3096       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3097          y_min = CLAMP((uint64_t) y_min, gfx->render_area.offset.y, max);
3098          x_min = CLAMP((uint64_t) x_min, gfx->render_area.offset.x, max);
3099          y_max = CLAMP((uint64_t) y_max, 0,
3100                              gfx->render_area.offset.y +
3101                              gfx->render_area.extent.height - 1);
3102          x_max = CLAMP((uint64_t) x_max, 0,
3103                              gfx->render_area.offset.x +
3104                              gfx->render_area.extent.width - 1);
3105       }
3106 
3107       struct GENX(SCISSOR_RECT) scissor = {
3108          .ScissorRectangleYMin = y_min,
3109          .ScissorRectangleXMin = x_min,
3110          .ScissorRectangleYMax = y_max,
3111          .ScissorRectangleXMax = x_max
3112       };
3113 
3114       if (s->extent.width <= 0 || s->extent.height <= 0) {
3115          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8,
3116                                  &empty_scissor);
3117       } else {
3118          GENX(SCISSOR_RECT_pack)(NULL, scissor_state.map + i * 8, &scissor);
3119       }
3120    }
3121 
3122    anv_batch_emit(&cmd_buffer->batch,
3123                   GENX(3DSTATE_SCISSOR_STATE_POINTERS), ssp) {
3124       ssp.ScissorRectPointer = scissor_state.offset;
3125    }
3126 }
3127 
3128 static void
3129 cmd_buffer_emit_streamout(struct anv_cmd_buffer *cmd_buffer)
3130 {
3131    const struct vk_dynamic_graphics_state *dyn =
3132       &cmd_buffer->vk.dynamic_graphics_state;
3133    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3134 
3135 #if GFX_VER == 7
3136 #  define streamout_state_dw pipeline->gfx7.streamout_state
3137 #else
3138 #  define streamout_state_dw pipeline->gfx8.streamout_state
3139 #endif
3140 
3141    uint32_t dwords[GENX(3DSTATE_STREAMOUT_length)];
3142 
3143    struct GENX(3DSTATE_STREAMOUT) so = {
3144       GENX(3DSTATE_STREAMOUT_header),
3145       .RenderingDisable = dyn->rs.rasterizer_discard_enable,
3146    };
3147    GENX(3DSTATE_STREAMOUT_pack)(NULL, dwords, &so);
3148    anv_batch_emit_merge(&cmd_buffer->batch, dwords, streamout_state_dw);
3149 }
3150 
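/* Flush all graphics state needed before a 3DPRIMITIVE: L3 configuration,
 * pipeline select, pending pipe flushes, vertex buffers, the pipeline batch
 * itself, streamout buffers, descriptors, push constants and the various
 * dynamic state packets.  The ordering constraints are spelled out in the
 * comments below.
 */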
3151 ALWAYS_INLINE static void
3152 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
3153 {
3154    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3155    const struct vk_dynamic_graphics_state *dyn =
3156       &cmd_buffer->vk.dynamic_graphics_state;
3157    uint32_t *p;
3158 
3159    assert((pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
3160 
3161    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
3162 
3163    genX(flush_pipeline_select_3d)(cmd_buffer);
3164 
3165    /* Apply any pending pipeline flushes we may have.  We want to apply them
3166     * now because, if any of those flushes are for things like push constants,
3167     * the GPU will read the state at weird times.
3168     */
3169    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3170 
3171    uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & pipeline->vb_used;
3172    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE)
3173       vb_emit |= pipeline->vb_used;
3174 
3175    if (vb_emit) {
3176       const uint32_t num_buffers = __builtin_popcount(vb_emit);
3177       const uint32_t num_dwords = 1 + num_buffers * 4;
3178 
3179       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
3180                           GENX(3DSTATE_VERTEX_BUFFERS));
3181       uint32_t i = 0;
3182       u_foreach_bit(vb, vb_emit) {
3183          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
3184          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
3185 
3186          struct GENX(VERTEX_BUFFER_STATE) state;
3187          if (buffer) {
3188             uint32_t stride = dyn->vi_binding_strides[vb];
3189             UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
3190 
3191 #if GFX_VER <= 7
3192             bool per_instance = pipeline->vb[vb].instanced;
3193             uint32_t divisor = pipeline->vb[vb].instance_divisor *
3194                                pipeline->instance_multiplier;
3195 #endif
3196 
3197             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3198                .VertexBufferIndex = vb,
3199 
3200                .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
3201                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3202 #if GFX_VER <= 7
3203                .BufferAccessType = per_instance ? INSTANCEDATA : VERTEXDATA,
3204                .InstanceDataStepRate = per_instance ? divisor : 1,
3205 #endif
3206                .AddressModifyEnable = true,
3207                .BufferPitch = stride,
3208                .BufferStartingAddress = anv_address_add(buffer->address, offset),
3209                .NullVertexBuffer = offset >= buffer->vk.size,
3210 
3211 #if GFX_VER >= 8
3212                .BufferSize = size,
3213 #else
3214                /* XXX: to handle dynamic offset for older gens we might want
3215                 * to modify Endaddress, but there are issues when doing so:
3216                 *
3217                 * https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7439
3218                 */
3219                .EndAddress = anv_address_add(buffer->address, buffer->vk.size - 1),
3220 #endif
3221             };
3222          } else {
3223             state = (struct GENX(VERTEX_BUFFER_STATE)) {
3224                .VertexBufferIndex = vb,
3225                .NullVertexBuffer = true,
3226                .MOCS = anv_mocs(cmd_buffer->device, NULL,
3227                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3228             };
3229          }
3230 
3231 #if GFX_VER == 8
3232          genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
3233                                                         state.BufferStartingAddress,
3234                                                         state.BufferSize);
3235 #endif
3236 
3237          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
3238          i++;
3239       }
3240    }
3241 
3242    cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
3243 
3244    uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
3245                                 pipeline->active_stages;
3246    if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
3247        !vk_dynamic_graphics_state_any_dirty(dyn) &&
3248        !cmd_buffer->state.push_constants_dirty)
3249       return;
3250 
3251    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) ||
3252        (GFX_VER == 7 && (cmd_buffer->state.gfx.dirty &
3253                          ANV_CMD_DIRTY_PIPELINE))) {
3254       /* Wa_16011411144:
3255        *
3256        * SW must insert a PIPE_CONTROL cmd before and after the
3257        * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
3258        * state is not combined with other state changes.
3259        */
3260       if (intel_device_info_is_dg2(cmd_buffer->device->info)) {
3261          anv_add_pending_pipe_bits(cmd_buffer,
3262                                    ANV_PIPE_CS_STALL_BIT,
3263                                    "before SO_BUFFER change WA");
3264          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3265       }
3266 
3267       /* We don't need any per-buffer dirty tracking because you're not
3268        * allowed to bind different XFB buffers while XFB is enabled.
3269        */
3270       for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
3271          struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
3272          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
3273             sob.SOBufferIndex = idx;
3274 
3275             if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
3276                sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
3277                                    ISL_SURF_USAGE_STREAM_OUT_BIT);
3278                sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
3279                                                         xfb->offset);
3280 #if GFX_VER >= 8
3281                sob.SOBufferEnable = true;
3282                sob.StreamOffsetWriteEnable = false;
3283                /* Size is in DWords - 1 */
3284                sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
3285 #else
3286                /* We don't have SOBufferEnable in 3DSTATE_SO_BUFFER on Gfx7 so
3287                 * we trust in SurfaceEndAddress = SurfaceBaseAddress = 0 (the
3288                 * default for an empty SO_BUFFER packet) to disable them.
3289                 */
3290                sob.SurfacePitch = pipeline->gfx7.xfb_bo_pitch[idx];
3291                sob.SurfaceEndAddress = anv_address_add(xfb->buffer->address,
3292                                                        xfb->offset + xfb->size);
3293 #endif
3294             } else {
3295                sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
3296             }
3297          }
3298       }
3299    }
3300 
3301    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
3302       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
3303 
3304       /* If the pipeline changed, we may need to re-allocate push constant
3305        * space in the URB.
3306        */
3307       cmd_buffer_alloc_push_constants(cmd_buffer);
3308    }
3309 
3310 #if GFX_VER <= 7
3311    if (cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_VERTEX_BIT ||
3312        cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_VERTEX_BIT) {
3313       /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
3314        *
3315        *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
3316        *    stall needs to be sent just prior to any 3DSTATE_VS,
3317        *    3DSTATE_URB_VS, 3DSTATE_CONSTANT_VS,
3318        *    3DSTATE_BINDING_TABLE_POINTER_VS,
3319        *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one
3320        *    PIPE_CONTROL needs to be sent before any combination of VS
3321        *    associated 3DSTATE."
3322        */
3323       anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
3324          pc.DepthStallEnable  = true;
3325          pc.PostSyncOperation = WriteImmediateData;
3326          pc.Address           = cmd_buffer->device->workaround_address;
3327          anv_debug_dump_pc(pc);
3328       }
3329    }
3330 #endif
3331 
3332    /* Render targets live in the same binding table as fragment descriptors */
3333    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
3334       descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
3335 
3336    /* We emit the binding tables and sampler tables first, then emit push
3337     * constants and then finally emit binding table and sampler table
3338     * pointers.  It has to happen in this order, since emitting the binding
3339     * tables may change the push constants (in case of storage images). After
3340     * emitting push constants, on SKL+ we have to emit the corresponding
3341     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
3342     */
3343    uint32_t dirty = 0;
3344    if (descriptors_dirty) {
3345       dirty = flush_descriptor_sets(cmd_buffer,
3346                                     &cmd_buffer->state.gfx.base,
3347                                     descriptors_dirty,
3348                                     pipeline->shaders,
3349                                     ARRAY_SIZE(pipeline->shaders));
3350       cmd_buffer->state.descriptors_dirty &= ~dirty;
3351    }
3352 
3353    if (dirty || cmd_buffer->state.push_constants_dirty) {
3354       /* Because we're pushing UBOs, we have to push whenever either
3355        * descriptors or push constants is dirty.
3356        */
3357       dirty |= cmd_buffer->state.push_constants_dirty;
3358       cmd_buffer_flush_push_constants(cmd_buffer,
3359                                       dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3360    }
3361 
3362    if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
3363       cmd_buffer_emit_descriptor_pointers(cmd_buffer,
3364                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
3365    }
3366 
3367    cmd_buffer_emit_clip(cmd_buffer);
3368 
3369    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3370                                        ANV_CMD_DIRTY_XFB_ENABLE)) ||
3371        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE))
3372       cmd_buffer_emit_streamout(cmd_buffer);
3373 
3374    if ((cmd_buffer->state.gfx.dirty & (ANV_CMD_DIRTY_PIPELINE |
3375                                        ANV_CMD_DIRTY_RENDER_TARGETS)) ||
3376        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
3377        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS)) {
3378       cmd_buffer_emit_viewport(cmd_buffer);
3379       cmd_buffer_emit_depth_viewport(cmd_buffer,
3380                                      pipeline->depth_clamp_enable);
3381       cmd_buffer_emit_scissor(cmd_buffer);
3382    }
3383 
3384    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
3385        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_IA_PRIMITIVE_TOPOLOGY)) {
3386       uint32_t topology;
3387       if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
3388          topology = _3DPRIM_PATCHLIST(pipeline->patch_control_points);
3389       else
3390          topology = genX(vk_to_intel_primitive_type)[dyn->ia.primitive_topology];
3391 
3392       cmd_buffer->state.gfx.primitive_topology = topology;
3393 
3394 #if (GFX_VER >= 8)
3395       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
3396          vft.PrimitiveTopologyType = topology;
3397       }
3398 #endif
3399    }
3400 
3401    genX(cmd_buffer_flush_dynamic_state)(cmd_buffer);
3402 }
3403 
3404 static void
3405 emit_vertex_bo(struct anv_cmd_buffer *cmd_buffer,
3406                struct anv_address addr,
3407                uint32_t size, uint32_t index)
3408 {
3409    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, 5,
3410                                  GENX(3DSTATE_VERTEX_BUFFERS));
3411 
3412    GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, p + 1,
3413       &(struct GENX(VERTEX_BUFFER_STATE)) {
3414          .VertexBufferIndex = index,
3415          .AddressModifyEnable = true,
3416          .BufferPitch = 0,
3417          .MOCS = anv_mocs(cmd_buffer->device, addr.bo,
3418                           ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
3419          .NullVertexBuffer = size == 0,
3420 #if (GFX_VER >= 8)
3421          .BufferStartingAddress = addr,
3422          .BufferSize = size
3423 #else
3424          .BufferStartingAddress = addr,
3425          .EndAddress = anv_address_add(addr, size),
3426 #endif
3427       });
3428 
3429    genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer,
3430                                                   index, addr, size);
3431 }
3432 
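/* The firstVertex/baseInstance and draw index system values are fed to the
 * vertex shader as side-band vertex attributes: an 8-byte
 * {base_vertex, base_instance} buffer bound at ANV_SVGS_VB_INDEX and a
 * 4-byte draw index buffer bound at ANV_DRAWID_VB_INDEX, both allocated
 * from dynamic state (a null vertex buffer is used when both base values
 * are zero).
 */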
3433 static void
3434 emit_base_vertex_instance_bo(struct anv_cmd_buffer *cmd_buffer,
3435                              struct anv_address addr)
3436 {
3437    emit_vertex_bo(cmd_buffer, addr, addr.bo ? 8 : 0, ANV_SVGS_VB_INDEX);
3438 }
3439 
3440 static void
3441 emit_base_vertex_instance(struct anv_cmd_buffer *cmd_buffer,
3442                           uint32_t base_vertex, uint32_t base_instance)
3443 {
3444    if (base_vertex == 0 && base_instance == 0) {
3445       emit_base_vertex_instance_bo(cmd_buffer, ANV_NULL_ADDRESS);
3446    } else {
3447       struct anv_state id_state =
3448          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 8, 4);
3449 
3450       ((uint32_t *)id_state.map)[0] = base_vertex;
3451       ((uint32_t *)id_state.map)[1] = base_instance;
3452 
3453       struct anv_address addr = {
3454          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3455          .offset = id_state.offset,
3456       };
3457 
3458       emit_base_vertex_instance_bo(cmd_buffer, addr);
3459    }
3460 }
3461 
3462 static void
3463 emit_draw_index(struct anv_cmd_buffer *cmd_buffer, uint32_t draw_index)
3464 {
3465    struct anv_state state =
3466       anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 4, 4);
3467 
3468    ((uint32_t *)state.map)[0] = draw_index;
3469 
3470    struct anv_address addr = {
3471       .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
3472       .offset = state.offset,
3473    };
3474 
3475    emit_vertex_bo(cmd_buffer, addr, 4, ANV_DRAWID_VB_INDEX);
3476 }
3477 
3478 static void
3479 update_dirty_vbs_for_gfx8_vb_flush(struct anv_cmd_buffer *cmd_buffer,
3480                                    uint32_t access_type)
3481 {
3482    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3483    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3484 
3485    uint64_t vb_used = pipeline->vb_used;
3486    if (vs_prog_data->uses_firstvertex ||
3487        vs_prog_data->uses_baseinstance)
3488       vb_used |= 1ull << ANV_SVGS_VB_INDEX;
3489    if (vs_prog_data->uses_drawid)
3490       vb_used |= 1ull << ANV_DRAWID_VB_INDEX;
3491 
3492    genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(cmd_buffer,
3493                                                        access_type == RANDOM,
3494                                                        vb_used);
3495 }
3496 
3497 ALWAYS_INLINE static void
3498 cmd_buffer_emit_vertex_constants_and_flush(struct anv_cmd_buffer *cmd_buffer,
3499                                            const struct elk_vs_prog_data *vs_prog_data,
3500                                            uint32_t base_vertex,
3501                                            uint32_t base_instance,
3502                                            uint32_t draw_id,
3503                                            bool force_flush)
3504 {
3505    bool emitted = false;
3506    if (vs_prog_data->uses_firstvertex ||
3507        vs_prog_data->uses_baseinstance) {
3508       emit_base_vertex_instance(cmd_buffer, base_vertex, base_instance);
3509       emitted = true;
3510    }
3511    if (vs_prog_data->uses_drawid) {
3512       emit_draw_index(cmd_buffer, draw_id);
3513       emitted = true;
3514    }
3515    /* Emitting draw index or vertex index BOs may result in needing
3516     * additional VF cache flushes.
3517     */
3518    if (emitted || force_flush)
3519       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3520 }
3521 
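/* The draw entry points below follow roughly the same sequence: switch to
 * the 3D pipeline, emit the conditional render predicate when enabled,
 * upload the base vertex/instance and draw index side-band data, flush the
 * remaining graphics state, and finally emit 3DPRIMITIVE.
 */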
3522 void genX(CmdDraw)(
3523     VkCommandBuffer                             commandBuffer,
3524     uint32_t                                    vertexCount,
3525     uint32_t                                    instanceCount,
3526     uint32_t                                    firstVertex,
3527     uint32_t                                    firstInstance)
3528 {
3529    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3530    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3531    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3532 
3533    if (anv_batch_has_error(&cmd_buffer->batch))
3534       return;
3535 
3536    const uint32_t count =
3537       vertexCount * instanceCount * pipeline->instance_multiplier;
3538    anv_measure_snapshot(cmd_buffer,
3539                         INTEL_SNAPSHOT_DRAW,
3540                         "draw", count);
3541    trace_intel_begin_draw(&cmd_buffer->trace);
3542 
3543    /* Select pipeline here to allow
3544     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
3545     * cmd_buffer_flush_gfx_state().
3546     */
3547    genX(flush_pipeline_select_3d)(cmd_buffer);
3548 
3549    if (cmd_buffer->state.conditional_render_enabled)
3550       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3551 
3552    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3553                                               firstVertex, firstInstance, 0,
3554                                               false /* force_flush */);
3555 
3556    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3557 
3558    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3559       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3560       prim.VertexAccessType         = SEQUENTIAL;
3561       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3562       prim.VertexCountPerInstance   = vertexCount;
3563       prim.StartVertexLocation      = firstVertex;
3564       prim.InstanceCount            = instanceCount *
3565                                       pipeline->instance_multiplier;
3566       prim.StartInstanceLocation    = firstInstance;
3567       prim.BaseVertexLocation       = 0;
3568    }
3569 
3570    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3571 
3572    trace_intel_end_draw(&cmd_buffer->trace, count);
3573 }
3574 
3575 void genX(CmdDrawMultiEXT)(
3576     VkCommandBuffer                             commandBuffer,
3577     uint32_t                                    drawCount,
3578     const VkMultiDrawInfoEXT                   *pVertexInfo,
3579     uint32_t                                    instanceCount,
3580     uint32_t                                    firstInstance,
3581     uint32_t                                    stride)
3582 {
3583    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3584    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3585    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3586 
3587    if (anv_batch_has_error(&cmd_buffer->batch))
3588       return;
3589 
3590    const uint32_t count =
3591       drawCount * instanceCount * pipeline->instance_multiplier;
3592    anv_measure_snapshot(cmd_buffer,
3593                         INTEL_SNAPSHOT_DRAW,
3594                         "draw_multi", count);
3595    trace_intel_begin_draw_multi(&cmd_buffer->trace);
3596 
3597    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3598 
3599    if (cmd_buffer->state.conditional_render_enabled)
3600       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3601 
3602    uint32_t i = 0;
3603    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
3604       cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3605                                                  draw->firstVertex,
3606                                                  firstInstance, i, !i);
3607 
3608       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3609          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3610          prim.VertexAccessType         = SEQUENTIAL;
3611          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3612          prim.VertexCountPerInstance   = draw->vertexCount;
3613          prim.StartVertexLocation      = draw->firstVertex;
3614          prim.InstanceCount            = instanceCount *
3615                                          pipeline->instance_multiplier;
3616          prim.StartInstanceLocation    = firstInstance;
3617          prim.BaseVertexLocation       = 0;
3618       }
3619    }
3620 
3621    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3622 
3623    trace_intel_end_draw_multi(&cmd_buffer->trace, count);
3624 }
3625 
3626 void genX(CmdDrawIndexed)(
3627     VkCommandBuffer                             commandBuffer,
3628     uint32_t                                    indexCount,
3629     uint32_t                                    instanceCount,
3630     uint32_t                                    firstIndex,
3631     int32_t                                     vertexOffset,
3632     uint32_t                                    firstInstance)
3633 {
3634    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3635    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3636    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3637 
3638    if (anv_batch_has_error(&cmd_buffer->batch))
3639       return;
3640 
3641    const uint32_t count =
3642       indexCount * instanceCount * pipeline->instance_multiplier;
3643    anv_measure_snapshot(cmd_buffer,
3644                         INTEL_SNAPSHOT_DRAW,
3645                         "draw indexed",
3646                         count);
3647    trace_intel_begin_draw_indexed(&cmd_buffer->trace);
3648 
3649    /* Select the 3D pipeline up front so that
3650     * cmd_buffer_emit_vertex_constants_and_flush() can run without triggering
3651     * an extra flush before cmd_buffer_flush_gfx_state().
3652     */
3653    genX(flush_pipeline_select_3d)(cmd_buffer);
3654 
3655    if (cmd_buffer->state.conditional_render_enabled)
3656       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3657 
3658    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3659                                               vertexOffset, firstInstance,
3660                                               0, false /* force_flush */);
3661 
3662    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3663 
3664    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3665       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3666       prim.VertexAccessType         = RANDOM;
3667       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3668       prim.VertexCountPerInstance   = indexCount;
3669       prim.StartVertexLocation      = firstIndex;
3670       prim.InstanceCount            = instanceCount *
3671                                       pipeline->instance_multiplier;
3672       prim.StartInstanceLocation    = firstInstance;
3673       prim.BaseVertexLocation       = vertexOffset;
3674    }
3675 
3676    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3677 
3678    trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
3679 }
3680 
3681 void genX(CmdDrawMultiIndexedEXT)(
3682     VkCommandBuffer                             commandBuffer,
3683     uint32_t                                    drawCount,
3684     const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
3685     uint32_t                                    instanceCount,
3686     uint32_t                                    firstInstance,
3687     uint32_t                                    stride,
3688     const int32_t                              *pVertexOffset)
3689 {
3690    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3691    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3692    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3693 
3694    if (anv_batch_has_error(&cmd_buffer->batch))
3695       return;
3696 
3697    const uint32_t count =
3698       drawCount * instanceCount * pipeline->instance_multiplier;
3699    anv_measure_snapshot(cmd_buffer,
3700                         INTEL_SNAPSHOT_DRAW,
3701                         "draw indexed_multi",
3702                         count);
3703    trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
3704 
3705    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3706 
3707    if (cmd_buffer->state.conditional_render_enabled)
3708       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3709 
3710    uint32_t i = 0;
3711    if (pVertexOffset) {
3712       if (vs_prog_data->uses_drawid) {
3713          bool emitted = true;
3714          if (vs_prog_data->uses_firstvertex ||
3715              vs_prog_data->uses_baseinstance) {
3716             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3717             emitted = true;
3718          }
3719          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3720             if (vs_prog_data->uses_drawid) {
3721                emit_draw_index(cmd_buffer, i);
3722                emitted = true;
3723             }
3724             /* Emitting draw index or vertex index BOs may result in needing
3725              * additional VF cache flushes.
3726              */
3727             if (emitted)
3728                genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3729 
3730             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3731                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3732                prim.VertexAccessType         = RANDOM;
3733                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3734                prim.VertexCountPerInstance   = draw->indexCount;
3735                prim.StartVertexLocation      = draw->firstIndex;
3736                prim.InstanceCount            = instanceCount *
3737                                                pipeline->instance_multiplier;
3738                prim.StartInstanceLocation    = firstInstance;
3739                prim.BaseVertexLocation       = *pVertexOffset;
3740             }
3741             emitted = false;
3742          }
3743       } else {
3744          if (vs_prog_data->uses_firstvertex ||
3745              vs_prog_data->uses_baseinstance) {
3746             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
3747             /* Emitting draw index or vertex index BOs may result in needing
3748              * additional VF cache flushes.
3749              */
3750             genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3751          }
3752          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3753             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3754                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3755                prim.VertexAccessType         = RANDOM;
3756                prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3757                prim.VertexCountPerInstance   = draw->indexCount;
3758                prim.StartVertexLocation      = draw->firstIndex;
3759                prim.InstanceCount            = instanceCount *
3760                                                pipeline->instance_multiplier;
3761                prim.StartInstanceLocation    = firstInstance;
3762                prim.BaseVertexLocation       = *pVertexOffset;
3763             }
3764          }
3765       }
3766    } else {
3767       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3768          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
3769                                                     draw->vertexOffset,
3770                                                     firstInstance, i, i != 0);
3771 
3772          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3773             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3774             prim.VertexAccessType         = RANDOM;
3775             prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3776             prim.VertexCountPerInstance   = draw->indexCount;
3777             prim.StartVertexLocation      = draw->firstIndex;
3778             prim.InstanceCount            = instanceCount *
3779                                             pipeline->instance_multiplier;
3780             prim.StartInstanceLocation    = firstInstance;
3781             prim.BaseVertexLocation       = draw->vertexOffset;
3782          }
3783       }
3784    }
3785 
3786    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
3787 
3788    trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
3789 }
3790 
3791 /* Auto-Draw / Indirect Registers */
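/* These are the MMIO registers that 3DPRIMITIVE reads when its
 * IndirectParameterEnable bit is set.  The indirect draw paths below fill
 * them with MI commands (via the MI builder) instead of baking the draw
 * parameters into the 3DPRIMITIVE packet itself.
 */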
3792 #define GFX7_3DPRIM_END_OFFSET          0x2420
3793 #define GFX7_3DPRIM_START_VERTEX        0x2430
3794 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
3795 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
3796 #define GFX7_3DPRIM_START_INSTANCE      0x243C
3797 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
3798 
3799 void genX(CmdDrawIndirectByteCountEXT)(
3800     VkCommandBuffer                             commandBuffer,
3801     uint32_t                                    instanceCount,
3802     uint32_t                                    firstInstance,
3803     VkBuffer                                    counterBuffer,
3804     VkDeviceSize                                counterBufferOffset,
3805     uint32_t                                    counterOffset,
3806     uint32_t                                    vertexStride)
3807 {
3808 #if GFX_VERx10 >= 75
3809    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3810    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
3811    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3812    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3813 
3814    /* firstVertex is always zero for this draw function */
3815    const uint32_t firstVertex = 0;
3816 
3817    if (anv_batch_has_error(&cmd_buffer->batch))
3818       return;
3819 
3820    anv_measure_snapshot(cmd_buffer,
3821                         INTEL_SNAPSHOT_DRAW,
3822                         "draw indirect byte count",
3823                         instanceCount * pipeline->instance_multiplier);
3824    trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
3825 
3826    /* Select the 3D pipeline up front so that emit_base_vertex_instance() and
3827     * emit_draw_index() can run without triggering an extra flush before
3828     * cmd_buffer_flush_gfx_state().
3829     */
3830    genX(flush_pipeline_select_3d)(cmd_buffer);
3831 
3832    if (cmd_buffer->state.conditional_render_enabled)
3833       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3834 
3835    if (vs_prog_data->uses_firstvertex ||
3836        vs_prog_data->uses_baseinstance)
3837       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
3838    if (vs_prog_data->uses_drawid)
3839       emit_draw_index(cmd_buffer, 0);
3840 
3841    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3842 
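   /* Compute the vertex count on the GPU:
    *
    *    vertexCount = (counter - counterOffset) / vertexStride
    *
    * where "counter" is the byte count that a previous
    * vkCmdEndTransformFeedbackEXT() stored into counterBuffer.
    */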
3843    struct mi_builder b;
3844    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3845    struct mi_value count =
3846       mi_mem32(anv_address_add(counter_buffer->address,
3847                                    counterBufferOffset));
3848    if (counterOffset)
3849       count = mi_isub(&b, count, mi_imm(counterOffset));
3850    count = mi_udiv32_imm(&b, count, vertexStride);
3851    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
3852 
3853    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
3854    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
3855             mi_imm(instanceCount * pipeline->instance_multiplier));
3856    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
3857    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3858 
3859    anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3860       prim.IndirectParameterEnable  = true;
3861       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3862       prim.VertexAccessType         = SEQUENTIAL;
3863       prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3864    }
3865 
3866    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3867 
3868    trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
3869       instanceCount * pipeline->instance_multiplier);
3870 #endif /* GFX_VERx10 >= 75 */
3871 }
3872 
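/* Loads the 3DPRIM_* registers from an indirect draw buffer.  For reference
 * (a sketch of the core Vulkan layouts, not a definition from this file):
 *
 *    VkDrawIndirectCommand        { vertexCount, instanceCount,
 *                                   firstVertex, firstInstance }   offsets 0/4/8/12
 *    VkDrawIndexedIndirectCommand { indexCount, instanceCount, firstIndex,
 *                                   vertexOffset, firstInstance }  offsets 0/4/8/12/16
 *
 * which is why the indexed path reads BaseVertex at +12 and StartInstance at
 * +16, while the non-indexed path reads StartInstance at +12 and forces
 * BaseVertex to 0.
 */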
3873 static void
3874 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
3875                          struct anv_address addr,
3876                          bool indexed)
3877 {
3878    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3879 
3880    struct mi_builder b;
3881    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
3882 
3883    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
3884                 mi_mem32(anv_address_add(addr, 0)));
3885 
3886    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
3887    if (pipeline->instance_multiplier > 1) {
3888 #if GFX_VERx10 >= 75
3889       instance_count = mi_imul_imm(&b, instance_count,
3890                                    pipeline->instance_multiplier);
3891 #else
3892       anv_finishme("Multiview + indirect draw requires MI_MATH; "
3893                    "MI_MATH is not supported on Ivy Bridge");
3894 #endif
3895    }
3896    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
3897 
3898    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
3899                 mi_mem32(anv_address_add(addr, 8)));
3900 
3901    if (indexed) {
3902       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
3903                    mi_mem32(anv_address_add(addr, 12)));
3904       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3905                    mi_mem32(anv_address_add(addr, 16)));
3906    } else {
3907       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
3908                    mi_mem32(anv_address_add(addr, 12)));
3909       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
3910    }
3911 }
3912 
3913 void genX(CmdDrawIndirect)(
3914     VkCommandBuffer                             commandBuffer,
3915     VkBuffer                                    _buffer,
3916     VkDeviceSize                                offset,
3917     uint32_t                                    drawCount,
3918     uint32_t                                    stride)
3919 {
3920    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3921    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
3922    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3923    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3924 
3925    if (anv_batch_has_error(&cmd_buffer->batch))
3926       return;
3927 
3928    anv_measure_snapshot(cmd_buffer,
3929                         INTEL_SNAPSHOT_DRAW,
3930                         "draw indirect",
3931                         drawCount);
3932    trace_intel_begin_draw_indirect(&cmd_buffer->trace);
3933 
3934    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3935 
3936    if (cmd_buffer->state.conditional_render_enabled)
3937       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3938 
3939    for (uint32_t i = 0; i < drawCount; i++) {
3940       struct anv_address draw = anv_address_add(buffer->address, offset);
3941 
3942       if (vs_prog_data->uses_firstvertex ||
3943           vs_prog_data->uses_baseinstance)
3944          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
3945       if (vs_prog_data->uses_drawid)
3946          emit_draw_index(cmd_buffer, i);
3947 
3948       /* Emitting draw index or vertex index BOs may result in needing
3949        * additional VF cache flushes.
3950        */
3951       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
3952 
3953       load_indirect_parameters(cmd_buffer, draw, false);
3954 
3955       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
3956          prim.IndirectParameterEnable  = true;
3957          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
3958          prim.VertexAccessType         = SEQUENTIAL;
3959          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
3960       }
3961 
3962       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
3963 
3964       offset += stride;
3965    }
3966 
3967    trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
3968 }
3969 
3970 void genX(CmdDrawIndexedIndirect)(
3971     VkCommandBuffer                             commandBuffer,
3972     VkBuffer                                    _buffer,
3973     VkDeviceSize                                offset,
3974     uint32_t                                    drawCount,
3975     uint32_t                                    stride)
3976 {
3977    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
3978    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
3979    struct anv_graphics_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
3980    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
3981 
3982    if (anv_batch_has_error(&cmd_buffer->batch))
3983       return;
3984 
3985    anv_measure_snapshot(cmd_buffer,
3986                         INTEL_SNAPSHOT_DRAW,
3987                         "draw indexed indirect",
3988                         drawCount);
3989    trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
3990 
3991    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
3992 
3993    if (cmd_buffer->state.conditional_render_enabled)
3994       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
3995 
3996    for (uint32_t i = 0; i < drawCount; i++) {
3997       struct anv_address draw = anv_address_add(buffer->address, offset);
3998 
3999       /* TODO: We need to stomp base vertex to 0 somehow */
4000       if (vs_prog_data->uses_firstvertex ||
4001           vs_prog_data->uses_baseinstance)
4002          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4003       if (vs_prog_data->uses_drawid)
4004          emit_draw_index(cmd_buffer, i);
4005 
4006       /* Emitting draw index or vertex index BOs may result in needing
4007        * additional VF cache flushes.
4008        */
4009       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4010 
4011       load_indirect_parameters(cmd_buffer, draw, true);
4012 
4013       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4014          prim.IndirectParameterEnable  = true;
4015          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
4016          prim.VertexAccessType         = RANDOM;
4017          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4018       }
4019 
4020       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4021 
4022       offset += stride;
4023    }
4024 
4025    trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
4026 }
4027 
4028 static struct mi_value
4029 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4030                                  struct mi_builder *b,
4031                                  struct anv_buffer *count_buffer,
4032                                  uint64_t countBufferOffset)
4033 {
4034    struct anv_address count_address =
4035          anv_address_add(count_buffer->address, countBufferOffset);
4036 
4037    struct mi_value ret = mi_imm(0);
4038 
4039    if (cmd_buffer->state.conditional_render_enabled) {
4040 #if GFX_VERx10 >= 75
4041       ret = mi_new_gpr(b);
4042       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
4043 #endif
4044    } else {
4045       /* Upload the current draw count from the draw parameters buffer to
4046        * MI_PREDICATE_SRC0.
4047        */
4048       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
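      /* emit_draw_count_predicate() only ever writes the low dword of
       * MI_PREDICATE_SRC1 (the draw index), so clear the high dword once
       * here.
       */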
4049       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
4050    }
4051 
4052    return ret;
4053 }
4054 
4055 static void
4056 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
4057                           struct mi_builder *b,
4058                           uint32_t draw_index)
4059 {
4060    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
4061    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
4062 
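   /* A worked example for the predicate sequence below, assuming the count
    * buffer holds a draw count of 2 and maxDrawCount is 4:
    *
    *    i=0:  !(2 == 0)         -> TRUE   (draw executes)
    *    i=1:  TRUE  ^ (2 == 1)  -> TRUE   (draw executes)
    *    i=2:  TRUE  ^ (2 == 2)  -> FALSE  (draw skipped)
    *    i=3:  FALSE ^ (2 == 3)  -> FALSE  (draw skipped)
    */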
4063    if (draw_index == 0) {
4064       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4065          mip.LoadOperation    = LOAD_LOADINV;
4066          mip.CombineOperation = COMBINE_SET;
4067          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4068       }
4069    } else {
4070       /* While draw_index < draw_count the predicate's result will be
4071        *  (draw_index == draw_count) ^ TRUE = TRUE
4072        * When draw_index == draw_count the result is
4073        *  (TRUE) ^ TRUE = FALSE
4074        * After this all results will be:
4075        *  (FALSE) ^ FALSE = FALSE
4076        */
4077       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4078          mip.LoadOperation    = LOAD_LOAD;
4079          mip.CombineOperation = COMBINE_XOR;
4080          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4081       }
4082    }
4083 }
4084 
4085 #if GFX_VERx10 >= 75
4086 static void
4087 emit_draw_count_predicate_with_conditional_render(
4088                           struct anv_cmd_buffer *cmd_buffer,
4089                           struct mi_builder *b,
4090                           uint32_t draw_index,
4091                           struct mi_value max)
4092 {
4093    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
4094    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
4095 
4096 #if GFX_VER >= 8
4097    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
4098 #else
4099    /* MI_PREDICATE_RESULT is not whitelisted by the i915 command parser,
4100     * so we emit an MI_PREDICATE to set it instead.
4101     */
4102 
4103    mi_store(b, mi_reg64(MI_PREDICATE_SRC0), pred);
4104    mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4105 
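   /* MI_PREDICATE_RESULT = (pred != 0): compare pred against zero and load
    * the inverted comparison result.
    */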
4106    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
4107       mip.LoadOperation    = LOAD_LOADINV;
4108       mip.CombineOperation = COMBINE_SET;
4109       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4110    }
4111 #endif
4112 }
4113 #endif
4114 
4115 static void
4116 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
4117                                struct mi_builder *b,
4118                                uint32_t draw_index,
4119                                struct mi_value max)
4120 {
4121 #if GFX_VERx10 >= 75
4122    if (cmd_buffer->state.conditional_render_enabled) {
4123       emit_draw_count_predicate_with_conditional_render(
4124             cmd_buffer, b, draw_index, mi_value_ref(b, max));
4125    } else {
4126       emit_draw_count_predicate(cmd_buffer, b, draw_index);
4127    }
4128 #else
4129    emit_draw_count_predicate(cmd_buffer, b, draw_index);
4130 #endif
4131 }
4132 
4133 void genX(CmdDrawIndirectCount)(
4134     VkCommandBuffer                             commandBuffer,
4135     VkBuffer                                    _buffer,
4136     VkDeviceSize                                offset,
4137     VkBuffer                                    _countBuffer,
4138     VkDeviceSize                                countBufferOffset,
4139     uint32_t                                    maxDrawCount,
4140     uint32_t                                    stride)
4141 {
4142    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4143    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4144    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4145    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4146    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4147    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4148 
4149    if (anv_batch_has_error(&cmd_buffer->batch))
4150       return;
4151 
4152    anv_measure_snapshot(cmd_buffer,
4153                         INTEL_SNAPSHOT_DRAW,
4154                         "draw indirect count",
4155                         0);
4156    trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
4157 
4158    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4159 
4160    struct mi_builder b;
4161    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4162    struct mi_value max =
4163       prepare_for_draw_count_predicate(cmd_buffer, &b,
4164                                        count_buffer, countBufferOffset);
4165 
4166    for (uint32_t i = 0; i < maxDrawCount; i++) {
4167       struct anv_address draw = anv_address_add(buffer->address, offset);
4168 
4169       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4170 
4171       if (vs_prog_data->uses_firstvertex ||
4172           vs_prog_data->uses_baseinstance)
4173          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 8));
4174       if (vs_prog_data->uses_drawid)
4175          emit_draw_index(cmd_buffer, i);
4176 
4177       /* Emitting draw index or vertex index BOs may result in needing
4178        * additional VF cache flushes.
4179        */
4180       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4181 
4182       load_indirect_parameters(cmd_buffer, draw, false);
4183 
4184       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4185          prim.IndirectParameterEnable  = true;
4186          prim.PredicateEnable          = true;
4187          prim.VertexAccessType         = SEQUENTIAL;
4188          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4189       }
4190 
4191       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
4192 
4193       offset += stride;
4194    }
4195 
4196    mi_value_unref(&b, max);
4197 
4198    trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
4199 }
4200 
4201 void genX(CmdDrawIndexedIndirectCount)(
4202     VkCommandBuffer                             commandBuffer,
4203     VkBuffer                                    _buffer,
4204     VkDeviceSize                                offset,
4205     VkBuffer                                    _countBuffer,
4206     VkDeviceSize                                countBufferOffset,
4207     uint32_t                                    maxDrawCount,
4208     uint32_t                                    stride)
4209 {
4210    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4211    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4212    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
4213    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
4214    struct anv_graphics_pipeline *pipeline = cmd_state->gfx.pipeline;
4215    const struct elk_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
4216 
4217    if (anv_batch_has_error(&cmd_buffer->batch))
4218       return;
4219 
4220    anv_measure_snapshot(cmd_buffer,
4221                         INTEL_SNAPSHOT_DRAW,
4222                         "draw indexed indirect count",
4223                         0);
4224    trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
4225 
4226    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
4227 
4228    struct mi_builder b;
4229    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4230    struct mi_value max =
4231       prepare_for_draw_count_predicate(cmd_buffer, &b,
4232                                        count_buffer, countBufferOffset);
4233 
4234    for (uint32_t i = 0; i < maxDrawCount; i++) {
4235       struct anv_address draw = anv_address_add(buffer->address, offset);
4236 
4237       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
4238 
4239       /* TODO: We need to stomp base vertex to 0 somehow */
4240       if (vs_prog_data->uses_firstvertex ||
4241           vs_prog_data->uses_baseinstance)
4242          emit_base_vertex_instance_bo(cmd_buffer, anv_address_add(draw, 12));
4243       if (vs_prog_data->uses_drawid)
4244          emit_draw_index(cmd_buffer, i);
4245 
4246       /* Emitting draw index or vertex index BOs may result in needing
4247        * additional VF cache flushes.
4248        */
4249       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4250 
4251       load_indirect_parameters(cmd_buffer, draw, true);
4252 
4253       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
4254          prim.IndirectParameterEnable  = true;
4255          prim.PredicateEnable          = true;
4256          prim.VertexAccessType         = RANDOM;
4257          prim.PrimitiveTopologyType    = cmd_buffer->state.gfx.primitive_topology;
4258       }
4259 
4260       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
4261 
4262       offset += stride;
4263    }
4264 
4265    mi_value_unref(&b, max);
4266 
4267    trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
4268 
4269 }
4270 
4271 void genX(CmdBeginTransformFeedbackEXT)(
4272     VkCommandBuffer                             commandBuffer,
4273     uint32_t                                    firstCounterBuffer,
4274     uint32_t                                    counterBufferCount,
4275     const VkBuffer*                             pCounterBuffers,
4276     const VkDeviceSize*                         pCounterBufferOffsets)
4277 {
4278    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4279 
4280    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4281    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4282    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4283 
4284    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4285     *
4286     *    "Ssoftware must ensure that no HW stream output operations can be in
4287     *    process or otherwise pending at the point that the MI_LOAD/STORE
4288     *    commands are processed. This will likely require a pipeline flush."
4289     */
4290    anv_add_pending_pipe_bits(cmd_buffer,
4291                              ANV_PIPE_CS_STALL_BIT,
4292                              "begin transform feedback");
4293    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4294 
4295    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
4296       /* If we have a counter buffer, this is a resume so we need to load the
4297        * value into the streamout offset register.  Otherwise, this is a begin
4298        * and we need to reset it to zero.
4299        */
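      /* SO_WRITE_OFFSET0..3 are consecutive DWORD registers, hence the
       * "GENX(SO_WRITE_OFFSET0_num) + idx * 4" addressing below.
       */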
4300       if (pCounterBuffers &&
4301           idx >= firstCounterBuffer &&
4302           idx - firstCounterBuffer < counterBufferCount &&
4303           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
4304          uint32_t cb_idx = idx - firstCounterBuffer;
4305          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4306          uint64_t offset = pCounterBufferOffsets ?
4307                            pCounterBufferOffsets[cb_idx] : 0;
4308 
4309          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
4310             lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4311             lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
4312                                                    offset);
4313          }
4314       } else {
4315          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
4316             lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4317             lri.DataDWord        = 0;
4318          }
4319       }
4320    }
4321 
4322    cmd_buffer->state.xfb_enabled = true;
4323    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4324 }
4325 
4326 void genX(CmdEndTransformFeedbackEXT)(
4327     VkCommandBuffer                             commandBuffer,
4328     uint32_t                                    firstCounterBuffer,
4329     uint32_t                                    counterBufferCount,
4330     const VkBuffer*                             pCounterBuffers,
4331     const VkDeviceSize*                         pCounterBufferOffsets)
4332 {
4333    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4334 
4335    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
4336    assert(counterBufferCount <= MAX_XFB_BUFFERS);
4337    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
4338 
4339    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
4340     *
4341     *    "Ssoftware must ensure that no HW stream output operations can be in
4342     *    process or otherwise pending at the point that the MI_LOAD/STORE
4343     *    commands are processed. This will likely require a pipeline flush."
4344     */
4345    anv_add_pending_pipe_bits(cmd_buffer,
4346                              ANV_PIPE_CS_STALL_BIT,
4347                              "end transform feedback");
4348    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4349 
4350    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
4351       unsigned idx = firstCounterBuffer + cb_idx;
4352 
4353       /* If we have a counter buffer, this is a pause, so we need to store the
4354        * current streamout write offset back into the counter buffer so that a
4355        * later resume can reload it.  Otherwise the offset is simply discarded.
4356        */
4357       if (pCounterBuffers &&
4358           cb_idx < counterBufferCount &&
4359           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
4360          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
4361          uint64_t offset = pCounterBufferOffsets ?
4362                            pCounterBufferOffsets[cb_idx] : 0;
4363 
4364          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
4365             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
4366                                                    offset);
4367             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
4368          }
4369       }
4370    }
4371 
4372    cmd_buffer->state.xfb_enabled = false;
4373    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
4374 }
4375 
4376 static void
4377 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
4378 {
4379    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
4380    struct anv_compute_pipeline *pipeline = comp_state->pipeline;
4381 
4382    assert(pipeline->cs);
4383 
4384    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
4385 
4386    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
4387 
4388    /* Apply any pending pipeline flushes we may have.  We want to apply them
4389     * now because, if any of those flushes are for things like push constants,
4390     * the GPU will read the state at weird times.
4391     */
4392    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4393 
4394    if (cmd_buffer->state.compute.pipeline_dirty) {
4395       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
4396        *
4397        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4398        *    the only bits that are changed are scoreboard related: Scoreboard
4399        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4400        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
4401        *    sufficient."
4402        */
4403       anv_add_pending_pipe_bits(cmd_buffer,
4404                               ANV_PIPE_CS_STALL_BIT,
4405                               "flush compute state");
4406       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4407 
4408       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
4409 
4410       /* The workgroup size of the pipeline affects our push constant layout
4411        * so flag push constants as dirty if we change the pipeline.
4412        */
4413       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4414    }
4415 
4416    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
4417        cmd_buffer->state.compute.pipeline_dirty) {
4418       flush_descriptor_sets(cmd_buffer,
4419                             &cmd_buffer->state.compute.base,
4420                             VK_SHADER_STAGE_COMPUTE_BIT,
4421                             &pipeline->cs, 1);
4422       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4423 
4424       uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
4425       struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
4426          .BindingTablePointer =
4427             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
4428          .SamplerStatePointer =
4429             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
4430       };
4431       GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
4432 
4433       struct anv_state state =
4434          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
4435                                       pipeline->interface_descriptor_data,
4436                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
4437                                       64);
4438 
4439       uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4440       anv_batch_emit(&cmd_buffer->batch,
4441                      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
4442          mid.InterfaceDescriptorTotalLength        = size;
4443          mid.InterfaceDescriptorDataStartAddress   = state.offset;
4444       }
4445    }
4446 
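   /* On these generations, compute push constants are delivered through the
    * CURBE (MEDIA_CURBE_LOAD) rather than the 3DSTATE_CONSTANT_* path used by
    * the 3D stages.
    */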
4447    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
4448       comp_state->push_data =
4449          anv_cmd_buffer_cs_push_constants(cmd_buffer);
4450 
4451       if (comp_state->push_data.alloc_size) {
4452          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
4453             curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
4454             curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
4455          }
4456       }
4457 
4458       cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4459    }
4460 
4461    cmd_buffer->state.compute.pipeline_dirty = false;
4462 
4463    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4464 }
4465 
4466 #if GFX_VER == 7
4467 
4468 static VkResult
4469 verify_cmd_parser(const struct anv_device *device,
4470                   int required_version,
4471                   const char *function)
4472 {
4473    if (device->physical->cmd_parser_version < required_version) {
4474       return vk_errorf(device->physical, VK_ERROR_FEATURE_NOT_PRESENT,
4475                        "cmd parser version %d is required for %s",
4476                        required_version, function);
4477    } else {
4478       return VK_SUCCESS;
4479    }
4480 }
4481 
4482 #endif
4483 
4484 static void
4485 anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
4486                                   uint32_t baseGroupX,
4487                                   uint32_t baseGroupY,
4488                                   uint32_t baseGroupZ)
4489 {
4490    if (anv_batch_has_error(&cmd_buffer->batch))
4491       return;
4492 
4493    struct anv_push_constants *push =
4494       &cmd_buffer->state.compute.base.push_constants;
4495    if (push->cs.base_work_group_id[0] != baseGroupX ||
4496        push->cs.base_work_group_id[1] != baseGroupY ||
4497        push->cs.base_work_group_id[2] != baseGroupZ) {
4498       push->cs.base_work_group_id[0] = baseGroupX;
4499       push->cs.base_work_group_id[1] = baseGroupY;
4500       push->cs.base_work_group_id[2] = baseGroupZ;
4501 
4502       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4503    }
4504 }
4505 
4506 static inline void
4507 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
4508                   const struct anv_compute_pipeline *pipeline, bool indirect,
4509                   const struct elk_cs_prog_data *prog_data,
4510                   uint32_t groupCountX, uint32_t groupCountY,
4511                   uint32_t groupCountZ)
4512 {
4513    bool predicate = (GFX_VER <= 7 && indirect) ||
4514       cmd_buffer->state.conditional_render_enabled;
4515 
4516    const struct intel_device_info *devinfo = pipeline->base.device->info;
4517    const struct intel_cs_dispatch_info dispatch =
4518       elk_cs_get_dispatch_info(devinfo, prog_data, NULL);
4519 
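   /* GPGPU_WALKER launches groupCountX x groupCountY x groupCountZ thread
    * groups, each executed with dispatch.threads hardware threads.  SIMDSize
    * is encoded as simd_size / 16 (SIMD8 = 0, SIMD16 = 1, SIMD32 = 2), and
    * the right execution mask knocks out the unused channels of the last,
    * possibly partial, thread of each group.
    */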
4520    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
4521       ggw.IndirectParameterEnable      = indirect;
4522       ggw.PredicateEnable              = predicate;
4523       ggw.SIMDSize                     = dispatch.simd_size / 16;
4524       ggw.ThreadDepthCounterMaximum    = 0;
4525       ggw.ThreadHeightCounterMaximum   = 0;
4526       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
4527       ggw.ThreadGroupIDXDimension      = groupCountX;
4528       ggw.ThreadGroupIDYDimension      = groupCountY;
4529       ggw.ThreadGroupIDZDimension      = groupCountZ;
4530       ggw.RightExecutionMask           = dispatch.right_mask;
4531       ggw.BottomExecutionMask          = 0xffffffff;
4532    }
4533 
4534    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
4535 }
4536 
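/* Thin wrapper around emit_gpgpu_walker() so the dispatch entry points below
 * do not depend on the walker variant directly.
 */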
4537 static inline void
4538 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
4539                const struct anv_compute_pipeline *pipeline, bool indirect,
4540                const struct elk_cs_prog_data *prog_data,
4541                uint32_t groupCountX, uint32_t groupCountY,
4542                uint32_t groupCountZ)
4543 {
4544    emit_gpgpu_walker(cmd_buffer, pipeline, indirect, prog_data, groupCountX,
4545                      groupCountY, groupCountZ);
4546 }
4547 
4548 void genX(CmdDispatchBase)(
4549     VkCommandBuffer                             commandBuffer,
4550     uint32_t                                    baseGroupX,
4551     uint32_t                                    baseGroupY,
4552     uint32_t                                    baseGroupZ,
4553     uint32_t                                    groupCountX,
4554     uint32_t                                    groupCountY,
4555     uint32_t                                    groupCountZ)
4556 {
4557    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4558    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4559    const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4560 
4561    anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
4562                                      baseGroupY, baseGroupZ);
4563 
4564    if (anv_batch_has_error(&cmd_buffer->batch))
4565       return;
4566 
4567    anv_measure_snapshot(cmd_buffer,
4568                         INTEL_SNAPSHOT_COMPUTE,
4569                         "compute",
4570                         groupCountX * groupCountY * groupCountZ *
4571                         prog_data->local_size[0] * prog_data->local_size[1] *
4572                         prog_data->local_size[2]);
4573 
4574    trace_intel_begin_compute(&cmd_buffer->trace);
4575 
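   /* If the shader reads gl_NumWorkGroups, stash the dispatch dimensions in a
    * small dynamic-state allocation; they are exposed to the shader through a
    * binding-table surface, which is why the descriptors are flagged dirty
    * below.
    */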
4576    if (prog_data->uses_num_work_groups) {
4577       struct anv_state state =
4578          anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
4579       uint32_t *sizes = state.map;
4580       sizes[0] = groupCountX;
4581       sizes[1] = groupCountY;
4582       sizes[2] = groupCountZ;
4583       cmd_buffer->state.compute.num_workgroups = (struct anv_address) {
4584          .bo = cmd_buffer->device->dynamic_state_pool.block_pool.bo,
4585          .offset = state.offset,
4586       };
4587 
4588       /* The num_workgroups buffer goes in the binding table */
4589       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4590    }
4591 
4592    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4593 
4594    if (cmd_buffer->state.conditional_render_enabled)
4595       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4596 
4597    emit_cs_walker(cmd_buffer, pipeline, false, prog_data, groupCountX,
4598                   groupCountY, groupCountZ);
4599 
4600    trace_intel_end_compute(&cmd_buffer->trace,
4601                            groupCountX, groupCountY, groupCountZ);
4602 }
4603 
4604 #define GPGPU_DISPATCHDIMX 0x2500
4605 #define GPGPU_DISPATCHDIMY 0x2504
4606 #define GPGPU_DISPATCHDIMZ 0x2508
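/* MMIO registers read by GPGPU_WALKER when IndirectParameterEnable is set.
 * The indirect dispatch path below loads them straight from the
 * application's buffer with the MI builder.
 */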
4607 
4608 void genX(CmdDispatchIndirect)(
4609     VkCommandBuffer                             commandBuffer,
4610     VkBuffer                                    _buffer,
4611     VkDeviceSize                                offset)
4612 {
4613    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
4614    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
4615    struct anv_compute_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4616    const struct elk_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
4617    struct anv_address addr = anv_address_add(buffer->address, offset);
4618    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
4619 
4620    anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
4621 
4622 #if GFX_VER == 7
4623    /* Linux 4.4 added command parser version 5 which allows the GPGPU
4624     * indirect dispatch registers to be written.
4625     */
4626    if (verify_cmd_parser(cmd_buffer->device, 5,
4627                          "vkCmdDispatchIndirect") != VK_SUCCESS)
4628       return;
4629 #endif
4630 
4631    anv_measure_snapshot(cmd_buffer,
4632                         INTEL_SNAPSHOT_COMPUTE,
4633                         "compute indirect",
4634                         0);
4635    trace_intel_begin_compute(&cmd_buffer->trace);
4636 
4637    if (prog_data->uses_num_work_groups) {
4638       cmd_buffer->state.compute.num_workgroups = addr;
4639 
4640       /* The num_workgroups buffer goes in the binding table */
4641       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
4642    }
4643 
4644    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
4645 
4646    struct mi_builder b;
4647    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
4648 
4649    struct mi_value size_x = mi_mem32(anv_address_add(addr, 0));
4650    struct mi_value size_y = mi_mem32(anv_address_add(addr, 4));
4651    struct mi_value size_z = mi_mem32(anv_address_add(addr, 8));
4652 
4653    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
4654    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
4655    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
4656 
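   /* The GFX_VER <= 7 block below predicates the walker off whenever any of
    * the indirect dimensions is zero; presumably these platforms do not treat
    * a zero-sized dispatch as a no-op on their own.
    */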
4657 #if GFX_VER <= 7
4658    /* predicate = (compute_dispatch_indirect_x_size == 0); */
4659    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), size_x);
4660    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
4661    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4662       mip.LoadOperation    = LOAD_LOAD;
4663       mip.CombineOperation = COMBINE_SET;
4664       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4665    }
4666 
4667    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
4668    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_y);
4669    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4670       mip.LoadOperation    = LOAD_LOAD;
4671       mip.CombineOperation = COMBINE_OR;
4672       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4673    }
4674 
4675    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
4676    mi_store(&b, mi_reg32(MI_PREDICATE_SRC0), size_z);
4677    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4678       mip.LoadOperation    = LOAD_LOAD;
4679       mip.CombineOperation = COMBINE_OR;
4680       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4681    }
4682 
4683    /* predicate = !predicate; */
4684    anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4685       mip.LoadOperation    = LOAD_LOADINV;
4686       mip.CombineOperation = COMBINE_OR;
4687       mip.CompareOperation = COMPARE_FALSE;
4688    }
4689 
4690 #if GFX_VERx10 == 75
4691    if (cmd_buffer->state.conditional_render_enabled) {
4692       /* predicate &= !(conditional_rendering_predicate == 0); */
4693       mi_store(&b, mi_reg32(MI_PREDICATE_SRC0),
4694                    mi_reg32(ANV_PREDICATE_RESULT_REG));
4695       anv_batch_emit(batch, GENX(MI_PREDICATE), mip) {
4696          mip.LoadOperation    = LOAD_LOADINV;
4697          mip.CombineOperation = COMBINE_AND;
4698          mip.CompareOperation = COMPARE_SRCS_EQUAL;
4699       }
4700    }
4701 #endif
4702 
4703 #else /* GFX_VER > 7 */
4704    if (cmd_buffer->state.conditional_render_enabled)
4705       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
4706 #endif
4707 
4708    emit_cs_walker(cmd_buffer, pipeline, true, prog_data, 0, 0, 0);
4709 
4710    trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
4711 }
4712 
4713 static void
4714 genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
4715                             uint32_t pipeline)
4716 {
4717    UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
4718 
4719    if (cmd_buffer->state.current_pipeline == pipeline)
4720       return;
4721 
4722 #if GFX_VER >= 8
4723    /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
4724     *
4725     *   Software must clear the COLOR_CALC_STATE Valid field in
4726     *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
4727     *   with Pipeline Select set to GPGPU.
4728     *
4729     * The internal hardware docs recommend the same workaround for Gfx9
4730     * hardware too.
4731     */
4732    if (pipeline == GPGPU)
4733       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
4734 #endif
4735 
4736    /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
4737     * PIPELINE_SELECT [DevBWR+]":
4738     *
4739     *   Project: DEVSNB+
4740     *
4741     *   Software must ensure all the write caches are flushed through a
4742     *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
4743     *   command to invalidate read only caches prior to programming
4744     *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
4745     *
4746     * Note the cmd_buffer_apply_pipe_flushes will split this into two
4747     * PIPE_CONTROLs.
4748     */
4749    anv_add_pending_pipe_bits(cmd_buffer,
4750                              ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
4751                              ANV_PIPE_DEPTH_CACHE_FLUSH_BIT |
4752                              ANV_PIPE_HDC_PIPELINE_FLUSH_BIT |
4753                              ANV_PIPE_CS_STALL_BIT |
4754                              ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
4755                              ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
4756                              ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
4757                              ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
4758                              ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT,
4759                              "flush and invalidate for PIPELINE_SELECT");
4760    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
4761 
4762    anv_batch_emit(&cmd_buffer->batch, GENX(PIPELINE_SELECT), ps) {
4763       ps.PipelineSelection = pipeline;
4764    }
4765 
4766    cmd_buffer->state.current_pipeline = pipeline;
4767 }
4768 
4769 void
4770 genX(flush_pipeline_select_3d)(struct anv_cmd_buffer *cmd_buffer)
4771 {
4772    genX(flush_pipeline_select)(cmd_buffer, _3D);
4773 }
4774 
4775 void
4776 genX(flush_pipeline_select_gpgpu)(struct anv_cmd_buffer *cmd_buffer)
4777 {
4778    genX(flush_pipeline_select)(cmd_buffer, GPGPU);
4779 }
4780 
4781 void
4782 genX(cmd_buffer_emit_gfx7_depth_flush)(struct anv_cmd_buffer *cmd_buffer)
4783 {
4784    if (GFX_VER >= 8)
4785       return;
4786 
4787    /* From the Haswell PRM, documentation for 3DSTATE_DEPTH_BUFFER:
4788     *
4789     *    "Restriction: Prior to changing Depth/Stencil Buffer state (i.e., any
4790     *    combination of 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS,
4791     *    3DSTATE_STENCIL_BUFFER, 3DSTATE_HIER_DEPTH_BUFFER) SW must first
4792     *    issue a pipelined depth stall (PIPE_CONTROL with Depth Stall bit
4793     *    set), followed by a pipelined depth cache flush (PIPE_CONTROL with
4794     *    Depth Flush Bit set, followed by another pipelined depth stall
4795     *    (PIPE_CONTROL with Depth Stall Bit set), unless SW can otherwise
4796     *    guarantee that the pipeline from WM onwards is already flushed (e.g.,
4797     *    via a preceding MI_FLUSH)."
4798     */
4799    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4800       pipe.DepthStallEnable = true;
4801       anv_debug_dump_pc(pipe);
4802    }
4803    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4804       pipe.DepthCacheFlushEnable = true;
4805       anv_debug_dump_pc(pipe);
4806    }
4807    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pipe) {
4808       pipe.DepthStallEnable = true;
4809       anv_debug_dump_pc(pipe);
4810    }
4811 }
4812 
4813 /* From the Skylake PRM, 3DSTATE_VERTEX_BUFFERS:
4814  *
4815  *    "The VF cache needs to be invalidated before binding and then using
4816  *    Vertex Buffers that overlap with any previously bound Vertex Buffer
4817  *    (at a 64B granularity) since the last invalidation.  A VF cache
4818  *    invalidate is performed by setting the "VF Cache Invalidation Enable"
4819  *    bit in PIPE_CONTROL."
4820  *
4821  * This is implemented by carefully tracking all vertex and index buffer
4822  * bindings and flushing if the cache ever ends up with a range in the cache
4823  * that would exceed 4 GiB.  This is implemented in three parts:
4824  *
4825  *    1. genX(cmd_buffer_set_binding_for_gfx8_vb_flush)() which must be called
4826  *       every time a 3DSTATE_VERTEX_BUFFER packet is emitted and informs the
4827  *       tracking code of the new binding.  If this new binding would cause
4828  *       the cache to have a too-large range on the next draw call, a pipeline
4829  *       stall and VF cache invalidate are added to pending_pipeline_bits.
4830  *
4831  *    2. genX(cmd_buffer_apply_pipe_flushes)() resets the cache tracking to
4832  *       empty whenever we emit a VF invalidate.
4833  *
4834  *    3. genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)() must be called
4835  *       after every 3DPRIMITIVE and copies the bound range into the dirty
4836  *       range for each used buffer.  This has to be a separate step because
4837  *       we don't always re-bind all buffers, so step 1 alone can't know
4838  *       which buffers are actually bound at draw time.
4839  */
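/* A minimal, illustrative sketch of the "flush if the range exceeds 4 GiB"
 * check described above; it is not the driver's implementation.  The
 * example_vb_range struct and example_vb_binding_needs_flush() helper are
 * hypothetical names used only for this sketch; the real logic lives in
 * anv_gfx8_9_vb_cache_range_needs_workaround() and
 * anv_merge_vb_cache_range().
 */
struct example_vb_range {
   uint64_t start;
   uint64_t end;
};

static inline bool
example_vb_binding_needs_flush(struct example_vb_range *dirty,
                               uint64_t new_start, uint32_t new_size)
{
   /* Round the new binding out to the 64B granularity the VF cache uses. */
   uint64_t start = new_start & ~63ull;
   uint64_t end = (new_start + new_size + 63ull) & ~63ull;

   /* Merge the new binding into the tracked dirty range. */
   if (dirty->end <= dirty->start) {
      dirty->start = start;
      dirty->end = end;
   } else {
      dirty->start = MIN2(dirty->start, start);
      dirty->end = MAX2(dirty->end, end);
   }

   /* If the tracked range now spans more than 32 bits of address space,
    * 64B lines could alias in the VF cache, so the caller needs a CS stall
    * plus VF cache invalidate before the next draw.
    */
   return (dirty->end - dirty->start) > (1ull << 32);
}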
4840 void
4841 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4842                                                int vb_index,
4843                                                struct anv_address vb_address,
4844                                                uint32_t vb_size)
4845 {
4846    if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4847       return;
4848 
4849    struct anv_vb_cache_range *bound, *dirty;
4850    if (vb_index == -1) {
4851       bound = &cmd_buffer->state.gfx.ib_bound_range;
4852       dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4853    } else {
4854       assert(vb_index >= 0);
4855       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4856       assert(vb_index < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4857       bound = &cmd_buffer->state.gfx.vb_bound_ranges[vb_index];
4858       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[vb_index];
4859    }
4860 
4861    if (anv_gfx8_9_vb_cache_range_needs_workaround(bound, dirty,
4862                                                   vb_address,
4863                                                   vb_size)) {
4864       anv_add_pending_pipe_bits(cmd_buffer,
4865                                 ANV_PIPE_CS_STALL_BIT |
4866                                 ANV_PIPE_VF_CACHE_INVALIDATE_BIT,
4867                                 "vb > 32b range");
4868    }
4869 }
4870 
4871 void
4872 genX(cmd_buffer_update_dirty_vbs_for_gfx8_vb_flush)(struct anv_cmd_buffer *cmd_buffer,
4873                                                     uint32_t access_type,
4874                                                     uint64_t vb_used)
4875 {
4876    if (GFX_VER < 8 || anv_use_relocations(cmd_buffer->device->physical))
4877       return;
4878 
4879    if (access_type == RANDOM) {
4880       /* We have an index buffer */
4881       struct anv_vb_cache_range *bound = &cmd_buffer->state.gfx.ib_bound_range;
4882       struct anv_vb_cache_range *dirty = &cmd_buffer->state.gfx.ib_dirty_range;
4883 
4884       anv_merge_vb_cache_range(dirty, bound);
4885    }
4886 
4887    uint64_t mask = vb_used;
4888    while (mask) {
4889       int i = u_bit_scan64(&mask);
4890       assert(i >= 0);
4891       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_bound_ranges));
4892       assert(i < ARRAY_SIZE(cmd_buffer->state.gfx.vb_dirty_ranges));
4893 
4894       struct anv_vb_cache_range *bound, *dirty;
4895       bound = &cmd_buffer->state.gfx.vb_bound_ranges[i];
4896       dirty = &cmd_buffer->state.gfx.vb_dirty_ranges[i];
4897 
4898       anv_merge_vb_cache_range(dirty, bound);
4899    }
4900 }
4901 
4902 static void
4903 cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer)
4904 {
4905    struct anv_device *device = cmd_buffer->device;
4906    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
4907 
4908    /* FIXME: Width and Height are wrong */
4909 
4910    genX(cmd_buffer_emit_gfx7_depth_flush)(cmd_buffer);
4911 
4912    uint32_t *dw = anv_batch_emit_dwords(&cmd_buffer->batch,
4913                                         device->isl_dev.ds.size / 4);
4914    if (dw == NULL)
4915       return;
4916 
4917    struct isl_view isl_view = {};
4918    struct isl_depth_stencil_hiz_emit_info info = {
4919       .view = &isl_view,
4920       .mocs = anv_mocs(device, NULL, ISL_SURF_USAGE_DEPTH_BIT),
4921    };
4922 
4923    if (gfx->depth_att.iview != NULL) {
4924       isl_view = gfx->depth_att.iview->planes[0].isl;
4925    } else if (gfx->stencil_att.iview != NULL) {
4926       isl_view = gfx->stencil_att.iview->planes[0].isl;
4927    }
4928 
4929    if (gfx->view_mask) {
4930       assert(isl_view.array_len == 0 ||
4931              isl_view.array_len >= util_last_bit(gfx->view_mask));
4932       isl_view.array_len = util_last_bit(gfx->view_mask);
4933    } else {
4934       assert(isl_view.array_len == 0 ||
4935              isl_view.array_len >= util_last_bit(gfx->layer_count));
4936       isl_view.array_len = gfx->layer_count;
4937    }
4938 
4939    if (gfx->depth_att.iview != NULL) {
4940       const struct anv_image_view *iview = gfx->depth_att.iview;
4941       const struct anv_image *image = iview->image;
4942 
4943       const uint32_t depth_plane =
4944          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_DEPTH_BIT);
4945       const struct anv_surface *depth_surface =
4946          &image->planes[depth_plane].primary_surface;
4947       const struct anv_address depth_address =
4948          anv_image_address(image, &depth_surface->memory_range);
4949 
4950       info.depth_surf = &depth_surface->isl;
4951 
4952       info.depth_address =
4953          anv_batch_emit_reloc(&cmd_buffer->batch,
4954                               dw + device->isl_dev.ds.depth_offset / 4,
4955                               depth_address.bo, depth_address.offset);
4956       info.mocs =
4957          anv_mocs(device, depth_address.bo, ISL_SURF_USAGE_DEPTH_BIT);
4958 
4959       info.hiz_usage = gfx->depth_att.aux_usage;
4960       if (info.hiz_usage != ISL_AUX_USAGE_NONE) {
4961          assert(isl_aux_usage_has_hiz(info.hiz_usage));
4962 
4963          const struct anv_surface *hiz_surface =
4964             &image->planes[depth_plane].aux_surface;
4965          const struct anv_address hiz_address =
4966             anv_image_address(image, &hiz_surface->memory_range);
4967 
4968          info.hiz_surf = &hiz_surface->isl;
4969 
4970          info.hiz_address =
4971             anv_batch_emit_reloc(&cmd_buffer->batch,
4972                                  dw + device->isl_dev.ds.hiz_offset / 4,
4973                                  hiz_address.bo, hiz_address.offset);
4974 
4975          info.depth_clear_value = ANV_HZ_FC_VAL;
4976       }
4977    }
4978 
4979    if (gfx->stencil_att.iview != NULL) {
4980       const struct anv_image_view *iview = gfx->stencil_att.iview;
4981       const struct anv_image *image = iview->image;
4982 
4983       const uint32_t stencil_plane =
4984          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
4985       const struct anv_surface *stencil_surface =
4986          &image->planes[stencil_plane].primary_surface;
4987       const struct anv_address stencil_address =
4988          anv_image_address(image, &stencil_surface->memory_range);
4989 
4990       info.stencil_surf = &stencil_surface->isl;
4991 
4992       info.stencil_aux_usage = image->planes[stencil_plane].aux_usage;
4993       info.stencil_address =
4994          anv_batch_emit_reloc(&cmd_buffer->batch,
4995                               dw + device->isl_dev.ds.stencil_offset / 4,
4996                               stencil_address.bo, stencil_address.offset);
4997       info.mocs =
4998          anv_mocs(device, stencil_address.bo, ISL_SURF_USAGE_STENCIL_BIT);
4999    }
5000 
5001    isl_emit_depth_stencil_hiz_s(&device->isl_dev, dw, &info);
5002 
5003    cmd_buffer->state.hiz_enabled = isl_aux_usage_has_hiz(info.hiz_usage);
5004 }
5005 
5006 static VkImageLayout
5007 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
5008 {
5009    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
5010       vk_find_struct_const(att->pNext,
5011                            RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
5012    if (layout_info != NULL)
5013       return layout_info->initialLayout;
5014 
5015    return att->imageLayout;
5016 }
5017 
5018 void genX(CmdBeginRendering)(
5019     VkCommandBuffer                             commandBuffer,
5020     const VkRenderingInfo*                      pRenderingInfo)
5021 {
5022    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5023    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5024    VkResult result;
5025 
5026    if (!is_render_queue_cmd_buffer(cmd_buffer)) {
5027       assert(!"Trying to start a render pass on non-render queue!");
5028       anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_UNKNOWN);
5029       return;
5030    }
5031 
5032    anv_measure_beginrenderpass(cmd_buffer);
5033    trace_intel_begin_render_pass(&cmd_buffer->trace);
5034 
5035    gfx->rendering_flags = pRenderingInfo->flags;
5036    gfx->render_area = pRenderingInfo->renderArea;
5037    gfx->view_mask = pRenderingInfo->viewMask;
5038    gfx->layer_count = pRenderingInfo->layerCount;
5039    gfx->samples = 0;
5040 
5041    const bool is_multiview = gfx->view_mask != 0;
5042    const VkRect2D render_area = gfx->render_area;
5043    const uint32_t layers =
5044       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5045 
5046    /* The framebuffer size is at least large enough to contain the render
5047     * area.  Because a zero renderArea is possible, we MAX with 1.
5048     */
5049    struct isl_extent3d fb_size = {
5050       .w = MAX2(1, render_area.offset.x + render_area.extent.width),
5051       .h = MAX2(1, render_area.offset.y + render_area.extent.height),
5052       .d = layers,
5053    };
5054 
5055    const uint32_t color_att_count = pRenderingInfo->colorAttachmentCount;
5056    result = anv_cmd_buffer_init_attachments(cmd_buffer, color_att_count);
5057    if (result != VK_SUCCESS)
5058       return;
5059 
5060    genX(flush_pipeline_select_3d)(cmd_buffer);
5061 
5062    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5063       if (pRenderingInfo->pColorAttachments[i].imageView == VK_NULL_HANDLE)
5064          continue;
5065 
5066       const VkRenderingAttachmentInfo *att =
5067          &pRenderingInfo->pColorAttachments[i];
5068       ANV_FROM_HANDLE(anv_image_view, iview, att->imageView);
5069       const VkImageLayout initial_layout = attachment_initial_layout(att);
5070 
5071       assert(render_area.offset.x + render_area.extent.width <=
5072              iview->vk.extent.width);
5073       assert(render_area.offset.y + render_area.extent.height <=
5074              iview->vk.extent.height);
5075       assert(layers <= iview->vk.layer_count);
5076 
5077       fb_size.w = MAX2(fb_size.w, iview->vk.extent.width);
5078       fb_size.h = MAX2(fb_size.h, iview->vk.extent.height);
5079 
5080       assert(gfx->samples == 0 || gfx->samples == iview->vk.image->samples);
5081       gfx->samples |= iview->vk.image->samples;
5082 
5083       enum isl_aux_usage aux_usage =
5084          anv_layout_to_aux_usage(cmd_buffer->device->info,
5085                                  iview->image,
5086                                  VK_IMAGE_ASPECT_COLOR_BIT,
5087                                  VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
5088                                  att->imageLayout);
5089 
5090       union isl_color_value fast_clear_color = { .u32 = { 0, } };
5091 
5092       if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5093           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) {
5094          const union isl_color_value clear_color =
5095             vk_to_isl_color_with_format(att->clearValue.color,
5096                                         iview->planes[0].isl.format);
5097 
5098          /* We only support fast-clears on the first layer */
5099          const bool fast_clear =
5100             (!is_multiview || (gfx->view_mask & 1)) &&
5101             anv_can_fast_clear_color_view(cmd_buffer->device, iview,
5102                                           att->imageLayout, clear_color,
5103                                           layers, render_area);
5104 
5105          if (att->imageLayout != initial_layout) {
5106             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5107                    render_area.extent.width == iview->vk.extent.width &&
5108                    render_area.extent.height == iview->vk.extent.height);
5109             if (is_multiview) {
5110                u_foreach_bit(view, gfx->view_mask) {
5111                   transition_color_buffer(cmd_buffer, iview->image,
5112                                           VK_IMAGE_ASPECT_COLOR_BIT,
5113                                           iview->vk.base_mip_level, 1,
5114                                           iview->vk.base_array_layer + view,
5115                                           1, /* layer_count */
5116                                           initial_layout, att->imageLayout,
5117                                           VK_QUEUE_FAMILY_IGNORED,
5118                                           VK_QUEUE_FAMILY_IGNORED,
5119                                           fast_clear);
5120                }
5121             } else {
5122                transition_color_buffer(cmd_buffer, iview->image,
5123                                        VK_IMAGE_ASPECT_COLOR_BIT,
5124                                        iview->vk.base_mip_level, 1,
5125                                        iview->vk.base_array_layer,
5126                                        gfx->layer_count,
5127                                        initial_layout, att->imageLayout,
5128                                        VK_QUEUE_FAMILY_IGNORED,
5129                                        VK_QUEUE_FAMILY_IGNORED,
5130                                        fast_clear);
5131             }
5132          }
5133 
5134          uint32_t clear_view_mask = pRenderingInfo->viewMask;
5135          uint32_t base_clear_layer = iview->vk.base_array_layer;
5136          uint32_t clear_layer_count = gfx->layer_count;
5137          if (fast_clear) {
5138             /* We only support fast-clears on the first layer */
5139             assert(iview->vk.base_mip_level == 0 &&
5140                    iview->vk.base_array_layer == 0);
5141 
5142             fast_clear_color = clear_color;
5143 
5144             if (iview->image->vk.samples == 1) {
5145                anv_image_ccs_op(cmd_buffer, iview->image,
5146                                 iview->planes[0].isl.format,
5147                                 iview->planes[0].isl.swizzle,
5148                                 VK_IMAGE_ASPECT_COLOR_BIT,
5149                                 0, 0, 1, ISL_AUX_OP_FAST_CLEAR,
5150                                 &fast_clear_color,
5151                                 false);
5152             } else {
5153                anv_image_mcs_op(cmd_buffer, iview->image,
5154                                 iview->planes[0].isl.format,
5155                                 iview->planes[0].isl.swizzle,
5156                                 VK_IMAGE_ASPECT_COLOR_BIT,
5157                                 0, 1, ISL_AUX_OP_FAST_CLEAR,
5158                                 &fast_clear_color,
5159                                 false);
5160             }
5161             clear_view_mask &= ~1u;
5162             base_clear_layer++;
5163             clear_layer_count--;
5164 
5165             if (isl_color_value_is_zero(clear_color,
5166                                         iview->planes[0].isl.format)) {
5167                /* This image has the auxiliary buffer enabled. We can mark the
5168                 * subresource as not needing a resolve because the clear color
5169                 * will match what's in every RENDER_SURFACE_STATE object when
5170                 * it's being used for sampling.
5171                 */
5172                set_image_fast_clear_state(cmd_buffer, iview->image,
5173                                           VK_IMAGE_ASPECT_COLOR_BIT,
5174                                           ANV_FAST_CLEAR_DEFAULT_VALUE);
5175             } else {
5176                set_image_fast_clear_state(cmd_buffer, iview->image,
5177                                           VK_IMAGE_ASPECT_COLOR_BIT,
5178                                           ANV_FAST_CLEAR_ANY);
5179             }
5180          }
5181 
5182          if (is_multiview) {
5183             u_foreach_bit(view, clear_view_mask) {
5184                anv_image_clear_color(cmd_buffer, iview->image,
5185                                      VK_IMAGE_ASPECT_COLOR_BIT,
5186                                      aux_usage,
5187                                      iview->planes[0].isl.format,
5188                                      iview->planes[0].isl.swizzle,
5189                                      iview->vk.base_mip_level,
5190                                      iview->vk.base_array_layer + view, 1,
5191                                      render_area, clear_color);
5192             }
5193          } else {
5194             anv_image_clear_color(cmd_buffer, iview->image,
5195                                   VK_IMAGE_ASPECT_COLOR_BIT,
5196                                   aux_usage,
5197                                   iview->planes[0].isl.format,
5198                                   iview->planes[0].isl.swizzle,
5199                                   iview->vk.base_mip_level,
5200                                   base_clear_layer, clear_layer_count,
5201                                   render_area, clear_color);
5202          }
5203       } else {
5204          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5205          assert(att->imageLayout == initial_layout);
5206       }
5207 
5208       gfx->color_att[i].vk_format = iview->vk.format;
5209       gfx->color_att[i].iview = iview;
5210       gfx->color_att[i].layout = att->imageLayout;
5211       gfx->color_att[i].aux_usage = aux_usage;
5212 
5213       struct isl_view isl_view = iview->planes[0].isl;
5214       if (pRenderingInfo->viewMask) {
5215          assert(isl_view.array_len >= util_last_bit(pRenderingInfo->viewMask));
5216          isl_view.array_len = util_last_bit(pRenderingInfo->viewMask);
5217       } else {
5218          assert(isl_view.array_len >= pRenderingInfo->layerCount);
5219          isl_view.array_len = pRenderingInfo->layerCount;
5220       }
5221 
5222       anv_image_fill_surface_state(cmd_buffer->device,
5223                                    iview->image,
5224                                    VK_IMAGE_ASPECT_COLOR_BIT,
5225                                    &isl_view,
5226                                    ISL_SURF_USAGE_RENDER_TARGET_BIT,
5227                                    aux_usage, &fast_clear_color,
5228                                    0, /* anv_image_view_state_flags */
5229                                    &gfx->color_att[i].surface_state,
5230                                    NULL);
5231 
5232       add_surface_state_relocs(cmd_buffer, gfx->color_att[i].surface_state);
5233 
5234       if ((att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD ||
5235            (gfx->rendering_flags & VK_RENDERING_RESUMING_BIT)) &&
5236           iview->image->planes[0].aux_usage != ISL_AUX_USAGE_NONE &&
5237           iview->planes[0].isl.base_level == 0 &&
5238           iview->planes[0].isl.base_array_layer == 0) {
5239          genX(copy_fast_clear_dwords)(cmd_buffer,
5240                                       gfx->color_att[i].surface_state.state,
5241                                       iview->image,
5242                                       VK_IMAGE_ASPECT_COLOR_BIT,
5243                                       false /* copy to ss */);
5244       }
5245 
5246       if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
5247          gfx->color_att[i].resolve_mode = att->resolveMode;
5248          gfx->color_att[i].resolve_iview =
5249             anv_image_view_from_handle(att->resolveImageView);
5250          gfx->color_att[i].resolve_layout = att->resolveImageLayout;
5251       }
5252    }
5253 
5254    anv_cmd_graphic_state_update_has_uint_rt(gfx);
5255 
5256    const struct anv_image_view *ds_iview = NULL;
5257    const VkRenderingAttachmentInfo *d_att = pRenderingInfo->pDepthAttachment;
5258    const VkRenderingAttachmentInfo *s_att = pRenderingInfo->pStencilAttachment;
5259    if ((d_att != NULL && d_att->imageView != VK_NULL_HANDLE) ||
5260        (s_att != NULL && s_att->imageView != VK_NULL_HANDLE)) {
5261       const struct anv_image_view *d_iview = NULL, *s_iview = NULL;
5262       VkImageLayout depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5263       VkImageLayout stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5264       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5265       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
5266       enum isl_aux_usage depth_aux_usage = ISL_AUX_USAGE_NONE;
5267       enum isl_aux_usage stencil_aux_usage = ISL_AUX_USAGE_NONE;
5268       float depth_clear_value = 0;
5269       uint32_t stencil_clear_value = 0;
5270 
5271       if (d_att != NULL && d_att->imageView != VK_NULL_HANDLE) {
5272          d_iview = anv_image_view_from_handle(d_att->imageView);
5273          initial_depth_layout = attachment_initial_layout(d_att);
5274          depth_layout = d_att->imageLayout;
5275          depth_aux_usage =
5276             anv_layout_to_aux_usage(cmd_buffer->device->info,
5277                                     d_iview->image,
5278                                     VK_IMAGE_ASPECT_DEPTH_BIT,
5279                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5280                                     depth_layout);
5281          depth_clear_value = d_att->clearValue.depthStencil.depth;
5282       }
5283 
5284       if (s_att != NULL && s_att->imageView != VK_NULL_HANDLE) {
5285          s_iview = anv_image_view_from_handle(s_att->imageView);
5286          initial_stencil_layout = attachment_initial_layout(s_att);
5287          stencil_layout = s_att->imageLayout;
5288          stencil_aux_usage =
5289             anv_layout_to_aux_usage(cmd_buffer->device->info,
5290                                     s_iview->image,
5291                                     VK_IMAGE_ASPECT_STENCIL_BIT,
5292                                     VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
5293                                     stencil_layout);
5294          stencil_clear_value = s_att->clearValue.depthStencil.stencil;
5295       }
5296 
5297       assert(s_iview == NULL || d_iview == NULL || s_iview == d_iview);
5298       ds_iview = d_iview != NULL ? d_iview : s_iview;
5299       assert(ds_iview != NULL);
5300 
5301       assert(render_area.offset.x + render_area.extent.width <=
5302              ds_iview->vk.extent.width);
5303       assert(render_area.offset.y + render_area.extent.height <=
5304              ds_iview->vk.extent.height);
5305       assert(layers <= ds_iview->vk.layer_count);
5306 
5307       fb_size.w = MAX2(fb_size.w, ds_iview->vk.extent.width);
5308       fb_size.h = MAX2(fb_size.h, ds_iview->vk.extent.height);
5309 
5310       assert(gfx->samples == 0 || gfx->samples == ds_iview->vk.image->samples);
5311       gfx->samples |= ds_iview->vk.image->samples;
5312 
5313       VkImageAspectFlags clear_aspects = 0;
5314       if (d_iview != NULL && d_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5315           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5316          clear_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
5317       if (s_iview != NULL && s_att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR &&
5318           !(gfx->rendering_flags & VK_RENDERING_RESUMING_BIT))
5319          clear_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
5320 
5321       if (clear_aspects != 0) {
5322          const bool hiz_clear =
5323             anv_can_hiz_clear_ds_view(cmd_buffer->device, d_iview,
5324                                       depth_layout, clear_aspects,
5325                                       depth_clear_value,
5326                                       render_area);
5327 
5328          if (depth_layout != initial_depth_layout) {
5329             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5330                    render_area.extent.width == d_iview->vk.extent.width &&
5331                    render_area.extent.height == d_iview->vk.extent.height);
5332 
5333             if (is_multiview) {
5334                u_foreach_bit(view, gfx->view_mask) {
5335                   transition_depth_buffer(cmd_buffer, d_iview->image,
5336                                           d_iview->vk.base_array_layer + view,
5337                                           1 /* layer_count */,
5338                                           initial_depth_layout, depth_layout,
5339                                           hiz_clear);
5340                }
5341             } else {
5342                transition_depth_buffer(cmd_buffer, d_iview->image,
5343                                        d_iview->vk.base_array_layer,
5344                                        gfx->layer_count,
5345                                        initial_depth_layout, depth_layout,
5346                                        hiz_clear);
5347             }
5348          }
5349 
5350          if (stencil_layout != initial_stencil_layout) {
5351             assert(render_area.offset.x == 0 && render_area.offset.y == 0 &&
5352                    render_area.extent.width == s_iview->vk.extent.width &&
5353                    render_area.extent.height == s_iview->vk.extent.height);
5354 
5355             if (is_multiview) {
5356                u_foreach_bit(view, gfx->view_mask) {
5357                   transition_stencil_buffer(cmd_buffer, s_iview->image,
5358                                             s_iview->vk.base_mip_level, 1,
5359                                             s_iview->vk.base_array_layer + view,
5360                                             1 /* layer_count */,
5361                                             initial_stencil_layout,
5362                                             stencil_layout,
5363                                             hiz_clear);
5364                }
5365             } else {
5366                transition_stencil_buffer(cmd_buffer, s_iview->image,
5367                                          s_iview->vk.base_mip_level, 1,
5368                                          s_iview->vk.base_array_layer,
5369                                          gfx->layer_count,
5370                                          initial_stencil_layout,
5371                                          stencil_layout,
5372                                          hiz_clear);
5373             }
5374          }
5375 
5376          if (is_multiview) {
5377             uint32_t clear_view_mask = pRenderingInfo->viewMask;
5378             while (clear_view_mask) {
5379                int view = u_bit_scan(&clear_view_mask);
5380 
5381                uint32_t level = ds_iview->vk.base_mip_level;
5382                uint32_t layer = ds_iview->vk.base_array_layer + view;
5383 
5384                if (hiz_clear) {
5385                   anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5386                                       clear_aspects,
5387                                       level, layer, 1,
5388                                       render_area,
5389                                       stencil_clear_value);
5390                } else {
5391                   anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5392                                                 clear_aspects,
5393                                                 depth_aux_usage,
5394                                                 level, layer, 1,
5395                                                 render_area,
5396                                                 depth_clear_value,
5397                                                 stencil_clear_value);
5398                }
5399             }
5400          } else {
5401             uint32_t level = ds_iview->vk.base_mip_level;
5402             uint32_t base_layer = ds_iview->vk.base_array_layer;
5403             uint32_t layer_count = gfx->layer_count;
5404 
5405             if (hiz_clear) {
5406                anv_image_hiz_clear(cmd_buffer, ds_iview->image,
5407                                    clear_aspects,
5408                                    level, base_layer, layer_count,
5409                                    render_area,
5410                                    stencil_clear_value);
5411             } else {
5412                anv_image_clear_depth_stencil(cmd_buffer, ds_iview->image,
5413                                              clear_aspects,
5414                                              depth_aux_usage,
5415                                              level, base_layer, layer_count,
5416                                              render_area,
5417                                              depth_clear_value,
5418                                              stencil_clear_value);
5419             }
5420          }
5421       } else {
5422          /* If not LOAD_OP_CLEAR, we shouldn't have a layout transition. */
5423          assert(depth_layout == initial_depth_layout);
5424          assert(stencil_layout == initial_stencil_layout);
5425       }
5426 
5427       if (d_iview != NULL) {
5428          gfx->depth_att.vk_format = d_iview->vk.format;
5429          gfx->depth_att.iview = d_iview;
5430          gfx->depth_att.layout = depth_layout;
5431          gfx->depth_att.aux_usage = depth_aux_usage;
5432          if (d_att != NULL && d_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5433             assert(d_att->resolveImageView != VK_NULL_HANDLE);
5434             gfx->depth_att.resolve_mode = d_att->resolveMode;
5435             gfx->depth_att.resolve_iview =
5436                anv_image_view_from_handle(d_att->resolveImageView);
5437             gfx->depth_att.resolve_layout = d_att->resolveImageLayout;
5438          }
5439       }
5440 
5441       if (s_iview != NULL) {
5442          gfx->stencil_att.vk_format = s_iview->vk.format;
5443          gfx->stencil_att.iview = s_iview;
5444          gfx->stencil_att.layout = stencil_layout;
5445          gfx->stencil_att.aux_usage = stencil_aux_usage;
5446          if (s_att->resolveMode != VK_RESOLVE_MODE_NONE) {
5447             assert(s_att->resolveImageView != VK_NULL_HANDLE);
5448             gfx->stencil_att.resolve_mode = s_att->resolveMode;
5449             gfx->stencil_att.resolve_iview =
5450                anv_image_view_from_handle(s_att->resolveImageView);
5451             gfx->stencil_att.resolve_layout = s_att->resolveImageLayout;
5452          }
5453       }
5454    }
5455 
5456    /* Finally, now that we know the right size, set up the null surface */
5457    assert(util_bitcount(gfx->samples) <= 1);
5458    isl_null_fill_state(&cmd_buffer->device->isl_dev,
5459                        gfx->null_surface_state.map,
5460                        .size = fb_size);
5461 
5462    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5463       if (pRenderingInfo->pColorAttachments[i].imageView != VK_NULL_HANDLE)
5464          continue;
5465 
5466       isl_null_fill_state(&cmd_buffer->device->isl_dev,
5467                           gfx->color_att[i].surface_state.state.map,
5468                           .size = fb_size);
5469    }
5470 
5471    /****** We can now start emitting code to begin the render pass ******/
5472 
5473    gfx->dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
5474 
5475    /* Our implementation of VK_KHR_multiview uses instancing to draw the
5476     * different views.  If the client asks for instancing, we need to use the
5477     * Instance Data Step Rate to ensure that we repeat the client's
5478     * per-instance data once for each view.  Since this bit is in
5479     * VERTEX_BUFFER_STATE on gfx7, we need to dirty vertex buffers at the top
5480     * of each subpass.
5481     */
5482    if (GFX_VER == 7)
5483       gfx->vb_dirty |= ~0;
5484 
5485    /* It is possible to start a render pass with an old pipeline.  Because the
5486     * render pass and subpass index are both baked into the pipeline, this is
5487     * highly unlikely.  In order to do so, it requires that you have a render
5488     * pass with a single subpass and that you use that render pass twice
5489     * back-to-back and use the same pipeline at the start of the second render
5490     * pass as at the end of the first.  In order to avoid unpredictable issues
5491     * with this edge case, we just dirty the pipeline at the start of every
5492     * subpass.
5493     */
5494    gfx->dirty |= ANV_CMD_DIRTY_PIPELINE;
5495 
5496    cmd_buffer_emit_depth_stencil(cmd_buffer);
5497 }
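/* Purely illustrative, application-side sketch of the input that drives
 * genX(CmdBeginRendering) above; it is not driver code and only exists to
 * show the shape of the input.  The command buffer, image view, and extent
 * are assumed to exist already.  A non-zero viewMask selects the multiview
 * branches, loadOp = CLEAR triggers the clear logic, and a non-NONE
 * resolveMode makes genX(CmdEndRendering) emit an MSAA resolve.
 */
static inline void
example_begin_rendering(VkCommandBuffer cmd, VkImageView color_view,
                        VkExtent2D extent)
{
   const VkRenderingAttachmentInfo color_att = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = color_view,
      .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
      .clearValue = { .color = { .float32 = { 0.0f, 0.0f, 0.0f, 1.0f } } },
   };
   const VkRenderingInfo info = {
      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
      .renderArea = { .offset = { 0, 0 }, .extent = extent },
      .layerCount = 1,
      .viewMask = 0,        /* non-zero would exercise the multiview paths */
      .colorAttachmentCount = 1,
      .pColorAttachments = &color_att,
   };
   vkCmdBeginRendering(cmd, &info);
}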
5498 
5499 static void
5500 cmd_buffer_mark_attachment_written(struct anv_cmd_buffer *cmd_buffer,
5501                                    struct anv_attachment *att,
5502                                    VkImageAspectFlagBits aspect)
5503 {
5504    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5505    const struct anv_image_view *iview = att->iview;
5506 
5507    if (iview == NULL)
5508       return;
5509 
5510    if (gfx->view_mask == 0) {
5511       genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5512                                           aspect, att->aux_usage,
5513                                           iview->planes[0].isl.base_level,
5514                                           iview->planes[0].isl.base_array_layer,
5515                                           gfx->layer_count);
5516    } else {
5517       uint32_t res_view_mask = gfx->view_mask;
5518       while (res_view_mask) {
5519          int i = u_bit_scan(&res_view_mask);
5520 
5521          const uint32_t level = iview->planes[0].isl.base_level;
5522          const uint32_t layer = iview->planes[0].isl.base_array_layer + i;
5523 
5524          genX(cmd_buffer_mark_image_written)(cmd_buffer, iview->image,
5525                                              aspect, att->aux_usage,
5526                                              level, layer, 1);
5527       }
5528    }
5529 }
5530 
5531 static enum blorp_filter
5532 vk_to_blorp_resolve_mode(VkResolveModeFlagBits vk_mode)
5533 {
5534    switch (vk_mode) {
5535    case VK_RESOLVE_MODE_SAMPLE_ZERO_BIT:
5536       return BLORP_FILTER_SAMPLE_0;
5537    case VK_RESOLVE_MODE_AVERAGE_BIT:
5538       return BLORP_FILTER_AVERAGE;
5539    case VK_RESOLVE_MODE_MIN_BIT:
5540       return BLORP_FILTER_MIN_SAMPLE;
5541    case VK_RESOLVE_MODE_MAX_BIT:
5542       return BLORP_FILTER_MAX_SAMPLE;
5543    default:
5544       return BLORP_FILTER_NONE;
5545    }
5546 }
5547 
5548 static void
5549 cmd_buffer_resolve_msaa_attachment(struct anv_cmd_buffer *cmd_buffer,
5550                                    const struct anv_attachment *att,
5551                                    VkImageLayout layout,
5552                                    VkImageAspectFlagBits aspect)
5553 {
5554    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5555    const struct anv_image_view *src_iview = att->iview;
5556    const struct anv_image_view *dst_iview = att->resolve_iview;
5557 
5558    enum isl_aux_usage src_aux_usage =
5559       anv_layout_to_aux_usage(cmd_buffer->device->info,
5560                               src_iview->image, aspect,
5561                               VK_IMAGE_USAGE_TRANSFER_SRC_BIT,
5562                               layout);
5563 
5564    enum isl_aux_usage dst_aux_usage =
5565       anv_layout_to_aux_usage(cmd_buffer->device->info,
5566                               dst_iview->image, aspect,
5567                               VK_IMAGE_USAGE_TRANSFER_DST_BIT,
5568                               att->resolve_layout);
5569 
5570    enum blorp_filter filter = vk_to_blorp_resolve_mode(att->resolve_mode);
5571 
5572    const VkRect2D render_area = gfx->render_area;
5573    if (gfx->view_mask == 0) {
5574       anv_image_msaa_resolve(cmd_buffer,
5575                              src_iview->image, src_aux_usage,
5576                              src_iview->planes[0].isl.base_level,
5577                              src_iview->planes[0].isl.base_array_layer,
5578                              dst_iview->image, dst_aux_usage,
5579                              dst_iview->planes[0].isl.base_level,
5580                              dst_iview->planes[0].isl.base_array_layer,
5581                              aspect,
5582                              render_area.offset.x, render_area.offset.y,
5583                              render_area.offset.x, render_area.offset.y,
5584                              render_area.extent.width,
5585                              render_area.extent.height,
5586                              gfx->layer_count, filter);
5587    } else {
5588       uint32_t res_view_mask = gfx->view_mask;
5589       while (res_view_mask) {
5590          int i = u_bit_scan(&res_view_mask);
5591 
5592          anv_image_msaa_resolve(cmd_buffer,
5593                                 src_iview->image, src_aux_usage,
5594                                 src_iview->planes[0].isl.base_level,
5595                                 src_iview->planes[0].isl.base_array_layer + i,
5596                                 dst_iview->image, dst_aux_usage,
5597                                 dst_iview->planes[0].isl.base_level,
5598                                 dst_iview->planes[0].isl.base_array_layer + i,
5599                                 aspect,
5600                                 render_area.offset.x, render_area.offset.y,
5601                                 render_area.offset.x, render_area.offset.y,
5602                                 render_area.extent.width,
5603                                 render_area.extent.height,
5604                                 1, filter);
5605       }
5606    }
5607 }
5608 
5609 void genX(CmdEndRendering)(
5610     VkCommandBuffer                             commandBuffer)
5611 {
5612    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5613    struct anv_cmd_graphics_state *gfx = &cmd_buffer->state.gfx;
5614 
5615    if (anv_batch_has_error(&cmd_buffer->batch))
5616       return;
5617 
5618    const bool is_multiview = gfx->view_mask != 0;
5619    const uint32_t layers =
5620       is_multiview ? util_last_bit(gfx->view_mask) : gfx->layer_count;
5621 
5622    bool has_color_resolve = false;
5623    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5624       cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->color_att[i],
5625                                          VK_IMAGE_ASPECT_COLOR_BIT);
5626 
5627       /* Stash this off for later */
5628       if (gfx->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE &&
5629           !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5630          has_color_resolve = true;
5631    }
5632 
5633    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->depth_att,
5634                                       VK_IMAGE_ASPECT_DEPTH_BIT);
5635 
5636    cmd_buffer_mark_attachment_written(cmd_buffer, &gfx->stencil_att,
5637                                       VK_IMAGE_ASPECT_STENCIL_BIT);
5638 
5639    if (has_color_resolve) {
5640       /* We are about to do some MSAA resolves.  We need to flush so that the
5641        * result of writes to the MSAA color attachments show up in the sampler
5642        * when we blit to the single-sampled resolve target.
5643        */
5644       anv_add_pending_pipe_bits(cmd_buffer,
5645                                 ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5646                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
5647                                 "MSAA resolve");
5648    }
5649 
5650    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
5651        gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
5652       /* We are about to do some MSAA resolves.  We need to flush so that the
5653        * result of writes to the MSAA depth attachments show up in the sampler
5654        * when we blit to the single-sampled resolve target.
5655        */
5656       anv_add_pending_pipe_bits(cmd_buffer,
5657                               ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
5658                               ANV_PIPE_DEPTH_CACHE_FLUSH_BIT,
5659                               "MSAA resolve");
5660    }
5661 
5662    for (uint32_t i = 0; i < gfx->color_att_count; i++) {
5663       const struct anv_attachment *att = &gfx->color_att[i];
5664       if (att->resolve_mode == VK_RESOLVE_MODE_NONE ||
5665           (gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT))
5666          continue;
5667 
5668       cmd_buffer_resolve_msaa_attachment(cmd_buffer, att, att->layout,
5669                                          VK_IMAGE_ASPECT_COLOR_BIT);
5670    }
5671 
5672    if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5673        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5674       const struct anv_image_view *src_iview = gfx->depth_att.iview;
5675 
5676       /* MSAA resolves sample from the source attachment.  Transition the
5677        * depth attachment first to get rid of any HiZ that we may not be
5678        * able to handle.
5679        */
5680       transition_depth_buffer(cmd_buffer, src_iview->image,
5681                               src_iview->planes[0].isl.base_array_layer,
5682                               layers,
5683                               gfx->depth_att.layout,
5684                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5685                               false /* will_full_fast_clear */);
5686 
5687       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->depth_att,
5688                                          VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5689                                          VK_IMAGE_ASPECT_DEPTH_BIT);
5690 
5691       /* Transition the source back to the original layout.  This seems a bit
5692        * inefficient but, since HiZ resolves aren't destructive, going from
5693        * less HiZ to more is generally a no-op.
5694        */
5695       transition_depth_buffer(cmd_buffer, src_iview->image,
5696                               src_iview->planes[0].isl.base_array_layer,
5697                               layers,
5698                               VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
5699                               gfx->depth_att.layout,
5700                               false /* will_full_fast_clear */);
5701    }
5702 
5703    if (gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE &&
5704        !(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT)) {
5705       cmd_buffer_resolve_msaa_attachment(cmd_buffer, &gfx->stencil_att,
5706                                          gfx->stencil_att.layout,
5707                                          VK_IMAGE_ASPECT_STENCIL_BIT);
5708    }
5709 
5710 #if GFX_VER == 7
5711    /* On gfx7, we have to store a texturable version of the stencil buffer in
5712     * a shadow copy whenever VK_IMAGE_USAGE_SAMPLED_BIT is set and copy back
5713     * and forth at strategic points. Stencil writes are only allowed in the
5714     * following layouts:
5715     *
5716     *  - VK_IMAGE_LAYOUT_GENERAL
5717     *  - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL
5718     *  - VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL
5719     *  - VK_IMAGE_LAYOUT_DEPTH_READ_ONLY_STENCIL_ATTACHMENT_OPTIMAL
5720     *  - VK_IMAGE_LAYOUT_STENCIL_ATTACHMENT_OPTIMAL
5721     *  - VK_IMAGE_LAYOUT_ATTACHMENT_OPTIMAL
5722     *  - VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT
5723     *
5724     * For general, we have no nice opportunity to transition so we do the copy
5725     * to the shadow unconditionally at the end of the subpass. For transfer
5726     * destinations, we can update it as part of the transfer op. For the other
5727     * layouts, we delay the copy until a transition into some other layout.
5728     */
5729    if (gfx->stencil_att.iview != NULL) {
5730       const struct anv_image_view *iview = gfx->stencil_att.iview;
5731       const struct anv_image *image = iview->image;
5732       const uint32_t plane =
5733          anv_image_aspect_to_plane(image, VK_IMAGE_ASPECT_STENCIL_BIT);
5734 
5735       if (anv_surface_is_valid(&image->planes[plane].shadow_surface) &&
5736           (gfx->stencil_att.layout == VK_IMAGE_LAYOUT_GENERAL ||
5737            gfx->stencil_att.layout == VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT)) {
5738          anv_image_copy_to_shadow(cmd_buffer, image,
5739                                   VK_IMAGE_ASPECT_STENCIL_BIT,
5740                                   iview->planes[plane].isl.base_level, 1,
5741                                   iview->planes[plane].isl.base_array_layer,
5742                                   layers);
5743       }
5744    }
5745 #endif
5746 
5747    anv_cmd_buffer_reset_rendering(cmd_buffer);
5748 }
5749 
5750 void
5751 genX(cmd_emit_conditional_render_predicate)(struct anv_cmd_buffer *cmd_buffer)
5752 {
5753 #if GFX_VERx10 >= 75
5754    struct mi_builder b;
5755    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5756 
5757    mi_store(&b, mi_reg64(MI_PREDICATE_SRC0),
5758                 mi_reg32(ANV_PREDICATE_RESULT_REG));
5759    mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
5760 
5761    anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
5762       mip.LoadOperation    = LOAD_LOADINV;
5763       mip.CombineOperation = COMBINE_SET;
5764       mip.CompareOperation = COMPARE_SRCS_EQUAL;
5765    }
5766 #endif
5767 }
5768 
5769 #if GFX_VERx10 >= 75
5770 void genX(CmdBeginConditionalRenderingEXT)(
5771    VkCommandBuffer                             commandBuffer,
5772    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
5773 {
5774    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5775    ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
5776    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5777    struct anv_address value_address =
5778       anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
5779 
5780    const bool isInverted = pConditionalRenderingBegin->flags &
5781                            VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
5782 
5783    cmd_state->conditional_render_enabled = true;
5784 
5785    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5786 
5787    struct mi_builder b;
5788    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
5789 
5790    /* Section 19.4 of the Vulkan 1.1.85 spec says:
5791     *
5792     *    If the value of the predicate in buffer memory changes
5793     *    while conditional rendering is active, the rendering commands
5794     *    may be discarded in an implementation-dependent way.
5795     *    Some implementations may latch the value of the predicate
5796     *    upon beginning conditional rendering while others
5797     *    may read it before every rendering command.
5798     *
5799     * So it's perfectly fine to read a value from the buffer once.
5800     */
5801    struct mi_value value = mi_mem32(value_address);
5802 
5803    /* Precompute the predicate result; this is necessary to support secondary
5804     * command buffers, since it is unknown whether conditional rendering is
5805     * inverted when they are populated.
5806     */
5807    mi_store(&b, mi_reg64(ANV_PREDICATE_RESULT_REG),
5808                 isInverted ? mi_uge(&b, mi_imm(0), value) :
5809                              mi_ult(&b, mi_imm(0), value));
5810 }
5811 
5812 void genX(CmdEndConditionalRenderingEXT)(
5813     VkCommandBuffer                             commandBuffer)
5814 {
5815    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5816    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
5817 
5818    cmd_state->conditional_render_enabled = false;
5819 }
5820 #endif
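/* Purely illustrative, application-side sketch of how the predicate read by
 * genX(CmdBeginConditionalRenderingEXT) above might be supplied; it is not
 * driver code.  The predicate buffer handle and offset are assumptions for
 * the example.  The draws between begin/end execute only if the 32-bit
 * value at that offset is non-zero (the INVERTED flag flips the test).
 */
static inline void
example_conditional_draw(VkCommandBuffer cmd, VkBuffer predicate_buffer)
{
   const VkConditionalRenderingBeginInfoEXT begin_info = {
      .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
      .buffer = predicate_buffer,
      .offset = 0,
      .flags = 0,           /* or VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT */
   };
   vkCmdBeginConditionalRenderingEXT(cmd, &begin_info);
   vkCmdDraw(cmd, 3, 1, 0, 0);    /* discarded when the predicate reads 0 */
   vkCmdEndConditionalRenderingEXT(cmd);
}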
5821 
5822 /* Set of stage bits that are pipelined, i.e. whose work gets queued
5823  * by the command streamer for later execution.
5824  */
5825 #define ANV_PIPELINE_STAGE_PIPELINED_BITS \
5826    ~(VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | \
5827      VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | \
5828      VK_PIPELINE_STAGE_2_HOST_BIT | \
5829      VK_PIPELINE_STAGE_2_CONDITIONAL_RENDERING_BIT_EXT)
5830 
5831 void genX(CmdSetEvent2)(
5832     VkCommandBuffer                             commandBuffer,
5833     VkEvent                                     _event,
5834     const VkDependencyInfo*                     pDependencyInfo)
5835 {
5836    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5837    ANV_FROM_HANDLE(anv_event, event, _event);
5838 
5839    VkPipelineStageFlags2 src_stages = 0;
5840 
5841    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
5842       src_stages |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
5843    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
5844       src_stages |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
5845    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
5846       src_stages |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
5847 
5848    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5849    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5850 
5851    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5852       if (src_stages & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5853          pc.StallAtPixelScoreboard = true;
5854          pc.CommandStreamerStallEnable = true;
5855       }
5856 
5857       pc.DestinationAddressType  = DAT_PPGTT;
5858       pc.PostSyncOperation       = WriteImmediateData;
5859       pc.Address = (struct anv_address) {
5860          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5861          event->state.offset
5862       };
5863       pc.ImmediateData           = VK_EVENT_SET;
5864       anv_debug_dump_pc(pc);
5865    }
5866 }
5867 
5868 void genX(CmdResetEvent2)(
5869     VkCommandBuffer                             commandBuffer,
5870     VkEvent                                     _event,
5871     VkPipelineStageFlags2                       stageMask)
5872 {
5873    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5874    ANV_FROM_HANDLE(anv_event, event, _event);
5875 
5876    cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
5877    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5878 
5879    anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
5880       if (stageMask & ANV_PIPELINE_STAGE_PIPELINED_BITS) {
5881          pc.StallAtPixelScoreboard = true;
5882          pc.CommandStreamerStallEnable = true;
5883       }
5884 
5885       pc.DestinationAddressType  = DAT_PPGTT;
5886       pc.PostSyncOperation       = WriteImmediateData;
5887       pc.Address = (struct anv_address) {
5888          cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5889          event->state.offset
5890       };
5891       pc.ImmediateData           = VK_EVENT_RESET;
5892       anv_debug_dump_pc(pc);
5893    }
5894 }
5895 
5896 void genX(CmdWaitEvents2)(
5897     VkCommandBuffer                             commandBuffer,
5898     uint32_t                                    eventCount,
5899     const VkEvent*                              pEvents,
5900     const VkDependencyInfo*                     pDependencyInfos)
5901 {
5902    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5903 
5904 #if GFX_VER >= 8
5905    for (uint32_t i = 0; i < eventCount; i++) {
5906       ANV_FROM_HANDLE(anv_event, event, pEvents[i]);
5907 
5908       anv_batch_emit(&cmd_buffer->batch, GENX(MI_SEMAPHORE_WAIT), sem) {
5909          sem.WaitMode            = PollingMode;
5910          sem.CompareOperation    = COMPARE_SAD_EQUAL_SDD;
5911          sem.SemaphoreDataDword  = VK_EVENT_SET;
5912          sem.SemaphoreAddress = (struct anv_address) {
5913             cmd_buffer->device->dynamic_state_pool.block_pool.bo,
5914             event->state.offset
5915          };
5916       }
5917    }
5918 #else
5919    anv_finishme("Implement events on gfx7");
5920 #endif
5921 
5922    cmd_buffer_barrier(cmd_buffer, pDependencyInfos, "wait event");
5923 }
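/* Purely illustrative, application-side sketch of the set/wait pair handled
 * above; not driver code.  The event handle is assumed to exist.  The
 * srcStageMask below falls in ANV_PIPELINE_STAGE_PIPELINED_BITS, so the
 * PIPE_CONTROL in genX(CmdSetEvent2) gets a CS stall before the post-sync
 * write, and genX(CmdWaitEvents2) polls the same dword with
 * MI_SEMAPHORE_WAIT on gfx8+.
 */
static inline void
example_set_and_wait_event(VkCommandBuffer cmd, VkEvent event)
{
   const VkMemoryBarrier2 barrier = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
      .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
      .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
      .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
      .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
   };
   const VkDependencyInfo dep_info = {
      .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
      .memoryBarrierCount = 1,
      .pMemoryBarriers = &barrier,
   };
   vkCmdSetEvent2(cmd, event, &dep_info);
   /* ... other work ... */
   vkCmdWaitEvents2(cmd, 1, &event, &dep_info);
}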
5924 
5925 static uint32_t vk_to_intel_index_type(VkIndexType type)
5926 {
5927    switch (type) {
5928    case VK_INDEX_TYPE_UINT8_EXT:
5929       return INDEX_BYTE;
5930    case VK_INDEX_TYPE_UINT16:
5931       return INDEX_WORD;
5932    case VK_INDEX_TYPE_UINT32:
5933       return INDEX_DWORD;
5934    default:
5935       unreachable("invalid index type");
5936    }
5937 }
5938 
5939 void genX(CmdBindIndexBuffer)(
5940     VkCommandBuffer                             commandBuffer,
5941     VkBuffer                                    _buffer,
5942     VkDeviceSize                                offset,
5943     VkIndexType                                 indexType)
5944 {
5945    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5946    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
5947 
5948    cmd_buffer->state.gfx.restart_index = vk_index_to_restart(indexType);
5949    cmd_buffer->state.gfx.index_buffer = buffer;
5950    cmd_buffer->state.gfx.index_type = vk_to_intel_index_type(indexType);
5951    cmd_buffer->state.gfx.index_offset = offset;
5952 
5953    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_INDEX_BUFFER;
5954 }
5955 
5956 VkResult genX(CmdSetPerformanceOverrideINTEL)(
5957     VkCommandBuffer                             commandBuffer,
5958     const VkPerformanceOverrideInfoINTEL*       pOverrideInfo)
5959 {
5960    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
5961 
5962    switch (pOverrideInfo->type) {
5963    case VK_PERFORMANCE_OVERRIDE_TYPE_NULL_HARDWARE_INTEL: {
5964       anv_batch_write_reg(&cmd_buffer->batch, GENX(INSTPM), instpm) {
5965          instpm._3DRenderingInstructionDisable = pOverrideInfo->enable;
5966          instpm.MediaInstructionDisable = pOverrideInfo->enable;
5967          instpm._3DRenderingInstructionDisableMask = true;
5968          instpm.MediaInstructionDisableMask = true;
5969       }
5970       break;
5971    }
5972 
5973    case VK_PERFORMANCE_OVERRIDE_TYPE_FLUSH_GPU_CACHES_INTEL:
5974       if (pOverrideInfo->enable) {
5975          /* FLUSH ALL THE THINGS! As requested by the MDAPI team. */
5976          anv_add_pending_pipe_bits(cmd_buffer,
5977                                    ANV_PIPE_FLUSH_BITS |
5978                                    ANV_PIPE_INVALIDATE_BITS,
5979                                    "perf counter isolation");
5980          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
5981       }
5982       break;
5983 
5984    default:
5985       unreachable("Invalid override");
5986    }
5987 
5988    return VK_SUCCESS;
5989 }
5990 
5991 VkResult genX(CmdSetPerformanceStreamMarkerINTEL)(
5992     VkCommandBuffer                             commandBuffer,
5993     const VkPerformanceStreamMarkerInfoINTEL*   pMarkerInfo)
5994 {
5995    /* TODO: Waiting on the register to write; this might depend on the generation. */
5996 
5997    return VK_SUCCESS;
5998 }
5999 
6000 #define TIMESTAMP 0x2358
6001 
6002 void genX(cmd_emit_timestamp)(struct anv_batch *batch,
6003                               struct anv_device *device,
6004                               struct anv_address addr,
6005                               enum anv_timestamp_capture_type type) {
6006    switch (type) {
6007    case ANV_TIMESTAMP_CAPTURE_TOP_OF_PIPE: {
6008       struct mi_builder b;
6009       mi_builder_init(&b, device->info, batch);
6010       mi_store(&b, mi_mem64(addr), mi_reg64(TIMESTAMP));
6011       break;
6012    }
6013 
6014    case ANV_TIMESTAMP_CAPTURE_END_OF_PIPE:
6015       anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
6016          pc.PostSyncOperation   = WriteTimestamp;
6017          pc.Address             = addr;
6018          anv_debug_dump_pc(pc);
6019       }
6020       break;
6021 
6022    case ANV_TIMESTAMP_CAPTURE_AT_CS_STALL:
6023       anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) {
6024          pc.CommandStreamerStallEnable = true;
6025          pc.PostSyncOperation    = WriteTimestamp;
6026          pc.Address              = addr;
6027          anv_debug_dump_pc(pc);
6028       }
6029       break;
6030 
6031    default:
6032       unreachable("invalid");
6033    }
6034 }
6035
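/* A minimal, illustrative usage sketch of genX(cmd_emit_timestamp) above;
 * not existing driver code.  The BO and offset parameters are assumptions
 * for the example: in practice the address would point into something like
 * a query pool or tracing buffer.
 */
static inline void
example_capture_end_of_pipe_timestamp(struct anv_cmd_buffer *cmd_buffer,
                                       struct anv_bo *bo, uint64_t offset)
{
   struct anv_address addr = { .bo = bo, .offset = offset };

   /* END_OF_PIPE uses a PIPE_CONTROL post-sync write, so the timestamp is
    * captured only once prior work has drained.
    */
   genX(cmd_emit_timestamp)(&cmd_buffer->batch, cmd_buffer->device,
                            addr, ANV_TIMESTAMP_CAPTURE_END_OF_PIPE);
}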