1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 
30 #include "genxml/gen_macros.h"
31 #include "genxml/genX_pack.h"
32 #include "common/intel_genX_state_brw.h"
33 
34 #include "ds/intel_tracepoints.h"
35 
36 #include "genX_mi_builder.h"
37 
38 static void
39 cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
40 {
41    struct anv_graphics_pipeline *pipeline =
42       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
43    VkShaderStageFlags stages = pipeline->base.base.active_stages;
44 
45    /* In order to avoid thrash, we assume that vertex and fragment stages
46     * always exist.  In the rare case where one is missing *and* the other
47     * uses push constants, this may be suboptimal.  However, avoiding stalls
48     * seems more important.
49     */
50    stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
51    if (anv_pipeline_is_primitive(pipeline))
52       stages |= VK_SHADER_STAGE_VERTEX_BIT;
53 
54    if (stages == cmd_buffer->state.gfx.push_constant_stages)
55       return;
56 
57    unsigned push_constant_kb;
58 
59    const struct intel_device_info *devinfo = cmd_buffer->device->info;
60    if (anv_pipeline_is_mesh(pipeline))
61       push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
62    else
63       push_constant_kb = devinfo->max_constant_urb_size_kb;
64 
65    const unsigned num_stages =
66       util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
67    unsigned size_per_stage = push_constant_kb / num_stages;
68 
69    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
70     * units of 2KB.  Incidentally, these are the same platforms that have
71     * 32KB worth of push constant space.
72     */
73    if (push_constant_kb == 32)
74       size_per_stage &= ~1u;
75 
76    uint32_t kb_used = 0;
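   /* Stages VS..GS reuse the 3DSTATE_PUSH_CONSTANT_ALLOC_VS template; only
    * the sub-opcode (18 + stage) differs per stage.  The PS allocation is
    * emitted separately below with whatever space is left over.
    */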
77    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
78       const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
79       anv_batch_emit(&cmd_buffer->batch,
80                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
81          alloc._3DCommandSubOpcode  = 18 + i;
82          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
83          alloc.ConstantBufferSize   = push_size;
84       }
85       kb_used += push_size;
86    }
87 
88    anv_batch_emit(&cmd_buffer->batch,
89                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
90       alloc.ConstantBufferOffset = kb_used;
91       alloc.ConstantBufferSize = push_constant_kb - kb_used;
92    }
93 
94 #if GFX_VERx10 == 125
95    /* DG2: Wa_22011440098
96     * MTL: Wa_18022330953
97     *
98     * In 3D mode, after programming the push constant alloc command,
99     * immediately program a push constant command (ZERO length) without any
100     * commit between them.
101     */
102    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
103       /* Update empty push constants for all stages (bitmask = 11111b) */
104       c.ShaderUpdateEnable = 0x1f;
105       c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
106    }
107 #endif
108 
109    cmd_buffer->state.gfx.push_constant_stages = stages;
110 
111    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
112     *
113     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
114     *    the next 3DPRIMITIVE command after programming the
115     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
116     *
117     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
118     * pipeline setup, we need to dirty push constants.
119     */
120    cmd_buffer->state.push_constants_dirty |= stages;
121 }
122 
123 static void
124 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
125                                     uint32_t stages)
126 {
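   /* Per-stage _3DCommandSubOpcode values: the *_VS packets emitted below act
    * as templates and the sub-opcode is patched to select the actual stage.
    */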
127    static const uint32_t sampler_state_opcodes[] = {
128       [MESA_SHADER_VERTEX]                      = 43,
129       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
130       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
131       [MESA_SHADER_GEOMETRY]                    = 46,
132       [MESA_SHADER_FRAGMENT]                    = 47,
133    };
134 
135    static const uint32_t binding_table_opcodes[] = {
136       [MESA_SHADER_VERTEX]                      = 38,
137       [MESA_SHADER_TESS_CTRL]                   = 39,
138       [MESA_SHADER_TESS_EVAL]                   = 40,
139       [MESA_SHADER_GEOMETRY]                    = 41,
140       [MESA_SHADER_FRAGMENT]                    = 42,
141    };
142 
143    anv_foreach_stage(s, stages) {
144       assert(s < ARRAY_SIZE(binding_table_opcodes));
145 
146       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
147          anv_batch_emit(&cmd_buffer->batch,
148                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
149             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
150             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
151          }
152       }
153 
154       /* Always emit binding table pointers if we're asked to, since on SKL
155        * this is what flushes push constants. */
156       anv_batch_emit(&cmd_buffer->batch,
157                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
158          btp._3DCommandSubOpcode = binding_table_opcodes[s];
159          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
160       }
161    }
162 }
163 
164 static struct anv_address
165 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
166                        const struct anv_shader_bin *shader,
167                        const struct anv_push_range *range)
168 {
169    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
170    switch (range->set) {
171    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
172       /* This is a descriptor set buffer so the set index is
173        * actually given by binding->binding.  (Yes, that's
174        * confusing.)
175        */
176       struct anv_descriptor_set *set =
177          gfx_state->base.descriptors[range->index];
178       return anv_descriptor_set_address(set);
179    }
180 
181    case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
182       return anv_address_from_u64(
183          anv_cmd_buffer_descriptor_buffer_address(
184             cmd_buffer,
185             gfx_state->base.descriptor_buffers[range->index].buffer_index) +
186          gfx_state->base.descriptor_buffers[range->index].buffer_offset);
187    }
188 
189    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
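      /* Lazily upload the client push constant data into dynamic state the
       * first time a range needs it during this flush; subsequent ranges
       * reuse the same allocation.
       */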
190       if (gfx_state->base.push_constants_state.alloc_size == 0) {
191          gfx_state->base.push_constants_state =
192             anv_cmd_buffer_gfx_push_constants(cmd_buffer);
193       }
194       return anv_cmd_buffer_temporary_state_address(
195          cmd_buffer, gfx_state->base.push_constants_state);
196    }
197 
198    case ANV_DESCRIPTOR_SET_NULL:
199       return cmd_buffer->device->workaround_address;
200 
201    default: {
202       assert(range->set < MAX_SETS);
203       struct anv_descriptor_set *set =
204          gfx_state->base.descriptors[range->set];
205       const struct anv_descriptor *desc =
206          &set->descriptors[range->index];
207 
208       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
209          if (desc->buffer) {
210             return anv_address_add(desc->buffer->address,
211                                    desc->offset);
212          }
213       } else {
214          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
215          if (desc->buffer) {
216             const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
217             uint32_t dynamic_offset =
218                pipe_state->dynamic_offsets[
219                   range->set].offsets[range->dynamic_offset_index];
220             return anv_address_add(desc->buffer->address,
221                                    desc->offset + dynamic_offset);
222          }
223       }
224 
225       /* For NULL UBOs, we just return an address in the workaround BO.  We do
226        * writes to it for workarounds but always at the bottom.  The higher
227        * bytes should be all zeros.
228        */
229       assert(range->length * 32 <= 2048);
230       return cmd_buffer->device->workaround_address;
231    }
232    }
233 }
234 
235 
236 /** Returns the size in bytes of the bound buffer
237  *
238  * The range is relative to the start of the buffer, not the start of the
239  * range.  The returned range may be smaller than
240  *
241  *    (range->start + range->length) * 32;
242  */
243 static uint32_t
244 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
245                           const struct anv_shader_bin *shader,
246                           const struct anv_push_range *range)
247 {
248    assert(shader->stage != MESA_SHADER_COMPUTE);
249    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
250    switch (range->set) {
251    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
252       struct anv_descriptor_set *set =
253          gfx_state->base.descriptors[range->index];
254       struct anv_state state = set->desc_surface_mem;
255       assert(range->start * 32 < state.alloc_size);
256       assert((range->start + range->length) * 32 <= state.alloc_size);
257       return state.alloc_size;
258    }
259 
260    case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
261       return gfx_state->base.pipeline->layout.set[
262          range->index].layout->descriptor_buffer_surface_size;
263 
264    case ANV_DESCRIPTOR_SET_NULL:
265    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
266       return (range->start + range->length) * 32;
267 
268    default: {
269       assert(range->set < MAX_SETS);
270       struct anv_descriptor_set *set =
271          gfx_state->base.descriptors[range->set];
272       const struct anv_descriptor *desc =
273          &set->descriptors[range->index];
274 
275       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
276          /* Here we promote a UBO to a binding table entry so that we can avoid
277           * a layer of indirection.  We use the descriptor set's internally
278           * allocated surface state to fill the binding table entry. */
279          if (!desc->buffer)
280             return 0;
281 
282          if (range->start * 32 > desc->bind_range)
283             return 0;
284 
285          return desc->bind_range;
286       } else {
287          if (!desc->buffer)
288             return 0;
289 
290          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
291          /* Compute the offset within the buffer */
292          const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
293          uint32_t dynamic_offset =
294             pipe_state->dynamic_offsets[
295                range->set].offsets[range->dynamic_offset_index];
296          uint64_t offset = desc->offset + dynamic_offset;
297          /* Clamp to the buffer size */
298          offset = MIN2(offset, desc->buffer->vk.size);
299          /* Clamp the range to the buffer size */
300          uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
301 
302          /* Align the range for consistency */
303          bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
304 
305          return bound_range;
306       }
307    }
308    }
309 }
310 
311 static void
312 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
313                               gl_shader_stage stage,
314                               struct anv_address *buffers,
315                               unsigned buffer_count)
316 {
317    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
318    const struct anv_graphics_pipeline *pipeline =
319       anv_pipeline_to_graphics(gfx_state->base.pipeline);
320 
321    static const uint32_t push_constant_opcodes[] = {
322       [MESA_SHADER_VERTEX]                      = 21,
323       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
324       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
325       [MESA_SHADER_GEOMETRY]                    = 22,
326       [MESA_SHADER_FRAGMENT]                    = 23,
327    };
328 
329    assert(stage < ARRAY_SIZE(push_constant_opcodes));
330 
331    UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
332 
333    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
334       c._3DCommandSubOpcode = push_constant_opcodes[stage];
335 
336       /* Set MOCS.
337        *
338        * We only have one MOCS field for the whole packet, not one per
339        * buffer.  We could go out of our way here to walk over all of
340        * the buffers and see if any of them are used externally and use
341        * the external MOCS.  However, the notion that someone would use
342        * the same bit of memory for both scanout and a UBO is nuts.
343        *
344        * Let's not bother and assume it's all internal.
345        */
346       c.MOCS = mocs;
347 
348       if (anv_pipeline_has_stage(pipeline, stage)) {
349          const struct anv_pipeline_bind_map *bind_map =
350             &pipeline->base.shaders[stage]->bind_map;
351 
352          /* The Skylake PRM contains the following restriction:
353           *
354           *    "The driver must ensure The following case does not occur
355           *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
356           *     buffer 3 read length equal to zero committed followed by a
357           *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
358           *     zero committed."
359           *
360           * To avoid this, we program the buffers in the highest slots.
361           * This way, slot 0 is only used if slot 3 is also used.
362           */
363          assert(buffer_count <= 4);
364          const unsigned shift = 4 - buffer_count;
365          for (unsigned i = 0; i < buffer_count; i++) {
366             const struct anv_push_range *range = &bind_map->push_ranges[i];
367 
368             /* At this point we only have non-empty ranges */
369             assert(range->length > 0);
370 
371             c.ConstantBody.ReadLength[i + shift] = range->length;
372             c.ConstantBody.Buffer[i + shift] =
373                anv_address_add(buffers[i], range->start * 32);
374          }
375       }
376    }
377 }
378 
379 #if GFX_VER >= 12
380 static void
381 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
382                                   uint32_t shader_mask,
383                                   struct anv_address *buffers,
384                                   uint32_t buffer_count)
385 {
386    if (buffer_count == 0) {
387       if (shader_mask) {
388          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
389             c.ShaderUpdateEnable = shader_mask;
390             c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
391          }
392       }
393 
394       return;
395    }
396 
397    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
398    const struct anv_graphics_pipeline *pipeline =
399       anv_pipeline_to_graphics(gfx_state->base.pipeline);
400 
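   /* With a non-zero buffer_count, callers pass exactly one stage bit in
    * shader_mask, so it maps to a single gl_shader_stage here.
    */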
401    gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
402 
403    const struct anv_pipeline_bind_map *bind_map =
404       &pipeline->base.shaders[stage]->bind_map;
405 
406    uint32_t *dw;
407    const uint32_t buffer_mask = (1 << buffer_count) - 1;
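   /* 2 DWords of fixed packet plus one 2-DWord 3DSTATE_CONSTANT_ALL_DATA
    * entry (buffer pointer + read length) per constant buffer.
    */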
408    const uint32_t num_dwords = 2 + 2 * buffer_count;
409 
410    dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
411                         GENX(3DSTATE_CONSTANT_ALL),
412                         .ShaderUpdateEnable = shader_mask,
413                         .PointerBufferMask = buffer_mask,
414                         .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
415 
416    for (int i = 0; i < buffer_count; i++) {
417       const struct anv_push_range *range = &bind_map->push_ranges[i];
418       GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
419          &cmd_buffer->batch, dw + 2 + i * 2,
420          &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
421             .PointerToConstantBuffer =
422                anv_address_add(buffers[i], range->start * 32),
423             .ConstantBufferReadLength = range->length,
424          });
425    }
426 }
427 #endif
428 
429 static void
430 cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
431                                     VkShaderStageFlags dirty_stages)
432 {
433    VkShaderStageFlags flushed = 0;
434    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
435    const struct anv_graphics_pipeline *pipeline =
436       anv_pipeline_to_graphics(gfx_state->base.pipeline);
437 
438 #if GFX_VER >= 12
439    uint32_t nobuffer_stages = 0;
440 #endif
441 
442    /* Compute robust pushed register access mask for each stage. */
443    anv_foreach_stage(stage, dirty_stages) {
444       if (!anv_pipeline_has_stage(pipeline, stage))
445          continue;
446 
447       const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
448       if (shader->prog_data->zero_push_reg) {
449          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
450          struct anv_push_constants *push = &gfx_state->base.push_constants;
451 
452          push->push_reg_mask[stage] = 0;
453          /* Start of the current range in the shader, relative to the start of
454           * push constants in the shader.
455           */
456          unsigned range_start_reg = 0;
457          for (unsigned i = 0; i < 4; i++) {
458             const struct anv_push_range *range = &bind_map->push_ranges[i];
459             if (range->length == 0)
460                continue;
461 
462             unsigned bound_size =
463                get_push_range_bound_size(cmd_buffer, shader, range);
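            /* Only the registers actually backed by the bound buffer get a
             * bit in push_reg_mask; the remainder of the range is left clear
             * so that robust access to it reads back as zero (see
             * zero_push_reg).
             */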
464             if (bound_size >= range->start * 32) {
465                unsigned bound_regs =
466                   MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
467                        range->length);
468                assert(range_start_reg + bound_regs <= 64);
469                push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
470                                                               bound_regs);
471             }
472 
473             cmd_buffer->state.push_constants_dirty |=
474                mesa_to_vk_shader_stage(stage);
475             gfx_state->base.push_constants_data_dirty = true;
476 
477             range_start_reg += range->length;
478          }
479       }
480    }
481 
482    /* Setting NULL resets the push constant state so that we allocate a new
483     * one if needed. If the push constant data is not dirty,
484     * get_push_range_address can re-use the existing allocation.
485     *
486     * Always reallocate on gfx9, gfx11 to fix push constant related flaky tests.
487     * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/11064
488     */
489    if (gfx_state->base.push_constants_data_dirty || GFX_VER < 12)
490       gfx_state->base.push_constants_state = ANV_STATE_NULL;
491 
492    anv_foreach_stage(stage, dirty_stages) {
493       unsigned buffer_count = 0;
494       flushed |= mesa_to_vk_shader_stage(stage);
495       UNUSED uint32_t max_push_range = 0;
496 
497       struct anv_address buffers[4] = {};
498       if (anv_pipeline_has_stage(pipeline, stage)) {
499          const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
500          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
501 
502          /* We have to gather buffer addresses as a second step because the
503           * loop above puts data into the push constant area and the call to
504           * get_push_range_address is what locks our push constants and copies
505           * them into the actual GPU buffer.  If we did the two loops at the
506           * same time, we'd risk only having some of the sizes in the push
507           * constant buffer when we did the copy.
508           */
509          for (unsigned i = 0; i < 4; i++) {
510             const struct anv_push_range *range = &bind_map->push_ranges[i];
511             if (range->length == 0)
512                break;
513 
514             buffers[i] = get_push_range_address(cmd_buffer, shader, range);
515             max_push_range = MAX2(max_push_range, range->length);
516             buffer_count++;
517          }
518 
519          /* We have at most 4 buffers but they should be tightly packed */
520          for (unsigned i = buffer_count; i < 4; i++)
521             assert(bind_map->push_ranges[i].length == 0);
522       }
523 
524 #if GFX_VER >= 12
525       /* If this stage doesn't have any push constants, emit it later in a
526        * single CONSTANT_ALL packet.
527        */
528       if (buffer_count == 0) {
529          nobuffer_stages |= 1 << stage;
530          continue;
531       }
532 
533       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
534        * contains only 5 bits, so we can only use it for buffers smaller than
535        * 32.
536        *
537        * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
538        * in 3DSTATE_CONSTANT_ALL.  It should still be safe to use the command
539        * for disabling stages, where all address bits are zero.  However, we
540        * can't safely use it for general buffers with arbitrary addresses.
541        * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
542        * case.
543        */
544       if (max_push_range < 32 && GFX_VERx10 > 120) {
545          cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
546                                            buffers, buffer_count);
547          continue;
548       }
549 #endif
550 
551       cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
552    }
553 
554 #if GFX_VER >= 12
555    if (nobuffer_stages)
556       /* Wa_16011448509: all address bits are zero */
557       cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
558 #endif
559 
560    cmd_buffer->state.push_constants_dirty &= ~flushed;
561    gfx_state->base.push_constants_data_dirty = false;
562 }
563 
564 #if GFX_VERx10 >= 125
565 static void
566 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
567                                   VkShaderStageFlags dirty_stages)
568 {
569    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
570    const struct anv_graphics_pipeline *pipeline =
571       anv_pipeline_to_graphics(gfx_state->base.pipeline);
572 
573    if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
574        anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
575 
576       const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
577       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
578 
579       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
580          const struct anv_push_range *range = &bind_map->push_ranges[0];
581          if (range->length > 0) {
582             struct anv_address buffer =
583                get_push_range_address(cmd_buffer, shader, range);
584 
585             uint64_t addr = anv_address_physical(buffer);
586             data.InlineData[0] = addr & 0xffffffff;
587             data.InlineData[1] = addr >> 32;
588 
589             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
590                    cmd_buffer->state.gfx.base.push_constants.client_data,
591                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
592          }
593       }
594    }
595 
596    if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
597        anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
598 
599       const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
600       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
601 
602       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
603          const struct anv_push_range *range = &bind_map->push_ranges[0];
604          if (range->length > 0) {
605             struct anv_address buffer =
606                get_push_range_address(cmd_buffer, shader, range);
607 
608             uint64_t addr = anv_address_physical(buffer);
609             data.InlineData[0] = addr & 0xffffffff;
610             data.InlineData[1] = addr >> 32;
611 
612             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
613                    cmd_buffer->state.gfx.base.push_constants.client_data,
614                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
615          }
616       }
617    }
618 
619    cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
620 }
621 #endif
622 
623 ALWAYS_INLINE static void
624 cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
625                                  const struct anv_graphics_pipeline *pipeline)
626 {
627    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
628       return;
629 
630    UNUSED bool need_rt_flush = false;
631    for (uint32_t rt = 0; rt < pipeline->num_color_outputs; rt++) {
632       /* No writes going to this render target so it won't affect the RT cache
633        */
634       if (pipeline->color_output_mapping[rt] == ANV_COLOR_OUTPUT_UNUSED)
635          continue;
636 
637       /* No change */
638       if (cmd_buffer->state.gfx.color_output_mapping[rt] ==
639           pipeline->color_output_mapping[rt])
640          continue;
641 
642       cmd_buffer->state.gfx.color_output_mapping[rt] =
643          pipeline->color_output_mapping[rt];
644       need_rt_flush = true;
645       cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
646    }
647 
648 #if GFX_VER >= 11
649    if (need_rt_flush) {
650       /* The PIPE_CONTROL command description says:
651        *
652        *    "Whenever a Binding Table Index (BTI) used by a Render Target Message
653        *     points to a different RENDER_SURFACE_STATE, SW must issue a Render
654        *     Target Cache Flush by enabling this bit. When render target flush
655        *     is set due to new association of BTI, PS Scoreboard Stall bit must
656        *     be set in this packet."
657        *
658        * Within a renderpass, the render target entries in the binding tables
659        * remain the same as what was set up at CmdBeginRendering(), with one
660        * exception where we have to set up a null render target because a
661        * fragment shader writes only depth/stencil yet the renderpass has been
662        * set up with at least one color attachment. This is because our render
663        * target messages in the shader always send the color.
664        */
665       anv_add_pending_pipe_bits(cmd_buffer,
666                                 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
667                                 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
668                                 "change RT due to shader outputs");
669    }
670 #endif
671 }
672 
673 ALWAYS_INLINE static void
674 cmd_buffer_flush_vertex_buffers(struct anv_cmd_buffer *cmd_buffer,
675                                 uint32_t vb_emit)
676 {
677    const struct vk_dynamic_graphics_state *dyn =
678       &cmd_buffer->vk.dynamic_graphics_state;
679    const uint32_t num_buffers = __builtin_popcount(vb_emit);
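   /* 3DSTATE_VERTEX_BUFFERS: 1 DWord of header plus one 4-DWord
    * VERTEX_BUFFER_STATE entry per dirty binding.
    */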
680    const uint32_t num_dwords = 1 + num_buffers * 4;
681    uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
682                                  GENX(3DSTATE_VERTEX_BUFFERS));
683    uint32_t i = 0;
684    u_foreach_bit(vb, vb_emit) {
685       struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
686       uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
687 
688       struct GENX(VERTEX_BUFFER_STATE) state;
689       if (buffer) {
690          uint32_t stride = dyn->vi_binding_strides[vb];
691          UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
692 
693          state = (struct GENX(VERTEX_BUFFER_STATE)) {
694             .VertexBufferIndex = vb,
695 
696             .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
697                              ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
698             .AddressModifyEnable = true,
699             .BufferPitch = stride,
700             .BufferStartingAddress = anv_address_add(buffer->address, offset),
701             .NullVertexBuffer = offset >= buffer->vk.size,
702 #if GFX_VER >= 12
703             .L3BypassDisable = true,
704 #endif
705 
706             .BufferSize = size,
707          };
708       } else {
709          state = (struct GENX(VERTEX_BUFFER_STATE)) {
710             .VertexBufferIndex = vb,
711             .NullVertexBuffer = true,
712             .MOCS = anv_mocs(cmd_buffer->device, NULL,
713                              ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
714          };
715       }
716 
717 #if GFX_VER == 9
718       genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
719                                                      state.BufferStartingAddress,
720                                                      state.BufferSize);
721 #endif
722 
723       GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
724       i++;
725    }
726 }
727 
728 ALWAYS_INLINE static void
729 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
730 {
731    struct anv_graphics_pipeline *pipeline =
732       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
733    const struct vk_dynamic_graphics_state *dyn =
734       &cmd_buffer->vk.dynamic_graphics_state;
735 
736    assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
737 
738    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);
739 
740    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
741 
742    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
743 
744    genX(flush_descriptor_buffers)(cmd_buffer, &cmd_buffer->state.gfx.base);
745 
746    genX(flush_pipeline_select_3d)(cmd_buffer);
747 
748    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
749       /* Wa_14015814527
750        *
751        * Apply task URB workaround when switching from task to primitive.
752        */
753       if (anv_pipeline_is_primitive(pipeline)) {
754          genX(apply_task_urb_workaround)(cmd_buffer);
755       } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
756          cmd_buffer->state.gfx.used_task_shader = true;
757       }
758 
759       cmd_buffer_maybe_flush_rt_writes(cmd_buffer, pipeline);
760    }
761 
762    /* Apply any pending pipeline flushes we may have.  We want to apply them
763     * now because, if any of those flushes are for things like push constants,
764     * the GPU will read the state at weird times.
765     */
766    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
767 
768    /* Check what vertex buffers have been rebound against the set of bindings
769     * being used by the current set of vertex attributes.
770     */
771    uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
772    /* If the pipeline changed, we have to consider all the valid bindings. */
773    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
774        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
775        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
776       vb_emit |= dyn->vi->bindings_valid;
777 
778    if (vb_emit) {
779       cmd_buffer_flush_vertex_buffers(cmd_buffer, vb_emit);
780       cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
781    }
782 
783    const bool any_dynamic_state_dirty =
784       vk_dynamic_graphics_state_any_dirty(dyn);
785    uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
786                                 pipeline->base.base.active_stages;
787 
788    descriptors_dirty |=
789       genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
790                                               &cmd_buffer->state.gfx.base,
791                                               &pipeline->base.base);
792 
793    if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
794        !any_dynamic_state_dirty &&
795        ((cmd_buffer->state.push_constants_dirty &
796          (VK_SHADER_STAGE_ALL_GRAPHICS |
797           VK_SHADER_STAGE_TASK_BIT_EXT |
798           VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
799       return;
800 
801    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
802       /* Wa_16011411144:
803        *
804        * SW must insert a PIPE_CONTROL cmd before and after the
805        * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
806        * state is not combined with other state changes.
807        */
808       if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
809          anv_add_pending_pipe_bits(cmd_buffer,
810                                    ANV_PIPE_CS_STALL_BIT,
811                                    "before SO_BUFFER change WA");
812          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
813       }
814 
815       /* We don't need any per-buffer dirty tracking because you're not
816        * allowed to bind different XFB buffers while XFB is enabled.
817        */
818       for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
819          struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
820          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
821 #if GFX_VER < 12
822             sob.SOBufferIndex = idx;
823 #else
824             sob._3DCommandOpcode = 0;
825             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
826 #endif
827 
828             if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
829                sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
830                                    ISL_SURF_USAGE_STREAM_OUT_BIT);
831                sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
832                                                         xfb->offset);
833                sob.SOBufferEnable = true;
834                sob.StreamOffsetWriteEnable = false;
835                /* Size is in DWords - 1 */
836                sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
837             } else {
838                sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
839             }
840          }
841       }
842 
843       if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
844          /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
845          anv_add_pending_pipe_bits(cmd_buffer,
846                                    ANV_PIPE_CS_STALL_BIT,
847                                    "after SO_BUFFER change WA");
848          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
849       } else if (GFX_VER >= 10) {
850          /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
851          anv_add_pending_pipe_bits(cmd_buffer,
852                                    ANV_PIPE_CS_STALL_BIT,
853                                    "after 3DSTATE_SO_BUFFER call");
854       }
855    }
856 
857    /* Flush the runtime state into the HW state tracking */
858    if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
859       genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);
860 
861    /* Flush the HW state into the command buffer */
862    if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
863       genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
864 
865    /* If the pipeline changed, we may need to re-allocate push constant space
866     * in the URB.
867     */
868    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
869       cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
870 
871       /* Also add the relocations (scratch buffers) */
872       VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
873                                               pipeline->base.base.batch.relocs);
874       if (result != VK_SUCCESS) {
875          anv_batch_set_error(&cmd_buffer->batch, result);
876          return;
877       }
878    }
879 
880    /* Render targets live in the same binding table as fragment descriptors */
881    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
882       descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
883 
884    /* We emit the binding tables and sampler tables first, then emit push
885     * constants and then finally emit binding table and sampler table
886     * pointers.  It has to happen in this order, since emitting the binding
887     * tables may change the push constants (in case of storage images). After
888     * emitting push constants, on SKL+ we have to emit the corresponding
889     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
890     */
891    uint32_t dirty = 0;
892    if (descriptors_dirty) {
893       dirty = genX(cmd_buffer_flush_descriptor_sets)(
894          cmd_buffer,
895          &cmd_buffer->state.gfx.base,
896          descriptors_dirty,
897          pipeline->base.shaders,
898          ARRAY_SIZE(pipeline->base.shaders));
899       cmd_buffer->state.descriptors_dirty &= ~dirty;
900    }
901 
902    if (dirty || cmd_buffer->state.push_constants_dirty) {
903       /* Because we're pushing UBOs, we have to push whenever either
904        * descriptors or push constants is dirty.
905        */
906       dirty |= cmd_buffer->state.push_constants_dirty &
907                pipeline->base.base.active_stages;
908       cmd_buffer_flush_gfx_push_constants(cmd_buffer,
909                                       dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
910 #if GFX_VERx10 >= 125
911       cmd_buffer_flush_mesh_inline_data(
912          cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
913                               VK_SHADER_STAGE_MESH_BIT_EXT));
914 #endif
915    }
916 
917    if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
918       cmd_buffer_emit_descriptor_pointers(cmd_buffer,
919                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
920    }
921 
922 #if GFX_VER >= 20
923    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE) {
924       anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BYTE_STRIDE), sb_stride) {
925          sb_stride.ByteStride = cmd_buffer->state.gfx.indirect_data_stride;
926          sb_stride.ByteStrideEnable = !cmd_buffer->state.gfx.indirect_data_stride_aligned;
927       }
928    }
929 #endif
930 
931    cmd_buffer->state.gfx.dirty = 0;
932 }
933 
934 ALWAYS_INLINE static bool
935 anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
936 {
937    const struct anv_device *device = cmd_buffer->device;
938    const struct anv_graphics_pipeline *pipeline =
939       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
940 
941    /* We cannot generate readable commands in protected mode. */
942    if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
943       return false;
944 
945    /* Limit generated draws to pipelines without HS stage. This makes things
946     * simpler for implementing Wa_1306463417, Wa_16011107343.
947     */
948    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
949        anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
950       return false;
951 
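   /* Below the instance's generated_indirect_threshold the setup cost of
    * generating the draws on the GPU presumably outweighs the benefit, so
    * use the regular indirect draw path instead.
    */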
952    return count >= device->physical->instance->generated_indirect_threshold;
953 }
954 
955 #include "genX_cmd_draw_helpers.h"
956 #include "genX_cmd_draw_generated_indirect.h"
957 
958 ALWAYS_INLINE static void
959 cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
960 {
961    UNUSED const bool protected = cmd_buffer->vk.pool->flags &
962                                  VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
963    UNUSED struct anv_graphics_pipeline *pipeline =
964       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
965 
966 #if INTEL_WA_16011107343_GFX_VER
967    if (intel_needs_workaround(cmd_buffer->device->info, 16011107343) &&
968        anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
969       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
970                                               final.hs, protected);
971    }
972 #endif
973 
974 #if INTEL_WA_22018402687_GFX_VER
975    if (intel_needs_workaround(cmd_buffer->device->info, 22018402687) &&
976        anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
977       /* Wa_22018402687:
978        *   In any 3D enabled context, just before any Tessellation enabled
979        *   draw call (3D Primitive), re-send the last programmed 3DSTATE_DS
980        *   again. This will make sure that the 3DSTATE_INT generated just
981        *   before the draw call will have TDS dirty which will make sure TDS
982        *   will launch the state thread before the draw call.
983        *
984        * This fixes a hang resulting from running anything using tessellation
985        * after a switch away from the mesh pipeline. We don't need to track
986        * said switch, as it matters at the HW level, and can be triggered even
987        * across processes, so we apply the Wa at all times.
988        */
989       anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
990                                               final.ds, protected);
991    }
992 #endif
993 
994    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
995 }
996 
997 ALWAYS_INLINE static void
998 batch_post_draw_wa(struct anv_batch *batch,
999                    const struct anv_device *device,
1000                    uint32_t primitive_topology,
1001                    uint32_t vertex_count)
1002 {
1003 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
1004    if (intel_needs_workaround(device->info, 22014412737) &&
1005        (primitive_topology == _3DPRIM_POINTLIST ||
1006         primitive_topology == _3DPRIM_LINELIST ||
1007         primitive_topology == _3DPRIM_LINESTRIP ||
1008         primitive_topology == _3DPRIM_LINELIST_ADJ ||
1009         primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
1010         primitive_topology == _3DPRIM_LINELOOP ||
1011         primitive_topology == _3DPRIM_POINTLIST_BF ||
1012         primitive_topology == _3DPRIM_LINESTRIP_CONT ||
1013         primitive_topology == _3DPRIM_LINESTRIP_BF ||
1014         primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
1015        (vertex_count == 1 || vertex_count == 2)) {
1016       genx_batch_emit_pipe_control_write
1017          (batch, device->info, 0, WriteImmediateData,
1018           device->workaround_address, 0, 0);
1019 
1020       /* Reset counter because we just emitted a PC */
1021       batch->num_3d_primitives_emitted = 0;
1022    } else if (intel_needs_workaround(device->info, 16014538804)) {
1023       batch->num_3d_primitives_emitted++;
1024       /* WA 16014538804:
1025        *    After every 3 3D_Primitive commands,
1026        *    at least 1 pipe_control must be inserted.
1027        */
1028       if (batch->num_3d_primitives_emitted == 3) {
1029          anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
1030          batch->num_3d_primitives_emitted = 0;
1031       }
1032    }
1033 #endif
1034 }
1035 
1036 void
1037 genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
1038                                       const struct anv_device *device,
1039                                       uint32_t primitive_topology,
1040                                       uint32_t vertex_count)
1041 {
1042    batch_post_draw_wa(batch, device, primitive_topology, vertex_count);
1043 }
1044 
1045 ALWAYS_INLINE static void
1046 cmd_buffer_post_draw_wa(struct anv_cmd_buffer *cmd_buffer,
1047                         uint32_t vertex_count,
1048                         uint32_t access_type)
1049 {
1050    batch_post_draw_wa(&cmd_buffer->batch, cmd_buffer->device,
1051                       cmd_buffer->state.gfx.dyn_state.vft.PrimitiveTopologyType,
1052                       vertex_count);
1053 
1054    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, access_type);
1055 
1056    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1057 }
1058 
1059 #if GFX_VER >= 11
1060 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
1061 #else
1062 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
1063 #endif
1064 
1064 
1065 void genX(CmdDraw)(
1066     VkCommandBuffer                             commandBuffer,
1067     uint32_t                                    vertexCount,
1068     uint32_t                                    instanceCount,
1069     uint32_t                                    firstVertex,
1070     uint32_t                                    firstInstance)
1071 {
1072    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1073    struct anv_graphics_pipeline *pipeline =
1074       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1075 
1076    if (anv_batch_has_error(&cmd_buffer->batch))
1077       return;
1078 
1079    const uint32_t count =
1080       vertexCount * instanceCount * pipeline->instance_multiplier;
1081    anv_measure_snapshot(cmd_buffer,
1082                         INTEL_SNAPSHOT_DRAW,
1083                         "draw", count);
1084    trace_intel_begin_draw(&cmd_buffer->trace);
1085 
1086    /* Select pipeline here to allow
1087     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1088     * cmd_buffer_flush_gfx_state().
1089     */
1090    genX(flush_pipeline_select_3d)(cmd_buffer);
1091 
1092 #if GFX_VER < 11
1093    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
1094                                               get_vs_prog_data(pipeline),
1095                                               firstVertex, firstInstance, 0,
1096                                               false /* force_flush */);
1097 #endif
1098 
1099    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1100 
1101    if (cmd_buffer->state.conditional_render_enabled)
1102       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1103 
1104    cmd_buffer_pre_draw_wa(cmd_buffer);
1105 
1106    anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1107       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1108 #if GFX_VERx10 >= 125
1109       prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1110 #endif
1111       prim.VertexAccessType         = SEQUENTIAL;
1112       prim.VertexCountPerInstance   = vertexCount;
1113       prim.StartVertexLocation      = firstVertex;
1114       prim.InstanceCount            = instanceCount *
1115                                       pipeline->instance_multiplier;
1116       prim.StartInstanceLocation    = firstInstance;
1117       prim.BaseVertexLocation       = 0;
1118 #if GFX_VER >= 11
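      /* On Gfx11+ the base vertex/instance and draw index reach the shader
       * through the extended 3DPRIMITIVE parameters instead of the vertex
       * constants path used above for GFX_VER < 11.
       */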
1119       prim.ExtendedParametersPresent = true;
1120       prim.ExtendedParameter0       = firstVertex;
1121       prim.ExtendedParameter1       = firstInstance;
1122       prim.ExtendedParameter2       = 0;
1123 #endif
1124    }
1125 
1126    cmd_buffer_post_draw_wa(cmd_buffer, vertexCount, SEQUENTIAL);
1127 
1128    trace_intel_end_draw(&cmd_buffer->trace, count,
1129                         pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1130                         pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1131 }
1132 
1133 void genX(CmdDrawMultiEXT)(
1134     VkCommandBuffer                             commandBuffer,
1135     uint32_t                                    drawCount,
1136     const VkMultiDrawInfoEXT                   *pVertexInfo,
1137     uint32_t                                    instanceCount,
1138     uint32_t                                    firstInstance,
1139     uint32_t                                    stride)
1140 {
1141    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1142    UNUSED struct anv_graphics_pipeline *pipeline =
1143       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1144 
1145    if (anv_batch_has_error(&cmd_buffer->batch))
1146       return;
1147 
1148    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1149 
1150    if (cmd_buffer->state.conditional_render_enabled)
1151       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1152 
1153    uint32_t i = 0;
1154 #if GFX_VER < 11
1155    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1156       cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
1157                                                  get_vs_prog_data(pipeline),
1158                                                  draw->firstVertex,
1159                                                  firstInstance, i, !i);
1160 
1161       const uint32_t count =
1162          draw->vertexCount * instanceCount * pipeline->instance_multiplier;
1163       anv_measure_snapshot(cmd_buffer,
1164                            INTEL_SNAPSHOT_DRAW,
1165                            "draw multi", count);
1166       trace_intel_begin_draw_multi(&cmd_buffer->trace);
1167 
1168       cmd_buffer_pre_draw_wa(cmd_buffer);
1169 
1170       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1171          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1172          prim.VertexAccessType         = SEQUENTIAL;
1173          prim.VertexCountPerInstance   = draw->vertexCount;
1174          prim.StartVertexLocation      = draw->firstVertex;
1175          prim.InstanceCount            = instanceCount *
1176                                          pipeline->instance_multiplier;
1177          prim.StartInstanceLocation    = firstInstance;
1178          prim.BaseVertexLocation       = 0;
1179       }
1180 
1181       cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1182                               pVertexInfo[drawCount - 1].vertexCount,
1183                               SEQUENTIAL);
1184 
1185       trace_intel_end_draw_multi(&cmd_buffer->trace, count,
1186                                  pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1187                                  pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1188    }
1189 #else
1190    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1191       const uint32_t count = draw->vertexCount * instanceCount;
1192       anv_measure_snapshot(cmd_buffer,
1193                            INTEL_SNAPSHOT_DRAW,
1194                            "draw multi", count);
1195       trace_intel_begin_draw_multi(&cmd_buffer->trace);
1196 
1197       cmd_buffer_pre_draw_wa(cmd_buffer);
1198 
1199       anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1200 #if GFX_VERx10 >= 125
1201          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1202 #endif
1203          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1204          prim.VertexAccessType         = SEQUENTIAL;
1205          prim.VertexCountPerInstance   = draw->vertexCount;
1206          prim.StartVertexLocation      = draw->firstVertex;
1207          prim.InstanceCount            = instanceCount *
1208                                          pipeline->instance_multiplier;
1209          prim.StartInstanceLocation    = firstInstance;
1210          prim.BaseVertexLocation       = 0;
1211          prim.ExtendedParametersPresent = true;
1212          prim.ExtendedParameter0       = draw->firstVertex;
1213          prim.ExtendedParameter1       = firstInstance;
1214          prim.ExtendedParameter2       = i;
1215       }
1216 
1217       cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1218                               pVertexInfo[drawCount - 1].vertexCount,
1219                               SEQUENTIAL);
1220 
1221       trace_intel_end_draw_multi(&cmd_buffer->trace, count,
1222                                  pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1223                                  pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1224    }
1225 #endif
1226 }
1227 
1228 void genX(CmdDrawIndexed)(
1229     VkCommandBuffer                             commandBuffer,
1230     uint32_t                                    indexCount,
1231     uint32_t                                    instanceCount,
1232     uint32_t                                    firstIndex,
1233     int32_t                                     vertexOffset,
1234     uint32_t                                    firstInstance)
1235 {
1236    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1237    struct anv_graphics_pipeline *pipeline =
1238       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1239 
1240    if (anv_batch_has_error(&cmd_buffer->batch))
1241       return;
1242 
1243    const uint32_t count =
1244       indexCount * instanceCount * pipeline->instance_multiplier;
1245    anv_measure_snapshot(cmd_buffer,
1246                         INTEL_SNAPSHOT_DRAW,
1247                         "draw indexed",
1248                         count);
1249    trace_intel_begin_draw_indexed(&cmd_buffer->trace);
1250 
1251    /* Select pipeline here to allow
1252     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1253     * cmd_buffer_flush_gfx_state().
1254     */
1255    genX(flush_pipeline_select_3d)(cmd_buffer);
1256 
1257 #if GFX_VER < 11
1258    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1259    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1260                                               vertexOffset, firstInstance,
1261                                               0, false /* force_flush */);
1262 #endif
1263 
1264    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1265 
1266    if (cmd_buffer->state.conditional_render_enabled)
1267       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1268 
1269    cmd_buffer_pre_draw_wa(cmd_buffer);
1270 
1271    anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1272       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1273 #if GFX_VERx10 >= 125
1274       prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1275 #endif
1276       prim.VertexAccessType         = RANDOM;
1277       prim.VertexCountPerInstance   = indexCount;
1278       prim.StartVertexLocation      = firstIndex;
1279       prim.InstanceCount            = instanceCount *
1280                                       pipeline->instance_multiplier;
1281       prim.StartInstanceLocation    = firstInstance;
1282       prim.BaseVertexLocation       = vertexOffset;
1283 #if GFX_VER >= 11
1284       prim.ExtendedParametersPresent = true;
1285       prim.ExtendedParameter0       = vertexOffset;
1286       prim.ExtendedParameter1       = firstInstance;
1287       prim.ExtendedParameter2       = 0;
1288 #endif
1289    }
1290 
1291    cmd_buffer_post_draw_wa(cmd_buffer, indexCount, RANDOM);
1292 
1293    trace_intel_end_draw_indexed(&cmd_buffer->trace, count,
1294                                 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1295                                 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1296 }
1297 
1298 void genX(CmdDrawMultiIndexedEXT)(
1299     VkCommandBuffer                             commandBuffer,
1300     uint32_t                                    drawCount,
1301     const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
1302     uint32_t                                    instanceCount,
1303     uint32_t                                    firstInstance,
1304     uint32_t                                    stride,
1305     const int32_t                              *pVertexOffset)
1306 {
1307    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1308    struct anv_graphics_pipeline *pipeline =
1309       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1310 
1311    if (anv_batch_has_error(&cmd_buffer->batch))
1312       return;
1313 
1314    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1315 
1316    if (cmd_buffer->state.conditional_render_enabled)
1317       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1318 
1319    uint32_t i = 0;
1320 #if GFX_VER < 11
1321    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1322    if (pVertexOffset) {
1323       if (vs_prog_data->uses_drawid) {
1324          bool emitted = true;
1325          if (vs_prog_data->uses_firstvertex ||
1326              vs_prog_data->uses_baseinstance) {
1327             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1328             emitted = true;
1329          }
1330          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1331             if (vs_prog_data->uses_drawid) {
1332                emit_draw_index(cmd_buffer, i);
1333                emitted = true;
1334             }
1335             /* Emitting draw index or vertex index BOs may result in needing
1336              * additional VF cache flushes.
1337              */
1338             if (emitted)
1339                genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1340 
1341             const uint32_t count =
1342                draw->indexCount * instanceCount * pipeline->instance_multiplier;
1343             anv_measure_snapshot(cmd_buffer,
1344                                  INTEL_SNAPSHOT_DRAW,
1345                                  "draw indexed multi",
1346                                  count);
1347             trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1348 
1349             cmd_buffer_pre_draw_wa(cmd_buffer);
1350 
1351             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1352                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1353                prim.VertexAccessType         = RANDOM;
1354                prim.VertexCountPerInstance   = draw->indexCount;
1355                prim.StartVertexLocation      = draw->firstIndex;
1356                prim.InstanceCount            = instanceCount *
1357                                                pipeline->instance_multiplier;
1358                prim.StartInstanceLocation    = firstInstance;
1359                prim.BaseVertexLocation       = *pVertexOffset;
1360             }
1361 
1362             cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1363                                     pIndexInfo[drawCount - 1].indexCount,
1364                                     RANDOM);
1365 
1366             trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1367                                                pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1368                                                pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1369             emitted = false;
1370          }
1371       } else {
1372          if (vs_prog_data->uses_firstvertex ||
1373              vs_prog_data->uses_baseinstance) {
1374             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1375             /* Emitting draw index or vertex index BOs may result in needing
1376              * additional VF cache flushes.
1377              */
1378             genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1379          }
1380          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1381             const uint32_t count =
1382                draw->indexCount * instanceCount * pipeline->instance_multiplier;
1383             anv_measure_snapshot(cmd_buffer,
1384                                  INTEL_SNAPSHOT_DRAW,
1385                                  "draw indexed multi",
1386                                  count);
1387             trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1388 
1389             cmd_buffer_pre_draw_wa(cmd_buffer);
1390 
1391             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1392                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1393                prim.VertexAccessType         = RANDOM;
1394                prim.VertexCountPerInstance   = draw->indexCount;
1395                prim.StartVertexLocation      = draw->firstIndex;
1396                prim.InstanceCount            = instanceCount *
1397                                                pipeline->instance_multiplier;
1398                prim.StartInstanceLocation    = firstInstance;
1399                prim.BaseVertexLocation       = *pVertexOffset;
1400             }
1401 
1402             cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1403                                     pIndexInfo[drawCount - 1].indexCount,
1404                                     RANDOM);
1405 
1406             trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1407                                                pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1408                                                pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1409          }
1410       }
1411    } else {
1412       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1413          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1414                                                     draw->vertexOffset,
1415                                                     firstInstance, i, i != 0);
1416 
1417          const uint32_t count =
1418             draw->indexCount * instanceCount * pipeline->instance_multiplier;
1419          anv_measure_snapshot(cmd_buffer,
1420                               INTEL_SNAPSHOT_DRAW,
1421                               "draw indexed multi",
1422                               count);
1423          trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1424 
1425          cmd_buffer_pre_draw_wa(cmd_buffer);
1426 
1427          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1428             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1429             prim.VertexAccessType         = RANDOM;
1430             prim.VertexCountPerInstance   = draw->indexCount;
1431             prim.StartVertexLocation      = draw->firstIndex;
1432             prim.InstanceCount            = instanceCount *
1433                                             pipeline->instance_multiplier;
1434             prim.StartInstanceLocation    = firstInstance;
1435             prim.BaseVertexLocation       = draw->vertexOffset;
1436          }
1437 
1438          cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1439                                  pIndexInfo[drawCount - 1].indexCount,
1440                                  RANDOM);
1441 
1442          trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1443                                              pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1444                                              pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1445       }
1446    }
1447 #else
1448    vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1449       const uint32_t count =
1450          draw->indexCount * instanceCount * pipeline->instance_multiplier;
1451       anv_measure_snapshot(cmd_buffer,
1452                            INTEL_SNAPSHOT_DRAW,
1453                            "draw indexed multi",
1454                            count);
1455       trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1456 
1457       cmd_buffer_pre_draw_wa(cmd_buffer);
1458 
1459       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
1460 #if GFX_VERx10 >= 125
1461          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1462 #endif
1463          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1464          prim.VertexAccessType         = RANDOM;
1465          prim.VertexCountPerInstance   = draw->indexCount;
1466          prim.StartVertexLocation      = draw->firstIndex;
1467          prim.InstanceCount            = instanceCount *
1468                                          pipeline->instance_multiplier;
1469          prim.StartInstanceLocation    = firstInstance;
1470          prim.BaseVertexLocation       = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1471          prim.ExtendedParametersPresent = true;
1472          prim.ExtendedParameter0       = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1473          prim.ExtendedParameter1       = firstInstance;
1474          prim.ExtendedParameter2       = i;
1475       }
1476 
1477       cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1478                               pIndexInfo[drawCount - 1].indexCount,
1479                               RANDOM);
1480 
1481       trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1482                                          pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1483                                          pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1484    }
1485 #endif
1486 }
1487 
1488 /* Auto-Draw / Indirect Registers */
1489 #define GFX7_3DPRIM_END_OFFSET          0x2420
1490 #define GFX7_3DPRIM_START_VERTEX        0x2430
1491 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
1492 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
1493 #define GFX7_3DPRIM_START_INSTANCE      0x243C
1494 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
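
/* These MMIO registers are loaded with MI commands (see
 * load_indirect_parameters() and genX(CmdDrawIndirectByteCountEXT) below)
 * and are then consumed by 3DPRIMITIVE packets emitted with
 * IndirectParameterEnable set.
 */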
1495 
1496 /* On Gen11+, we have three custom "extended parameters" which we can use to
1497  * provide extra system-generated values to shaders.  Our assignment of these
1498  * is arbitrary; we choose to assign them as follows:
1499  *
1500  *    gl_BaseVertex = XP0
1501  *    gl_BaseInstance = XP1
1502  *    gl_DrawID = XP2
1503  *
1504  * For gl_BaseInstance, we never actually have to set up the value because we
1505  * can just program 3DSTATE_VF_SGVS_2 to load it implicitly.  We can also do
1506  * that for gl_BaseVertex but it does the wrong thing for indexed draws.
1507  */
1508 #define GEN11_3DPRIM_XP0                0x2690
1509 #define GEN11_3DPRIM_XP1                0x2694
1510 #define GEN11_3DPRIM_XP2                0x2698
1511 #define GEN11_3DPRIM_XP_BASE_VERTEX     GEN11_3DPRIM_XP0
1512 #define GEN11_3DPRIM_XP_BASE_INSTANCE   GEN11_3DPRIM_XP1
1513 #define GEN11_3DPRIM_XP_DRAW_ID         GEN11_3DPRIM_XP2
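
/* For example, the direct vkCmdDrawIndexed() path above programs
 * XP0 = vertexOffset, XP1 = firstInstance and XP2 = 0, while the multi-draw
 * paths set XP2 to the loop index so shaders can read gl_DrawID.
 */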
1514 
1515 void genX(CmdDrawIndirectByteCountEXT)(
1516     VkCommandBuffer                             commandBuffer,
1517     uint32_t                                    instanceCount,
1518     uint32_t                                    firstInstance,
1519     VkBuffer                                    counterBuffer,
1520     VkDeviceSize                                counterBufferOffset,
1521     uint32_t                                    counterOffset,
1522     uint32_t                                    vertexStride)
1523 {
1524    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1525    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
1526    struct anv_graphics_pipeline *pipeline =
1527       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1528 
1529    /* firstVertex is always zero for this draw function */
1530    const uint32_t firstVertex = 0;
1531 
1532    if (anv_batch_has_error(&cmd_buffer->batch))
1533       return;
1534 
1535    anv_measure_snapshot(cmd_buffer,
1536                         INTEL_SNAPSHOT_DRAW,
1537                         "draw indirect byte count",
1538                         instanceCount * pipeline->instance_multiplier);
1539    trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
1540 
1541    /* Select pipeline here to allow
1542     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1543     * emit_base_vertex_instance() & emit_draw_index().
1544     */
1545    genX(flush_pipeline_select_3d)(cmd_buffer);
1546 
1547 #if GFX_VER < 11
1548    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1549    if (vs_prog_data->uses_firstvertex ||
1550        vs_prog_data->uses_baseinstance)
1551       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1552    if (vs_prog_data->uses_drawid)
1553       emit_draw_index(cmd_buffer, 0);
1554 #endif
1555 
1556    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1557 
1558    if (cmd_buffer->state.conditional_render_enabled)
1559       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1560 
1561    struct mi_builder b;
1562    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1563    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
1564    mi_builder_set_mocs(&b, mocs);
1565    struct mi_value count =
1566       mi_mem32(anv_address_add(counter_buffer->address,
1567                                    counterBufferOffset));
1568    if (counterOffset)
1569       count = mi_isub(&b, count, mi_imm(counterOffset));
1570    count = mi_udiv32_imm(&b, count, vertexStride);
1571    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
1572 
1573    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
1574    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
1575             mi_imm(instanceCount * pipeline->instance_multiplier));
1576    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
1577    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1578 
1579 #if GFX_VER >= 11
1580    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1581                 mi_imm(firstVertex));
1582    /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1583    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
1584 #endif
1585 
1586    cmd_buffer_pre_draw_wa(cmd_buffer);
1587 
1588    anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1589 #if GFX_VERx10 >= 125
1590       prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1591 #endif
1592       prim.IndirectParameterEnable  = true;
1593       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1594       prim.VertexAccessType         = SEQUENTIAL;
1595 #if GFX_VER >= 11
1596       prim.ExtendedParametersPresent = true;
1597 #endif
1598    }
1599 
1600    cmd_buffer_post_draw_wa(cmd_buffer, 1, SEQUENTIAL);
1601 
1602    trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
1603                                             instanceCount * pipeline->instance_multiplier,
1604                                             pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1605                                             pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1606 }
1607 
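/* Loads the 3DPRIMITIVE indirect registers from a VkDrawIndirectCommand
 * (vertexCount, instanceCount, firstVertex, firstInstance at dword offsets
 * 0/4/8/12) or a VkDrawIndexedIndirectCommand (indexCount, instanceCount,
 * firstIndex, vertexOffset, firstInstance at offsets 0/4/8/12/16), which is
 * why the offsets used below differ between the indexed and non-indexed
 * cases.
 */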
1608 static void
1609 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
1610                          struct anv_address addr,
1611                          bool indexed,
1612                          uint32_t draw_id)
1613 {
1614    struct anv_graphics_pipeline *pipeline =
1615       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1616 
1617    struct mi_builder b;
1618    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1619    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
1620    mi_builder_set_mocs(&b, mocs);
1621 
1622    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
1623                 mi_mem32(anv_address_add(addr, 0)));
1624 
1625    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
1626    if (pipeline->instance_multiplier > 1) {
1627       instance_count = mi_imul_imm(&b, instance_count,
1628                                    pipeline->instance_multiplier);
1629    }
1630    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
1631 
1632    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
1633                 mi_mem32(anv_address_add(addr, 8)));
1634 
1635    if (indexed) {
1636       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
1637                    mi_mem32(anv_address_add(addr, 12)));
1638       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1639                    mi_mem32(anv_address_add(addr, 16)));
1640 #if GFX_VER >= 11
1641       mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1642                    mi_mem32(anv_address_add(addr, 12)));
1643       /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1644 #endif
1645    } else {
1646       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1647                    mi_mem32(anv_address_add(addr, 12)));
1648       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1649 #if GFX_VER >= 11
1650       mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1651                    mi_mem32(anv_address_add(addr, 8)));
1652       /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1653 #endif
1654    }
1655 
1656 #if GFX_VER >= 11
1657    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
1658                 mi_imm(draw_id));
1659 #endif
1660 }
1661 
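/* EXECUTE_INDIRECT_DRAW lets the hardware unroll the indirect draws itself
 * (see genX(cmd_buffer_emit_execute_indirect_draws) below), but that path
 * cannot supply the gl_DrawID/firstVertex/baseInstance system values to the
 * shaders and does not handle the multiview instance multiplier, hence the
 * checks below.
 */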
1662 static const inline bool
1663 execute_indirect_draw_supported(const struct anv_cmd_buffer *cmd_buffer)
1664 {
1665 #if GFX_VERx10 >= 125
1666    const struct intel_device_info *devinfo = cmd_buffer->device->info;
1667 
1668    if (!devinfo->has_indirect_unroll)
1669       return false;
1670 
1671    struct anv_graphics_pipeline *pipeline =
1672       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1673    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1674    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
1675    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
1676    const bool is_multiview = pipeline->instance_multiplier > 1;
1677 
1678    const bool uses_draw_id =
1679       (vs_prog_data && vs_prog_data->uses_drawid) ||
1680       (mesh_prog_data && mesh_prog_data->uses_drawid) ||
1681       (task_prog_data && task_prog_data->uses_drawid);
1682 
1683    const bool uses_firstvertex =
1684       (vs_prog_data && vs_prog_data->uses_firstvertex);
1685 
1686    const bool uses_baseinstance =
1687       (vs_prog_data && vs_prog_data->uses_baseinstance);
1688 
1689    return !is_multiview &&
1690           !uses_draw_id &&
1691           !uses_firstvertex &&
1692           !uses_baseinstance;
1693 #else
1694    return false;
1695 #endif
1696 }
1697 
1698 static void
1699 emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
1700                     struct anv_address indirect_data_addr,
1701                     uint32_t indirect_data_stride,
1702                     uint32_t draw_count,
1703                     bool indexed)
1704 {
1705 #if GFX_VER < 11
1706    struct anv_graphics_pipeline *pipeline =
1707       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1708    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1709 #endif
1710    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1711 
1712    if (cmd_buffer->state.conditional_render_enabled)
1713       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1714 
1715    uint32_t offset = 0;
1716    for (uint32_t i = 0; i < draw_count; i++) {
1717       struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1718 
1719 #if GFX_VER < 11
1720       /* TODO: We need to stomp base vertex to 0 somehow */
1721 
1722       /* With sequential draws, we're dealing with the VkDrawIndirectCommand
1723        * structure data. We want to load VkDrawIndirectCommand::firstVertex at
1724        * offset 8 in the structure.
1725        *
1726        * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
1727        * We want the VkDrawIndirectCommand::vertexOffset field at offset 12 in
1728        * the structure.
1729        */
1730       if (vs_prog_data->uses_firstvertex ||
1731           vs_prog_data->uses_baseinstance) {
1732          emit_base_vertex_instance_bo(cmd_buffer,
1733                                       anv_address_add(draw, indexed ? 12 : 8));
1734       }
1735       if (vs_prog_data->uses_drawid)
1736          emit_draw_index(cmd_buffer, i);
1737 #endif
1738 
1739       /* Emitting draw index or vertex index BOs may result in needing
1740        * additional VF cache flushes.
1741        */
1742       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1743 
1744       load_indirect_parameters(cmd_buffer, draw, indexed, i);
1745 
1746       cmd_buffer_pre_draw_wa(cmd_buffer);
1747 
1748       anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1749 #if GFX_VERx10 >= 125
1750          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1751 #endif
1752          prim.IndirectParameterEnable  = true;
1753          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1754          prim.VertexAccessType         = indexed ? RANDOM : SEQUENTIAL;
1755 #if GFX_VER >= 11
1756          prim.ExtendedParametersPresent = true;
1757 #endif
1758       }
1759 
1760       cmd_buffer_post_draw_wa(cmd_buffer, 1, indexed ? RANDOM : SEQUENTIAL);
1761 
1762       offset += indirect_data_stride;
1763    }
1764 }
1765 
1766 static inline const uint32_t xi_argument_format_for_vk_cmd(enum vk_cmd_type cmd)
1767 {
1768 #if GFX_VERx10 >= 125
1769    switch (cmd) {
1770       case VK_CMD_DRAW_INDIRECT:
1771       case VK_CMD_DRAW_INDIRECT_COUNT:
1772          return XI_DRAW;
1773       case VK_CMD_DRAW_INDEXED_INDIRECT:
1774       case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1775          return XI_DRAWINDEXED;
1776       case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1777       case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1778          return XI_MESH_3D;
1779       default:
1780          unreachable("unhandled cmd type");
1781    }
1782 #else
1783    unreachable("unsupported GFX VER");
1784 #endif
1785 }
1786 
1787 static inline bool
1788 cmd_buffer_set_indirect_stride(struct anv_cmd_buffer *cmd_buffer,
1789                                uint32_t stride, enum vk_cmd_type cmd)
1790 {
1791    /* Should have been sanitized by the caller */
1792    assert(stride != 0);
1793 
1794    uint32_t data_stride = 0;
1795 
1796    switch (cmd) {
1797    case VK_CMD_DRAW_INDIRECT:
1798    case VK_CMD_DRAW_INDIRECT_COUNT:
1799       data_stride = sizeof(VkDrawIndirectCommand);
1800       break;
1801    case VK_CMD_DRAW_INDEXED_INDIRECT:
1802    case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1803       data_stride = sizeof(VkDrawIndexedIndirectCommand);
1804       break;
1805    case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1806    case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1807       data_stride = sizeof(VkDrawMeshTasksIndirectCommandEXT);
1808       break;
1809    default:
1810       unreachable("unhandled cmd type");
1811    }
1812 
1813    bool aligned = stride == data_stride;
1814 
1815 #if GFX_VER >= 20
1816    /* As long as the stride matches the default command stride,
1817     * STATE_BYTE_STRIDE::ByteStrideEnable stays false and we can do nothing.
1818     *
1819     * Otherwise STATE_BYTE_STRIDE::ByteStrideEnable=true and any stride
1820     * change has to be signaled.
1821     */
1822    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
1823    if (gfx_state->indirect_data_stride_aligned != aligned) {
1824       gfx_state->indirect_data_stride = stride;
1825       gfx_state->indirect_data_stride_aligned = aligned;
1826       gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1827    } else if (!gfx_state->indirect_data_stride_aligned &&
1828               gfx_state->indirect_data_stride != stride) {
1829       gfx_state->indirect_data_stride = stride;
1830       gfx_state->indirect_data_stride_aligned = aligned;
1831       gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1832    }
1833 #endif
1834 
1835    return aligned;
1836 }
1837 
1838 static void
1839 genX(cmd_buffer_emit_execute_indirect_draws)(struct anv_cmd_buffer *cmd_buffer,
1840                                              struct anv_address indirect_data_addr,
1841                                              uint32_t indirect_data_stride,
1842                                              struct anv_address count_addr,
1843                                              uint32_t max_draw_count,
1844                                              enum vk_cmd_type cmd)
1845 {
1846 #if GFX_VERx10 >= 125
1847    bool aligned_stride =
1848       cmd_buffer_set_indirect_stride(cmd_buffer, indirect_data_stride, cmd);
1849 
1850    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1851 
1852    if (cmd_buffer->state.conditional_render_enabled)
1853       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1854 
1855    uint32_t offset = 0;
1856    for (uint32_t i = 0; i < max_draw_count; i++) {
1857       struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1858 
1859       cmd_buffer_pre_draw_wa(cmd_buffer);
1860 
1861       anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
1862          ind.ArgumentFormat             = xi_argument_format_for_vk_cmd(cmd);
1863          ind.TBIMREnabled               = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1864          ind.PredicateEnable            =
1865             cmd_buffer->state.conditional_render_enabled;
1866          ind.MaxCount                   = aligned_stride ? max_draw_count : 1;
1867          ind.ArgumentBufferStartAddress = draw;
1868          ind.CountBufferAddress         = count_addr;
1869          ind.CountBufferIndirectEnable  = !anv_address_is_null(count_addr);
1870          ind.MOCS                       =
1871             anv_mocs(cmd_buffer->device, draw.bo, 0);
1872 
1873       }
1874 
1875       cmd_buffer_post_draw_wa(cmd_buffer, 1,
1876                               0 /* Doesn't matter for GFX_VER > 9 */);
1877 
1878       /* If all the indirect structures are aligned, then we can let the HW
1879        * do the unrolling and we only need one instruction. Otherwise we
1880        * need to emit one instruction per draw, but we're still avoiding
1881        * the register loads with MI commands.
1882        */
1883       if (aligned_stride || GFX_VER >= 20)
1884          break;
1885 
1886       offset += indirect_data_stride;
1887    }
1888 #endif // GFX_VERx10 >= 125
1889 }
1890 void genX(CmdDrawIndirect)(
1891     VkCommandBuffer                             commandBuffer,
1892     VkBuffer                                    _buffer,
1893     VkDeviceSize                                offset,
1894     uint32_t                                    drawCount,
1895     uint32_t                                    stride)
1896 {
1897    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1898    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1899    struct anv_graphics_pipeline *pipeline =
1900       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1901 
1902    if (anv_batch_has_error(&cmd_buffer->batch))
1903       return;
1904 
1905    anv_measure_snapshot(cmd_buffer,
1906                         INTEL_SNAPSHOT_DRAW,
1907                         "draw indirect",
1908                         drawCount);
1909    trace_intel_begin_draw_indirect(&cmd_buffer->trace);
1910 
1911    struct anv_address indirect_data_addr =
1912       anv_address_add(buffer->address, offset);
1913 
1914    stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
1915 
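   /* Three possible paths, in order of preference: let the hardware unroll
    * the indirect data (EXECUTE_INDIRECT_DRAW), use the generated-draws path
    * (genX(cmd_buffer_emit_indirect_generated_draws)), or fall back to one
    * MI register-load sequence plus 3DPRIMITIVE per draw.
    */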
1916    if (execute_indirect_draw_supported(cmd_buffer)) {
1917       genX(cmd_buffer_emit_execute_indirect_draws)(
1918          cmd_buffer,
1919          indirect_data_addr,
1920          stride,
1921          ANV_NULL_ADDRESS /* count_addr */,
1922          drawCount,
1923          VK_CMD_DRAW_INDIRECT);
1924    } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1925       genX(cmd_buffer_emit_indirect_generated_draws)(
1926          cmd_buffer,
1927          indirect_data_addr,
1928          stride,
1929          ANV_NULL_ADDRESS /* count_addr */,
1930          drawCount,
1931          false /* indexed */);
1932    } else {
1933       emit_indirect_draws(cmd_buffer,
1934                           indirect_data_addr,
1935                           stride, drawCount, false /* indexed */);
1936    }
1937 
1938    trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount,
1939                                  pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1940                                  pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1941 }
1942 
1943 void genX(CmdDrawIndexedIndirect)(
1944     VkCommandBuffer                             commandBuffer,
1945     VkBuffer                                    _buffer,
1946     VkDeviceSize                                offset,
1947     uint32_t                                    drawCount,
1948     uint32_t                                    stride)
1949 {
1950    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1951    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1952    struct anv_graphics_pipeline *pipeline =
1953       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1954 
1955    if (anv_batch_has_error(&cmd_buffer->batch))
1956       return;
1957 
1958    anv_measure_snapshot(cmd_buffer,
1959                         INTEL_SNAPSHOT_DRAW,
1960                         "draw indexed indirect",
1961                         drawCount);
1962    trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
1963 
1964    struct anv_address indirect_data_addr =
1965       anv_address_add(buffer->address, offset);
1966 
1967    stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
1968 
1969    if (execute_indirect_draw_supported(cmd_buffer)) {
1970       genX(cmd_buffer_emit_execute_indirect_draws)(
1971          cmd_buffer,
1972          indirect_data_addr,
1973          stride,
1974          ANV_NULL_ADDRESS /* count_addr */,
1975          drawCount,
1976          VK_CMD_DRAW_INDEXED_INDIRECT);
1977    } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1978       genX(cmd_buffer_emit_indirect_generated_draws)(
1979          cmd_buffer,
1980          indirect_data_addr,
1981          stride,
1982          ANV_NULL_ADDRESS /* count_addr */,
1983          drawCount,
1984          true /* indexed */);
1985    } else {
1986       emit_indirect_draws(cmd_buffer,
1987                           indirect_data_addr,
1988                           stride, drawCount, true /* indexed */);
1989    }
1990 
1991    trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount,
1992                                          pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1993                                          pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1994 }
1995 
1996 #define MI_PREDICATE_SRC0    0x2400
1997 #define MI_PREDICATE_SRC1    0x2408
1998 #define MI_PREDICATE_RESULT  0x2418
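
/* MI_PREDICATE source/result registers.  The emit_draw_count_predicate*()
 * helpers below use them so that per-draw 3DPRIMITIVEs past the GPU-side
 * draw count are predicated out.
 */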
1999 
2000 static struct mi_value
2001 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
2002                                  struct mi_builder *b,
2003                                  struct anv_address count_address)
2004 {
2005    struct mi_value ret = mi_imm(0);
2006 
2007    if (cmd_buffer->state.conditional_render_enabled) {
2008       ret = mi_new_gpr(b);
2009       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
2010    } else {
2011       /* Upload the current draw count from the draw parameters buffer to
2012        * MI_PREDICATE_SRC0.
2013        */
2014       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
2015       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
2016    }
2017 
2018    return ret;
2019 }
2020 
2021 static void
2022 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
2023                           struct mi_builder *b,
2024                           uint32_t draw_index)
2025 {
2026    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
2027    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
2028 
2029    if (draw_index == 0) {
2030       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
2031          mip.LoadOperation    = LOAD_LOADINV;
2032          mip.CombineOperation = COMBINE_SET;
2033          mip.CompareOperation = COMPARE_SRCS_EQUAL;
2034       }
2035    } else {
2036       /* While draw_index < draw_count the predicate's result will be
2037        *  (draw_index == draw_count) ^ TRUE = TRUE
2038        * When draw_index == draw_count the result is
2039        *  (TRUE) ^ TRUE = FALSE
2040        * After this all results will be:
2041        *  (FALSE) ^ FALSE = FALSE
2042        */
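      /* For example, with a stored draw count of 2 and max_draw_count of 4,
       * the result sequence is TRUE, TRUE, FALSE, FALSE, so only the first
       * two draws actually execute.
       */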
2043       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
2044          mip.LoadOperation    = LOAD_LOAD;
2045          mip.CombineOperation = COMBINE_XOR;
2046          mip.CompareOperation = COMPARE_SRCS_EQUAL;
2047       }
2048    }
2049 }
2050 
2051 static void
2052 emit_draw_count_predicate_with_conditional_render(
2053                           struct anv_cmd_buffer *cmd_buffer,
2054                           struct mi_builder *b,
2055                           uint32_t draw_index,
2056                           struct mi_value max)
2057 {
2058    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
2059    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
2060 
2061    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
2062 }
2063 
2064 static void
2065 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
2066                                struct mi_builder *b,
2067                                uint32_t draw_index,
2068                                struct mi_value max)
2069 {
2070    if (cmd_buffer->state.conditional_render_enabled) {
2071       emit_draw_count_predicate_with_conditional_render(
2072             cmd_buffer, b, draw_index, mi_value_ref(b, max));
2073    } else {
2074       emit_draw_count_predicate(cmd_buffer, b, draw_index);
2075    }
2076 }
2077 
2078 static void
2079 emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
2080                           struct anv_address indirect_data_addr,
2081                           uint64_t indirect_data_stride,
2082                           struct anv_address draw_count_addr,
2083                           uint32_t max_draw_count,
2084                           bool indexed)
2085 {
2086 #if GFX_VER < 11
2087    struct anv_graphics_pipeline *pipeline =
2088       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2089    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
2090 #endif
2091 
2092    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2093 
2094    struct mi_builder b;
2095    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2096    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
2097    mi_builder_set_mocs(&b, mocs);
2098    struct mi_value max =
2099       prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
2100 
2101    for (uint32_t i = 0; i < max_draw_count; i++) {
2102       struct anv_address draw =
2103          anv_address_add(indirect_data_addr, i * indirect_data_stride);
2104 
2105       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2106 
2107 #if GFX_VER < 11
2108       if (vs_prog_data->uses_firstvertex ||
2109           vs_prog_data->uses_baseinstance) {
2110          emit_base_vertex_instance_bo(cmd_buffer,
2111                                       anv_address_add(draw, indexed ? 12 : 8));
2112       }
2113       if (vs_prog_data->uses_drawid)
2114          emit_draw_index(cmd_buffer, i);
2115 
2116       /* Emitting draw index or vertex index BOs may result in needing
2117        * additional VF cache flushes.
2118        */
2119       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2120 #endif
2121 
2122       load_indirect_parameters(cmd_buffer, draw, indexed, i);
2123 
2124       cmd_buffer_pre_draw_wa(cmd_buffer);
2125 
2126       anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
2127 #if GFX_VERx10 >= 125
2128          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
2129 #endif
2130          prim.IndirectParameterEnable  = true;
2131          prim.PredicateEnable          = true;
2132          prim.VertexAccessType         = indexed ? RANDOM : SEQUENTIAL;
2133 #if GFX_VER >= 11
2134          prim.ExtendedParametersPresent = true;
2135 #endif
2136       }
2137 
2138       cmd_buffer_post_draw_wa(cmd_buffer, 1, SEQUENTIAL);
2139    }
2140 
2141    mi_value_unref(&b, max);
2142 }
2143 
2144 void genX(CmdDrawIndirectCount)(
2145     VkCommandBuffer                             commandBuffer,
2146     VkBuffer                                    _buffer,
2147     VkDeviceSize                                offset,
2148     VkBuffer                                    _countBuffer,
2149     VkDeviceSize                                countBufferOffset,
2150     uint32_t                                    maxDrawCount,
2151     uint32_t                                    stride)
2152 {
2153    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2154    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2155    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2156    struct anv_graphics_pipeline *pipeline =
2157       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2158 
2159    if (anv_batch_has_error(&cmd_buffer->batch))
2160       return;
2161 
2162    anv_measure_snapshot(cmd_buffer,
2163                         INTEL_SNAPSHOT_DRAW,
2164                         "draw indirect count",
2165                         0);
2166    trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
2167 
2168    struct anv_address indirect_data_address =
2169       anv_address_add(buffer->address, offset);
2170    struct anv_address count_address =
2171       anv_address_add(count_buffer->address, countBufferOffset);
2172    stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
2173 
2174    if (execute_indirect_draw_supported(cmd_buffer)) {
2175       genX(cmd_buffer_emit_execute_indirect_draws)(
2176          cmd_buffer,
2177          indirect_data_address,
2178          stride,
2179          count_address,
2180          maxDrawCount,
2181          VK_CMD_DRAW_INDIRECT_COUNT);
2182    } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2183       genX(cmd_buffer_emit_indirect_generated_draws)(
2184          cmd_buffer,
2185          indirect_data_address,
2186          stride,
2187          count_address,
2188          maxDrawCount,
2189          false /* indexed */);
2190    } else {
2191       emit_indirect_count_draws(cmd_buffer,
2192                                 indirect_data_address,
2193                                 stride,
2194                                 count_address,
2195                                 maxDrawCount,
2196                                 false /* indexed */);
2197    }
2198 
2199    trace_intel_end_draw_indirect_count(&cmd_buffer->trace,
2200                                        anv_address_utrace(count_address),
2201                                        pipeline->base.source_hashes[MESA_SHADER_VERTEX],
2202                                        pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
2203 }
2204 
2205 void genX(CmdDrawIndexedIndirectCount)(
2206     VkCommandBuffer                             commandBuffer,
2207     VkBuffer                                    _buffer,
2208     VkDeviceSize                                offset,
2209     VkBuffer                                    _countBuffer,
2210     VkDeviceSize                                countBufferOffset,
2211     uint32_t                                    maxDrawCount,
2212     uint32_t                                    stride)
2213 {
2214    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2215    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2216    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2217    struct anv_graphics_pipeline *pipeline =
2218       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2219 
2220    if (anv_batch_has_error(&cmd_buffer->batch))
2221       return;
2222 
2223    anv_measure_snapshot(cmd_buffer,
2224                         INTEL_SNAPSHOT_DRAW,
2225                         "draw indexed indirect count",
2226                         0);
2227    trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
2228 
2229    struct anv_address indirect_data_address =
2230       anv_address_add(buffer->address, offset);
2231    struct anv_address count_address =
2232       anv_address_add(count_buffer->address, countBufferOffset);
2233    stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
2234 
2235    if (execute_indirect_draw_supported(cmd_buffer)) {
2236       genX(cmd_buffer_emit_execute_indirect_draws)(
2237          cmd_buffer,
2238          indirect_data_address,
2239          stride,
2240          count_address,
2241          maxDrawCount,
2242          VK_CMD_DRAW_INDEXED_INDIRECT_COUNT);
2243    } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2244       genX(cmd_buffer_emit_indirect_generated_draws)(
2245          cmd_buffer,
2246          indirect_data_address,
2247          stride,
2248          count_address,
2249          maxDrawCount,
2250          true /* indexed */);
2251    } else {
2252       emit_indirect_count_draws(cmd_buffer,
2253                                 indirect_data_address,
2254                                 stride,
2255                                 count_address,
2256                                 maxDrawCount,
2257                                 true /* indexed */);
2258    }
2259 
2260    trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
2261                                                anv_address_utrace(count_address),
2262                                                pipeline->base.source_hashes[MESA_SHADER_VERTEX],
2263                                                pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
2264 
2265 }
2266 
2267 void genX(CmdBeginTransformFeedbackEXT)(
2268     VkCommandBuffer                             commandBuffer,
2269     uint32_t                                    firstCounterBuffer,
2270     uint32_t                                    counterBufferCount,
2271     const VkBuffer*                             pCounterBuffers,
2272     const VkDeviceSize*                         pCounterBufferOffsets)
2273 {
2274    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2275 
2276    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2277    assert(counterBufferCount <= MAX_XFB_BUFFERS);
2278    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2279 
2280    trace_intel_begin_xfb(&cmd_buffer->trace);
2281 
2282    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2283     *
2284     *    "Ssoftware must ensure that no HW stream output operations can be in
2285     *    process or otherwise pending at the point that the MI_LOAD/STORE
2286     *    commands are processed. This will likely require a pipeline flush."
2287     */
2288    anv_add_pending_pipe_bits(cmd_buffer,
2289                              ANV_PIPE_CS_STALL_BIT,
2290                              "begin transform feedback");
2291    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2292 
2293    struct mi_builder b;
2294    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2295 
2296    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
2297       /* If we have a counter buffer, this is a resume so we need to load the
2298        * value into the streamout offset register.  Otherwise, this is a begin
2299        * and we need to reset it to zero.
2300        */
2301       if (pCounterBuffers &&
2302           idx >= firstCounterBuffer &&
2303           idx - firstCounterBuffer < counterBufferCount &&
2304           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
2305          uint32_t cb_idx = idx - firstCounterBuffer;
2306          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2307          uint64_t offset = pCounterBufferOffsets ?
2308                            pCounterBufferOffsets[cb_idx] : 0;
2309          mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4),
2310                   mi_mem32(anv_address_add(counter_buffer->address, offset)));
2311       } else {
2312          mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4),
2313                   mi_imm(0));
2314       }
2315    }
2316 
2317    cmd_buffer->state.xfb_enabled = true;
2318    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2319 }
2320 
2321 void genX(CmdEndTransformFeedbackEXT)(
2322     VkCommandBuffer                             commandBuffer,
2323     uint32_t                                    firstCounterBuffer,
2324     uint32_t                                    counterBufferCount,
2325     const VkBuffer*                             pCounterBuffers,
2326     const VkDeviceSize*                         pCounterBufferOffsets)
2327 {
2328    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2329 
2330    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2331    assert(counterBufferCount <= MAX_XFB_BUFFERS);
2332    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2333 
2334    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2335     *
2336     *    "Ssoftware must ensure that no HW stream output operations can be in
2337     *    process or otherwise pending at the point that the MI_LOAD/STORE
2338     *    commands are processed. This will likely require a pipeline flush."
2339     */
2340    anv_add_pending_pipe_bits(cmd_buffer,
2341                              ANV_PIPE_CS_STALL_BIT,
2342                              "end transform feedback");
2343    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2344 
2345    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
2346       unsigned idx = firstCounterBuffer + cb_idx;
2347 
2348       /* If we have a counter buffer, this is a pause so we need to store the
2349        * current value of the streamout offset register into it, allowing a
2350        * later resume to reload it.  Otherwise there is nothing to save.
2351        */
2352       if (pCounterBuffers &&
2353           cb_idx < counterBufferCount &&
2354           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
2355          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2356          uint64_t offset = pCounterBufferOffsets ?
2357                            pCounterBufferOffsets[cb_idx] : 0;
2358 
2359          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2360             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
2361                                                    offset);
2362             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2363          }
2364       }
2365    }
2366 
2367    trace_intel_end_xfb(&cmd_buffer->trace);
2368 
2369    cmd_buffer->state.xfb_enabled = false;
2370    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2371 }
2372 
2373 #if GFX_VERx10 >= 125
2374 
2375 void
2376 genX(CmdDrawMeshTasksEXT)(
2377       VkCommandBuffer commandBuffer,
2378       uint32_t x,
2379       uint32_t y,
2380       uint32_t z)
2381 {
2382    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2383 
2384    if (anv_batch_has_error(&cmd_buffer->batch))
2385       return;
2386 
2387    anv_measure_snapshot(cmd_buffer,
2388                         INTEL_SNAPSHOT_DRAW,
2389                         "draw mesh", x * y * z);
2390 
2391    trace_intel_begin_draw_mesh(&cmd_buffer->trace);
2392 
2393    /* TODO(mesh): Check if this is not emitting more packets than we need. */
2394    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2395 
2396    if (cmd_buffer->state.conditional_render_enabled)
2397       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2398 
2399    anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
2400       m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
2401       m.ThreadGroupCountX = x;
2402       m.ThreadGroupCountY = y;
2403       m.ThreadGroupCountZ = z;
2404    }
2405 
2406    trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
2407 }
2408 
2409 #define GFX125_3DMESH_TG_COUNT 0x26F0
2410 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
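
/* For indirect mesh draws, groupCountX goes to the thread-group count
 * register while groupCountY/Z travel through extended parameters XP1/XP2;
 * XP0 optionally carries the draw index when a shader uses gl_DrawID.
 */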
2411 
2412 static void
2413 mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
2414                                         struct mi_builder *b,
2415                                         struct anv_address addr,
2416                                         bool emit_xp0,
2417                                         uint32_t xp0)
2418 {
2419    const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
2420    const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
2421    const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
2422 
2423    mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
2424                mi_mem32(anv_address_add(addr, groupCountXOff)));
2425 
2426    mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
2427                mi_mem32(anv_address_add(addr, groupCountYOff)));
2428 
2429    mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
2430                mi_mem32(anv_address_add(addr, groupCountZOff)));
2431 
2432    if (emit_xp0)
2433       mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
2434 }
2435 
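/* The 3DMESH_3D packet grows by one dword when ExtendedParameter0Present is
 * set; that trailing dword is zero-initialized here, and the actual draw
 * index is programmed into GFX10_3DPRIM_XP(0) by
 * mesh_load_indirect_parameters_3dmesh_3d() before this packet is emitted.
 */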
2436 static void
2437 emit_indirect_3dmesh_3d(struct anv_batch *batch,
2438                         bool predicate_enable,
2439                         bool uses_drawid)
2440 {
2441    uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
2442    uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
2443                    .PredicateEnable           = predicate_enable,
2444                    .IndirectParameterEnable   = true,
2445                    .ExtendedParameter0Present = uses_drawid);
2446    if (uses_drawid)
2447       dw[len - 1] = 0;
2448 }
2449 
2450 void
2451 genX(CmdDrawMeshTasksIndirectEXT)(
2452     VkCommandBuffer                             commandBuffer,
2453     VkBuffer                                    _buffer,
2454     VkDeviceSize                                offset,
2455     uint32_t                                    drawCount,
2456     uint32_t                                    stride)
2457 {
2458    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2459    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2460    struct anv_graphics_pipeline *pipeline =
2461       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2462    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2463    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2464    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
2465 
2466    if (anv_batch_has_error(&cmd_buffer->batch))
2467       return;
2468 
2469    anv_measure_snapshot(cmd_buffer,
2470                         INTEL_SNAPSHOT_DRAW,
2471                         "draw mesh indirect", drawCount);
2472 
2473    trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
2474 
2475    if (execute_indirect_draw_supported(cmd_buffer)) {
2476       genX(cmd_buffer_emit_execute_indirect_draws)(
2477          cmd_buffer,
2478          anv_address_add(buffer->address, offset),
2479          MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2480          ANV_NULL_ADDRESS /* count_addr */,
2481          drawCount,
2482          VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT);
2483 
2484       trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2485       return;
2486    }
2487 
2488    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2489 
2490    if (cmd_state->conditional_render_enabled)
2491       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2492 
2493    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2494                        mesh_prog_data->uses_drawid;
2495    struct mi_builder b;
2496    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2497 
2498    for (uint32_t i = 0; i < drawCount; i++) {
2499       struct anv_address draw = anv_address_add(buffer->address, offset);
2500 
2501       mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2502 
2503       emit_indirect_3dmesh_3d(&cmd_buffer->batch,
2504             cmd_state->conditional_render_enabled, uses_drawid);
2505 
2506       offset += stride;
2507    }
2508 
2509    trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2510 }
2511 
2512 void
2513 genX(CmdDrawMeshTasksIndirectCountEXT)(
2514     VkCommandBuffer                             commandBuffer,
2515     VkBuffer                                    _buffer,
2516     VkDeviceSize                                offset,
2517     VkBuffer                                    _countBuffer,
2518     VkDeviceSize                                countBufferOffset,
2519     uint32_t                                    maxDrawCount,
2520     uint32_t                                    stride)
2521 {
2522    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2523    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2524    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2525    struct anv_graphics_pipeline *pipeline =
2526       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2527    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2528    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2529 
2530    if (anv_batch_has_error(&cmd_buffer->batch))
2531       return;
2532 
2533    anv_measure_snapshot(cmd_buffer,
2534                         INTEL_SNAPSHOT_DRAW,
2535                         "draw mesh indirect count", 0);
2536 
2537    trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
2538 
2539    struct anv_address count_addr =
2540       anv_address_add(count_buffer->address, countBufferOffset);
2541 
2542 
2543    if (execute_indirect_draw_supported(cmd_buffer)) {
2544       genX(cmd_buffer_emit_execute_indirect_draws)(
2545          cmd_buffer,
2546          anv_address_add(buffer->address, offset),
2547          MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2548          count_addr /* count_addr */,
2549          maxDrawCount,
2550          VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT);
2551 
2552       trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, anv_address_utrace(count_addr));
2553       return;
2554    }
2555 
2556    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2557 
2558    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2559                        mesh_prog_data->uses_drawid;
2560 
2561    struct mi_builder b;
2562    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2563    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
2564    mi_builder_set_mocs(&b, mocs);
2565 
2566    struct mi_value max =
2567          prepare_for_draw_count_predicate(
2568             cmd_buffer, &b, count_addr);
2569 
2570    for (uint32_t i = 0; i < maxDrawCount; i++) {
2571       struct anv_address draw = anv_address_add(buffer->address, offset);
2572 
2573       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2574 
2575       mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2576 
2577       emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
2578 
2579       offset += stride;
2580    }
2581 
2582    trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace,
2583                                             anv_address_utrace(count_addr));
2584 }
2585 
2586 #endif /* GFX_VERx10 >= 125 */
2587