1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_render_pass.h"
30 #include "vk_util.h"
31 
32 #include "common/intel_aux_map.h"
33 #include "genxml/gen_macros.h"
34 #include "genxml/genX_pack.h"
35 #include "genxml/genX_rt_pack.h"
36 #include "common/intel_genX_state_brw.h"
37 
38 #include "ds/intel_tracepoints.h"
39 
40 /* We reserve:
41  *    - GPR 14 for secondary command buffer returns
42  *    - GPR 15 for conditional rendering
43  */
44 #define MI_BUILDER_NUM_ALLOC_GPRS 14
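/* i.e. the MI builder may allocate GPRs 0..13; GPRs 14 and 15 stay reserved. */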
45 #define __gen_get_batch_dwords anv_batch_emit_dwords
46 #define __gen_address_offset anv_address_add
47 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
48 #include "common/mi_builder.h"
49 
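/* Carve the push constant URB space up between the active graphics stages and
 * emit the 3DSTATE_PUSH_CONSTANT_ALLOC_* packets describing that layout.
 */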
50 static void
51 cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
52 {
53    struct anv_graphics_pipeline *pipeline =
54       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
55    VkShaderStageFlags stages = pipeline->base.base.active_stages;
56 
57    /* In order to avoid thrash, we assume that vertex and fragment stages
58     * always exist.  In the rare case where one is missing *and* the other
59     * uses push constants, this may be suboptimal.  However, avoiding stalls
60     * seems more important.
61     */
62    stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
63    if (anv_pipeline_is_primitive(pipeline))
64       stages |= VK_SHADER_STAGE_VERTEX_BIT;
65 
66    if (stages == cmd_buffer->state.gfx.push_constant_stages)
67       return;
68 
69    unsigned push_constant_kb;
70 
71    const struct intel_device_info *devinfo = cmd_buffer->device->info;
72    if (anv_pipeline_is_mesh(pipeline))
73       push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
74    else
75       push_constant_kb = devinfo->max_constant_urb_size_kb;
76 
77    const unsigned num_stages =
78       util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
79    unsigned size_per_stage = push_constant_kb / num_stages;
80 
81    /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
82     * units of 2KB.  Incidentally, these are the same platforms that have
83     * 32KB worth of push constant space.
84     */
85    if (push_constant_kb == 32)
86       size_per_stage &= ~1u;
87 
88    uint32_t kb_used = 0;
89    for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
90       const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
91       anv_batch_emit(&cmd_buffer->batch,
92                      GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
93          alloc._3DCommandSubOpcode  = 18 + i;
94          alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
95          alloc.ConstantBufferSize   = push_size;
96       }
97       kb_used += push_size;
98    }
99 
100    anv_batch_emit(&cmd_buffer->batch,
101                   GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
102       alloc.ConstantBufferOffset = kb_used;
103       alloc.ConstantBufferSize = push_constant_kb - kb_used;
104    }
105 
106 #if GFX_VERx10 == 125
107    /* DG2: Wa_22011440098
108     * MTL: Wa_18022330953
109     *
110    * In 3D mode, immediately after programming the push constant alloc
111    * command, program a push constant command (ZERO length) without any
112    * commit between them.
113     */
114    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
115       /* Update empty push constants for all stages (bitmask = 11111b) */
116       c.ShaderUpdateEnable = 0x1f;
117       c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
118    }
119 #endif
120 
121    cmd_buffer->state.gfx.push_constant_stages = stages;
122 
123    /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
124     *
125     *    "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
126     *    the next 3DPRIMITIVE command after programming the
127     *    3DSTATE_PUSH_CONSTANT_ALLOC_VS"
128     *
129     * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
130     * pipeline setup, we need to dirty push constants.
131     */
132    cmd_buffer->state.push_constants_dirty |= stages;
133 }
134 
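/* Emit 3DSTATE_SAMPLER_STATE_POINTERS_* (when samplers are allocated) and
 * 3DSTATE_BINDING_TABLE_POINTERS_* for every stage in `stages`.
 */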
135 static void
136 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
137                                     uint32_t stages)
138 {
139    static const uint32_t sampler_state_opcodes[] = {
140       [MESA_SHADER_VERTEX]                      = 43,
141       [MESA_SHADER_TESS_CTRL]                   = 44, /* HS */
142       [MESA_SHADER_TESS_EVAL]                   = 45, /* DS */
143       [MESA_SHADER_GEOMETRY]                    = 46,
144       [MESA_SHADER_FRAGMENT]                    = 47,
145    };
146 
147    static const uint32_t binding_table_opcodes[] = {
148       [MESA_SHADER_VERTEX]                      = 38,
149       [MESA_SHADER_TESS_CTRL]                   = 39,
150       [MESA_SHADER_TESS_EVAL]                   = 40,
151       [MESA_SHADER_GEOMETRY]                    = 41,
152       [MESA_SHADER_FRAGMENT]                    = 42,
153    };
154 
155    anv_foreach_stage(s, stages) {
156       assert(s < ARRAY_SIZE(binding_table_opcodes));
157 
158       if (cmd_buffer->state.samplers[s].alloc_size > 0) {
159          anv_batch_emit(&cmd_buffer->batch,
160                         GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
161             ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
162             ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
163          }
164       }
165 
166       /* Always emit binding table pointers if we're asked to, since on SKL
167        * this is what flushes push constants. */
168       anv_batch_emit(&cmd_buffer->batch,
169                      GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
170          btp._3DCommandSubOpcode = binding_table_opcodes[s];
171          btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
172       }
173    }
174 }
175 
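/* Resolve a push range to the address the HW should read it from: the
 * descriptor set buffer, the command buffer's push constant state, or the
 * bound UBO (including any dynamic offset).
 */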
176 static struct anv_address
177 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
178                        const struct anv_shader_bin *shader,
179                        const struct anv_push_range *range)
180 {
181    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
182    switch (range->set) {
183    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
184       /* This is a descriptor set buffer so the set index is
185        * actually given by binding->binding.  (Yes, that's
186        * confusing.)
187        */
188       struct anv_descriptor_set *set =
189          gfx_state->base.descriptors[range->index];
190       return anv_descriptor_set_address(set);
191    }
192 
193    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
194       if (gfx_state->base.push_constants_state.alloc_size == 0) {
195          gfx_state->base.push_constants_state =
196             anv_cmd_buffer_gfx_push_constants(cmd_buffer);
197       }
198       return anv_state_pool_state_address(
199          &cmd_buffer->device->dynamic_state_pool,
200          gfx_state->base.push_constants_state);
201    }
202 
203    default: {
204       assert(range->set < MAX_SETS);
205       struct anv_descriptor_set *set =
206          gfx_state->base.descriptors[range->set];
207       const struct anv_descriptor *desc =
208          &set->descriptors[range->index];
209 
210       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
211          if (desc->buffer) {
212             return anv_address_add(desc->buffer->address,
213                                    desc->offset);
214          }
215       } else {
216          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
217          if (desc->buffer) {
218             const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
219             uint32_t dynamic_offset =
220                pipe_state->dynamic_offsets[
221                   range->set].offsets[range->dynamic_offset_index];
222             return anv_address_add(desc->buffer->address,
223                                    desc->offset + dynamic_offset);
224          }
225       }
226 
227       /* For NULL UBOs, we just return an address in the workaround BO.  We do
228        * writes to it for workarounds but always at the bottom.  The higher
229        * bytes should be all zeros.
230        */
231       assert(range->length * 32 <= 2048);
232       return (struct anv_address) {
233          .bo = cmd_buffer->device->workaround_bo,
234          .offset = 1024,
235       };
236    }
237    }
238 }
239 
240 
241 /** Returns the size in bytes of the bound buffer
242  *
243  * The range is relative to the start of the buffer, not the start of the
244  * range.  The returned range may be smaller than
245  *
246  *    (range->start + range->length) * 32;
247  */
248 static uint32_t
249 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
250                           const struct anv_shader_bin *shader,
251                           const struct anv_push_range *range)
252 {
253    assert(shader->stage != MESA_SHADER_COMPUTE);
254    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
255    switch (range->set) {
256    case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
257       struct anv_descriptor_set *set =
258          gfx_state->base.descriptors[range->index];
259       struct anv_state state = set->desc_surface_mem;
260       assert(range->start * 32 < state.alloc_size);
261       assert((range->start + range->length) * 32 <= state.alloc_size);
262       return state.alloc_size;
263    }
264 
265    case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
266       return (range->start + range->length) * 32;
267 
268    default: {
269       assert(range->set < MAX_SETS);
270       struct anv_descriptor_set *set =
271          gfx_state->base.descriptors[range->set];
272       const struct anv_descriptor *desc =
273          &set->descriptors[range->index];
274 
275       if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
276          /* Here we promote a UBO to a binding table entry so that we can avoid a layer of indirection.
277           * We use the descriptor set's internally allocated surface state to fill the binding table entry.
278           */
279          if (!desc->buffer)
280             return 0;
281 
282          if (range->start * 32 > desc->bind_range)
283             return 0;
284 
285          return desc->bind_range;
286       } else {
287          if (!desc->buffer)
288             return 0;
289 
290          assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
291          /* Compute the offset within the buffer */
292          const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
293          uint32_t dynamic_offset =
294             pipe_state->dynamic_offsets[
295                range->set].offsets[range->dynamic_offset_index];
296          uint64_t offset = desc->offset + dynamic_offset;
297          /* Clamp to the buffer size */
298          offset = MIN2(offset, desc->buffer->vk.size);
299          /* Clamp the range to the buffer size */
300          uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
301 
302          /* Align the range for consistency */
303          bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
304 
305          return bound_range;
306       }
307    }
308    }
309 }
310 
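/* Emit a single 3DSTATE_CONSTANT_XS packet for one stage, binding up to four
 * push ranges from `buffers`.
 */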
311 static void
312 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
313                               gl_shader_stage stage,
314                               struct anv_address *buffers,
315                               unsigned buffer_count)
316 {
317    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
318    const struct anv_graphics_pipeline *pipeline =
319       anv_pipeline_to_graphics(gfx_state->base.pipeline);
320 
321    static const uint32_t push_constant_opcodes[] = {
322       [MESA_SHADER_VERTEX]                      = 21,
323       [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
324       [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
325       [MESA_SHADER_GEOMETRY]                    = 22,
326       [MESA_SHADER_FRAGMENT]                    = 23,
327    };
328 
329    assert(stage < ARRAY_SIZE(push_constant_opcodes));
330 
331    UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
332 
333    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
334       c._3DCommandSubOpcode = push_constant_opcodes[stage];
335 
336       /* Set MOCS.
337        *
338        * We only have one MOCS field for the whole packet, not one per
339        * buffer.  We could go out of our way here to walk over all of
340        * the buffers and see if any of them are used externally and use
341        * the external MOCS.  However, the notion that someone would use
342        * the same bit of memory for both scanout and a UBO is nuts.
343        *
344        * Let's not bother and assume it's all internal.
345        */
346       c.MOCS = mocs;
347 
348       if (anv_pipeline_has_stage(pipeline, stage)) {
349          const struct anv_pipeline_bind_map *bind_map =
350             &pipeline->base.shaders[stage]->bind_map;
351 
352          /* The Skylake PRM contains the following restriction:
353           *
354           *    "The driver must ensure The following case does not occur
355           *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
356           *     buffer 3 read length equal to zero committed followed by a
357           *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
358           *     zero committed."
359           *
360           * To avoid this, we program the buffers in the highest slots.
361           * This way, slot 0 is only used if slot 3 is also used.
362           */
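         /* e.g. with buffer_count == 2 the ranges land in slots 2 and 3,
          * leaving slots 0 and 1 unused.
          */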
363          assert(buffer_count <= 4);
364          const unsigned shift = 4 - buffer_count;
365          for (unsigned i = 0; i < buffer_count; i++) {
366             const struct anv_push_range *range = &bind_map->push_ranges[i];
367 
368             /* At this point we only have non-empty ranges */
369             assert(range->length > 0);
370 
371             c.ConstantBody.ReadLength[i + shift] = range->length;
372             c.ConstantBody.Buffer[i + shift] =
373                anv_address_add(buffers[i], range->start * 32);
374          }
375       }
376    }
377 }
378 
379 #if GFX_VER >= 12
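/* Gfx12+ only: emit one 3DSTATE_CONSTANT_ALL packet covering every stage in
 * shader_mask.  With buffer_count == 0 this simply disables push constants
 * for those stages.
 */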
380 static void
381 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
382                                   uint32_t shader_mask,
383                                   struct anv_address *buffers,
384                                   uint32_t buffer_count)
385 {
386    if (buffer_count == 0) {
387       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
388          c.ShaderUpdateEnable = shader_mask;
389          c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
390       }
391       return;
392    }
393 
394    const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
395    const struct anv_graphics_pipeline *pipeline =
396       anv_pipeline_to_graphics(gfx_state->base.pipeline);
397 
398    gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
399 
400    const struct anv_pipeline_bind_map *bind_map =
401       &pipeline->base.shaders[stage]->bind_map;
402 
403    uint32_t *dw;
404    const uint32_t buffer_mask = (1 << buffer_count) - 1;
405    const uint32_t num_dwords = 2 + 2 * buffer_count;
406 
407    dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
408                         GENX(3DSTATE_CONSTANT_ALL),
409                         .ShaderUpdateEnable = shader_mask,
410                         .PointerBufferMask = buffer_mask,
411                         .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
412 
413    for (int i = 0; i < buffer_count; i++) {
414       const struct anv_push_range *range = &bind_map->push_ranges[i];
415       GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
416          &cmd_buffer->batch, dw + 2 + i * 2,
417          &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
418             .PointerToConstantBuffer =
419                anv_address_add(buffers[i], range->start * 32),
420             .ConstantBufferReadLength = range->length,
421          });
422    }
423 }
424 #endif
425 
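/* Re-upload push constants for the dirty graphics stages: compute the robust
 * access register masks, resolve the push range addresses, and emit either
 * per-stage 3DSTATE_CONSTANT_XS packets or (on Gfx12+) a combined
 * 3DSTATE_CONSTANT_ALL.
 */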
426 static void
427 cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
428                                     VkShaderStageFlags dirty_stages)
429 {
430    VkShaderStageFlags flushed = 0;
431    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
432    const struct anv_graphics_pipeline *pipeline =
433       anv_pipeline_to_graphics(gfx_state->base.pipeline);
434 
435 #if GFX_VER >= 12
436    uint32_t nobuffer_stages = 0;
437 #endif
438 
439    /* Compute robust pushed register access mask for each stage. */
440    anv_foreach_stage(stage, dirty_stages) {
441       if (!anv_pipeline_has_stage(pipeline, stage))
442          continue;
443 
444       const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
445       if (shader->prog_data->zero_push_reg) {
446          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
447          struct anv_push_constants *push = &gfx_state->base.push_constants;
448 
449          push->push_reg_mask[stage] = 0;
450          /* Start of the current range in the shader, relative to the start of
451           * push constants in the shader.
452           */
453          unsigned range_start_reg = 0;
454          for (unsigned i = 0; i < 4; i++) {
455             const struct anv_push_range *range = &bind_map->push_ranges[i];
456             if (range->length == 0)
457                continue;
458 
459             unsigned bound_size =
460                get_push_range_bound_size(cmd_buffer, shader, range);
461             if (bound_size >= range->start * 32) {
462                unsigned bound_regs =
463                   MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
464                        range->length);
465                assert(range_start_reg + bound_regs <= 64);
466                push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
467                                                               bound_regs);
468             }
469 
470             cmd_buffer->state.push_constants_dirty |=
471                mesa_to_vk_shader_stage(stage);
472 
473             range_start_reg += range->length;
474          }
475       }
476    }
477 
478    /* Resets the push constant state so that we allocate a new one if
479     * needed.
480     */
481    gfx_state->base.push_constants_state = ANV_STATE_NULL;
482 
483    anv_foreach_stage(stage, dirty_stages) {
484       unsigned buffer_count = 0;
485       flushed |= mesa_to_vk_shader_stage(stage);
486       UNUSED uint32_t max_push_range = 0;
487 
488       struct anv_address buffers[4] = {};
489       if (anv_pipeline_has_stage(pipeline, stage)) {
490          const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
491          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
492 
493          /* We have to gather buffer addresses as a second step because the
494           * loop above puts data into the push constant area and the call to
495           * get_push_range_address is what locks our push constants and copies
496           * them into the actual GPU buffer.  If we did the two loops at the
497           * same time, we'd risk only having some of the sizes in the push
498           * constant buffer when we did the copy.
499           */
500          for (unsigned i = 0; i < 4; i++) {
501             const struct anv_push_range *range = &bind_map->push_ranges[i];
502             if (range->length == 0)
503                break;
504 
505             buffers[i] = get_push_range_address(cmd_buffer, shader, range);
506             max_push_range = MAX2(max_push_range, range->length);
507             buffer_count++;
508          }
509 
510          /* We have at most 4 buffers but they should be tightly packed */
511          for (unsigned i = buffer_count; i < 4; i++)
512             assert(bind_map->push_ranges[i].length == 0);
513       }
514 
515 #if GFX_VER >= 12
516       /* If this stage doesn't have any push constants, emit it later in a
517        * single CONSTANT_ALL packet.
518        */
519       if (buffer_count == 0) {
520          nobuffer_stages |= 1 << stage;
521          continue;
522       }
523 
524       /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
525        * contains only 5 bits, so we can only use it for buffers smaller than
526        * 32.
527        *
528        * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
529        * in 3DSTATE_CONSTANT_ALL.  It should still be safe to use the command
530        * for disabling stages, where all address bits are zero.  However, we
531        * can't safely use it for general buffers with arbitrary addresses.
532        * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
533        * case.
534        */
535       if (max_push_range < 32 && GFX_VERx10 > 120) {
536          cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
537                                            buffers, buffer_count);
538          continue;
539       }
540 #endif
541 
542       cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
543    }
544 
545 #if GFX_VER >= 12
546    if (nobuffer_stages)
547       /* Wa_16011448509: all address bits are zero */
548       cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
549 #endif
550 
551    cmd_buffer->state.push_constants_dirty &= ~flushed;
552 }
553 
554 #if GFX_VERx10 >= 125
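/* Task and mesh shaders receive their push constant data as inline data in
 * 3DSTATE_TASK_SHADER_DATA / 3DSTATE_MESH_SHADER_DATA rather than through
 * 3DSTATE_CONSTANT_* packets.
 */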
555 static void
556 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
557                                   VkShaderStageFlags dirty_stages)
558 {
559    struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
560    const struct anv_graphics_pipeline *pipeline =
561       anv_pipeline_to_graphics(gfx_state->base.pipeline);
562 
563    if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
564        anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
565 
566       const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
567       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
568 
569       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
570          const struct anv_push_range *range = &bind_map->push_ranges[0];
571          if (range->length > 0) {
572             struct anv_address buffer =
573                get_push_range_address(cmd_buffer, shader, range);
574 
575             uint64_t addr = anv_address_physical(buffer);
576             data.InlineData[0] = addr & 0xffffffff;
577             data.InlineData[1] = addr >> 32;
578 
579             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
580                    cmd_buffer->state.gfx.base.push_constants.client_data,
581                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
582          }
583       }
584    }
585 
586    if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
587        anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
588 
589       const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
590       const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
591 
592       anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
593          const struct anv_push_range *range = &bind_map->push_ranges[0];
594          if (range->length > 0) {
595             struct anv_address buffer =
596                get_push_range_address(cmd_buffer, shader, range);
597 
598             uint64_t addr = anv_address_physical(buffer);
599             data.InlineData[0] = addr & 0xffffffff;
600             data.InlineData[1] = addr >> 32;
601 
602             memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
603                    cmd_buffer->state.gfx.base.push_constants.client_data,
604                    BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
605          }
606       }
607    }
608 
609    cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
610 }
611 #endif
612 
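/* Re-emit the pipeline's HS state when tessellation is in use.
 * Wa_1306463417 / Wa_16011107343 require HS state to be resent for every
 * primitive.
 */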
613 ALWAYS_INLINE static void
614 genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
615 {
616    struct anv_graphics_pipeline *pipeline =
617       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
618    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
619       return;
620 
621    anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
622 }
623 
624 ALWAYS_INLINE static void
625 genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
626 {
627 #if INTEL_NEEDS_WA_22018402687
628    /* Wa_22018402687:
629     *   In any 3D enabled context, just before any Tessellation enabled draw
630     *   call (3D Primitive), re-send the last programmed 3DSTATE_DS again.
631     *   This will make sure that the 3DSTATE_INT generated just before the
632     *   draw call will have TDS dirty which will make sure TDS will launch the
633     *   state thread before the draw call.
634     *
635     * This fixes a hang resulting from running anything using tessellation
636     * after a switch away from the mesh pipeline.
637     * We don't need to track said switch, as it matters at the HW level, and
638     * can be triggered even across processes, so we apply the Wa at all times.
639     */
640    struct anv_graphics_pipeline *pipeline =
641       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
642    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
643       return;
644 
645    anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
646 #endif
647 }
648 
649 ALWAYS_INLINE static void
650 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
651 {
652    struct anv_graphics_pipeline *pipeline =
653       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
654    const struct vk_dynamic_graphics_state *dyn =
655       &cmd_buffer->vk.dynamic_graphics_state;
656    uint32_t *p;
657 
658    assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
659 
660    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);
661 
662    genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
663 
664    genX(flush_pipeline_select_3d)(cmd_buffer);
665 
666    /* Wa_14015814527
667     *
668     * Apply task URB workaround when switching from task to primitive.
669     */
670    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
671       if (anv_pipeline_is_primitive(pipeline)) {
672          genX(apply_task_urb_workaround)(cmd_buffer);
673       } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
674          cmd_buffer->state.gfx.used_task_shader = true;
675       }
676    }
677 
678    /* Apply any pending pipeline flushes we may have.  We want to apply them
679     * now because, if any of those flushes are for things like push constants,
680     * the GPU will read the state at weird times.
681     */
682    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
683 
684    /* Check what vertex buffers have been rebound against the set of bindings
685     * being used by the current set of vertex attributes.
686     */
687    uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
688    /* If the pipeline changed, then we have to consider all the valid bindings. */
689    if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
690        BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
691       vb_emit |= dyn->vi->bindings_valid;
692 
693    if (vb_emit) {
694       const uint32_t num_buffers = __builtin_popcount(vb_emit);
695       const uint32_t num_dwords = 1 + num_buffers * 4;
696 
697       p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
698                           GENX(3DSTATE_VERTEX_BUFFERS));
699       uint32_t i = 0;
700       u_foreach_bit(vb, vb_emit) {
701          struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
702          uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
703 
704          struct GENX(VERTEX_BUFFER_STATE) state;
705          if (buffer) {
706             uint32_t stride = dyn->vi_binding_strides[vb];
707             UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
708 
709             state = (struct GENX(VERTEX_BUFFER_STATE)) {
710                .VertexBufferIndex = vb,
711 
712                .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
713                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
714                .AddressModifyEnable = true,
715                .BufferPitch = stride,
716                .BufferStartingAddress = anv_address_add(buffer->address, offset),
717                .NullVertexBuffer = offset >= buffer->vk.size,
718 #if GFX_VER >= 12
719                .L3BypassDisable = true,
720 #endif
721 
722                .BufferSize = size,
723             };
724          } else {
725             state = (struct GENX(VERTEX_BUFFER_STATE)) {
726                .VertexBufferIndex = vb,
727                .NullVertexBuffer = true,
728                .MOCS = anv_mocs(cmd_buffer->device, NULL,
729                                 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
730             };
731          }
732 
733 #if GFX_VER == 9
734          genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
735                                                         state.BufferStartingAddress,
736                                                         state.BufferSize);
737 #endif
738 
739          GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
740          i++;
741       }
742    }
743 
744    cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
745 
746    /* If the patch control points value changed, just update the push
747     * constant data. If the current pipeline also uses it, we need to re-emit
748     * the 3DSTATE_CONSTANT packet.
749     */
750    struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants;
751    if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS) &&
752        push->gfx.tcs_input_vertices != dyn->ts.patch_control_points) {
753       push->gfx.tcs_input_vertices = dyn->ts.patch_control_points;
754       if (pipeline->dynamic_patch_control_points)
755          cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
756    }
757 
758    const bool any_dynamic_state_dirty =
759       vk_dynamic_graphics_state_any_dirty(dyn);
760    uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
761                                 pipeline->base.base.active_stages;
762 
763    descriptors_dirty |=
764       genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
765                                               &cmd_buffer->state.gfx.base,
766                                               &pipeline->base.base);
767 
768    /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive. */
769    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE ||
770        (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343)) {
771       genX(emit_hs)(cmd_buffer);
772    }
773 
774    if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
775        !any_dynamic_state_dirty &&
776        ((cmd_buffer->state.push_constants_dirty &
777          (VK_SHADER_STAGE_ALL_GRAPHICS |
778           VK_SHADER_STAGE_TASK_BIT_EXT |
779           VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
780       return;
781 
782    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
783       /* Wa_16011411144:
784        *
785        * SW must insert a PIPE_CONTROL cmd before and after the
786        * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
787        * state is not combined with other state changes.
788        */
789       if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
790          anv_add_pending_pipe_bits(cmd_buffer,
791                                    ANV_PIPE_CS_STALL_BIT,
792                                    "before SO_BUFFER change WA");
793          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
794       }
795 
796       /* We don't need any per-buffer dirty tracking because you're not
797        * allowed to bind different XFB buffers while XFB is enabled.
798        */
799       for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
800          struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
801          anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
802 #if GFX_VER < 12
803             sob.SOBufferIndex = idx;
804 #else
805             sob._3DCommandOpcode = 0;
806             sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
807 #endif
808 
809             if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
810                sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
811                                    ISL_SURF_USAGE_STREAM_OUT_BIT);
812                sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
813                                                         xfb->offset);
814                sob.SOBufferEnable = true;
815                sob.StreamOffsetWriteEnable = false;
816                /* Size is in DWords - 1 */
817                sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
818             } else {
819                sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
820             }
821          }
822       }
823 
824       if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
825          /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
826          anv_add_pending_pipe_bits(cmd_buffer,
827                                    ANV_PIPE_CS_STALL_BIT,
828                                    "after SO_BUFFER change WA");
829          genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
830       } else if (GFX_VER >= 10) {
831          /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
832          anv_add_pending_pipe_bits(cmd_buffer,
833                                    ANV_PIPE_CS_STALL_BIT,
834                                    "after 3DSTATE_SO_BUFFER call");
835       }
836    }
837 
838    /* Flush the runtime state into the HW state tracking */
839    if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
840       genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);
841 
842    /* Flush the HW state into the command buffer */
843    if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
844       genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
845 
846    /* If the pipeline changed, we may need to re-allocate push constant space
847     * in the URB.
848     */
849    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
850       cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
851 
852       /* Also add the relocations (scratch buffers) */
853       VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
854                                               pipeline->base.base.batch.relocs);
855       if (result != VK_SUCCESS) {
856          anv_batch_set_error(&cmd_buffer->batch, result);
857          return;
858       }
859    }
860 
861    /* Render targets live in the same binding table as fragment descriptors */
862    if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
863       descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
864 
865    /* We emit the binding tables and sampler tables first, then emit push
866     * constants and then finally emit binding table and sampler table
867     * pointers.  It has to happen in this order, since emitting the binding
868     * tables may change the push constants (in case of storage images). After
869     * emitting push constants, on SKL+ we have to emit the corresponding
870     * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
871     */
872    uint32_t dirty = 0;
873    if (descriptors_dirty) {
874       dirty = genX(cmd_buffer_flush_descriptor_sets)(
875          cmd_buffer,
876          &cmd_buffer->state.gfx.base,
877          descriptors_dirty,
878          pipeline->base.shaders,
879          ARRAY_SIZE(pipeline->base.shaders));
880       cmd_buffer->state.descriptors_dirty &= ~dirty;
881    }
882 
883    if (dirty || cmd_buffer->state.push_constants_dirty) {
884       /* Because we're pushing UBOs, we have to push whenever either
885        * descriptors or push constants is dirty.
886        */
887       dirty |= cmd_buffer->state.push_constants_dirty &
888                pipeline->base.base.active_stages;
889       cmd_buffer_flush_gfx_push_constants(cmd_buffer,
890                                       dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
891 #if GFX_VERx10 >= 125
892       cmd_buffer_flush_mesh_inline_data(
893          cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
894                               VK_SHADER_STAGE_MESH_BIT_EXT));
895 #endif
896    }
897 
898    if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
899       cmd_buffer_emit_descriptor_pointers(cmd_buffer,
900                                           dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
901    }
902 
903    /* When we're done, there is no more dirty gfx state. */
904    cmd_buffer->state.gfx.dirty = 0;
905 }
906 
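/* Returns true when `count` meets the device's generated_indirect_threshold
 * and the pipeline is allowed to use the generated indirect draw path
 * (genX_cmd_draw_generated_indirect.h).
 */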
907 ALWAYS_INLINE static bool
908 anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
909 {
910    const struct anv_device *device = cmd_buffer->device;
911    const struct anv_graphics_pipeline *pipeline =
912       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
913 
914    /* Limit generated draws to pipelines without HS stage. This makes things
915     * simpler for implementing Wa_1306463417, Wa_16011107343.
916     */
917    if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
918        anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
919       return false;
920 
921    return count >= device->physical->instance->generated_indirect_threshold;
922 }
923 
924 #include "genX_cmd_draw_helpers.h"
925 #include "genX_cmd_draw_generated_indirect.h"
926 
927 #if GFX_VER >= 11
928 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
929 #else
930 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
931 #endif
932 
933 void genX(CmdDraw)(
934     VkCommandBuffer                             commandBuffer,
935     uint32_t                                    vertexCount,
936     uint32_t                                    instanceCount,
937     uint32_t                                    firstVertex,
938     uint32_t                                    firstInstance)
939 {
940    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
941    struct anv_graphics_pipeline *pipeline =
942       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
943 
944    if (anv_batch_has_error(&cmd_buffer->batch))
945       return;
946 
947    const uint32_t count =
948       vertexCount * instanceCount * pipeline->instance_multiplier;
949    anv_measure_snapshot(cmd_buffer,
950                         INTEL_SNAPSHOT_DRAW,
951                         "draw", count);
952    trace_intel_begin_draw(&cmd_buffer->trace);
953 
954    /* Select pipeline here to allow
955     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
956     * cmd_buffer_flush_gfx_state().
957     */
958    genX(flush_pipeline_select_3d)(cmd_buffer);
959 
960    if (cmd_buffer->state.conditional_render_enabled)
961       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
962 
963 #if GFX_VER < 11
964    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
965                                               get_vs_prog_data(pipeline),
966                                               firstVertex, firstInstance, 0,
967                                               false /* force_flush */);
968 #endif
969 
970    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
971    genX(emit_ds)(cmd_buffer);
972 
973    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
974 
975    anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
976       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
977 #if GFX_VERx10 >= 125
978       prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
979 #endif
980       prim.VertexAccessType         = SEQUENTIAL;
981       prim.VertexCountPerInstance   = vertexCount;
982       prim.StartVertexLocation      = firstVertex;
983       prim.InstanceCount            = instanceCount *
984                                       pipeline->instance_multiplier;
985       prim.StartInstanceLocation    = firstInstance;
986       prim.BaseVertexLocation       = 0;
987 #if GFX_VER >= 11
988       prim.ExtendedParametersPresent = true;
989       prim.ExtendedParameter0       = firstVertex;
990       prim.ExtendedParameter1       = firstInstance;
991       prim.ExtendedParameter2       = 0;
992 #endif
993    }
994 
995    genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
996                                          cmd_buffer->device,
997                                          cmd_buffer->state.gfx.primitive_topology,
998                                          vertexCount);
999    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1000 
1001    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1002 
1003    trace_intel_end_draw(&cmd_buffer->trace, count);
1004 }
1005 
1006 void genX(CmdDrawMultiEXT)(
1007     VkCommandBuffer                             commandBuffer,
1008     uint32_t                                    drawCount,
1009     const VkMultiDrawInfoEXT                   *pVertexInfo,
1010     uint32_t                                    instanceCount,
1011     uint32_t                                    firstInstance,
1012     uint32_t                                    stride)
1013 {
1014    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1015    UNUSED struct anv_graphics_pipeline *pipeline =
1016       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1017 
1018    if (anv_batch_has_error(&cmd_buffer->batch))
1019       return;
1020 
1021    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1022 
1023    if (cmd_buffer->state.conditional_render_enabled)
1024       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1025 
1026    uint32_t i = 0;
1027 #if GFX_VER < 11
1028    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1029       cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
1030                                                  get_vs_prog_data(pipeline),
1031                                                  draw->firstVertex,
1032                                                  firstInstance, i, !i);
1033 
1034       const uint32_t count =
1035          draw->vertexCount * instanceCount * pipeline->instance_multiplier;
1036       anv_measure_snapshot(cmd_buffer,
1037                            INTEL_SNAPSHOT_DRAW,
1038                            "draw multi", count);
1039       trace_intel_begin_draw_multi(&cmd_buffer->trace);
1040 
1041       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1042 
1043       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1044          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1045          prim.VertexAccessType         = SEQUENTIAL;
1046          prim.VertexCountPerInstance   = draw->vertexCount;
1047          prim.StartVertexLocation      = draw->firstVertex;
1048          prim.InstanceCount            = instanceCount *
1049                                          pipeline->instance_multiplier;
1050          prim.StartInstanceLocation    = firstInstance;
1051          prim.BaseVertexLocation       = 0;
1052       }
1053 
1054       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1055                                             cmd_buffer->device,
1056                                             cmd_buffer->state.gfx.primitive_topology,
1057                                             drawCount == 0 ? 0 :
1058                                             pVertexInfo[drawCount - 1].vertexCount);
1059 
1060       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1061       trace_intel_end_draw_multi(&cmd_buffer->trace, count);
1062    }
1063 #else
1064    vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1065 
1066       /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1067        * the first one was handled by cmd_buffer_flush_gfx_state().
1068        */
1069       if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1070          genX(emit_hs)(cmd_buffer);
1071       genX(emit_ds)(cmd_buffer);
1072 
1073       const uint32_t count = draw->vertexCount * instanceCount;
1074       anv_measure_snapshot(cmd_buffer,
1075                            INTEL_SNAPSHOT_DRAW,
1076                            "draw multi", count);
1077       trace_intel_begin_draw_multi(&cmd_buffer->trace);
1078 
1079       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1080 
1081       anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1082 #if GFX_VERx10 >= 125
1083          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1084 #endif
1085          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1086          prim.VertexAccessType         = SEQUENTIAL;
1087          prim.VertexCountPerInstance   = draw->vertexCount;
1088          prim.StartVertexLocation      = draw->firstVertex;
1089          prim.InstanceCount            = instanceCount;
1090          prim.StartInstanceLocation    = firstInstance;
1091          prim.BaseVertexLocation       = 0;
1092          prim.ExtendedParametersPresent = true;
1093          prim.ExtendedParameter0       = draw->firstVertex;
1094          prim.ExtendedParameter1       = firstInstance;
1095          prim.ExtendedParameter2       = i;
1096       }
1097 
1098       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1099                                             cmd_buffer->device,
1100                                             cmd_buffer->state.gfx.primitive_topology,
1101                                             drawCount == 0 ? 0 :
1102                                             pVertexInfo[drawCount - 1].vertexCount);
1103 
1104       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1105       trace_intel_end_draw_multi(&cmd_buffer->trace, count);
1106    }
1107 #endif
1108 
1109    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1110 }
1111 
1112 void genX(CmdDrawIndexed)(
1113     VkCommandBuffer                             commandBuffer,
1114     uint32_t                                    indexCount,
1115     uint32_t                                    instanceCount,
1116     uint32_t                                    firstIndex,
1117     int32_t                                     vertexOffset,
1118     uint32_t                                    firstInstance)
1119 {
1120    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1121    struct anv_graphics_pipeline *pipeline =
1122       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1123 
1124    if (anv_batch_has_error(&cmd_buffer->batch))
1125       return;
1126 
1127    const uint32_t count =
1128       indexCount * instanceCount * pipeline->instance_multiplier;
1129    anv_measure_snapshot(cmd_buffer,
1130                         INTEL_SNAPSHOT_DRAW,
1131                         "draw indexed",
1132                         count);
1133    trace_intel_begin_draw_indexed(&cmd_buffer->trace);
1134 
1135    /* Select pipeline here to allow
1136     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1137     * cmd_buffer_flush_gfx_state().
1138     */
1139    genX(flush_pipeline_select_3d)(cmd_buffer);
1140 
1141    if (cmd_buffer->state.conditional_render_enabled)
1142       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1143 
1144 #if GFX_VER < 11
1145    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1146    cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1147                                               vertexOffset, firstInstance,
1148                                               0, false /* force_flush */);
1149 #endif
1150 
1151    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1152    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1153 
1154    anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1155       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1156 #if GFX_VERx10 >= 125
1157       prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1158 #endif
1159       prim.VertexAccessType         = RANDOM;
1160       prim.VertexCountPerInstance   = indexCount;
1161       prim.StartVertexLocation      = firstIndex;
1162       prim.InstanceCount            = instanceCount *
1163                                       pipeline->instance_multiplier;
1164       prim.StartInstanceLocation    = firstInstance;
1165       prim.BaseVertexLocation       = vertexOffset;
1166 #if GFX_VER >= 11
1167       prim.ExtendedParametersPresent = true;
1168       prim.ExtendedParameter0       = vertexOffset;
1169       prim.ExtendedParameter1       = firstInstance;
1170       prim.ExtendedParameter2       = 0;
1171 #endif
1172    }
1173 
1174    genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1175                                          cmd_buffer->device,
1176                                          cmd_buffer->state.gfx.primitive_topology,
1177                                          indexCount);
1178    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1179 
1180    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
1181 
1182    trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
1183 }
1184 
1185 void genX(CmdDrawMultiIndexedEXT)(
1186     VkCommandBuffer                             commandBuffer,
1187     uint32_t                                    drawCount,
1188     const VkMultiDrawIndexedInfoEXT            *pIndexInfo,
1189     uint32_t                                    instanceCount,
1190     uint32_t                                    firstInstance,
1191     uint32_t                                    stride,
1192     const int32_t                              *pVertexOffset)
1193 {
1194    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1195    struct anv_graphics_pipeline *pipeline =
1196       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1197 
1198    if (anv_batch_has_error(&cmd_buffer->batch))
1199       return;
1200 
1201    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1202 
1203    if (cmd_buffer->state.conditional_render_enabled)
1204       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1205 
1206    uint32_t i = 0;
1207 #if GFX_VER < 11
1208    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1209    if (pVertexOffset) {
1210       if (vs_prog_data->uses_drawid) {
1211          bool emitted = true;
1212          if (vs_prog_data->uses_firstvertex ||
1213              vs_prog_data->uses_baseinstance) {
1214             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1215             emitted = true;
1216          }
1217          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1218             if (vs_prog_data->uses_drawid) {
1219                emit_draw_index(cmd_buffer, i);
1220                emitted = true;
1221             }
1222             /* Emitting draw index or vertex index BOs may result in needing
1223              * additional VF cache flushes.
1224              */
1225             if (emitted)
1226                genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1227 
1228             const uint32_t count =
1229                draw->indexCount * instanceCount * pipeline->instance_multiplier;
1230             anv_measure_snapshot(cmd_buffer,
1231                                  INTEL_SNAPSHOT_DRAW,
1232                                  "draw indexed multi",
1233                                  count);
1234             trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1235             genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1236                                   true);
1237 
1238             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1239                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1240                prim.VertexAccessType         = RANDOM;
1241                prim.VertexCountPerInstance   = draw->indexCount;
1242                prim.StartVertexLocation      = draw->firstIndex;
1243                prim.InstanceCount            = instanceCount *
1244                                                pipeline->instance_multiplier;
1245                prim.StartInstanceLocation    = firstInstance;
1246                prim.BaseVertexLocation       = *pVertexOffset;
1247             }
1248 
1249             genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1250                                                   cmd_buffer->device,
1251                                                   cmd_buffer->state.gfx.primitive_topology,
1252                                                   drawCount == 0 ? 0 :
1253                                                   pIndexInfo[drawCount - 1].indexCount);
1254 
1255             genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1256                                   false);
1257             trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1258             emitted = false;
1259          }
1260       } else {
1261          if (vs_prog_data->uses_firstvertex ||
1262              vs_prog_data->uses_baseinstance) {
1263             emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1264             /* Emitting draw index or vertex index BOs may result in needing
1265              * additional VF cache flushes.
1266              */
1267             genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1268          }
1269          vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1270             const uint32_t count =
1271                draw->indexCount * instanceCount * pipeline->instance_multiplier;
1272             anv_measure_snapshot(cmd_buffer,
1273                                  INTEL_SNAPSHOT_DRAW,
1274                                  "draw indexed multi",
1275                                  count);
1276             trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1277             genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1278                                   true);
1279 
1280             anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1281                prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1282                prim.VertexAccessType         = RANDOM;
1283                prim.VertexCountPerInstance   = draw->indexCount;
1284                prim.StartVertexLocation      = draw->firstIndex;
1285                prim.InstanceCount            = instanceCount *
1286                                                pipeline->instance_multiplier;
1287                prim.StartInstanceLocation    = firstInstance;
1288                prim.BaseVertexLocation       = *pVertexOffset;
1289             }
1290 
1291             genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1292                                                   cmd_buffer->device,
1293                                                   cmd_buffer->state.gfx.primitive_topology,
1294                                                   drawCount == 0 ? 0 :
1295                                                   pIndexInfo[drawCount - 1].indexCount);
1296 
1297             genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1298                                   false);
1299             trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1300          }
1301       }
1302    } else {
1303       vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1304          cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1305                                                     draw->vertexOffset,
1306                                                     firstInstance, i, i != 0);
1307 
1308          const uint32_t count =
1309             draw->indexCount * instanceCount * pipeline->instance_multiplier;
1310          anv_measure_snapshot(cmd_buffer,
1311                               INTEL_SNAPSHOT_DRAW,
1312                               "draw indexed multi",
1313                               count);
1314          trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1315          genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1316 
1317          anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1318             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1319             prim.VertexAccessType         = RANDOM;
1320             prim.VertexCountPerInstance   = draw->indexCount;
1321             prim.StartVertexLocation      = draw->firstIndex;
1322             prim.InstanceCount            = instanceCount *
1323                                             pipeline->instance_multiplier;
1324             prim.StartInstanceLocation    = firstInstance;
1325             prim.BaseVertexLocation       = draw->vertexOffset;
1326          }
1327 
1328          genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1329                                                cmd_buffer->device,
1330                                                cmd_buffer->state.gfx.primitive_topology,
1331                                                drawCount == 0 ? 0 :
1332                                                pIndexInfo[drawCount - 1].indexCount);
1333 
1334          genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1335          trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1336       }
1337    }
1338 #else
1339    vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1340 
1341       /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1342        * first one was handled by cmd_buffer_flush_gfx_state.
1343        */
1344       if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1345          genX(emit_hs)(cmd_buffer);
1346       genX(emit_ds)(cmd_buffer);
1347 
1348       const uint32_t count =
1349          draw->indexCount * instanceCount * pipeline->instance_multiplier;
1350       anv_measure_snapshot(cmd_buffer,
1351                            INTEL_SNAPSHOT_DRAW,
1352                            "draw indexed multi",
1353                            count);
1354       trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1355       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1356 
1357       anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
1358 #if GFX_VERx10 >= 125
1359          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1360 #endif
1361          prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1362          prim.VertexAccessType         = RANDOM;
1363          prim.VertexCountPerInstance   = draw->indexCount;
1364          prim.StartVertexLocation      = draw->firstIndex;
1365          prim.InstanceCount            = instanceCount *
1366                                          pipeline->instance_multiplier;
1367          prim.StartInstanceLocation    = firstInstance;
1368          prim.BaseVertexLocation       = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1369          prim.ExtendedParametersPresent = true;
1370          prim.ExtendedParameter0       = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1371          prim.ExtendedParameter1       = firstInstance;
1372          prim.ExtendedParameter2       = i;
1373       }
1374 
1375       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1376                                             cmd_buffer->device,
1377                                             cmd_buffer->state.gfx.primitive_topology,
1378                                             drawCount == 0 ? 0 :
1379                                             pIndexInfo[drawCount - 1].indexCount);
1380 
1381       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1382       trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1383    }
1384 #endif
1385 
1386    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
1387 }
1388 
1389 /* Auto-Draw / Indirect Registers */
1390 #define GFX7_3DPRIM_END_OFFSET          0x2420
1391 #define GFX7_3DPRIM_START_VERTEX        0x2430
1392 #define GFX7_3DPRIM_VERTEX_COUNT        0x2434
1393 #define GFX7_3DPRIM_INSTANCE_COUNT      0x2438
1394 #define GFX7_3DPRIM_START_INSTANCE      0x243C
1395 #define GFX7_3DPRIM_BASE_VERTEX         0x2440
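/* These are the MMIO registers an indirect 3DPRIMITIVE consumes when
 * IndirectParameterEnable is set: load_indirect_parameters() and the
 * byte-count path below fill them with MI stores instead of baking the
 * values into the command itself.
 */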
1396 
1397 /* On Gen11+, we have three custom "extended parameters" which we can use to
1398  * provide extra system-generated values to shaders.  Our assignment of these
1399  * is arbitrary; we choose to assign them as follows:
1400  *
1401  *    gl_BaseVertex = XP0
1402  *    gl_BaseInstance = XP1
1403  *    gl_DrawID = XP2
1404  *
1405  * For gl_BaseInstance, we never actually have to set up the value because we
1406  * can just program 3DSTATE_VF_SGVS_2 to load it implicitly.  We can also do
1407  * that for gl_BaseVertex but it does the wrong thing for indexed draws.
1408  */
1409 #define GEN11_3DPRIM_XP0                0x2690
1410 #define GEN11_3DPRIM_XP1                0x2694
1411 #define GEN11_3DPRIM_XP2                0x2698
1412 #define GEN11_3DPRIM_XP_BASE_VERTEX     GEN11_3DPRIM_XP0
1413 #define GEN11_3DPRIM_XP_BASE_INSTANCE   GEN11_3DPRIM_XP1
1414 #define GEN11_3DPRIM_XP_DRAW_ID         GEN11_3DPRIM_XP2
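/* Direct draws pass XP0-XP2 inline through the ExtendedParameter0/1/2 fields
 * of 3DPRIMITIVE_EXTENDED; the indirect paths instead MI-store the values
 * into the registers above and set ExtendedParametersPresent so the values
 * programmed there are used.
 */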
1415 
1416 void genX(CmdDrawIndirectByteCountEXT)(
1417     VkCommandBuffer                             commandBuffer,
1418     uint32_t                                    instanceCount,
1419     uint32_t                                    firstInstance,
1420     VkBuffer                                    counterBuffer,
1421     VkDeviceSize                                counterBufferOffset,
1422     uint32_t                                    counterOffset,
1423     uint32_t                                    vertexStride)
1424 {
1425    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1426    ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
1427    struct anv_graphics_pipeline *pipeline =
1428       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1429 
1430    /* firstVertex is always zero for this draw function */
1431    const uint32_t firstVertex = 0;
1432 
1433    if (anv_batch_has_error(&cmd_buffer->batch))
1434       return;
1435 
1436    anv_measure_snapshot(cmd_buffer,
1437                         INTEL_SNAPSHOT_DRAW,
1438                         "draw indirect byte count",
1439                         instanceCount * pipeline->instance_multiplier);
1440    trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
1441 
1442    /* Select pipeline here to allow
1443     * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1444     * emit_base_vertex_instance() & emit_draw_index().
1445     */
1446    genX(flush_pipeline_select_3d)(cmd_buffer);
1447 
1448    if (cmd_buffer->state.conditional_render_enabled)
1449       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1450 
1451 #if GFX_VER < 11
1452    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1453    if (vs_prog_data->uses_firstvertex ||
1454        vs_prog_data->uses_baseinstance)
1455       emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1456    if (vs_prog_data->uses_drawid)
1457       emit_draw_index(cmd_buffer, 0);
1458 #endif
1459 
1460    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1461 
1462    struct mi_builder b;
1463    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1464    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
1465    mi_builder_set_mocs(&b, mocs);
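   /* Compute vertexCount = (counterValue - counterOffset) / vertexStride on
    * the command streamer.  With illustrative numbers: a byte counter of
    * 4096, a counterOffset of 0 and a 16-byte vertexStride yields 256
    * vertices.
    */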
1466    struct mi_value count =
1467       mi_mem32(anv_address_add(counter_buffer->address,
1468                                    counterBufferOffset));
1469    if (counterOffset)
1470       count = mi_isub(&b, count, mi_imm(counterOffset));
1471    count = mi_udiv32_imm(&b, count, vertexStride);
1472    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
1473 
1474    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
1475    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
1476             mi_imm(instanceCount * pipeline->instance_multiplier));
1477    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
1478    mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1479 
1480 #if GFX_VER >= 11
1481    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1482                 mi_imm(firstVertex));
1483    /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1484    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
1485 #endif
1486 
1487    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1488    anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1489 #if GFX_VERx10 >= 125
1490       prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1491 #endif
1492       prim.IndirectParameterEnable  = true;
1493       prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1494       prim.VertexAccessType         = SEQUENTIAL;
1495 #if GFX_VER >= 11
1496       prim.ExtendedParametersPresent = true;
1497 #endif
1498    }
1499 
1500    genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1501                                          cmd_buffer->device,
1502                                          cmd_buffer->state.gfx.primitive_topology,
1503                                          1);
1504    genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1505 
1506    update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1507 
1508    trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
1509       instanceCount * pipeline->instance_multiplier);
1510 }
1511 
1512 static void
1513 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
1514                          struct anv_address addr,
1515                          bool indexed,
1516                          uint32_t draw_id)
1517 {
1518    struct anv_graphics_pipeline *pipeline =
1519       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1520 
1521    struct mi_builder b;
1522    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1523    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
1524    mi_builder_set_mocs(&b, mocs);
1525 
1526    mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
1527                 mi_mem32(anv_address_add(addr, 0)));
1528 
1529    struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
1530    if (pipeline->instance_multiplier > 1) {
1531       instance_count = mi_imul_imm(&b, instance_count,
1532                                    pipeline->instance_multiplier);
1533    }
1534    mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
1535 
1536    mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
1537                 mi_mem32(anv_address_add(addr, 8)));
1538 
1539    if (indexed) {
1540       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
1541                    mi_mem32(anv_address_add(addr, 12)));
1542       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1543                    mi_mem32(anv_address_add(addr, 16)));
1544 #if GFX_VER >= 11
1545       mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1546                    mi_mem32(anv_address_add(addr, 12)));
1547       /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1548 #endif
1549    } else {
1550       mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1551                    mi_mem32(anv_address_add(addr, 12)));
1552       mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1553 #if GFX_VER >= 11
1554       mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1555                    mi_mem32(anv_address_add(addr, 8)));
1556       /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1557 #endif
1558    }
1559 
1560 #if GFX_VER >= 11
1561    mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
1562                 mi_imm(draw_id));
1563 #endif
1564 }
1565 
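/* Gfx12.5+ can unroll indirect draws in hardware with EXECUTE_INDIRECT_DRAW.
 * We only take that path when the VS consumes none of gl_BaseVertex /
 * gl_BaseInstance / gl_DrawID and multiview does not multiply the instance
 * count, since those cases rely on the per-draw fixups done on the MI path
 * in emit_indirect_draws().
 */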
1566 static const bool
1567 execute_indirect_draw_supported(struct anv_cmd_buffer *cmd_buffer)
1568 {
1569 #if GFX_VERx10 >= 125
1570    const struct intel_device_info *devinfo = cmd_buffer->device->info;
1571    struct anv_graphics_pipeline *pipeline =
1572       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1573    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1574    const bool is_multiview = pipeline->instance_multiplier > 1;
1575 
1576    return (devinfo->has_indirect_unroll &&
1577            !is_multiview &&
1578            !vs_prog_data->uses_firstvertex &&
1579            !vs_prog_data->uses_baseinstance &&
1580            !vs_prog_data->uses_drawid);
1581 #else
1582    return false;
1583 #endif
1584 }
1585 
1586 static void
1587 emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
1588                     struct anv_address indirect_data_addr,
1589                     uint32_t indirect_data_stride,
1590                     uint32_t draw_count,
1591                     bool indexed)
1592 {
1593 #if GFX_VER < 11
1594    struct anv_graphics_pipeline *pipeline =
1595       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1596    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1597 #endif
1598    UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
1599    UNUSED const bool aligned_stride =
1600       (indirect_data_stride == 0 ||
1601        indirect_data_stride == sizeof(VkDrawIndirectCommand));
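   /* With a tightly packed argument buffer a single EXECUTE_INDIRECT_DRAW can
    * cover all draw_count draws (see MaxCount below); otherwise we fall back
    * to one instruction per draw.
    */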
1602    UNUSED const bool execute_indirect_supported =
1603       execute_indirect_draw_supported(cmd_buffer);
1604 
1605    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1606 
1607    if (cmd_buffer->state.conditional_render_enabled)
1608       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1609 
1610    uint32_t offset = 0;
1611    for (uint32_t i = 0; i < draw_count; i++) {
1612       struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1613 
1614 #if GFX_VER < 11
1615       /* TODO: We need to stomp base vertex to 0 somehow */
1616 
1617       /* With sequential draws, we're dealing with the VkDrawIndirectCommand
1618        * structure data. We want to load VkDrawIndirectCommand::firstVertex at
1619        * offset 8 in the structure.
1620        *
1621        * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
1622        * We want the VkDrawIndexedIndirectCommand::vertexOffset field at
1623        * offset 12 in the structure.
1624        */
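      /* For reference, the tightly packed layouts are:
       *    VkDrawIndirectCommand:        vertexCount @0, instanceCount @4,
       *                                  firstVertex @8, firstInstance @12
       *    VkDrawIndexedIndirectCommand: indexCount @0,  instanceCount @4,
       *                                  firstIndex @8,  vertexOffset @12,
       *                                  firstInstance @16
       */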
1625       if (vs_prog_data->uses_firstvertex ||
1626           vs_prog_data->uses_baseinstance) {
1627          emit_base_vertex_instance_bo(cmd_buffer,
1628                                       anv_address_add(draw, indexed ? 12 : 8));
1629       }
1630       if (vs_prog_data->uses_drawid)
1631          emit_draw_index(cmd_buffer, i);
1632 #endif
1633 
1634       /* Emitting draw index or vertex index BOs may result in needing
1635        * additional VF cache flushes.
1636        */
1637       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1638 
1639       /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1640        * first one was handled by cmd_buffer_flush_gfx_state.
1641        */
1642       if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1643          genX(emit_hs)(cmd_buffer);
1644       genX(emit_ds)(cmd_buffer);
1645 
1646       if (execute_indirect_supported) {
1647 #if GFX_VERx10 >= 125
1648          genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1649          anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
1650             ind.ArgumentFormat             = DRAW;
1651             ind.TBIMREnabled               = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1652             ind.PredicateEnable            =
1653                cmd_buffer->state.conditional_render_enabled;
1654             ind.MaxCount                   = aligned_stride ? draw_count : 1;
1655             ind.ArgumentBufferStartAddress = draw;
1656             ind.MOCS                       =
1657                anv_mocs(cmd_buffer->device, draw.bo, 0);
1658          }
1659          /* If all the indirect structures are aligned, then we can let the HW
1660           * do the unrolling and we only need one instruction. Otherwise we
1661           * need to emit one instruction per draw, but we're still avoiding
1662           * the register loads with MI commands.
1663           */
1664          if (aligned_stride)
1665             break;
1666 #else
1667          unreachable("EXECUTE_INDIRECT_DRAW instruction expectation mismatch");
1668 #endif
1669       } else {
1670          load_indirect_parameters(cmd_buffer, draw, indexed, i);
1671 
1672          genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1673          anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1674 #if GFX_VERx10 >= 125
1675             prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1676 #endif
1677             prim.IndirectParameterEnable  = true;
1678             prim.PredicateEnable          = cmd_buffer->state.conditional_render_enabled;
1679             prim.VertexAccessType         = indexed ? RANDOM : SEQUENTIAL;
1680 #if GFX_VER >= 11
1681             prim.ExtendedParametersPresent = true;
1682 #endif
1683          }
1684       }
1685 
1686       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1687                                             cmd_buffer->device,
1688                                             cmd_buffer->state.gfx.primitive_topology,
1689                                             1);
1690 
1691       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1692 
1693       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer,
1694                                          indexed ? RANDOM : SEQUENTIAL);
1695 
1696       offset += indirect_data_stride;
1697    }
1698 }
1699 
1700 void genX(CmdDrawIndirect)(
1701     VkCommandBuffer                             commandBuffer,
1702     VkBuffer                                    _buffer,
1703     VkDeviceSize                                offset,
1704     uint32_t                                    drawCount,
1705     uint32_t                                    stride)
1706 {
1707    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1708    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1709 
1710    if (anv_batch_has_error(&cmd_buffer->batch))
1711       return;
1712 
1713    anv_measure_snapshot(cmd_buffer,
1714                         INTEL_SNAPSHOT_DRAW,
1715                         "draw indirect",
1716                         drawCount);
1717    trace_intel_begin_draw_indirect(&cmd_buffer->trace);
1718 
1719    if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1720       genX(cmd_buffer_emit_indirect_generated_draws)(
1721          cmd_buffer,
1722          anv_address_add(buffer->address, offset),
1723          MAX2(stride, sizeof(VkDrawIndirectCommand)),
1724          ANV_NULL_ADDRESS /* count_addr */,
1725          drawCount,
1726          false /* indexed */);
1727    } else {
1728       emit_indirect_draws(cmd_buffer,
1729                           anv_address_add(buffer->address, offset),
1730                           stride, drawCount, false /* indexed */);
1731    }
1732 
1733    trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
1734 }
1735 
1736 void genX(CmdDrawIndexedIndirect)(
1737     VkCommandBuffer                             commandBuffer,
1738     VkBuffer                                    _buffer,
1739     VkDeviceSize                                offset,
1740     uint32_t                                    drawCount,
1741     uint32_t                                    stride)
1742 {
1743    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1744    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1745 
1746    if (anv_batch_has_error(&cmd_buffer->batch))
1747       return;
1748 
1749    anv_measure_snapshot(cmd_buffer,
1750                         INTEL_SNAPSHOT_DRAW,
1751                         "draw indexed indirect",
1752                         drawCount);
1753    trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
1754 
1755    if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1756       genX(cmd_buffer_emit_indirect_generated_draws)(
1757          cmd_buffer,
1758          anv_address_add(buffer->address, offset),
1759          MAX2(stride, sizeof(VkDrawIndexedIndirectCommand)),
1760          ANV_NULL_ADDRESS /* count_addr */,
1761          drawCount,
1762          true /* indexed */);
1763    } else {
1764       emit_indirect_draws(cmd_buffer,
1765                           anv_address_add(buffer->address, offset),
1766                           stride, drawCount, true /* indexed */);
1767    }
1768 
1769    trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
1770 }
1771 
1772 #define MI_PREDICATE_SRC0    0x2400
1773 #define MI_PREDICATE_SRC1    0x2408
1774 #define MI_PREDICATE_RESULT  0x2418
1775 
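/* Read the actual draw count for a drawIndirectCount-style draw.  Without
 * conditional rendering it is loaded into MI_PREDICATE_SRC0 so each draw can
 * be predicated with MI_PREDICATE; with conditional rendering it is copied
 * into a GPR so it can also be ANDed with the render predicate per draw.
 */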
1776 static struct mi_value
1777 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1778                                  struct mi_builder *b,
1779                                  struct anv_address count_address)
1780 {
1781    struct mi_value ret = mi_imm(0);
1782 
1783    if (cmd_buffer->state.conditional_render_enabled) {
1784       ret = mi_new_gpr(b);
1785       mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
1786    } else {
1787       /* Upload the current draw count from the draw parameters buffer to
1788        * MI_PREDICATE_SRC0.
1789        */
1790       mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
1791       mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
1792    }
1793 
1794    return ret;
1795 }
1796 
1797 static void
1798 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1799                           struct mi_builder *b,
1800                           uint32_t draw_index)
1801 {
1802    /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
1803    mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
1804 
1805    if (draw_index == 0) {
1806       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1807          mip.LoadOperation    = LOAD_LOADINV;
1808          mip.CombineOperation = COMBINE_SET;
1809          mip.CompareOperation = COMPARE_SRCS_EQUAL;
1810       }
1811    } else {
1812       /* While draw_index < draw_count the predicate's result will be
1813        *  (draw_index == draw_count) ^ TRUE = TRUE
1814        * When draw_index == draw_count the result is
1815        *  (TRUE) ^ TRUE = FALSE
1816        * After this all results will be:
1817        *  (FALSE) ^ FALSE = FALSE
1818        */
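      /* Illustrative trace with an actual count of 2 in SRC0:
       *    i=0 (LOADINV/SET): result = !(2 == 0)        = TRUE   -> draw runs
       *    i=1 (LOAD/XOR):    result = TRUE  ^ (2 == 1) = TRUE   -> draw runs
       *    i=2 (LOAD/XOR):    result = TRUE  ^ (2 == 2) = FALSE  -> skipped
       *    i=3 (LOAD/XOR):    result = FALSE ^ (2 == 3) = FALSE  -> skipped
       */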
1819       anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1820          mip.LoadOperation    = LOAD_LOAD;
1821          mip.CombineOperation = COMBINE_XOR;
1822          mip.CompareOperation = COMPARE_SRCS_EQUAL;
1823       }
1824    }
1825 }
1826 
1827 static void
1828 emit_draw_count_predicate_with_conditional_render(
1829                           struct anv_cmd_buffer *cmd_buffer,
1830                           struct mi_builder *b,
1831                           uint32_t draw_index,
1832                           struct mi_value max)
1833 {
1834    struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
1835    pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
1836 
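   /* MI_PREDICATE_RESULT is what PredicateEnable on the following 3DPRIMITIVE
    * tests, so this single store both bounds the draw index and honors the
    * conditional render predicate.
    */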
1837    mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
1838 }
1839 
1840 static void
1841 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
1842                                struct mi_builder *b,
1843                                uint32_t draw_index,
1844                                struct mi_value max)
1845 {
1846    if (cmd_buffer->state.conditional_render_enabled) {
1847       emit_draw_count_predicate_with_conditional_render(
1848             cmd_buffer, b, draw_index, mi_value_ref(b, max));
1849    } else {
1850       emit_draw_count_predicate(cmd_buffer, b, draw_index);
1851    }
1852 }
1853 
1854 static void
1855 emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
1856                           struct anv_address indirect_data_addr,
1857                           uint64_t indirect_data_stride,
1858                           struct anv_address draw_count_addr,
1859                           uint32_t max_draw_count,
1860                           bool indexed)
1861 {
1862 #if GFX_VER < 11
1863    struct anv_graphics_pipeline *pipeline =
1864       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1865    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1866 #endif
1867 
1868    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1869 
1870    struct mi_builder b;
1871    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1872    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
1873    mi_builder_set_mocs(&b, mocs);
1874    struct mi_value max =
1875       prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
1876 
1877    for (uint32_t i = 0; i < max_draw_count; i++) {
1878       struct anv_address draw =
1879          anv_address_add(indirect_data_addr, i * indirect_data_stride);
1880 
1881       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
1882 
1883 #if GFX_VER < 11
1884       if (vs_prog_data->uses_firstvertex ||
1885           vs_prog_data->uses_baseinstance) {
1886          emit_base_vertex_instance_bo(cmd_buffer,
1887                                       anv_address_add(draw, indexed ? 12 : 8));
1888       }
1889       if (vs_prog_data->uses_drawid)
1890          emit_draw_index(cmd_buffer, i);
1891 
1892       /* Emitting draw index or vertex index BOs may result in needing
1893        * additional VF cache flushes.
1894        */
1895       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1896 #endif
1897 
1898       load_indirect_parameters(cmd_buffer, draw, indexed, i);
1899 
1900       /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1901        * first one was handled by cmd_buffer_flush_gfx_state.
1902        */
1903       if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1904          genX(emit_hs)(cmd_buffer);
1905       genX(emit_ds)(cmd_buffer);
1906 
1907       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1908       anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1909 #if GFX_VERx10 >= 125
1910          prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1911 #endif
1912          prim.IndirectParameterEnable  = true;
1913          prim.PredicateEnable          = true;
1914          prim.VertexAccessType         = indexed ? RANDOM : SEQUENTIAL;
1915 #if GFX_VER >= 11
1916          prim.ExtendedParametersPresent = true;
1917 #endif
1918       }
1919 
1920       genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1921                                             cmd_buffer->device,
1922                                             cmd_buffer->state.gfx.primitive_topology,
1923                                             1);
1924       genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1925 
1926       update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1927    }
1928 
1929    mi_value_unref(&b, max);
1930 }
1931 
1932 void genX(CmdDrawIndirectCount)(
1933     VkCommandBuffer                             commandBuffer,
1934     VkBuffer                                    _buffer,
1935     VkDeviceSize                                offset,
1936     VkBuffer                                    _countBuffer,
1937     VkDeviceSize                                countBufferOffset,
1938     uint32_t                                    maxDrawCount,
1939     uint32_t                                    stride)
1940 {
1941    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1942    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1943    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
1944 
1945    if (anv_batch_has_error(&cmd_buffer->batch))
1946       return;
1947 
1948    anv_measure_snapshot(cmd_buffer,
1949                         INTEL_SNAPSHOT_DRAW,
1950                         "draw indirect count",
1951                         0);
1952    trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
1953 
1954    struct anv_address indirect_data_address =
1955       anv_address_add(buffer->address, offset);
1956    struct anv_address count_address =
1957       anv_address_add(count_buffer->address, countBufferOffset);
1958    stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
1959 
1960    if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
1961       genX(cmd_buffer_emit_indirect_generated_draws)(
1962          cmd_buffer,
1963          indirect_data_address,
1964          stride,
1965          count_address,
1966          maxDrawCount,
1967          false /* indexed */);
1968    } else {
1969       emit_indirect_count_draws(cmd_buffer,
1970                                 indirect_data_address,
1971                                 stride,
1972                                 count_address,
1973                                 maxDrawCount,
1974                                 false /* indexed */);
1975    }
1976 
1977    trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
1978 }
1979 
1980 void genX(CmdDrawIndexedIndirectCount)(
1981     VkCommandBuffer                             commandBuffer,
1982     VkBuffer                                    _buffer,
1983     VkDeviceSize                                offset,
1984     VkBuffer                                    _countBuffer,
1985     VkDeviceSize                                countBufferOffset,
1986     uint32_t                                    maxDrawCount,
1987     uint32_t                                    stride)
1988 {
1989    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1990    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1991    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
1992 
1993    if (anv_batch_has_error(&cmd_buffer->batch))
1994       return;
1995 
1996    anv_measure_snapshot(cmd_buffer,
1997                         INTEL_SNAPSHOT_DRAW,
1998                         "draw indexed indirect count",
1999                         0);
2000    trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
2001 
2002    struct anv_address indirect_data_address =
2003       anv_address_add(buffer->address, offset);
2004    struct anv_address count_address =
2005       anv_address_add(count_buffer->address, countBufferOffset);
2006    stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
2007 
2008    if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2009       genX(cmd_buffer_emit_indirect_generated_draws)(
2010          cmd_buffer,
2011          indirect_data_address,
2012          stride,
2013          count_address,
2014          maxDrawCount,
2015          true /* indexed */);
2016    } else {
2017       emit_indirect_count_draws(cmd_buffer,
2018                                 indirect_data_address,
2019                                 stride,
2020                                 count_address,
2021                                 maxDrawCount,
2022                                 true /* indexed */);
2023    }
2024 
2025    trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
2026 
2027 }
2028 
2029 void genX(CmdBeginTransformFeedbackEXT)(
2030     VkCommandBuffer                             commandBuffer,
2031     uint32_t                                    firstCounterBuffer,
2032     uint32_t                                    counterBufferCount,
2033     const VkBuffer*                             pCounterBuffers,
2034     const VkDeviceSize*                         pCounterBufferOffsets)
2035 {
2036    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2037 
2038    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2039    assert(counterBufferCount <= MAX_XFB_BUFFERS);
2040    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2041 
2042    trace_intel_begin_xfb(&cmd_buffer->trace);
2043 
2044    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2045     *
2046     *    "Ssoftware must ensure that no HW stream output operations can be in
2047     *    process or otherwise pending at the point that the MI_LOAD/STORE
2048     *    commands are processed. This will likely require a pipeline flush."
2049     */
2050    anv_add_pending_pipe_bits(cmd_buffer,
2051                              ANV_PIPE_CS_STALL_BIT,
2052                              "begin transform feedback");
2053    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2054 
2055    for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
2056       /* If we have a counter buffer, this is a resume so we need to load the
2057        * value into the streamout offset register.  Otherwise, this is a begin
2058        * and we need to reset it to zero.
2059        */
2060       if (pCounterBuffers &&
2061           idx >= firstCounterBuffer &&
2062           idx - firstCounterBuffer < counterBufferCount &&
2063           pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
2064          uint32_t cb_idx = idx - firstCounterBuffer;
2065          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2066          uint64_t offset = pCounterBufferOffsets ?
2067                            pCounterBufferOffsets[cb_idx] : 0;
2068 
2069          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2070             lrm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2071             lrm.MemoryAddress    = anv_address_add(counter_buffer->address,
2072                                                    offset);
2073          }
2074       } else {
2075          anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2076             lri.RegisterOffset   = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2077             lri.DataDWord        = 0;
2078          }
2079       }
2080    }
2081 
2082    cmd_buffer->state.xfb_enabled = true;
2083    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2084 }
2085 
2086 void genX(CmdEndTransformFeedbackEXT)(
2087     VkCommandBuffer                             commandBuffer,
2088     uint32_t                                    firstCounterBuffer,
2089     uint32_t                                    counterBufferCount,
2090     const VkBuffer*                             pCounterBuffers,
2091     const VkDeviceSize*                         pCounterBufferOffsets)
2092 {
2093    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2094 
2095    assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2096    assert(counterBufferCount <= MAX_XFB_BUFFERS);
2097    assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2098 
2099    /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2100     *
2101     *    "Ssoftware must ensure that no HW stream output operations can be in
2102     *    process or otherwise pending at the point that the MI_LOAD/STORE
2103     *    commands are processed. This will likely require a pipeline flush."
2104     */
2105    anv_add_pending_pipe_bits(cmd_buffer,
2106                              ANV_PIPE_CS_STALL_BIT,
2107                              "end transform feedback");
2108    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2109 
2110    for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
2111       unsigned idx = firstCounterBuffer + cb_idx;
2112 
2113       /* If we have a counter buffer, save the current streamout offset by
2114        * storing the SO_WRITE_OFFSET register into it so a later resume can
2115        * reload the value.  Otherwise the offset is simply discarded.
2116        */
2117       if (pCounterBuffers &&
2118           cb_idx < counterBufferCount &&
2119           pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
2120          ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2121          uint64_t offset = pCounterBufferOffsets ?
2122                            pCounterBufferOffsets[cb_idx] : 0;
2123 
2124          anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2125             srm.MemoryAddress    = anv_address_add(counter_buffer->address,
2126                                                    offset);
2127             srm.RegisterAddress  = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2128          }
2129       }
2130    }
2131 
2132    trace_intel_end_xfb(&cmd_buffer->trace);
2133 
2134    cmd_buffer->state.xfb_enabled = false;
2135    cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2136 }
2137 
2138 #if GFX_VERx10 >= 125
2139 
2140 void
2141 genX(CmdDrawMeshTasksEXT)(
2142       VkCommandBuffer commandBuffer,
2143       uint32_t x,
2144       uint32_t y,
2145       uint32_t z)
2146 {
2147    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2148 
2149    if (anv_batch_has_error(&cmd_buffer->batch))
2150       return;
2151 
2152    anv_measure_snapshot(cmd_buffer,
2153                         INTEL_SNAPSHOT_DRAW,
2154                         "draw mesh", x * y * z);
2155 
2156    trace_intel_begin_draw_mesh(&cmd_buffer->trace);
2157 
2158    /* TODO(mesh): Check if this is not emitting more packets than we need. */
2159    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2160 
2161    if (cmd_buffer->state.conditional_render_enabled)
2162       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2163 
2164    anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
2165       m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
2166       m.ThreadGroupCountX = x;
2167       m.ThreadGroupCountY = y;
2168       m.ThreadGroupCountZ = z;
2169    }
2170 
2171    trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
2172 }
2173 
2174 #define GFX125_3DMESH_TG_COUNT 0x26F0
2175 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
2176 
2177 static void
2178 mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
2179                                         struct mi_builder *b,
2180                                         struct anv_address addr,
2181                                         bool emit_xp0,
2182                                         uint32_t xp0)
2183 {
2184    const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
2185    const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
2186    const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
2187 
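   /* When launched indirectly, 3DMESH_3D takes its thread group counts from
    * GFX125_3DMESH_TG_COUNT (X) and the XP1/XP2 registers (Y/Z).  XP0 is only
    * written when a task/mesh shader consumes gl_DrawID, and then carries the
    * draw index.
    */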
2188    mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
2189                mi_mem32(anv_address_add(addr, groupCountXOff)));
2190 
2191    mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
2192                mi_mem32(anv_address_add(addr, groupCountYOff)));
2193 
2194    mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
2195                mi_mem32(anv_address_add(addr, groupCountZOff)));
2196 
2197    if (emit_xp0)
2198       mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
2199 }
2200 
2201 static void
2202 emit_indirect_3dmesh_3d(struct anv_batch *batch,
2203                         bool predicate_enable,
2204                         bool uses_drawid)
2205 {
2206    uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
2207    uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
2208                    .PredicateEnable           = predicate_enable,
2209                    .IndirectParameterEnable   = true,
2210                    .ExtendedParameter0Present = uses_drawid);
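   /* When gl_DrawID is used, the packet grows by one dword for
    * ExtendedParameter0.  We zero it below; the real draw index is presumably
    * taken from GFX10_3DPRIM_XP(0), which the callers program per draw.
    */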
2211    if (uses_drawid)
2212       dw[len - 1] = 0;
2213 }
2214 
2215 void
2216 genX(CmdDrawMeshTasksIndirectEXT)(
2217     VkCommandBuffer                             commandBuffer,
2218     VkBuffer                                    _buffer,
2219     VkDeviceSize                                offset,
2220     uint32_t                                    drawCount,
2221     uint32_t                                    stride)
2222 {
2223    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2224    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2225    struct anv_graphics_pipeline *pipeline =
2226       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2227    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2228    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2229    struct anv_cmd_state *cmd_state = &cmd_buffer->state;
2230 
2231    if (anv_batch_has_error(&cmd_buffer->batch))
2232       return;
2233 
2234    anv_measure_snapshot(cmd_buffer,
2235                         INTEL_SNAPSHOT_DRAW,
2236                         "draw mesh indirect", drawCount);
2237 
2238    trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
2239 
2240    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2241 
2242    if (cmd_state->conditional_render_enabled)
2243       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2244 
2245    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2246                        mesh_prog_data->uses_drawid;
2247    struct mi_builder b;
2248    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2249 
2250    for (uint32_t i = 0; i < drawCount; i++) {
2251       struct anv_address draw = anv_address_add(buffer->address, offset);
2252 
2253       mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2254 
2255       emit_indirect_3dmesh_3d(&cmd_buffer->batch,
2256             cmd_state->conditional_render_enabled, uses_drawid);
2257 
2258       offset += stride;
2259    }
2260 
2261    trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2262 }
2263 
2264 void
2265 genX(CmdDrawMeshTasksIndirectCountEXT)(
2266     VkCommandBuffer                             commandBuffer,
2267     VkBuffer                                    _buffer,
2268     VkDeviceSize                                offset,
2269     VkBuffer                                    _countBuffer,
2270     VkDeviceSize                                countBufferOffset,
2271     uint32_t                                    maxDrawCount,
2272     uint32_t                                    stride)
2273 {
2274    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2275    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2276    ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2277    struct anv_graphics_pipeline *pipeline =
2278       anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2279    const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2280    const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2281 
2282    if (anv_batch_has_error(&cmd_buffer->batch))
2283       return;
2284 
2285    anv_measure_snapshot(cmd_buffer,
2286                         INTEL_SNAPSHOT_DRAW,
2287                         "draw mesh indirect count", 0);
2288 
2289    trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
2290 
2291    genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2292 
2293    bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2294                        mesh_prog_data->uses_drawid;
2295 
2296    struct mi_builder b;
2297    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2298    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
2299    mi_builder_set_mocs(&b, mocs);
2300 
2301    struct mi_value max =
2302          prepare_for_draw_count_predicate(
2303             cmd_buffer, &b,
2304             anv_address_add(count_buffer->address, countBufferOffset));
2305 
2306    for (uint32_t i = 0; i < maxDrawCount; i++) {
2307       struct anv_address draw = anv_address_add(buffer->address, offset);
2308 
2309       emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2310 
2311       mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2312 
2313       emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
2314 
2315       offset += stride;
2316    }
2317 
2318    trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, maxDrawCount);
2319 }
2320 
2321 #endif /* GFX_VERx10 >= 125 */
2322