/*
 * Copyright © 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef GENX_CMD_DRAW_GENERATED_INDIRECT_H
#define GENX_CMD_DRAW_GENERATED_INDIRECT_H

#include <assert.h>
#include <stdbool.h>

#include "util/macros.h"

#include "common/intel_genX_state_brw.h"

#include "anv_private.h"
#include "anv_internal_kernels.h"

/* This is the maximum number of items a fragment shader can generate, due to
 * the viewport size.
 */
#define MAX_GENERATED_DRAW_COUNT (8192 * 8192)

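/* Maximum number of draw commands generated per iteration of the ring buffer
 * mode (see cmd_buffer_emit_indirect_generated_draws_inring below); the ring
 * bo is sized to hold this many draws.
 */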
#define MAX_RING_BO_ITEMS (8192)

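/* Emit a dispatch of the generation shader writing item_count draw commands
 * (generated_cmd_stride bytes each) at generated_cmds_addr, sourcing the
 * application's indirect data at indirect_data_addr. Returns the push
 * constant state holding the anv_gen_indirect_params structure so that the
 * caller can patch it later (see cmd_buffer_rewrite_forward_end_addr), or
 * ANV_STATE_NULL on allocation failure.
 */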
static struct anv_state
genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
                                     struct anv_simple_shader *simple_state,
                                     struct anv_address generated_cmds_addr,
                                     uint32_t generated_cmd_stride,
                                     struct anv_address indirect_data_addr,
                                     uint32_t indirect_data_stride,
                                     struct anv_address draw_id_addr,
                                     uint32_t item_base,
                                     uint32_t item_count,
                                     struct anv_address count_addr,
                                     uint32_t max_count,
                                     bool indexed,
                                     uint32_t ring_count)
{
   struct anv_device *device = cmd_buffer->device;

   struct anv_state push_data_state =
      genX(simple_shader_alloc_push)(simple_state,
                                     sizeof(struct anv_gen_indirect_params));
   if (push_data_state.map == NULL)
      return ANV_STATE_NULL;

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const bool use_tbimr = cmd_buffer->state.gfx.dyn_state.use_tbimr;

   struct anv_address draw_count_addr;
   if (anv_address_is_null(count_addr)) {
      draw_count_addr = anv_address_add(
         genX(simple_shader_push_state_address)(simple_state, push_data_state),
         offsetof(struct anv_gen_indirect_params, draw_count));
   } else {
      draw_count_addr = count_addr;
   }

   const bool wa_16011107343 =
      intel_needs_workaround(device->info, 16011107343) &&
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL);
   const bool wa_22018402687 =
      intel_needs_workaround(device->info, 22018402687) &&
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL);

   const uint32_t wa_insts_size =
      ((wa_16011107343 ? GENX(3DSTATE_HS_length) : 0) +
       (wa_22018402687 ? GENX(3DSTATE_DS_length) : 0)) * 4;
   UNUSED const bool protected = cmd_buffer->vk.pool->flags &
                                 VK_COMMAND_POOL_CREATE_PROTECTED_BIT;

   struct anv_state wa_insts_state =
      wa_insts_size ?
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer, wa_insts_size, 4) :
      ANV_STATE_NULL;
   UNUSED uint32_t wa_insts_offset = 0;

#if INTEL_WA_16011107343_GFX_VER
   if (wa_16011107343) {
      memcpy(wa_insts_state.map + wa_insts_offset,
             &pipeline->batch_data[
                protected ?
                pipeline->final.hs_protected.offset :
                pipeline->final.hs.offset],
             GENX(3DSTATE_HS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_HS_length) * 4;
   }
#endif

#if INTEL_WA_22018402687_GFX_VER
   if (wa_22018402687) {
      memcpy(wa_insts_state.map + wa_insts_offset,
             &pipeline->batch_data[
                protected ?
                pipeline->final.ds_protected.offset :
                pipeline->final.ds.offset],
             GENX(3DSTATE_DS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_DS_length) * 4;
   }
#endif

   struct anv_gen_indirect_params *push_data = push_data_state.map;
   *push_data = (struct anv_gen_indirect_params) {
      .wa_insts_addr          = anv_address_physical(
         anv_cmd_buffer_temporary_state_address(cmd_buffer, wa_insts_state)),
      .draw_id_addr           = anv_address_physical(draw_id_addr),
      .indirect_data_addr     = anv_address_physical(indirect_data_addr),
      .indirect_data_stride   = indirect_data_stride,
      .flags                  = (use_tbimr ? ANV_GENERATED_FLAG_TBIMR : 0) |
                                (indexed ? ANV_GENERATED_FLAG_INDEXED : 0) |
                                (cmd_buffer->state.conditional_render_enabled ?
                                 ANV_GENERATED_FLAG_PREDICATED : 0) |
                                ((vs_prog_data->uses_firstvertex ||
                                  vs_prog_data->uses_baseinstance) ?
                                 ANV_GENERATED_FLAG_BASE : 0) |
                                (vs_prog_data->uses_drawid ? ANV_GENERATED_FLAG_DRAWID : 0) |
                                (!anv_address_is_null(count_addr) ?
                                 ANV_GENERATED_FLAG_COUNT : 0) |
                                (ring_count != 0 ? ANV_GENERATED_FLAG_RING_MODE : 0),
      .mocs                   = anv_mocs(device, indirect_data_addr.bo,
                                         ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
      .cmd_primitive_size     = wa_insts_size + generated_cmd_stride,
      .draw_base              = item_base,
      .max_draw_count         = max_count,
      .ring_count             = ring_count,
      .instance_multiplier    = pipeline->instance_multiplier,
      .draw_count             = anv_address_is_null(count_addr) ? max_count : 0,
      .generated_cmds_addr    = anv_address_physical(generated_cmds_addr),
      .draw_count_addr        = anv_address_physical(draw_count_addr),
   };

   genX(emit_simple_shader_dispatch)(simple_state, item_count, push_data_state);

   return push_data_state;
}

static void
genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_buffer)
{
   anv_batch_emit_ensure_space(&cmd_buffer->generation.batch, 4);

   trace_intel_begin_generate_draws(&cmd_buffer->trace);

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress =
         anv_batch_current_address(&cmd_buffer->generation.batch);
   }

   cmd_buffer->generation.return_addr = anv_batch_current_address(&cmd_buffer->batch);

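   /* Resume the CS prefetch once the generation batch jumps back here: an
    * MI_ARB_CHECK with PreParserDisable = false re-enables pre-parsing (the
    * same packet is used as resume_prefetch in the ring mode below).
    */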
#if GFX_VER >= 12
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
      arb.PreParserDisableMask = true;
      arb.PreParserDisable = false;
   }
#endif

   trace_intel_end_generate_draws(&cmd_buffer->trace);

   struct anv_shader_bin *gen_kernel;
   VkResult ret =
      anv_device_get_internal_shader(
         cmd_buffer->device,
         ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
         &gen_kernel);
   if (ret != VK_SUCCESS) {
      anv_batch_set_error(&cmd_buffer->batch, ret);
      return;
   }

   struct anv_device *device = cmd_buffer->device;
   struct anv_simple_shader *state = &cmd_buffer->generation.shader_state;
   *state = (struct anv_simple_shader) {
      .device               = device,
      .cmd_buffer           = cmd_buffer,
      .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
      .general_state_stream = &cmd_buffer->general_state_stream,
      .batch                = &cmd_buffer->generation.batch,
      .kernel               = gen_kernel,
      .l3_config            = device->internal_kernels_l3_config,
      .urb_cfg              = &cmd_buffer->state.gfx.urb_cfg,
   };

   genX(emit_simple_shader_init)(state);
}

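/* On Gfx11+ the draw id is delivered through the extended parameters of
 * 3DPRIMITIVE, so no buffer is required. On older Gens, allocate a temporary
 * buffer for the generation shader to write draw ids into, which is then
 * sourced as a vertex buffer.
 */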
static struct anv_address
genX(cmd_buffer_get_draw_id_addr)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t draw_id_count)
{
#if GFX_VER >= 11
   return ANV_NULL_ADDRESS;
#else
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   if (!vs_prog_data->uses_drawid)
      return ANV_NULL_ADDRESS;

   struct anv_state draw_id_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 4 * draw_id_count, 4);
   return anv_cmd_buffer_temporary_state_address(cmd_buffer, draw_id_state);
#endif
}

static uint32_t
genX(cmd_buffer_get_generated_draw_stride)(struct anv_cmd_buffer *cmd_buffer)
{
   /* With the extended parameters in 3DPRIMITIVE on Gfx11+ we can emit
    * everything. Prior to this, we need to emit a couple of
    * VERTEX_BUFFER_STATE.
    */
#if GFX_VER >= 11
   return 4 * GENX(3DPRIMITIVE_EXTENDED_length);
#else
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   uint32_t len = 0;

   if (vs_prog_data->uses_firstvertex ||
       vs_prog_data->uses_baseinstance ||
       vs_prog_data->uses_drawid) {
      len += 4; /* 3DSTATE_VERTEX_BUFFERS */

      if (vs_prog_data->uses_firstvertex ||
          vs_prog_data->uses_baseinstance)
         len += 4 * GENX(VERTEX_BUFFER_STATE_length);

      if (vs_prog_data->uses_drawid)
         len += 4 * GENX(VERTEX_BUFFER_STATE_length);
   }

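   /* E.g. with both firstvertex and drawid in use, and assuming the usual
    * Gfx9 packet sizes (4 dwords per VERTEX_BUFFER_STATE, 7 dwords for
    * 3DPRIMITIVE), this works out to 4 + 16 + 16 + 28 = 64 bytes per draw.
    */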
   return len + 4 * GENX(3DPRIMITIVE_length);
#endif
}

static void
genX(cmd_buffer_rewrite_forward_end_addr)(struct anv_cmd_buffer *cmd_buffer,
                                          struct anv_gen_indirect_params *params)
{
   /* We don't know the end_addr until we have emitted all the generation
    * draws. Go back and patch the end address into all the push parameters.
    */
   uint64_t end_addr =
      anv_address_physical(anv_batch_current_address(&cmd_buffer->batch));
   while (params != NULL) {
      params->end_addr = end_addr;
      params = params->prev;
   }
}

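/* Generate the draw commands directly into the main batch, reserving enough
 * space up front for all max_draw_count draws. This is the path used below
 * the ring buffer threshold (see cmd_buffer_emit_indirect_generated_draws at
 * the end of this file).
 */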
static void
genX(cmd_buffer_emit_indirect_generated_draws_inplace)(struct anv_cmd_buffer *cmd_buffer,
                                                       struct anv_address indirect_data_addr,
                                                       uint32_t indirect_data_stride,
                                                       struct anv_address count_addr,
                                                       uint32_t max_draw_count,
                                                       bool indexed)
{
   const bool start_generation_batch =
      anv_address_is_null(cmd_buffer->generation.return_addr);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   struct anv_address draw_id_addr =
      genX(cmd_buffer_get_draw_id_addr)(cmd_buffer, max_draw_count);

#if GFX_VER == 9
   /* Mark the VB-0 as using the entire dynamic state pool area, but only for
    * the draw call starting the generation batch. All the following ones will
    * use the same area.
    */
   if (start_generation_batch) {
      struct anv_device *device = cmd_buffer->device;
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
         cmd_buffer, 0,
         (struct anv_address) {
            .offset = device->physical->va.dynamic_state_pool.addr,
         },
         device->physical->va.dynamic_state_pool.size);
   }

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (vs_prog_data->uses_baseinstance ||
       vs_prog_data->uses_firstvertex) {
      /* We're using the indirect buffer directly to source base instance &
       * first vertex values. Mark the entire area as used.
       */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
                                                     indirect_data_addr,
                                                     indirect_data_stride * max_draw_count);
   }

   if (vs_prog_data->uses_drawid) {
      /* Mark the whole draw id buffer as used. */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_DRAWID_VB_INDEX,
                                                     draw_id_addr,
                                                     sizeof(uint32_t) * max_draw_count);
   }
#endif

   /* Apply the pipeline flush here so the indirect data is available for the
    * generation shader.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (start_generation_batch)
      genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);

   /* Emit the 3D state in the main batch. */
   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   const uint32_t draw_cmd_stride =
      genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);

   struct anv_gen_indirect_params *last_params = NULL;
   uint32_t item_base = 0;
   while (item_base < max_draw_count) {
      const uint32_t item_count = MIN2(max_draw_count - item_base,
                                       MAX_GENERATED_DRAW_COUNT);
      const uint32_t draw_cmd_size = item_count * draw_cmd_stride;

      /* Ensure we have enough contiguous space for all the draws so that the
       * compute shader can edit all the 3DPRIMITIVEs from a single base
       * address.
       *
       * TODO: we might have to split that if the amount of space is too
       *       large (at 1Mb?).
       */
      VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch,
                                                    draw_cmd_size);
      if (result != VK_SUCCESS)
         return;

      struct anv_state params_state =
         genX(cmd_buffer_emit_generate_draws)(
            cmd_buffer,
            &cmd_buffer->generation.shader_state,
            anv_batch_current_address(&cmd_buffer->batch),
            draw_cmd_stride,
            indirect_data_addr,
            indirect_data_stride,
            anv_address_add(draw_id_addr, 4 * item_base),
            item_base,
            item_count,
            count_addr,
            max_draw_count,
            indexed,
            0 /* ring_count */);
      struct anv_gen_indirect_params *params = params_state.map;
      if (params == NULL)
         return;

      anv_batch_advance(&cmd_buffer->batch, draw_cmd_size);

      item_base += item_count;

      params->prev = last_params;
      last_params = params;
   }

   genX(cmd_buffer_rewrite_forward_end_addr)(cmd_buffer, last_params);

#if GFX_VER == 9
   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
#endif
}

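/* Generate the draw commands into a fixed-size ring bo rather than the main
 * batch. The main batch jumps into the ring and, when more draws remain than
 * the ring holds, the ring jumps back to the generation shader to refill
 * itself, so at most MAX_RING_BO_ITEMS draw commands are resident at a time.
 */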
static void
genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd_buffer,
                                                      struct anv_address indirect_data_addr,
                                                      uint32_t indirect_data_stride,
                                                      struct anv_address count_addr,
                                                      uint32_t max_draw_count,
                                                      bool indexed)
{
   struct anv_device *device = cmd_buffer->device;

   genX(flush_pipeline_select_3d)(cmd_buffer);

   const uint32_t draw_cmd_stride =
      genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);

   if (cmd_buffer->generation.ring_bo == NULL) {
      const uint32_t bo_size = align(
#if GFX_VER >= 12
         GENX(MI_ARB_CHECK_length) * 4 +
#endif
         draw_cmd_stride * MAX_RING_BO_ITEMS +
#if GFX_VER == 9
         4 * MAX_RING_BO_ITEMS +
#endif
         GENX(MI_BATCH_BUFFER_START_length) * 4,
         4096);
      VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, bo_size,
                                          &cmd_buffer->generation.ring_bo);
      if (result != VK_SUCCESS) {
         anv_batch_set_error(&cmd_buffer->batch, result);
         return;
      }
   }

   /* How many items will be generated by each iteration of the generation
    * shader dispatch.
    */
   const uint32_t ring_count = MIN2(MAX_RING_BO_ITEMS, max_draw_count);

   /* The ring bo has the following layout:
    *
    *   --------------------------------------------------
    *   | MI_ARB_CHECK to resume CS prefetch (Gfx12+)    |
    *   |------------------------------------------------|
    *   |            ring_count * 3DPRIMITIVE            |
    *   |------------------------------------------------|
    *   | jump instruction (either back to generate more |
    *   | commands or to the next set of commands)       |
    *   |------------------------------------------------|
    *   |          draw ids (only used on Gfx9)          |
    *   --------------------------------------------------
    */

   struct anv_address draw_id_addr = (struct anv_address) {
      .bo     = cmd_buffer->generation.ring_bo,
      .offset = ring_count * draw_cmd_stride +
                GENX(MI_BATCH_BUFFER_START_length) * 4,
   };

   struct anv_address draw_cmds_addr = (struct anv_address) {
      .bo = cmd_buffer->generation.ring_bo,
#if GFX_VER >= 12
      .offset = GENX(MI_ARB_CHECK_length) * 4,
#endif
   };

#if GFX_VER >= 12
   struct GENX(MI_ARB_CHECK) resume_prefetch = {
      .PreParserDisableMask = true,
      .PreParserDisable = false,
   };
   GENX(MI_ARB_CHECK_pack)(NULL, cmd_buffer->generation.ring_bo->map,
                           &resume_prefetch);
#endif

#if GFX_VER == 9
   /* Mark the VB-0 as using the entire ring_bo, but only for the draw call
    * starting the generation batch. All the following ones will use the same
    * area.
    */
   genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
      cmd_buffer, 0,
      (struct anv_address) {
         .bo = cmd_buffer->generation.ring_bo,
      },
      cmd_buffer->generation.ring_bo->size);

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (vs_prog_data->uses_baseinstance ||
       vs_prog_data->uses_firstvertex) {
      /* We're using the indirect buffer directly to source base instance &
       * first vertex values. Mark the entire area as used.
       */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
                                                     indirect_data_addr,
                                                     indirect_data_stride * max_draw_count);
   }

   if (vs_prog_data->uses_drawid) {
      /* Mark the whole draw id buffer as used. */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_DRAWID_VB_INDEX,
                                                     draw_id_addr,
                                                     sizeof(uint32_t) * max_draw_count);
   }
#endif

   /* Apply the pipeline flush here so the indirect data is available for the
    * generation shader.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   trace_intel_begin_generate_draws(&cmd_buffer->trace);

   /***
    * This is where the command buffer below will jump back to if we need to
    * generate more draws.
    */
   struct anv_address gen_addr = anv_batch_current_address(&cmd_buffer->batch);

   struct anv_shader_bin *gen_kernel;
   VkResult ret =
      anv_device_get_internal_shader(
         cmd_buffer->device,
         ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
         &gen_kernel);
   if (ret != VK_SUCCESS) {
      anv_batch_set_error(&cmd_buffer->batch, ret);
      return;
   }

   struct anv_simple_shader simple_state = (struct anv_simple_shader) {
      .device               = device,
      .cmd_buffer           = cmd_buffer,
      .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
      .general_state_stream = &cmd_buffer->general_state_stream,
      .batch                = &cmd_buffer->batch,
      .kernel               = gen_kernel,
      .l3_config            = device->internal_kernels_l3_config,
      .urb_cfg              = &cmd_buffer->state.gfx.urb_cfg,
   };
   genX(emit_simple_shader_init)(&simple_state);

   struct anv_state params_state =
      genX(cmd_buffer_emit_generate_draws)(
         cmd_buffer,
         &simple_state,
         draw_cmds_addr,
         draw_cmd_stride,
         indirect_data_addr,
         indirect_data_stride,
         draw_id_addr,
         0 /* item_base */,
         MIN2(MAX_RING_BO_ITEMS, max_draw_count) /* item_count */,
         count_addr,
         max_draw_count,
         indexed,
         ring_count);
   struct anv_gen_indirect_params *params = params_state.map;
   if (params == NULL)
      return;

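   /* The generation shader writes the draw commands through the data port,
    * so flush the data cache (with a CS stall) before the command streamer
    * parses the ring.
    */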
   anv_add_pending_pipe_bits(cmd_buffer,
#if GFX_VER == 9
                             ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
#endif
                             ANV_PIPE_DATA_CACHE_FLUSH_BIT |
                             ANV_PIPE_CS_STALL_BIT,
                             "after generation flush");

   trace_intel_end_generate_draws(&cmd_buffer->trace);

   /* Emit the 3D state in the main batch. */
   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   if (max_draw_count > 0) {
#if GFX_VER >= 12
      /* Prior to Gfx12 we cannot disable the CS prefetch, but it doesn't
       * matter as the prefetch shouldn't follow the MI_BATCH_BUFFER_START.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
         arb.PreParserDisableMask = true;
         arb.PreParserDisable = true;
      }
#endif

      /* Jump into the ring buffer. */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.AddressSpaceIndicator = ASI_PPGTT;
         bbs.BatchBufferStartAddress = (struct anv_address) {
            .bo = cmd_buffer->generation.ring_bo,
         };
      }

      /***
       * This is the location the ring buffer jumps back to when it needs to
       * generate more draw calls. We do the following:
       *    - wait for draws in the ring buffer to complete (cs stall) so we're
       *      sure the push constant data we're about to edit is not read anymore
       *    - increment the base draw number by the number of draws
       *      executed in the ring
       *    - invalidate the constant cache since the
       *      anv_gen_indirect_params::draw_base field is updated
       *    - jump back to the generation shader
       */
      struct anv_address inc_addr =
         anv_batch_current_address(&cmd_buffer->batch);

      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
                                ANV_PIPE_CS_STALL_BIT,
                                "after generated draws batch");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

      struct anv_address draw_base_addr = anv_address_add(
         genX(simple_shader_push_state_address)(
            &simple_state, params_state),
         offsetof(struct anv_gen_indirect_params, draw_base));

      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device,
                                                 &draw_base_addr);
      mi_builder_set_mocs(&b, mocs);
      mi_builder_set_write_check(&b, true);

      mi_store(&b, mi_mem32(draw_base_addr),
                   mi_iadd(&b, mi_mem32(draw_base_addr),
                               mi_imm(ring_count)));

      /* Make sure the MI writes are globally observable */
      mi_ensure_write_fence(&b);

      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
                                "after generated draws batch increment");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.AddressSpaceIndicator = ASI_PPGTT;
         bbs.BatchBufferStartAddress = gen_addr;
      }

      /***
       * This is the location the ring buffer jumps to once all the draw
       * calls have executed.
       */
      struct anv_address end_addr = anv_batch_current_address(&cmd_buffer->batch);

      /* Reset the draw_base field in case we ever replay the command buffer. */
      mi_store(&b, mi_mem32(draw_base_addr), mi_imm(0));

      /* Make sure the MI writes are globally observable */
      mi_ensure_write_fence(&b);

      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
                                "after generated draws end");

      params->gen_addr = anv_address_physical(inc_addr);
      params->end_addr = anv_address_physical(end_addr);
   }
}

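/* Entry point for generated indirect draws. Draws below the ring buffer
 * threshold are generated in the main batch (inplace); larger draw counts go
 * through the fixed-size ring bo (inring) to bound the batch memory used.
 */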
static void
genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer,
                                               struct anv_address indirect_data_addr,
                                               uint32_t indirect_data_stride,
                                               struct anv_address count_addr,
                                               uint32_t max_draw_count,
                                               bool indexed)
{
   /* In order to have the vertex fetch gather the data, we need a non-zero
    * stride. The application may provide a 0 stride when draw_count is 1,
    * but we need a correct value for VERTEX_BUFFER_STATE::BufferPitch, so
    * ensure the caller sets this correctly:
    *
    * Vulkan spec, vkCmdDrawIndirect:
    *
    *   "If drawCount is less than or equal to one, stride is ignored."
    */
   assert(indirect_data_stride > 0);

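   /* Reminder: for vkCmdDrawIndirect the indirect structure is 16 bytes
    * (VkDrawIndirectCommand) and for vkCmdDrawIndexedIndirect it is 20 bytes
    * (VkDrawIndexedIndirectCommand); presumably the caller substitutes the
    * structure size when the application passes a 0 stride.
    */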
   const bool use_ring_buffer = max_draw_count >=
      cmd_buffer->device->physical->instance->generated_indirect_ring_threshold;
   if (use_ring_buffer) {
      genX(cmd_buffer_emit_indirect_generated_draws_inring)(cmd_buffer,
                                                            indirect_data_addr,
                                                            indirect_data_stride,
                                                            count_addr,
                                                            max_draw_count,
                                                            indexed);
   } else {
      genX(cmd_buffer_emit_indirect_generated_draws_inplace)(cmd_buffer,
                                                             indirect_data_addr,
                                                             indirect_data_stride,
                                                             count_addr,
                                                             max_draw_count,
                                                             indexed);
   }
}

#endif /* GENX_CMD_DRAW_GENERATED_INDIRECT_H */