/*
 * Copyright © 2022 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef GENX_CMD_DRAW_GENERATED_INDIRECT_H
#define GENX_CMD_DRAW_GENERATED_INDIRECT_H

#include <assert.h>
#include <stdbool.h>

#include "util/macros.h"

#include "common/intel_genX_state_brw.h"

#include "anv_private.h"
#include "anv_internal_kernels.h"
/* This is the maximum number of items the generation fragment shader can
 * produce in one dispatch, limited by the maximum viewport size.
 */
#define MAX_GENERATED_DRAW_COUNT (8192 * 8192)

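/* Maximum number of draws generated per pass through the ring buffer BO used
 * by the "inring" path below. The ring is rerun until all requested draws
 * have been generated.
 */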
#define MAX_RING_BO_ITEMS (8192)

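/* Dispatch the generation shader to produce item_count draw commands.
 *
 * The shader reads the application's indirect data at indirect_data_addr
 * (one entry every indirect_data_stride bytes, starting at draw index
 * item_base) and writes generated_cmd_stride bytes of commands per draw at
 * generated_cmds_addr. The returned state holds the anv_gen_indirect_params
 * push constants so the caller can patch fields afterwards (end_addr, prev
 * chain); ANV_STATE_NULL is returned if the push constant allocation failed.
 */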
static struct anv_state
genX(cmd_buffer_emit_generate_draws)(struct anv_cmd_buffer *cmd_buffer,
                                     struct anv_simple_shader *simple_state,
                                     struct anv_address generated_cmds_addr,
                                     uint32_t generated_cmd_stride,
                                     struct anv_address indirect_data_addr,
                                     uint32_t indirect_data_stride,
                                     struct anv_address draw_id_addr,
                                     uint32_t item_base,
                                     uint32_t item_count,
                                     struct anv_address count_addr,
                                     uint32_t max_count,
                                     bool indexed,
                                     uint32_t ring_count)
{
   struct anv_device *device = cmd_buffer->device;

   struct anv_state push_data_state =
      genX(simple_shader_alloc_push)(simple_state,
                                     sizeof(struct anv_gen_indirect_params));
   if (push_data_state.map == NULL)
      return ANV_STATE_NULL;

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const bool use_tbimr = cmd_buffer->state.gfx.dyn_state.use_tbimr;

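   /* When the application did not provide a count buffer, point
    * draw_count_addr at the draw_count field of our own push constants,
    * which is initialized to max_count below.
    */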
   struct anv_address draw_count_addr;
   if (anv_address_is_null(count_addr)) {
      draw_count_addr = anv_address_add(
         genX(simple_shader_push_state_address)(simple_state, push_data_state),
         offsetof(struct anv_gen_indirect_params, draw_count));
   } else {
      draw_count_addr = count_addr;
   }

   const bool wa_16011107343 =
      intel_needs_workaround(device->info, 16011107343) &&
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL);
   const bool wa_22018402687 =
      intel_needs_workaround(device->info, 22018402687) &&
      anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL);

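   /* To handle workarounds 16011107343 and 22018402687, the pipeline's
    * 3DSTATE_HS/3DSTATE_DS packets are copied into a temporary buffer so the
    * generation shader can re-emit them with each generated draw (their size
    * is folded into cmd_primitive_size below).
    */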
   const uint32_t wa_insts_size =
      ((wa_16011107343 ? GENX(3DSTATE_HS_length) : 0) +
       (wa_22018402687 ? GENX(3DSTATE_DS_length) : 0)) * 4;
   UNUSED const bool protected = cmd_buffer->vk.pool->flags &
                                 VK_COMMAND_POOL_CREATE_PROTECTED_BIT;

   struct anv_state wa_insts_state =
      wa_insts_size ?
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer, wa_insts_size, 4) :
      ANV_STATE_NULL;
   UNUSED uint32_t wa_insts_offset = 0;

#if INTEL_WA_16011107343_GFX_VER
   if (wa_16011107343) {
      memcpy(wa_insts_state.map + wa_insts_offset,
             &pipeline->batch_data[
                protected ?
                pipeline->final.hs_protected.offset :
                pipeline->final.hs.offset],
             GENX(3DSTATE_HS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_HS_length) * 4;
   }
#endif

#if INTEL_WA_22018402687_GFX_VER
   if (wa_22018402687) {
      memcpy(wa_insts_state.map + wa_insts_offset,
             &pipeline->batch_data[
                protected ?
                pipeline->final.ds_protected.offset :
                pipeline->final.ds.offset],
             GENX(3DSTATE_DS_length) * 4);
      wa_insts_offset += GENX(3DSTATE_DS_length) * 4;
   }
#endif

   struct anv_gen_indirect_params *push_data = push_data_state.map;
   *push_data = (struct anv_gen_indirect_params) {
      .wa_insts_addr = anv_address_physical(
         anv_cmd_buffer_temporary_state_address(cmd_buffer, wa_insts_state)),
      .draw_id_addr = anv_address_physical(draw_id_addr),
      .indirect_data_addr = anv_address_physical(indirect_data_addr),
      .indirect_data_stride = indirect_data_stride,
      .flags = (use_tbimr ? ANV_GENERATED_FLAG_TBIMR : 0) |
               (indexed ? ANV_GENERATED_FLAG_INDEXED : 0) |
               (cmd_buffer->state.conditional_render_enabled ?
                ANV_GENERATED_FLAG_PREDICATED : 0) |
               ((vs_prog_data->uses_firstvertex ||
                 vs_prog_data->uses_baseinstance) ?
                ANV_GENERATED_FLAG_BASE : 0) |
               (vs_prog_data->uses_drawid ? ANV_GENERATED_FLAG_DRAWID : 0) |
               (!anv_address_is_null(count_addr) ?
                ANV_GENERATED_FLAG_COUNT : 0) |
               (ring_count != 0 ? ANV_GENERATED_FLAG_RING_MODE : 0),
      .mocs = anv_mocs(device, indirect_data_addr.bo,
                       ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
      .cmd_primitive_size = wa_insts_size + generated_cmd_stride,
      .draw_base = item_base,
      .max_draw_count = max_count,
      .ring_count = ring_count,
      .instance_multiplier = pipeline->instance_multiplier,
      .draw_count = anv_address_is_null(count_addr) ? max_count : 0,
      .generated_cmds_addr = anv_address_physical(generated_cmds_addr),
      .draw_count_addr = anv_address_physical(draw_count_addr),
   };

   genX(emit_simple_shader_dispatch)(simple_state, item_count, push_data_state);

   return push_data_state;
}

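/* First-time setup for the "inplace" generated draw path: redirect the main
 * batch into the generation batch with an MI_BATCH_BUFFER_START, remember the
 * return address the generation batch has to jump back to, and initialize the
 * simple shader state used to dispatch the generation kernel.
 */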
static void
genX(cmd_buffer_emit_indirect_generated_draws_init)(struct anv_cmd_buffer *cmd_buffer)
{
   anv_batch_emit_ensure_space(&cmd_buffer->generation.batch, 4);

   trace_intel_begin_generate_draws(&cmd_buffer->trace);

   anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
      bbs.AddressSpaceIndicator = ASI_PPGTT;
      bbs.BatchBufferStartAddress =
         anv_batch_current_address(&cmd_buffer->generation.batch);
   }

   cmd_buffer->generation.return_addr = anv_batch_current_address(&cmd_buffer->batch);

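   /* On Gfx12+, make sure the CS pre-parser is enabled again at the point
    * where execution returns from the generation batch.
    */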
#if GFX_VER >= 12
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
      arb.PreParserDisableMask = true;
      arb.PreParserDisable = false;
   }
#endif

   trace_intel_end_generate_draws(&cmd_buffer->trace);

   struct anv_shader_bin *gen_kernel;
   VkResult ret =
      anv_device_get_internal_shader(
         cmd_buffer->device,
         ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
         &gen_kernel);
   if (ret != VK_SUCCESS) {
      anv_batch_set_error(&cmd_buffer->batch, ret);
      return;
   }

   struct anv_device *device = cmd_buffer->device;
   struct anv_simple_shader *state = &cmd_buffer->generation.shader_state;
   *state = (struct anv_simple_shader) {
      .device = device,
      .cmd_buffer = cmd_buffer,
      .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
      .general_state_stream = &cmd_buffer->general_state_stream,
      .batch = &cmd_buffer->generation.batch,
      .kernel = gen_kernel,
      .l3_config = device->internal_kernels_l3_config,
      .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
   };

   genX(emit_simple_shader_init)(state);
}

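/* Return the address of a temporary buffer holding the draw ids, or
 * ANV_NULL_ADDRESS when none is needed. This is only required prior to Gfx11,
 * where gl_DrawID has to be sourced from a vertex buffer; Gfx11+ passes it
 * through the 3DPRIMITIVE extended parameters.
 */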
static struct anv_address
genX(cmd_buffer_get_draw_id_addr)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t draw_id_count)
{
#if GFX_VER >= 11
   return ANV_NULL_ADDRESS;
#else
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   if (!vs_prog_data->uses_drawid)
      return ANV_NULL_ADDRESS;

   struct anv_state draw_id_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 4 * draw_id_count, 4);
   return anv_cmd_buffer_temporary_state_address(cmd_buffer, draw_id_state);
#endif
}

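/* Size in bytes of the commands emitted in the batch for one generated draw.
 * This value is handed to the generation shader as generated_cmd_stride.
 */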
static uint32_t
genX(cmd_buffer_get_generated_draw_stride)(struct anv_cmd_buffer *cmd_buffer)
{
   /* With the extended parameters in 3DPRIMITIVE on Gfx11+ we can emit
    * everything. Prior to this, we need to emit a couple of
    * VERTEX_BUFFER_STATE packets.
    */
#if GFX_VER >= 11
   return 4 * GENX(3DPRIMITIVE_EXTENDED_length);
#else
   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   uint32_t len = 0;

   if (vs_prog_data->uses_firstvertex ||
       vs_prog_data->uses_baseinstance ||
       vs_prog_data->uses_drawid) {
      len += 4; /* 3DSTATE_VERTEX_BUFFERS */

      if (vs_prog_data->uses_firstvertex ||
          vs_prog_data->uses_baseinstance)
         len += 4 * GENX(VERTEX_BUFFER_STATE_length);

      if (vs_prog_data->uses_drawid)
         len += 4 * GENX(VERTEX_BUFFER_STATE_length);
   }

   return len + 4 * GENX(3DPRIMITIVE_length);
#endif
}

static void
genX(cmd_buffer_rewrite_forward_end_addr)(struct anv_cmd_buffer *cmd_buffer,
                                          struct anv_gen_indirect_params *params)
{
   /* We don't know the end_addr until we have emitted all the generation
    * draws. Walk the chain of push parameter structures and patch the
    * address into each of them.
    */
   uint64_t end_addr =
      anv_address_physical(anv_batch_current_address(&cmd_buffer->batch));
   while (params != NULL) {
      params->end_addr = end_addr;
      params = params->prev;
   }
}

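/* Generate the draw commands directly into the main command buffer. For each
 * chunk of up to MAX_GENERATED_DRAW_COUNT draws, contiguous batch space is
 * reserved and a generation shader dispatch is emitted into the side
 * generation batch to fill that space with 3DPRIMITIVEs.
 */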
static void
genX(cmd_buffer_emit_indirect_generated_draws_inplace)(struct anv_cmd_buffer *cmd_buffer,
                                                       struct anv_address indirect_data_addr,
                                                       uint32_t indirect_data_stride,
                                                       struct anv_address count_addr,
                                                       uint32_t max_draw_count,
                                                       bool indexed)
{
   const bool start_generation_batch =
      anv_address_is_null(cmd_buffer->generation.return_addr);

   genX(flush_pipeline_select_3d)(cmd_buffer);

   struct anv_address draw_id_addr =
      genX(cmd_buffer_get_draw_id_addr)(cmd_buffer, max_draw_count);

#if GFX_VER == 9
   /* Mark the VB-0 as using the entire dynamic state pool area, but only for
    * the draw call starting the generation batch. All the following ones will
    * use the same area.
    */
   if (start_generation_batch) {
      struct anv_device *device = cmd_buffer->device;
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
         cmd_buffer, 0,
         (struct anv_address) {
            .offset = device->physical->va.dynamic_state_pool.addr,
         },
         device->physical->va.dynamic_state_pool.size);
   }

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (vs_prog_data->uses_baseinstance ||
       vs_prog_data->uses_firstvertex) {
      /* We're using the indirect buffer directly to source base instance &
       * first vertex values. Mark the entire area as used.
       */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
                                                     indirect_data_addr,
                                                     indirect_data_stride * max_draw_count);
   }

   if (vs_prog_data->uses_drawid) {
      /* Mark the whole draw id buffer as used. */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_DRAWID_VB_INDEX,
                                                     draw_id_addr,
                                                     sizeof(uint32_t) * max_draw_count);
   }
#endif

   /* Apply the pipeline flush here so the indirect data is available for the
    * generation shader.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (start_generation_batch)
      genX(cmd_buffer_emit_indirect_generated_draws_init)(cmd_buffer);

   /* Emit the 3D state in the main batch. */
   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   const uint32_t draw_cmd_stride =
      genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);

   struct anv_gen_indirect_params *last_params = NULL;
   uint32_t item_base = 0;
   while (item_base < max_draw_count) {
      const uint32_t item_count = MIN2(max_draw_count - item_base,
                                       MAX_GENERATED_DRAW_COUNT);
      const uint32_t draw_cmd_size = item_count * draw_cmd_stride;

      /* Ensure we have enough contiguous space for all the draws so that the
       * generation shader can edit all the 3DPRIMITIVEs from a single base
       * address.
       *
       * TODO: we might have to split this if the amount of space is too
       * large (above 1MB?).
       */
      VkResult result = anv_batch_emit_ensure_space(&cmd_buffer->batch,
                                                    draw_cmd_size);
      if (result != VK_SUCCESS)
         return;

      struct anv_state params_state =
         genX(cmd_buffer_emit_generate_draws)(
            cmd_buffer,
            &cmd_buffer->generation.shader_state,
            anv_batch_current_address(&cmd_buffer->batch),
            draw_cmd_stride,
            indirect_data_addr,
            indirect_data_stride,
            anv_address_add(draw_id_addr, 4 * item_base),
            item_base,
            item_count,
            count_addr,
            max_draw_count,
            indexed,
            0 /* ring_count */);
      struct anv_gen_indirect_params *params = params_state.map;
      if (params == NULL)
         return;

      anv_batch_advance(&cmd_buffer->batch, draw_cmd_size);

      item_base += item_count;

      params->prev = last_params;
      last_params = params;
   }

   genX(cmd_buffer_rewrite_forward_end_addr)(cmd_buffer, last_params);

#if GFX_VER == 9
   update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, indexed ? RANDOM : SEQUENTIAL);
#endif
}

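/* Generate the draw commands into a fixed-size ring buffer BO instead of the
 * main batch. The main batch jumps into the ring, which executes up to
 * ring_count generated draws and then either jumps back into the main batch
 * to generate the next set of draws, or jumps to the end address once all
 * the draws have been emitted. This keeps the main batch size bounded for
 * very large draw counts.
 */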
static void
genX(cmd_buffer_emit_indirect_generated_draws_inring)(struct anv_cmd_buffer *cmd_buffer,
                                                      struct anv_address indirect_data_addr,
                                                      uint32_t indirect_data_stride,
                                                      struct anv_address count_addr,
                                                      uint32_t max_draw_count,
                                                      bool indexed)
{
   struct anv_device *device = cmd_buffer->device;

   genX(flush_pipeline_select_3d)(cmd_buffer);

   const uint32_t draw_cmd_stride =
      genX(cmd_buffer_get_generated_draw_stride)(cmd_buffer);

   if (cmd_buffer->generation.ring_bo == NULL) {
      const uint32_t bo_size = align(
#if GFX_VER >= 12
         GENX(MI_ARB_CHECK_length) * 4 +
#endif
         draw_cmd_stride * MAX_RING_BO_ITEMS +
#if GFX_VER == 9
         4 * MAX_RING_BO_ITEMS +
#endif
         GENX(MI_BATCH_BUFFER_START_length) * 4,
         4096);
      VkResult result = anv_bo_pool_alloc(&device->batch_bo_pool, bo_size,
                                          &cmd_buffer->generation.ring_bo);
      if (result != VK_SUCCESS) {
         anv_batch_set_error(&cmd_buffer->batch, result);
         return;
      }
   }

   /* How many items will be generated by each iteration of the generation
    * shader dispatch.
    */
   const uint32_t ring_count = MIN2(MAX_RING_BO_ITEMS, max_draw_count);

   /* The ring bo has the following layout:
    *
    * --------------------------------------------------
    * | MI_ARB_CHECK to resume CS prefetch (Gfx12+)    |
    * |------------------------------------------------|
    * | ring_count * 3DPRIMITIVE                       |
    * |------------------------------------------------|
    * | jump instruction (either back to generate more |
    * | commands or to the next set of commands)       |
    * |------------------------------------------------|
    * | draw ids (only used on Gfx9)                   |
    * --------------------------------------------------
    */

   struct anv_address draw_id_addr = (struct anv_address) {
      .bo = cmd_buffer->generation.ring_bo,
      .offset = ring_count * draw_cmd_stride +
                GENX(MI_BATCH_BUFFER_START_length) * 4,
   };

   struct anv_address draw_cmds_addr = (struct anv_address) {
      .bo = cmd_buffer->generation.ring_bo,
#if GFX_VER >= 12
      .offset = GENX(MI_ARB_CHECK_length) * 4,
#endif
   };

#if GFX_VER >= 12
   struct GENX(MI_ARB_CHECK) resume_prefetch = {
      .PreParserDisableMask = true,
      .PreParserDisable = false,
   };
   GENX(MI_ARB_CHECK_pack)(NULL, cmd_buffer->generation.ring_bo->map,
                           &resume_prefetch);
#endif

#if GFX_VER == 9
   /* Mark the VB-0 as using the entire ring_bo, but only for the draw call
    * starting the generation batch. All the following ones will use the same
    * area.
    */
   genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(
      cmd_buffer, 0,
      (struct anv_address) {
         .bo = cmd_buffer->generation.ring_bo,
      },
      cmd_buffer->generation.ring_bo->size);

   struct anv_graphics_pipeline *pipeline =
      anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);

   if (vs_prog_data->uses_baseinstance ||
       vs_prog_data->uses_firstvertex) {
      /* We're using the indirect buffer directly to source base instance &
       * first vertex values. Mark the entire area as used.
       */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_SVGS_VB_INDEX,
                                                     indirect_data_addr,
                                                     indirect_data_stride * max_draw_count);
   }

   if (vs_prog_data->uses_drawid) {
      /* Mark the whole draw id buffer as used. */
      genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, ANV_DRAWID_VB_INDEX,
                                                     draw_id_addr,
                                                     sizeof(uint32_t) * max_draw_count);
   }
#endif

   /* Apply the pipeline flush here so the indirect data is available for the
    * generation shader.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   trace_intel_begin_generate_draws(&cmd_buffer->trace);

   /***
    * This is where the commands emitted below will jump back to whenever
    * more draws need to be generated.
    */
   struct anv_address gen_addr = anv_batch_current_address(&cmd_buffer->batch);

   struct anv_shader_bin *gen_kernel;
   VkResult ret =
      anv_device_get_internal_shader(
         cmd_buffer->device,
         ANV_INTERNAL_KERNEL_GENERATED_DRAWS,
         &gen_kernel);
   if (ret != VK_SUCCESS) {
      anv_batch_set_error(&cmd_buffer->batch, ret);
      return;
   }

   struct anv_simple_shader simple_state = (struct anv_simple_shader) {
      .device = device,
      .cmd_buffer = cmd_buffer,
      .dynamic_state_stream = &cmd_buffer->dynamic_state_stream,
      .general_state_stream = &cmd_buffer->general_state_stream,
      .batch = &cmd_buffer->batch,
      .kernel = gen_kernel,
      .l3_config = device->internal_kernels_l3_config,
      .urb_cfg = &cmd_buffer->state.gfx.urb_cfg,
   };
   genX(emit_simple_shader_init)(&simple_state);

   struct anv_state params_state =
      genX(cmd_buffer_emit_generate_draws)(
         cmd_buffer,
         &simple_state,
         draw_cmds_addr,
         draw_cmd_stride,
         indirect_data_addr,
         indirect_data_stride,
         draw_id_addr,
         0 /* item_base */,
         MIN2(MAX_RING_BO_ITEMS, max_draw_count) /* item_count */,
         count_addr,
         max_draw_count,
         indexed,
         ring_count);
   struct anv_gen_indirect_params *params = params_state.map;

   anv_add_pending_pipe_bits(cmd_buffer,
#if GFX_VER == 9
                             ANV_PIPE_VF_CACHE_INVALIDATE_BIT |
#endif
                             ANV_PIPE_DATA_CACHE_FLUSH_BIT |
                             ANV_PIPE_CS_STALL_BIT,
                             "after generation flush");

   trace_intel_end_generate_draws(&cmd_buffer->trace);

   /* Emit the 3D state in the main batch. */
   genX(cmd_buffer_flush_gfx_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   if (max_draw_count > 0) {
#if GFX_VER >= 12
      /* Prior to Gfx12 we cannot disable the CS prefetch but it doesn't
       * matter as the prefetch shouldn't follow the MI_BATCH_BUFFER_START.
       */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_ARB_CHECK), arb) {
         arb.PreParserDisableMask = true;
         arb.PreParserDisable = true;
      }
#endif

      /* Jump into the ring buffer. */
      anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.AddressSpaceIndicator = ASI_PPGTT;
         bbs.BatchBufferStartAddress = (struct anv_address) {
            .bo = cmd_buffer->generation.ring_bo,
         };
      }

      /***
       * This is the location the ring buffer jumps to when it needs to
       * generate more draw calls. We do the following:
       *   - wait for the draws in the ring buffer to complete (cs stall) so
       *     we're sure the push constant data we're about to edit is not
       *     read anymore
       *   - increment the base draw number by the number of draws executed
       *     in the ring
       *   - invalidate the constant cache since
       *     anv_gen_indirect_params::draw_base is updated
       *   - jump back to the generation shader
       */
      struct anv_address inc_addr =
         anv_batch_current_address(&cmd_buffer->batch);

      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_STALL_AT_SCOREBOARD_BIT |
                                ANV_PIPE_CS_STALL_BIT,
                                "after generated draws batch");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

      struct anv_address draw_base_addr = anv_address_add(
         genX(simple_shader_push_state_address)(
            &simple_state, params_state),
         offsetof(struct anv_gen_indirect_params, draw_base));

      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device,
                                                 &draw_base_addr);
      mi_builder_set_mocs(&b, mocs);
      mi_builder_set_write_check(&b, true);

      mi_store(&b, mi_mem32(draw_base_addr),
               mi_iadd(&b, mi_mem32(draw_base_addr),
                       mi_imm(ring_count)));

      /* Make sure the MI writes are globally observable */
      mi_ensure_write_fence(&b);

      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
                                "after generated draws batch increment");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(MI_BATCH_BUFFER_START), bbs) {
         bbs.AddressSpaceIndicator = ASI_PPGTT;
         bbs.BatchBufferStartAddress = gen_addr;
      }

      /***
       * This is the location the ring buffer jumps to once all the draw
       * calls have executed.
       */
      struct anv_address end_addr = anv_batch_current_address(&cmd_buffer->batch);

      /* Reset the draw_base field in case we ever replay the command buffer. */
      mi_store(&b, mi_mem32(draw_base_addr), mi_imm(0));

      /* Make sure the MI writes are globally observable */
      mi_ensure_write_fence(&b);

      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT,
                                "after generated draws end");

      params->gen_addr = anv_address_physical(inc_addr);
      params->end_addr = anv_address_physical(end_addr);
   }
}

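/* Entry point for generated indirect draws: pick the ring buffer path for
 * large draw counts (at or above the instance's
 * generated_indirect_ring_threshold) and the inplace path otherwise.
 */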
static void
genX(cmd_buffer_emit_indirect_generated_draws)(struct anv_cmd_buffer *cmd_buffer,
                                               struct anv_address indirect_data_addr,
                                               uint32_t indirect_data_stride,
                                               struct anv_address count_addr,
                                               uint32_t max_draw_count,
                                               bool indexed)
{
   /* In order to have the vertex fetch gather the data we need a non-zero
    * stride. The application may provide a stride of 0 when draw_count is 1,
    * but we still need a correct value for VERTEX_BUFFER_STATE::BufferPitch,
    * so ensure the caller sets this correctly:
    *
    * Vulkan spec, vkCmdDrawIndirect:
    *
    *    "If drawCount is less than or equal to one, stride is ignored."
    */
   assert(indirect_data_stride > 0);

   const bool use_ring_buffer = max_draw_count >=
      cmd_buffer->device->physical->instance->generated_indirect_ring_threshold;
   if (use_ring_buffer) {
      genX(cmd_buffer_emit_indirect_generated_draws_inring)(cmd_buffer,
                                                            indirect_data_addr,
                                                            indirect_data_stride,
                                                            count_addr,
                                                            max_draw_count,
                                                            indexed);
   } else {
      genX(cmd_buffer_emit_indirect_generated_draws_inplace)(cmd_buffer,
                                                             indirect_data_addr,
                                                             indirect_data_stride,
                                                             count_addr,
                                                             max_draw_count,
                                                             indexed);
   }
}

#endif /* GENX_CMD_DRAW_GENERATED_INDIRECT_H */