1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29
30 #include "genxml/gen_macros.h"
31 #include "genxml/genX_pack.h"
32 #include "common/intel_genX_state_brw.h"
33
34 #include "ds/intel_tracepoints.h"
35
36 #include "genX_mi_builder.h"
37
38 static void
39 cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
40 {
41 struct anv_graphics_pipeline *pipeline =
42 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
43 VkShaderStageFlags stages = pipeline->base.base.active_stages;
44
45 /* In order to avoid thrash, we assume that vertex and fragment stages
46 * always exist. In the rare case where one is missing *and* the other
47 * uses push constants, this may be suboptimal. However, avoiding stalls
48 * seems more important.
49 */
50 stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
51 if (anv_pipeline_is_primitive(pipeline))
52 stages |= VK_SHADER_STAGE_VERTEX_BIT;
53
54 if (stages == cmd_buffer->state.gfx.push_constant_stages)
55 return;
56
57 unsigned push_constant_kb;
58
59 const struct intel_device_info *devinfo = cmd_buffer->device->info;
60 if (anv_pipeline_is_mesh(pipeline))
61 push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
62 else
63 push_constant_kb = devinfo->max_constant_urb_size_kb;
64
65 const unsigned num_stages =
66 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
67 unsigned size_per_stage = push_constant_kb / num_stages;
68
69 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
70 * units of 2KB. Incidentally, these are the same platforms that have
71 * 32KB worth of push constant space.
72 */
73 if (push_constant_kb == 32)
74 size_per_stage &= ~1u;
75
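/* Illustrative example (not from the original source): with 32 KB of push
 * constant space and all five graphics stages active, each of VS/HS/DS/GS
 * gets 32 / 5 = 6 KB from the loop below and the remaining 8 KB goes to the
 * PS allocation emitted after it.
 */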
76 uint32_t kb_used = 0;
77 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
78 const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
79 anv_batch_emit(&cmd_buffer->batch,
80 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
81 alloc._3DCommandSubOpcode = 18 + i;
82 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
83 alloc.ConstantBufferSize = push_size;
84 }
85 kb_used += push_size;
86 }
87
88 anv_batch_emit(&cmd_buffer->batch,
89 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
90 alloc.ConstantBufferOffset = kb_used;
91 alloc.ConstantBufferSize = push_constant_kb - kb_used;
92 }
93
94 #if GFX_VERx10 == 125
95 /* DG2: Wa_22011440098
96 * MTL: Wa_18022330953
97 *
98 * In 3D mode, after programming push constant alloc command immediately
99 * program push constant command (ZERO length) without any commit between
100 * them.
101 */
102 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
103 /* Update empty push constants for all stages (bitmask = 11111b) */
104 c.ShaderUpdateEnable = 0x1f;
105 c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
106 }
107 #endif
108
109 cmd_buffer->state.gfx.push_constant_stages = stages;
110
111 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
112 *
113 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
114 * the next 3DPRIMITIVE command after programming the
115 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
116 *
117 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
118 * pipeline setup, we need to dirty push constants.
119 */
120 cmd_buffer->state.push_constants_dirty |= stages;
121 }
122
123 static void
124 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
125 uint32_t stages)
126 {
127 static const uint32_t sampler_state_opcodes[] = {
128 [MESA_SHADER_VERTEX] = 43,
129 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
130 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
131 [MESA_SHADER_GEOMETRY] = 46,
132 [MESA_SHADER_FRAGMENT] = 47,
133 };
134
135 static const uint32_t binding_table_opcodes[] = {
136 [MESA_SHADER_VERTEX] = 38,
137 [MESA_SHADER_TESS_CTRL] = 39,
138 [MESA_SHADER_TESS_EVAL] = 40,
139 [MESA_SHADER_GEOMETRY] = 41,
140 [MESA_SHADER_FRAGMENT] = 42,
141 };
142
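/* Both tables hold raw _3DCommandSubOpcode values: we always emit the *_VS
 * flavor of the packet and patch the sub-opcode so a single template covers
 * VS/HS/DS/GS/PS.
 */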
143 anv_foreach_stage(s, stages) {
144 assert(s < ARRAY_SIZE(binding_table_opcodes));
145
146 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
147 anv_batch_emit(&cmd_buffer->batch,
148 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
149 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
150 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
151 }
152 }
153
154 /* Always emit binding table pointers if we're asked to, since on SKL
155 * this is what flushes push constants. */
156 anv_batch_emit(&cmd_buffer->batch,
157 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
158 btp._3DCommandSubOpcode = binding_table_opcodes[s];
159 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
160 }
161 }
162 }
163
164 static struct anv_address
165 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
166 const struct anv_shader_bin *shader,
167 const struct anv_push_range *range)
168 {
169 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
170 switch (range->set) {
171 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
172 /* This is a descriptor set buffer so the set index is
173 * actually given by binding->binding. (Yes, that's
174 * confusing.)
175 */
176 struct anv_descriptor_set *set =
177 gfx_state->base.descriptors[range->index];
178 return anv_descriptor_set_address(set);
179 }
180
181 case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER: {
182 return anv_address_from_u64(
183 anv_cmd_buffer_descriptor_buffer_address(
184 cmd_buffer,
185 gfx_state->base.descriptor_buffers[range->index].buffer_index) +
186 gfx_state->base.descriptor_buffers[range->index].buffer_offset);
187 }
188
189 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
190 if (gfx_state->base.push_constants_state.alloc_size == 0) {
191 gfx_state->base.push_constants_state =
192 anv_cmd_buffer_gfx_push_constants(cmd_buffer);
193 }
194 return anv_cmd_buffer_temporary_state_address(
195 cmd_buffer, gfx_state->base.push_constants_state);
196 }
197
198 case ANV_DESCRIPTOR_SET_NULL:
199 return cmd_buffer->device->workaround_address;
200
201 default: {
202 assert(range->set < MAX_SETS);
203 struct anv_descriptor_set *set =
204 gfx_state->base.descriptors[range->set];
205 const struct anv_descriptor *desc =
206 &set->descriptors[range->index];
207
208 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
209 if (desc->buffer) {
210 return anv_address_add(desc->buffer->address,
211 desc->offset);
212 }
213 } else {
214 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
215 if (desc->buffer) {
216 const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
217 uint32_t dynamic_offset =
218 pipe_state->dynamic_offsets[
219 range->set].offsets[range->dynamic_offset_index];
220 return anv_address_add(desc->buffer->address,
221 desc->offset + dynamic_offset);
222 }
223 }
224
225 /* For NULL UBOs, we just return an address in the workaround BO. We do
226 * writes to it for workarounds but always at the bottom. The higher
227 * bytes should be all zeros.
228 */
229 assert(range->length * 32 <= 2048);
230 return cmd_buffer->device->workaround_address;
231 }
232 }
233 }
234
235
236 /** Returns the size in bytes of the bound buffer
237 *
238 * The range is relative to the start of the buffer, not the start of the
239 * range. The returned range may be smaller than
240 *
241 * (range->start + range->length) * 32;
242 */
243 static uint32_t
244 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
245 const struct anv_shader_bin *shader,
246 const struct anv_push_range *range)
247 {
248 assert(shader->stage != MESA_SHADER_COMPUTE);
249 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
250 switch (range->set) {
251 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
252 struct anv_descriptor_set *set =
253 gfx_state->base.descriptors[range->index];
254 struct anv_state state = set->desc_surface_mem;
255 assert(range->start * 32 < state.alloc_size);
256 assert((range->start + range->length) * 32 <= state.alloc_size);
257 return state.alloc_size;
258 }
259
260 case ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER:
261 return gfx_state->base.pipeline->layout.set[
262 range->index].layout->descriptor_buffer_surface_size;
263
264 case ANV_DESCRIPTOR_SET_NULL:
265 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
266 return (range->start + range->length) * 32;
267
268 default: {
269 assert(range->set < MAX_SETS);
270 struct anv_descriptor_set *set =
271 gfx_state->base.descriptors[range->set];
272 const struct anv_descriptor *desc =
273 &set->descriptors[range->index];
274
275 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
276 /* Here we promote a UBO to a binding table entry so that we can avoid a
277 * layer of indirection. We use the descriptor set's internally allocated
278 * surface state to fill the binding table entry. */
279 if (!desc->buffer)
280 return 0;
281
282 if (range->start * 32 > desc->bind_range)
283 return 0;
284
285 return desc->bind_range;
286 } else {
287 if (!desc->buffer)
288 return 0;
289
290 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
291 /* Compute the offset within the buffer */
292 const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
293 uint32_t dynamic_offset =
294 pipe_state->dynamic_offsets[
295 range->set].offsets[range->dynamic_offset_index];
296 uint64_t offset = desc->offset + dynamic_offset;
297 /* Clamp to the buffer size */
298 offset = MIN2(offset, desc->buffer->vk.size);
299 /* Clamp the range to the buffer size */
300 uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
301
302 /* Align the range for consistency */
303 bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
304
305 return bound_range;
306 }
307 }
308 }
309 }
310
311 static void
312 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
313 gl_shader_stage stage,
314 struct anv_address *buffers,
315 unsigned buffer_count)
316 {
317 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
318 const struct anv_graphics_pipeline *pipeline =
319 anv_pipeline_to_graphics(gfx_state->base.pipeline);
320
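/* _3DCommandSubOpcode values for the 3DSTATE_CONSTANT_* packets; as with the
 * descriptor pointer packets, we emit the VS flavor and patch the sub-opcode
 * per stage.
 */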
321 static const uint32_t push_constant_opcodes[] = {
322 [MESA_SHADER_VERTEX] = 21,
323 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
324 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
325 [MESA_SHADER_GEOMETRY] = 22,
326 [MESA_SHADER_FRAGMENT] = 23,
327 };
328
329 assert(stage < ARRAY_SIZE(push_constant_opcodes));
330
331 UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
332
333 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
334 c._3DCommandSubOpcode = push_constant_opcodes[stage];
335
336 /* Set MOCS.
337 *
338 * We only have one MOCS field for the whole packet, not one per
339 * buffer. We could go out of our way here to walk over all of
340 * the buffers and see if any of them are used externally and use
341 * the external MOCS. However, the notion that someone would use
342 * the same bit of memory for both scanout and a UBO is nuts.
343 *
344 * Let's not bother and assume it's all internal.
345 */
346 c.MOCS = mocs;
347
348 if (anv_pipeline_has_stage(pipeline, stage)) {
349 const struct anv_pipeline_bind_map *bind_map =
350 &pipeline->base.shaders[stage]->bind_map;
351
352 /* The Skylake PRM contains the following restriction:
353 *
354 * "The driver must ensure The following case does not occur
355 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
356 * buffer 3 read length equal to zero committed followed by a
357 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
358 * zero committed."
359 *
360 * To avoid this, we program the buffers in the highest slots.
361 * This way, slot 0 is only used if slot 3 is also used.
362 */
363 assert(buffer_count <= 4);
364 const unsigned shift = 4 - buffer_count;
365 for (unsigned i = 0; i < buffer_count; i++) {
366 const struct anv_push_range *range = &bind_map->push_ranges[i];
367
368 /* At this point we only have non-empty ranges */
369 assert(range->length > 0);
370
371 c.ConstantBody.ReadLength[i + shift] = range->length;
372 c.ConstantBody.Buffer[i + shift] =
373 anv_address_add(buffers[i], range->start * 32);
374 }
375 }
376 }
377 }
378
379 #if GFX_VER >= 12
380 static void
381 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
382 uint32_t shader_mask,
383 struct anv_address *buffers,
384 uint32_t buffer_count)
385 {
386 if (buffer_count == 0) {
387 if (shader_mask) {
388 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
389 c.ShaderUpdateEnable = shader_mask;
390 c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
391 }
392 }
393
394 return;
395 }
396
397 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
398 const struct anv_graphics_pipeline *pipeline =
399 anv_pipeline_to_graphics(gfx_state->base.pipeline);
400
401 gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
402
403 const struct anv_pipeline_bind_map *bind_map =
404 &pipeline->base.shaders[stage]->bind_map;
405
406 uint32_t *dw;
407 const uint32_t buffer_mask = (1 << buffer_count) - 1;
408 const uint32_t num_dwords = 2 + 2 * buffer_count;
409
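/* As emitted here, 3DSTATE_CONSTANT_ALL is variable length: 2 header DWords
 * followed by one 2-DWord CONSTANT_ALL_DATA entry per pointed-to buffer,
 * packed in the loop below.
 */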
410 dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
411 GENX(3DSTATE_CONSTANT_ALL),
412 .ShaderUpdateEnable = shader_mask,
413 .PointerBufferMask = buffer_mask,
414 .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
415
416 for (int i = 0; i < buffer_count; i++) {
417 const struct anv_push_range *range = &bind_map->push_ranges[i];
418 GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
419 &cmd_buffer->batch, dw + 2 + i * 2,
420 &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
421 .PointerToConstantBuffer =
422 anv_address_add(buffers[i], range->start * 32),
423 .ConstantBufferReadLength = range->length,
424 });
425 }
426 }
427 #endif
428
429 static void
430 cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
431 VkShaderStageFlags dirty_stages)
432 {
433 VkShaderStageFlags flushed = 0;
434 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
435 const struct anv_graphics_pipeline *pipeline =
436 anv_pipeline_to_graphics(gfx_state->base.pipeline);
437
438 #if GFX_VER >= 12
439 uint32_t nobuffer_stages = 0;
440 #endif
441
442 /* Compute robust pushed register access mask for each stage. */
443 anv_foreach_stage(stage, dirty_stages) {
444 if (!anv_pipeline_has_stage(pipeline, stage))
445 continue;
446
447 const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
448 if (shader->prog_data->zero_push_reg) {
449 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
450 struct anv_push_constants *push = &gfx_state->base.push_constants;
451
452 push->push_reg_mask[stage] = 0;
453 /* Start of the current range in the shader, relative to the start of
454 * push constants in the shader.
455 */
456 unsigned range_start_reg = 0;
457 for (unsigned i = 0; i < 4; i++) {
458 const struct anv_push_range *range = &bind_map->push_ranges[i];
459 if (range->length == 0)
460 continue;
461
462 unsigned bound_size =
463 get_push_range_bound_size(cmd_buffer, shader, range);
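/* Illustrative example (hypothetical numbers): a push range with start = 0
 * and length = 8 backed by a 100-byte UBO gives
 * bound_regs = MIN2(DIV_ROUND_UP(100, 32) - 0, 8) = 4, so only the first
 * four 32-byte registers are marked readable in the mask.
 */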
464 if (bound_size >= range->start * 32) {
465 unsigned bound_regs =
466 MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
467 range->length);
468 assert(range_start_reg + bound_regs <= 64);
469 push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
470 bound_regs);
471 }
472
473 cmd_buffer->state.push_constants_dirty |=
474 mesa_to_vk_shader_stage(stage);
475 gfx_state->base.push_constants_data_dirty = true;
476
477 range_start_reg += range->length;
478 }
479 }
480 }
481
482 /* Setting NULL resets the push constant state so that we allocate a new one
483 * if needed. If the push constant data is not dirty, get_push_range_address can
484 * re-use existing allocation.
485 *
486 * Always reallocate on gfx9, gfx11 to fix push constant related flaky tests.
487 * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/11064
488 */
489 if (gfx_state->base.push_constants_data_dirty || GFX_VER < 12)
490 gfx_state->base.push_constants_state = ANV_STATE_NULL;
491
492 anv_foreach_stage(stage, dirty_stages) {
493 unsigned buffer_count = 0;
494 flushed |= mesa_to_vk_shader_stage(stage);
495 UNUSED uint32_t max_push_range = 0;
496
497 struct anv_address buffers[4] = {};
498 if (anv_pipeline_has_stage(pipeline, stage)) {
499 const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
500 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
501
502 /* We have to gather buffer addresses as a second step because the
503 * loop above puts data into the push constant area and the call to
504 * get_push_range_address is what locks our push constants and copies
505 * them into the actual GPU buffer. If we did the two loops at the
506 * same time, we'd risk only having some of the sizes in the push
507 * constant buffer when we did the copy.
508 */
509 for (unsigned i = 0; i < 4; i++) {
510 const struct anv_push_range *range = &bind_map->push_ranges[i];
511 if (range->length == 0)
512 break;
513
514 buffers[i] = get_push_range_address(cmd_buffer, shader, range);
515 max_push_range = MAX2(max_push_range, range->length);
516 buffer_count++;
517 }
518
519 /* We have at most 4 buffers but they should be tightly packed */
520 for (unsigned i = buffer_count; i < 4; i++)
521 assert(bind_map->push_ranges[i].length == 0);
522 }
523
524 #if GFX_VER >= 12
525 /* If this stage doesn't have any push constants, emit it later in a
526 * single CONSTANT_ALL packet.
527 */
528 if (buffer_count == 0) {
529 nobuffer_stages |= 1 << stage;
530 continue;
531 }
532
533 /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
534 * contains only 5 bits, so we can only use it for buffers smaller than
535 * 32.
536 *
537 * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
538 * in 3DSTATE_CONSTANT_ALL. It should still be safe to use the command
539 * for disabling stages, where all address bits are zero. However, we
540 * can't safely use it for general buffers with arbitrary addresses.
541 * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
542 * case.
543 */
544 if (max_push_range < 32 && GFX_VERx10 > 120) {
545 cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
546 buffers, buffer_count);
547 continue;
548 }
549 #endif
550
551 cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
552 }
553
554 #if GFX_VER >= 12
555 if (nobuffer_stages)
556 /* Wa_16011448509: all address bits are zero */
557 cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
558 #endif
559
560 cmd_buffer->state.push_constants_dirty &= ~flushed;
561 gfx_state->base.push_constants_data_dirty = false;
562 }
563
564 #if GFX_VERx10 >= 125
565 static void
566 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
567 VkShaderStageFlags dirty_stages)
568 {
569 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
570 const struct anv_graphics_pipeline *pipeline =
571 anv_pipeline_to_graphics(gfx_state->base.pipeline);
572
573 if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
574 anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
575
576 const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
577 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
578
579 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
580 const struct anv_push_range *range = &bind_map->push_ranges[0];
581 if (range->length > 0) {
582 struct anv_address buffer =
583 get_push_range_address(cmd_buffer, shader, range);
584
585 uint64_t addr = anv_address_physical(buffer);
586 data.InlineData[0] = addr & 0xffffffff;
587 data.InlineData[1] = addr >> 32;
588
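/* InlineData[0:1] carry the 64-bit address of the push range; the raw
 * client push constants follow, starting at
 * BRW_TASK_MESH_PUSH_CONSTANTS_START_DW.
 */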
589 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
590 cmd_buffer->state.gfx.base.push_constants.client_data,
591 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
592 }
593 }
594 }
595
596 if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
597 anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
598
599 const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
600 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
601
602 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
603 const struct anv_push_range *range = &bind_map->push_ranges[0];
604 if (range->length > 0) {
605 struct anv_address buffer =
606 get_push_range_address(cmd_buffer, shader, range);
607
608 uint64_t addr = anv_address_physical(buffer);
609 data.InlineData[0] = addr & 0xffffffff;
610 data.InlineData[1] = addr >> 32;
611
612 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
613 cmd_buffer->state.gfx.base.push_constants.client_data,
614 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
615 }
616 }
617 }
618
619 cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
620 }
621 #endif
622
623 ALWAYS_INLINE static void
624 cmd_buffer_maybe_flush_rt_writes(struct anv_cmd_buffer *cmd_buffer,
625 const struct anv_graphics_pipeline *pipeline)
626 {
627 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT))
628 return;
629
630 UNUSED bool need_rt_flush = false;
631 for (uint32_t rt = 0; rt < pipeline->num_color_outputs; rt++) {
632 /* No writes going to this render target so it won't affect the RT cache
633 */
634 if (pipeline->color_output_mapping[rt] == ANV_COLOR_OUTPUT_UNUSED)
635 continue;
636
637 /* No change */
638 if (cmd_buffer->state.gfx.color_output_mapping[rt] ==
639 pipeline->color_output_mapping[rt])
640 continue;
641
642 cmd_buffer->state.gfx.color_output_mapping[rt] =
643 pipeline->color_output_mapping[rt];
644 need_rt_flush = true;
645 cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
646 }
647
648 #if GFX_VER >= 11
649 if (need_rt_flush) {
650 /* The PIPE_CONTROL command description says:
651 *
652 * "Whenever a Binding Table Index (BTI) used by a Render Target Message
653 * points to a different RENDER_SURFACE_STATE, SW must issue a Render
654 * Target Cache Flush by enabling this bit. When render target flush
655 * is set due to new association of BTI, PS Scoreboard Stall bit must
656 * be set in this packet."
657 *
658 * Within a renderpass, the render target entries in the binding tables
659 * remain the same as what was set up at CmdBeginRendering(), with one
660 * exception: we have to set up a null render target when a fragment shader
661 * writes only depth/stencil yet the renderpass has been set up with at
662 * least one color attachment. This is because our render target messages
663 * in the shader always send the color.
664 */
665 anv_add_pending_pipe_bits(cmd_buffer,
666 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT |
667 ANV_PIPE_STALL_AT_SCOREBOARD_BIT,
668 "change RT due to shader outputs");
669 }
670 #endif
671 }
672
673 ALWAYS_INLINE static void
674 cmd_buffer_flush_vertex_buffers(struct anv_cmd_buffer *cmd_buffer,
675 uint32_t vb_emit)
676 {
677 const struct vk_dynamic_graphics_state *dyn =
678 &cmd_buffer->vk.dynamic_graphics_state;
679 const uint32_t num_buffers = __builtin_popcount(vb_emit);
680 const uint32_t num_dwords = 1 + num_buffers * 4;
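/* 3DSTATE_VERTEX_BUFFERS is one header DWord followed by four DWords of
 * VERTEX_BUFFER_STATE per buffer, packed into p[] below.
 */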
681 uint32_t *p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
682 GENX(3DSTATE_VERTEX_BUFFERS));
683 uint32_t i = 0;
684 u_foreach_bit(vb, vb_emit) {
685 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
686 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
687
688 struct GENX(VERTEX_BUFFER_STATE) state;
689 if (buffer) {
690 uint32_t stride = dyn->vi_binding_strides[vb];
691 UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
692
693 state = (struct GENX(VERTEX_BUFFER_STATE)) {
694 .VertexBufferIndex = vb,
695
696 .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
697 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
698 .AddressModifyEnable = true,
699 .BufferPitch = stride,
700 .BufferStartingAddress = anv_address_add(buffer->address, offset),
701 .NullVertexBuffer = offset >= buffer->vk.size,
702 #if GFX_VER >= 12
703 .L3BypassDisable = true,
704 #endif
705
706 .BufferSize = size,
707 };
708 } else {
709 state = (struct GENX(VERTEX_BUFFER_STATE)) {
710 .VertexBufferIndex = vb,
711 .NullVertexBuffer = true,
712 .MOCS = anv_mocs(cmd_buffer->device, NULL,
713 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
714 };
715 }
716
717 #if GFX_VER == 9
718 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
719 state.BufferStartingAddress,
720 state.BufferSize);
721 #endif
722
723 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
724 i++;
725 }
726 }
727
728 ALWAYS_INLINE static void
729 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
730 {
731 struct anv_graphics_pipeline *pipeline =
732 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
733 const struct vk_dynamic_graphics_state *dyn =
734 &cmd_buffer->vk.dynamic_graphics_state;
735
736 assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
737
738 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);
739
740 genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
741
742 genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
743
744 genX(flush_descriptor_buffers)(cmd_buffer, &cmd_buffer->state.gfx.base);
745
746 genX(flush_pipeline_select_3d)(cmd_buffer);
747
748 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
749 /* Wa_14015814527
750 *
751 * Apply task URB workaround when switching from task to primitive.
752 */
753 if (anv_pipeline_is_primitive(pipeline)) {
754 genX(apply_task_urb_workaround)(cmd_buffer);
755 } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
756 cmd_buffer->state.gfx.used_task_shader = true;
757 }
758
759 cmd_buffer_maybe_flush_rt_writes(cmd_buffer, pipeline);
760 }
761
762 /* Apply any pending pipeline flushes we may have. We want to apply them
763 * now because, if any of those flushes are for things like push constants,
764 * the GPU will read the state at weird times.
765 */
766 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
767
768 /* Check what vertex buffers have been rebound against the set of bindings
769 * being used by the current set of vertex attributes.
770 */
771 uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
772 /* If the pipeline changed, we have to consider all the valid bindings. */
773 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
774 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDINGS_VALID) ||
775 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
776 vb_emit |= dyn->vi->bindings_valid;
777
778 if (vb_emit) {
779 cmd_buffer_flush_vertex_buffers(cmd_buffer, vb_emit);
780 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
781 }
782
783 const bool any_dynamic_state_dirty =
784 vk_dynamic_graphics_state_any_dirty(dyn);
785 uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
786 pipeline->base.base.active_stages;
787
788 descriptors_dirty |=
789 genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
790 &cmd_buffer->state.gfx.base,
791 &pipeline->base.base);
792
793 if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
794 !any_dynamic_state_dirty &&
795 ((cmd_buffer->state.push_constants_dirty &
796 (VK_SHADER_STAGE_ALL_GRAPHICS |
797 VK_SHADER_STAGE_TASK_BIT_EXT |
798 VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
799 return;
800
801 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
802 /* Wa_16011411144:
803 *
804 * SW must insert a PIPE_CONTROL cmd before and after the
805 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
806 * state is not combined with other state changes.
807 */
808 if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
809 anv_add_pending_pipe_bits(cmd_buffer,
810 ANV_PIPE_CS_STALL_BIT,
811 "before SO_BUFFER change WA");
812 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
813 }
814
815 /* We don't need any per-buffer dirty tracking because you're not
816 * allowed to bind different XFB buffers while XFB is enabled.
817 */
818 for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
819 struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
820 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
821 #if GFX_VER < 12
822 sob.SOBufferIndex = idx;
823 #else
824 sob._3DCommandOpcode = 0;
825 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
826 #endif
827
828 if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
829 sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
830 ISL_SURF_USAGE_STREAM_OUT_BIT);
831 sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
832 xfb->offset);
833 sob.SOBufferEnable = true;
834 sob.StreamOffsetWriteEnable = false;
835 /* Size is in DWords - 1 */
836 sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
837 } else {
838 sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
839 }
840 }
841 }
842
843 if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
844 /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
845 anv_add_pending_pipe_bits(cmd_buffer,
846 ANV_PIPE_CS_STALL_BIT,
847 "after SO_BUFFER change WA");
848 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
849 } else if (GFX_VER >= 10) {
850 /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
851 anv_add_pending_pipe_bits(cmd_buffer,
852 ANV_PIPE_CS_STALL_BIT,
853 "after 3DSTATE_SO_BUFFER call");
854 }
855 }
856
857 /* Flush the runtime state into the HW state tracking */
858 if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
859 genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);
860
861 /* Flush the HW state into the command buffer */
862 if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
863 genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
864
865 /* If the pipeline changed, we may need to re-allocate push constant space
866 * in the URB.
867 */
868 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
869 cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
870
871 /* Also add the relocations (scratch buffers) */
872 VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
873 pipeline->base.base.batch.relocs);
874 if (result != VK_SUCCESS) {
875 anv_batch_set_error(&cmd_buffer->batch, result);
876 return;
877 }
878 }
879
880 /* Render targets live in the same binding table as fragment descriptors */
881 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
882 descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
883
884 /* We emit the binding tables and sampler tables first, then emit push
885 * constants and then finally emit binding table and sampler table
886 * pointers. It has to happen in this order, since emitting the binding
887 * tables may change the push constants (in case of storage images). After
888 * emitting push constants, on SKL+ we have to emit the corresponding
889 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
890 */
891 uint32_t dirty = 0;
892 if (descriptors_dirty) {
893 dirty = genX(cmd_buffer_flush_descriptor_sets)(
894 cmd_buffer,
895 &cmd_buffer->state.gfx.base,
896 descriptors_dirty,
897 pipeline->base.shaders,
898 ARRAY_SIZE(pipeline->base.shaders));
899 cmd_buffer->state.descriptors_dirty &= ~dirty;
900 }
901
902 if (dirty || cmd_buffer->state.push_constants_dirty) {
903 /* Because we're pushing UBOs, we have to push whenever either
904 * descriptors or push constants is dirty.
905 */
906 dirty |= cmd_buffer->state.push_constants_dirty &
907 pipeline->base.base.active_stages;
908 cmd_buffer_flush_gfx_push_constants(cmd_buffer,
909 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
910 #if GFX_VERx10 >= 125
911 cmd_buffer_flush_mesh_inline_data(
912 cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
913 VK_SHADER_STAGE_MESH_BIT_EXT));
914 #endif
915 }
916
917 if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
918 cmd_buffer_emit_descriptor_pointers(cmd_buffer,
919 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
920 }
921
922 #if GFX_VER >= 20
923 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE) {
924 anv_batch_emit(&cmd_buffer->batch, GENX(STATE_BYTE_STRIDE), sb_stride) {
925 sb_stride.ByteStride = cmd_buffer->state.gfx.indirect_data_stride;
926 sb_stride.ByteStrideEnable = !cmd_buffer->state.gfx.indirect_data_stride_aligned;
927 }
928 }
929 #endif
930
931 cmd_buffer->state.gfx.dirty = 0;
932 }
933
934 ALWAYS_INLINE static bool
935 anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
936 {
937 const struct anv_device *device = cmd_buffer->device;
938 const struct anv_graphics_pipeline *pipeline =
939 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
940
941 /* We cannot generate readable commands in protected mode. */
942 if (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT)
943 return false;
944
945 /* Limit generated draws to pipelines without HS stage. This makes things
946 * simpler for implementing Wa_1306463417, Wa_16011107343.
947 */
948 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
949 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
950 return false;
951
952 return count >= device->physical->instance->generated_indirect_threshold;
953 }
954
955 #include "genX_cmd_draw_helpers.h"
956 #include "genX_cmd_draw_generated_indirect.h"
957
958 ALWAYS_INLINE static void
959 cmd_buffer_pre_draw_wa(struct anv_cmd_buffer *cmd_buffer)
960 {
961 UNUSED const bool protected = cmd_buffer->vk.pool->flags &
962 VK_COMMAND_POOL_CREATE_PROTECTED_BIT;
963 UNUSED struct anv_graphics_pipeline *pipeline =
964 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
965
966 #if INTEL_WA_16011107343_GFX_VER
967 if (intel_needs_workaround(cmd_buffer->device->info, 16011107343) &&
968 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL)) {
969 anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
970 final.hs, protected);
971 }
972 #endif
973
974 #if INTEL_WA_22018402687_GFX_VER
975 if (intel_needs_workaround(cmd_buffer->device->info, 22018402687) &&
976 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
977 /* Wa_22018402687:
978 * In any 3D enabled context, just before any Tessellation enabled
979 * draw call (3D Primitive), re-send the last programmed 3DSTATE_DS
980 * again. This will make sure that the 3DSTATE_INT generated just
981 * before the draw call will have TDS dirty which will make sure TDS
982 * will launch the state thread before the draw call.
983 *
984 * This fixes a hang resulting from running anything using tessellation
985 * after a switch away from the mesh pipeline. We don't need to track
986 * said switch, as it matters at the HW level, and can be triggered even
987 * across processes, so we apply the Wa at all times.
988 */
989 anv_batch_emit_pipeline_state_protected(&cmd_buffer->batch, pipeline,
990 final.ds, protected);
991 }
992 #endif
993
994 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
995 }
996
997 ALWAYS_INLINE static void
998 batch_post_draw_wa(struct anv_batch *batch,
999 const struct anv_device *device,
1000 uint32_t primitive_topology,
1001 uint32_t vertex_count)
1002 {
1003 #if INTEL_WA_22014412737_GFX_VER || INTEL_WA_16014538804_GFX_VER
1004 if (intel_needs_workaround(device->info, 22014412737) &&
1005 (primitive_topology == _3DPRIM_POINTLIST ||
1006 primitive_topology == _3DPRIM_LINELIST ||
1007 primitive_topology == _3DPRIM_LINESTRIP ||
1008 primitive_topology == _3DPRIM_LINELIST_ADJ ||
1009 primitive_topology == _3DPRIM_LINESTRIP_ADJ ||
1010 primitive_topology == _3DPRIM_LINELOOP ||
1011 primitive_topology == _3DPRIM_POINTLIST_BF ||
1012 primitive_topology == _3DPRIM_LINESTRIP_CONT ||
1013 primitive_topology == _3DPRIM_LINESTRIP_BF ||
1014 primitive_topology == _3DPRIM_LINESTRIP_CONT_BF) &&
1015 (vertex_count == 1 || vertex_count == 2)) {
1016 genx_batch_emit_pipe_control_write
1017 (batch, device->info, 0, WriteImmediateData,
1018 device->workaround_address, 0, 0);
1019
1020 /* Reset counter because we just emitted a PC */
1021 batch->num_3d_primitives_emitted = 0;
1022 } else if (intel_needs_workaround(device->info, 16014538804)) {
1023 batch->num_3d_primitives_emitted++;
1024 /* Wa_16014538804:
1025 * After every three 3D_Primitive commands,
1026 * at least one PIPE_CONTROL must be inserted.
1027 */
1028 if (batch->num_3d_primitives_emitted == 3) {
1029 anv_batch_emit(batch, GENX(PIPE_CONTROL), pc);
1030 batch->num_3d_primitives_emitted = 0;
1031 }
1032 }
1033 #endif
1034 }
1035
1036 void
1037 genX(batch_emit_post_3dprimitive_was)(struct anv_batch *batch,
1038 const struct anv_device *device,
1039 uint32_t primitive_topology,
1040 uint32_t vertex_count)
1041 {
1042 batch_post_draw_wa(batch, device, primitive_topology, vertex_count);
1043 }
1044
1045 ALWAYS_INLINE static void
1046 cmd_buffer_post_draw_wa(struct anv_cmd_buffer *cmd_buffer,
1047 uint32_t vertex_count,
1048 uint32_t access_type)
1049 {
1050 batch_post_draw_wa(&cmd_buffer->batch, cmd_buffer->device,
1051 cmd_buffer->state.gfx.dyn_state.vft.PrimitiveTopologyType,
1052 vertex_count);
1053
1054 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, access_type);
1055
1056 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1057 }
1058
1059 #if GFX_VER >= 11
1060 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
1061 #else
1062 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
1063 #endif
1064
1065 void genX(CmdDraw)(
1066 VkCommandBuffer commandBuffer,
1067 uint32_t vertexCount,
1068 uint32_t instanceCount,
1069 uint32_t firstVertex,
1070 uint32_t firstInstance)
1071 {
1072 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1073 struct anv_graphics_pipeline *pipeline =
1074 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1075
1076 if (anv_batch_has_error(&cmd_buffer->batch))
1077 return;
1078
1079 const uint32_t count =
1080 vertexCount * instanceCount * pipeline->instance_multiplier;
1081 anv_measure_snapshot(cmd_buffer,
1082 INTEL_SNAPSHOT_DRAW,
1083 "draw", count);
1084 trace_intel_begin_draw(&cmd_buffer->trace);
1085
1086 /* Select pipeline here to allow
1087 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1088 * cmd_buffer_flush_gfx_state().
1089 */
1090 genX(flush_pipeline_select_3d)(cmd_buffer);
1091
1092 #if GFX_VER < 11
1093 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
1094 get_vs_prog_data(pipeline),
1095 firstVertex, firstInstance, 0,
1096 false /* force_flush */);
1097 #endif
1098
1099 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1100
1101 if (cmd_buffer->state.conditional_render_enabled)
1102 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1103
1104 cmd_buffer_pre_draw_wa(cmd_buffer);
1105
1106 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1107 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1108 #if GFX_VERx10 >= 125
1109 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1110 #endif
1111 prim.VertexAccessType = SEQUENTIAL;
1112 prim.VertexCountPerInstance = vertexCount;
1113 prim.StartVertexLocation = firstVertex;
1114 prim.InstanceCount = instanceCount *
1115 pipeline->instance_multiplier;
1116 prim.StartInstanceLocation = firstInstance;
1117 prim.BaseVertexLocation = 0;
1118 #if GFX_VER >= 11
1119 prim.ExtendedParametersPresent = true;
1120 prim.ExtendedParameter0 = firstVertex;
1121 prim.ExtendedParameter1 = firstInstance;
1122 prim.ExtendedParameter2 = 0;
1123 #endif
1124 }
1125
1126 cmd_buffer_post_draw_wa(cmd_buffer, vertexCount, SEQUENTIAL);
1127
1128 trace_intel_end_draw(&cmd_buffer->trace, count,
1129 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1130 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1131 }
1132
1133 void genX(CmdDrawMultiEXT)(
1134 VkCommandBuffer commandBuffer,
1135 uint32_t drawCount,
1136 const VkMultiDrawInfoEXT *pVertexInfo,
1137 uint32_t instanceCount,
1138 uint32_t firstInstance,
1139 uint32_t stride)
1140 {
1141 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1142 UNUSED struct anv_graphics_pipeline *pipeline =
1143 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1144
1145 if (anv_batch_has_error(&cmd_buffer->batch))
1146 return;
1147
1148 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1149
1150 if (cmd_buffer->state.conditional_render_enabled)
1151 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1152
1153 uint32_t i = 0;
1154 #if GFX_VER < 11
1155 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1156 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
1157 get_vs_prog_data(pipeline),
1158 draw->firstVertex,
1159 firstInstance, i, !i);
1160
1161 const uint32_t count =
1162 draw->vertexCount * instanceCount * pipeline->instance_multiplier;
1163 anv_measure_snapshot(cmd_buffer,
1164 INTEL_SNAPSHOT_DRAW,
1165 "draw multi", count);
1166 trace_intel_begin_draw_multi(&cmd_buffer->trace);
1167
1168 cmd_buffer_pre_draw_wa(cmd_buffer);
1169
1170 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1171 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1172 prim.VertexAccessType = SEQUENTIAL;
1173 prim.VertexCountPerInstance = draw->vertexCount;
1174 prim.StartVertexLocation = draw->firstVertex;
1175 prim.InstanceCount = instanceCount *
1176 pipeline->instance_multiplier;
1177 prim.StartInstanceLocation = firstInstance;
1178 prim.BaseVertexLocation = 0;
1179 }
1180
1181 cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1182 pVertexInfo[drawCount - 1].vertexCount,
1183 SEQUENTIAL);
1184
1185 trace_intel_end_draw_multi(&cmd_buffer->trace, count,
1186 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1187 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1188 }
1189 #else
1190 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1191 const uint32_t count = draw->vertexCount * instanceCount;
1192 anv_measure_snapshot(cmd_buffer,
1193 INTEL_SNAPSHOT_DRAW,
1194 "draw multi", count);
1195 trace_intel_begin_draw_multi(&cmd_buffer->trace);
1196
1197 cmd_buffer_pre_draw_wa(cmd_buffer);
1198
1199 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1200 #if GFX_VERx10 >= 125
1201 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1202 #endif
1203 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1204 prim.VertexAccessType = SEQUENTIAL;
1205 prim.VertexCountPerInstance = draw->vertexCount;
1206 prim.StartVertexLocation = draw->firstVertex;
1207 prim.InstanceCount = instanceCount *
1208 pipeline->instance_multiplier;
1209 prim.StartInstanceLocation = firstInstance;
1210 prim.BaseVertexLocation = 0;
1211 prim.ExtendedParametersPresent = true;
1212 prim.ExtendedParameter0 = draw->firstVertex;
1213 prim.ExtendedParameter1 = firstInstance;
1214 prim.ExtendedParameter2 = i;
1215 }
1216
1217 cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1218 pVertexInfo[drawCount - 1].vertexCount,
1219 SEQUENTIAL);
1220
1221 trace_intel_end_draw_multi(&cmd_buffer->trace, count,
1222 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1223 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1224 }
1225 #endif
1226 }
1227
1228 void genX(CmdDrawIndexed)(
1229 VkCommandBuffer commandBuffer,
1230 uint32_t indexCount,
1231 uint32_t instanceCount,
1232 uint32_t firstIndex,
1233 int32_t vertexOffset,
1234 uint32_t firstInstance)
1235 {
1236 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1237 struct anv_graphics_pipeline *pipeline =
1238 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1239
1240 if (anv_batch_has_error(&cmd_buffer->batch))
1241 return;
1242
1243 const uint32_t count =
1244 indexCount * instanceCount * pipeline->instance_multiplier;
1245 anv_measure_snapshot(cmd_buffer,
1246 INTEL_SNAPSHOT_DRAW,
1247 "draw indexed",
1248 count);
1249 trace_intel_begin_draw_indexed(&cmd_buffer->trace);
1250
1251 /* Select pipeline here to allow
1252 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1253 * cmd_buffer_flush_gfx_state().
1254 */
1255 genX(flush_pipeline_select_3d)(cmd_buffer);
1256
1257 #if GFX_VER < 11
1258 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1259 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1260 vertexOffset, firstInstance,
1261 0, false /* force_flush */);
1262 #endif
1263
1264 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1265
1266 if (cmd_buffer->state.conditional_render_enabled)
1267 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1268
1269 cmd_buffer_pre_draw_wa(cmd_buffer);
1270
1271 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1272 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1273 #if GFX_VERx10 >= 125
1274 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1275 #endif
1276 prim.VertexAccessType = RANDOM;
1277 prim.VertexCountPerInstance = indexCount;
1278 prim.StartVertexLocation = firstIndex;
1279 prim.InstanceCount = instanceCount *
1280 pipeline->instance_multiplier;
1281 prim.StartInstanceLocation = firstInstance;
1282 prim.BaseVertexLocation = vertexOffset;
1283 #if GFX_VER >= 11
1284 prim.ExtendedParametersPresent = true;
1285 prim.ExtendedParameter0 = vertexOffset;
1286 prim.ExtendedParameter1 = firstInstance;
1287 prim.ExtendedParameter2 = 0;
1288 #endif
1289 }
1290
1291 cmd_buffer_post_draw_wa(cmd_buffer, indexCount, RANDOM);
1292
1293 trace_intel_end_draw_indexed(&cmd_buffer->trace, count,
1294 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1295 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1296 }
1297
1298 void genX(CmdDrawMultiIndexedEXT)(
1299 VkCommandBuffer commandBuffer,
1300 uint32_t drawCount,
1301 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
1302 uint32_t instanceCount,
1303 uint32_t firstInstance,
1304 uint32_t stride,
1305 const int32_t *pVertexOffset)
1306 {
1307 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1308 struct anv_graphics_pipeline *pipeline =
1309 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1310
1311 if (anv_batch_has_error(&cmd_buffer->batch))
1312 return;
1313
1314 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1315
1316 if (cmd_buffer->state.conditional_render_enabled)
1317 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1318
1319 uint32_t i = 0;
1320 #if GFX_VER < 11
1321 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1322 if (pVertexOffset) {
1323 if (vs_prog_data->uses_drawid) {
1324 bool emitted = true;
1325 if (vs_prog_data->uses_firstvertex ||
1326 vs_prog_data->uses_baseinstance) {
1327 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1328 emitted = true;
1329 }
1330 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1331 if (vs_prog_data->uses_drawid) {
1332 emit_draw_index(cmd_buffer, i);
1333 emitted = true;
1334 }
1335 /* Emitting draw index or vertex index BOs may result in needing
1336 * additional VF cache flushes.
1337 */
1338 if (emitted)
1339 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1340
1341 const uint32_t count =
1342 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1343 anv_measure_snapshot(cmd_buffer,
1344 INTEL_SNAPSHOT_DRAW,
1345 "draw indexed multi",
1346 count);
1347 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1348
1349 cmd_buffer_pre_draw_wa(cmd_buffer);
1350
1351 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1352 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1353 prim.VertexAccessType = RANDOM;
1354 prim.VertexCountPerInstance = draw->indexCount;
1355 prim.StartVertexLocation = draw->firstIndex;
1356 prim.InstanceCount = instanceCount *
1357 pipeline->instance_multiplier;
1358 prim.StartInstanceLocation = firstInstance;
1359 prim.BaseVertexLocation = *pVertexOffset;
1360 }
1361
1362 cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1363 pIndexInfo[drawCount - 1].indexCount,
1364 RANDOM);
1365
1366 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1367 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1368 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1369 emitted = false;
1370 }
1371 } else {
1372 if (vs_prog_data->uses_firstvertex ||
1373 vs_prog_data->uses_baseinstance) {
1374 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1375 /* Emitting draw index or vertex index BOs may result in needing
1376 * additional VF cache flushes.
1377 */
1378 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1379 }
1380 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1381 const uint32_t count =
1382 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1383 anv_measure_snapshot(cmd_buffer,
1384 INTEL_SNAPSHOT_DRAW,
1385 "draw indexed multi",
1386 count);
1387 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1388
1389 cmd_buffer_pre_draw_wa(cmd_buffer);
1390
1391 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1392 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1393 prim.VertexAccessType = RANDOM;
1394 prim.VertexCountPerInstance = draw->indexCount;
1395 prim.StartVertexLocation = draw->firstIndex;
1396 prim.InstanceCount = instanceCount *
1397 pipeline->instance_multiplier;
1398 prim.StartInstanceLocation = firstInstance;
1399 prim.BaseVertexLocation = *pVertexOffset;
1400 }
1401
1402 cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1403 pIndexInfo[drawCount - 1].indexCount,
1404 RANDOM);
1405
1406 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1407 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1408 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1409 }
1410 }
1411 } else {
1412 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1413 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1414 draw->vertexOffset,
1415 firstInstance, i, i != 0);
1416
1417 const uint32_t count =
1418 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1419 anv_measure_snapshot(cmd_buffer,
1420 INTEL_SNAPSHOT_DRAW,
1421 "draw indexed multi",
1422 count);
1423 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1424
1425 cmd_buffer_pre_draw_wa(cmd_buffer);
1426
1427 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1428 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1429 prim.VertexAccessType = RANDOM;
1430 prim.VertexCountPerInstance = draw->indexCount;
1431 prim.StartVertexLocation = draw->firstIndex;
1432 prim.InstanceCount = instanceCount *
1433 pipeline->instance_multiplier;
1434 prim.StartInstanceLocation = firstInstance;
1435 prim.BaseVertexLocation = draw->vertexOffset;
1436 }
1437
1438 cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1439 pIndexInfo[drawCount - 1].indexCount,
1440 RANDOM);
1441
1442 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1443 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1444 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1445 }
1446 }
1447 #else
1448 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1449 const uint32_t count =
1450 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1451 anv_measure_snapshot(cmd_buffer,
1452 INTEL_SNAPSHOT_DRAW,
1453 "draw indexed multi",
1454 count);
1455 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1456
1457 cmd_buffer_pre_draw_wa(cmd_buffer);
1458
1459 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
1460 #if GFX_VERx10 >= 125
1461 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1462 #endif
1463 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1464 prim.VertexAccessType = RANDOM;
1465 prim.VertexCountPerInstance = draw->indexCount;
1466 prim.StartVertexLocation = draw->firstIndex;
1467 prim.InstanceCount = instanceCount *
1468 pipeline->instance_multiplier;
1469 prim.StartInstanceLocation = firstInstance;
1470 prim.BaseVertexLocation = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1471 prim.ExtendedParametersPresent = true;
1472 prim.ExtendedParameter0 = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1473 prim.ExtendedParameter1 = firstInstance;
1474 prim.ExtendedParameter2 = i;
1475 }
1476
1477 cmd_buffer_post_draw_wa(cmd_buffer, drawCount == 0 ? 0 :
1478 pIndexInfo[drawCount - 1].indexCount,
1479 RANDOM);
1480
1481 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count,
1482 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1483 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1484 }
1485 #endif
1486 }
1487
1488 /* Auto-Draw / Indirect Registers */
1489 #define GFX7_3DPRIM_END_OFFSET 0x2420
1490 #define GFX7_3DPRIM_START_VERTEX 0x2430
1491 #define GFX7_3DPRIM_VERTEX_COUNT 0x2434
1492 #define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
1493 #define GFX7_3DPRIM_START_INSTANCE 0x243C
1494 #define GFX7_3DPRIM_BASE_VERTEX 0x2440
1495
1496 /* On Gen11+, we have three custom "extended parameters" which we can use to
1497 * provide extra system-generated values to shaders. Our assignment of these
1498 * is arbitrary; we choose to assign them as follows:
1499 *
1500 * gl_BaseVertex = XP0
1501 * gl_BaseInstance = XP1
1502 * gl_DrawID = XP2
1503 *
1504 * For gl_BaseInstance, we never actually have to set up the value because we
1505 * can just program 3DSTATE_VF_SGVS_2 to load it implicitly. We can also do
1506 * that for gl_BaseVertex but it does the wrong thing for indexed draws.
1507 */
1508 #define GEN11_3DPRIM_XP0 0x2690
1509 #define GEN11_3DPRIM_XP1 0x2694
1510 #define GEN11_3DPRIM_XP2 0x2698
1511 #define GEN11_3DPRIM_XP_BASE_VERTEX GEN11_3DPRIM_XP0
1512 #define GEN11_3DPRIM_XP_BASE_INSTANCE GEN11_3DPRIM_XP1
1513 #define GEN11_3DPRIM_XP_DRAW_ID GEN11_3DPRIM_XP2
1514
1515 void genX(CmdDrawIndirectByteCountEXT)(
1516 VkCommandBuffer commandBuffer,
1517 uint32_t instanceCount,
1518 uint32_t firstInstance,
1519 VkBuffer counterBuffer,
1520 VkDeviceSize counterBufferOffset,
1521 uint32_t counterOffset,
1522 uint32_t vertexStride)
1523 {
1524 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1525 ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
1526 struct anv_graphics_pipeline *pipeline =
1527 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1528
1529 /* firstVertex is always zero for this draw function */
1530 const uint32_t firstVertex = 0;
1531
1532 if (anv_batch_has_error(&cmd_buffer->batch))
1533 return;
1534
1535 anv_measure_snapshot(cmd_buffer,
1536 INTEL_SNAPSHOT_DRAW,
1537 "draw indirect byte count",
1538 instanceCount * pipeline->instance_multiplier);
1539 trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
1540
1541 /* Select pipeline here to allow
1542 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1543 * emit_base_vertex_instance() & emit_draw_index().
1544 */
1545 genX(flush_pipeline_select_3d)(cmd_buffer);
1546
1547 #if GFX_VER < 11
1548 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1549 if (vs_prog_data->uses_firstvertex ||
1550 vs_prog_data->uses_baseinstance)
1551 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1552 if (vs_prog_data->uses_drawid)
1553 emit_draw_index(cmd_buffer, 0);
1554 #endif
1555
1556 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1557
1558 if (cmd_buffer->state.conditional_render_enabled)
1559 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1560
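   /* The vertex count is not known at record time: it is
    * (bytes written to the counter buffer - counterOffset) / vertexStride.
    * Compute it on the GPU with the MI builder, park the result in the
    * 3DPRIMITIVE vertex-count register, then emit an indirect draw.
    */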
1561 struct mi_builder b;
1562 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1563 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
1564 mi_builder_set_mocs(&b, mocs);
1565 struct mi_value count =
1566 mi_mem32(anv_address_add(counter_buffer->address,
1567 counterBufferOffset));
1568 if (counterOffset)
1569 count = mi_isub(&b, count, mi_imm(counterOffset));
1570 count = mi_udiv32_imm(&b, count, vertexStride);
1571 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
1572
1573 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
1574 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
1575 mi_imm(instanceCount * pipeline->instance_multiplier));
1576 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
1577 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1578
1579 #if GFX_VER >= 11
1580 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1581 mi_imm(firstVertex));
1582 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1583 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
1584 #endif
1585
1586 cmd_buffer_pre_draw_wa(cmd_buffer);
1587
1588 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1589 #if GFX_VERx10 >= 125
1590 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1591 #endif
1592 prim.IndirectParameterEnable = true;
1593 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1594 prim.VertexAccessType = SEQUENTIAL;
1595 #if GFX_VER >= 11
1596 prim.ExtendedParametersPresent = true;
1597 #endif
1598 }
1599
1600 cmd_buffer_post_draw_wa(cmd_buffer, 1, SEQUENTIAL);
1601
1602 trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
1603 instanceCount * pipeline->instance_multiplier,
1604 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1605 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1606 }
1607
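/* Load the 3DPRIMITIVE indirect registers from a single indirect draw
 * record.  `addr` points at either a VkDrawIndirectCommand or a
 * VkDrawIndexedIndirectCommand, whose layouts (per the Vulkan spec) are:
 *
 *    VkDrawIndirectCommand           VkDrawIndexedIndirectCommand
 *     0: vertexCount                  0: indexCount
 *     4: instanceCount                4: instanceCount
 *     8: firstVertex                  8: firstIndex
 *    12: firstInstance               12: vertexOffset
 *                                    16: firstInstance
 *
 * which is where the byte offsets used below come from.
 */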
1608 static void
1609 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
1610 struct anv_address addr,
1611 bool indexed,
1612 uint32_t draw_id)
1613 {
1614 struct anv_graphics_pipeline *pipeline =
1615 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1616
1617 struct mi_builder b;
1618 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1619 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
1620 mi_builder_set_mocs(&b, mocs);
1621
1622 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
1623 mi_mem32(anv_address_add(addr, 0)));
1624
1625 struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
1626 if (pipeline->instance_multiplier > 1) {
1627 instance_count = mi_imul_imm(&b, instance_count,
1628 pipeline->instance_multiplier);
1629 }
1630 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
1631
1632 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
1633 mi_mem32(anv_address_add(addr, 8)));
1634
1635 if (indexed) {
1636 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
1637 mi_mem32(anv_address_add(addr, 12)));
1638 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1639 mi_mem32(anv_address_add(addr, 16)));
1640 #if GFX_VER >= 11
1641 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1642 mi_mem32(anv_address_add(addr, 12)));
1643 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1644 #endif
1645 } else {
1646 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1647 mi_mem32(anv_address_add(addr, 12)));
1648 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1649 #if GFX_VER >= 11
1650 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1651 mi_mem32(anv_address_add(addr, 8)));
1652 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1653 #endif
1654 }
1655
1656 #if GFX_VER >= 11
1657 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
1658 mi_imm(draw_id));
1659 #endif
1660 }
1661
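/* Whether a draw can use the Gfx12.5+ EXECUTE_INDIRECT_DRAW instruction
 * ("indirect unroll") instead of emitting one MI-programmed draw per record.
 * That path gives us no hook to supply gl_DrawID, gl_BaseVertex or
 * gl_BaseInstance to the shaders, nor to apply the multiview instance
 * multiplier, so it is only taken when none of those are needed.
 */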
1662 static inline bool
1663 execute_indirect_draw_supported(const struct anv_cmd_buffer *cmd_buffer)
1664 {
1665 #if GFX_VERx10 >= 125
1666 const struct intel_device_info *devinfo = cmd_buffer->device->info;
1667
1668 if (!devinfo->has_indirect_unroll)
1669 return false;
1670
1671 struct anv_graphics_pipeline *pipeline =
1672 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1673 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1674 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
1675 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
1676 const bool is_multiview = pipeline->instance_multiplier > 1;
1677
1678 const bool uses_draw_id =
1679 (vs_prog_data && vs_prog_data->uses_drawid) ||
1680 (mesh_prog_data && mesh_prog_data->uses_drawid) ||
1681 (task_prog_data && task_prog_data->uses_drawid);
1682
1683 const bool uses_firstvertex =
1684 (vs_prog_data && vs_prog_data->uses_firstvertex);
1685
1686 const bool uses_baseinstance =
1687 (vs_prog_data && vs_prog_data->uses_baseinstance);
1688
1689 return !is_multiview &&
1690 !uses_draw_id &&
1691 !uses_firstvertex &&
1692 !uses_baseinstance;
1693 #else
1694 return false;
1695 #endif
1696 }
1697
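/* Fallback indirect-draw path: for each record, load the 3DPRIMITIVE
 * indirect registers from the record with MI commands and emit one draw.
 * On Gfx11+ the draw-id and base vertex/instance reach the shader through
 * the extended parameter registers; before that they have to be emitted
 * separately per draw via emit_base_vertex_instance_bo() /
 * emit_draw_index().
 */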
1698 static void
1699 emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
1700 struct anv_address indirect_data_addr,
1701 uint32_t indirect_data_stride,
1702 uint32_t draw_count,
1703 bool indexed)
1704 {
1705 #if GFX_VER < 11
1706 struct anv_graphics_pipeline *pipeline =
1707 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1708 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1709 #endif
1710 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1711
1712 if (cmd_buffer->state.conditional_render_enabled)
1713 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1714
1715 uint32_t offset = 0;
1716 for (uint32_t i = 0; i < draw_count; i++) {
1717 struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1718
1719 #if GFX_VER < 11
1720 /* TODO: We need to stomp base vertex to 0 somehow */
1721
1722 /* With sequential draws, we're dealing with the VkDrawIndirectCommand
1723 * structure data. We want to load VkDrawIndirectCommand::firstVertex at
1724 * offset 8 in the structure.
1725 *
1726 * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
1727 * We want the VkDrawIndexedIndirectCommand::vertexOffset field at
1728 * offset 12 in the structure.
1729 */
1730 if (vs_prog_data->uses_firstvertex ||
1731 vs_prog_data->uses_baseinstance) {
1732 emit_base_vertex_instance_bo(cmd_buffer,
1733 anv_address_add(draw, indexed ? 12 : 8));
1734 }
1735 if (vs_prog_data->uses_drawid)
1736 emit_draw_index(cmd_buffer, i);
1737 #endif
1738
1739 /* Emitting draw index or vertex index BOs may result in needing
1740 * additional VF cache flushes.
1741 */
1742 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1743
1744 load_indirect_parameters(cmd_buffer, draw, indexed, i);
1745
1746 cmd_buffer_pre_draw_wa(cmd_buffer);
1747
1748 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1749 #if GFX_VERx10 >= 125
1750 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1751 #endif
1752 prim.IndirectParameterEnable = true;
1753 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1754 prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
1755 #if GFX_VER >= 11
1756 prim.ExtendedParametersPresent = true;
1757 #endif
1758 }
1759
1760 cmd_buffer_post_draw_wa(cmd_buffer, 1, indexed ? RANDOM : SEQUENTIAL);
1761
1762 offset += indirect_data_stride;
1763 }
1764 }
1765
1766 static inline uint32_t xi_argument_format_for_vk_cmd(enum vk_cmd_type cmd)
1767 {
1768 #if GFX_VERx10 >= 125
1769 switch (cmd) {
1770 case VK_CMD_DRAW_INDIRECT:
1771 case VK_CMD_DRAW_INDIRECT_COUNT:
1772 return XI_DRAW;
1773 case VK_CMD_DRAW_INDEXED_INDIRECT:
1774 case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1775 return XI_DRAWINDEXED;
1776 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1777 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1778 return XI_MESH_3D;
1779 default:
1780 unreachable("unhandled cmd type");
1781 }
1782 #else
1783 unreachable("unsupported GFX VER");
1784 #endif
1785 }
1786
1787 static inline bool
1788 cmd_buffer_set_indirect_stride(struct anv_cmd_buffer *cmd_buffer,
1789 uint32_t stride, enum vk_cmd_type cmd)
1790 {
1791 /* Should have been sanitized by the caller */
1792 assert(stride != 0);
1793
1794 uint32_t data_stride = 0;
1795
1796 switch (cmd) {
1797 case VK_CMD_DRAW_INDIRECT:
1798 case VK_CMD_DRAW_INDIRECT_COUNT:
1799 data_stride = sizeof(VkDrawIndirectCommand);
1800 break;
1801 case VK_CMD_DRAW_INDEXED_INDIRECT:
1802 case VK_CMD_DRAW_INDEXED_INDIRECT_COUNT:
1803 data_stride = sizeof(VkDrawIndexedIndirectCommand);
1804 break;
1805 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT:
1806 case VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT:
1807 data_stride = sizeof(VkDrawMeshTasksIndirectCommandEXT);
1808 break;
1809 default:
1810 unreachable("unhandled cmd type");
1811 }
1812
1813 bool aligned = stride == data_stride;
1814
1815 #if GFX_VER >= 20
1816 /* As long as the stride matches the default command stride and
1817 * STATE_BYTE_STRIDE::ByteStrideEnable=false, we can just do nothing.
1818 *
1819 * Otherwise, with STATE_BYTE_STRIDE::ByteStrideEnable=true, any stride
1820 * change must be signaled.
1821 */
1822 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
1823 if (gfx_state->indirect_data_stride_aligned != aligned) {
1824 gfx_state->indirect_data_stride = stride;
1825 gfx_state->indirect_data_stride_aligned = aligned;
1826 gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1827 } else if (!gfx_state->indirect_data_stride_aligned &&
1828 gfx_state->indirect_data_stride != stride) {
1829 gfx_state->indirect_data_stride = stride;
1830 gfx_state->indirect_data_stride_aligned = aligned;
1831 gfx_state->dirty |= ANV_CMD_DIRTY_INDIRECT_DATA_STRIDE;
1832 }
1833 #endif
1834
1835 return aligned;
1836 }
1837
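/* Emit indirect draws with EXECUTE_INDIRECT_DRAW (Gfx12.5+).  If every
 * record is tightly packed at the default API command stride, a single
 * instruction with MaxCount = max_draw_count lets the hardware unroll the
 * whole argument buffer; otherwise one instruction is emitted per record
 * (on Gfx20 a non-default stride can instead be programmed through
 * STATE_BYTE_STRIDE, see cmd_buffer_set_indirect_stride()).
 */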
1838 static void
1839 genX(cmd_buffer_emit_execute_indirect_draws)(struct anv_cmd_buffer *cmd_buffer,
1840 struct anv_address indirect_data_addr,
1841 uint32_t indirect_data_stride,
1842 struct anv_address count_addr,
1843 uint32_t max_draw_count,
1844 enum vk_cmd_type cmd)
1845 {
1846 #if GFX_VERx10 >= 125
1847 bool aligned_stride =
1848 cmd_buffer_set_indirect_stride(cmd_buffer, indirect_data_stride, cmd);
1849
1850 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1851
1852 if (cmd_buffer->state.conditional_render_enabled)
1853 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1854
1855 uint32_t offset = 0;
1856 for (uint32_t i = 0; i < max_draw_count; i++) {
1857 struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1858
1859 cmd_buffer_pre_draw_wa(cmd_buffer);
1860
1861 anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
1862 ind.ArgumentFormat = xi_argument_format_for_vk_cmd(cmd);
1863 ind.TBIMREnabled = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1864 ind.PredicateEnable =
1865 cmd_buffer->state.conditional_render_enabled;
1866 ind.MaxCount = aligned_stride ? max_draw_count : 1;
1867 ind.ArgumentBufferStartAddress = draw;
1868 ind.CountBufferAddress = count_addr;
1869 ind.CountBufferIndirectEnable = !anv_address_is_null(count_addr);
1870 ind.MOCS =
1871 anv_mocs(cmd_buffer->device, draw.bo, 0);
1873 }
1874
1875 cmd_buffer_post_draw_wa(cmd_buffer, 1,
1876 0 /* Doesn't matter for GFX_VER > 9 */);
1877
1878 /* If all the indirect structures are aligned, then we can let the HW
1879 * do the unrolling and we only need one instruction. Otherwise we
1880 * need to emit one instruction per draw, but we're still avoiding
1881 * the register loads with MI commands.
1882 */
1883 if (aligned_stride || GFX_VER >= 20)
1884 break;
1885
1886 offset += indirect_data_stride;
1887 }
1888 #endif // GFX_VERx10 >= 125
1889 }
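
/* The vkCmdDraw*Indirect* entry points below pick between three emission
 * strategies, in order of preference:
 *
 *  1. EXECUTE_INDIRECT_DRAW, when the hardware can unroll the indirect
 *     buffer itself (execute_indirect_draw_supported()),
 *  2. generated draws, where the driver generates the equivalent direct
 *     draw commands on the GPU (anv_use_generated_draws()),
 *  3. the MI register-load fallback (emit_indirect_draws() /
 *     emit_indirect_count_draws()).
 */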
1890 void genX(CmdDrawIndirect)(
1891 VkCommandBuffer commandBuffer,
1892 VkBuffer _buffer,
1893 VkDeviceSize offset,
1894 uint32_t drawCount,
1895 uint32_t stride)
1896 {
1897 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1898 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1899 struct anv_graphics_pipeline *pipeline =
1900 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1901
1902 if (anv_batch_has_error(&cmd_buffer->batch))
1903 return;
1904
1905 anv_measure_snapshot(cmd_buffer,
1906 INTEL_SNAPSHOT_DRAW,
1907 "draw indirect",
1908 drawCount);
1909 trace_intel_begin_draw_indirect(&cmd_buffer->trace);
1910
1911 struct anv_address indirect_data_addr =
1912 anv_address_add(buffer->address, offset);
1913
1914 stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
1915
1916 if (execute_indirect_draw_supported(cmd_buffer)) {
1917 genX(cmd_buffer_emit_execute_indirect_draws)(
1918 cmd_buffer,
1919 indirect_data_addr,
1920 stride,
1921 ANV_NULL_ADDRESS /* count_addr */,
1922 drawCount,
1923 VK_CMD_DRAW_INDIRECT);
1924 } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1925 genX(cmd_buffer_emit_indirect_generated_draws)(
1926 cmd_buffer,
1927 indirect_data_addr,
1928 stride,
1929 ANV_NULL_ADDRESS /* count_addr */,
1930 drawCount,
1931 false /* indexed */);
1932 } else {
1933 emit_indirect_draws(cmd_buffer,
1934 indirect_data_addr,
1935 stride, drawCount, false /* indexed */);
1936 }
1937
1938 trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount,
1939 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1940 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1941 }
1942
1943 void genX(CmdDrawIndexedIndirect)(
1944 VkCommandBuffer commandBuffer,
1945 VkBuffer _buffer,
1946 VkDeviceSize offset,
1947 uint32_t drawCount,
1948 uint32_t stride)
1949 {
1950 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1951 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1952 struct anv_graphics_pipeline *pipeline =
1953 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1954
1955 if (anv_batch_has_error(&cmd_buffer->batch))
1956 return;
1957
1958 anv_measure_snapshot(cmd_buffer,
1959 INTEL_SNAPSHOT_DRAW,
1960 "draw indexed indirect",
1961 drawCount);
1962 trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
1963
1964 struct anv_address indirect_data_addr =
1965 anv_address_add(buffer->address, offset);
1966
1967 stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
1968
1969 if (execute_indirect_draw_supported(cmd_buffer)) {
1970 genX(cmd_buffer_emit_execute_indirect_draws)(
1971 cmd_buffer,
1972 indirect_data_addr,
1973 stride,
1974 ANV_NULL_ADDRESS /* count_addr */,
1975 drawCount,
1976 VK_CMD_DRAW_INDEXED_INDIRECT);
1977 } else if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1978 genX(cmd_buffer_emit_indirect_generated_draws)(
1979 cmd_buffer,
1980 indirect_data_addr,
1981 stride,
1982 ANV_NULL_ADDRESS /* count_addr */,
1983 drawCount,
1984 true /* indexed */);
1985 } else {
1986 emit_indirect_draws(cmd_buffer,
1987 indirect_data_addr,
1988 stride, drawCount, true /* indexed */);
1989 }
1990
1991 trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount,
1992 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
1993 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
1994 }
1995
1996 #define MI_PREDICATE_SRC0 0x2400
1997 #define MI_PREDICATE_SRC1 0x2408
1998 #define MI_PREDICATE_RESULT 0x2418
1999
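/* Draw-count (vkCmdDraw*IndirectCount) predication: the draw count read
 * from the count buffer goes into MI_PREDICATE_SRC0 and the current draw
 * index into MI_PREDICATE_SRC1; MI_PREDICATE then accumulates a result
 * that stays TRUE while draw_index < draw_count and turns FALSE for every
 * draw past the real count, so the extra 3DPRIMITIVEs are discarded.
 * When conditional rendering is also enabled, the predicate is computed
 * with MI ALU math instead (see
 * emit_draw_count_predicate_with_conditional_render()).
 */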
2000 static struct mi_value
2001 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
2002 struct mi_builder *b,
2003 struct anv_address count_address)
2004 {
2005 struct mi_value ret = mi_imm(0);
2006
2007 if (cmd_buffer->state.conditional_render_enabled) {
2008 ret = mi_new_gpr(b);
2009 mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
2010 } else {
2011 /* Upload the current draw count from the draw parameters buffer to
2012 * MI_PREDICATE_SRC0.
2013 */
2014 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
2015 mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
2016 }
2017
2018 return ret;
2019 }
2020
2021 static void
2022 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
2023 struct mi_builder *b,
2024 uint32_t draw_index)
2025 {
2026 /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
2027 mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
2028
2029 if (draw_index == 0) {
2030 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
2031 mip.LoadOperation = LOAD_LOADINV;
2032 mip.CombineOperation = COMBINE_SET;
2033 mip.CompareOperation = COMPARE_SRCS_EQUAL;
2034 }
2035 } else {
2036 /* For draw_index > 0, the compare result (draw_index == draw_count)
2037 * is XORed into the accumulated predicate:
2038 *  while draw_index < draw_count:  FALSE ^ TRUE  = TRUE
2039 *  when  draw_index == draw_count: TRUE  ^ TRUE  = FALSE
2040 *  for every draw after that:      FALSE ^ FALSE = FALSE
2041 */
2043 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
2044 mip.LoadOperation = LOAD_LOAD;
2045 mip.CombineOperation = COMBINE_XOR;
2046 mip.CompareOperation = COMPARE_SRCS_EQUAL;
2047 }
2048 }
2049 }
2050
2051 static void
2052 emit_draw_count_predicate_with_conditional_render(
2053 struct anv_cmd_buffer *cmd_buffer,
2054 struct mi_builder *b,
2055 uint32_t draw_index,
2056 struct mi_value max)
2057 {
2058 struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
2059 pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
2060
2061 mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
2062 }
2063
2064 static void
2065 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
2066 struct mi_builder *b,
2067 uint32_t draw_index,
2068 struct mi_value max)
2069 {
2070 if (cmd_buffer->state.conditional_render_enabled) {
2071 emit_draw_count_predicate_with_conditional_render(
2072 cmd_buffer, b, draw_index, mi_value_ref(b, max));
2073 } else {
2074 emit_draw_count_predicate(cmd_buffer, b, draw_index);
2075 }
2076 }
2077
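/* MI fallback for vkCmdDraw*IndirectCount: the real draw count lives in a
 * GPU buffer, so emit max_draw_count predicated draws and rely on the
 * MI_PREDICATE setup above to turn the draws past the actual count into
 * no-ops.
 */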
2078 static void
2079 emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
2080 struct anv_address indirect_data_addr,
2081 uint64_t indirect_data_stride,
2082 struct anv_address draw_count_addr,
2083 uint32_t max_draw_count,
2084 bool indexed)
2085 {
2086 #if GFX_VER < 11
2087 struct anv_graphics_pipeline *pipeline =
2088 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2089 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
2090 #endif
2091
2092 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2093
2094 struct mi_builder b;
2095 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2096 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
2097 mi_builder_set_mocs(&b, mocs);
2098 struct mi_value max =
2099 prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
2100
2101 for (uint32_t i = 0; i < max_draw_count; i++) {
2102 struct anv_address draw =
2103 anv_address_add(indirect_data_addr, i * indirect_data_stride);
2104
2105 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2106
2107 #if GFX_VER < 11
2108 if (vs_prog_data->uses_firstvertex ||
2109 vs_prog_data->uses_baseinstance) {
2110 emit_base_vertex_instance_bo(cmd_buffer,
2111 anv_address_add(draw, indexed ? 12 : 8));
2112 }
2113 if (vs_prog_data->uses_drawid)
2114 emit_draw_index(cmd_buffer, i);
2115
2116 /* Emitting draw index or vertex index BOs may result in needing
2117 * additional VF cache flushes.
2118 */
2119 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2120 #endif
2121
2122 load_indirect_parameters(cmd_buffer, draw, indexed, i);
2123
2124 cmd_buffer_pre_draw_wa(cmd_buffer);
2125
2126 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
2127 #if GFX_VERx10 >= 125
2128 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
2129 #endif
2130 prim.IndirectParameterEnable = true;
2131 prim.PredicateEnable = true;
2132 prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
2133 #if GFX_VER >= 11
2134 prim.ExtendedParametersPresent = true;
2135 #endif
2136 }
2137
2138 cmd_buffer_post_draw_wa(cmd_buffer, 1, indexed ? RANDOM : SEQUENTIAL);
2139 }
2140
2141 mi_value_unref(&b, max);
2142 }
2143
2144 void genX(CmdDrawIndirectCount)(
2145 VkCommandBuffer commandBuffer,
2146 VkBuffer _buffer,
2147 VkDeviceSize offset,
2148 VkBuffer _countBuffer,
2149 VkDeviceSize countBufferOffset,
2150 uint32_t maxDrawCount,
2151 uint32_t stride)
2152 {
2153 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2154 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2155 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2156 struct anv_graphics_pipeline *pipeline =
2157 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2158
2159 if (anv_batch_has_error(&cmd_buffer->batch))
2160 return;
2161
2162 anv_measure_snapshot(cmd_buffer,
2163 INTEL_SNAPSHOT_DRAW,
2164 "draw indirect count",
2165 0);
2166 trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
2167
2168 struct anv_address indirect_data_address =
2169 anv_address_add(buffer->address, offset);
2170 struct anv_address count_address =
2171 anv_address_add(count_buffer->address, countBufferOffset);
2172 stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
2173
2174 if (execute_indirect_draw_supported(cmd_buffer)) {
2175 genX(cmd_buffer_emit_execute_indirect_draws)(
2176 cmd_buffer,
2177 indirect_data_address,
2178 stride,
2179 count_address,
2180 maxDrawCount,
2181 VK_CMD_DRAW_INDIRECT_COUNT);
2182 } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2183 genX(cmd_buffer_emit_indirect_generated_draws)(
2184 cmd_buffer,
2185 indirect_data_address,
2186 stride,
2187 count_address,
2188 maxDrawCount,
2189 false /* indexed */);
2190 } else {
2191 emit_indirect_count_draws(cmd_buffer,
2192 indirect_data_address,
2193 stride,
2194 count_address,
2195 maxDrawCount,
2196 false /* indexed */);
2197 }
2198
2199 trace_intel_end_draw_indirect_count(&cmd_buffer->trace,
2200 anv_address_utrace(count_address),
2201 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
2202 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
2203 }
2204
2205 void genX(CmdDrawIndexedIndirectCount)(
2206 VkCommandBuffer commandBuffer,
2207 VkBuffer _buffer,
2208 VkDeviceSize offset,
2209 VkBuffer _countBuffer,
2210 VkDeviceSize countBufferOffset,
2211 uint32_t maxDrawCount,
2212 uint32_t stride)
2213 {
2214 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2215 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2216 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2217 struct anv_graphics_pipeline *pipeline =
2218 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2219
2220 if (anv_batch_has_error(&cmd_buffer->batch))
2221 return;
2222
2223 anv_measure_snapshot(cmd_buffer,
2224 INTEL_SNAPSHOT_DRAW,
2225 "draw indexed indirect count",
2226 0);
2227 trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
2228
2229 struct anv_address indirect_data_address =
2230 anv_address_add(buffer->address, offset);
2231 struct anv_address count_address =
2232 anv_address_add(count_buffer->address, countBufferOffset);
2233 stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
2234
2235 if (execute_indirect_draw_supported(cmd_buffer)) {
2236 genX(cmd_buffer_emit_execute_indirect_draws)(
2237 cmd_buffer,
2238 indirect_data_address,
2239 stride,
2240 count_address,
2241 maxDrawCount,
2242 VK_CMD_DRAW_INDEXED_INDIRECT_COUNT);
2243 } else if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2244 genX(cmd_buffer_emit_indirect_generated_draws)(
2245 cmd_buffer,
2246 indirect_data_address,
2247 stride,
2248 count_address,
2249 maxDrawCount,
2250 true /* indexed */);
2251 } else {
2252 emit_indirect_count_draws(cmd_buffer,
2253 indirect_data_address,
2254 stride,
2255 count_address,
2256 maxDrawCount,
2257 true /* indexed */);
2258 }
2259
2260 trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace,
2261 anv_address_utrace(count_address),
2262 pipeline->base.source_hashes[MESA_SHADER_VERTEX],
2263 pipeline->base.source_hashes[MESA_SHADER_FRAGMENT]);
2265 }
2266
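/* Transform feedback bookkeeping: the SO_WRITE_OFFSETn registers hold the
 * byte offset the hardware is currently writing to in each XFB buffer.
 * Begin loads them from the counter buffers (resume) or zeroes them (fresh
 * begin); End stores them back so that a later resume, or
 * vkCmdDrawIndirectByteCountEXT, can see how many bytes were written.
 */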
2267 void genX(CmdBeginTransformFeedbackEXT)(
2268 VkCommandBuffer commandBuffer,
2269 uint32_t firstCounterBuffer,
2270 uint32_t counterBufferCount,
2271 const VkBuffer* pCounterBuffers,
2272 const VkDeviceSize* pCounterBufferOffsets)
2273 {
2274 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2275
2276 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2277 assert(counterBufferCount <= MAX_XFB_BUFFERS);
2278 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2279
2280 trace_intel_begin_xfb(&cmd_buffer->trace);
2281
2282 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2283 *
2284 * "Software must ensure that no HW stream output operations can be in
2285 * process or otherwise pending at the point that the MI_LOAD/STORE
2286 * commands are processed. This will likely require a pipeline flush."
2287 */
2288 anv_add_pending_pipe_bits(cmd_buffer,
2289 ANV_PIPE_CS_STALL_BIT,
2290 "begin transform feedback");
2291 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2292
2293 struct mi_builder b;
2294 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2295
2296 for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
2297 /* If we have a counter buffer, this is a resume so we need to load the
2298 * value into the streamout offset register. Otherwise, this is a begin
2299 * and we need to reset it to zero.
2300 */
2301 if (pCounterBuffers &&
2302 idx >= firstCounterBuffer &&
2303 idx - firstCounterBuffer < counterBufferCount &&
2304 pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
2305 uint32_t cb_idx = idx - firstCounterBuffer;
2306 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2307 uint64_t offset = pCounterBufferOffsets ?
2308 pCounterBufferOffsets[cb_idx] : 0;
2309 mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4),
2310 mi_mem32(anv_address_add(counter_buffer->address, offset)));
2311 } else {
2312 mi_store(&b, mi_reg32(GENX(SO_WRITE_OFFSET0_num) + idx * 4),
2313 mi_imm(0));
2314 }
2315 }
2316
2317 cmd_buffer->state.xfb_enabled = true;
2318 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2319 }
2320
2321 void genX(CmdEndTransformFeedbackEXT)(
2322 VkCommandBuffer commandBuffer,
2323 uint32_t firstCounterBuffer,
2324 uint32_t counterBufferCount,
2325 const VkBuffer* pCounterBuffers,
2326 const VkDeviceSize* pCounterBufferOffsets)
2327 {
2328 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2329
2330 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2331 assert(counterBufferCount <= MAX_XFB_BUFFERS);
2332 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2333
2334 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2335 *
2336 * "Software must ensure that no HW stream output operations can be in
2337 * process or otherwise pending at the point that the MI_LOAD/STORE
2338 * commands are processed. This will likely require a pipeline flush."
2339 */
2340 anv_add_pending_pipe_bits(cmd_buffer,
2341 ANV_PIPE_CS_STALL_BIT,
2342 "end transform feedback");
2343 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2344
2345 for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
2346 unsigned idx = firstCounterBuffer + cb_idx;
2347
2348 /* If we have a counter buffer, this is a pause, so we need to store the
2349 * current streamout offset back into the counter buffer so that a later
2350 * resume can reload it.
2351 */
2352 if (pCounterBuffers &&
2353 cb_idx < counterBufferCount &&
2354 pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
2355 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2356 uint64_t offset = pCounterBufferOffsets ?
2357 pCounterBufferOffsets[cb_idx] : 0;
2358
2359 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2360 srm.MemoryAddress = anv_address_add(counter_buffer->address,
2361 offset);
2362 srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2363 }
2364 }
2365 }
2366
2367 trace_intel_end_xfb(&cmd_buffer->trace);
2368
2369 cmd_buffer->state.xfb_enabled = false;
2370 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2371 }
2372
2373 #if GFX_VERx10 >= 125
2374
2375 void
2376 genX(CmdDrawMeshTasksEXT)(
2377 VkCommandBuffer commandBuffer,
2378 uint32_t x,
2379 uint32_t y,
2380 uint32_t z)
2381 {
2382 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2383
2384 if (anv_batch_has_error(&cmd_buffer->batch))
2385 return;
2386
2387 anv_measure_snapshot(cmd_buffer,
2388 INTEL_SNAPSHOT_DRAW,
2389 "draw mesh", x * y * z);
2390
2391 trace_intel_begin_draw_mesh(&cmd_buffer->trace);
2392
2393 /* TODO(mesh): Check whether this emits more packets than we need. */
2394 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2395
2396 if (cmd_buffer->state.conditional_render_enabled)
2397 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2398
2399 anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
2400 m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
2401 m.ThreadGroupCountX = x;
2402 m.ThreadGroupCountY = y;
2403 m.ThreadGroupCountZ = z;
2404 }
2405
2406 trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
2407 }
2408
2409 #define GFX125_3DMESH_TG_COUNT 0x26F0
2410 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
2411
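/* For indirect 3DMESH_3D the thread-group counts are sourced from MMIO, so
 * a VkDrawMeshTasksIndirectCommandEXT record is loaded as
 *
 *    groupCountX -> GFX125_3DMESH_TG_COUNT
 *    groupCountY -> XP1
 *    groupCountZ -> XP2
 *
 * with XP0 optionally carrying the draw index when a task or mesh shader
 * reads gl_DrawID.
 */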
2412 static void
2413 mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
2414 struct mi_builder *b,
2415 struct anv_address addr,
2416 bool emit_xp0,
2417 uint32_t xp0)
2418 {
2419 const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
2420 const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
2421 const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
2422
2423 mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
2424 mi_mem32(anv_address_add(addr, groupCountXOff)));
2425
2426 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
2427 mi_mem32(anv_address_add(addr, groupCountYOff)));
2428
2429 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
2430 mi_mem32(anv_address_add(addr, groupCountZOff)));
2431
2432 if (emit_xp0)
2433 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
2434 }
2435
2436 static void
2437 emit_indirect_3dmesh_3d(struct anv_batch *batch,
2438 bool predicate_enable,
2439 bool uses_drawid)
2440 {
2441 uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
2442 uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
2443 .PredicateEnable = predicate_enable,
2444 .IndirectParameterEnable = true,
2445 .ExtendedParameter0Present = uses_drawid);
2446 if (uses_drawid)
2447 dw[len - 1] = 0;
2448 }
2449
2450 void
2451 genX(CmdDrawMeshTasksIndirectEXT)(
2452 VkCommandBuffer commandBuffer,
2453 VkBuffer _buffer,
2454 VkDeviceSize offset,
2455 uint32_t drawCount,
2456 uint32_t stride)
2457 {
2458 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2459 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2460 struct anv_graphics_pipeline *pipeline =
2461 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2462 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2463 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2464 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
2465
2466 if (anv_batch_has_error(&cmd_buffer->batch))
2467 return;
2468
2469 anv_measure_snapshot(cmd_buffer,
2470 INTEL_SNAPSHOT_DRAW,
2471 "draw mesh indirect", drawCount);
2472
2473 trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
2474
2475 if (execute_indirect_draw_supported(cmd_buffer)) {
2476 genX(cmd_buffer_emit_execute_indirect_draws)(
2477 cmd_buffer,
2478 anv_address_add(buffer->address, offset),
2479 MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2480 ANV_NULL_ADDRESS /* count_addr */,
2481 drawCount,
2482 VK_CMD_DRAW_MESH_TASKS_INDIRECT_EXT);
2483
2484 trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2485 return;
2486 }
2487
2488 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2489
2490 if (cmd_state->conditional_render_enabled)
2491 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2492
2493 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2494 mesh_prog_data->uses_drawid;
2495 struct mi_builder b;
2496 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2497
2498 for (uint32_t i = 0; i < drawCount; i++) {
2499 struct anv_address draw = anv_address_add(buffer->address, offset);
2500
2501 mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2502
2503 emit_indirect_3dmesh_3d(&cmd_buffer->batch,
2504 cmd_state->conditional_render_enabled, uses_drawid);
2505
2506 offset += stride;
2507 }
2508
2509 trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2510 }
2511
2512 void
2513 genX(CmdDrawMeshTasksIndirectCountEXT)(
2514 VkCommandBuffer commandBuffer,
2515 VkBuffer _buffer,
2516 VkDeviceSize offset,
2517 VkBuffer _countBuffer,
2518 VkDeviceSize countBufferOffset,
2519 uint32_t maxDrawCount,
2520 uint32_t stride)
2521 {
2522 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2523 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2524 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2525 struct anv_graphics_pipeline *pipeline =
2526 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2527 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2528 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2529
2530 if (anv_batch_has_error(&cmd_buffer->batch))
2531 return;
2532
2533 anv_measure_snapshot(cmd_buffer,
2534 INTEL_SNAPSHOT_DRAW,
2535 "draw mesh indirect count", 0);
2536
2537 trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
2538
2539 struct anv_address count_addr =
2540 anv_address_add(count_buffer->address, countBufferOffset);
2541
2543 if (execute_indirect_draw_supported(cmd_buffer)) {
2544 genX(cmd_buffer_emit_execute_indirect_draws)(
2545 cmd_buffer,
2546 anv_address_add(buffer->address, offset),
2547 MAX2(stride, sizeof(VkDrawMeshTasksIndirectCommandEXT)),
2548 count_addr /* count_addr */,
2549 maxDrawCount,
2550 VK_CMD_DRAW_MESH_TASKS_INDIRECT_COUNT_EXT);
2551
2552 trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, anv_address_utrace(count_addr));
2553 return;
2554 }
2555
2556 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2557
2558 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2559 mesh_prog_data->uses_drawid;
2560
2561 struct mi_builder b;
2562 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2563 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
2564 mi_builder_set_mocs(&b, mocs);
2565
2566 struct mi_value max =
2567 prepare_for_draw_count_predicate(
2568 cmd_buffer, &b, count_addr);
2569
2570 for (uint32_t i = 0; i < maxDrawCount; i++) {
2571 struct anv_address draw = anv_address_add(buffer->address, offset);
2572
2573 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2574
2575 mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2576
2577 emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
2578
2579 offset += stride;
2580 }
2581
2582 trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace,
2583 anv_address_utrace(count_addr));
2584 }
2585
2586 #endif /* GFX_VERx10 >= 125 */
2587