1 /*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 #include "vk_render_pass.h"
30 #include "vk_util.h"
31
32 #include "common/intel_aux_map.h"
33 #include "genxml/gen_macros.h"
34 #include "genxml/genX_pack.h"
35 #include "genxml/genX_rt_pack.h"
36 #include "common/intel_genX_state_brw.h"
37
38 #include "ds/intel_tracepoints.h"
39
40 /* We reserve :
41 * - GPR 14 for secondary command buffer returns
42 * - GPR 15 for conditional rendering
43 */
44 #define MI_BUILDER_NUM_ALLOC_GPRS 14
45 #define __gen_get_batch_dwords anv_batch_emit_dwords
46 #define __gen_address_offset anv_address_add
47 #define __gen_get_batch_address(b, a) anv_batch_address(b, a)
48 #include "common/mi_builder.h"
49
50 static void
cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer * cmd_buffer)51 cmd_buffer_alloc_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer)
52 {
53 struct anv_graphics_pipeline *pipeline =
54 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
55 VkShaderStageFlags stages = pipeline->base.base.active_stages;
56
57 /* In order to avoid thrash, we assume that vertex and fragment stages
58 * always exist. In the rare case where one is missing *and* the other
59 * uses push concstants, this may be suboptimal. However, avoiding stalls
60 * seems more important.
61 */
62 stages |= VK_SHADER_STAGE_FRAGMENT_BIT;
63 if (anv_pipeline_is_primitive(pipeline))
64 stages |= VK_SHADER_STAGE_VERTEX_BIT;
65
66 if (stages == cmd_buffer->state.gfx.push_constant_stages)
67 return;
68
69 unsigned push_constant_kb;
70
71 const struct intel_device_info *devinfo = cmd_buffer->device->info;
72 if (anv_pipeline_is_mesh(pipeline))
73 push_constant_kb = devinfo->mesh_max_constant_urb_size_kb;
74 else
75 push_constant_kb = devinfo->max_constant_urb_size_kb;
76
77 const unsigned num_stages =
78 util_bitcount(stages & VK_SHADER_STAGE_ALL_GRAPHICS);
79 unsigned size_per_stage = push_constant_kb / num_stages;
80
81 /* Broadwell+ and Haswell gt3 require that the push constant sizes be in
82 * units of 2KB. Incidentally, these are the same platforms that have
83 * 32KB worth of push constant space.
84 */
85 if (push_constant_kb == 32)
86 size_per_stage &= ~1u;
87
88 uint32_t kb_used = 0;
89 for (int i = MESA_SHADER_VERTEX; i < MESA_SHADER_FRAGMENT; i++) {
90 const unsigned push_size = (stages & (1 << i)) ? size_per_stage : 0;
91 anv_batch_emit(&cmd_buffer->batch,
92 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
93 alloc._3DCommandSubOpcode = 18 + i;
94 alloc.ConstantBufferOffset = (push_size > 0) ? kb_used : 0;
95 alloc.ConstantBufferSize = push_size;
96 }
97 kb_used += push_size;
98 }
99
100 anv_batch_emit(&cmd_buffer->batch,
101 GENX(3DSTATE_PUSH_CONSTANT_ALLOC_PS), alloc) {
102 alloc.ConstantBufferOffset = kb_used;
103 alloc.ConstantBufferSize = push_constant_kb - kb_used;
104 }
105
106 #if GFX_VERx10 == 125
107 /* DG2: Wa_22011440098
108 * MTL: Wa_18022330953
109 *
110 * In 3D mode, after programming push constant alloc command immediately
111 * program push constant command(ZERO length) without any commit between
112 * them.
113 */
114 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
115 /* Update empty push constants for all stages (bitmask = 11111b) */
116 c.ShaderUpdateEnable = 0x1f;
117 c.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
118 }
119 #endif
120
121 cmd_buffer->state.gfx.push_constant_stages = stages;
122
123 /* From the BDW PRM for 3DSTATE_PUSH_CONSTANT_ALLOC_VS:
124 *
125 * "The 3DSTATE_CONSTANT_VS must be reprogrammed prior to
126 * the next 3DPRIMITIVE command after programming the
127 * 3DSTATE_PUSH_CONSTANT_ALLOC_VS"
128 *
129 * Since 3DSTATE_PUSH_CONSTANT_ALLOC_VS is programmed as part of
130 * pipeline setup, we need to dirty push constants.
131 */
132 cmd_buffer->state.push_constants_dirty |= stages;
133 }
134
135 static void
cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer * cmd_buffer,uint32_t stages)136 cmd_buffer_emit_descriptor_pointers(struct anv_cmd_buffer *cmd_buffer,
137 uint32_t stages)
138 {
139 static const uint32_t sampler_state_opcodes[] = {
140 [MESA_SHADER_VERTEX] = 43,
141 [MESA_SHADER_TESS_CTRL] = 44, /* HS */
142 [MESA_SHADER_TESS_EVAL] = 45, /* DS */
143 [MESA_SHADER_GEOMETRY] = 46,
144 [MESA_SHADER_FRAGMENT] = 47,
145 };
146
147 static const uint32_t binding_table_opcodes[] = {
148 [MESA_SHADER_VERTEX] = 38,
149 [MESA_SHADER_TESS_CTRL] = 39,
150 [MESA_SHADER_TESS_EVAL] = 40,
151 [MESA_SHADER_GEOMETRY] = 41,
152 [MESA_SHADER_FRAGMENT] = 42,
153 };
154
155 anv_foreach_stage(s, stages) {
156 assert(s < ARRAY_SIZE(binding_table_opcodes));
157
158 if (cmd_buffer->state.samplers[s].alloc_size > 0) {
159 anv_batch_emit(&cmd_buffer->batch,
160 GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ssp) {
161 ssp._3DCommandSubOpcode = sampler_state_opcodes[s];
162 ssp.PointertoVSSamplerState = cmd_buffer->state.samplers[s].offset;
163 }
164 }
165
166 /* Always emit binding table pointers if we're asked to, since on SKL
167 * this is what flushes push constants. */
168 anv_batch_emit(&cmd_buffer->batch,
169 GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), btp) {
170 btp._3DCommandSubOpcode = binding_table_opcodes[s];
171 btp.PointertoVSBindingTable = cmd_buffer->state.binding_tables[s].offset;
172 }
173 }
174 }
175
176 static struct anv_address
get_push_range_address(struct anv_cmd_buffer * cmd_buffer,const struct anv_shader_bin * shader,const struct anv_push_range * range)177 get_push_range_address(struct anv_cmd_buffer *cmd_buffer,
178 const struct anv_shader_bin *shader,
179 const struct anv_push_range *range)
180 {
181 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
182 switch (range->set) {
183 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
184 /* This is a descriptor set buffer so the set index is
185 * actually given by binding->binding. (Yes, that's
186 * confusing.)
187 */
188 struct anv_descriptor_set *set =
189 gfx_state->base.descriptors[range->index];
190 return anv_descriptor_set_address(set);
191 }
192
193 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS: {
194 if (gfx_state->base.push_constants_state.alloc_size == 0) {
195 gfx_state->base.push_constants_state =
196 anv_cmd_buffer_gfx_push_constants(cmd_buffer);
197 }
198 return anv_state_pool_state_address(
199 &cmd_buffer->device->dynamic_state_pool,
200 gfx_state->base.push_constants_state);
201 }
202
203 default: {
204 assert(range->set < MAX_SETS);
205 struct anv_descriptor_set *set =
206 gfx_state->base.descriptors[range->set];
207 const struct anv_descriptor *desc =
208 &set->descriptors[range->index];
209
210 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
211 if (desc->buffer) {
212 return anv_address_add(desc->buffer->address,
213 desc->offset);
214 }
215 } else {
216 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
217 if (desc->buffer) {
218 const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
219 uint32_t dynamic_offset =
220 pipe_state->dynamic_offsets[
221 range->set].offsets[range->dynamic_offset_index];
222 return anv_address_add(desc->buffer->address,
223 desc->offset + dynamic_offset);
224 }
225 }
226
227 /* For NULL UBOs, we just return an address in the workaround BO. We do
228 * writes to it for workarounds but always at the bottom. The higher
229 * bytes should be all zeros.
230 */
231 assert(range->length * 32 <= 2048);
232 return (struct anv_address) {
233 .bo = cmd_buffer->device->workaround_bo,
234 .offset = 1024,
235 };
236 }
237 }
238 }
239
240
241 /** Returns the size in bytes of the bound buffer
242 *
243 * The range is relative to the start of the buffer, not the start of the
244 * range. The returned range may be smaller than
245 *
246 * (range->start + range->length) * 32;
247 */
248 static uint32_t
get_push_range_bound_size(struct anv_cmd_buffer * cmd_buffer,const struct anv_shader_bin * shader,const struct anv_push_range * range)249 get_push_range_bound_size(struct anv_cmd_buffer *cmd_buffer,
250 const struct anv_shader_bin *shader,
251 const struct anv_push_range *range)
252 {
253 assert(shader->stage != MESA_SHADER_COMPUTE);
254 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
255 switch (range->set) {
256 case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
257 struct anv_descriptor_set *set =
258 gfx_state->base.descriptors[range->index];
259 struct anv_state state = set->desc_surface_mem;
260 assert(range->start * 32 < state.alloc_size);
261 assert((range->start + range->length) * 32 <= state.alloc_size);
262 return state.alloc_size;
263 }
264
265 case ANV_DESCRIPTOR_SET_PUSH_CONSTANTS:
266 return (range->start + range->length) * 32;
267
268 default: {
269 assert(range->set < MAX_SETS);
270 struct anv_descriptor_set *set =
271 gfx_state->base.descriptors[range->set];
272 const struct anv_descriptor *desc =
273 &set->descriptors[range->index];
274
275 if (desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER) {
276 /* Here we promote a UBO to a binding table entry so that we can avoid a layer of indirection.
277 * We use the descriptor set's internally allocated surface state to fill the binding table entry.
278 */
279 if (!desc->buffer)
280 return 0;
281
282 if (range->start * 32 > desc->bind_range)
283 return 0;
284
285 return desc->bind_range;
286 } else {
287 if (!desc->buffer)
288 return 0;
289
290 assert(desc->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC);
291 /* Compute the offset within the buffer */
292 const struct anv_cmd_pipeline_state *pipe_state = &gfx_state->base;
293 uint32_t dynamic_offset =
294 pipe_state->dynamic_offsets[
295 range->set].offsets[range->dynamic_offset_index];
296 uint64_t offset = desc->offset + dynamic_offset;
297 /* Clamp to the buffer size */
298 offset = MIN2(offset, desc->buffer->vk.size);
299 /* Clamp the range to the buffer size */
300 uint32_t bound_range = MIN2(desc->range, desc->buffer->vk.size - offset);
301
302 /* Align the range for consistency */
303 bound_range = align(bound_range, ANV_UBO_ALIGNMENT);
304
305 return bound_range;
306 }
307 }
308 }
309 }
310
311 static void
cmd_buffer_emit_push_constant(struct anv_cmd_buffer * cmd_buffer,gl_shader_stage stage,struct anv_address * buffers,unsigned buffer_count)312 cmd_buffer_emit_push_constant(struct anv_cmd_buffer *cmd_buffer,
313 gl_shader_stage stage,
314 struct anv_address *buffers,
315 unsigned buffer_count)
316 {
317 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
318 const struct anv_graphics_pipeline *pipeline =
319 anv_pipeline_to_graphics(gfx_state->base.pipeline);
320
321 static const uint32_t push_constant_opcodes[] = {
322 [MESA_SHADER_VERTEX] = 21,
323 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
324 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
325 [MESA_SHADER_GEOMETRY] = 22,
326 [MESA_SHADER_FRAGMENT] = 23,
327 };
328
329 assert(stage < ARRAY_SIZE(push_constant_opcodes));
330
331 UNUSED uint32_t mocs = anv_mocs(cmd_buffer->device, NULL, 0);
332
333 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_VS), c) {
334 c._3DCommandSubOpcode = push_constant_opcodes[stage];
335
336 /* Set MOCS.
337 *
338 * We only have one MOCS field for the whole packet, not one per
339 * buffer. We could go out of our way here to walk over all of
340 * the buffers and see if any of them are used externally and use
341 * the external MOCS. However, the notion that someone would use
342 * the same bit of memory for both scanout and a UBO is nuts.
343 *
344 * Let's not bother and assume it's all internal.
345 */
346 c.MOCS = mocs;
347
348 if (anv_pipeline_has_stage(pipeline, stage)) {
349 const struct anv_pipeline_bind_map *bind_map =
350 &pipeline->base.shaders[stage]->bind_map;
351
352 /* The Skylake PRM contains the following restriction:
353 *
354 * "The driver must ensure The following case does not occur
355 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
356 * buffer 3 read length equal to zero committed followed by a
357 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to
358 * zero committed."
359 *
360 * To avoid this, we program the buffers in the highest slots.
361 * This way, slot 0 is only used if slot 3 is also used.
362 */
363 assert(buffer_count <= 4);
364 const unsigned shift = 4 - buffer_count;
365 for (unsigned i = 0; i < buffer_count; i++) {
366 const struct anv_push_range *range = &bind_map->push_ranges[i];
367
368 /* At this point we only have non-empty ranges */
369 assert(range->length > 0);
370
371 c.ConstantBody.ReadLength[i + shift] = range->length;
372 c.ConstantBody.Buffer[i + shift] =
373 anv_address_add(buffers[i], range->start * 32);
374 }
375 }
376 }
377 }
378
379 #if GFX_VER >= 12
380 static void
cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer * cmd_buffer,uint32_t shader_mask,struct anv_address * buffers,uint32_t buffer_count)381 cmd_buffer_emit_push_constant_all(struct anv_cmd_buffer *cmd_buffer,
382 uint32_t shader_mask,
383 struct anv_address *buffers,
384 uint32_t buffer_count)
385 {
386 if (buffer_count == 0) {
387 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_CONSTANT_ALL), c) {
388 c.ShaderUpdateEnable = shader_mask;
389 c.MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
390 }
391 return;
392 }
393
394 const struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
395 const struct anv_graphics_pipeline *pipeline =
396 anv_pipeline_to_graphics(gfx_state->base.pipeline);
397
398 gl_shader_stage stage = vk_to_mesa_shader_stage(shader_mask);
399
400 const struct anv_pipeline_bind_map *bind_map =
401 &pipeline->base.shaders[stage]->bind_map;
402
403 uint32_t *dw;
404 const uint32_t buffer_mask = (1 << buffer_count) - 1;
405 const uint32_t num_dwords = 2 + 2 * buffer_count;
406
407 dw = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
408 GENX(3DSTATE_CONSTANT_ALL),
409 .ShaderUpdateEnable = shader_mask,
410 .PointerBufferMask = buffer_mask,
411 .MOCS = isl_mocs(&cmd_buffer->device->isl_dev, 0, false));
412
413 for (int i = 0; i < buffer_count; i++) {
414 const struct anv_push_range *range = &bind_map->push_ranges[i];
415 GENX(3DSTATE_CONSTANT_ALL_DATA_pack)(
416 &cmd_buffer->batch, dw + 2 + i * 2,
417 &(struct GENX(3DSTATE_CONSTANT_ALL_DATA)) {
418 .PointerToConstantBuffer =
419 anv_address_add(buffers[i], range->start * 32),
420 .ConstantBufferReadLength = range->length,
421 });
422 }
423 }
424 #endif
425
426 static void
cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer * cmd_buffer,VkShaderStageFlags dirty_stages)427 cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
428 VkShaderStageFlags dirty_stages)
429 {
430 VkShaderStageFlags flushed = 0;
431 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
432 const struct anv_graphics_pipeline *pipeline =
433 anv_pipeline_to_graphics(gfx_state->base.pipeline);
434
435 #if GFX_VER >= 12
436 uint32_t nobuffer_stages = 0;
437 #endif
438
439 /* Compute robust pushed register access mask for each stage. */
440 anv_foreach_stage(stage, dirty_stages) {
441 if (!anv_pipeline_has_stage(pipeline, stage))
442 continue;
443
444 const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
445 if (shader->prog_data->zero_push_reg) {
446 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
447 struct anv_push_constants *push = &gfx_state->base.push_constants;
448
449 push->push_reg_mask[stage] = 0;
450 /* Start of the current range in the shader, relative to the start of
451 * push constants in the shader.
452 */
453 unsigned range_start_reg = 0;
454 for (unsigned i = 0; i < 4; i++) {
455 const struct anv_push_range *range = &bind_map->push_ranges[i];
456 if (range->length == 0)
457 continue;
458
459 unsigned bound_size =
460 get_push_range_bound_size(cmd_buffer, shader, range);
461 if (bound_size >= range->start * 32) {
462 unsigned bound_regs =
463 MIN2(DIV_ROUND_UP(bound_size, 32) - range->start,
464 range->length);
465 assert(range_start_reg + bound_regs <= 64);
466 push->push_reg_mask[stage] |= BITFIELD64_RANGE(range_start_reg,
467 bound_regs);
468 }
469
470 cmd_buffer->state.push_constants_dirty |=
471 mesa_to_vk_shader_stage(stage);
472
473 range_start_reg += range->length;
474 }
475 }
476 }
477
478 /* Resets the push constant state so that we allocate a new one if
479 * needed.
480 */
481 gfx_state->base.push_constants_state = ANV_STATE_NULL;
482
483 anv_foreach_stage(stage, dirty_stages) {
484 unsigned buffer_count = 0;
485 flushed |= mesa_to_vk_shader_stage(stage);
486 UNUSED uint32_t max_push_range = 0;
487
488 struct anv_address buffers[4] = {};
489 if (anv_pipeline_has_stage(pipeline, stage)) {
490 const struct anv_shader_bin *shader = pipeline->base.shaders[stage];
491 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
492
493 /* We have to gather buffer addresses as a second step because the
494 * loop above puts data into the push constant area and the call to
495 * get_push_range_address is what locks our push constants and copies
496 * them into the actual GPU buffer. If we did the two loops at the
497 * same time, we'd risk only having some of the sizes in the push
498 * constant buffer when we did the copy.
499 */
500 for (unsigned i = 0; i < 4; i++) {
501 const struct anv_push_range *range = &bind_map->push_ranges[i];
502 if (range->length == 0)
503 break;
504
505 buffers[i] = get_push_range_address(cmd_buffer, shader, range);
506 max_push_range = MAX2(max_push_range, range->length);
507 buffer_count++;
508 }
509
510 /* We have at most 4 buffers but they should be tightly packed */
511 for (unsigned i = buffer_count; i < 4; i++)
512 assert(bind_map->push_ranges[i].length == 0);
513 }
514
515 #if GFX_VER >= 12
516 /* If this stage doesn't have any push constants, emit it later in a
517 * single CONSTANT_ALL packet.
518 */
519 if (buffer_count == 0) {
520 nobuffer_stages |= 1 << stage;
521 continue;
522 }
523
524 /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
525 * contains only 5 bits, so we can only use it for buffers smaller than
526 * 32.
527 *
528 * According to Wa_16011448509, Gfx12.0 misinterprets some address bits
529 * in 3DSTATE_CONSTANT_ALL. It should still be safe to use the command
530 * for disabling stages, where all address bits are zero. However, we
531 * can't safely use it for general buffers with arbitrary addresses.
532 * Just fall back to the individual 3DSTATE_CONSTANT_XS commands in that
533 * case.
534 */
535 if (max_push_range < 32 && GFX_VERx10 > 120) {
536 cmd_buffer_emit_push_constant_all(cmd_buffer, 1 << stage,
537 buffers, buffer_count);
538 continue;
539 }
540 #endif
541
542 cmd_buffer_emit_push_constant(cmd_buffer, stage, buffers, buffer_count);
543 }
544
545 #if GFX_VER >= 12
546 if (nobuffer_stages)
547 /* Wa_16011448509: all address bits are zero */
548 cmd_buffer_emit_push_constant_all(cmd_buffer, nobuffer_stages, NULL, 0);
549 #endif
550
551 cmd_buffer->state.push_constants_dirty &= ~flushed;
552 }
553
554 #if GFX_VERx10 >= 125
555 static void
cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer * cmd_buffer,VkShaderStageFlags dirty_stages)556 cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer,
557 VkShaderStageFlags dirty_stages)
558 {
559 struct anv_cmd_graphics_state *gfx_state = &cmd_buffer->state.gfx;
560 const struct anv_graphics_pipeline *pipeline =
561 anv_pipeline_to_graphics(gfx_state->base.pipeline);
562
563 if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT &&
564 anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
565
566 const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_TASK];
567 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
568
569 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) {
570 const struct anv_push_range *range = &bind_map->push_ranges[0];
571 if (range->length > 0) {
572 struct anv_address buffer =
573 get_push_range_address(cmd_buffer, shader, range);
574
575 uint64_t addr = anv_address_physical(buffer);
576 data.InlineData[0] = addr & 0xffffffff;
577 data.InlineData[1] = addr >> 32;
578
579 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
580 cmd_buffer->state.gfx.base.push_constants.client_data,
581 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
582 }
583 }
584 }
585
586 if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT &&
587 anv_pipeline_has_stage(pipeline, MESA_SHADER_MESH)) {
588
589 const struct anv_shader_bin *shader = pipeline->base.shaders[MESA_SHADER_MESH];
590 const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
591
592 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) {
593 const struct anv_push_range *range = &bind_map->push_ranges[0];
594 if (range->length > 0) {
595 struct anv_address buffer =
596 get_push_range_address(cmd_buffer, shader, range);
597
598 uint64_t addr = anv_address_physical(buffer);
599 data.InlineData[0] = addr & 0xffffffff;
600 data.InlineData[1] = addr >> 32;
601
602 memcpy(&data.InlineData[BRW_TASK_MESH_PUSH_CONSTANTS_START_DW],
603 cmd_buffer->state.gfx.base.push_constants.client_data,
604 BRW_TASK_MESH_PUSH_CONSTANTS_SIZE_DW * 4);
605 }
606 }
607 }
608
609 cmd_buffer->state.push_constants_dirty &= ~dirty_stages;
610 }
611 #endif
612
613 ALWAYS_INLINE static void
genX(emit_hs)614 genX(emit_hs)(struct anv_cmd_buffer *cmd_buffer)
615 {
616 struct anv_graphics_pipeline *pipeline =
617 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
618 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
619 return;
620
621 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.hs);
622 }
623
624 ALWAYS_INLINE static void
genX(emit_ds)625 genX(emit_ds)(struct anv_cmd_buffer *cmd_buffer)
626 {
627 #if INTEL_NEEDS_WA_22018402687
628 /* Wa_22018402687:
629 * In any 3D enabled context, just before any Tessellation enabled draw
630 * call (3D Primitive), re-send the last programmed 3DSTATE_DS again.
631 * This will make sure that the 3DSTATE_INT generated just before the
632 * draw call will have TDS dirty which will make sure TDS will launch the
633 * state thread before the draw call.
634 *
635 * This fixes a hang resulting from running anything using tessellation
636 * after a switch away from the mesh pipeline.
637 * We don't need to track said switch, as it matters at the HW level, and
638 * can be triggered even across processes, so we apply the Wa at all times.
639 */
640 struct anv_graphics_pipeline *pipeline =
641 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
642 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
643 return;
644
645 anv_batch_emit_pipeline_state(&cmd_buffer->batch, pipeline, final.ds);
646 #endif
647 }
648
649 ALWAYS_INLINE static void
genX(cmd_buffer_flush_gfx_state)650 genX(cmd_buffer_flush_gfx_state)(struct anv_cmd_buffer *cmd_buffer)
651 {
652 struct anv_graphics_pipeline *pipeline =
653 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
654 const struct vk_dynamic_graphics_state *dyn =
655 &cmd_buffer->vk.dynamic_graphics_state;
656 uint32_t *p;
657
658 assert((pipeline->base.base.active_stages & VK_SHADER_STAGE_COMPUTE_BIT) == 0);
659
660 genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.base.l3_config);
661
662 genX(cmd_buffer_emit_hashing_mode)(cmd_buffer, UINT_MAX, UINT_MAX, 1);
663
664 genX(flush_pipeline_select_3d)(cmd_buffer);
665
666 /* Wa_14015814527
667 *
668 * Apply task URB workaround when switching from task to primitive.
669 */
670 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
671 if (anv_pipeline_is_primitive(pipeline)) {
672 genX(apply_task_urb_workaround)(cmd_buffer);
673 } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TASK)) {
674 cmd_buffer->state.gfx.used_task_shader = true;
675 }
676 }
677
678 /* Apply any pending pipeline flushes we may have. We want to apply them
679 * now because, if any of those flushes are for things like push constants,
680 * the GPU will read the state at weird times.
681 */
682 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
683
684 /* Check what vertex buffers have been rebound against the set of bindings
685 * being used by the current set of vertex attributes.
686 */
687 uint32_t vb_emit = cmd_buffer->state.gfx.vb_dirty & dyn->vi->bindings_valid;
688 /* If the pipeline changed, the we have to consider all the valid bindings. */
689 if ((cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) ||
690 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
691 vb_emit |= dyn->vi->bindings_valid;
692
693 if (vb_emit) {
694 const uint32_t num_buffers = __builtin_popcount(vb_emit);
695 const uint32_t num_dwords = 1 + num_buffers * 4;
696
697 p = anv_batch_emitn(&cmd_buffer->batch, num_dwords,
698 GENX(3DSTATE_VERTEX_BUFFERS));
699 uint32_t i = 0;
700 u_foreach_bit(vb, vb_emit) {
701 struct anv_buffer *buffer = cmd_buffer->state.vertex_bindings[vb].buffer;
702 uint32_t offset = cmd_buffer->state.vertex_bindings[vb].offset;
703
704 struct GENX(VERTEX_BUFFER_STATE) state;
705 if (buffer) {
706 uint32_t stride = dyn->vi_binding_strides[vb];
707 UNUSED uint32_t size = cmd_buffer->state.vertex_bindings[vb].size;
708
709 state = (struct GENX(VERTEX_BUFFER_STATE)) {
710 .VertexBufferIndex = vb,
711
712 .MOCS = anv_mocs(cmd_buffer->device, buffer->address.bo,
713 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
714 .AddressModifyEnable = true,
715 .BufferPitch = stride,
716 .BufferStartingAddress = anv_address_add(buffer->address, offset),
717 .NullVertexBuffer = offset >= buffer->vk.size,
718 #if GFX_VER >= 12
719 .L3BypassDisable = true,
720 #endif
721
722 .BufferSize = size,
723 };
724 } else {
725 state = (struct GENX(VERTEX_BUFFER_STATE)) {
726 .VertexBufferIndex = vb,
727 .NullVertexBuffer = true,
728 .MOCS = anv_mocs(cmd_buffer->device, NULL,
729 ISL_SURF_USAGE_VERTEX_BUFFER_BIT),
730 };
731 }
732
733 #if GFX_VER == 9
734 genX(cmd_buffer_set_binding_for_gfx8_vb_flush)(cmd_buffer, vb,
735 state.BufferStartingAddress,
736 state.BufferSize);
737 #endif
738
739 GENX(VERTEX_BUFFER_STATE_pack)(&cmd_buffer->batch, &p[1 + i * 4], &state);
740 i++;
741 }
742 }
743
744 cmd_buffer->state.gfx.vb_dirty &= ~vb_emit;
745
746 /* If patch control points value is changed, let's just update the push
747 * constant data. If the current pipeline also use this, we need to reemit
748 * the 3DSTATE_CONSTANT packet.
749 */
750 struct anv_push_constants *push = &cmd_buffer->state.gfx.base.push_constants;
751 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS) &&
752 push->gfx.tcs_input_vertices != dyn->ts.patch_control_points) {
753 push->gfx.tcs_input_vertices = dyn->ts.patch_control_points;
754 if (pipeline->dynamic_patch_control_points)
755 cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT;
756 }
757
758 const bool any_dynamic_state_dirty =
759 vk_dynamic_graphics_state_any_dirty(dyn);
760 uint32_t descriptors_dirty = cmd_buffer->state.descriptors_dirty &
761 pipeline->base.base.active_stages;
762
763 descriptors_dirty |=
764 genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
765 &cmd_buffer->state.gfx.base,
766 &pipeline->base.base);
767
768 /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive. */
769 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE ||
770 (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343)) {
771 genX(emit_hs)(cmd_buffer);
772 }
773
774 if (!cmd_buffer->state.gfx.dirty && !descriptors_dirty &&
775 !any_dynamic_state_dirty &&
776 ((cmd_buffer->state.push_constants_dirty &
777 (VK_SHADER_STAGE_ALL_GRAPHICS |
778 VK_SHADER_STAGE_TASK_BIT_EXT |
779 VK_SHADER_STAGE_MESH_BIT_EXT)) == 0))
780 return;
781
782 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_XFB_ENABLE) {
783 /* Wa_16011411144:
784 *
785 * SW must insert a PIPE_CONTROL cmd before and after the
786 * 3dstate_so_buffer_index_0/1/2/3 states to ensure so_buffer_index_*
787 * state is not combined with other state changes.
788 */
789 if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
790 anv_add_pending_pipe_bits(cmd_buffer,
791 ANV_PIPE_CS_STALL_BIT,
792 "before SO_BUFFER change WA");
793 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
794 }
795
796 /* We don't need any per-buffer dirty tracking because you're not
797 * allowed to bind different XFB buffers while XFB is enabled.
798 */
799 for (unsigned idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
800 struct anv_xfb_binding *xfb = &cmd_buffer->state.xfb_bindings[idx];
801 anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_SO_BUFFER), sob) {
802 #if GFX_VER < 12
803 sob.SOBufferIndex = idx;
804 #else
805 sob._3DCommandOpcode = 0;
806 sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + idx;
807 #endif
808
809 if (cmd_buffer->state.xfb_enabled && xfb->buffer && xfb->size != 0) {
810 sob.MOCS = anv_mocs(cmd_buffer->device, xfb->buffer->address.bo,
811 ISL_SURF_USAGE_STREAM_OUT_BIT);
812 sob.SurfaceBaseAddress = anv_address_add(xfb->buffer->address,
813 xfb->offset);
814 sob.SOBufferEnable = true;
815 sob.StreamOffsetWriteEnable = false;
816 /* Size is in DWords - 1 */
817 sob.SurfaceSize = DIV_ROUND_UP(xfb->size, 4) - 1;
818 } else {
819 sob.MOCS = anv_mocs(cmd_buffer->device, NULL, 0);
820 }
821 }
822 }
823
824 if (intel_needs_workaround(cmd_buffer->device->info, 16011411144)) {
825 /* Wa_16011411144: also CS_STALL after touching SO_BUFFER change */
826 anv_add_pending_pipe_bits(cmd_buffer,
827 ANV_PIPE_CS_STALL_BIT,
828 "after SO_BUFFER change WA");
829 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
830 } else if (GFX_VER >= 10) {
831 /* CNL and later require a CS stall after 3DSTATE_SO_BUFFER */
832 anv_add_pending_pipe_bits(cmd_buffer,
833 ANV_PIPE_CS_STALL_BIT,
834 "after 3DSTATE_SO_BUFFER call");
835 }
836 }
837
838 /* Flush the runtime state into the HW state tracking */
839 if (cmd_buffer->state.gfx.dirty || any_dynamic_state_dirty)
840 genX(cmd_buffer_flush_gfx_runtime_state)(cmd_buffer);
841
842 /* Flush the HW state into the commmand buffer */
843 if (!BITSET_IS_EMPTY(cmd_buffer->state.gfx.dyn_state.dirty))
844 genX(cmd_buffer_flush_gfx_hw_state)(cmd_buffer);
845
846 /* If the pipeline changed, we may need to re-allocate push constant space
847 * in the URB.
848 */
849 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_PIPELINE) {
850 cmd_buffer_alloc_gfx_push_constants(cmd_buffer);
851
852 /* Also add the relocations (scratch buffers) */
853 VkResult result = anv_reloc_list_append(cmd_buffer->batch.relocs,
854 pipeline->base.base.batch.relocs);
855 if (result != VK_SUCCESS) {
856 anv_batch_set_error(&cmd_buffer->batch, result);
857 return;
858 }
859 }
860
861 /* Render targets live in the same binding table as fragment descriptors */
862 if (cmd_buffer->state.gfx.dirty & ANV_CMD_DIRTY_RENDER_TARGETS)
863 descriptors_dirty |= VK_SHADER_STAGE_FRAGMENT_BIT;
864
865 /* We emit the binding tables and sampler tables first, then emit push
866 * constants and then finally emit binding table and sampler table
867 * pointers. It has to happen in this order, since emitting the binding
868 * tables may change the push constants (in case of storage images). After
869 * emitting push constants, on SKL+ we have to emit the corresponding
870 * 3DSTATE_BINDING_TABLE_POINTER_* for the push constants to take effect.
871 */
872 uint32_t dirty = 0;
873 if (descriptors_dirty) {
874 dirty = genX(cmd_buffer_flush_descriptor_sets)(
875 cmd_buffer,
876 &cmd_buffer->state.gfx.base,
877 descriptors_dirty,
878 pipeline->base.shaders,
879 ARRAY_SIZE(pipeline->base.shaders));
880 cmd_buffer->state.descriptors_dirty &= ~dirty;
881 }
882
883 if (dirty || cmd_buffer->state.push_constants_dirty) {
884 /* Because we're pushing UBOs, we have to push whenever either
885 * descriptors or push constants is dirty.
886 */
887 dirty |= cmd_buffer->state.push_constants_dirty &
888 pipeline->base.base.active_stages;
889 cmd_buffer_flush_gfx_push_constants(cmd_buffer,
890 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
891 #if GFX_VERx10 >= 125
892 cmd_buffer_flush_mesh_inline_data(
893 cmd_buffer, dirty & (VK_SHADER_STAGE_TASK_BIT_EXT |
894 VK_SHADER_STAGE_MESH_BIT_EXT));
895 #endif
896 }
897
898 if (dirty & VK_SHADER_STAGE_ALL_GRAPHICS) {
899 cmd_buffer_emit_descriptor_pointers(cmd_buffer,
900 dirty & VK_SHADER_STAGE_ALL_GRAPHICS);
901 }
902
903 /* When we're done, there is no more dirty gfx state. */
904 cmd_buffer->state.gfx.dirty = 0;
905 }
906
907 ALWAYS_INLINE static bool
anv_use_generated_draws(const struct anv_cmd_buffer * cmd_buffer,uint32_t count)908 anv_use_generated_draws(const struct anv_cmd_buffer *cmd_buffer, uint32_t count)
909 {
910 const struct anv_device *device = cmd_buffer->device;
911 const struct anv_graphics_pipeline *pipeline =
912 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
913
914 /* Limit generated draws to pipelines without HS stage. This makes things
915 * simpler for implementing Wa_1306463417, Wa_16011107343.
916 */
917 if ((INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343) &&
918 anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_CTRL))
919 return false;
920
921 return count >= device->physical->instance->generated_indirect_threshold;
922 }
923
924 #include "genX_cmd_draw_helpers.h"
925 #include "genX_cmd_draw_generated_indirect.h"
926
927 #if GFX_VER >= 11
928 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE_EXTENDED)
929 #else
930 #define _3DPRIMITIVE_DIRECT GENX(3DPRIMITIVE)
931 #endif
932
genX(CmdDraw)933 void genX(CmdDraw)(
934 VkCommandBuffer commandBuffer,
935 uint32_t vertexCount,
936 uint32_t instanceCount,
937 uint32_t firstVertex,
938 uint32_t firstInstance)
939 {
940 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
941 struct anv_graphics_pipeline *pipeline =
942 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
943
944 if (anv_batch_has_error(&cmd_buffer->batch))
945 return;
946
947 const uint32_t count =
948 vertexCount * instanceCount * pipeline->instance_multiplier;
949 anv_measure_snapshot(cmd_buffer,
950 INTEL_SNAPSHOT_DRAW,
951 "draw", count);
952 trace_intel_begin_draw(&cmd_buffer->trace);
953
954 /* Select pipeline here to allow
955 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
956 * cmd_buffer_flush_gfx_state().
957 */
958 genX(flush_pipeline_select_3d)(cmd_buffer);
959
960 if (cmd_buffer->state.conditional_render_enabled)
961 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
962
963 #if GFX_VER < 11
964 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
965 get_vs_prog_data(pipeline),
966 firstVertex, firstInstance, 0,
967 false /* force_flush */);
968 #endif
969
970 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
971 genX(emit_ds)(cmd_buffer);
972
973 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
974
975 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
976 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
977 #if GFX_VERx10 >= 125
978 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
979 #endif
980 prim.VertexAccessType = SEQUENTIAL;
981 prim.VertexCountPerInstance = vertexCount;
982 prim.StartVertexLocation = firstVertex;
983 prim.InstanceCount = instanceCount *
984 pipeline->instance_multiplier;
985 prim.StartInstanceLocation = firstInstance;
986 prim.BaseVertexLocation = 0;
987 #if GFX_VER >= 11
988 prim.ExtendedParametersPresent = true;
989 prim.ExtendedParameter0 = firstVertex;
990 prim.ExtendedParameter1 = firstInstance;
991 prim.ExtendedParameter2 = 0;
992 #endif
993 }
994
995 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
996 cmd_buffer->device,
997 cmd_buffer->state.gfx.primitive_topology,
998 vertexCount);
999 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1000
1001 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1002
1003 trace_intel_end_draw(&cmd_buffer->trace, count);
1004 }
1005
genX(CmdDrawMultiEXT)1006 void genX(CmdDrawMultiEXT)(
1007 VkCommandBuffer commandBuffer,
1008 uint32_t drawCount,
1009 const VkMultiDrawInfoEXT *pVertexInfo,
1010 uint32_t instanceCount,
1011 uint32_t firstInstance,
1012 uint32_t stride)
1013 {
1014 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1015 UNUSED struct anv_graphics_pipeline *pipeline =
1016 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1017
1018 if (anv_batch_has_error(&cmd_buffer->batch))
1019 return;
1020
1021 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1022
1023 if (cmd_buffer->state.conditional_render_enabled)
1024 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1025
1026 uint32_t i = 0;
1027 #if GFX_VER < 11
1028 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1029 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer,
1030 get_vs_prog_data(pipeline),
1031 draw->firstVertex,
1032 firstInstance, i, !i);
1033
1034 const uint32_t count =
1035 draw->vertexCount * instanceCount * pipeline->instance_multiplier;
1036 anv_measure_snapshot(cmd_buffer,
1037 INTEL_SNAPSHOT_DRAW,
1038 "draw multi", count);
1039 trace_intel_begin_draw_multi(&cmd_buffer->trace);
1040
1041 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1042
1043 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1044 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1045 prim.VertexAccessType = SEQUENTIAL;
1046 prim.VertexCountPerInstance = draw->vertexCount;
1047 prim.StartVertexLocation = draw->firstVertex;
1048 prim.InstanceCount = instanceCount *
1049 pipeline->instance_multiplier;
1050 prim.StartInstanceLocation = firstInstance;
1051 prim.BaseVertexLocation = 0;
1052 }
1053
1054 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1055 cmd_buffer->device,
1056 cmd_buffer->state.gfx.primitive_topology,
1057 drawCount == 0 ? 0 :
1058 pVertexInfo[drawCount - 1].vertexCount);
1059
1060 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1061 trace_intel_end_draw_multi(&cmd_buffer->trace, count);
1062 }
1063 #else
1064 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
1065
1066 /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1067 * first one was handled by cmd_buffer_flush_gfx_state.
1068 */
1069 if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1070 genX(emit_hs)(cmd_buffer);
1071 genX(emit_ds)(cmd_buffer);
1072
1073 const uint32_t count = draw->vertexCount * instanceCount;
1074 anv_measure_snapshot(cmd_buffer,
1075 INTEL_SNAPSHOT_DRAW,
1076 "draw multi", count);
1077 trace_intel_begin_draw_multi(&cmd_buffer->trace);
1078
1079 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1080
1081 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1082 #if GFX_VERx10 >= 125
1083 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1084 #endif
1085 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1086 prim.VertexAccessType = SEQUENTIAL;
1087 prim.VertexCountPerInstance = draw->vertexCount;
1088 prim.StartVertexLocation = draw->firstVertex;
1089 prim.InstanceCount = instanceCount;
1090 prim.StartInstanceLocation = firstInstance;
1091 prim.BaseVertexLocation = 0;
1092 prim.ExtendedParametersPresent = true;
1093 prim.ExtendedParameter0 = draw->firstVertex;
1094 prim.ExtendedParameter1 = firstInstance;
1095 prim.ExtendedParameter2 = i;
1096 }
1097
1098 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1099 cmd_buffer->device,
1100 cmd_buffer->state.gfx.primitive_topology,
1101 drawCount == 0 ? 0 :
1102 pVertexInfo[drawCount - 1].vertexCount);
1103
1104 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1105 trace_intel_end_draw_multi(&cmd_buffer->trace, count);
1106 }
1107 #endif
1108
1109 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1110 }
1111
genX(CmdDrawIndexed)1112 void genX(CmdDrawIndexed)(
1113 VkCommandBuffer commandBuffer,
1114 uint32_t indexCount,
1115 uint32_t instanceCount,
1116 uint32_t firstIndex,
1117 int32_t vertexOffset,
1118 uint32_t firstInstance)
1119 {
1120 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1121 struct anv_graphics_pipeline *pipeline =
1122 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1123
1124 if (anv_batch_has_error(&cmd_buffer->batch))
1125 return;
1126
1127 const uint32_t count =
1128 indexCount * instanceCount * pipeline->instance_multiplier;
1129 anv_measure_snapshot(cmd_buffer,
1130 INTEL_SNAPSHOT_DRAW,
1131 "draw indexed",
1132 count);
1133 trace_intel_begin_draw_indexed(&cmd_buffer->trace);
1134
1135 /* Select pipeline here to allow
1136 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1137 * cmd_buffer_flush_gfx_state().
1138 */
1139 genX(flush_pipeline_select_3d)(cmd_buffer);
1140
1141 if (cmd_buffer->state.conditional_render_enabled)
1142 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1143
1144 #if GFX_VER < 11
1145 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1146 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1147 vertexOffset, firstInstance,
1148 0, false /* force_flush */);
1149 #endif
1150
1151 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1152 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1153
1154 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1155 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1156 #if GFX_VERx10 >= 125
1157 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1158 #endif
1159 prim.VertexAccessType = RANDOM;
1160 prim.VertexCountPerInstance = indexCount;
1161 prim.StartVertexLocation = firstIndex;
1162 prim.InstanceCount = instanceCount *
1163 pipeline->instance_multiplier;
1164 prim.StartInstanceLocation = firstInstance;
1165 prim.BaseVertexLocation = vertexOffset;
1166 #if GFX_VER >= 11
1167 prim.ExtendedParametersPresent = true;
1168 prim.ExtendedParameter0 = vertexOffset;
1169 prim.ExtendedParameter1 = firstInstance;
1170 prim.ExtendedParameter2 = 0;
1171 #endif
1172 }
1173
1174 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1175 cmd_buffer->device,
1176 cmd_buffer->state.gfx.primitive_topology,
1177 indexCount);
1178 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1179
1180 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
1181
1182 trace_intel_end_draw_indexed(&cmd_buffer->trace, count);
1183 }
1184
genX(CmdDrawMultiIndexedEXT)1185 void genX(CmdDrawMultiIndexedEXT)(
1186 VkCommandBuffer commandBuffer,
1187 uint32_t drawCount,
1188 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
1189 uint32_t instanceCount,
1190 uint32_t firstInstance,
1191 uint32_t stride,
1192 const int32_t *pVertexOffset)
1193 {
1194 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1195 struct anv_graphics_pipeline *pipeline =
1196 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1197
1198 if (anv_batch_has_error(&cmd_buffer->batch))
1199 return;
1200
1201 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1202
1203 if (cmd_buffer->state.conditional_render_enabled)
1204 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1205
1206 uint32_t i = 0;
1207 #if GFX_VER < 11
1208 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1209 if (pVertexOffset) {
1210 if (vs_prog_data->uses_drawid) {
1211 bool emitted = true;
1212 if (vs_prog_data->uses_firstvertex ||
1213 vs_prog_data->uses_baseinstance) {
1214 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1215 emitted = true;
1216 }
1217 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1218 if (vs_prog_data->uses_drawid) {
1219 emit_draw_index(cmd_buffer, i);
1220 emitted = true;
1221 }
1222 /* Emitting draw index or vertex index BOs may result in needing
1223 * additional VF cache flushes.
1224 */
1225 if (emitted)
1226 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1227
1228 const uint32_t count =
1229 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1230 anv_measure_snapshot(cmd_buffer,
1231 INTEL_SNAPSHOT_DRAW,
1232 "draw indexed multi",
1233 count);
1234 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1235 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1236 true);
1237
1238 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1239 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1240 prim.VertexAccessType = RANDOM;
1241 prim.VertexCountPerInstance = draw->indexCount;
1242 prim.StartVertexLocation = draw->firstIndex;
1243 prim.InstanceCount = instanceCount *
1244 pipeline->instance_multiplier;
1245 prim.StartInstanceLocation = firstInstance;
1246 prim.BaseVertexLocation = *pVertexOffset;
1247 }
1248
1249 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1250 cmd_buffer->device,
1251 cmd_buffer->state.gfx.primitive_topology,
1252 drawCount == 0 ? 0 :
1253 pIndexInfo[drawCount - 1].indexCount);
1254
1255 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1256 false);
1257 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1258 emitted = false;
1259 }
1260 } else {
1261 if (vs_prog_data->uses_firstvertex ||
1262 vs_prog_data->uses_baseinstance) {
1263 emit_base_vertex_instance(cmd_buffer, *pVertexOffset, firstInstance);
1264 /* Emitting draw index or vertex index BOs may result in needing
1265 * additional VF cache flushes.
1266 */
1267 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1268 }
1269 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1270 const uint32_t count =
1271 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1272 anv_measure_snapshot(cmd_buffer,
1273 INTEL_SNAPSHOT_DRAW,
1274 "draw indexed multi",
1275 count);
1276 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1277 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1278 true);
1279
1280 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1281 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1282 prim.VertexAccessType = RANDOM;
1283 prim.VertexCountPerInstance = draw->indexCount;
1284 prim.StartVertexLocation = draw->firstIndex;
1285 prim.InstanceCount = instanceCount *
1286 pipeline->instance_multiplier;
1287 prim.StartInstanceLocation = firstInstance;
1288 prim.BaseVertexLocation = *pVertexOffset;
1289 }
1290
1291 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1292 cmd_buffer->device,
1293 cmd_buffer->state.gfx.primitive_topology,
1294 drawCount == 0 ? 0 :
1295 pIndexInfo[drawCount - 1].indexCount);
1296
1297 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device,
1298 false);
1299 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1300 }
1301 }
1302 } else {
1303 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1304 cmd_buffer_emit_vertex_constants_and_flush(cmd_buffer, vs_prog_data,
1305 draw->vertexOffset,
1306 firstInstance, i, i != 0);
1307
1308 const uint32_t count =
1309 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1310 anv_measure_snapshot(cmd_buffer,
1311 INTEL_SNAPSHOT_DRAW,
1312 "draw indexed multi",
1313 count);
1314 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1315 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1316
1317 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
1318 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1319 prim.VertexAccessType = RANDOM;
1320 prim.VertexCountPerInstance = draw->indexCount;
1321 prim.StartVertexLocation = draw->firstIndex;
1322 prim.InstanceCount = instanceCount *
1323 pipeline->instance_multiplier;
1324 prim.StartInstanceLocation = firstInstance;
1325 prim.BaseVertexLocation = draw->vertexOffset;
1326 }
1327
1328 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1329 cmd_buffer->device,
1330 cmd_buffer->state.gfx.primitive_topology,
1331 drawCount == 0 ? 0 :
1332 pIndexInfo[drawCount - 1].indexCount);
1333
1334 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1335 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1336 }
1337 }
1338 #else
1339 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
1340
1341 /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1342 * first one was handled by cmd_buffer_flush_gfx_state.
1343 */
1344 if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1345 genX(emit_hs)(cmd_buffer);
1346 genX(emit_ds)(cmd_buffer);
1347
1348 const uint32_t count =
1349 draw->indexCount * instanceCount * pipeline->instance_multiplier;
1350 anv_measure_snapshot(cmd_buffer,
1351 INTEL_SNAPSHOT_DRAW,
1352 "draw indexed multi",
1353 count);
1354 trace_intel_begin_draw_indexed_multi(&cmd_buffer->trace);
1355 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1356
1357 anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE_EXTENDED), prim) {
1358 #if GFX_VERx10 >= 125
1359 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1360 #endif
1361 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1362 prim.VertexAccessType = RANDOM;
1363 prim.VertexCountPerInstance = draw->indexCount;
1364 prim.StartVertexLocation = draw->firstIndex;
1365 prim.InstanceCount = instanceCount *
1366 pipeline->instance_multiplier;
1367 prim.StartInstanceLocation = firstInstance;
1368 prim.BaseVertexLocation = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1369 prim.ExtendedParametersPresent = true;
1370 prim.ExtendedParameter0 = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
1371 prim.ExtendedParameter1 = firstInstance;
1372 prim.ExtendedParameter2 = i;
1373 }
1374
1375 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1376 cmd_buffer->device,
1377 cmd_buffer->state.gfx.primitive_topology,
1378 drawCount == 0 ? 0 :
1379 pIndexInfo[drawCount - 1].indexCount);
1380
1381 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1382 trace_intel_end_draw_indexed_multi(&cmd_buffer->trace, count);
1383 }
1384 #endif
1385
1386 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, RANDOM);
1387 }
1388
1389 /* Auto-Draw / Indirect Registers */
1390 #define GFX7_3DPRIM_END_OFFSET 0x2420
1391 #define GFX7_3DPRIM_START_VERTEX 0x2430
1392 #define GFX7_3DPRIM_VERTEX_COUNT 0x2434
1393 #define GFX7_3DPRIM_INSTANCE_COUNT 0x2438
1394 #define GFX7_3DPRIM_START_INSTANCE 0x243C
1395 #define GFX7_3DPRIM_BASE_VERTEX 0x2440
1396
1397 /* On Gen11+, we have three custom "extended parameters" which we can use to
1398 * provide extra system-generated values to shaders. Our assignment of these
1399 * is arbitrary; we choose to assign them as follows:
1400 *
1401 * gl_BaseVertex = XP0
1402 * gl_BaseInstance = XP1
1403 * gl_DrawID = XP2
1404 *
1405 * For gl_BaseInstance, we never actually have to set up the value because we
1406 * can just program 3DSTATE_VF_SGVS_2 to load it implicitly. We can also do
1407 * that for gl_BaseVertex but it does the wrong thing for indexed draws.
1408 */
1409 #define GEN11_3DPRIM_XP0 0x2690
1410 #define GEN11_3DPRIM_XP1 0x2694
1411 #define GEN11_3DPRIM_XP2 0x2698
1412 #define GEN11_3DPRIM_XP_BASE_VERTEX GEN11_3DPRIM_XP0
1413 #define GEN11_3DPRIM_XP_BASE_INSTANCE GEN11_3DPRIM_XP1
1414 #define GEN11_3DPRIM_XP_DRAW_ID GEN11_3DPRIM_XP2
1415
genX(CmdDrawIndirectByteCountEXT)1416 void genX(CmdDrawIndirectByteCountEXT)(
1417 VkCommandBuffer commandBuffer,
1418 uint32_t instanceCount,
1419 uint32_t firstInstance,
1420 VkBuffer counterBuffer,
1421 VkDeviceSize counterBufferOffset,
1422 uint32_t counterOffset,
1423 uint32_t vertexStride)
1424 {
1425 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1426 ANV_FROM_HANDLE(anv_buffer, counter_buffer, counterBuffer);
1427 struct anv_graphics_pipeline *pipeline =
1428 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1429
1430 /* firstVertex is always zero for this draw function */
1431 const uint32_t firstVertex = 0;
1432
1433 if (anv_batch_has_error(&cmd_buffer->batch))
1434 return;
1435
1436 anv_measure_snapshot(cmd_buffer,
1437 INTEL_SNAPSHOT_DRAW,
1438 "draw indirect byte count",
1439 instanceCount * pipeline->instance_multiplier);
1440 trace_intel_begin_draw_indirect_byte_count(&cmd_buffer->trace);
1441
1442 /* Select pipeline here to allow
1443 * cmd_buffer_emit_vertex_constants_and_flush() without flushing before
1444 * emit_base_vertex_instance() & emit_draw_index().
1445 */
1446 genX(flush_pipeline_select_3d)(cmd_buffer);
1447
1448 if (cmd_buffer->state.conditional_render_enabled)
1449 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1450
1451 #if GFX_VER < 11
1452 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1453 if (vs_prog_data->uses_firstvertex ||
1454 vs_prog_data->uses_baseinstance)
1455 emit_base_vertex_instance(cmd_buffer, firstVertex, firstInstance);
1456 if (vs_prog_data->uses_drawid)
1457 emit_draw_index(cmd_buffer, 0);
1458 #endif
1459
1460 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1461
1462 struct mi_builder b;
1463 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1464 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &counter_buffer->address);
1465 mi_builder_set_mocs(&b, mocs);
1466 struct mi_value count =
1467 mi_mem32(anv_address_add(counter_buffer->address,
1468 counterBufferOffset));
1469 if (counterOffset)
1470 count = mi_isub(&b, count, mi_imm(counterOffset));
1471 count = mi_udiv32_imm(&b, count, vertexStride);
1472 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT), count);
1473
1474 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX), mi_imm(firstVertex));
1475 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT),
1476 mi_imm(instanceCount * pipeline->instance_multiplier));
1477 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE), mi_imm(firstInstance));
1478 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1479
1480 #if GFX_VER >= 11
1481 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1482 mi_imm(firstVertex));
1483 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1484 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID), mi_imm(0));
1485 #endif
1486
1487 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1488 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1489 #if GFX_VERx10 >= 125
1490 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1491 #endif
1492 prim.IndirectParameterEnable = true;
1493 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1494 prim.VertexAccessType = SEQUENTIAL;
1495 #if GFX_VER >= 11
1496 prim.ExtendedParametersPresent = true;
1497 #endif
1498 }
1499
1500 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1501 cmd_buffer->device,
1502 cmd_buffer->state.gfx.primitive_topology,
1503 1);
1504 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1505
1506 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1507
1508 trace_intel_end_draw_indirect_byte_count(&cmd_buffer->trace,
1509 instanceCount * pipeline->instance_multiplier);
1510 }
1511
1512 static void
load_indirect_parameters(struct anv_cmd_buffer * cmd_buffer,struct anv_address addr,bool indexed,uint32_t draw_id)1513 load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
1514 struct anv_address addr,
1515 bool indexed,
1516 uint32_t draw_id)
1517 {
1518 struct anv_graphics_pipeline *pipeline =
1519 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1520
1521 struct mi_builder b;
1522 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1523 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &addr);
1524 mi_builder_set_mocs(&b, mocs);
1525
1526 mi_store(&b, mi_reg32(GFX7_3DPRIM_VERTEX_COUNT),
1527 mi_mem32(anv_address_add(addr, 0)));
1528
1529 struct mi_value instance_count = mi_mem32(anv_address_add(addr, 4));
1530 if (pipeline->instance_multiplier > 1) {
1531 instance_count = mi_imul_imm(&b, instance_count,
1532 pipeline->instance_multiplier);
1533 }
1534 mi_store(&b, mi_reg32(GFX7_3DPRIM_INSTANCE_COUNT), instance_count);
1535
1536 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_VERTEX),
1537 mi_mem32(anv_address_add(addr, 8)));
1538
1539 if (indexed) {
1540 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX),
1541 mi_mem32(anv_address_add(addr, 12)));
1542 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1543 mi_mem32(anv_address_add(addr, 16)));
1544 #if GFX_VER >= 11
1545 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1546 mi_mem32(anv_address_add(addr, 12)));
1547 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1548 #endif
1549 } else {
1550 mi_store(&b, mi_reg32(GFX7_3DPRIM_START_INSTANCE),
1551 mi_mem32(anv_address_add(addr, 12)));
1552 mi_store(&b, mi_reg32(GFX7_3DPRIM_BASE_VERTEX), mi_imm(0));
1553 #if GFX_VER >= 11
1554 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_BASE_VERTEX),
1555 mi_mem32(anv_address_add(addr, 8)));
1556 /* GEN11_3DPRIM_XP_BASE_INSTANCE is implicit */
1557 #endif
1558 }
1559
1560 #if GFX_VER >= 11
1561 mi_store(&b, mi_reg32(GEN11_3DPRIM_XP_DRAW_ID),
1562 mi_imm(draw_id));
1563 #endif
1564 }
1565
1566 static const bool
execute_indirect_draw_supported(struct anv_cmd_buffer * cmd_buffer)1567 execute_indirect_draw_supported(struct anv_cmd_buffer *cmd_buffer)
1568 {
1569 #if GFX_VERx10 >= 125
1570 const struct intel_device_info *devinfo = cmd_buffer->device->info;
1571 struct anv_graphics_pipeline *pipeline =
1572 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1573 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1574 const bool is_multiview = pipeline->instance_multiplier > 1;
1575
1576 return (devinfo->has_indirect_unroll &&
1577 !is_multiview &&
1578 !vs_prog_data->uses_firstvertex &&
1579 !vs_prog_data->uses_baseinstance &&
1580 !vs_prog_data->uses_drawid);
1581 #else
1582 return false;
1583 #endif
1584 }
1585
1586 static void
emit_indirect_draws(struct anv_cmd_buffer * cmd_buffer,struct anv_address indirect_data_addr,uint32_t indirect_data_stride,uint32_t draw_count,bool indexed)1587 emit_indirect_draws(struct anv_cmd_buffer *cmd_buffer,
1588 struct anv_address indirect_data_addr,
1589 uint32_t indirect_data_stride,
1590 uint32_t draw_count,
1591 bool indexed)
1592 {
1593 #if GFX_VER < 11
1594 struct anv_graphics_pipeline *pipeline =
1595 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1596 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1597 #endif
1598 UNUSED const struct intel_device_info *devinfo = cmd_buffer->device->info;
1599 UNUSED const bool aligned_stride =
1600 (indirect_data_stride == 0 ||
1601 indirect_data_stride == sizeof(VkDrawIndirectCommand));
1602 UNUSED const bool execute_indirect_supported =
1603 execute_indirect_draw_supported(cmd_buffer);
1604
1605 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1606
1607 if (cmd_buffer->state.conditional_render_enabled)
1608 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
1609
1610 uint32_t offset = 0;
1611 for (uint32_t i = 0; i < draw_count; i++) {
1612 struct anv_address draw = anv_address_add(indirect_data_addr, offset);
1613
1614 #if GFX_VER < 11
1615 /* TODO: We need to stomp base vertex to 0 somehow */
1616
1617 /* With sequential draws, we're dealing with the VkDrawIndirectCommand
1618 * structure data. We want to load VkDrawIndirectCommand::firstVertex at
1619 * offset 8 in the structure.
1620 *
1621 * With indexed draws, we're dealing with VkDrawIndexedIndirectCommand.
1622 * We want the VkDrawIndirectCommand::vertexOffset field at offset 12 in
1623 * the structure.
1624 */
1625 if (vs_prog_data->uses_firstvertex ||
1626 vs_prog_data->uses_baseinstance) {
1627 emit_base_vertex_instance_bo(cmd_buffer,
1628 anv_address_add(draw, indexed ? 12 : 8));
1629 }
1630 if (vs_prog_data->uses_drawid)
1631 emit_draw_index(cmd_buffer, i);
1632 #endif
1633
1634 /* Emitting draw index or vertex index BOs may result in needing
1635 * additional VF cache flushes.
1636 */
1637 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1638
1639 /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1640 * first one was handled by cmd_buffer_flush_gfx_state.
1641 */
1642 if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1643 genX(emit_hs)(cmd_buffer);
1644 genX(emit_ds)(cmd_buffer);
1645
1646 if (execute_indirect_supported) {
1647 #if GFX_VERx10 >= 125
1648 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1649 anv_batch_emit(&cmd_buffer->batch, GENX(EXECUTE_INDIRECT_DRAW), ind) {
1650 ind.ArgumentFormat = DRAW;
1651 ind.TBIMREnabled = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1652 ind.PredicateEnable =
1653 cmd_buffer->state.conditional_render_enabled;
1654 ind.MaxCount = aligned_stride ? draw_count : 1;
1655 ind.ArgumentBufferStartAddress = draw;
1656 ind.MOCS =
1657 anv_mocs(cmd_buffer->device, draw.bo, 0);
1658 }
1659 /* If all the indirect structures are aligned, then we can let the HW
1660 * do the unrolling and we only need one instruction. Otherwise we
1661 * need to emit one instruction per draw, but we're still avoiding
1662 * the register loads with MI commands.
1663 */
1664 if (aligned_stride)
1665 break;
1666 #else
1667 unreachable("EXECUTE_INDIRECT_DRAW instruction expectation mismatch");
1668 #endif
1669 } else {
1670 load_indirect_parameters(cmd_buffer, draw, indexed, i);
1671
1672 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1673 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1674 #if GFX_VERx10 >= 125
1675 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1676 #endif
1677 prim.IndirectParameterEnable = true;
1678 prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
1679 prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
1680 #if GFX_VER >= 11
1681 prim.ExtendedParametersPresent = true;
1682 #endif
1683 }
1684 }
1685
1686 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1687 cmd_buffer->device,
1688 cmd_buffer->state.gfx.primitive_topology,
1689 1);
1690
1691 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1692
1693 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer,
1694 indexed ? RANDOM : SEQUENTIAL);
1695
1696 offset += indirect_data_stride;
1697 }
1698 }
1699
genX(CmdDrawIndirect)1700 void genX(CmdDrawIndirect)(
1701 VkCommandBuffer commandBuffer,
1702 VkBuffer _buffer,
1703 VkDeviceSize offset,
1704 uint32_t drawCount,
1705 uint32_t stride)
1706 {
1707 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1708 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1709
1710 if (anv_batch_has_error(&cmd_buffer->batch))
1711 return;
1712
1713 anv_measure_snapshot(cmd_buffer,
1714 INTEL_SNAPSHOT_DRAW,
1715 "draw indirect",
1716 drawCount);
1717 trace_intel_begin_draw_indirect(&cmd_buffer->trace);
1718
1719 if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1720 genX(cmd_buffer_emit_indirect_generated_draws)(
1721 cmd_buffer,
1722 anv_address_add(buffer->address, offset),
1723 MAX2(stride, sizeof(VkDrawIndirectCommand)),
1724 ANV_NULL_ADDRESS /* count_addr */,
1725 drawCount,
1726 false /* indexed */);
1727 } else {
1728 emit_indirect_draws(cmd_buffer,
1729 anv_address_add(buffer->address, offset),
1730 stride, drawCount, false /* indexed */);
1731 }
1732
1733 trace_intel_end_draw_indirect(&cmd_buffer->trace, drawCount);
1734 }
1735
genX(CmdDrawIndexedIndirect)1736 void genX(CmdDrawIndexedIndirect)(
1737 VkCommandBuffer commandBuffer,
1738 VkBuffer _buffer,
1739 VkDeviceSize offset,
1740 uint32_t drawCount,
1741 uint32_t stride)
1742 {
1743 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1744 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1745
1746 if (anv_batch_has_error(&cmd_buffer->batch))
1747 return;
1748
1749 anv_measure_snapshot(cmd_buffer,
1750 INTEL_SNAPSHOT_DRAW,
1751 "draw indexed indirect",
1752 drawCount);
1753 trace_intel_begin_draw_indexed_indirect(&cmd_buffer->trace);
1754
1755 if (anv_use_generated_draws(cmd_buffer, drawCount)) {
1756 genX(cmd_buffer_emit_indirect_generated_draws)(
1757 cmd_buffer,
1758 anv_address_add(buffer->address, offset),
1759 MAX2(stride, sizeof(VkDrawIndexedIndirectCommand)),
1760 ANV_NULL_ADDRESS /* count_addr */,
1761 drawCount,
1762 true /* indexed */);
1763 } else {
1764 emit_indirect_draws(cmd_buffer,
1765 anv_address_add(buffer->address, offset),
1766 stride, drawCount, true /* indexed */);
1767 }
1768
1769 trace_intel_end_draw_indexed_indirect(&cmd_buffer->trace, drawCount);
1770 }
1771
1772 #define MI_PREDICATE_SRC0 0x2400
1773 #define MI_PREDICATE_SRC1 0x2408
1774 #define MI_PREDICATE_RESULT 0x2418
1775
1776 static struct mi_value
prepare_for_draw_count_predicate(struct anv_cmd_buffer * cmd_buffer,struct mi_builder * b,struct anv_address count_address)1777 prepare_for_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1778 struct mi_builder *b,
1779 struct anv_address count_address)
1780 {
1781 struct mi_value ret = mi_imm(0);
1782
1783 if (cmd_buffer->state.conditional_render_enabled) {
1784 ret = mi_new_gpr(b);
1785 mi_store(b, mi_value_ref(b, ret), mi_mem32(count_address));
1786 } else {
1787 /* Upload the current draw count from the draw parameters buffer to
1788 * MI_PREDICATE_SRC0.
1789 */
1790 mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem32(count_address));
1791 mi_store(b, mi_reg32(MI_PREDICATE_SRC1 + 4), mi_imm(0));
1792 }
1793
1794 return ret;
1795 }
1796
1797 static void
emit_draw_count_predicate(struct anv_cmd_buffer * cmd_buffer,struct mi_builder * b,uint32_t draw_index)1798 emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
1799 struct mi_builder *b,
1800 uint32_t draw_index)
1801 {
1802 /* Upload the index of the current primitive to MI_PREDICATE_SRC1. */
1803 mi_store(b, mi_reg32(MI_PREDICATE_SRC1), mi_imm(draw_index));
1804
1805 if (draw_index == 0) {
1806 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1807 mip.LoadOperation = LOAD_LOADINV;
1808 mip.CombineOperation = COMBINE_SET;
1809 mip.CompareOperation = COMPARE_SRCS_EQUAL;
1810 }
1811 } else {
1812 /* While draw_index < draw_count the predicate's result will be
1813 * (draw_index == draw_count) ^ TRUE = TRUE
1814 * When draw_index == draw_count the result is
1815 * (TRUE) ^ TRUE = FALSE
1816 * After this all results will be:
1817 * (FALSE) ^ FALSE = FALSE
1818 */
1819 anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
1820 mip.LoadOperation = LOAD_LOAD;
1821 mip.CombineOperation = COMBINE_XOR;
1822 mip.CompareOperation = COMPARE_SRCS_EQUAL;
1823 }
1824 }
1825 }
1826
1827 static void
emit_draw_count_predicate_with_conditional_render(struct anv_cmd_buffer * cmd_buffer,struct mi_builder * b,uint32_t draw_index,struct mi_value max)1828 emit_draw_count_predicate_with_conditional_render(
1829 struct anv_cmd_buffer *cmd_buffer,
1830 struct mi_builder *b,
1831 uint32_t draw_index,
1832 struct mi_value max)
1833 {
1834 struct mi_value pred = mi_ult(b, mi_imm(draw_index), max);
1835 pred = mi_iand(b, pred, mi_reg64(ANV_PREDICATE_RESULT_REG));
1836
1837 mi_store(b, mi_reg32(MI_PREDICATE_RESULT), pred);
1838 }
1839
1840 static void
emit_draw_count_predicate_cond(struct anv_cmd_buffer * cmd_buffer,struct mi_builder * b,uint32_t draw_index,struct mi_value max)1841 emit_draw_count_predicate_cond(struct anv_cmd_buffer *cmd_buffer,
1842 struct mi_builder *b,
1843 uint32_t draw_index,
1844 struct mi_value max)
1845 {
1846 if (cmd_buffer->state.conditional_render_enabled) {
1847 emit_draw_count_predicate_with_conditional_render(
1848 cmd_buffer, b, draw_index, mi_value_ref(b, max));
1849 } else {
1850 emit_draw_count_predicate(cmd_buffer, b, draw_index);
1851 }
1852 }
1853
1854 static void
emit_indirect_count_draws(struct anv_cmd_buffer * cmd_buffer,struct anv_address indirect_data_addr,uint64_t indirect_data_stride,struct anv_address draw_count_addr,uint32_t max_draw_count,bool indexed)1855 emit_indirect_count_draws(struct anv_cmd_buffer *cmd_buffer,
1856 struct anv_address indirect_data_addr,
1857 uint64_t indirect_data_stride,
1858 struct anv_address draw_count_addr,
1859 uint32_t max_draw_count,
1860 bool indexed)
1861 {
1862 #if GFX_VER < 11
1863 struct anv_graphics_pipeline *pipeline =
1864 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
1865 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1866 #endif
1867
1868 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
1869
1870 struct mi_builder b;
1871 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1872 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &draw_count_addr);
1873 mi_builder_set_mocs(&b, mocs);
1874 struct mi_value max =
1875 prepare_for_draw_count_predicate(cmd_buffer, &b, draw_count_addr);
1876
1877 for (uint32_t i = 0; i < max_draw_count; i++) {
1878 struct anv_address draw =
1879 anv_address_add(indirect_data_addr, i * indirect_data_stride);
1880
1881 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
1882
1883 #if GFX_VER < 11
1884 if (vs_prog_data->uses_firstvertex ||
1885 vs_prog_data->uses_baseinstance) {
1886 emit_base_vertex_instance_bo(cmd_buffer,
1887 anv_address_add(draw, indexed ? 12 : 8));
1888 }
1889 if (vs_prog_data->uses_drawid)
1890 emit_draw_index(cmd_buffer, i);
1891
1892 /* Emitting draw index or vertex index BOs may result in needing
1893 * additional VF cache flushes.
1894 */
1895 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1896 #endif
1897
1898 load_indirect_parameters(cmd_buffer, draw, indexed, i);
1899
1900 /* Wa_1306463417, Wa_16011107343 - Send HS state for every primitive,
1901 * first one was handled by cmd_buffer_flush_gfx_state.
1902 */
1903 if (i && (INTEL_NEEDS_WA_1306463417 || INTEL_NEEDS_WA_16011107343))
1904 genX(emit_hs)(cmd_buffer);
1905 genX(emit_ds)(cmd_buffer);
1906
1907 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true);
1908 anv_batch_emit(&cmd_buffer->batch, _3DPRIMITIVE_DIRECT, prim) {
1909 #if GFX_VERx10 >= 125
1910 prim.TBIMREnable = cmd_buffer->state.gfx.dyn_state.use_tbimr;
1911 #endif
1912 prim.IndirectParameterEnable = true;
1913 prim.PredicateEnable = true;
1914 prim.VertexAccessType = indexed ? RANDOM : SEQUENTIAL;
1915 #if GFX_VER >= 11
1916 prim.ExtendedParametersPresent = true;
1917 #endif
1918 }
1919
1920 genX(batch_emit_post_3dprimitive_was)(&cmd_buffer->batch,
1921 cmd_buffer->device,
1922 cmd_buffer->state.gfx.primitive_topology,
1923 1);
1924 genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false);
1925
1926 update_dirty_vbs_for_gfx8_vb_flush(cmd_buffer, SEQUENTIAL);
1927 }
1928
1929 mi_value_unref(&b, max);
1930 }
1931
genX(CmdDrawIndirectCount)1932 void genX(CmdDrawIndirectCount)(
1933 VkCommandBuffer commandBuffer,
1934 VkBuffer _buffer,
1935 VkDeviceSize offset,
1936 VkBuffer _countBuffer,
1937 VkDeviceSize countBufferOffset,
1938 uint32_t maxDrawCount,
1939 uint32_t stride)
1940 {
1941 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1942 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1943 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
1944
1945 if (anv_batch_has_error(&cmd_buffer->batch))
1946 return;
1947
1948 anv_measure_snapshot(cmd_buffer,
1949 INTEL_SNAPSHOT_DRAW,
1950 "draw indirect count",
1951 0);
1952 trace_intel_begin_draw_indirect_count(&cmd_buffer->trace);
1953
1954 struct anv_address indirect_data_address =
1955 anv_address_add(buffer->address, offset);
1956 struct anv_address count_address =
1957 anv_address_add(count_buffer->address, countBufferOffset);
1958 stride = MAX2(stride, sizeof(VkDrawIndirectCommand));
1959
1960 if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
1961 genX(cmd_buffer_emit_indirect_generated_draws)(
1962 cmd_buffer,
1963 indirect_data_address,
1964 stride,
1965 count_address,
1966 maxDrawCount,
1967 false /* indexed */);
1968 } else {
1969 emit_indirect_count_draws(cmd_buffer,
1970 indirect_data_address,
1971 stride,
1972 count_address,
1973 maxDrawCount,
1974 false /* indexed */);
1975 }
1976
1977 trace_intel_end_draw_indirect_count(&cmd_buffer->trace, maxDrawCount);
1978 }
1979
genX(CmdDrawIndexedIndirectCount)1980 void genX(CmdDrawIndexedIndirectCount)(
1981 VkCommandBuffer commandBuffer,
1982 VkBuffer _buffer,
1983 VkDeviceSize offset,
1984 VkBuffer _countBuffer,
1985 VkDeviceSize countBufferOffset,
1986 uint32_t maxDrawCount,
1987 uint32_t stride)
1988 {
1989 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1990 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
1991 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
1992
1993 if (anv_batch_has_error(&cmd_buffer->batch))
1994 return;
1995
1996 anv_measure_snapshot(cmd_buffer,
1997 INTEL_SNAPSHOT_DRAW,
1998 "draw indexed indirect count",
1999 0);
2000 trace_intel_begin_draw_indexed_indirect_count(&cmd_buffer->trace);
2001
2002 struct anv_address indirect_data_address =
2003 anv_address_add(buffer->address, offset);
2004 struct anv_address count_address =
2005 anv_address_add(count_buffer->address, countBufferOffset);
2006 stride = MAX2(stride, sizeof(VkDrawIndexedIndirectCommand));
2007
2008 if (anv_use_generated_draws(cmd_buffer, maxDrawCount)) {
2009 genX(cmd_buffer_emit_indirect_generated_draws)(
2010 cmd_buffer,
2011 indirect_data_address,
2012 stride,
2013 count_address,
2014 maxDrawCount,
2015 true /* indexed */);
2016 } else {
2017 emit_indirect_count_draws(cmd_buffer,
2018 indirect_data_address,
2019 stride,
2020 count_address,
2021 maxDrawCount,
2022 true /* indexed */);
2023 }
2024
2025 trace_intel_end_draw_indexed_indirect_count(&cmd_buffer->trace, maxDrawCount);
2026
2027 }
2028
genX(CmdBeginTransformFeedbackEXT)2029 void genX(CmdBeginTransformFeedbackEXT)(
2030 VkCommandBuffer commandBuffer,
2031 uint32_t firstCounterBuffer,
2032 uint32_t counterBufferCount,
2033 const VkBuffer* pCounterBuffers,
2034 const VkDeviceSize* pCounterBufferOffsets)
2035 {
2036 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2037
2038 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2039 assert(counterBufferCount <= MAX_XFB_BUFFERS);
2040 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2041
2042 trace_intel_begin_xfb(&cmd_buffer->trace);
2043
2044 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2045 *
2046 * "Ssoftware must ensure that no HW stream output operations can be in
2047 * process or otherwise pending at the point that the MI_LOAD/STORE
2048 * commands are processed. This will likely require a pipeline flush."
2049 */
2050 anv_add_pending_pipe_bits(cmd_buffer,
2051 ANV_PIPE_CS_STALL_BIT,
2052 "begin transform feedback");
2053 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2054
2055 for (uint32_t idx = 0; idx < MAX_XFB_BUFFERS; idx++) {
2056 /* If we have a counter buffer, this is a resume so we need to load the
2057 * value into the streamout offset register. Otherwise, this is a begin
2058 * and we need to reset it to zero.
2059 */
2060 if (pCounterBuffers &&
2061 idx >= firstCounterBuffer &&
2062 idx - firstCounterBuffer < counterBufferCount &&
2063 pCounterBuffers[idx - firstCounterBuffer] != VK_NULL_HANDLE) {
2064 uint32_t cb_idx = idx - firstCounterBuffer;
2065 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2066 uint64_t offset = pCounterBufferOffsets ?
2067 pCounterBufferOffsets[cb_idx] : 0;
2068
2069 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
2070 lrm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2071 lrm.MemoryAddress = anv_address_add(counter_buffer->address,
2072 offset);
2073 }
2074 } else {
2075 anv_batch_emit(&cmd_buffer->batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
2076 lri.RegisterOffset = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2077 lri.DataDWord = 0;
2078 }
2079 }
2080 }
2081
2082 cmd_buffer->state.xfb_enabled = true;
2083 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2084 }
2085
genX(CmdEndTransformFeedbackEXT)2086 void genX(CmdEndTransformFeedbackEXT)(
2087 VkCommandBuffer commandBuffer,
2088 uint32_t firstCounterBuffer,
2089 uint32_t counterBufferCount,
2090 const VkBuffer* pCounterBuffers,
2091 const VkDeviceSize* pCounterBufferOffsets)
2092 {
2093 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2094
2095 assert(firstCounterBuffer < MAX_XFB_BUFFERS);
2096 assert(counterBufferCount <= MAX_XFB_BUFFERS);
2097 assert(firstCounterBuffer + counterBufferCount <= MAX_XFB_BUFFERS);
2098
2099 /* From the SKL PRM Vol. 2c, SO_WRITE_OFFSET:
2100 *
2101 * "Ssoftware must ensure that no HW stream output operations can be in
2102 * process or otherwise pending at the point that the MI_LOAD/STORE
2103 * commands are processed. This will likely require a pipeline flush."
2104 */
2105 anv_add_pending_pipe_bits(cmd_buffer,
2106 ANV_PIPE_CS_STALL_BIT,
2107 "end transform feedback");
2108 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
2109
2110 for (uint32_t cb_idx = 0; cb_idx < counterBufferCount; cb_idx++) {
2111 unsigned idx = firstCounterBuffer + cb_idx;
2112
2113 /* If we have a counter buffer, this is a resume so we need to load the
2114 * value into the streamout offset register. Otherwise, this is a begin
2115 * and we need to reset it to zero.
2116 */
2117 if (pCounterBuffers &&
2118 cb_idx < counterBufferCount &&
2119 pCounterBuffers[cb_idx] != VK_NULL_HANDLE) {
2120 ANV_FROM_HANDLE(anv_buffer, counter_buffer, pCounterBuffers[cb_idx]);
2121 uint64_t offset = pCounterBufferOffsets ?
2122 pCounterBufferOffsets[cb_idx] : 0;
2123
2124 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), srm) {
2125 srm.MemoryAddress = anv_address_add(counter_buffer->address,
2126 offset);
2127 srm.RegisterAddress = GENX(SO_WRITE_OFFSET0_num) + idx * 4;
2128 }
2129 }
2130 }
2131
2132 trace_intel_end_xfb(&cmd_buffer->trace);
2133
2134 cmd_buffer->state.xfb_enabled = false;
2135 cmd_buffer->state.gfx.dirty |= ANV_CMD_DIRTY_XFB_ENABLE;
2136 }
2137
2138 #if GFX_VERx10 >= 125
2139
2140 void
genX(CmdDrawMeshTasksEXT)2141 genX(CmdDrawMeshTasksEXT)(
2142 VkCommandBuffer commandBuffer,
2143 uint32_t x,
2144 uint32_t y,
2145 uint32_t z)
2146 {
2147 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2148
2149 if (anv_batch_has_error(&cmd_buffer->batch))
2150 return;
2151
2152 anv_measure_snapshot(cmd_buffer,
2153 INTEL_SNAPSHOT_DRAW,
2154 "draw mesh", x * y * z);
2155
2156 trace_intel_begin_draw_mesh(&cmd_buffer->trace);
2157
2158 /* TODO(mesh): Check if this is not emitting more packets than we need. */
2159 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2160
2161 if (cmd_buffer->state.conditional_render_enabled)
2162 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2163
2164 anv_batch_emit(&cmd_buffer->batch, GENX(3DMESH_3D), m) {
2165 m.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
2166 m.ThreadGroupCountX = x;
2167 m.ThreadGroupCountY = y;
2168 m.ThreadGroupCountZ = z;
2169 }
2170
2171 trace_intel_end_draw_mesh(&cmd_buffer->trace, x, y, z);
2172 }
2173
2174 #define GFX125_3DMESH_TG_COUNT 0x26F0
2175 #define GFX10_3DPRIM_XP(n) (0x2690 + (n) * 4) /* n = { 0, 1, 2 } */
2176
2177 static void
mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer * cmd_buffer,struct mi_builder * b,struct anv_address addr,bool emit_xp0,uint32_t xp0)2178 mesh_load_indirect_parameters_3dmesh_3d(struct anv_cmd_buffer *cmd_buffer,
2179 struct mi_builder *b,
2180 struct anv_address addr,
2181 bool emit_xp0,
2182 uint32_t xp0)
2183 {
2184 const size_t groupCountXOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountX);
2185 const size_t groupCountYOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountY);
2186 const size_t groupCountZOff = offsetof(VkDrawMeshTasksIndirectCommandEXT, groupCountZ);
2187
2188 mi_store(b, mi_reg32(GFX125_3DMESH_TG_COUNT),
2189 mi_mem32(anv_address_add(addr, groupCountXOff)));
2190
2191 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(1)),
2192 mi_mem32(anv_address_add(addr, groupCountYOff)));
2193
2194 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(2)),
2195 mi_mem32(anv_address_add(addr, groupCountZOff)));
2196
2197 if (emit_xp0)
2198 mi_store(b, mi_reg32(GFX10_3DPRIM_XP(0)), mi_imm(xp0));
2199 }
2200
2201 static void
emit_indirect_3dmesh_3d(struct anv_batch * batch,bool predicate_enable,bool uses_drawid)2202 emit_indirect_3dmesh_3d(struct anv_batch *batch,
2203 bool predicate_enable,
2204 bool uses_drawid)
2205 {
2206 uint32_t len = GENX(3DMESH_3D_length) + uses_drawid;
2207 uint32_t *dw = anv_batch_emitn(batch, len, GENX(3DMESH_3D),
2208 .PredicateEnable = predicate_enable,
2209 .IndirectParameterEnable = true,
2210 .ExtendedParameter0Present = uses_drawid);
2211 if (uses_drawid)
2212 dw[len - 1] = 0;
2213 }
2214
2215 void
genX(CmdDrawMeshTasksIndirectEXT)2216 genX(CmdDrawMeshTasksIndirectEXT)(
2217 VkCommandBuffer commandBuffer,
2218 VkBuffer _buffer,
2219 VkDeviceSize offset,
2220 uint32_t drawCount,
2221 uint32_t stride)
2222 {
2223 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2224 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2225 struct anv_graphics_pipeline *pipeline =
2226 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2227 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2228 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2229 struct anv_cmd_state *cmd_state = &cmd_buffer->state;
2230
2231 if (anv_batch_has_error(&cmd_buffer->batch))
2232 return;
2233
2234 anv_measure_snapshot(cmd_buffer,
2235 INTEL_SNAPSHOT_DRAW,
2236 "draw mesh indirect", drawCount);
2237
2238 trace_intel_begin_draw_mesh_indirect(&cmd_buffer->trace);
2239
2240 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2241
2242 if (cmd_state->conditional_render_enabled)
2243 genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
2244
2245 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2246 mesh_prog_data->uses_drawid;
2247 struct mi_builder b;
2248 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2249
2250 for (uint32_t i = 0; i < drawCount; i++) {
2251 struct anv_address draw = anv_address_add(buffer->address, offset);
2252
2253 mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2254
2255 emit_indirect_3dmesh_3d(&cmd_buffer->batch,
2256 cmd_state->conditional_render_enabled, uses_drawid);
2257
2258 offset += stride;
2259 }
2260
2261 trace_intel_end_draw_mesh_indirect(&cmd_buffer->trace, drawCount);
2262 }
2263
2264 void
genX(CmdDrawMeshTasksIndirectCountEXT)2265 genX(CmdDrawMeshTasksIndirectCountEXT)(
2266 VkCommandBuffer commandBuffer,
2267 VkBuffer _buffer,
2268 VkDeviceSize offset,
2269 VkBuffer _countBuffer,
2270 VkDeviceSize countBufferOffset,
2271 uint32_t maxDrawCount,
2272 uint32_t stride)
2273 {
2274 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
2275 ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
2276 ANV_FROM_HANDLE(anv_buffer, count_buffer, _countBuffer);
2277 struct anv_graphics_pipeline *pipeline =
2278 anv_pipeline_to_graphics(cmd_buffer->state.gfx.base.pipeline);
2279 const struct brw_task_prog_data *task_prog_data = get_task_prog_data(pipeline);
2280 const struct brw_mesh_prog_data *mesh_prog_data = get_mesh_prog_data(pipeline);
2281
2282 if (anv_batch_has_error(&cmd_buffer->batch))
2283 return;
2284
2285 anv_measure_snapshot(cmd_buffer,
2286 INTEL_SNAPSHOT_DRAW,
2287 "draw mesh indirect count", 0);
2288
2289 trace_intel_begin_draw_mesh_indirect_count(&cmd_buffer->trace);
2290
2291 genX(cmd_buffer_flush_gfx_state)(cmd_buffer);
2292
2293 bool uses_drawid = (task_prog_data && task_prog_data->uses_drawid) ||
2294 mesh_prog_data->uses_drawid;
2295
2296 struct mi_builder b;
2297 mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
2298 const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &count_buffer->address);
2299 mi_builder_set_mocs(&b, mocs);
2300
2301 struct mi_value max =
2302 prepare_for_draw_count_predicate(
2303 cmd_buffer, &b,
2304 anv_address_add(count_buffer->address, countBufferOffset));
2305
2306 for (uint32_t i = 0; i < maxDrawCount; i++) {
2307 struct anv_address draw = anv_address_add(buffer->address, offset);
2308
2309 emit_draw_count_predicate_cond(cmd_buffer, &b, i, max);
2310
2311 mesh_load_indirect_parameters_3dmesh_3d(cmd_buffer, &b, draw, uses_drawid, i);
2312
2313 emit_indirect_3dmesh_3d(&cmd_buffer->batch, true, uses_drawid);
2314
2315 offset += stride;
2316 }
2317
2318 trace_intel_end_draw_mesh_indirect_count(&cmd_buffer->trace, maxDrawCount);
2319 }
2320
2321 #endif /* GFX_VERx10 >= 125 */
2322