/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"

#include "common/intel_compute_slm.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"
#include "common/intel_genX_state_brw.h"

#include "ds/intel_tracepoints.h"

#include "genX_mi_builder.h"

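/* Emit CFE_STATE for compute scratch.  CFE_STATE only needs to be
 * re-emitted when the required scratch size grows beyond what this command
 * buffer has already programmed.
 */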
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t total_scratch)
{
#if GFX_VERx10 >= 125
   assert(cmd_buffer->state.current_pipeline == GPGPU);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;

   if (total_scratch <= comp_state->scratch_size)
      return;

   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads = devinfo->max_cs_threads * devinfo->subslice_total;

      uint32_t scratch_surf;
      struct anv_scratch_pool *scratch_pool =
         (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?
         &cmd_buffer->device->protected_scratch_pool :
         &cmd_buffer->device->scratch_pool;
      struct anv_bo *scratch_bo =
         anv_scratch_pool_alloc(cmd_buffer->device, scratch_pool,
                                MESA_SHADER_COMPUTE,
                                total_scratch);
      anv_reloc_list_add_bo(cmd_buffer->batch.relocs, scratch_bo);
      scratch_surf = anv_scratch_pool_get_surf(cmd_buffer->device, scratch_pool,
                                               total_scratch);
      cfe.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
#if GFX_VER >= 20
      switch (cmd_buffer->device->physical->instance->stack_ids) {
      case 256:  cfe.StackIDControl = StackIDs256;  break;
      case 512:  cfe.StackIDControl = StackIDs512;  break;
      case 1024: cfe.StackIDControl = StackIDs1024; break;
      case 2048: cfe.StackIDControl = StackIDs2048; break;
      default:   unreachable("invalid stack_ids value");
      }

#if INTEL_WA_14021821874_GFX_VER || INTEL_WA_14018813551_GFX_VER
      /* Wa_14021821874, Wa_14018813551:
       *
       * "StackIDControlOverride_RTGlobals = 0 (i.e. 2k)". We
       * already set stack size per ray to 64 in brw_nir_lower_rt_intrinsics
       * as the workaround also requires.
       */
      if (intel_needs_workaround(cmd_buffer->device->info, 14021821874) ||
          intel_needs_workaround(cmd_buffer->device->info, 14018813551))
         cfe.StackIDControl = StackIDs2048;
#endif

#endif

      cfe.OverDispatchControl = 2; /* 50% overdispatch */
   }

   comp_state->scratch_size = total_scratch;
#else
   unreachable("Invalid call");
#endif
}

static void
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(comp_state->base.pipeline);
   const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;

   assert(pipeline->cs);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_descriptor_buffers)(cmd_buffer, &comp_state->base);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have. We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (cmd_buffer->state.compute.pipeline_dirty) {
#if GFX_VERx10 < 125
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "flush compute state");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);

#if GFX_VERx10 >= 125
      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
      genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
#endif

      /* Changing the pipeline affects the push constants layout (different
       * amounts of cross-thread/per-thread allocations). The allocation is
       * also bounded to just the amount consumed by the pipeline (see
       * anv_cmd_buffer_cs_push_constants). So we force the reallocation for
       * every pipeline change.
       *
       * On Gfx12.0 we're also seeing failures in the dEQP-VK.memory_model.*
       * tests when run in parallel. This is likely a HW issue with push
       * constants & context save/restore.
       *
       * TODO: optimize this on Gfx12.5+ where the shader is not using
       * per-thread allocations and is also pulling the data using SEND
       * messages. We should be able to limit reallocations to only when the
       * data actually changes.
       */
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
      comp_state->base.push_constants_data_dirty = true;
   }

   cmd_buffer->state.descriptors_dirty |=
      genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                              &cmd_buffer->state.compute.base,
                                              &pipeline->base);

   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
       cmd_buffer->state.compute.pipeline_dirty) {
      genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
                                             &cmd_buffer->state.compute.base,
                                             VK_SHADER_STAGE_COMPUTE_BIT,
                                             &pipeline->cs, 1);
      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;

#if GFX_VERx10 < 125
      uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      };
      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);

      struct anv_state state =
         anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
                                      pipeline->interface_descriptor_data,
                                      GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                      64);

      uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
         mid.InterfaceDescriptorTotalLength = size;
         mid.InterfaceDescriptorDataStartAddress = state.offset;
      }
#endif
   }

   if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {

      if (comp_state->base.push_constants_state.alloc_size == 0 ||
          comp_state->base.push_constants_data_dirty) {
         comp_state->base.push_constants_state =
            anv_cmd_buffer_cs_push_constants(cmd_buffer);
         comp_state->base.push_constants_data_dirty = false;
      }

#if GFX_VERx10 < 125
      if (comp_state->base.push_constants_state.alloc_size) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
            curbe.CURBETotalDataLength = comp_state->base.push_constants_state.alloc_size;
            curbe.CURBEDataStartAddress = comp_state->base.push_constants_state.offset;
         }
      }
#endif

      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.compute.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

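/* Update the base workgroup ID and, on gens where the shader reads it
 * through push constants, the workgroup count.  For indirect dispatches the
 * count is not known on the CPU, so a UINT32_MAX marker followed by the
 * 64-bit address of the indirect parameters is stored instead.  Push
 * constants are only flagged dirty when a value actually changed.
 */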
static void
anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
                               const struct brw_cs_prog_data *prog_data,
                               uint32_t baseGroupX,
                               uint32_t baseGroupY,
                               uint32_t baseGroupZ,
                               uint32_t groupCountX,
                               uint32_t groupCountY,
                               uint32_t groupCountZ,
                               struct anv_address indirect_group)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   struct anv_push_constants *push =
      &cmd_buffer->state.compute.base.push_constants;
   bool updated = false;
   if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;
      updated = true;
   }

   /* On Gfx12.5+ this value goes into the inline parameter register */
   if (GFX_VERx10 < 125 && prog_data->uses_num_work_groups) {
      if (anv_address_is_null(indirect_group)) {
         if (push->cs.num_work_groups[0] != groupCountX ||
             push->cs.num_work_groups[1] != groupCountY ||
             push->cs.num_work_groups[2] != groupCountZ) {
            push->cs.num_work_groups[0] = groupCountX;
            push->cs.num_work_groups[1] = groupCountY;
            push->cs.num_work_groups[2] = groupCountZ;
            updated = true;
         }
      } else {
         uint64_t addr64 = anv_address_physical(indirect_group);
         uint32_t lower_addr32 = addr64 & 0xffffffff;
         uint32_t upper_addr32 = addr64 >> 32;
         if (push->cs.num_work_groups[0] != UINT32_MAX ||
             push->cs.num_work_groups[1] != lower_addr32 ||
             push->cs.num_work_groups[2] != upper_addr32) {
            push->cs.num_work_groups[0] = UINT32_MAX;
            push->cs.num_work_groups[1] = lower_addr32;
            push->cs.num_work_groups[2] = upper_addr32;
            updated = true;
         }
      }
   }

   if (updated) {
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
      cmd_buffer->state.compute.base.push_constants_data_dirty = true;
   }
}

#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

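/* Load the indirect dispatch dimensions from memory into the
 * GPGPU_DISPATCHDIM{X,Y,Z} registers using the MI builder.  When
 * is_unaligned_size_x is set, the first dword holds a thread invocation
 * count rather than a group count and is converted before being stored.
 */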
static void
compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_address indirect_addr,
                             bool is_unaligned_size_x)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));

   /* Convert the unaligned thread invocation count into an aligned thread
    * group count in the X dimension for unaligned shader dispatches during
    * the ray tracing phase.
    */
   if (is_unaligned_size_x) {
      const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
      mi_builder_set_mocs(&b, mocs);

      struct anv_compute_pipeline *pipeline =
         anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

      assert(util_is_power_of_two_or_zero(prog_data->local_size[0]));
      size_x = mi_udiv32_imm(&b, size_x, prog_data->local_size[0]);
      size_x = mi_iadd(&b, size_x, mi_imm(1));
   }

   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
}

static void
compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
   mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
   mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
}


#if GFX_VERx10 >= 125

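/* Build the INTERFACE_DESCRIPTOR_DATA embedded in the compute walker:
 * kernel entry point, binding table and sampler state pointers, SLM
 * allocation and thread count for the given dispatch configuration.
 */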
static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_shader_bin *shader,
                              const struct brw_cs_prog_data *prog_data,
                              const struct intel_cs_dispatch_info *dispatch)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;

   return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
      .SamplerCount = DIV_ROUND_UP(CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
      .KernelStartPointer = shader->kernel.offset,
      .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
      .BindingTableEntryCount = devinfo->verx10 == 125 ?
         0 : MIN2(shader->bind_map.surface_count, 30),
      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
      .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
      .PreferredSLMAllocationSize =
         intel_compute_preferred_slm_calc_encode_size(devinfo,
                                                      prog_data->base.total_shared,
                                                      dispatch->group_size,
                                                      dispatch->simd_size),
      .NumberOfBarriers = prog_data->uses_barrier,
#if GFX_VER >= 30
      .RegistersPerThread = ptl_register_blocks(prog_data->base.grf_used),
#endif
   };
}

static inline void
emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_shader_bin *shader,
                             const struct brw_cs_prog_data *prog_data,
                             struct anv_address indirect_addr)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   assert(devinfo->has_indirect_unroll);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
   const int dispatch_size = dispatch.simd_size / 16;

   uint64_t indirect_addr64 = anv_address_physical(indirect_addr);

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize = dispatch_size,
      /* HSD 14016252163: Use of Morton walk order (and batching using a batch
       * size of 4) is expected to increase sampler cache hit rates by
       * increasing sample address locality within a subslice.
       */
#if GFX_VER >= 30
      .DispatchWalkOrder = prog_data->uses_sampler ?
         MortonWalk :
         LinearWalk,
      .ThreadGroupBatchSize = prog_data->uses_sampler ? TG_BATCH_4 :
         TG_BATCH_1,
#endif
      .MessageSIMD = dispatch_size,
      .IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
      .IndirectDataLength = comp_state->base.push_constants_state.alloc_size,
      .GenerateLocalID = prog_data->generate_local_id != 0,
      .EmitLocal = prog_data->generate_local_id,
      .WalkOrder = prog_data->walk_order,
      .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
         TileY32bpe : Linear,
      .LocalXMaximum = prog_data->local_size[0] - 1,
      .LocalYMaximum = prog_data->local_size[1] - 1,
      .LocalZMaximum = prog_data->local_size[2] - 1,
      .ExecutionMask = dispatch.right_mask,
      .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
      .InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer, shader, prog_data,
                                       &dispatch),
      .EmitInlineParameter = prog_data->uses_inline_data,
      .InlineData = {
         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = UINT32_MAX,
         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = indirect_addr64 & 0xffffffff,
         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = indirect_addr64 >> 32,
      },
   };

   cmd_buffer->state.last_indirect_dispatch =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(EXECUTE_INDIRECT_DISPATCH_length),
         GENX(EXECUTE_INDIRECT_DISPATCH),
         .PredicateEnable = predicate,
         .MaxCount = 1,
         .COMPUTE_WALKER_BODY = body,
         .ArgumentBufferStartAddress = indirect_addr,
         .MOCS = anv_mocs(cmd_buffer->device,
                          indirect_addr.bo, 0),
      );
}

static inline void
emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                    const struct anv_compute_pipeline *pipeline,
                    struct anv_address indirect_addr,
                    const struct brw_cs_prog_data *prog_data,
                    struct intel_cs_dispatch_info dispatch,
                    uint32_t groupCountX, uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   uint32_t num_workgroup_data[3];
   if (!anv_address_is_null(indirect_addr)) {
      uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
      num_workgroup_data[0] = UINT32_MAX;
      num_workgroup_data[1] = indirect_addr64 & 0xffffffff;
      num_workgroup_data[2] = indirect_addr64 >> 32;
   } else {
      num_workgroup_data[0] = groupCountX;
      num_workgroup_data[1] = groupCountY;
      num_workgroup_data[2] = groupCountZ;
   }

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize = dispatch.simd_size / 16,
      .MessageSIMD = dispatch.simd_size / 16,
      .IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
      .IndirectDataLength = comp_state->base.push_constants_state.alloc_size,
      .GenerateLocalID = prog_data->generate_local_id != 0,
      .EmitLocal = prog_data->generate_local_id,
      .WalkOrder = prog_data->walk_order,
      .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
         TileY32bpe : Linear,
      .LocalXMaximum = prog_data->local_size[0] - 1,
      .LocalYMaximum = prog_data->local_size[1] - 1,
      .LocalZMaximum = prog_data->local_size[2] - 1,
      .ThreadGroupIDXDimension = groupCountX,
      .ThreadGroupIDYDimension = groupCountY,
      .ThreadGroupIDZDimension = groupCountZ,
      .ExecutionMask = dispatch.right_mask,
      .PostSync = {
         .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
      },
      .InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer, pipeline->cs,
                                       prog_data, &dispatch),
      .EmitInlineParameter = prog_data->uses_inline_data,
      .InlineData = {
         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0],
         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1],
         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2],
      }
   };

   cmd_buffer->state.last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable = !anv_address_is_null(indirect_addr),
         .PredicateEnable = predicate,
         .body = body,
#if GFX_VERx10 == 125
         .SystolicModeEnable = prog_data->uses_systolic,
#endif
      );
}

#else /* #if GFX_VERx10 >= 125 */

static inline void
emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
                  const struct anv_compute_pipeline *pipeline, bool indirect,
                  const struct brw_cs_prog_data *prog_data,
                  uint32_t groupCountX, uint32_t groupCountY,
                  uint32_t groupCountZ)
{
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = indirect;
      ggw.PredicateEnable = predicate;
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension = groupCountX;
      ggw.ThreadGroupIDYDimension = groupCountY;
      ggw.ThreadGroupIDZDimension = groupCountZ;
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
}

#endif /* #if GFX_VERx10 >= 125 */

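/* Common dispatch path: use the HW indirect-dispatch unroll when it is
 * available and no MI_MATH fixup is needed, otherwise load the indirect
 * parameters into the dispatch registers and emit a COMPUTE_WALKER
 * (Gfx12.5+) or GPGPU_WALKER.
 */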
static inline void
emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
               const struct anv_compute_pipeline *pipeline,
               const struct brw_cs_prog_data *prog_data,
               struct intel_cs_dispatch_info dispatch,
               struct anv_address indirect_addr,
               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
               bool is_unaligned_size_x)
{
   bool is_indirect = !anv_address_is_null(indirect_addr);

#if GFX_VERx10 >= 125
   /* For unaligned dispatch, we need to tweak the dispatch value with
    * MI_MATH, so we can't use indirect HW instructions.
    */
   if (is_indirect && !is_unaligned_size_x &&
       cmd_buffer->device->info->has_indirect_unroll) {
      emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                   indirect_addr);
      return;
   }
#endif

   if (is_indirect)
      compute_load_indirect_params(cmd_buffer, indirect_addr,
                                   is_unaligned_size_x);

#if GFX_VERx10 >= 125
   emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
                       dispatch, groupCountX, groupCountY, groupCountZ);
#else
   emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                     groupCountX, groupCountY, groupCountZ);
#endif
}

void genX(CmdDispatchBase)(
    VkCommandBuffer commandBuffer,
    uint32_t baseGroupX,
    uint32_t baseGroupY,
    uint32_t baseGroupZ,
    uint32_t groupCountX,
    uint32_t groupCountY,
    uint32_t groupCountZ)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
                                  baseGroupX, baseGroupY, baseGroupZ,
                                  groupCountX, groupCountY, groupCountZ,
                                  ANV_NULL_ADDRESS);

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute",
                        groupCountX * groupCountY * groupCountZ *
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);

   if (cmd_buffer->state.rt.debug_marker_count == 0)
      trace_intel_begin_compute(&cmd_buffer->trace);

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch,
                  ANV_NULL_ADDRESS /* no indirect data */,
                  groupCountX, groupCountY, groupCountZ,
                  false);

   if (cmd_buffer->state.rt.debug_marker_count == 0) {
      trace_intel_end_compute(&cmd_buffer->trace,
                              groupCountX, groupCountY, groupCountZ,
                              pipeline->source_hash);
   }
}

static void
emit_unaligned_cs_walker(
   VkCommandBuffer commandBuffer,
   uint32_t baseGroupX,
   uint32_t baseGroupY,
   uint32_t baseGroupZ,
   uint32_t groupCountX,
   uint32_t groupCountY,
   uint32_t groupCountZ,
   struct intel_cs_dispatch_info dispatch)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
                                  baseGroupX, baseGroupY, baseGroupZ,
                                  groupCountX, groupCountY, groupCountZ,
                                  ANV_NULL_ADDRESS);

   /* RT shaders always have their Y and Z local sizes set to 1. */
   assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);

   /* RT shaders are always dispatched with group counts Y and Z set to 1. */
   assert(groupCountY == 1 && groupCountZ == 1);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute-unaligned-cs-walker",
                        groupCountX * groupCountY * groupCountZ *
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);

   if (cmd_buffer->state.rt.debug_marker_count == 0)
      trace_intel_begin_compute(&cmd_buffer->trace);

   assert(!prog_data->uses_num_work_groups);
   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

#if GFX_VERx10 >= 125
   emit_compute_walker(cmd_buffer, pipeline, ANV_NULL_ADDRESS, prog_data,
                       dispatch, groupCountX, groupCountY, groupCountZ);
#endif

   if (cmd_buffer->state.rt.debug_marker_count == 0) {
      trace_intel_end_compute(&cmd_buffer->trace,
                              groupCountX, groupCountY, groupCountZ,
                              pipeline->source_hash);
   }
}

/*
 * Dispatch a compute work item with an unaligned thread invocation count.
 *
 * This helper takes unaligned thread invocation counts, converts them into
 * an aligned thread group count and dispatches the compute work items.
 *
 * We launch two CS walkers: one for the aligned part and another with a
 * single group for the remaining thread invocations.
 *
 * This function is now specifically for BVH building.
 */
void
genX(cmd_dispatch_unaligned)(
   VkCommandBuffer commandBuffer,
   uint32_t invocations_x,
   uint32_t invocations_y,
   uint32_t invocations_z)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   /* Group X can be unaligned for RT dispatches. */
   uint32_t groupCountX = invocations_x / prog_data->local_size[0];
   uint32_t groupCountY = invocations_y;
   uint32_t groupCountZ = invocations_z;

   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);

   /* Launch first CS walker with aligned group count X. */
   if (groupCountX) {
      emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
                               groupCountY, groupCountZ, dispatch);
   }

   uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
   if (unaligned_invocations_x) {
      dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
                                      dispatch.simd_size);

      /* Make sure the 2nd walker has the same number of invocations per
       * workgroup as the 1st walker, so that gl_GlobalInvocationID can be
       * calculated correctly with baseGroup.
       */
      assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);

      const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
      if (remainder > 0) {
         dispatch.right_mask = ~0u >> (32 - remainder);
      } else {
         dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
      }

      /* Launch second CS walker for unaligned part. */
      emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
                               dispatch);
   }
}

/*
 * This dispatches a compute work item with indirect parameters.
 * The helper also aligns unaligned thread invocation counts.
 */
void
genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
                                   struct anv_address indirect_addr,
                                   bool is_unaligned_size_x)
{
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   UNUSED struct anv_batch *batch = &cmd_buffer->batch;
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
                                  0, 0, 0, 0, 0, 0, indirect_addr);

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute indirect",
                        0);

   if (cmd_buffer->state.rt.debug_marker_count == 0)
      trace_intel_begin_compute_indirect(&cmd_buffer->trace);

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0,
                  0, 0, is_unaligned_size_x);

   if (cmd_buffer->state.rt.debug_marker_count == 0) {
      trace_intel_end_compute_indirect(&cmd_buffer->trace,
                                       anv_address_utrace(indirect_addr),
                                       pipeline->source_hash);
   }
}

void genX(CmdDispatchIndirect)(
    VkCommandBuffer commandBuffer,
    VkBuffer _buffer,
    VkDeviceSize offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_address addr = anv_address_add(buffer->address, offset);

   genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
}

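/* Allocate and fill a temporary RT_DISPATCH_GLOBALS used for ray queries.
 * The memory base points at the end of the device's ray query BO because
 * the HW computes offsets from the top of the buffer.
 */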
struct anv_address
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
   struct anv_device *device = cmd_buffer->device;

   struct anv_state state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
   struct brw_rt_scratch_layout layout;
   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                       * some cases?
                                       */
   brw_rt_compute_scratch_layout(&layout, device->info,
                                 stack_ids_per_dss, 1 << 10);

   uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);

   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         /* The ray query HW computes offsets from the top of the buffer, so
          * set the address to the end of the buffer.
          */
         .bo = device->ray_query_bo[idx],
         .offset = device->ray_query_bo[idx]->size
      },
      .AsyncRTStackSize = layout.ray_stack_stride / 64,
      .NumDSSRTStacks = layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .ResumeShaderTable = (struct anv_address) {
         .bo = cmd_buffer->state.ray_query_shadow_bo,
      },
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);

   return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
#else
   unreachable("Not supported");
#endif
}

#if GFX_VERx10 >= 125
void
genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
                                 struct anv_kernel *kernel,
                                 const uint32_t *global_size,
                                 uint32_t arg_count,
                                 const struct anv_kernel_arg *args)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(kernel->bin->prog_data);

   genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have. We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

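   /* The walker's indirect data holds a brw_kernel_sysvals structure
    * followed by the packed kernel arguments, aligned to 64 bytes.
    */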
   uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
   indirect_data_size += kernel->bin->bind_map.kernel_args_size;
   indirect_data_size = ALIGN(indirect_data_size, 64);
   struct anv_state indirect_data =
      anv_cmd_buffer_alloc_general_state(cmd_buffer,
                                         indirect_data_size, 64);
   memset(indirect_data.map, 0, indirect_data.alloc_size);

   struct brw_kernel_sysvals sysvals = {};
   if (global_size != NULL) {
      for (unsigned i = 0; i < 3; i++)
         sysvals.num_work_groups[i] = global_size[i];
      memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
   } else {
      struct anv_address sysvals_addr = {
         .bo = NULL, /* General state buffer is always 0. */
         .offset = indirect_data.offset,
      };

      compute_store_indirect_params(cmd_buffer, sysvals_addr);
   }

   void *args_map = indirect_data.map + sizeof(sysvals);
   for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
      struct brw_kernel_arg_desc *arg_desc =
         &kernel->bin->bind_map.kernel_args[i];
      assert(i < arg_count);
      const struct anv_kernel_arg *arg = &args[i];
      if (arg->is_ptr) {
         memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
      } else {
         assert(arg_desc->size <= sizeof(arg->u64));
         memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
      }
   }

   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize = dispatch.simd_size / 16,
      .MessageSIMD = dispatch.simd_size / 16,
      .IndirectDataStartAddress = indirect_data.offset,
      .IndirectDataLength = indirect_data.alloc_size,
      .LocalXMaximum = cs_prog_data->local_size[0] - 1,
      .LocalYMaximum = cs_prog_data->local_size[1] - 1,
      .LocalZMaximum = cs_prog_data->local_size[2] - 1,
      .ExecutionMask = dispatch.right_mask,
      .PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal,
      .InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer,
                                       kernel->bin,
                                       cs_prog_data,
                                       &dispatch),
   };

   if (global_size != NULL) {
      body.ThreadGroupIDXDimension = global_size[0];
      body.ThreadGroupIDYDimension = global_size[1];
      body.ThreadGroupIDZDimension = global_size[2];
   }

   cmd_buffer->state.last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable = global_size == NULL,
         .PredicateEnable = false,
         .body = body,
      );

   /* We just blew away the compute pipeline state */
   cmd_buffer->state.compute.pipeline_dirty = true;
}

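/* Pick a power-of-two local workgroup size totalling 2^3 = 8 invocations,
 * handing out factors of two to the dimensions that still have room and
 * assigning whatever is left to X.
 */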
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   unsigned total_shift = 0;
   memset(local_shift, 0, 3);

   bool progress;
   do {
      progress = false;
      for (unsigned i = 0; i < 3; i++) {
         assert(global[i] > 0);
         if ((1 << local_shift[i]) < global[i]) {
            progress = true;
            local_shift[i]++;
            total_shift++;
         }

         if (total_shift == 3)
            return;
      }
   } while(progress);

   /* Assign whatever's left to x */
   local_shift[0] += 3 - total_shift;
}

static struct GENX(RT_SHADER_TABLE)
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
{
   return (struct GENX(RT_SHADER_TABLE)) {
      .BaseAddress = anv_address_from_u64(region->deviceAddress),
      .Stride = region->stride,
   };
}

struct trace_params {
   /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
    * with mi_builder.
    */
   bool is_sbt_indirect;
   const VkStridedDeviceAddressRegionKHR *raygen_sbt;
   const VkStridedDeviceAddressRegionKHR *miss_sbt;
   const VkStridedDeviceAddressRegionKHR *hit_sbt;
   const VkStridedDeviceAddressRegionKHR *callable_sbt;

   /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
   uint64_t indirect_sbts_addr;

   /* If is_launch_size_indirect, use launch_size_addr to program the
    * dispatch size.
    */
   bool is_launch_size_indirect;
   uint32_t launch_size[3];

   /* A pointer to a uint32_t[3] */
   uint64_t launch_size_addr;
};

static struct anv_state
cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
                                    struct trace_params *params)
{
   assert(!params->is_sbt_indirect);
   assert(params->miss_sbt != NULL);
   assert(params->hit_sbt != NULL);
   assert(params->callable_sbt != NULL);

   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_PUSH_CONST_OFFSET +
                                           sizeof(struct anv_push_constants),
                                           64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .HitGroupTable = vk_sdar_to_shader_table(params->hit_sbt),
      .MissGroupTable = vk_sdar_to_shader_table(params->miss_sbt),
      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
      .LaunchWidth = params->launch_size[0],
      .LaunchHeight = params->launch_size[1],
      .LaunchDepth = params->launch_size[2],
      .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   return rtdg_state;
}

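/* Build one shader binding table entry of RT_DISPATCH_GLOBALS on the
 * command streamer: the base address read from addr_field_addr is masked
 * down to the bits used by the HW and OR'ed with the 32-bit stride read
 * from stride_field_addr shifted into the top bits.
 */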
static struct mi_value
mi_build_sbt_entry(struct mi_builder *b,
                   uint64_t addr_field_addr,
                   uint64_t stride_field_addr)
{
   return mi_ior(b,
                 mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
                         mi_imm(BITFIELD64_BIT(49) - 1)),
                 mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
                             48));
}

static struct anv_state
cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
                                             struct trace_params *params)
{
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
                                           BRW_RT_PUSH_CONST_OFFSET +
                                           sizeof(struct anv_push_constants),
                                           64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   struct anv_address rtdg_addr =
      anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);

   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
   const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
   mi_builder_set_mocs(&b, mocs);
   mi_builder_set_write_check(&b, true);

   /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
    * RT_DISPATCH_GLOBALS using the mi_builder.
    */
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableStride)));

   return rtdg_state;
}

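/* Emit a full ray tracing dispatch: build RT_DISPATCH_GLOBALS (directly, or
 * with the MI builder when the SBTs are indirect), program 3DSTATE_BTD and
 * the CFE scratch space, then launch the raygen trampoline shader with a
 * COMPUTE_WALKER.
 */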
static void
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
                      struct trace_params *params)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
   struct anv_ray_tracing_pipeline *pipeline =
      anv_pipeline_to_ray_tracing(rt->base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* If we have a known degenerate launch size, just bail */
   if (!params->is_launch_size_indirect &&
       (params->launch_size[0] == 0 ||
        params->launch_size[1] == 0 ||
        params->launch_size[2] == 0))
      return;

   trace_intel_begin_rays(&cmd_buffer->trace);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));

   genX(flush_descriptor_buffers)(cmd_buffer, &rt->base);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   cmd_buffer->state.rt.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                           &cmd_buffer->state.rt.base,
                                           &pipeline->base);

   /* Add these to the reloc list as they're internal buffers that don't
    * actually have relocs to pick them up manually.
    *
    * TODO(RT): This is a bit of a hack
    */
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         rt->scratch.bo);
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         cmd_buffer->device->btd_fifo_bo);

   /* Allocate and set up our RT_DISPATCH_GLOBALS */
   struct anv_state rtdg_state =
      params->is_sbt_indirect ?
      cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
      cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);

   assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
                                    sizeof(struct anv_push_constants)));
   assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
   /* Push constants go after the RT_DISPATCH_GLOBALS */
   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
          &cmd_buffer->state.rt.base.push_constants,
          sizeof(struct anv_push_constants));

   struct anv_address rtdg_addr =
      anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);

   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (params->is_launch_size_indirect) {
      /* Pick a local size that's probably ok. We assume most TraceRays calls
       * will use a two-dimensional dispatch size. Worst case, our initial
       * dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
      mi_builder_set_mocs(&b, mocs);
      mi_builder_set_write_check(&b, true);

      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
               mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
               mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
               mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;

         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has
          * the semantics of shifting the entire 64-bit value and taking the
          * bottom 32 bits, so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                  mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                        local_size_log2[i]);
      }

      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);

   } else {
      calc_local_trace_size(local_size_log2, params->launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because the addition
          * DIV_ROUND_UP does to the numerator may overflow. Cast to uint64_t
          * to avoid this.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
      }
   }

#if GFX_VERx10 == 125
   /* Wa_14014427904 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(device->info) &&
       cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
      /* TODO: This is the timeout after which the bucketed thread dispatcher
       *       will kick off a wave of threads. We go with the lowest value
       *       for now. It could be tweaked on a per-application basis
       *       (drirc).
       */
      btd.DispatchTimeoutCounter = _64clocks;
      /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
       * buffer must be 128KB."
       */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
      if (pipeline->base.scratch_size > 0) {
         struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   pipeline->base.scratch_size);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
                                      pipeline->base.scratch_size);
         btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
      }
#if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
      btd.BTDMidthreadpreemption = false;
#endif
   }

   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);

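   /* Dispatch the raygen trampoline shader with a COMPUTE_WALKER.  The RT
    * dispatch globals address, the raygen BSR address and the local group
    * size are passed through the walker's inline data (see the memcpy of
    * trampoline_params below).
    */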
   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);

   const gl_shader_stage s = MESA_SHADER_RAYGEN;
   struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
   struct anv_state *samplers = &cmd_buffer->state.samplers[s];
   struct brw_rt_raygen_trampoline_params trampoline_params = {
      .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
      .raygen_bsr_addr =
         params->is_sbt_indirect ?
         (params->indirect_sbts_addr +
          offsetof(VkTraceRaysIndirectCommand2KHR,
                   raygenShaderRecordAddress)) :
         params->raygen_sbt->deviceAddress,
      .is_indirect = params->is_sbt_indirect,
      .local_group_size_log2 = {
         local_size_log2[0],
         local_size_log2[1],
         local_size_log2[2],
      },
   };

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize = dispatch.simd_size / 16,
      .MessageSIMD = dispatch.simd_size / 16,
      .LocalXMaximum = (1 << local_size_log2[0]) - 1,
      .LocalYMaximum = (1 << local_size_log2[1]) - 1,
      .LocalZMaximum = (1 << local_size_log2[2]) - 1,
      .ThreadGroupIDXDimension = global_size[0],
      .ThreadGroupIDYDimension = global_size[1],
      .ThreadGroupIDZDimension = global_size[2],
      .ExecutionMask = 0xff,
      .EmitInlineParameter = true,
      .PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0),

      .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = device->rt_trampoline->kernel.offset,
         .SamplerStatePointer = samplers->offset,
         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
#if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
         .ThreadPreemption = false,
#endif
#if GFX_VER >= 30
         .RegistersPerThread = ptl_register_blocks(cs_prog_data->base.grf_used),
#endif
      },
   };

   STATIC_ASSERT(sizeof(trampoline_params) == 32);
   memcpy(body.InlineData, &trampoline_params, sizeof(trampoline_params));

   cmd_buffer->state.last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable = params->is_launch_size_indirect,
         .PredicateEnable = cmd_buffer->state.conditional_render_enabled,
         .body = body,
      );

   trace_intel_end_rays(&cmd_buffer->trace,
                        params->launch_size[0],
                        params->launch_size[1],
                        params->launch_size[2]);
}

void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer commandBuffer,
    const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
    uint32_t width,
    uint32_t height,
    uint32_t depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = false,
      .raygen_sbt = pRaygenShaderBindingTable,
      .miss_sbt = pMissShaderBindingTable,
      .hit_sbt = pHitShaderBindingTable,
      .callable_sbt = pCallableShaderBindingTable,
      .is_launch_size_indirect = false,
      .launch_size = {
         width,
         height,
         depth,
      },
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer commandBuffer,
    const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
    VkDeviceAddress indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = false,
      .raygen_sbt = pRaygenShaderBindingTable,
      .miss_sbt = pMissShaderBindingTable,
      .hit_sbt = pHitShaderBindingTable,
      .callable_sbt = pCallableShaderBindingTable,
      .is_launch_size_indirect = true,
      .launch_size_addr = indirectDeviceAddress,
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirect2KHR)(
    VkCommandBuffer commandBuffer,
    VkDeviceAddress indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = true,
      .indirect_sbts_addr = indirectDeviceAddress,
      .is_launch_size_indirect = true,
      .launch_size_addr = indirectDeviceAddress +
                          offsetof(VkTraceRaysIndirectCommand2KHR, width),
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

#endif /* GFX_VERx10 >= 125 */