/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"
#include "vk_render_pass.h"
#include "vk_util.h"

#include "common/intel_aux_map.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"
#include "common/intel_genX_state_brw.h"

#include "ds/intel_tracepoints.h"

/* We reserve:
 *  - GPR 14 for secondary command buffer returns
 *  - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t total_scratch)
{
#if GFX_VERx10 >= 125
   assert(cmd_buffer->state.current_pipeline == GPGPU);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;

   if (total_scratch <= comp_state->scratch_size)
      return;

   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total;

      uint32_t scratch_surf = 0xffffffff;
      if (total_scratch > 0) {
         struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(cmd_buffer->device,
                                   &cmd_buffer->device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   total_scratch);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &cmd_buffer->device->scratch_pool,
                                      total_scratch);
         cfe.ScratchSpaceBuffer = scratch_surf >> 4;
      }

      cfe.OverDispatchControl = 2; /* 50% overdispatch */
   }

   comp_state->scratch_size = total_scratch;
#else
   unreachable("Invalid call");
#endif
}

static void
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(comp_state->base.pipeline);
   const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;

   assert(pipeline->cs);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have. We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (cmd_buffer->state.compute.pipeline_dirty) {
#if GFX_VERx10 < 125
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "flush compute state");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);

#if GFX_VERx10 >= 125
      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
      genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
#endif

      /* The workgroup size of the pipeline affects our push constant layout
       * so flag push constants as dirty if we change the pipeline.
       */
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.descriptors_dirty |=
      genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                              &cmd_buffer->state.compute.base,
                                              &pipeline->base);

   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
       cmd_buffer->state.compute.pipeline_dirty) {
      genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
                                             &cmd_buffer->state.compute.base,
                                             VK_SHADER_STAGE_COMPUTE_BIT,
                                             &pipeline->cs, 1);
      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;

#if GFX_VERx10 < 125
      uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      };
      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);

      struct anv_state state =
         anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
                                      pipeline->interface_descriptor_data,
                                      GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                      64);

      uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
         mid.InterfaceDescriptorTotalLength = size;
         mid.InterfaceDescriptorDataStartAddress = state.offset;
      }
#endif
   }

   if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
      comp_state->push_data =
         anv_cmd_buffer_cs_push_constants(cmd_buffer);

#if GFX_VERx10 < 125
      if (comp_state->push_data.alloc_size) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
            curbe.CURBETotalDataLength = comp_state->push_data.alloc_size;
            curbe.CURBEDataStartAddress = comp_state->push_data.offset;
         }
      }
#endif

      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.compute.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

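/* Update the base workgroup ID push constants, flagging the compute push
 * constants dirty only when the value actually changes.
 */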
static void
anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t baseGroupX,
                                  uint32_t baseGroupY,
                                  uint32_t baseGroupZ)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   struct anv_push_constants *push =
      &cmd_buffer->state.compute.base.push_constants;
   if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;

      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }
}

#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

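/* Load the three dwords of a VkDispatchIndirectCommand from memory into the
 * command streamer's GPGPU_DISPATCHDIM{X,Y,Z} registers so a following walker
 * can use IndirectParameterEnable.
 */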
static void
compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
}

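/* The inverse of compute_load_indirect_params: write the current
 * GPGPU_DISPATCHDIM{X,Y,Z} register values out to memory. Used to fill the
 * num_work_groups sysvals for indirect internal kernel dispatches.
 */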
static void
compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
   mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
   mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
}


#if GFX_VERx10 >= 125

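/* Build the INTERFACE_DESCRIPTOR_DATA that gets embedded in COMPUTE_WALKER
 * (and EXECUTE_INDIRECT_DISPATCH) for a compute shader on Gfx12.5+.
 */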
static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_shader_bin *shader,
                              const struct brw_cs_prog_data *prog_data,
                              const struct intel_cs_dispatch_info *dispatch)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;

   return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
      .KernelStartPointer = shader->kernel.offset,
      .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
      .BindingTableEntryCount = devinfo->verx10 == 125 ?
         0 : 1 + MIN2(shader->bind_map.surface_count, 30),
      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
      .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->base.total_shared),
      .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
      .NumberOfBarriers = prog_data->uses_barrier,
   };
}

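/* Emit an EXECUTE_INDIRECT_DISPATCH for HW with indirect dispatch unrolling:
 * the command streamer reads the dispatch dimensions straight from the
 * argument buffer, so no MI loads of GPGPU_DISPATCHDIM* are required.
 */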
static inline void
emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_shader_bin *shader,
                             const struct brw_cs_prog_data *prog_data,
                             struct anv_address indirect_addr)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   assert(devinfo->has_indirect_unroll);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
   const int dispatch_size = dispatch.simd_size / 16;

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize = dispatch_size,
      .MessageSIMD = dispatch_size,
      .IndirectDataStartAddress = comp_state->push_data.offset,
      .IndirectDataLength = comp_state->push_data.alloc_size,
      .LocalXMaximum = prog_data->local_size[0] - 1,
      .LocalYMaximum = prog_data->local_size[1] - 1,
      .LocalZMaximum = prog_data->local_size[2] - 1,
      .ExecutionMask = dispatch.right_mask,
      .PostSync.MOCS = anv_mocs(cmd_buffer->device, NULL, 0),
      .InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer, shader, prog_data,
                                       &dispatch),
   };

   cmd_buffer->last_indirect_dispatch =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(EXECUTE_INDIRECT_DISPATCH_length),
         GENX(EXECUTE_INDIRECT_DISPATCH),
         .PredicateEnable = predicate,
         .MaxCount = 1,
         .COMPUTE_WALKER_BODY = body,
         .ArgumentBufferStartAddress = indirect_addr,
         .MOCS = anv_mocs(cmd_buffer->device,
                          indirect_addr.bo, 0),
      );
}

static inline void
emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                    const struct anv_compute_pipeline *pipeline, bool indirect,
                    const struct brw_cs_prog_data *prog_data,
                    uint32_t groupCountX, uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   cmd_buffer->last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable = indirect,
         .PredicateEnable = predicate,
         .SIMDSize = dispatch.simd_size / 16,
         .MessageSIMD = dispatch.simd_size / 16,
         .IndirectDataStartAddress = comp_state->push_data.offset,
         .IndirectDataLength = comp_state->push_data.alloc_size,
#if GFX_VERx10 == 125
         .SystolicModeEnable = prog_data->uses_systolic,
#endif
         .GenerateLocalID = prog_data->generate_local_id != 0,
         .EmitLocal = prog_data->generate_local_id,
         .WalkOrder = prog_data->walk_order,
         .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                       TileY32bpe : Linear,
         .LocalXMaximum = prog_data->local_size[0] - 1,
         .LocalYMaximum = prog_data->local_size[1] - 1,
         .LocalZMaximum = prog_data->local_size[2] - 1,
         .ThreadGroupIDXDimension = groupCountX,
         .ThreadGroupIDYDimension = groupCountY,
         .ThreadGroupIDZDimension = groupCountZ,
         .ExecutionMask = dispatch.right_mask,
         .PostSync = {
            .MOCS = anv_mocs(pipeline->base.device, NULL, 0),
         },
         .InterfaceDescriptor =
            get_interface_descriptor_data(cmd_buffer, pipeline->cs,
                                          prog_data, &dispatch),
      );
}

#else /* #if GFX_VERx10 >= 125 */

static inline void
emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
                  const struct anv_compute_pipeline *pipeline, bool indirect,
                  const struct brw_cs_prog_data *prog_data,
                  uint32_t groupCountX, uint32_t groupCountY,
                  uint32_t groupCountZ)
{
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable = indirect;
      ggw.PredicateEnable = predicate;
      ggw.SIMDSize = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum = 0;
      ggw.ThreadHeightCounterMaximum = 0;
      ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension = groupCountX;
      ggw.ThreadGroupIDYDimension = groupCountY;
      ggw.ThreadGroupIDZDimension = groupCountZ;
      ggw.RightExecutionMask = dispatch.right_mask;
      ggw.BottomExecutionMask = 0xffffffff;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
}

#endif /* #if GFX_VERx10 >= 125 */

static inline void
emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
               const struct anv_compute_pipeline *pipeline,
               const struct brw_cs_prog_data *prog_data,
               struct anv_address indirect_addr,
               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
   bool is_indirect = !anv_address_is_null(indirect_addr);

#if GFX_VERx10 >= 125
   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
      emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                   indirect_addr);
      return;
   }
#endif

   if (is_indirect)
      compute_load_indirect_params(cmd_buffer, indirect_addr);

#if GFX_VERx10 >= 125
   emit_compute_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                       groupCountX, groupCountY, groupCountZ);
#else
   emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                     groupCountX, groupCountY, groupCountZ);
#endif
}

void genX(CmdDispatchBase)(
    VkCommandBuffer commandBuffer,
    uint32_t baseGroupX,
    uint32_t baseGroupY,
    uint32_t baseGroupZ,
    uint32_t groupCountX,
    uint32_t groupCountY,
    uint32_t groupCountZ)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
                                     baseGroupY, baseGroupZ);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute",
                        groupCountX * groupCountY * groupCountZ *
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);

   trace_intel_begin_compute(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      struct anv_state state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
      uint32_t *sizes = state.map;
      sizes[0] = groupCountX;
      sizes[1] = groupCountY;
      sizes[2] = groupCountZ;
      cmd_buffer->state.compute.num_workgroups =
         anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
                                      state);

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data,
                  ANV_NULL_ADDRESS /* no indirect data */,
                  groupCountX, groupCountY, groupCountZ);

   trace_intel_end_compute(&cmd_buffer->trace,
                           groupCountX, groupCountY, groupCountZ);
}

void genX(CmdDispatchIndirect)(
    VkCommandBuffer commandBuffer,
    VkBuffer _buffer,
    VkDeviceSize offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct anv_address addr = anv_address_add(buffer->address, offset);
   UNUSED struct anv_batch *batch = &cmd_buffer->batch;

   anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute indirect",
                        0);
   trace_intel_begin_compute(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      cmd_buffer->state.compute.num_workgroups = addr;

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);

   trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
}

struct anv_state
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
   struct anv_device *device = cmd_buffer->device;

   struct anv_state state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                         BRW_RT_DISPATCH_GLOBALS_SIZE,
                                         64);
   struct brw_rt_scratch_layout layout;
   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                       * some cases?
                                       */
   brw_rt_compute_scratch_layout(&layout, device->info,
                                 stack_ids_per_dss, 1 << 10);

   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         /* The ray query HW computes offsets from the top of the buffer, so
          * point the address at the end of the buffer.
          */
         .bo = device->ray_query_bo,
         .offset = device->ray_query_bo->size
      },
      .AsyncRTStackSize = layout.ray_stack_stride / 64,
      .NumDSSRTStacks = layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .ResumeShaderTable = (struct anv_address) {
         .bo = cmd_buffer->state.ray_query_shadow_bo,
      },
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);

   return state;
#else
   unreachable("Not supported");
#endif
}

#if GFX_VERx10 >= 125
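/* Dispatch an internal (non-Vulkan-pipeline) kernel. The num_work_groups
 * sysvals and the kernel arguments are packed into a single indirect-data
 * allocation, and a COMPUTE_WALKER is emitted directly, either with a fixed
 * thread-group count or with IndirectParameterEnable when global_size is
 * NULL.
 */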
void
genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
                                 struct anv_kernel *kernel,
                                 const uint32_t *global_size,
                                 uint32_t arg_count,
                                 const struct anv_kernel_arg *args)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(kernel->bin->prog_data);

   genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);

   if (anv_cmd_buffer_is_render_queue(cmd_buffer))
      genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have. We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
   indirect_data_size += kernel->bin->bind_map.kernel_args_size;
   indirect_data_size = ALIGN(indirect_data_size, 64);
   struct anv_state indirect_data =
      anv_cmd_buffer_alloc_general_state(cmd_buffer,
                                         indirect_data_size, 64);
   memset(indirect_data.map, 0, indirect_data.alloc_size);

   struct brw_kernel_sysvals sysvals = {};
   if (global_size != NULL) {
      for (unsigned i = 0; i < 3; i++)
         sysvals.num_work_groups[i] = global_size[i];
      memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
   } else {
      struct anv_address sysvals_addr = {
         .bo = NULL, /* General state buffer is always 0. */
         .offset = indirect_data.offset,
      };

      compute_store_indirect_params(cmd_buffer, sysvals_addr);
   }

   void *args_map = indirect_data.map + sizeof(sysvals);
   for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
      struct brw_kernel_arg_desc *arg_desc =
         &kernel->bin->bind_map.kernel_args[i];
      assert(i < arg_count);
      const struct anv_kernel_arg *arg = &args[i];
      if (arg->is_ptr) {
         memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
      } else {
         assert(arg_desc->size <= sizeof(arg->u64));
         memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
      }
   }

   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.PredicateEnable = false;
      cw.SIMDSize = dispatch.simd_size / 16;
      cw.MessageSIMD = dispatch.simd_size / 16;
      cw.IndirectDataStartAddress = indirect_data.offset;
      cw.IndirectDataLength = indirect_data.alloc_size;
      cw.LocalXMaximum = cs_prog_data->local_size[0] - 1;
      cw.LocalYMaximum = cs_prog_data->local_size[1] - 1;
      cw.LocalZMaximum = cs_prog_data->local_size[2] - 1;
      cw.ExecutionMask = dispatch.right_mask;
      cw.PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal;

      if (global_size != NULL) {
         cw.ThreadGroupIDXDimension = global_size[0];
         cw.ThreadGroupIDYDimension = global_size[1];
         cw.ThreadGroupIDZDimension = global_size[2];
      } else {
         cw.IndirectParameterEnable = true;
      }

      cw.InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer,
                                       kernel->bin,
                                       cs_prog_data,
                                       &dispatch);
   }

   /* We just blew away the compute pipeline state */
   cmd_buffer->state.compute.pipeline_dirty = true;
}

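/* Pick a log2 local workgroup size for the raygen trampoline. Doublings are
 * handed out round-robin to dimensions that still have room
 * (1 << shift < global size) until the local group reaches 2^3 = 8
 * invocations; any leftover shift is given to X.
 */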
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   unsigned total_shift = 0;
   memset(local_shift, 0, 3);

   bool progress;
   do {
      progress = false;
      for (unsigned i = 0; i < 3; i++) {
         assert(global[i] > 0);
         if ((1 << local_shift[i]) < global[i]) {
            progress = true;
            local_shift[i]++;
            total_shift++;
         }

         if (total_shift == 3)
            return;
      }
   } while(progress);

   /* Assign whatever's left to x */
   local_shift[0] += 3 - total_shift;
}

static struct GENX(RT_SHADER_TABLE)
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
{
   return (struct GENX(RT_SHADER_TABLE)) {
      .BaseAddress = anv_address_from_u64(region->deviceAddress),
      .Stride = region->stride,
   };
}

struct trace_params {
   /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
    * with mi_builder.
    */
   bool is_sbt_indirect;
   const VkStridedDeviceAddressRegionKHR *raygen_sbt;
   const VkStridedDeviceAddressRegionKHR *miss_sbt;
   const VkStridedDeviceAddressRegionKHR *hit_sbt;
   const VkStridedDeviceAddressRegionKHR *callable_sbt;

   /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
   uint64_t indirect_sbts_addr;

   /* If is_launch_size_indirect, use launch_size_addr to program the dispatch
    * size.
    */
   bool is_launch_size_indirect;
   uint32_t launch_size[3];

   /* A pointer to a uint32_t[3] */
   uint64_t launch_size_addr;
};

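/* Allocate and fill an RT_DISPATCH_GLOBALS on the CPU, for the case where the
 * shader binding tables are known at record time. (For indirect launch sizes,
 * the Launch* fields get overwritten with MI stores later on.)
 */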
static struct anv_state
cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
                                    struct trace_params *params)
{
   assert(!params->is_sbt_indirect);
   assert(params->miss_sbt != NULL);
   assert(params->hit_sbt != NULL);
   assert(params->callable_sbt != NULL);

   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                         BRW_RT_PUSH_CONST_OFFSET +
                                         sizeof(struct anv_push_constants),
                                         64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .HitGroupTable = vk_sdar_to_shader_table(params->hit_sbt),
      .MissGroupTable = vk_sdar_to_shader_table(params->miss_sbt),
      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
      .LaunchWidth = params->launch_size[0],
      .LaunchHeight = params->launch_size[1],
      .LaunchDepth = params->launch_size[2],
      .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   return rtdg_state;
}

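/* Read an SBT address and stride out of a VkTraceRaysIndirectCommand2KHR in
 * memory and combine them, with MI ALU ops, into the packed qword layout of an
 * RT_SHADER_TABLE entry (base address in the low bits, stride in the high
 * bits).
 */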
static struct mi_value
mi_build_sbt_entry(struct mi_builder *b,
                   uint64_t addr_field_addr,
                   uint64_t stride_field_addr)
{
   return mi_ior(b,
                 mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
                         mi_imm(BITFIELD64_BIT(49) - 1)),
                 mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
                             48));
}

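/* Same as cmd_buffer_emit_rt_dispatch_globals, except that the miss/hit/
 * callable group tables come from a VkTraceRaysIndirectCommand2KHR in GPU
 * memory, so those fields are filled in with MI commands at execution time.
 */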
static struct anv_state
cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
                                             struct trace_params *params)
{
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                         BRW_RT_PUSH_CONST_OFFSET +
                                         sizeof(struct anv_push_constants),
                                         64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .SWStackSize = rt->scratch.layout.sw_stack_size / 64,
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   struct anv_address rtdg_addr =
      anv_state_pool_state_address(
         &cmd_buffer->device->dynamic_state_pool,
         rtdg_state);

   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
   const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
   mi_builder_set_mocs(&b, mocs);

   /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
    * RT_DISPATCH_GLOBALS using the mi_builder.
    */
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableStride)));

   return rtdg_state;
}

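/* Common implementation for all three vkCmdTraceRays* entry points: set up
 * RT_DISPATCH_GLOBALS, 3DSTATE_BTD and CFE_STATE, then launch the raygen
 * trampoline shader with a COMPUTE_WALKER whose inline data carries the
 * trampoline parameters.
 */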
static void
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
                      struct trace_params *params)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
   struct anv_ray_tracing_pipeline *pipeline =
      anv_pipeline_to_ray_tracing(rt->base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* If we have a known degenerate launch size, just bail */
   if (!params->is_launch_size_indirect &&
       (params->launch_size[0] == 0 ||
        params->launch_size[1] == 0 ||
        params->launch_size[2] == 0))
      return;

   trace_intel_begin_rays(&cmd_buffer->trace);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   cmd_buffer->state.rt.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                           &cmd_buffer->state.rt.base,
                                           &pipeline->base);

   /* Add these to the reloc list as they're internal buffers that don't
    * actually have relocs to pick them up manually.
    *
    * TODO(RT): This is a bit of a hack
    */
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         rt->scratch.bo);
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         cmd_buffer->device->btd_fifo_bo);

   /* Allocate and set up our RT_DISPATCH_GLOBALS */
   struct anv_state rtdg_state =
      params->is_sbt_indirect ?
      cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
      cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);

   assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
                                    sizeof(struct anv_push_constants)));
   assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
   /* Push constants go after the RT_DISPATCH_GLOBALS */
   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
          &cmd_buffer->state.rt.base.push_constants,
          sizeof(struct anv_push_constants));

   struct anv_address rtdg_addr =
      anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
                                   rtdg_state);

   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (params->is_launch_size_indirect) {
      /* Pick a local size that's probably ok. We assume most TraceRays calls
       * will use a two-dimensional dispatch size. Worst case, our initial
       * dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
      mi_builder_set_mocs(&b, mocs);

      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
               mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
               mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
               mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;

         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has
          * the semantics of shifting the entire 64-bit value and taking the
          * bottom 32 bits, so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                  mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                        local_size_log2[i]);
      }

      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);

   } else {
      calc_local_trace_size(local_size_log2, params->launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because the addition DIV_ROUND_UP
          * does to the numerator may overflow. Cast to uint64_t to avoid
          * this.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
      }
   }

#if GFX_VERx10 == 125
   /* Wa_14014427904 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(device->info) &&
       cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
      /* TODO: This is the timeout after which the bucketed thread dispatcher
       *       will kick off a wave of threads. We go with the lowest value
       *       for now. It could be tweaked on a per application basis
       *       (drirc).
       */
      btd.DispatchTimeoutCounter = _64clocks;
      /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
       * buffer must be 128KB."
       */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
      if (pipeline->base.scratch_size > 0) {
         struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   pipeline->base.scratch_size);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
                                      pipeline->base.scratch_size);
         btd.ScratchSpaceBuffer = scratch_surf >> 4;
      }
   }

   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);

   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.IndirectParameterEnable = params->is_launch_size_indirect;
      cw.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
      cw.SIMDSize = dispatch.simd_size / 16;
      cw.MessageSIMD = dispatch.simd_size / 16;
      cw.LocalXMaximum = (1 << local_size_log2[0]) - 1;
      cw.LocalYMaximum = (1 << local_size_log2[1]) - 1;
      cw.LocalZMaximum = (1 << local_size_log2[2]) - 1;
      cw.ThreadGroupIDXDimension = global_size[0];
      cw.ThreadGroupIDYDimension = global_size[1];
      cw.ThreadGroupIDZDimension = global_size[2];
      cw.ExecutionMask = 0xff;
      cw.EmitInlineParameter = true;
      cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0);

      const gl_shader_stage s = MESA_SHADER_RAYGEN;
      struct anv_device *device = cmd_buffer->device;
      struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
      struct anv_state *samplers = &cmd_buffer->state.samplers[s];
      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = device->rt_trampoline->kernel.offset,
         .SamplerStatePointer = samplers->offset,
         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
      };

      struct brw_rt_raygen_trampoline_params trampoline_params = {
         .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
         .raygen_bsr_addr =
            params->is_sbt_indirect ?
            (params->indirect_sbts_addr +
             offsetof(VkTraceRaysIndirectCommand2KHR,
                      raygenShaderRecordAddress)) :
            params->raygen_sbt->deviceAddress,
         .is_indirect = params->is_sbt_indirect,
         .local_group_size_log2 = {
            local_size_log2[0],
            local_size_log2[1],
            local_size_log2[2],
         },
      };
      STATIC_ASSERT(sizeof(trampoline_params) == 32);
      memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
   }

   trace_intel_end_rays(&cmd_buffer->trace,
                        params->launch_size[0],
                        params->launch_size[1],
                        params->launch_size[2]);
}

void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer commandBuffer,
    const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
    uint32_t width,
    uint32_t height,
    uint32_t depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = false,
      .raygen_sbt = pRaygenShaderBindingTable,
      .miss_sbt = pMissShaderBindingTable,
      .hit_sbt = pHitShaderBindingTable,
      .callable_sbt = pCallableShaderBindingTable,
      .is_launch_size_indirect = false,
      .launch_size = {
         width,
         height,
         depth,
      },
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer commandBuffer,
    const VkStridedDeviceAddressRegionKHR* pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR* pCallableShaderBindingTable,
    VkDeviceAddress indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = false,
      .raygen_sbt = pRaygenShaderBindingTable,
      .miss_sbt = pMissShaderBindingTable,
      .hit_sbt = pHitShaderBindingTable,
      .callable_sbt = pCallableShaderBindingTable,
      .is_launch_size_indirect = true,
      .launch_size_addr = indirectDeviceAddress,
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirect2KHR)(
    VkCommandBuffer commandBuffer,
    VkDeviceAddress indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect = true,
      .indirect_sbts_addr = indirectDeviceAddress,
      .is_launch_size_indirect = true,
      .launch_size_addr = indirectDeviceAddress +
                          offsetof(VkTraceRaysIndirectCommand2KHR, width),
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

#endif /* GFX_VERx10 >= 125 */