/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>

#include "anv_private.h"
#include "anv_measure.h"
#include "vk_render_pass.h"
#include "vk_util.h"

#include "common/intel_aux_map.h"
#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"
#include "genxml/genX_rt_pack.h"
#include "common/intel_genX_state_brw.h"

#include "ds/intel_tracepoints.h"

/* We reserve:
 *    - GPR 14 for secondary command buffer returns
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"

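/* Re-emit CFE_STATE when a dispatch needs more scratch space than the
 * current CFE_STATE covers. Scratch only ever grows within a command
 * buffer, so nothing is emitted when total_scratch is already covered.
 * Gfx12.5+ only; CFE_STATE takes over the role MEDIA_VFE_STATE plays on
 * older gens.
 */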
void
genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t total_scratch)
{
#if GFX_VERx10 >= 125
   assert(cmd_buffer->state.current_pipeline == GPGPU);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;

   if (total_scratch <= comp_state->scratch_size)
      return;

   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
      cfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * devinfo->subslice_total;

      uint32_t scratch_surf = 0xffffffff;
      if (total_scratch > 0) {
         struct anv_bo *scratch_bo =
               anv_scratch_pool_alloc(cmd_buffer->device,
                                      &cmd_buffer->device->scratch_pool,
                                      MESA_SHADER_COMPUTE,
                                      total_scratch);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &cmd_buffer->device->scratch_pool,
                                      total_scratch);
         cfe.ScratchSpaceBuffer = scratch_surf >> 4;
      }

      cfe.OverDispatchControl = 2; /* 50% overdispatch */
   }

   comp_state->scratch_size = total_scratch;
#else
   unreachable("Invalid call");
#endif
}

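/* Flush all dirty compute state to the batch before a dispatch: L3
 * configuration, pipeline selection, the pipeline's own batch, descriptor
 * sets and push constants.
 */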
static void
genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
{
   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(comp_state->base.pipeline);
   const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;

   assert(pipeline->cs);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);

   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   if (cmd_buffer->state.compute.pipeline_dirty) {
#if GFX_VERx10 < 125
      /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
       *
       *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
       *    the only bits that are changed are scoreboard related: Scoreboard
       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
       *    sufficient."
       */
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "flush compute state");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
#endif

      anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);

#if GFX_VERx10 >= 125
      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
      genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
#endif

      /* The workgroup size of the pipeline affects our push constant layout
       * so flag push constants as dirty if we change the pipeline.
       */
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.descriptors_dirty |=
      genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                              &cmd_buffer->state.compute.base,
                                              &pipeline->base);

   if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
       cmd_buffer->state.compute.pipeline_dirty) {
      genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
                                             &cmd_buffer->state.compute.base,
                                             VK_SHADER_STAGE_COMPUTE_BIT,
                                             &pipeline->cs, 1);
      cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;

#if GFX_VERx10 < 125
      uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
      struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
         .BindingTablePointer =
            cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
         .SamplerStatePointer =
            cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      };
      GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);

      struct anv_state state =
         anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
                                      pipeline->interface_descriptor_data,
                                      GENX(INTERFACE_DESCRIPTOR_DATA_length),
                                      64);

      uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      anv_batch_emit(&cmd_buffer->batch,
                     GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
         mid.InterfaceDescriptorTotalLength        = size;
         mid.InterfaceDescriptorDataStartAddress   = state.offset;
      }
#endif
   }

   if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
      comp_state->push_data =
         anv_cmd_buffer_cs_push_constants(cmd_buffer);

#if GFX_VERx10 < 125
      if (comp_state->push_data.alloc_size) {
         anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
            curbe.CURBETotalDataLength    = comp_state->push_data.alloc_size;
            curbe.CURBEDataStartAddress   = comp_state->push_data.offset;
         }
      }
#endif

      cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
   }

   cmd_buffer->state.compute.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
}

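/* Record the base workgroup ID (from vkCmdDispatchBase) in the compute push
 * constants, flagging them dirty only if the value actually changed.
 */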
static void
anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
                                  uint32_t baseGroupX,
                                  uint32_t baseGroupY,
                                  uint32_t baseGroupZ)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   struct anv_push_constants *push =
      &cmd_buffer->state.compute.base.push_constants;
   if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;

      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }
}

#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

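/* Load the X/Y/Z workgroup counts of an indirect dispatch from memory into
 * the GPGPU_DISPATCHDIM* registers, where the walker commands pick them up
 * when IndirectParameterEnable is set.
 */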
static void
compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
   mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
}

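/* The inverse of compute_load_indirect_params: copy the workgroup counts
 * currently held in the GPGPU_DISPATCHDIM* registers out to memory.
 */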
static void
compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_address indirect_addr)
{
   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);

   struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
   struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
   struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));

   mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
   mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
   mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
}


#if GFX_VERx10 >= 125

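/* Build the INTERFACE_DESCRIPTOR_DATA embedded in COMPUTE_WALKER, pointing
 * the hardware at the kernel, samplers, binding table and SLM configuration.
 */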
static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
                              const struct anv_shader_bin *shader,
                              const struct brw_cs_prog_data *prog_data,
                              const struct intel_cs_dispatch_info *dispatch)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;

   return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
      .KernelStartPointer = shader->kernel.offset,
      .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
      .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
      /* Typically set to 0 to avoid prefetching on every thread dispatch. */
      .BindingTableEntryCount = devinfo->verx10 == 125 ?
         0 : 1 + MIN2(shader->bind_map.surface_count, 30),
      .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
      .SharedLocalMemorySize = encode_slm_size(GFX_VER, prog_data->base.total_shared),
      .PreferredSLMAllocationSize = preferred_slm_allocation_size(devinfo),
      .NumberOfBarriers = prog_data->uses_barrier,
   };
}

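/* Emit an EXECUTE_INDIRECT_DISPATCH for hardware that can unroll indirect
 * dispatches itself, avoiding the MI round-trip through the
 * GPGPU_DISPATCHDIM* registers.
 */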
static inline void
emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                             const struct anv_shader_bin *shader,
                             const struct brw_cs_prog_data *prog_data,
                             struct anv_address indirect_addr)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   assert(devinfo->has_indirect_unroll);

   struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
   const int dispatch_size = dispatch.simd_size / 16;

   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize                 = dispatch_size,
      .MessageSIMD              = dispatch_size,
      .IndirectDataStartAddress = comp_state->push_data.offset,
      .IndirectDataLength       = comp_state->push_data.alloc_size,
      .LocalXMaximum            = prog_data->local_size[0] - 1,
      .LocalYMaximum            = prog_data->local_size[1] - 1,
      .LocalZMaximum            = prog_data->local_size[2] - 1,
      .ExecutionMask            = dispatch.right_mask,
      .PostSync.MOCS            = anv_mocs(cmd_buffer->device, NULL, 0),
      .InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer, shader, prog_data,
                                       &dispatch),
   };

   cmd_buffer->last_indirect_dispatch =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(EXECUTE_INDIRECT_DISPATCH_length),
         GENX(EXECUTE_INDIRECT_DISPATCH),
         .PredicateEnable            = predicate,
         .MaxCount                   = 1,
         .COMPUTE_WALKER_BODY        = body,
         .ArgumentBufferStartAddress = indirect_addr,
         .MOCS                       = anv_mocs(cmd_buffer->device,
                                                indirect_addr.bo, 0),
      );
}

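/* Emit a COMPUTE_WALKER for a direct dispatch, or for an indirect one whose
 * parameters have already been loaded into the GPGPU_DISPATCHDIM* registers.
 */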
static inline void
emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                    const struct anv_compute_pipeline *pipeline, bool indirect,
                    const struct brw_cs_prog_data *prog_data,
                    uint32_t groupCountX, uint32_t groupCountY,
                    uint32_t groupCountZ)
{
   const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   cmd_buffer->last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable        = indirect,
         .PredicateEnable                = predicate,
         .SIMDSize                       = dispatch.simd_size / 16,
         .MessageSIMD                    = dispatch.simd_size / 16,
         .IndirectDataStartAddress       = comp_state->push_data.offset,
         .IndirectDataLength             = comp_state->push_data.alloc_size,
#if GFX_VERx10 == 125
         .SystolicModeEnable             = prog_data->uses_systolic,
#endif
         .GenerateLocalID                = prog_data->generate_local_id != 0,
         .EmitLocal                      = prog_data->generate_local_id,
         .WalkOrder                      = prog_data->walk_order,
         .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
                       TileY32bpe : Linear,
         .LocalXMaximum                  = prog_data->local_size[0] - 1,
         .LocalYMaximum                  = prog_data->local_size[1] - 1,
         .LocalZMaximum                  = prog_data->local_size[2] - 1,
         .ThreadGroupIDXDimension        = groupCountX,
         .ThreadGroupIDYDimension        = groupCountY,
         .ThreadGroupIDZDimension        = groupCountZ,
         .ExecutionMask                  = dispatch.right_mask,
         .PostSync                       = {
            .MOCS                        = anv_mocs(pipeline->base.device, NULL, 0),
         },
         .InterfaceDescriptor =
            get_interface_descriptor_data(cmd_buffer, pipeline->cs,
                                          prog_data, &dispatch),
      );
}

#else /* #if GFX_VERx10 >= 125 */

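/* Pre-Gfx12.5 path: emit a GPGPU_WALKER followed by the MEDIA_STATE_FLUSH
 * required by the media pipeline.
 */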
static inline void
emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
                  const struct anv_compute_pipeline *pipeline, bool indirect,
                  const struct brw_cs_prog_data *prog_data,
                  uint32_t groupCountX, uint32_t groupCountY,
                  uint32_t groupCountZ)
{
   const bool predicate = cmd_buffer->state.conditional_render_enabled;

   const struct intel_device_info *devinfo = pipeline->base.device->info;
   const struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable      = indirect;
      ggw.PredicateEnable              = predicate;
      ggw.SIMDSize                     = dispatch.simd_size / 16;
      ggw.ThreadDepthCounterMaximum    = 0;
      ggw.ThreadHeightCounterMaximum   = 0;
      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
      ggw.ThreadGroupIDXDimension      = groupCountX;
      ggw.ThreadGroupIDYDimension      = groupCountY;
      ggw.ThreadGroupIDZDimension      = groupCountZ;
      ggw.RightExecutionMask           = dispatch.right_mask;
      ggw.BottomExecutionMask          = 0xffffffff;
   }

   anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
}

#endif /* #if GFX_VERx10 >= 125 */

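/* Common dispatch path: use the hardware-unrolled indirect dispatch when
 * available, otherwise load any indirect parameters and emit the
 * generation-appropriate walker command.
 */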
static inline void
emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
               const struct anv_compute_pipeline *pipeline,
               const struct brw_cs_prog_data *prog_data,
               struct anv_address indirect_addr,
               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
   bool is_indirect = !anv_address_is_null(indirect_addr);

#if GFX_VERx10 >= 125
   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
      emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                   indirect_addr);
      return;
   }
#endif

   if (is_indirect)
      compute_load_indirect_params(cmd_buffer, indirect_addr);

#if GFX_VERx10 >= 125
   emit_compute_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                       groupCountX, groupCountY, groupCountZ);
#else
   emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                     groupCountX, groupCountY, groupCountZ);
#endif
}

void genX(CmdDispatchBase)(
    VkCommandBuffer                             commandBuffer,
    uint32_t                                    baseGroupX,
    uint32_t                                    baseGroupY,
    uint32_t                                    baseGroupZ,
    uint32_t                                    groupCountX,
    uint32_t                                    groupCountY,
    uint32_t                                    groupCountZ)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);

   anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
                                     baseGroupY, baseGroupZ);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute",
                        groupCountX * groupCountY * groupCountZ *
                        prog_data->local_size[0] * prog_data->local_size[1] *
                        prog_data->local_size[2]);

   trace_intel_begin_compute(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      struct anv_state state =
         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, 12, 4);
      uint32_t *sizes = state.map;
      sizes[0] = groupCountX;
      sizes[1] = groupCountY;
      sizes[2] = groupCountZ;
      cmd_buffer->state.compute.num_workgroups =
         anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
                                      state);

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data,
                  ANV_NULL_ADDRESS /* no indirect data */,
                  groupCountX, groupCountY, groupCountZ);

   trace_intel_end_compute(&cmd_buffer->trace,
                           groupCountX, groupCountY, groupCountZ);
}

void genX(CmdDispatchIndirect)(
    VkCommandBuffer                             commandBuffer,
    VkBuffer                                    _buffer,
    VkDeviceSize                                offset)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
   struct anv_compute_pipeline *pipeline =
      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
   struct anv_address addr = anv_address_add(buffer->address, offset);
   UNUSED struct anv_batch *batch = &cmd_buffer->batch;

   anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);

   anv_measure_snapshot(cmd_buffer,
                        INTEL_SNAPSHOT_COMPUTE,
                        "compute indirect",
                        0);
   trace_intel_begin_compute(&cmd_buffer->trace);

   if (prog_data->uses_num_work_groups) {
      cmd_buffer->state.compute.num_workgroups = addr;

      /* The num_workgroups buffer goes in the binding table */
      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
   }

   genX(cmd_buffer_flush_compute_state)(cmd_buffer);

   if (cmd_buffer->state.conditional_render_enabled)
      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);

   emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);

   trace_intel_end_compute(&cmd_buffer->trace, 0, 0, 0);
}

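/* Allocate and pack the RT_DISPATCH_GLOBALS used by ray queries, pointing
 * the hardware at the device's ray query scratch buffers. Gfx12.5+ only.
 */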
struct anv_state
genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 125
   struct anv_device *device = cmd_buffer->device;

   struct anv_state state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                         BRW_RT_DISPATCH_GLOBALS_SIZE,
                                         64);
   struct brw_rt_scratch_layout layout;
   uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
                                       * some cases?
                                       */
   brw_rt_compute_scratch_layout(&layout, device->info,
                                 stack_ids_per_dss, 1 << 10);

   const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress = (struct anv_address) {
         /* The ray query HW computes offsets from the top of the buffer, so
          * point the address at the end of the buffer.
          */
         .bo = device->ray_query_bo,
         .offset = device->ray_query_bo->size
      },
      .AsyncRTStackSize = layout.ray_stack_stride / 64,
      .NumDSSRTStacks = layout.stack_ids_per_dss,
      .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
      .Flags = RT_DEPTH_TEST_LESS_EQUAL,
      .ResumeShaderTable = (struct anv_address) {
         .bo = cmd_buffer->state.ray_query_shadow_bo,
      },
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);

   return state;
#else
   unreachable("Not supported");
#endif
}

#if GFX_VERx10 >= 125
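/* Dispatch an internal (non-Vulkan-pipeline) kernel. System values and
 * kernel arguments are passed through the COMPUTE_WALKER indirect data; a
 * NULL global_size turns this into an indirect dispatch that reads the
 * workgroup counts from the GPGPU_DISPATCHDIM* registers.
 */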
void
genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
                                 struct anv_kernel *kernel,
                                 const uint32_t *global_size,
                                 uint32_t arg_count,
                                 const struct anv_kernel_arg *args)
{
   const struct intel_device_info *devinfo = cmd_buffer->device->info;
   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(kernel->bin->prog_data);

   genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);

   if (anv_cmd_buffer_is_render_queue(cmd_buffer))
      genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   /* Apply any pending pipeline flushes we may have.  We want to apply them
    * now because, if any of those flushes are for things like push constants,
    * the GPU will read the state at weird times.
    */
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
   indirect_data_size += kernel->bin->bind_map.kernel_args_size;
   indirect_data_size = ALIGN(indirect_data_size, 64);
   struct anv_state indirect_data =
      anv_cmd_buffer_alloc_general_state(cmd_buffer,
                                         indirect_data_size, 64);
   memset(indirect_data.map, 0, indirect_data.alloc_size);

   struct brw_kernel_sysvals sysvals = {};
   if (global_size != NULL) {
      for (unsigned i = 0; i < 3; i++)
         sysvals.num_work_groups[i] = global_size[i];
      memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
   } else {
      struct anv_address sysvals_addr = {
         .bo = NULL, /* General state buffer is always 0. */
         .offset = indirect_data.offset,
      };

      compute_store_indirect_params(cmd_buffer, sysvals_addr);
   }

   void *args_map = indirect_data.map + sizeof(sysvals);
   for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
      struct brw_kernel_arg_desc *arg_desc =
         &kernel->bin->bind_map.kernel_args[i];
      assert(i < arg_count);
      const struct anv_kernel_arg *arg = &args[i];
      if (arg->is_ptr) {
         memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
      } else {
         assert(arg_desc->size <= sizeof(arg->u64));
         memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
      }
   }

   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.PredicateEnable                = false;
      cw.SIMDSize                       = dispatch.simd_size / 16;
      cw.MessageSIMD                    = dispatch.simd_size / 16;
      cw.IndirectDataStartAddress       = indirect_data.offset;
      cw.IndirectDataLength             = indirect_data.alloc_size;
      cw.LocalXMaximum                  = cs_prog_data->local_size[0] - 1;
      cw.LocalYMaximum                  = cs_prog_data->local_size[1] - 1;
      cw.LocalZMaximum                  = cs_prog_data->local_size[2] - 1;
      cw.ExecutionMask                  = dispatch.right_mask;
      cw.PostSync.MOCS                  = cmd_buffer->device->isl_dev.mocs.internal;

      if (global_size != NULL) {
         cw.ThreadGroupIDXDimension     = global_size[0];
         cw.ThreadGroupIDYDimension     = global_size[1];
         cw.ThreadGroupIDZDimension     = global_size[2];
      } else {
         cw.IndirectParameterEnable     = true;
      }

      cw.InterfaceDescriptor =
         get_interface_descriptor_data(cmd_buffer,
                                       kernel->bin,
                                       cs_prog_data,
                                       &dispatch);
   }

   /* We just blew away the compute pipeline state */
   cmd_buffer->state.compute.pipeline_dirty = true;
}

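/* Pick a local workgroup size for the raygen trampoline, expressed as
 * per-dimension log2 shifts summing to 3 (8 invocations total), roughly
 * matching the shape of the global launch; any leftover factor goes to X.
 */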
static void
calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
{
   unsigned total_shift = 0;
   memset(local_shift, 0, 3);

   bool progress;
   do {
      progress = false;
      for (unsigned i = 0; i < 3; i++) {
         assert(global[i] > 0);
         if ((1 << local_shift[i]) < global[i]) {
            progress = true;
            local_shift[i]++;
            total_shift++;
         }

         if (total_shift == 3)
            return;
      }
   } while (progress);

   /* Assign whatever's left to x */
   local_shift[0] += 3 - total_shift;
}

static struct GENX(RT_SHADER_TABLE)
vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
{
   return (struct GENX(RT_SHADER_TABLE)) {
      .BaseAddress = anv_address_from_u64(region->deviceAddress),
      .Stride = region->stride,
   };
}

struct trace_params {
   /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
    * with mi_builder.
    */
   bool is_sbt_indirect;
   const VkStridedDeviceAddressRegionKHR *raygen_sbt;
   const VkStridedDeviceAddressRegionKHR *miss_sbt;
   const VkStridedDeviceAddressRegionKHR *hit_sbt;
   const VkStridedDeviceAddressRegionKHR *callable_sbt;

   /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
   uint64_t indirect_sbts_addr;

   /* If is_launch_size_indirect, use launch_size_addr to program the
    * dispatch size.
    */
   bool is_launch_size_indirect;
   uint32_t launch_size[3];

   /* A pointer to a uint32_t[3] */
   uint64_t launch_size_addr;
};

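/* Allocate and pack RT_DISPATCH_GLOBALS on the CPU for the direct-SBT path,
 * leaving room after it for the push constants.
 */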
static struct anv_state
cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
                                    struct trace_params *params)
{
   assert(!params->is_sbt_indirect);
   assert(params->miss_sbt != NULL);
   assert(params->hit_sbt != NULL);
   assert(params->callable_sbt != NULL);

   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                         BRW_RT_PUSH_CONST_OFFSET +
                                         sizeof(struct anv_push_constants),
                                         64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress     = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler   = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
      .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
      .HitGroupTable      = vk_sdar_to_shader_table(params->hit_sbt),
      .MissGroupTable     = vk_sdar_to_shader_table(params->miss_sbt),
      .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
      .LaunchWidth        = params->launch_size[0],
      .LaunchHeight       = params->launch_size[1],
      .LaunchDepth        = params->launch_size[2],
      .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   return rtdg_state;
}

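/* Build a packed group-table entry of RT_DISPATCH_GLOBALS on the GPU: OR the
 * shader table address (masked) with the stride shifted into the upper bits
 * of the same 64-bit value.
 */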
static struct mi_value
mi_build_sbt_entry(struct mi_builder *b,
                   uint64_t addr_field_addr,
                   uint64_t stride_field_addr)
{
   return mi_ior(b,
                 mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
                            mi_imm(BITFIELD64_BIT(49) - 1)),
                 mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
                                48));
}

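/* Like cmd_buffer_emit_rt_dispatch_globals, but for the indirect-SBT path:
 * the static fields are packed on the CPU while the miss/hit/callable group
 * tables are filled in on the GPU with mi_builder, since the
 * VkTraceRaysIndirectCommand2KHR contents are only known at execute time.
 */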
static struct anv_state
cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
                                             struct trace_params *params)
{
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;

   struct anv_state rtdg_state =
      anv_cmd_buffer_alloc_dynamic_state(cmd_buffer,
                                         BRW_RT_PUSH_CONST_OFFSET +
                                         sizeof(struct anv_push_constants),
                                         64);

   struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
      .MemBaseAddress     = (struct anv_address) {
         .bo = rt->scratch.bo,
         .offset = rt->scratch.layout.ray_stack_start,
      },
      .CallStackHandler   = anv_shader_bin_get_bsr(
         cmd_buffer->device->rt_trivial_return, 0),
      .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
      .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
      .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
      .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
      .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
   };
   GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);

   struct anv_address rtdg_addr =
      anv_state_pool_state_address(
         &cmd_buffer->device->dynamic_state_pool,
         rtdg_state);

   struct mi_builder b;
   mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
   const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
   mi_builder_set_mocs(&b, mocs);

   /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
    * RT_DISPATCH_GLOBALS using the mi_builder.
    */
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        missShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        hitShaderBindingTableStride)));
   mi_store(&b,
            mi_mem64(
               anv_address_add(
                  rtdg_addr,
                  GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
            mi_build_sbt_entry(&b,
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableAddress),
                               params->indirect_sbts_addr +
                               offsetof(VkTraceRaysIndirectCommand2KHR,
                                        callableShaderBindingTableStride)));

   return rtdg_state;
}

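/* Common implementation of the vkCmdTraceRays* entry points: set up
 * RT_DISPATCH_GLOBALS, BTD state and scratch, then launch the raygen
 * trampoline shader with a COMPUTE_WALKER.
 */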
static void
cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
                      struct trace_params *params)
{
   struct anv_device *device = cmd_buffer->device;
   struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
   struct anv_ray_tracing_pipeline *pipeline =
      anv_pipeline_to_ray_tracing(rt->base.pipeline);

   if (anv_batch_has_error(&cmd_buffer->batch))
      return;

   /* If we have a known degenerate launch size, just bail */
   if (!params->is_launch_size_indirect &&
       (params->launch_size[0] == 0 ||
        params->launch_size[1] == 0 ||
        params->launch_size[2] == 0))
      return;

   trace_intel_begin_rays(&cmd_buffer->trace);

   genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
   genX(flush_pipeline_select_gpgpu)(cmd_buffer);

   cmd_buffer->state.rt.pipeline_dirty = false;

   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
                                           &cmd_buffer->state.rt.base,
                                           &pipeline->base);

   /* Add these to the reloc list manually as they're internal buffers that
    * don't actually have relocs to pick them up.
    *
    * TODO(RT): This is a bit of a hack
    */
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         rt->scratch.bo);
   anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                         cmd_buffer->device->btd_fifo_bo);

   /* Allocate and set up our RT_DISPATCH_GLOBALS */
   struct anv_state rtdg_state =
      params->is_sbt_indirect ?
      cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
      cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);

   assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
                                    sizeof(struct anv_push_constants)));
   assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
   /* Push constants go after the RT_DISPATCH_GLOBALS */
   memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
          &cmd_buffer->state.rt.base.push_constants,
          sizeof(struct anv_push_constants));

   struct anv_address rtdg_addr =
      anv_state_pool_state_address(&cmd_buffer->device->dynamic_state_pool,
                                   rtdg_state);

   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (params->is_launch_size_indirect) {
      /* Pick a local size that's probably ok.  We assume most TraceRays calls
       * will use a two-dimensional dispatch size.  Worst case, our initial
       * dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
      mi_builder_set_mocs(&b, mocs);

      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
               mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
               mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
               mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;

         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm
          * has the semantics of shifting the entire 64-bit value and taking
          * the bottom 32 bits, so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                      mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                            local_size_log2[i]);
      }

      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);

   } else {
      calc_local_trace_size(local_size_log2, params->launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because DIV_ROUND_UP adds to the
          * numerator, which may overflow.  Cast to uint64_t to avoid this.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
      }
   }


#if GFX_VERx10 == 125
   /* Wa_14014427904 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(device->info) &&
       cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
      /* TODO: This is the timeout after which the bucketed thread dispatcher
       *       will kick off a wave of threads. We go with the lowest value
       *       for now. It could be tweaked on a per application basis
       *       (drirc).
       */
      btd.DispatchTimeoutCounter = _64clocks;
      /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
       *               buffer must be 128KB."
       */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
      if (pipeline->base.scratch_size > 0) {
         struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   pipeline->base.scratch_size);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
                                      pipeline->base.scratch_size);
         btd.ScratchSpaceBuffer = scratch_surf >> 4;
      }
   }

   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);

   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);

   anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) {
      cw.IndirectParameterEnable        = params->is_launch_size_indirect;
      cw.PredicateEnable                = cmd_buffer->state.conditional_render_enabled;
      cw.SIMDSize                       = dispatch.simd_size / 16;
      cw.MessageSIMD                    = dispatch.simd_size / 16;
      cw.LocalXMaximum                  = (1 << local_size_log2[0]) - 1;
      cw.LocalYMaximum                  = (1 << local_size_log2[1]) - 1;
      cw.LocalZMaximum                  = (1 << local_size_log2[2]) - 1;
      cw.ThreadGroupIDXDimension        = global_size[0];
      cw.ThreadGroupIDYDimension        = global_size[1];
      cw.ThreadGroupIDZDimension        = global_size[2];
      cw.ExecutionMask                  = 0xff;
      cw.EmitInlineParameter            = true;
      cw.PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0);

      const gl_shader_stage s = MESA_SHADER_RAYGEN;
      struct anv_device *device = cmd_buffer->device;
      struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
      struct anv_state *samplers = &cmd_buffer->state.samplers[s];
      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = device->rt_trampoline->kernel.offset,
         .SamplerStatePointer = samplers->offset,
         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
      };

      struct brw_rt_raygen_trampoline_params trampoline_params = {
         .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
         .raygen_bsr_addr =
            params->is_sbt_indirect ?
            (params->indirect_sbts_addr +
             offsetof(VkTraceRaysIndirectCommand2KHR,
                      raygenShaderRecordAddress)) :
            params->raygen_sbt->deviceAddress,
         .is_indirect = params->is_sbt_indirect,
         .local_group_size_log2 = {
            local_size_log2[0],
            local_size_log2[1],
            local_size_log2[2],
         },
      };
      STATIC_ASSERT(sizeof(trampoline_params) == 32);
      memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params));
   }

   trace_intel_end_rays(&cmd_buffer->trace,
                        params->launch_size[0],
                        params->launch_size[1],
                        params->launch_size[2]);
}

void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    uint32_t                                    width,
    uint32_t                                    height,
    uint32_t                                    depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = false,
      .raygen_sbt              = pRaygenShaderBindingTable,
      .miss_sbt                = pMissShaderBindingTable,
      .hit_sbt                 = pHitShaderBindingTable,
      .callable_sbt            = pCallableShaderBindingTable,
      .is_launch_size_indirect = false,
      .launch_size             = {
         width,
         height,
         depth,
      },
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = false,
      .raygen_sbt              = pRaygenShaderBindingTable,
      .miss_sbt                = pMissShaderBindingTable,
      .hit_sbt                 = pHitShaderBindingTable,
      .callable_sbt            = pCallableShaderBindingTable,
      .is_launch_size_indirect = true,
      .launch_size_addr        = indirectDeviceAddress,
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

void
genX(CmdTraceRaysIndirect2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = true,
      .indirect_sbts_addr      = indirectDeviceAddress,
      .is_launch_size_indirect = true,
      .launch_size_addr        = indirectDeviceAddress +
                                 offsetof(VkTraceRaysIndirectCommand2KHR, width),
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

#endif /* GFX_VERx10 >= 125 */