1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 
30 #include "common/intel_compute_slm.h"
31 #include "genxml/gen_macros.h"
32 #include "genxml/genX_pack.h"
33 #include "genxml/genX_rt_pack.h"
34 #include "common/intel_genX_state_brw.h"
35 
36 #include "ds/intel_tracepoints.h"
37 
38 #include "genX_mi_builder.h"
39 
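/* Re-emit CFE_STATE (Gfx12.5+ only) when the required scratch size grows
 * beyond what was last programmed, allocating the scratch BO and surface
 * from the (optionally protected) scratch pool.
 */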
40 void
41 genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
42                                   uint32_t total_scratch)
43 {
44 #if GFX_VERx10 >= 125
45    assert(cmd_buffer->state.current_pipeline == GPGPU);
46 
47    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
48 
49    if (total_scratch <= comp_state->scratch_size)
50       return;
51 
52    const struct intel_device_info *devinfo = cmd_buffer->device->info;
53    anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
54       cfe.MaximumNumberofThreads = devinfo->max_cs_threads * devinfo->subslice_total;
55 
56       uint32_t scratch_surf;
57       struct anv_scratch_pool *scratch_pool =
58          (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?
59           &cmd_buffer->device->protected_scratch_pool :
60           &cmd_buffer->device->scratch_pool;
61       struct anv_bo *scratch_bo =
62             anv_scratch_pool_alloc(cmd_buffer->device, scratch_pool,
63                                    MESA_SHADER_COMPUTE,
64                                    total_scratch);
65       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, scratch_bo);
66       scratch_surf = anv_scratch_pool_get_surf(cmd_buffer->device, scratch_pool,
67                                                total_scratch);
68       cfe.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
69 #if GFX_VER >= 20
70       switch (cmd_buffer->device->physical->instance->stack_ids) {
71       case 256:  cfe.StackIDControl = StackIDs256;  break;
72       case 512:  cfe.StackIDControl = StackIDs512;  break;
73       case 1024: cfe.StackIDControl = StackIDs1024; break;
74       case 2048: cfe.StackIDControl = StackIDs2048; break;
75       default:   unreachable("invalid stack_ids value");
76       }
77 
78 #if INTEL_WA_14021821874_GFX_VER || INTEL_WA_14018813551_GFX_VER
79       /* Wa_14021821874, Wa_14018813551:
80        *
81        * "StackIDControlOverride_RTGlobals = 0 (i.e. 2k)". We
82        * already set stack size per ray to 64 in brw_nir_lower_rt_intrinsics
83        * as the workaround also requires.
84        */
85       if (intel_needs_workaround(cmd_buffer->device->info, 14021821874) ||
86           intel_needs_workaround(cmd_buffer->device->info, 14018813551))
87          cfe.StackIDControl = StackIDs2048;
88 #endif
89 
90 #endif
91 
92       cfe.OverDispatchControl = 2; /* 50% overdispatch */
93    }
94 
95    comp_state->scratch_size = total_scratch;
96 #else
97    unreachable("Invalid call");
98 #endif
99 }
100 
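/* Flush all compute state into the batch before a dispatch: L3 config,
 * pipeline selection, descriptor sets, the interface descriptor
 * (pre-Gfx12.5) and push constants.
 */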
101 static void
102 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
103 {
104    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
105    struct anv_compute_pipeline *pipeline =
106       anv_pipeline_to_compute(comp_state->base.pipeline);
107    const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
108 
109    assert(pipeline->cs);
110 
111    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
112 
113    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
114 
115    genX(flush_descriptor_buffers)(cmd_buffer, &comp_state->base);
116 
117    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
118 
119    /* Apply any pending pipeline flushes we may have.  We want to apply them
120     * now because, if any of those flushes are for things like push constants,
121     * the GPU will read the state at weird times.
122     */
123    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
124 
125    if (cmd_buffer->state.compute.pipeline_dirty) {
126 #if GFX_VERx10 < 125
127       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
128        *
129        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
130        *    the only bits that are changed are scoreboard related: Scoreboard
131        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
132        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
133        *    sufficient."
134        */
135       anv_add_pending_pipe_bits(cmd_buffer,
136                               ANV_PIPE_CS_STALL_BIT,
137                               "flush compute state");
138       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
139 #endif
140 
141       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
142 
143 #if GFX_VERx10 >= 125
144       const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
145       genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
146 #endif
147 
148       /* Changing the pipeline affects the push constants layout (different
149        * amount of cross/per thread allocations). The allocation is also
150        * bounded to just the amount consumed by the pipeline (see
151        * anv_cmd_buffer_cs_push_constants). So we force the reallocation for
152        * every pipeline change.
153        *
154        * On Gfx12.0 we're also seeing failures in the dEQP-VK.memory_model.*
155        * tests when run in parallel. This is likely a HW issue with push
156        * constants & context save/restore.
157        *
158        * TODO: optimize this on Gfx12.5+ where the shader is not using per
159        * thread allocations and is also pulling the data using SEND messages.
160        * We should be able to limit reallocations to only when the data
161        * actually changes.
162        */
163       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
164       comp_state->base.push_constants_data_dirty = true;
165    }
166 
167    cmd_buffer->state.descriptors_dirty |=
168       genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
169                                               &cmd_buffer->state.compute.base,
170                                               &pipeline->base);
171 
172    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
173        cmd_buffer->state.compute.pipeline_dirty) {
174       genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
175                                              &cmd_buffer->state.compute.base,
176                                              VK_SHADER_STAGE_COMPUTE_BIT,
177                                              &pipeline->cs, 1);
178       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
179 
180 #if GFX_VERx10 < 125
181       uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
182       struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
183          .BindingTablePointer =
184             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
185          .SamplerStatePointer =
186             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
187       };
188       GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
189 
190       struct anv_state state =
191          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
192                                       pipeline->interface_descriptor_data,
193                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
194                                       64);
195 
196       uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
197       anv_batch_emit(&cmd_buffer->batch,
198                      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
199          mid.InterfaceDescriptorTotalLength        = size;
200          mid.InterfaceDescriptorDataStartAddress   = state.offset;
201       }
202 #endif
203    }
204 
205    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
206 
207       if (comp_state->base.push_constants_state.alloc_size == 0 ||
208           comp_state->base.push_constants_data_dirty) {
209          comp_state->base.push_constants_state =
210             anv_cmd_buffer_cs_push_constants(cmd_buffer);
211          comp_state->base.push_constants_data_dirty = false;
212       }
213 
214 #if GFX_VERx10 < 125
215       if (comp_state->base.push_constants_state.alloc_size) {
216          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
217             curbe.CURBETotalDataLength    = comp_state->base.push_constants_state.alloc_size;
218             curbe.CURBEDataStartAddress   = comp_state->base.push_constants_state.offset;
219          }
220       }
221 #endif
222 
223       cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
224    }
225 
226    cmd_buffer->state.compute.pipeline_dirty = false;
227 
228    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
229 }
230 
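/* Update the base workgroup ID and workgroup count push constants, marking
 * push constants dirty only if a value actually changed. For indirect
 * dispatches the num_work_groups slot instead carries a UINT32_MAX marker
 * followed by the 64-bit indirect buffer address.
 */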
231 static void
232 anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
233                                const struct brw_cs_prog_data *prog_data,
234                                uint32_t baseGroupX,
235                                uint32_t baseGroupY,
236                                uint32_t baseGroupZ,
237                                uint32_t groupCountX,
238                                uint32_t groupCountY,
239                                uint32_t groupCountZ,
240                                struct anv_address indirect_group)
241 {
242    if (anv_batch_has_error(&cmd_buffer->batch))
243       return;
244 
245    struct anv_push_constants *push =
246       &cmd_buffer->state.compute.base.push_constants;
247    bool updated = false;
248    if (push->cs.base_work_group_id[0] != baseGroupX ||
249        push->cs.base_work_group_id[1] != baseGroupY ||
250        push->cs.base_work_group_id[2] != baseGroupZ) {
251       push->cs.base_work_group_id[0] = baseGroupX;
252       push->cs.base_work_group_id[1] = baseGroupY;
253       push->cs.base_work_group_id[2] = baseGroupZ;
254       updated = true;
255    }
256 
257    /* On Gfx12.5+ this value goes into the inline parameter register */
258    if (GFX_VERx10 < 125 && prog_data->uses_num_work_groups) {
259       if (anv_address_is_null(indirect_group)) {
260          if (push->cs.num_work_groups[0] != groupCountX ||
261              push->cs.num_work_groups[1] != groupCountY ||
262              push->cs.num_work_groups[2] != groupCountZ) {
263             push->cs.num_work_groups[0] = groupCountX;
264             push->cs.num_work_groups[1] = groupCountY;
265             push->cs.num_work_groups[2] = groupCountZ;
266             updated = true;
267          }
268       } else {
269          uint64_t addr64 = anv_address_physical(indirect_group);
270          uint32_t lower_addr32 = addr64 & 0xffffffff;
271          uint32_t upper_addr32 = addr64 >> 32;
272          if (push->cs.num_work_groups[0] != UINT32_MAX ||
273              push->cs.num_work_groups[1] != lower_addr32 ||
274              push->cs.num_work_groups[2] != upper_addr32) {
275             push->cs.num_work_groups[0] = UINT32_MAX;
276             push->cs.num_work_groups[1] = lower_addr32;
277             push->cs.num_work_groups[2] = upper_addr32;
278             updated = true;
279          }
280       }
281    }
282 
283    if (updated) {
284       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
285       cmd_buffer->state.compute.base.push_constants_data_dirty = true;
286    }
287 }
288 
289 #define GPGPU_DISPATCHDIMX 0x2500
290 #define GPGPU_DISPATCHDIMY 0x2504
291 #define GPGPU_DISPATCHDIMZ 0x2508
292 
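/* Load the indirect dispatch dimensions from memory into the
 * GPGPU_DISPATCHDIM{X,Y,Z} registers with the MI builder, optionally
 * converting an unaligned X invocation count into a thread group count.
 */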
293 static void
294 compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
295                              const struct anv_address indirect_addr,
296                              bool is_unaligned_size_x)
297 {
298    struct mi_builder b;
299    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
300 
301    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
302 
303    /* Convert an unaligned thread invocation count into an aligned thread
304     * group count in X for unaligned shader dispatches during the ray tracing phase.
305     */
306    if (is_unaligned_size_x) {
307       const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
308       mi_builder_set_mocs(&b, mocs);
309 
310       struct anv_compute_pipeline *pipeline =
311          anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
312       const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
313 
314       assert(util_is_power_of_two_or_zero(prog_data->local_size[0]));
315       size_x = mi_udiv32_imm(&b, size_x, prog_data->local_size[0]);
316       size_x = mi_iadd(&b, size_x, mi_imm(1));
317    }
318 
319    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
320    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
321 
322    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
323    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
324    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
325 }
326 
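/* The reverse of compute_load_indirect_params: store the
 * GPGPU_DISPATCHDIM{X,Y,Z} registers back to memory.
 */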
327 static void
328 compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
329                              const struct anv_address indirect_addr)
330 {
331    struct mi_builder b;
332    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
333 
334    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
335    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
336    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
337 
338    mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
339    mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
340    mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
341 }
342 
343 
344 #if GFX_VERx10 >= 125
345 
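/* Build the INTERFACE_DESCRIPTOR_DATA embedded in the compute walker on
 * Gfx12.5+, including the SLM size and the preferred SLM allocation.
 */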
346 static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
347 get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
348                               const struct anv_shader_bin *shader,
349                               const struct brw_cs_prog_data *prog_data,
350                               const struct intel_cs_dispatch_info *dispatch)
351 {
352    const struct intel_device_info *devinfo = cmd_buffer->device->info;
353 
354    return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
355       .SamplerCount = DIV_ROUND_UP(CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
356       .KernelStartPointer = shader->kernel.offset,
357       .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
358       .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
359       /* Typically set to 0 to avoid prefetching on every thread dispatch. */
360       .BindingTableEntryCount = devinfo->verx10 == 125 ?
361          0 : MIN2(shader->bind_map.surface_count, 30),
362       .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
363       .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
364       .PreferredSLMAllocationSize =
365          intel_compute_preferred_slm_calc_encode_size(devinfo,
366                                                       prog_data->base.total_shared,
367                                                       dispatch->group_size,
368                                                       dispatch->simd_size),
369       .NumberOfBarriers = prog_data->uses_barrier,
370    };
371 }
372 
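/* Emit an EXECUTE_INDIRECT_DISPATCH on HW with indirect unroll support,
 * letting the command streamer read the group counts from the argument
 * buffer instead of programming them with MI commands.
 */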
373 static inline void
374 emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
375                              const struct anv_shader_bin *shader,
376                              const struct brw_cs_prog_data *prog_data,
377                              struct anv_address indirect_addr)
378 {
379    const struct intel_device_info *devinfo = cmd_buffer->device->info;
380    assert(devinfo->has_indirect_unroll);
381 
382    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
383    bool predicate = cmd_buffer->state.conditional_render_enabled;
384 
385    const struct intel_cs_dispatch_info dispatch =
386       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
387    const int dispatch_size = dispatch.simd_size / 16;
388 
389    uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
390 
391    struct GENX(COMPUTE_WALKER_BODY) body =  {
392       .SIMDSize                 = dispatch_size,
393       /* HSD 14016252163: Use of Morton walk order (and batching using a batch
394        * size of 4) is expected to increase sampler cache hit rates by
395        * increasing sample address locality within a subslice.
396        */
397 #if GFX_VER >= 30
398       .DispatchWalkOrder        = prog_data->uses_sampler ?
399                                   MortonWalk :
400                                   LinearWalk,
401       .ThreadGroupBatchSize     = prog_data->uses_sampler ? TG_BATCH_4 :
402                                                             TG_BATCH_1,
403 #endif
404       .MessageSIMD              = dispatch_size,
405       .IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
406       .IndirectDataLength       = comp_state->base.push_constants_state.alloc_size,
407       .GenerateLocalID          = prog_data->generate_local_id != 0,
408       .EmitLocal                = prog_data->generate_local_id,
409       .WalkOrder                = prog_data->walk_order,
410       .TileLayout               = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
411                                   TileY32bpe : Linear,
412       .LocalXMaximum            = prog_data->local_size[0] - 1,
413       .LocalYMaximum            = prog_data->local_size[1] - 1,
414       .LocalZMaximum            = prog_data->local_size[2] - 1,
415       .ExecutionMask            = dispatch.right_mask,
416       .PostSync.MOCS            = anv_mocs(cmd_buffer->device, NULL, 0),
417       .InterfaceDescriptor =
418          get_interface_descriptor_data(cmd_buffer, shader, prog_data,
419                                        &dispatch),
420       .EmitInlineParameter      = prog_data->uses_inline_data,
421       .InlineData               = {
422          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = UINT32_MAX,
423          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = indirect_addr64 & 0xffffffff,
424          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = indirect_addr64 >> 32,
425       },
426    };
427 
428    cmd_buffer->state.last_indirect_dispatch =
429       anv_batch_emitn(
430          &cmd_buffer->batch,
431          GENX(EXECUTE_INDIRECT_DISPATCH_length),
432          GENX(EXECUTE_INDIRECT_DISPATCH),
433          .PredicateEnable            = predicate,
434          .MaxCount                   = 1,
435          .COMPUTE_WALKER_BODY        = body,
436          .ArgumentBufferStartAddress = indirect_addr,
437          .MOCS                       = anv_mocs(cmd_buffer->device,
438                                                 indirect_addr.bo, 0),
439       );
440 }
441 
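/* Emit a COMPUTE_WALKER for a direct or MI-programmed indirect dispatch on
 * Gfx12.5+. The workgroup count (or the indirect buffer address) is also
 * passed through the inline parameter.
 */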
442 static inline void
443 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
444                     const struct anv_compute_pipeline *pipeline,
445                     struct anv_address indirect_addr,
446                     const struct brw_cs_prog_data *prog_data,
447                     struct intel_cs_dispatch_info dispatch,
448                     uint32_t groupCountX, uint32_t groupCountY,
449                     uint32_t groupCountZ)
450 {
451    const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
452    const bool predicate = cmd_buffer->state.conditional_render_enabled;
453 
454    uint32_t num_workgroup_data[3];
455    if (!anv_address_is_null(indirect_addr)) {
456       uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
457       num_workgroup_data[0] = UINT32_MAX;
458       num_workgroup_data[1] = indirect_addr64 & 0xffffffff;
459       num_workgroup_data[2] = indirect_addr64 >> 32;
460    } else {
461       num_workgroup_data[0] = groupCountX;
462       num_workgroup_data[1] = groupCountY;
463       num_workgroup_data[2] = groupCountZ;
464    }
465 
466    struct GENX(COMPUTE_WALKER_BODY) body = {
467       .SIMDSize                       = dispatch.simd_size / 16,
468       .MessageSIMD                    = dispatch.simd_size / 16,
469       .IndirectDataStartAddress       = comp_state->base.push_constants_state.offset,
470       .IndirectDataLength             = comp_state->base.push_constants_state.alloc_size,
471       .GenerateLocalID                = prog_data->generate_local_id != 0,
472       .EmitLocal                      = prog_data->generate_local_id,
473       .WalkOrder                      = prog_data->walk_order,
474       .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
475                     TileY32bpe : Linear,
476       .LocalXMaximum                  = prog_data->local_size[0] - 1,
477       .LocalYMaximum                  = prog_data->local_size[1] - 1,
478       .LocalZMaximum                  = prog_data->local_size[2] - 1,
479       .ThreadGroupIDXDimension        = groupCountX,
480       .ThreadGroupIDYDimension        = groupCountY,
481       .ThreadGroupIDZDimension        = groupCountZ,
482       .ExecutionMask                  = dispatch.right_mask,
483       .PostSync                       = {
484          .MOCS                        = anv_mocs(pipeline->base.device, NULL, 0),
485       },
486       .InterfaceDescriptor =
487          get_interface_descriptor_data(cmd_buffer, pipeline->cs,
488                                        prog_data, &dispatch),
489       .EmitInlineParameter            = prog_data->uses_inline_data,
490       .InlineData                     = {
491          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0],
492          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1],
493          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2],
494       }
495    };
496 
497    cmd_buffer->state.last_compute_walker =
498       anv_batch_emitn(
499          &cmd_buffer->batch,
500          GENX(COMPUTE_WALKER_length),
501          GENX(COMPUTE_WALKER),
502          .IndirectParameterEnable        = !anv_address_is_null(indirect_addr),
503          .PredicateEnable                = predicate,
504          .body                           = body,
505 #if GFX_VERx10 == 125
506          .SystolicModeEnable             = prog_data->uses_systolic,
507 #endif
508       );
509 }
510 
511 #else /* #if GFX_VERx10 >= 125 */
512 
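/* Emit a GPGPU_WALKER followed by a MEDIA_STATE_FLUSH on pre-Gfx12.5
 * platforms.
 */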
513 static inline void
514 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
515                   const struct anv_compute_pipeline *pipeline, bool indirect,
516                   const struct brw_cs_prog_data *prog_data,
517                   uint32_t groupCountX, uint32_t groupCountY,
518                   uint32_t groupCountZ)
519 {
520    const bool predicate = cmd_buffer->state.conditional_render_enabled;
521 
522    const struct intel_device_info *devinfo = pipeline->base.device->info;
523    const struct intel_cs_dispatch_info dispatch =
524       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
525 
526    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
527       ggw.IndirectParameterEnable      = indirect;
528       ggw.PredicateEnable              = predicate;
529       ggw.SIMDSize                     = dispatch.simd_size / 16;
530       ggw.ThreadDepthCounterMaximum    = 0;
531       ggw.ThreadHeightCounterMaximum   = 0;
532       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
533       ggw.ThreadGroupIDXDimension      = groupCountX;
534       ggw.ThreadGroupIDYDimension      = groupCountY;
535       ggw.ThreadGroupIDZDimension      = groupCountZ;
536       ggw.RightExecutionMask           = dispatch.right_mask;
537       ggw.BottomExecutionMask          = 0xffffffff;
538    }
539 
540    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
541 }
542 
543 #endif /* #if GFX_VERx10 >= 125 */
544 
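/* Common dispatch path: prefer the HW indirect unroll when available,
 * otherwise load any indirect parameters with MI commands and emit either a
 * COMPUTE_WALKER (Gfx12.5+) or a GPGPU_WALKER.
 */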
545 static inline void
546 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
547                const struct anv_compute_pipeline *pipeline,
548                const struct brw_cs_prog_data *prog_data,
549                struct intel_cs_dispatch_info dispatch,
550                struct anv_address indirect_addr,
551                uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
552                bool is_unaligned_size_x)
553 {
554    bool is_indirect = !anv_address_is_null(indirect_addr);
555 
556 #if GFX_VERx10 >= 125
557    /* For unaligned dispatch, we need to tweak the dispatch value with
558     * MI_MATH, so we can't use indirect HW instructions.
559     */
560    if (is_indirect && !is_unaligned_size_x &&
561        cmd_buffer->device->info->has_indirect_unroll) {
562       emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
563                                    indirect_addr);
564       return;
565    }
566 #endif
567 
568    if (is_indirect)
569       compute_load_indirect_params(cmd_buffer, indirect_addr,
570             is_unaligned_size_x);
571 
572 #if GFX_VERx10 >= 125
573    emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
574                        dispatch, groupCountX, groupCountY, groupCountZ);
575 #else
576    emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
577                      groupCountX, groupCountY, groupCountZ);
578 #endif
579 }
580 
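/* vkCmdDispatchBase: direct dispatch with a base workgroup offset. */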
581 void genX(CmdDispatchBase)(
582     VkCommandBuffer                             commandBuffer,
583     uint32_t                                    baseGroupX,
584     uint32_t                                    baseGroupY,
585     uint32_t                                    baseGroupZ,
586     uint32_t                                    groupCountX,
587     uint32_t                                    groupCountY,
588     uint32_t                                    groupCountZ)
589 {
590    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
591    struct anv_compute_pipeline *pipeline =
592       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
593    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
594    struct intel_cs_dispatch_info dispatch =
595       brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
596 
597    if (anv_batch_has_error(&cmd_buffer->batch))
598       return;
599 
600    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
601                                   baseGroupX, baseGroupY, baseGroupZ,
602                                   groupCountX, groupCountY, groupCountZ,
603                                   ANV_NULL_ADDRESS);
604 
605    anv_measure_snapshot(cmd_buffer,
606                         INTEL_SNAPSHOT_COMPUTE,
607                         "compute",
608                         groupCountX * groupCountY * groupCountZ *
609                         prog_data->local_size[0] * prog_data->local_size[1] *
610                         prog_data->local_size[2]);
611 
612    if (cmd_buffer->state.rt.debug_marker_count == 0)
613       trace_intel_begin_compute(&cmd_buffer->trace);
614 
615    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
616 
617    if (cmd_buffer->state.conditional_render_enabled)
618       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
619 
620    emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch,
621                   ANV_NULL_ADDRESS /* no indirect data */,
622                   groupCountX, groupCountY, groupCountZ,
623                   false);
624 
625    if (cmd_buffer->state.rt.debug_marker_count == 0) {
626       trace_intel_end_compute(&cmd_buffer->trace,
627                               groupCountX, groupCountY, groupCountZ,
628                               pipeline->source_hash);
629    }
630 }
631 
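/* Variant of the dispatch path used by genX(cmd_dispatch_unaligned): it
 * takes a caller-provided intel_cs_dispatch_info so the execution mask of
 * the partial workgroup can be adjusted by the caller.
 */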
632 static void
633 emit_unaligned_cs_walker(
634     VkCommandBuffer                             commandBuffer,
635     uint32_t                                    baseGroupX,
636     uint32_t                                    baseGroupY,
637     uint32_t                                    baseGroupZ,
638     uint32_t                                    groupCountX,
639     uint32_t                                    groupCountY,
640     uint32_t                                    groupCountZ,
641     struct intel_cs_dispatch_info               dispatch)
642 {
643    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
644    struct anv_compute_pipeline *pipeline =
645       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
646    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
647 
648    if (anv_batch_has_error(&cmd_buffer->batch))
649       return;
650 
651    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
652                                   baseGroupX, baseGroupY, baseGroupZ,
653                                   groupCountX, groupCountY, groupCountZ,
654                                   ANV_NULL_ADDRESS);
655 
656    /* RT shaders always have Y and Z local sizes of 1. */
657    assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
658 
659    /* RT shaders are always dispatched with Y and Z group counts of 1. */
660    assert(groupCountY == 1 && groupCountZ == 1);
661 
662    if (anv_batch_has_error(&cmd_buffer->batch))
663       return;
664 
665    anv_measure_snapshot(cmd_buffer,
666                         INTEL_SNAPSHOT_COMPUTE,
667                         "compute-unaligned-cs-walker",
668                         groupCountX * groupCountY * groupCountZ *
669                         prog_data->local_size[0] * prog_data->local_size[1] *
670                         prog_data->local_size[2]);
671 
672    if (cmd_buffer->state.rt.debug_marker_count == 0)
673       trace_intel_begin_compute(&cmd_buffer->trace);
674 
675    assert(!prog_data->uses_num_work_groups);
676    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
677 
678    if (cmd_buffer->state.conditional_render_enabled)
679       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
680 
681 #if GFX_VERx10 >= 125
682    emit_compute_walker(cmd_buffer, pipeline, ANV_NULL_ADDRESS, prog_data,
683                        dispatch, groupCountX, groupCountY, groupCountZ);
684 #endif
685 
686    if (cmd_buffer->state.rt.debug_marker_count == 0) {
687       trace_intel_end_compute(&cmd_buffer->trace,
688                               groupCountX, groupCountY, groupCountZ,
689                               pipeline->source_hash);
690    }
691 }
692 
693 /*
694  * Dispatch compute work items with an unaligned thread invocation count.
695  *
696  * This helper takes an unaligned thread invocation count, converts it into
697  * an aligned thread group count and dispatches the compute work items.
698  *
699  * We launch two CS walkers: one for the aligned part and another with a
700  * single group for the remaining thread invocations.
701  *
702  * This function is now specifically for BVH building.
703  */
704 void
705 genX(cmd_dispatch_unaligned)(
706     VkCommandBuffer                             commandBuffer,
707     uint32_t                                    invocations_x,
708     uint32_t                                    invocations_y,
709     uint32_t                                    invocations_z)
710 {
711    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
712    struct anv_compute_pipeline *pipeline =
713       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
714    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
715 
716    /* Group X can be unaligned for RT dispatches. */
717    uint32_t groupCountX = invocations_x / prog_data->local_size[0];
718    uint32_t groupCountY = invocations_y;
719    uint32_t groupCountZ = invocations_z;
720 
721    struct intel_cs_dispatch_info dispatch =
722       brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
723 
724    /* Launch first CS walker with aligned group count X. */
725    if (groupCountX) {
726       emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
727                                groupCountY, groupCountZ, dispatch);
728    }
729 
730    uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
731    if (unaligned_invocations_x) {
732       dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
733                                       dispatch.simd_size);
734 
735       /* Make sure the 2nd walker has the same number of invocations per
736        * workgroup as the 1st walker, so that gl_GlobalInvocationID can be
737        * calculated correctly with baseGroup.
738        */
739       assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
740 
741       const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
742       if (remainder > 0) {
743          dispatch.right_mask = ~0u >> (32 - remainder);
744       } else {
745          dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
746       }
747 
748       /* Launch second CS walker for unaligned part. */
749       emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
750                                dispatch);
751    }
752 }
753 
754 /*
755  * This dispatches compute work items with indirect parameters.
756  * The helper also makes unaligned thread invocations aligned.
757  */
758 void
759 genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
760                                    struct anv_address indirect_addr,
761                                    bool is_unaligned_size_x)
762 {
763    struct anv_compute_pipeline *pipeline =
764       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
765    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
766    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
767    struct intel_cs_dispatch_info dispatch =
768       brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
769 
770    if (anv_batch_has_error(&cmd_buffer->batch))
771       return;
772 
773    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
774                                   0, 0, 0, 0, 0, 0, indirect_addr);
775 
776    anv_measure_snapshot(cmd_buffer,
777                         INTEL_SNAPSHOT_COMPUTE,
778                         "compute indirect",
779                         0);
780 
781    if (cmd_buffer->state.rt.debug_marker_count == 0)
782       trace_intel_begin_compute_indirect(&cmd_buffer->trace);
783 
784    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
785 
786    if (cmd_buffer->state.conditional_render_enabled)
787       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
788 
789    emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0,
790                   0, 0, is_unaligned_size_x);
791 
792    if (cmd_buffer->state.rt.debug_marker_count == 0) {
793       trace_intel_end_compute_indirect(&cmd_buffer->trace,
794                                        anv_address_utrace(indirect_addr),
795                                        pipeline->source_hash);
796    }
797 }
798 
799 void genX(CmdDispatchIndirect)(
800     VkCommandBuffer                             commandBuffer,
801     VkBuffer                                    _buffer,
802     VkDeviceSize                                offset)
803 {
804    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
805    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
806    struct anv_address addr = anv_address_add(buffer->address, offset);
807 
808    genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
809 }
810 
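/* Allocate and fill a RT_DISPATCH_GLOBALS for ray queries and return its
 * address (Gfx12.5+ only).
 */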
811 struct anv_address
812 genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
813 {
814 #if GFX_VERx10 >= 125
815    struct anv_device *device = cmd_buffer->device;
816 
817    struct anv_state state =
818       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
819                                            BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
820    struct brw_rt_scratch_layout layout;
821    uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
822                                        * some cases?
823                                        */
824    brw_rt_compute_scratch_layout(&layout, device->info,
825                                  stack_ids_per_dss, 1 << 10);
826 
827    uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
828 
829    const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
830       .MemBaseAddress = (struct anv_address) {
831          /* The ray query HW computes offsets from the top of the buffer, so
832           * point the address at the end of the buffer.
833           */
834          .bo = device->ray_query_bo[idx],
835          .offset = device->ray_query_bo[idx]->size
836       },
837       .AsyncRTStackSize = layout.ray_stack_stride / 64,
838       .NumDSSRTStacks = layout.stack_ids_per_dss,
839       .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
840       .Flags = RT_DEPTH_TEST_LESS_EQUAL,
841       .ResumeShaderTable = (struct anv_address) {
842          .bo = cmd_buffer->state.ray_query_shadow_bo,
843       },
844    };
845    GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);
846 
847    return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
848 #else
849    unreachable("Not supported");
850 #endif
851 }
852 
853 #if GFX_VERx10 >= 125
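/* Dispatch an internal kernel (not a Vulkan pipeline), packing its sysvals
 * and kernel arguments into general state and using them as the
 * COMPUTE_WALKER indirect data.
 */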
854 void
855 genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
856                                  struct anv_kernel *kernel,
857                                  const uint32_t *global_size,
858                                  uint32_t arg_count,
859                                  const struct anv_kernel_arg *args)
860 {
861    const struct intel_device_info *devinfo = cmd_buffer->device->info;
862    const struct brw_cs_prog_data *cs_prog_data =
863       brw_cs_prog_data_const(kernel->bin->prog_data);
864 
865    genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);
866 
867    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
868 
869    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
870 
871    /* Apply any pending pipeline flushes we may have.  We want to apply them
872     * now because, if any of those flushes are for things like push constants,
873     * the GPU will read the state at weird times.
874     */
875    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
876 
877    uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
878    indirect_data_size += kernel->bin->bind_map.kernel_args_size;
879    indirect_data_size = ALIGN(indirect_data_size, 64);
880    struct anv_state indirect_data =
881       anv_cmd_buffer_alloc_general_state(cmd_buffer,
882                                          indirect_data_size, 64);
883    memset(indirect_data.map, 0, indirect_data.alloc_size);
884 
885    struct brw_kernel_sysvals sysvals = {};
886    if (global_size != NULL) {
887       for (unsigned i = 0; i < 3; i++)
888          sysvals.num_work_groups[i] = global_size[i];
889       memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
890    } else {
891       struct anv_address sysvals_addr = {
892          .bo = NULL, /* General state buffer is always 0. */
893          .offset = indirect_data.offset,
894       };
895 
896       compute_store_indirect_params(cmd_buffer, sysvals_addr);
897    }
898 
899    void *args_map = indirect_data.map + sizeof(sysvals);
900    for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
901       struct brw_kernel_arg_desc *arg_desc =
902          &kernel->bin->bind_map.kernel_args[i];
903       assert(i < arg_count);
904       const struct anv_kernel_arg *arg = &args[i];
905       if (arg->is_ptr) {
906          memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
907       } else {
908          assert(arg_desc->size <= sizeof(arg->u64));
909          memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
910       }
911    }
912 
913    struct intel_cs_dispatch_info dispatch =
914       brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
915 
916    struct GENX(COMPUTE_WALKER_BODY) body = {
917       .SIMDSize                       = dispatch.simd_size / 16,
918       .MessageSIMD                    = dispatch.simd_size / 16,
919       .IndirectDataStartAddress       = indirect_data.offset,
920       .IndirectDataLength             = indirect_data.alloc_size,
921       .LocalXMaximum                  = cs_prog_data->local_size[0] - 1,
922       .LocalYMaximum                  = cs_prog_data->local_size[1] - 1,
923       .LocalZMaximum                  = cs_prog_data->local_size[2] - 1,
924       .ExecutionMask                  = dispatch.right_mask,
925       .PostSync.MOCS                  = cmd_buffer->device->isl_dev.mocs.internal,
926       .InterfaceDescriptor =
927          get_interface_descriptor_data(cmd_buffer,
928                                        kernel->bin,
929                                        cs_prog_data,
930                                        &dispatch),
931    };
932 
933    if (global_size != NULL) {
934       body.ThreadGroupIDXDimension     = global_size[0];
935       body.ThreadGroupIDYDimension     = global_size[1];
936       body.ThreadGroupIDZDimension     = global_size[2];
937    }
938 
939    cmd_buffer->state.last_compute_walker =
940       anv_batch_emitn(
941          &cmd_buffer->batch,
942          GENX(COMPUTE_WALKER_length),
943          GENX(COMPUTE_WALKER),
944          .IndirectParameterEnable = global_size == NULL,
945          .PredicateEnable = false,
946          .body = body,
947       );
948 
949    /* We just blew away the compute pipeline state */
950    cmd_buffer->state.compute.pipeline_dirty = true;
951 }
952 
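/* Pick per-dimension local size shifts for a TraceRays dispatch so that the
 * total local size is 2^3 invocations, assigning any leftover doublings to
 * the X dimension.
 */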
953 static void
954 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
955 {
956    unsigned total_shift = 0;
957    memset(local_shift, 0, 3);
958 
959    bool progress;
960    do {
961       progress = false;
962       for (unsigned i = 0; i < 3; i++) {
963          assert(global[i] > 0);
964          if ((1 << local_shift[i]) < global[i]) {
965             progress = true;
966             local_shift[i]++;
967             total_shift++;
968          }
969 
970          if (total_shift == 3)
971             return;
972       }
973    } while(progress);
974 
975    /* Assign whatever's left to x */
976    local_shift[0] += 3 - total_shift;
977 }
978 
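/* Convert a VkStridedDeviceAddressRegionKHR into the RT_SHADER_TABLE layout
 * used by RT_DISPATCH_GLOBALS.
 */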
979 static struct GENX(RT_SHADER_TABLE)
980 vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
981 {
982    return (struct GENX(RT_SHADER_TABLE)) {
983       .BaseAddress = anv_address_from_u64(region->deviceAddress),
984       .Stride = region->stride,
985    };
986 }
987 
988 struct trace_params {
989    /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
990     * with mi_builder.
991     */
992    bool is_sbt_indirect;
993    const VkStridedDeviceAddressRegionKHR *raygen_sbt;
994    const VkStridedDeviceAddressRegionKHR *miss_sbt;
995    const VkStridedDeviceAddressRegionKHR *hit_sbt;
996    const VkStridedDeviceAddressRegionKHR *callable_sbt;
997 
998    /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
999    uint64_t indirect_sbts_addr;
1000 
1001    /* If is_launch_size_indirect, use launch_size_addr to program the dispatch size. */
1002    bool is_launch_size_indirect;
1003    uint32_t launch_size[3];
1004 
1005    /* A pointer to a uint32_t[3] */
1006    uint64_t launch_size_addr;
1007 };
1008 
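/* Emit a fully known RT_DISPATCH_GLOBALS (direct SBTs and launch size) into
 * temporary state.
 */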
1009 static struct anv_state
1010 cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
1011                                     struct trace_params *params)
1012 {
1013    assert(!params->is_sbt_indirect);
1014    assert(params->miss_sbt != NULL);
1015    assert(params->hit_sbt != NULL);
1016    assert(params->callable_sbt != NULL);
1017 
1018    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
1019 
1020    struct anv_state rtdg_state =
1021       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
1022                                            BRW_RT_PUSH_CONST_OFFSET +
1023                                            sizeof(struct anv_push_constants),
1024                                            64);
1025 
1026    struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
1027       .MemBaseAddress     = (struct anv_address) {
1028          .bo = rt->scratch.bo,
1029          .offset = rt->scratch.layout.ray_stack_start,
1030       },
1031       .CallStackHandler   = anv_shader_bin_get_bsr(
1032          cmd_buffer->device->rt_trivial_return, 0),
1033       .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
1034       .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
1035       .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
1036       .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
1037       .HitGroupTable      = vk_sdar_to_shader_table(params->hit_sbt),
1038       .MissGroupTable     = vk_sdar_to_shader_table(params->miss_sbt),
1039       .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
1040       .LaunchWidth        = params->launch_size[0],
1041       .LaunchHeight       = params->launch_size[1],
1042       .LaunchDepth        = params->launch_size[2],
1043       .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
1044    };
1045    GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
1046 
1047    return rtdg_state;
1048 }
1049 
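/* Combine an SBT address and stride read from the indirect buffer into a
 * single 64-bit RT_SHADER_TABLE entry (address in the low bits, stride in
 * the high bits) using the MI builder.
 */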
1050 static struct mi_value
1051 mi_build_sbt_entry(struct mi_builder *b,
1052                    uint64_t addr_field_addr,
1053                    uint64_t stride_field_addr)
1054 {
1055    return mi_ior(b,
1056                  mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
1057                             mi_imm(BITFIELD64_BIT(49) - 1)),
1058                  mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
1059                                 48));
1060 }
1061 
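/* Emit a RT_DISPATCH_GLOBALS whose SBT fields are filled at execution time
 * from a VkTraceRaysIndirectCommand2KHR using the MI builder.
 */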
1062 static struct anv_state
1063 cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
1064                                              struct trace_params *params)
1065 {
1066    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
1067 
1068    struct anv_state rtdg_state =
1069       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
1070                                            BRW_RT_PUSH_CONST_OFFSET +
1071                                            sizeof(struct anv_push_constants),
1072                                            64);
1073 
1074    struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
1075       .MemBaseAddress     = (struct anv_address) {
1076          .bo = rt->scratch.bo,
1077          .offset = rt->scratch.layout.ray_stack_start,
1078       },
1079       .CallStackHandler   = anv_shader_bin_get_bsr(
1080          cmd_buffer->device->rt_trivial_return, 0),
1081       .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
1082       .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
1083       .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
1084       .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
1085       .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
1086    };
1087    GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
1088 
1089    struct anv_address rtdg_addr =
1090       anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
1091 
1092    struct mi_builder b;
1093    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1094    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
1095    mi_builder_set_mocs(&b, mocs);
1096    mi_builder_set_write_check(&b, true);
1097 
1098    /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
1099     * RT_DISPATCH_GLOBALS using the mi_builder.
1100     */
1101    mi_store(&b,
1102             mi_mem64(
1103                anv_address_add(
1104                   rtdg_addr,
1105                   GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
1106             mi_build_sbt_entry(&b,
1107                                params->indirect_sbts_addr +
1108                                offsetof(VkTraceRaysIndirectCommand2KHR,
1109                                         missShaderBindingTableAddress),
1110                                params->indirect_sbts_addr +
1111                                offsetof(VkTraceRaysIndirectCommand2KHR,
1112                                         missShaderBindingTableStride)));
1113    mi_store(&b,
1114             mi_mem64(
1115                anv_address_add(
1116                   rtdg_addr,
1117                   GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
1118             mi_build_sbt_entry(&b,
1119                                params->indirect_sbts_addr +
1120                                offsetof(VkTraceRaysIndirectCommand2KHR,
1121                                         hitShaderBindingTableAddress),
1122                                params->indirect_sbts_addr +
1123                                offsetof(VkTraceRaysIndirectCommand2KHR,
1124                                         hitShaderBindingTableStride)));
1125    mi_store(&b,
1126             mi_mem64(
1127                anv_address_add(
1128                   rtdg_addr,
1129                   GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
1130             mi_build_sbt_entry(&b,
1131                                params->indirect_sbts_addr +
1132                                offsetof(VkTraceRaysIndirectCommand2KHR,
1133                                         callableShaderBindingTableAddress),
1134                                params->indirect_sbts_addr +
1135                                offsetof(VkTraceRaysIndirectCommand2KHR,
1136                                         callableShaderBindingTableStride)));
1137 
1138    return rtdg_state;
1139 }
1140 
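/* Common implementation for the trace-rays entry points: sets up
 * RT_DISPATCH_GLOBALS and push constants and computes the dispatch size
 * (directly, or via MI commands for indirect launches) before dispatching.
 */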
1141 static void
1142 cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
1143                       struct trace_params *params)
1144 {
1145    struct anv_device *device = cmd_buffer->device;
1146    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
1147    struct anv_ray_tracing_pipeline *pipeline =
1148       anv_pipeline_to_ray_tracing(rt->base.pipeline);
1149 
1150    if (anv_batch_has_error(&cmd_buffer->batch))
1151       return;
1152 
1153    /* If we have a known degenerate launch size, just bail */
1154    if (!params->is_launch_size_indirect &&
1155        (params->launch_size[0] == 0 ||
1156         params->launch_size[1] == 0 ||
1157         params->launch_size[2] == 0))
1158       return;
1159 
1160    trace_intel_begin_rays(&cmd_buffer->trace);
1161 
1162    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
1163 
1164    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
1165 
1166    genX(flush_descriptor_buffers)(cmd_buffer, &rt->base);
1167 
1168    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
1169 
1170    cmd_buffer->state.rt.pipeline_dirty = false;
1171 
1172    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1173 
1174    genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
1175                                            &cmd_buffer->state.rt.base,
1176                                            &pipeline->base);
1177 
1178    /* Add these to the reloc list manually, as they're internal buffers that
1179     * don't have any relocs to pick them up.
1180     *
1181     * TODO(RT): This is a bit of a hack
1182     */
1183    anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
1184                          rt->scratch.bo);
1185    anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
1186                          cmd_buffer->device->btd_fifo_bo);
1187 
1188    /* Allocate and set up our RT_DISPATCH_GLOBALS */
1189    struct anv_state rtdg_state =
1190       params->is_sbt_indirect ?
1191       cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
1192       cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);
1193 
1194    assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
1195                                     sizeof(struct anv_push_constants)));
1196    assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
1197    /* Push constants go after the RT_DISPATCH_GLOBALS */
1198    memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
1199           &cmd_buffer->state.rt.base.push_constants,
1200           sizeof(struct anv_push_constants));
1201 
1202    struct anv_address rtdg_addr =
1203       anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
1204 
1205    uint8_t local_size_log2[3];
1206    uint32_t global_size[3] = {};
1207    if (params->is_launch_size_indirect) {
1208       /* Pick a local size that's probably ok.  We assume most TraceRays calls
1209        * will use a two-dimensional dispatch size.  Worst case, our initial
1210        * dispatch will be a little slower than it has to be.
1211        */
1212       local_size_log2[0] = 2;
1213       local_size_log2[1] = 1;
1214       local_size_log2[2] = 0;
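      /* Note: log2 sizes of (2, 1, 0) give a 4x2x1 local workgroup, i.e.
       * 8 invocations per group, matching the two-dimensional assumption
       * in the comment above.
       */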
1215 
1216       struct mi_builder b;
1217       mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1218       const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
1219       mi_builder_set_mocs(&b, mocs);
1220       mi_builder_set_write_check(&b, true);
1221 
1222       struct mi_value launch_size[3] = {
1223          mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
1224          mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
1225          mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
1226       };
1227 
1228       /* Store the original launch size into RT_DISPATCH_GLOBALS */
1229       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
1230                                             GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
1231                mi_value_ref(&b, launch_size[0]));
1232       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
1233                                             GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
1234                mi_value_ref(&b, launch_size[1]));
1235       mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
1236                                             GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
1237                mi_value_ref(&b, launch_size[2]));
1238 
1239       /* Compute the global dispatch size */
1240       for (unsigned i = 0; i < 3; i++) {
1241          if (local_size_log2[i] == 0)
1242             continue;
1243 
1244          /* global_size = DIV_ROUND_UP(launch_size, local_size)
1245           *
1246           * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm
1247           * has the semantics of shifting the entire 64-bit value and taking
1248           * the bottom 32 bits, so we don't have to worry about roll-over.
1249           */
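         /* Worked example: with launch_size[i] = 1000 and local_size = 4,
          * the add-then-shift below computes (1000 + 3) >> 2 = 250, which
          * matches DIV_ROUND_UP(1000, 4).  Because the add happens in
          * 64 bits, values near UINT32_MAX cannot wrap before the shift.
          */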
1250          uint32_t local_size = 1 << local_size_log2[i];
1251          launch_size[i] = mi_iadd(&b, launch_size[i],
1252                                       mi_imm(local_size - 1));
1253          launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
1254                                             local_size_log2[i]);
1255       }
1256 
1257       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
1258       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
1259       mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);
1260 
1261    } else {
1262       calc_local_trace_size(local_size_log2, params->launch_size);
1263 
1264       for (unsigned i = 0; i < 3; i++) {
1265          /* We have to be a bit careful here because DIV_ROUND_UP adds to
1266           * the numerator, which may overflow.  Cast to uint64_t to avoid this.
1267           */
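         /* For example, with launch_size[i] = 0xFFFFFFFF and local_size = 4,
          * a 32-bit numerator would wrap to 2 and yield 0, while the 64-bit
          * math gives the correct 0x40000000.
          */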
1268          uint32_t local_size = 1 << local_size_log2[i];
1269          global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
1270       }
1271    }
1272 
1273 #if GFX_VERx10 == 125
1274    /* Wa_14014427904 - We need additional invalidate/flush when
1275     * emitting NP state commands with ATS-M in compute mode.
1276     */
1277    if (intel_device_info_is_atsm(device->info) &&
1278       cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
1279       genx_batch_emit_pipe_control(&cmd_buffer->batch,
1280                                    cmd_buffer->device->info,
1281                                    cmd_buffer->state.current_pipeline,
1282                                    ANV_PIPE_CS_STALL_BIT |
1283                                    ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
1284                                    ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
1285                                    ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
1286                                    ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
1287                                    ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
1288                                    ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
1289    }
1290 #endif
1291 
1292    anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
1293       /* TODO: This is the timeout after which the bucketed thread dispatcher
1294        *       will kick off a wave of threads. We go with the lowest value
1295        *       for now. It could be tweaked on a per application basis
1296        *       (drirc).
1297        */
1298       btd.DispatchTimeoutCounter = _64clocks;
1299       /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
1300        *               buffer must be 128KB."
1301        */
1302       btd.PerDSSMemoryBackedBufferSize = 6;
1303       btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
1304       if (pipeline->base.scratch_size > 0) {
1305          struct anv_bo *scratch_bo =
1306             anv_scratch_pool_alloc(device,
1307                                    &device->scratch_pool,
1308                                    MESA_SHADER_COMPUTE,
1309                                    pipeline->base.scratch_size);
1310          anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
1311                                scratch_bo);
1312          uint32_t scratch_surf =
1313             anv_scratch_pool_get_surf(cmd_buffer->device,
1314                                       &device->scratch_pool,
1315                                       pipeline->base.scratch_size);
1316          btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
1317       }
1318 #if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
1319       btd.BTDMidthreadpreemption = false;
1320 #endif
1321    }
1322 
1323    genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);
1324 
1325    const struct brw_cs_prog_data *cs_prog_data =
1326       brw_cs_prog_data_const(device->rt_trampoline->prog_data);
1327    struct intel_cs_dispatch_info dispatch =
1328       brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);
1329 
1330    const gl_shader_stage s = MESA_SHADER_RAYGEN;
1331    struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
1332    struct anv_state *samplers = &cmd_buffer->state.samplers[s];
1333    struct brw_rt_raygen_trampoline_params trampoline_params = {
1334       .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
1335       .raygen_bsr_addr =
1336          params->is_sbt_indirect ?
1337          (params->indirect_sbts_addr +
1338           offsetof(VkTraceRaysIndirectCommand2KHR,
1339                    raygenShaderRecordAddress)) :
1340          params->raygen_sbt->deviceAddress,
1341       .is_indirect = params->is_sbt_indirect,
1342       .local_group_size_log2 = {
1343          local_size_log2[0],
1344          local_size_log2[1],
1345          local_size_log2[2],
1346       },
1347    };
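   /* Note: with an indirect SBT, raygen_bsr_addr is the address of the
    * raygenShaderRecordAddress field inside VkTraceRaysIndirectCommand2KHR
    * and the trampoline dereferences it at dispatch time; otherwise it is
    * the raygen SBT device address itself.
    */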
1348 
1349    struct GENX(COMPUTE_WALKER_BODY) body = {
1350       .SIMDSize                       = dispatch.simd_size / 16,
1351       .MessageSIMD                    = dispatch.simd_size / 16,
1352       .LocalXMaximum                  = (1 << local_size_log2[0]) - 1,
1353       .LocalYMaximum                  = (1 << local_size_log2[1]) - 1,
1354       .LocalZMaximum                  = (1 << local_size_log2[2]) - 1,
1355       .ThreadGroupIDXDimension        = global_size[0],
1356       .ThreadGroupIDYDimension        = global_size[1],
1357       .ThreadGroupIDZDimension        = global_size[2],
1358       .ExecutionMask                  = 0xff,
1359       .EmitInlineParameter            = true,
1360       .PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0),
1361 
1362       .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
1363          .KernelStartPointer = device->rt_trampoline->kernel.offset,
1364          .SamplerStatePointer = samplers->offset,
1365          /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
1366          .SamplerCount = 0,
1367          .BindingTablePointer = surfaces->offset,
1368          .NumberofThreadsinGPGPUThreadGroup = 1,
1369          .BTDMode = true,
1370 #if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
1371          .ThreadPreemption = false,
1372 #endif
1373       },
1374    };
1375 
1376    STATIC_ASSERT(sizeof(trampoline_params) == 32);
1377    memcpy(body.InlineData, &trampoline_params, sizeof(trampoline_params));
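   /* The 32 bytes of trampoline parameters are carried as COMPUTE_WALKER
    * inline data (EmitInlineParameter above), so the trampoline shader
    * receives them directly with the dispatch.
    */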
1378 
1379    cmd_buffer->state.last_compute_walker =
1380       anv_batch_emitn(
1381          &cmd_buffer->batch,
1382          GENX(COMPUTE_WALKER_length),
1383          GENX(COMPUTE_WALKER),
1384          .IndirectParameterEnable  = params->is_launch_size_indirect,
1385          .PredicateEnable          = cmd_buffer->state.conditional_render_enabled,
1386          .body                     = body,
1387       );
1388 
1389    trace_intel_end_rays(&cmd_buffer->trace,
1390                         params->launch_size[0],
1391                         params->launch_size[1],
1392                         params->launch_size[2]);
1393 }
1394 
1395 void
1396 genX(CmdTraceRaysKHR)(
1397     VkCommandBuffer                             commandBuffer,
1398     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
1399     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
1400     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
1401     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
1402     uint32_t                                    width,
1403     uint32_t                                    height,
1404     uint32_t                                    depth)
1405 {
1406    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1407    struct trace_params params = {
1408       .is_sbt_indirect         = false,
1409       .raygen_sbt              = pRaygenShaderBindingTable,
1410       .miss_sbt                = pMissShaderBindingTable,
1411       .hit_sbt                 = pHitShaderBindingTable,
1412       .callable_sbt            = pCallableShaderBindingTable,
1413       .is_launch_size_indirect = false,
1414       .launch_size             = {
1415          width,
1416          height,
1417          depth,
1418       },
1419    };
1420 
1421    cmd_buffer_trace_rays(cmd_buffer, &params);
1422 }
1423 
1424 void
1425 genX(CmdTraceRaysIndirectKHR)(
1426     VkCommandBuffer                             commandBuffer,
1427     const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
1428     const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
1429     const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
1430     const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
1431     VkDeviceAddress                             indirectDeviceAddress)
1432 {
1433    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1434    struct trace_params params = {
1435       .is_sbt_indirect         = false,
1436       .raygen_sbt              = pRaygenShaderBindingTable,
1437       .miss_sbt                = pMissShaderBindingTable,
1438       .hit_sbt                 = pHitShaderBindingTable,
1439       .callable_sbt            = pCallableShaderBindingTable,
1440       .is_launch_size_indirect = true,
1441       .launch_size_addr        = indirectDeviceAddress,
1442    };
1443 
1444    cmd_buffer_trace_rays(cmd_buffer, &params);
1445 }
1446 
1447 void
1448 genX(CmdTraceRaysIndirect2KHR)(
1449     VkCommandBuffer                             commandBuffer,
1450     VkDeviceAddress                             indirectDeviceAddress)
1451 {
1452    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
1453    struct trace_params params = {
1454       .is_sbt_indirect         = true,
1455       .indirect_sbts_addr      = indirectDeviceAddress,
1456       .is_launch_size_indirect = true,
1457       .launch_size_addr        = indirectDeviceAddress +
1458                                  offsetof(VkTraceRaysIndirectCommand2KHR, width),
1459    };
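   /* Note: width, height and depth are three consecutive uint32_t members at
    * the end of VkTraceRaysIndirectCommand2KHR, so this single base address
    * covers all three launch-size dwords read by the indirect launch path.
    */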
1460 
1461    cmd_buffer_trace_rays(cmd_buffer, &params);
1462 }
1463 
1464 #endif /* GFX_VERx10 >= 125 */
1465