1 /*
2  * Copyright © 2015 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stdbool.h>
26 
27 #include "anv_private.h"
28 #include "anv_measure.h"
29 
30 #include "common/intel_compute_slm.h"
31 #include "genxml/gen_macros.h"
32 #include "genxml/genX_pack.h"
33 #include "genxml/genX_rt_pack.h"
34 #include "common/intel_genX_state_brw.h"
35 
36 #include "ds/intel_tracepoints.h"
37 
38 #include "genX_mi_builder.h"
39 
40 void
41 genX(cmd_buffer_ensure_cfe_state)(struct anv_cmd_buffer *cmd_buffer,
42                                   uint32_t total_scratch)
43 {
44 #if GFX_VERx10 >= 125
45    assert(cmd_buffer->state.current_pipeline == GPGPU);
46 
47    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
48 
49    if (total_scratch <= comp_state->scratch_size)
50       return;
51 
52    const struct intel_device_info *devinfo = cmd_buffer->device->info;
53    anv_batch_emit(&cmd_buffer->batch, GENX(CFE_STATE), cfe) {
54       cfe.MaximumNumberofThreads = devinfo->max_cs_threads * devinfo->subslice_total;
55 
56       uint32_t scratch_surf;
57       struct anv_scratch_pool *scratch_pool =
58          (cmd_buffer->vk.pool->flags & VK_COMMAND_POOL_CREATE_PROTECTED_BIT) ?
59           &cmd_buffer->device->protected_scratch_pool :
60           &cmd_buffer->device->scratch_pool;
61       struct anv_bo *scratch_bo =
62             anv_scratch_pool_alloc(cmd_buffer->device, scratch_pool,
63                                    MESA_SHADER_COMPUTE,
64                                    total_scratch);
65       anv_reloc_list_add_bo(cmd_buffer->batch.relocs, scratch_bo);
66       scratch_surf = anv_scratch_pool_get_surf(cmd_buffer->device, scratch_pool,
67                                                total_scratch);
68       cfe.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
69 #if GFX_VER >= 20
70       switch (cmd_buffer->device->physical->instance->stack_ids) {
71       case 256:  cfe.StackIDControl = StackIDs256;  break;
72       case 512:  cfe.StackIDControl = StackIDs512;  break;
73       case 1024: cfe.StackIDControl = StackIDs1024; break;
74       case 2048: cfe.StackIDControl = StackIDs2048; break;
75       default:   unreachable("invalid stack_ids value");
76       }
77 
78 #if INTEL_WA_14021821874_GFX_VER || INTEL_WA_14018813551_GFX_VER
79       /* Wa_14021821874, Wa_14018813551:
80        *
81        * "StackIDControlOverride_RTGlobals = 0 (i.e. 2k)". We already set
82        * the stack size per ray to 64 in brw_nir_lower_rt_intrinsics, which
83        * the workaround also requires.
84        */
85       if (intel_needs_workaround(cmd_buffer->device->info, 14021821874) ||
86           intel_needs_workaround(cmd_buffer->device->info, 14018813551))
87          cfe.StackIDControl = StackIDs2048;
88 #endif
89 
90 #endif
91 
92       cfe.OverDispatchControl = 2; /* 50% overdispatch */
93    }
94 
95    comp_state->scratch_size = total_scratch;
96 #else
97    unreachable("Invalid call");
98 #endif
99 }
100 
101 static void
102 genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
103 {
104    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
105    struct anv_compute_pipeline *pipeline =
106       anv_pipeline_to_compute(comp_state->base.pipeline);
107    const UNUSED struct intel_device_info *devinfo = cmd_buffer->device->info;
108 
109    assert(pipeline->cs);
110 
111    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
112 
113    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
114 
115    genX(flush_descriptor_buffers)(cmd_buffer, &comp_state->base);
116 
117    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
118 
119    /* Apply any pending pipeline flushes we may have.  We want to apply them
120     * now because, if any of those flushes are for things like push constants,
121     * the GPU will read the state at weird times.
122     */
123    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
124 
125    if (cmd_buffer->state.compute.pipeline_dirty) {
126 #if GFX_VERx10 < 125
127       /* From the Sky Lake PRM Vol 2a, MEDIA_VFE_STATE:
128        *
129        *    "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
130        *    the only bits that are changed are scoreboard related: Scoreboard
131        *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
132        *    these scoreboard related states, a MEDIA_STATE_FLUSH is
133        *    sufficient."
134        */
135       anv_add_pending_pipe_bits(cmd_buffer,
136                               ANV_PIPE_CS_STALL_BIT,
137                               "flush compute state");
138       genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
139 #endif
140 
141       anv_batch_emit_batch(&cmd_buffer->batch, &pipeline->base.batch);
142 
143 #if GFX_VERx10 >= 125
144       const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
145       genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, prog_data->base.total_scratch);
146 #endif
147 
148       /* Changing the pipeline affects the push constants layout (different
149        * amounts of cross-thread/per-thread allocations). The allocation is
150        * also bounded to just the amount consumed by the pipeline (see
151        * anv_cmd_buffer_cs_push_constants). So we force the reallocation for
152        * every pipeline change.
153        *
154        * On Gfx12.0 we're also seeing failures in the dEQP-VK.memory_model.*
155        * tests when run in parallel. This is likely a HW issue with push
156        * constants & context save/restore.
157        *
158        * TODO: optimize this on Gfx12.5+ where the shader does not use
159        * per-thread allocations and pulls the data using SEND messages.
160        * We should be able to limit reallocations to only when the data actually
161        * changes.
162        */
163       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
164       comp_state->base.push_constants_data_dirty = true;
165    }
166 
167    cmd_buffer->state.descriptors_dirty |=
168       genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
169                                               &cmd_buffer->state.compute.base,
170                                               &pipeline->base);
171 
172    if ((cmd_buffer->state.descriptors_dirty & VK_SHADER_STAGE_COMPUTE_BIT) ||
173        cmd_buffer->state.compute.pipeline_dirty) {
174       genX(cmd_buffer_flush_descriptor_sets)(cmd_buffer,
175                                              &cmd_buffer->state.compute.base,
176                                              VK_SHADER_STAGE_COMPUTE_BIT,
177                                              &pipeline->cs, 1);
178       cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
179 
180 #if GFX_VERx10 < 125
181       uint32_t iface_desc_data_dw[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
182       struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
183          .BindingTablePointer =
184             cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
185          .SamplerStatePointer =
186             cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
187       };
188       GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL, iface_desc_data_dw, &desc);
189 
190       struct anv_state state =
191          anv_cmd_buffer_merge_dynamic(cmd_buffer, iface_desc_data_dw,
192                                       pipeline->interface_descriptor_data,
193                                       GENX(INTERFACE_DESCRIPTOR_DATA_length),
194                                       64);
195 
196       uint32_t size = GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
197       anv_batch_emit(&cmd_buffer->batch,
198                      GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), mid) {
199          mid.InterfaceDescriptorTotalLength        = size;
200          mid.InterfaceDescriptorDataStartAddress   = state.offset;
201       }
202 #endif
203    }
204 
205    if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) {
206 
207       if (comp_state->base.push_constants_state.alloc_size == 0 ||
208           comp_state->base.push_constants_data_dirty) {
209          comp_state->base.push_constants_state =
210             anv_cmd_buffer_cs_push_constants(cmd_buffer);
211          comp_state->base.push_constants_data_dirty = false;
212       }
213 
214 #if GFX_VERx10 < 125
215       if (comp_state->base.push_constants_state.alloc_size) {
216          anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) {
217             curbe.CURBETotalDataLength    = comp_state->base.push_constants_state.alloc_size;
218             curbe.CURBEDataStartAddress   = comp_state->base.push_constants_state.offset;
219          }
220       }
221 #endif
222 
223       cmd_buffer->state.push_constants_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT;
224    }
225 
226    cmd_buffer->state.compute.pipeline_dirty = false;
227 
228    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
229 }
230 
231 static void
232 anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
233                                const struct brw_cs_prog_data *prog_data,
234                                uint32_t baseGroupX,
235                                uint32_t baseGroupY,
236                                uint32_t baseGroupZ,
237                                uint32_t groupCountX,
238                                uint32_t groupCountY,
239                                uint32_t groupCountZ,
240                                struct anv_address indirect_group)
241 {
242    if (anv_batch_has_error(&cmd_buffer->batch))
243       return;
244 
245    struct anv_push_constants *push =
246       &cmd_buffer->state.compute.base.push_constants;
247    bool updated = false;
248    if (push->cs.base_work_group_id[0] != baseGroupX ||
249        push->cs.base_work_group_id[1] != baseGroupY ||
250        push->cs.base_work_group_id[2] != baseGroupZ) {
251       push->cs.base_work_group_id[0] = baseGroupX;
252       push->cs.base_work_group_id[1] = baseGroupY;
253       push->cs.base_work_group_id[2] = baseGroupZ;
254       updated = true;
255    }
256 
257    /* On Gfx12.5+ this value goes into the inline parameter register */
258    if (GFX_VERx10 < 125 && prog_data->uses_num_work_groups) {
259       if (anv_address_is_null(indirect_group)) {
260          if (push->cs.num_work_groups[0] != groupCountX ||
261              push->cs.num_work_groups[1] != groupCountY ||
262              push->cs.num_work_groups[2] != groupCountZ) {
263             push->cs.num_work_groups[0] = groupCountX;
264             push->cs.num_work_groups[1] = groupCountY;
265             push->cs.num_work_groups[2] = groupCountZ;
266             updated = true;
267          }
268       } else {
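         /* Indirect dispatch: instead of literal group counts, push a
          * UINT32_MAX sentinel in slot 0 and the 64-bit address of the
          * indirect parameters split across slots 1 and 2, so the shader can
          * read the real counts at dispatch time.
          */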
269          uint64_t addr64 = anv_address_physical(indirect_group);
270          uint32_t lower_addr32 = addr64 & 0xffffffff;
271          uint32_t upper_addr32 = addr64 >> 32;
272          if (push->cs.num_work_groups[0] != UINT32_MAX ||
273              push->cs.num_work_groups[1] != lower_addr32 ||
274              push->cs.num_work_groups[2] != upper_addr32) {
275             push->cs.num_work_groups[0] = UINT32_MAX;
276             push->cs.num_work_groups[1] = lower_addr32;
277             push->cs.num_work_groups[2] = upper_addr32;
278             updated = true;
279          }
280       }
281    }
282 
283    if (updated) {
284       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
285       cmd_buffer->state.compute.base.push_constants_data_dirty = true;
286    }
287 }
288 
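/* MMIO offsets of the GPGPU dispatch dimension registers.  Indirect walkers
 * consume their thread group counts from these registers after we load them
 * with MI commands below.
 */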
289 #define GPGPU_DISPATCHDIMX 0x2500
290 #define GPGPU_DISPATCHDIMY 0x2504
291 #define GPGPU_DISPATCHDIMZ 0x2508
292 
293 static void
294 compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
295                              const struct anv_address indirect_addr,
296                              bool is_unaligned_size_x)
297 {
298    struct mi_builder b;
299    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
300 
301    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
302 
303    /* Convert the unaligned thread invocation count into an aligned thread
304     * group count in X for unaligned shader dispatches during ray tracing.
305     */
306    if (is_unaligned_size_x) {
307       const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
308       mi_builder_set_mocs(&b, mocs);
309 
310       struct anv_compute_pipeline *pipeline =
311          anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
312       const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
313 
314       assert(util_is_power_of_two_or_zero(prog_data->local_size[0]));
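      /* group_count_x = (invocations_x / local_size_x) + 1; the extra group
       * covers the remaining (unaligned) invocations.
       */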
315       size_x = mi_udiv32_imm(&b, size_x, prog_data->local_size[0]);
316       size_x = mi_iadd(&b, size_x, mi_imm(1));
317    }
318 
319    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
320    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
321 
322    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), size_x);
323    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), size_y);
324    mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), size_z);
325 }
326 
327 static void
328 compute_store_indirect_params(struct anv_cmd_buffer *cmd_buffer,
329                              const struct anv_address indirect_addr)
330 {
331    struct mi_builder b;
332    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
333 
334    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
335    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
336    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
337 
338    mi_store(&b, size_x, mi_reg32(GPGPU_DISPATCHDIMX));
339    mi_store(&b, size_y, mi_reg32(GPGPU_DISPATCHDIMY));
340    mi_store(&b, size_z, mi_reg32(GPGPU_DISPATCHDIMZ));
341 }
342 
343 
344 #if GFX_VERx10 >= 125
345 
346 static inline struct GENX(INTERFACE_DESCRIPTOR_DATA)
347 get_interface_descriptor_data(struct anv_cmd_buffer *cmd_buffer,
348                               const struct anv_shader_bin *shader,
349                               const struct brw_cs_prog_data *prog_data,
350                               const struct intel_cs_dispatch_info *dispatch)
351 {
352    const struct intel_device_info *devinfo = cmd_buffer->device->info;
353 
354    return (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
355       .SamplerCount = DIV_ROUND_UP(CLAMP(shader->bind_map.sampler_count, 0, 16), 4),
356       .KernelStartPointer = shader->kernel.offset,
357       .SamplerStatePointer = cmd_buffer->state.samplers[MESA_SHADER_COMPUTE].offset,
358       .BindingTablePointer = cmd_buffer->state.binding_tables[MESA_SHADER_COMPUTE].offset,
359       /* Typically set to 0 to avoid prefetching on every thread dispatch. */
360       .BindingTableEntryCount = devinfo->verx10 == 125 ?
361          0 : MIN2(shader->bind_map.surface_count, 30),
362       .NumberofThreadsinGPGPUThreadGroup = dispatch->threads,
363       .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared),
364       .PreferredSLMAllocationSize =
365          intel_compute_preferred_slm_calc_encode_size(devinfo,
366                                                       prog_data->base.total_shared,
367                                                       dispatch->group_size,
368                                                       dispatch->simd_size),
369       .NumberOfBarriers = prog_data->uses_barrier,
370 #if GFX_VER >= 30
371       .RegistersPerThread = ptl_register_blocks(prog_data->base.grf_used),
372 #endif
373    };
374 }
375 
376 static inline void
377 emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
378                              const struct anv_shader_bin *shader,
379                              const struct brw_cs_prog_data *prog_data,
380                              struct anv_address indirect_addr)
381 {
382    const struct intel_device_info *devinfo = cmd_buffer->device->info;
383    assert(devinfo->has_indirect_unroll);
384 
385    struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
386    bool predicate = cmd_buffer->state.conditional_render_enabled;
387 
388    const struct intel_cs_dispatch_info dispatch =
389       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
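   /* dispatch.simd_size / 16 yields the walker SIMDSize encoding
    * (SIMD16 -> 1, SIMD32 -> 2).
    */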
390    const int dispatch_size = dispatch.simd_size / 16;
391 
392    uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
393 
394    struct GENX(COMPUTE_WALKER_BODY) body =  {
395       .SIMDSize                 = dispatch_size,
396       /* HSD 14016252163: Use of Morton walk order (and batching using a batch
397        * size of 4) is expected to increase sampler cache hit rates by
398        * increasing sample address locality within a subslice.
399        */
400 #if GFX_VER >= 30
401       .DispatchWalkOrder        = prog_data->uses_sampler ?
402                                   MortonWalk :
403                                   LinearWalk,
404       .ThreadGroupBatchSize     = prog_data->uses_sampler ? TG_BATCH_4 :
405                                                             TG_BATCH_1,
406 #endif
407       .MessageSIMD              = dispatch_size,
408       .IndirectDataStartAddress = comp_state->base.push_constants_state.offset,
409       .IndirectDataLength       = comp_state->base.push_constants_state.alloc_size,
410       .GenerateLocalID          = prog_data->generate_local_id != 0,
411       .EmitLocal                = prog_data->generate_local_id,
412       .WalkOrder                = prog_data->walk_order,
413       .TileLayout               = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
414                                   TileY32bpe : Linear,
415       .LocalXMaximum            = prog_data->local_size[0] - 1,
416       .LocalYMaximum            = prog_data->local_size[1] - 1,
417       .LocalZMaximum            = prog_data->local_size[2] - 1,
418       .ExecutionMask            = dispatch.right_mask,
419       .PostSync.MOCS            = anv_mocs(cmd_buffer->device, NULL, 0),
420       .InterfaceDescriptor =
421          get_interface_descriptor_data(cmd_buffer, shader, prog_data,
422                                        &dispatch),
423       .EmitInlineParameter      = prog_data->uses_inline_data,
424       .InlineData               = {
425          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = UINT32_MAX,
426          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = indirect_addr64 & 0xffffffff,
427          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = indirect_addr64 >> 32,
428       },
429    };
430 
431    cmd_buffer->state.last_indirect_dispatch =
432       anv_batch_emitn(
433          &cmd_buffer->batch,
434          GENX(EXECUTE_INDIRECT_DISPATCH_length),
435          GENX(EXECUTE_INDIRECT_DISPATCH),
436          .PredicateEnable            = predicate,
437          .MaxCount                   = 1,
438          .COMPUTE_WALKER_BODY        = body,
439          .ArgumentBufferStartAddress = indirect_addr,
440          .MOCS                       = anv_mocs(cmd_buffer->device,
441                                                 indirect_addr.bo, 0),
442       );
443 }
444 
445 static inline void
446 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
447                     const struct anv_compute_pipeline *pipeline,
448                     struct anv_address indirect_addr,
449                     const struct brw_cs_prog_data *prog_data,
450                     struct intel_cs_dispatch_info dispatch,
451                     uint32_t groupCountX, uint32_t groupCountY,
452                     uint32_t groupCountZ)
453 {
454    const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
455    const bool predicate = cmd_buffer->state.conditional_render_enabled;
456 
457    uint32_t num_workgroup_data[3];
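   /* Inline data for the workgroup count: either the literal group counts
    * or, for indirect dispatch, the same UINT32_MAX sentinel plus parameter
    * address layout used in anv_cmd_buffer_push_workgroups above.
    */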
458    if (!anv_address_is_null(indirect_addr)) {
459       uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
460       num_workgroup_data[0] = UINT32_MAX;
461       num_workgroup_data[1] = indirect_addr64 & 0xffffffff;
462       num_workgroup_data[2] = indirect_addr64 >> 32;
463    } else {
464       num_workgroup_data[0] = groupCountX;
465       num_workgroup_data[1] = groupCountY;
466       num_workgroup_data[2] = groupCountZ;
467    }
468 
469    struct GENX(COMPUTE_WALKER_BODY) body = {
470       .SIMDSize                       = dispatch.simd_size / 16,
471       .MessageSIMD                    = dispatch.simd_size / 16,
472       .IndirectDataStartAddress       = comp_state->base.push_constants_state.offset,
473       .IndirectDataLength             = comp_state->base.push_constants_state.alloc_size,
474       .GenerateLocalID                = prog_data->generate_local_id != 0,
475       .EmitLocal                      = prog_data->generate_local_id,
476       .WalkOrder                      = prog_data->walk_order,
477       .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ?
478                     TileY32bpe : Linear,
479       .LocalXMaximum                  = prog_data->local_size[0] - 1,
480       .LocalYMaximum                  = prog_data->local_size[1] - 1,
481       .LocalZMaximum                  = prog_data->local_size[2] - 1,
482       .ThreadGroupIDXDimension        = groupCountX,
483       .ThreadGroupIDYDimension        = groupCountY,
484       .ThreadGroupIDZDimension        = groupCountZ,
485       .ExecutionMask                  = dispatch.right_mask,
486       .PostSync                       = {
487          .MOCS                        = anv_mocs(pipeline->base.device, NULL, 0),
488       },
489       .InterfaceDescriptor =
490          get_interface_descriptor_data(cmd_buffer, pipeline->cs,
491                                        prog_data, &dispatch),
492       .EmitInlineParameter            = prog_data->uses_inline_data,
493       .InlineData                     = {
494          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0],
495          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1],
496          [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2],
497       }
498    };
499 
500    cmd_buffer->state.last_compute_walker =
501       anv_batch_emitn(
502          &cmd_buffer->batch,
503          GENX(COMPUTE_WALKER_length),
504          GENX(COMPUTE_WALKER),
505          .IndirectParameterEnable        = !anv_address_is_null(indirect_addr),
506          .PredicateEnable                = predicate,
507          .body                           = body,
508 #if GFX_VERx10 == 125
509          .SystolicModeEnable             = prog_data->uses_systolic,
510 #endif
511       );
512 }
513 
514 #else /* #if GFX_VERx10 >= 125 */
515 
516 static inline void
517 emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer,
518                   const struct anv_compute_pipeline *pipeline, bool indirect,
519                   const struct brw_cs_prog_data *prog_data,
520                   uint32_t groupCountX, uint32_t groupCountY,
521                   uint32_t groupCountZ)
522 {
523    const bool predicate = cmd_buffer->state.conditional_render_enabled;
524 
525    const struct intel_device_info *devinfo = pipeline->base.device->info;
526    const struct intel_cs_dispatch_info dispatch =
527       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
528 
529    anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
530       ggw.IndirectParameterEnable      = indirect;
531       ggw.PredicateEnable              = predicate;
532       ggw.SIMDSize                     = dispatch.simd_size / 16;
533       ggw.ThreadDepthCounterMaximum    = 0;
534       ggw.ThreadHeightCounterMaximum   = 0;
535       ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
536       ggw.ThreadGroupIDXDimension      = groupCountX;
537       ggw.ThreadGroupIDYDimension      = groupCountY;
538       ggw.ThreadGroupIDZDimension      = groupCountZ;
539       ggw.RightExecutionMask           = dispatch.right_mask;
540       ggw.BottomExecutionMask          = 0xffffffff;
541    }
542 
543    anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_STATE_FLUSH), msf);
544 }
545 
546 #endif /* #if GFX_VERx10 >= 125 */
547 
548 static inline void
549 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
550                const struct anv_compute_pipeline *pipeline,
551                const struct brw_cs_prog_data *prog_data,
552                struct intel_cs_dispatch_info dispatch,
553                struct anv_address indirect_addr,
554                uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
555                bool is_unaligned_size_x)
556 {
557    bool is_indirect = !anv_address_is_null(indirect_addr);
558 
559 #if GFX_VERx10 >= 125
560    /* For unaligned dispatch, we need to tweak the dispatch value with
561     * MI_MATH, so we can't use indirect HW instructions.
562     */
563    if (is_indirect && !is_unaligned_size_x &&
564        cmd_buffer->device->info->has_indirect_unroll) {
565       emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
566                                    indirect_addr);
567       return;
568    }
569 #endif
570 
571    if (is_indirect)
572       compute_load_indirect_params(cmd_buffer, indirect_addr,
573             is_unaligned_size_x);
574 
575 #if GFX_VERx10 >= 125
576    emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
577                        dispatch, groupCountX, groupCountY, groupCountZ);
578 #else
579    emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
580                      groupCountX, groupCountY, groupCountZ);
581 #endif
582 }
583 
584 void genX(CmdDispatchBase)(
585     VkCommandBuffer                             commandBuffer,
586     uint32_t                                    baseGroupX,
587     uint32_t                                    baseGroupY,
588     uint32_t                                    baseGroupZ,
589     uint32_t                                    groupCountX,
590     uint32_t                                    groupCountY,
591     uint32_t                                    groupCountZ)
592 {
593    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
594    struct anv_compute_pipeline *pipeline =
595       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
596    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
597    struct intel_cs_dispatch_info dispatch =
598       brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
599 
600    if (anv_batch_has_error(&cmd_buffer->batch))
601       return;
602 
603    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
604                                   baseGroupX, baseGroupY, baseGroupZ,
605                                   groupCountX, groupCountY, groupCountZ,
606                                   ANV_NULL_ADDRESS);
607 
608    anv_measure_snapshot(cmd_buffer,
609                         INTEL_SNAPSHOT_COMPUTE,
610                         "compute",
611                         groupCountX * groupCountY * groupCountZ *
612                         prog_data->local_size[0] * prog_data->local_size[1] *
613                         prog_data->local_size[2]);
614 
615    if (cmd_buffer->state.rt.debug_marker_count == 0)
616       trace_intel_begin_compute(&cmd_buffer->trace);
617 
618    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
619 
620    if (cmd_buffer->state.conditional_render_enabled)
621       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
622 
623    emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch,
624                   ANV_NULL_ADDRESS /* no indirect data */,
625                   groupCountX, groupCountY, groupCountZ,
626                   false);
627 
628    if (cmd_buffer->state.rt.debug_marker_count == 0) {
629       trace_intel_end_compute(&cmd_buffer->trace,
630                               groupCountX, groupCountY, groupCountZ,
631                               pipeline->source_hash);
632    }
633 }
634 
635 static void
636 emit_unaligned_cs_walker(
637     VkCommandBuffer                             commandBuffer,
638     uint32_t                                    baseGroupX,
639     uint32_t                                    baseGroupY,
640     uint32_t                                    baseGroupZ,
641     uint32_t                                    groupCountX,
642     uint32_t                                    groupCountY,
643     uint32_t                                    groupCountZ,
644     struct intel_cs_dispatch_info               dispatch)
645 {
646    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
647    struct anv_compute_pipeline *pipeline =
648       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
649    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
650 
651    if (anv_batch_has_error(&cmd_buffer->batch))
652       return;
653 
654    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
655                                   baseGroupX, baseGroupY, baseGroupZ,
656                                   groupCountX, groupCountY, groupCountZ,
657                                   ANV_NULL_ADDRESS);
658 
659    /* RT shaders have Y and Z local size set to 1 always. */
660    assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
661 
662    /* RT shaders are always dispatched with group counts Y and Z set to 1. */
663    assert(groupCountY == 1 && groupCountZ == 1);
664 
665    if (anv_batch_has_error(&cmd_buffer->batch))
666       return;
667 
668    anv_measure_snapshot(cmd_buffer,
669                         INTEL_SNAPSHOT_COMPUTE,
670                         "compute-unaligned-cs-walker",
671                         groupCountX * groupCountY * groupCountZ *
672                         prog_data->local_size[0] * prog_data->local_size[1] *
673                         prog_data->local_size[2]);
674 
675    if (cmd_buffer->state.rt.debug_marker_count == 0)
676       trace_intel_begin_compute(&cmd_buffer->trace);
677 
678    assert(!prog_data->uses_num_work_groups);
679    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
680 
681    if (cmd_buffer->state.conditional_render_enabled)
682       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
683 
684 #if GFX_VERx10 >= 125
685    emit_compute_walker(cmd_buffer, pipeline, ANV_NULL_ADDRESS, prog_data,
686                        dispatch, groupCountX, groupCountY, groupCountZ);
687 #endif
688 
689    if (cmd_buffer->state.rt.debug_marker_count == 0) {
690       trace_intel_end_compute(&cmd_buffer->trace,
691                               groupCountX, groupCountY, groupCountZ,
692                               pipeline->source_hash);
693    }
694 }
695 
696 /*
697  * Dispatch compute work items with an unaligned thread invocation count.
698  *
699  * This helper takes an unaligned thread invocation count, converts it into
700  * an aligned thread group count, and dispatches the compute work items.
701  *
702  * We launch two CS walkers: one for the aligned part and another with a
703  * single group for the remaining thread invocations.
704  *
705  * This function is now specifically for BVH building.
706  */
707 void
708 genX(cmd_dispatch_unaligned)(
709     VkCommandBuffer                             commandBuffer,
710     uint32_t                                    invocations_x,
711     uint32_t                                    invocations_y,
712     uint32_t                                    invocations_z)
713 {
714    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
715    struct anv_compute_pipeline *pipeline =
716       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
717    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
718 
719    /* Group X can be unaligned for RT dispatches. */
720    uint32_t groupCountX = invocations_x / prog_data->local_size[0];
721    uint32_t groupCountY = invocations_y;
722    uint32_t groupCountZ = invocations_z;
723 
724    struct intel_cs_dispatch_info dispatch =
725       brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
726 
727    /* Launch first CS walker with aligned group count X. */
728    if (groupCountX) {
729       emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
730                                groupCountY, groupCountZ, dispatch);
731    }
732 
733    uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
734    if (unaligned_invocations_x) {
735       dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
736                                       dispatch.simd_size);
737 
738       /* Make sure the 2nd walker has the same number of invocations per
739        * workgroup as the 1st walker, so that gl_GlobalInvocationID can be
740        * calculated correctly with baseGroup.
741        */
742       assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
743 
744       const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
745       if (remainder > 0) {
746          dispatch.right_mask = ~0u >> (32 - remainder);
747       } else {
748          dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
749       }
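      /* Worked example (assuming local_size_x = 32 and simd_size = 16): for
       * invocations_x = 52 the first walker covers one group of 32,
       * unaligned_invocations_x = 20, so threads = 2, remainder = 4 and
       * right_mask = 0xf for the partially filled trailing SIMD16 thread.
       */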
750 
751       /* Launch second CS walker for unaligned part. */
752       emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
753                                dispatch);
754    }
755 }
756 
757 /*
758  * This dispatches a compute work item with indirect parameters.
759  * The helper also aligns an unaligned thread invocation count.
760  */
761 void
762 genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
763                                    struct anv_address indirect_addr,
764                                    bool is_unaligned_size_x)
765 {
766    struct anv_compute_pipeline *pipeline =
767       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
768    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
769    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
770    struct intel_cs_dispatch_info dispatch =
771       brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
772 
773    if (anv_batch_has_error(&cmd_buffer->batch))
774       return;
775 
776    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
777                                   0, 0, 0, 0, 0, 0, indirect_addr);
778 
779    anv_measure_snapshot(cmd_buffer,
780                         INTEL_SNAPSHOT_COMPUTE,
781                         "compute indirect",
782                         0);
783 
784    if (cmd_buffer->state.rt.debug_marker_count == 0)
785       trace_intel_begin_compute_indirect(&cmd_buffer->trace);
786 
787    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
788 
789    if (cmd_buffer->state.conditional_render_enabled)
790       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
791 
792    emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0,
793                   0, 0, is_unaligned_size_x);
794 
795    if (cmd_buffer->state.rt.debug_marker_count == 0) {
796       trace_intel_end_compute_indirect(&cmd_buffer->trace,
797                                        anv_address_utrace(indirect_addr),
798                                        pipeline->source_hash);
799    }
800 }
801 
802 void genX(CmdDispatchIndirect)(
803     VkCommandBuffer                             commandBuffer,
804     VkBuffer                                    _buffer,
805     VkDeviceSize                                offset)
806 {
807    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
808    ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
809    struct anv_address addr = anv_address_add(buffer->address, offset);
810 
811    genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
812 }
813 
814 struct anv_address
815 genX(cmd_buffer_ray_query_globals)(struct anv_cmd_buffer *cmd_buffer)
816 {
817 #if GFX_VERx10 >= 125
818    struct anv_device *device = cmd_buffer->device;
819 
820    struct anv_state state =
821       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
822                                            BRW_RT_DISPATCH_GLOBALS_SIZE, 64);
823    struct brw_rt_scratch_layout layout;
824    uint32_t stack_ids_per_dss = 2048; /* TODO: can we use a lower value in
825                                        * some cases?
826                                        */
827    brw_rt_compute_scratch_layout(&layout, device->info,
828                                  stack_ids_per_dss, 1 << 10);
829 
830    uint8_t idx = anv_get_ray_query_bo_index(cmd_buffer);
831 
832    const struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
833       .MemBaseAddress = (struct anv_address) {
834          /* The ray query HW computes offsets from the top of the buffer, so
835           * set the address to the end of the buffer.
836           */
837          .bo = device->ray_query_bo[idx],
838          .offset = device->ray_query_bo[idx]->size
839       },
840       .AsyncRTStackSize = layout.ray_stack_stride / 64,
841       .NumDSSRTStacks = layout.stack_ids_per_dss,
842       .MaxBVHLevels = BRW_RT_MAX_BVH_LEVELS,
843       .Flags = RT_DEPTH_TEST_LESS_EQUAL,
844       .ResumeShaderTable = (struct anv_address) {
845          .bo = cmd_buffer->state.ray_query_shadow_bo,
846       },
847    };
848    GENX(RT_DISPATCH_GLOBALS_pack)(NULL, state.map, &rtdg);
849 
850    return anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
851 #else
852    unreachable("Not supported");
853 #endif
854 }
855 
856 #if GFX_VERx10 >= 125
857 void
858 genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer,
859                                  struct anv_kernel *kernel,
860                                  const uint32_t *global_size,
861                                  uint32_t arg_count,
862                                  const struct anv_kernel_arg *args)
863 {
864    const struct intel_device_info *devinfo = cmd_buffer->device->info;
865    const struct brw_cs_prog_data *cs_prog_data =
866       brw_cs_prog_data_const(kernel->bin->prog_data);
867 
868    genX(cmd_buffer_config_l3)(cmd_buffer, kernel->l3_config);
869 
870    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
871 
872    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
873 
874    /* Apply any pending pipeline flushes we may have.  We want to apply them
875     * now because, if any of those flushes are for things like push constants,
876     * the GPU will read the state at weird times.
877     */
878    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
879 
880    uint32_t indirect_data_size = sizeof(struct brw_kernel_sysvals);
881    indirect_data_size += kernel->bin->bind_map.kernel_args_size;
882    indirect_data_size = ALIGN(indirect_data_size, 64);
883    struct anv_state indirect_data =
884       anv_cmd_buffer_alloc_general_state(cmd_buffer,
885                                          indirect_data_size, 64);
886    memset(indirect_data.map, 0, indirect_data.alloc_size);
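   /* Indirect data layout: struct brw_kernel_sysvals at offset 0, followed by
    * the kernel arguments packed at the offsets recorded in the bind_map.
    */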
887 
888    struct brw_kernel_sysvals sysvals = {};
889    if (global_size != NULL) {
890       for (unsigned i = 0; i < 3; i++)
891          sysvals.num_work_groups[i] = global_size[i];
892       memcpy(indirect_data.map, &sysvals, sizeof(sysvals));
893    } else {
894       struct anv_address sysvals_addr = {
895          .bo = NULL, /* General state buffer is always 0. */
896          .offset = indirect_data.offset,
897       };
898 
899       compute_store_indirect_params(cmd_buffer, sysvals_addr);
900    }
901 
902    void *args_map = indirect_data.map + sizeof(sysvals);
903    for (unsigned i = 0; i < kernel->bin->bind_map.kernel_arg_count; i++) {
904       struct brw_kernel_arg_desc *arg_desc =
905          &kernel->bin->bind_map.kernel_args[i];
906       assert(i < arg_count);
907       const struct anv_kernel_arg *arg = &args[i];
908       if (arg->is_ptr) {
909          memcpy(args_map + arg_desc->offset, arg->ptr, arg_desc->size);
910       } else {
911          assert(arg_desc->size <= sizeof(arg->u64));
912          memcpy(args_map + arg_desc->offset, &arg->u64, arg_desc->size);
913       }
914    }
915 
916    struct intel_cs_dispatch_info dispatch =
917       brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
918 
919    struct GENX(COMPUTE_WALKER_BODY) body = {
920       .SIMDSize                       = dispatch.simd_size / 16,
921       .MessageSIMD                    = dispatch.simd_size / 16,
922       .IndirectDataStartAddress       = indirect_data.offset,
923       .IndirectDataLength             = indirect_data.alloc_size,
924       .LocalXMaximum                  = cs_prog_data->local_size[0] - 1,
925       .LocalYMaximum                  = cs_prog_data->local_size[1] - 1,
926       .LocalZMaximum                  = cs_prog_data->local_size[2] - 1,
927       .ExecutionMask                  = dispatch.right_mask,
928       .PostSync.MOCS                  = cmd_buffer->device->isl_dev.mocs.internal,
929       .InterfaceDescriptor =
930          get_interface_descriptor_data(cmd_buffer,
931                                        kernel->bin,
932                                        cs_prog_data,
933                                        &dispatch),
934    };
935 
936    if (global_size != NULL) {
937       body.ThreadGroupIDXDimension     = global_size[0];
938       body.ThreadGroupIDYDimension     = global_size[1];
939       body.ThreadGroupIDZDimension     = global_size[2];
940    }
941 
942    cmd_buffer->state.last_compute_walker =
943       anv_batch_emitn(
944          &cmd_buffer->batch,
945          GENX(COMPUTE_WALKER_length),
946          GENX(COMPUTE_WALKER),
947          .IndirectParameterEnable = global_size == NULL,
948          .PredicateEnable = false,
949          .body = body,
950       );
951 
952    /* We just blew away the compute pipeline state */
953    cmd_buffer->state.compute.pipeline_dirty = true;
954 }
955 
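/* Pick an 8-invocation (2^3) local workgroup size by distributing three
 * doublings across the dimensions that have work, assigning any leftover to
 * X.  For example, global = { 100, 7, 1 } yields local_shift = { 2, 1, 0 },
 * i.e. a 4x2x1 local size.
 */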
956 static void
957 calc_local_trace_size(uint8_t local_shift[3], const uint32_t global[3])
958 {
959    unsigned total_shift = 0;
960    memset(local_shift, 0, 3);
961 
962    bool progress;
963    do {
964       progress = false;
965       for (unsigned i = 0; i < 3; i++) {
966          assert(global[i] > 0);
967          if ((1 << local_shift[i]) < global[i]) {
968             progress = true;
969             local_shift[i]++;
970             total_shift++;
971          }
972 
973          if (total_shift == 3)
974             return;
975       }
976    } while(progress);
977 
978    /* Assign whatever's left to x */
979    local_shift[0] += 3 - total_shift;
980 }
981 
982 static struct GENX(RT_SHADER_TABLE)
983 vk_sdar_to_shader_table(const VkStridedDeviceAddressRegionKHR *region)
984 {
985    return (struct GENX(RT_SHADER_TABLE)) {
986       .BaseAddress = anv_address_from_u64(region->deviceAddress),
987       .Stride = region->stride,
988    };
989 }
990 
991 struct trace_params {
992    /* If is_sbt_indirect, use indirect_sbts_addr to build RT_DISPATCH_GLOBALS
993     * with mi_builder.
994     */
995    bool is_sbt_indirect;
996    const VkStridedDeviceAddressRegionKHR *raygen_sbt;
997    const VkStridedDeviceAddressRegionKHR *miss_sbt;
998    const VkStridedDeviceAddressRegionKHR *hit_sbt;
999    const VkStridedDeviceAddressRegionKHR *callable_sbt;
1000 
1001    /* A pointer to a VkTraceRaysIndirectCommand2KHR structure */
1002    uint64_t indirect_sbts_addr;
1003 
1004    /* If is_launch_size_indirect, use launch_size_addr to program the dispatch size. */
1005    bool is_launch_size_indirect;
1006    uint32_t launch_size[3];
1007 
1008    /* A pointer to a uint32_t[3] */
1009    uint64_t launch_size_addr;
1010 };
1011 
1012 static struct anv_state
1013 cmd_buffer_emit_rt_dispatch_globals(struct anv_cmd_buffer *cmd_buffer,
1014                                     struct trace_params *params)
1015 {
1016    assert(!params->is_sbt_indirect);
1017    assert(params->miss_sbt != NULL);
1018    assert(params->hit_sbt != NULL);
1019    assert(params->callable_sbt != NULL);
1020 
1021    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
1022 
1023    struct anv_state rtdg_state =
1024       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
1025                                            BRW_RT_PUSH_CONST_OFFSET +
1026                                            sizeof(struct anv_push_constants),
1027                                            64);
1028 
1029    struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
1030       .MemBaseAddress     = (struct anv_address) {
1031          .bo = rt->scratch.bo,
1032          .offset = rt->scratch.layout.ray_stack_start,
1033       },
1034       .CallStackHandler   = anv_shader_bin_get_bsr(
1035          cmd_buffer->device->rt_trivial_return, 0),
1036       .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
1037       .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
1038       .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
1039       .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
1040       .HitGroupTable      = vk_sdar_to_shader_table(params->hit_sbt),
1041       .MissGroupTable     = vk_sdar_to_shader_table(params->miss_sbt),
1042       .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
1043       .LaunchWidth        = params->launch_size[0],
1044       .LaunchHeight       = params->launch_size[1],
1045       .LaunchDepth        = params->launch_size[2],
1046       .CallableGroupTable = vk_sdar_to_shader_table(params->callable_sbt),
1047    };
1048    GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
1049 
1050    return rtdg_state;
1051 }
1052 
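/* Build a packed 64-bit SBT entry with the MI builder: the table address
 * masked to its low 49 bits, OR'd with the stride shifted left by 48 so that
 * the low 16 bits of the stride land in the upper word.
 */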
1053 static struct mi_value
1054 mi_build_sbt_entry(struct mi_builder *b,
1055                    uint64_t addr_field_addr,
1056                    uint64_t stride_field_addr)
1057 {
1058    return mi_ior(b,
1059                  mi_iand(b, mi_mem64(anv_address_from_u64(addr_field_addr)),
1060                             mi_imm(BITFIELD64_BIT(49) - 1)),
1061                  mi_ishl_imm(b, mi_mem32(anv_address_from_u64(stride_field_addr)),
1062                                 48));
1063 }
1064 
1065 static struct anv_state
1066 cmd_buffer_emit_rt_dispatch_globals_indirect(struct anv_cmd_buffer *cmd_buffer,
1067                                              struct trace_params *params)
1068 {
1069    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
1070 
1071    struct anv_state rtdg_state =
1072       anv_cmd_buffer_alloc_temporary_state(cmd_buffer,
1073                                            BRW_RT_PUSH_CONST_OFFSET +
1074                                            sizeof(struct anv_push_constants),
1075                                            64);
1076 
1077    struct GENX(RT_DISPATCH_GLOBALS) rtdg = {
1078       .MemBaseAddress     = (struct anv_address) {
1079          .bo = rt->scratch.bo,
1080          .offset = rt->scratch.layout.ray_stack_start,
1081       },
1082       .CallStackHandler   = anv_shader_bin_get_bsr(
1083          cmd_buffer->device->rt_trivial_return, 0),
1084       .AsyncRTStackSize   = rt->scratch.layout.ray_stack_stride / 64,
1085       .NumDSSRTStacks     = rt->scratch.layout.stack_ids_per_dss,
1086       .MaxBVHLevels       = BRW_RT_MAX_BVH_LEVELS,
1087       .Flags              = RT_DEPTH_TEST_LESS_EQUAL,
1088       .SWStackSize        = rt->scratch.layout.sw_stack_size / 64,
1089    };
1090    GENX(RT_DISPATCH_GLOBALS_pack)(NULL, rtdg_state.map, &rtdg);
1091 
1092    struct anv_address rtdg_addr =
1093       anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
1094 
1095    struct mi_builder b;
1096    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
1097    const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
1098    mi_builder_set_mocs(&b, mocs);
1099    mi_builder_set_write_check(&b, true);
1100 
1101    /* Fill the MissGroupTable, HitGroupTable & CallableGroupTable fields of
1102     * RT_DISPATCH_GLOBALS using the mi_builder.
1103     */
1104    mi_store(&b,
1105             mi_mem64(
1106                anv_address_add(
1107                   rtdg_addr,
1108                   GENX(RT_DISPATCH_GLOBALS_MissGroupTable_start) / 8)),
1109             mi_build_sbt_entry(&b,
1110                                params->indirect_sbts_addr +
1111                                offsetof(VkTraceRaysIndirectCommand2KHR,
1112                                         missShaderBindingTableAddress),
1113                                params->indirect_sbts_addr +
1114                                offsetof(VkTraceRaysIndirectCommand2KHR,
1115                                         missShaderBindingTableStride)));
1116    mi_store(&b,
1117             mi_mem64(
1118                anv_address_add(
1119                   rtdg_addr,
1120                   GENX(RT_DISPATCH_GLOBALS_HitGroupTable_start) / 8)),
1121             mi_build_sbt_entry(&b,
1122                                params->indirect_sbts_addr +
1123                                offsetof(VkTraceRaysIndirectCommand2KHR,
1124                                         hitShaderBindingTableAddress),
1125                                params->indirect_sbts_addr +
1126                                offsetof(VkTraceRaysIndirectCommand2KHR,
1127                                         hitShaderBindingTableStride)));
1128    mi_store(&b,
1129             mi_mem64(
1130                anv_address_add(
1131                   rtdg_addr,
1132                   GENX(RT_DISPATCH_GLOBALS_CallableGroupTable_start) / 8)),
1133             mi_build_sbt_entry(&b,
1134                                params->indirect_sbts_addr +
1135                                offsetof(VkTraceRaysIndirectCommand2KHR,
1136                                         callableShaderBindingTableAddress),
1137                                params->indirect_sbts_addr +
1138                                offsetof(VkTraceRaysIndirectCommand2KHR,
1139                                         callableShaderBindingTableStride)));
1140 
1141    return rtdg_state;
1142 }
1143 
1144 static void
1145 cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer,
1146                       struct trace_params *params)
1147 {
1148    struct anv_device *device = cmd_buffer->device;
1149    struct anv_cmd_ray_tracing_state *rt = &cmd_buffer->state.rt;
1150    struct anv_ray_tracing_pipeline *pipeline =
1151       anv_pipeline_to_ray_tracing(rt->base.pipeline);
1152 
1153    if (anv_batch_has_error(&cmd_buffer->batch))
1154       return;
1155 
1156    /* If we have a known degenerate launch size, just bail */
1157    if (!params->is_launch_size_indirect &&
1158        (params->launch_size[0] == 0 ||
1159         params->launch_size[1] == 0 ||
1160         params->launch_size[2] == 0))
1161       return;
1162 
1163    trace_intel_begin_rays(&cmd_buffer->trace);
1164 
1165    genX(cmd_buffer_config_l3)(cmd_buffer, pipeline->base.l3_config);
1166 
1167    genX(cmd_buffer_update_color_aux_op(cmd_buffer, ISL_AUX_OP_NONE));
1168 
1169    genX(flush_descriptor_buffers)(cmd_buffer, &rt->base);
1170 
1171    genX(flush_pipeline_select_gpgpu)(cmd_buffer);
1172 
1173    cmd_buffer->state.rt.pipeline_dirty = false;
1174 
1175    genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
1176 
1177    genX(cmd_buffer_flush_push_descriptors)(cmd_buffer,
1178                                            &cmd_buffer->state.rt.base,
1179                                            &pipeline->base);
1180 
1181    /* Add these to the reloc list manually, as they're internal buffers
1182     * that don't have relocs to pick them up otherwise.
1183     *
1184     * TODO(RT): This is a bit of a hack
1185     */
1186    anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
1187                          rt->scratch.bo);
1188    anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
1189                          cmd_buffer->device->btd_fifo_bo);
1190 
1191    /* Allocate and set up our RT_DISPATCH_GLOBALS */
1192    struct anv_state rtdg_state =
1193       params->is_sbt_indirect ?
1194       cmd_buffer_emit_rt_dispatch_globals_indirect(cmd_buffer, params) :
1195       cmd_buffer_emit_rt_dispatch_globals(cmd_buffer, params);
1196 
1197    assert(rtdg_state.alloc_size >= (BRW_RT_PUSH_CONST_OFFSET +
1198                                     sizeof(struct anv_push_constants)));
1199    assert(GENX(RT_DISPATCH_GLOBALS_length) * 4 <= BRW_RT_PUSH_CONST_OFFSET);
1200    /* Push constants go after the RT_DISPATCH_GLOBALS */
1201    memcpy(rtdg_state.map + BRW_RT_PUSH_CONST_OFFSET,
1202           &cmd_buffer->state.rt.base.push_constants,
1203           sizeof(struct anv_push_constants));
1204 
1205    struct anv_address rtdg_addr =
1206       anv_cmd_buffer_temporary_state_address(cmd_buffer, rtdg_state);
1207 
   uint8_t local_size_log2[3];
   uint32_t global_size[3] = {};
   if (params->is_launch_size_indirect) {
      /* Pick a local size that's probably ok.  We assume most TraceRays calls
       * will use a two-dimensional dispatch size.  Worst case, our initial
       * dispatch will be a little slower than it has to be.
       */
      local_size_log2[0] = 2;
      local_size_log2[1] = 1;
      local_size_log2[2] = 0;
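      /* i.e. a 4x2x1 local workgroup, 8 invocations per group. */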

      struct mi_builder b;
      mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
      const uint32_t mocs = anv_mocs_for_address(cmd_buffer->device, &rtdg_addr);
      mi_builder_set_mocs(&b, mocs);
      mi_builder_set_write_check(&b, true);

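      /* launch_size_addr points at three consecutive dwords holding the
       * launch width, height and depth (either the application's
       * VkTraceRaysIndirectCommandKHR buffer or the width/height/depth
       * members of VkTraceRaysIndirectCommand2KHR).
       */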
      struct mi_value launch_size[3] = {
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 0)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 4)),
         mi_mem32(anv_address_from_u64(params->launch_size_addr + 8)),
      };

      /* Store the original launch size into RT_DISPATCH_GLOBALS */
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchWidth_start) / 8)),
               mi_value_ref(&b, launch_size[0]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchHeight_start) / 8)),
               mi_value_ref(&b, launch_size[1]));
      mi_store(&b, mi_mem32(anv_address_add(rtdg_addr,
                                            GENX(RT_DISPATCH_GLOBALS_LaunchDepth_start) / 8)),
               mi_value_ref(&b, launch_size[2]));

      /* Compute the global dispatch size */
      for (unsigned i = 0; i < 3; i++) {
         if (local_size_log2[i] == 0)
            continue;

         /* global_size = DIV_ROUND_UP(launch_size, local_size)
          *
          * Fortunately for us, MI_ALU math is 64-bit and mi_ushr32_imm has
          * the semantics of shifting the entire 64-bit value and taking the
          * bottom 32 bits, so we don't have to worry about roll-over.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         launch_size[i] = mi_iadd(&b, launch_size[i],
                                      mi_imm(local_size - 1));
         launch_size[i] = mi_ushr32_imm(&b, launch_size[i],
                                            local_size_log2[i]);
      }

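      /* COMPUTE_WALKER with IndirectParameterEnable set picks its thread
       * group counts up from these registers.
       */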
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMX), launch_size[0]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMY), launch_size[1]);
      mi_store(&b, mi_reg32(GPGPU_DISPATCHDIMZ), launch_size[2]);

   } else {
      calc_local_trace_size(local_size_log2, params->launch_size);

      for (unsigned i = 0; i < 3; i++) {
         /* We have to be a bit careful here because DIV_ROUND_UP adds to the
          * numerator and the sum may overflow.  Cast to uint64_t to avoid
          * this.
          */
         uint32_t local_size = 1 << local_size_log2[i];
         global_size[i] = DIV_ROUND_UP((uint64_t)params->launch_size[i], local_size);
      }
   }

#if GFX_VERx10 == 125
   /* Wa_14014427904 - We need additional invalidate/flush when
    * emitting NP state commands with ATS-M in compute mode.
    */
   if (intel_device_info_is_atsm(device->info) &&
       cmd_buffer->queue_family->engine_class == INTEL_ENGINE_CLASS_COMPUTE) {
      genx_batch_emit_pipe_control(&cmd_buffer->batch,
                                   cmd_buffer->device->info,
                                   cmd_buffer->state.current_pipeline,
                                   ANV_PIPE_CS_STALL_BIT |
                                   ANV_PIPE_STATE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_CONSTANT_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_UNTYPED_DATAPORT_CACHE_FLUSH_BIT |
                                   ANV_PIPE_TEXTURE_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_INSTRUCTION_CACHE_INVALIDATE_BIT |
                                   ANV_PIPE_HDC_PIPELINE_FLUSH_BIT);
   }
#endif

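   /* 3DSTATE_BTD programs the bucketed thread dispatcher (BTD) used for
    * ray-tracing shader dispatch and points it at the per-DSS memory backed
    * buffer it requires.
    */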
   anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_BTD), btd) {
      /* TODO: This is the timeout after which the bucketed thread dispatcher
       *       will kick off a wave of threads. We go with the lowest value
       *       for now. It could be tweaked on a per application basis
       *       (drirc).
       */
      btd.DispatchTimeoutCounter = _64clocks;
      /* BSpec 43851: "This field must be programmed to 6h i.e. memory backed
       *               buffer must be 128KB."
       */
      btd.PerDSSMemoryBackedBufferSize = 6;
      btd.MemoryBackedBufferBasePointer = (struct anv_address) { .bo = device->btd_fifo_bo };
      if (pipeline->base.scratch_size > 0) {
         struct anv_bo *scratch_bo =
            anv_scratch_pool_alloc(device,
                                   &device->scratch_pool,
                                   MESA_SHADER_COMPUTE,
                                   pipeline->base.scratch_size);
         anv_reloc_list_add_bo(cmd_buffer->batch.relocs,
                               scratch_bo);
         uint32_t scratch_surf =
            anv_scratch_pool_get_surf(cmd_buffer->device,
                                      &device->scratch_pool,
                                      pipeline->base.scratch_size);
         btd.ScratchSpaceBuffer = scratch_surf >> ANV_SCRATCH_SPACE_SHIFT(GFX_VER);
      }
#if INTEL_WA_14017794102_GFX_VER || INTEL_WA_14023061436_GFX_VER
      btd.BTDMidthreadpreemption = false;
#endif
   }

   genX(cmd_buffer_ensure_cfe_state)(cmd_buffer, pipeline->base.scratch_size);

   const struct brw_cs_prog_data *cs_prog_data =
      brw_cs_prog_data_const(device->rt_trampoline->prog_data);
   struct intel_cs_dispatch_info dispatch =
      brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL);

   const gl_shader_stage s = MESA_SHADER_RAYGEN;
   struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s];
   struct anv_state *samplers = &cmd_buffer->state.samplers[s];
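   /* The raygen trampoline is a small internal compute shader: it reads the
    * raygen shader record from the SBT and BTD-spawns the application's
    * raygen shader for the rays in the launch grid.
    */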
   struct brw_rt_raygen_trampoline_params trampoline_params = {
      .rt_disp_globals_addr = anv_address_physical(rtdg_addr),
      .raygen_bsr_addr =
         params->is_sbt_indirect ?
         (params->indirect_sbts_addr +
          offsetof(VkTraceRaysIndirectCommand2KHR,
                   raygenShaderRecordAddress)) :
         params->raygen_sbt->deviceAddress,
      .is_indirect = params->is_sbt_indirect,
      .local_group_size_log2 = {
         local_size_log2[0],
         local_size_log2[1],
         local_size_log2[2],
      },
   };

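   /* For an indirect launch, global_size is still all zeroes here; the
    * walker's IndirectParameterEnable below makes the hardware take the
    * thread group counts from the GPGPU_DISPATCHDIM registers written above
    * instead.
    */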
   struct GENX(COMPUTE_WALKER_BODY) body = {
      .SIMDSize                       = dispatch.simd_size / 16,
      .MessageSIMD                    = dispatch.simd_size / 16,
      .LocalXMaximum                  = (1 << local_size_log2[0]) - 1,
      .LocalYMaximum                  = (1 << local_size_log2[1]) - 1,
      .LocalZMaximum                  = (1 << local_size_log2[2]) - 1,
      .ThreadGroupIDXDimension        = global_size[0],
      .ThreadGroupIDYDimension        = global_size[1],
      .ThreadGroupIDZDimension        = global_size[2],
      .ExecutionMask                  = 0xff,
      .EmitInlineParameter            = true,
      .PostSync.MOCS                  = anv_mocs(pipeline->base.device, NULL, 0),

      .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
         .KernelStartPointer = device->rt_trampoline->kernel.offset,
         .SamplerStatePointer = samplers->offset,
         /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */
         .SamplerCount = 0,
         .BindingTablePointer = surfaces->offset,
         .NumberofThreadsinGPGPUThreadGroup = 1,
         .BTDMode = true,
#if INTEL_NEEDS_WA_14017794102 || INTEL_NEEDS_WA_14023061436
         .ThreadPreemption = false,
#endif
#if GFX_VER >= 30
         .RegistersPerThread = ptl_register_blocks(cs_prog_data->base.grf_used),
#endif
      },
   };

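   /* The trampoline parameters travel in the walker's inline data; the
    * static assert below checks that they fill the expected 32 bytes.
    */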
   STATIC_ASSERT(sizeof(trampoline_params) == 32);
   memcpy(body.InlineData, &trampoline_params, sizeof(trampoline_params));

   cmd_buffer->state.last_compute_walker =
      anv_batch_emitn(
         &cmd_buffer->batch,
         GENX(COMPUTE_WALKER_length),
         GENX(COMPUTE_WALKER),
         .IndirectParameterEnable  = params->is_launch_size_indirect,
         .PredicateEnable          = cmd_buffer->state.conditional_render_enabled,
         .body                     = body,
      );

   trace_intel_end_rays(&cmd_buffer->trace,
                        params->launch_size[0],
                        params->launch_size[1],
                        params->launch_size[2]);
}

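/* vkCmdTraceRaysKHR: launch dimensions and SBT regions are all provided
 * directly by the caller.
 */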
void
genX(CmdTraceRaysKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    uint32_t                                    width,
    uint32_t                                    height,
    uint32_t                                    depth)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = false,
      .raygen_sbt              = pRaygenShaderBindingTable,
      .miss_sbt                = pMissShaderBindingTable,
      .hit_sbt                 = pHitShaderBindingTable,
      .callable_sbt            = pCallableShaderBindingTable,
      .is_launch_size_indirect = false,
      .launch_size             = {
         width,
         height,
         depth,
      },
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

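/* vkCmdTraceRaysIndirectKHR: the SBT regions are still provided directly,
 * only the launch dimensions come from the buffer at indirectDeviceAddress.
 */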
void
genX(CmdTraceRaysIndirectKHR)(
    VkCommandBuffer                             commandBuffer,
    const VkStridedDeviceAddressRegionKHR*      pRaygenShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pMissShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pHitShaderBindingTable,
    const VkStridedDeviceAddressRegionKHR*      pCallableShaderBindingTable,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = false,
      .raygen_sbt              = pRaygenShaderBindingTable,
      .miss_sbt                = pMissShaderBindingTable,
      .hit_sbt                 = pHitShaderBindingTable,
      .callable_sbt            = pCallableShaderBindingTable,
      .is_launch_size_indirect = true,
      .launch_size_addr        = indirectDeviceAddress,
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

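/* vkCmdTraceRaysIndirect2KHR: both the SBT regions and the launch dimensions
 * are read from a single VkTraceRaysIndirectCommand2KHR structure at
 * indirectDeviceAddress.
 */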
void
genX(CmdTraceRaysIndirect2KHR)(
    VkCommandBuffer                             commandBuffer,
    VkDeviceAddress                             indirectDeviceAddress)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   struct trace_params params = {
      .is_sbt_indirect         = true,
      .indirect_sbts_addr      = indirectDeviceAddress,
      .is_launch_size_indirect = true,
      .launch_size_addr        = indirectDeviceAddress +
                                 offsetof(VkTraceRaysIndirectCommand2KHR, width),
   };

   cmd_buffer_trace_rays(cmd_buffer, &params);
}

#endif /* GFX_VERx10 >= 125 */