/*
 * Copyright © 2024 Collabora Ltd.
 *
 * SPDX-License-Identifier: MIT
 */

#include <stddef.h>
#include <stdint.h>

#include "genxml/gen_macros.h"

#include "nir.h"
#include "nir_builder.h"

#include "pan_encoder.h"
#include "pan_shader.h"

#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_device.h"
#include "panvk_shader.h"

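/* Layout of the push-constant payload consumed by the descriptor-copy shader.
 * It is filled by meta_get_copy_desc_job() and read back through the
 * get_input_field()/get_input_array_slot() helpers below. */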
struct pan_nir_desc_copy_info {
   uint64_t sets[MAX_SETS];
   uint64_t tables[PANVK_BIFROST_DESC_TABLE_COUNT];
   uint64_t img_attrib_table;
   struct {
      uint64_t table;
      uint32_t limits[PANVK_BIFROST_DESC_TABLE_COUNT];
      uint32_t attrib_buf_idx_offset;
   } desc_copy;
   uint32_t set_desc_counts[MAX_SETS];
};

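/* Load a single field of pan_nir_desc_copy_info from the push constants. */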
#define get_input_field(b, name)                                              \
   nir_load_push_constant(                                                    \
      b, 1, sizeof(((struct pan_nir_desc_copy_info *)0)->name) * 8,           \
      nir_imm_int(b, offsetof(struct pan_nir_desc_copy_info, name)))

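/* Select one entry of a pan_nir_desc_copy_info array using a dynamic index.
 * Every entry is loaded at a constant push-constant offset into a lookup
 * table, which is then reduced with a binary selection tree driven by the
 * index bits. */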
static nir_def *
get_array_entry(nir_builder *b, unsigned array_offset, unsigned array_size,
                unsigned array_stride, nir_def *idx)
{
   assert(array_size > 0);
   assert(array_stride == 4 || array_stride == 8);

   STACK_ARRAY(nir_def *, lut, array_size);

   /* First we populate a lookup table covering the whole array. */
   for (unsigned i = 0; i < array_size; i++) {
      lut[i] = nir_load_push_constant(
         b, 1, array_stride * 8,
         nir_imm_int(b, (i * array_stride) + array_offset));
   }

   /* Then we test each bit in the index starting from the MSB of the biggest
    * valid index in the array and select the entry accordingly. */
   for (unsigned lut_stride = BITFIELD_BIT(util_last_bit(array_size - 1) - 1);
        lut_stride > 0; lut_stride >>= 1) {
      nir_def *bit_is_set = nir_i2b(b, nir_iand_imm(b, idx, lut_stride));

      for (unsigned i = 0; i < lut_stride && i + lut_stride < array_size; i++)
         lut[i] = nir_bcsel(b, bit_is_set, lut[i + lut_stride], lut[i]);
   }

   nir_def *result = lut[0];

   STACK_ARRAY_FINISH(lut);

   return result;
}

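/* Dynamically index an array field of pan_nir_desc_copy_info. */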
#define get_input_array_slot(b, name, index)                                  \
   get_array_entry(b, offsetof(struct pan_nir_desc_copy_info, name),          \
                   ARRAY_SIZE(((struct pan_nir_desc_copy_info *)0)->name),    \
                   sizeof(((struct pan_nir_desc_copy_info *)0)->name[0]),     \
                   index)

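/* A copy handle encodes the source set index in its top four bits and the
 * descriptor index within that set in the remaining 28 bits. */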
static void
extract_desc_info_from_handle(nir_builder *b, nir_def *handle, nir_def **table,
                              nir_def **desc_idx)
{
   *table = nir_ushr_imm(b, handle, 28);
   *desc_idx = nir_iand_imm(b, handle, 0xfffffff);
}

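/* Copy one descriptor from a descriptor set into a per-type descriptor table,
 * writing zeros instead when the source index is past the end of the set
 * (variable descriptor count). */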
static void
set_to_table_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
                  nir_def *src_desc_idx, nir_def *table_ptr,
                  nir_def *dst_desc_idx, unsigned element_size)
{
   /* The last binding can have
    * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, so we need to
    * make sure we don't do an out-of-bounds access on the source set. */
   nir_def *dst_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));

   nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
   {
      nir_def *src_offset =
         nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
      nir_def *desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
                                      element_size, element_size / 4, 32);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_push_else(b, NULL);
   {
      nir_const_value v[] = {
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
      };

      nir_def *desc = nir_build_imm(b, element_size / 4, 32, v);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_pop_if(b, NULL);
}

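/* Image descriptors are turned into an attribute/attribute-buffer pair: the
 * attribute-buffer part is copied from the set, and a matching attribute
 * descriptor pointing at it is synthesized on the fly. Out-of-bounds sources
 * are zero-filled, as in set_to_table_copy(). */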
static void
set_to_table_img_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
                      nir_def *src_desc_idx, nir_def *attrib_table_ptr,
                      nir_def *attrib_buf_table_ptr, nir_def *dst_desc_idx)
{
   /* The last binding can have
    * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, so we need to
    * make sure we don't do an out-of-bounds access on the source set. */
   const unsigned element_size = pan_size(ATTRIBUTE_BUFFER) * 2;
   const unsigned attrib_buf_comps = element_size / 4;
   const unsigned attrib_comps = pan_size(ATTRIBUTE) / 4;
   nir_def *attrib_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, pan_size(ATTRIBUTE)));
   nir_def *attrib_buf_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));

   nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
   {
      nir_def *attr_buf_idx_offset =
         get_input_field(b, desc_copy.attrib_buf_idx_offset);
      nir_def *src_offset =
         nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
      nir_def *src_desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
                                          element_size, element_size / 4, 32);
      nir_def *fmt = nir_iand_imm(b, nir_channel(b, src_desc, 2), 0xfffffc00);
      /* Each image descriptor takes two attribute buffer slots, and we need
       * to add the attribute buffer offset so that images also work from the
       * vertex shader. */
      nir_def *buf_idx =
         nir_iadd(b, nir_imul_imm(b, dst_desc_idx, 2), attr_buf_idx_offset);

      nir_def *attrib_w1 = nir_ior(b, buf_idx, fmt);

      nir_def *attrib_desc = nir_vec2(b, attrib_w1, nir_imm_int(b, 0));

      nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
                       pan_size(ATTRIBUTE), attrib_desc,
                       nir_component_mask(attrib_comps));

      nir_def *attrib_buf_desc = nir_vec8(
         b, nir_channel(b, src_desc, 0), nir_channel(b, src_desc, 1),
         nir_iand_imm(b, nir_channel(b, src_desc, 2), BITFIELD_MASK(10)),
         nir_channel(b, src_desc, 3), nir_channel(b, src_desc, 4),
         nir_channel(b, src_desc, 5), nir_channel(b, src_desc, 6),
         nir_channel(b, src_desc, 7));
      nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
                       element_size, attrib_buf_desc,
                       nir_component_mask(attrib_buf_comps));
   }
   nir_push_else(b, NULL);
   {
      nir_const_value v[] = {
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
      };

      nir_def *desc =
         nir_build_imm(b, MAX2(attrib_buf_comps, attrib_comps), 32, v);

      nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
                       element_size, desc,
                       nir_component_mask(attrib_buf_comps));
      nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
                       pan_size(ATTRIBUTE), desc,
                       nir_component_mask(attrib_comps));
   }
   nir_pop_if(b, NULL);
}

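/* Body of the copy shader: each invocation processes one entry of the copy
 * table, figures out which descriptor table it targets from the running
 * limits, and copies the corresponding descriptor from its source set. */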
static void
single_desc_copy(nir_builder *b, nir_def *desc_copy_idx)
{
   nir_def *desc_copy_offset = nir_imul_imm(b, desc_copy_idx, sizeof(uint32_t));
   nir_def *desc_copy_ptr = nir_iadd(b, get_input_field(b, desc_copy.table),
                                     nir_u2u64(b, desc_copy_offset));
   nir_def *src_copy_handle = nir_load_global(b, desc_copy_ptr, 4, 1, 32);

   nir_def *set_idx, *src_desc_idx;
   extract_desc_info_from_handle(b, src_copy_handle, &set_idx, &src_desc_idx);

   nir_def *set_ptr = get_input_array_slot(b, sets, set_idx);
   nir_def *set_desc_count = get_input_array_slot(b, set_desc_counts, set_idx);
   nir_def *ubo_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_UBO]);
   nir_def *img_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_IMG]);
   nir_def *tex_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_TEXTURE]);
   nir_def *sampler_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

   nir_push_if(b, nir_ult(b, desc_copy_idx, ubo_end));
   {
      nir_def *table_ptr =
         get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_UBO]);

      set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx, table_ptr,
                        desc_copy_idx, sizeof(struct mali_attribute_packed));
   }
   nir_push_else(b, NULL);
   {
      nir_push_if(b, nir_ult(b, desc_copy_idx, img_end));
      {
         nir_def *table_ptr =
            get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_IMG]);
         nir_def *attrib_table_ptr = get_input_field(b, img_attrib_table);
         nir_def *attrib_buf_table_ptr = table_ptr;

         set_to_table_img_copy(b, set_ptr, set_desc_count, src_desc_idx,
                               attrib_table_ptr, attrib_buf_table_ptr,
                               nir_isub(b, desc_copy_idx, ubo_end));
      }
      nir_push_else(b, NULL);
      {
         nir_push_if(b, nir_ult(b, desc_copy_idx, tex_end));
         {
            nir_def *table_ptr =
               get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]);

            set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                              table_ptr, nir_isub(b, desc_copy_idx, img_end),
                              sizeof(struct mali_texture_packed));
         }
         nir_push_else(b, NULL);
         {
            nir_push_if(b, nir_ult(b, desc_copy_idx, sampler_end));
            {
               nir_def *table_ptr =
                  get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

               set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                                 table_ptr, nir_isub(b, desc_copy_idx, tex_end),
                                 sizeof(struct mali_sampler_packed));
            }
            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);
}

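/* Build (or fetch from the meta object cache) the internal compute shader
 * doing the descriptor copies, and return the GPU address of its renderer
 * state descriptor. */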
static uint64_t
panvk_meta_desc_copy_rsd(struct panvk_device *dev)
{
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   enum panvk_meta_object_key_type key = PANVK_META_OBJECT_KEY_COPY_DESC_SHADER;
   struct panvk_internal_shader *shader;

   VkShaderEXT shader_handle = (VkShaderEXT)vk_meta_lookup_object(
      &dev->meta, VK_OBJECT_TYPE_SHADER_EXT, &key, sizeof(key));
   if (shader_handle != VK_NULL_HANDLE)
      goto out;

   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_COMPUTE, GENX(pan_shader_get_compiler_options)(), "%s",
      "desc_copy");

   /* The workgroup size is actually customized at execution time so we issue
    * the exact number of copy jobs. */
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;

   nir_def *desc_copy_id =
      nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
   single_desc_copy(&b, desc_copy_id);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = phys_dev->kmod.props.gpu_prod_id,
      .no_ubo_to_push = true,
   };

   pan_shader_preprocess(b.shader, inputs.gpu_id);

   VkResult result = panvk_per_arch(create_internal_shader)(
      dev, b.shader, &inputs, &shader);

   ralloc_free(b.shader);

   if (result != VK_SUCCESS)
      return 0;

   shader->info.push.count =
      DIV_ROUND_UP(sizeof(struct pan_nir_desc_copy_info), 4);

   shader->rsd = panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);
   if (!panvk_priv_mem_host_addr(shader->rsd)) {
      vk_shader_destroy(&dev->vk, &shader->vk, NULL);
      return 0;
   }

   pan_cast_and_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE,
                     cfg) {
      pan_shader_prepare_rsd(&shader->info,
                             panvk_priv_mem_dev_addr(shader->code_mem), &cfg);
   }

   shader_handle = (VkShaderEXT)vk_meta_cache_object(
      &dev->vk, &dev->meta, &key, sizeof(key), VK_OBJECT_TYPE_SHADER_EXT,
      (uint64_t)panvk_internal_shader_to_handle(shader));

out:
   shader = panvk_internal_shader_from_handle(shader_handle);
   return panvk_priv_mem_dev_addr(shader->rsd);
}

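/* Emit a compute job copying all the descriptors used by the shader from the
 * bound descriptor sets into the shader's per-type descriptor tables.
 * job_desc is left zeroed when there is nothing to copy. */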
VkResult
panvk_per_arch(meta_get_copy_desc_job)(
   struct panvk_cmd_buffer *cmdbuf, const struct panvk_shader *shader,
   const struct panvk_descriptor_state *desc_state,
   const struct panvk_shader_desc_state *shader_desc_state,
   uint32_t attrib_buf_idx_offset, struct panfrost_ptr *job_desc)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   *job_desc = (struct panfrost_ptr){0};

   if (!shader)
      return VK_SUCCESS;

   uint64_t copy_table = panvk_priv_mem_dev_addr(shader->desc_info.others.map);
   if (!copy_table)
      return VK_SUCCESS;

   struct pan_nir_desc_copy_info copy_info = {
      .img_attrib_table = shader_desc_state->img_attrib_table,
      .desc_copy =
         {
            .table = copy_table,
            .attrib_buf_idx_offset = attrib_buf_idx_offset,
         },
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(copy_info.desc_copy.limits); i++)
      copy_info.desc_copy.limits[i] =
         shader->desc_info.others.count[i] +
         (i > 0 ? copy_info.desc_copy.limits[i - 1] : 0);

   for (uint32_t i = 0; i < ARRAY_SIZE(desc_state->sets); i++) {
      const struct panvk_descriptor_set *set = desc_state->sets[i];

      if (!set)
         continue;

      copy_info.sets[i] = set->descs.dev;
      copy_info.set_desc_counts[i] = set->desc_count;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
      uint32_t desc_count = shader->desc_info.others.count[i];

      if (!desc_count)
         continue;

      copy_info.tables[i] = shader_desc_state->tables[i];
   }

   uint64_t desc_copy_rsd = panvk_meta_desc_copy_rsd(dev);
   if (!desc_copy_rsd)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   struct panfrost_ptr push_uniforms =
      panvk_cmd_alloc_dev_mem(cmdbuf, desc, sizeof(copy_info), 16);

   if (!push_uniforms.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   memcpy(push_uniforms.cpu, &copy_info, sizeof(copy_info));

   *job_desc = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB);
   if (!job_desc->gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Given the per-stage max descriptors limit, we should never
    * reach the workgroup dimension limit. */
   uint32_t copy_count =
      copy_info.desc_copy.limits[PANVK_BIFROST_DESC_TABLE_COUNT - 1];

   assert(copy_count - 1 < BITFIELD_MASK(10));

   panfrost_pack_work_groups_compute(
      pan_section_ptr(job_desc->cpu, COMPUTE_JOB, INVOCATION), 1, 1, 1,
      copy_count, 1, 1, false, false);

   pan_section_pack(job_desc->cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(copy_count + 1) +
                           util_logbase2_ceil(1 + 1) +
                           util_logbase2_ceil(1 + 1);
   }

   struct pan_tls_info tlsinfo = {0};
   struct panfrost_ptr tls = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tls.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   GENX(pan_emit_tls)(&tlsinfo, tls.cpu);

   pan_section_pack(job_desc->cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = desc_copy_rsd;
      cfg.push_uniforms = push_uniforms.gpu;
      cfg.thread_storage = tls.gpu;
   }

   return VK_SUCCESS;
}