/*
 * Copyright © 2024 Collabora Ltd.
 *
 * SPDX-License-Identifier: MIT
 */

#include <stddef.h>
#include <stdint.h>

#include "genxml/gen_macros.h"

#include "nir.h"
#include "nir_builder.h"

#include "pan_encoder.h"
#include "pan_shader.h"

#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_device.h"
#include "panvk_shader.h"

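/* Push-constant layout consumed by the descriptor-copy shader built below.
 * It carries the GPU addresses of the source descriptor sets, the
 * destination descriptor tables, and the copy table describing which
 * descriptor goes where. Fields are fetched with offsetof(), so the shader
 * and the CPU side share this exact layout. */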
struct pan_nir_desc_copy_info {
   uint64_t sets[MAX_SETS];
   uint64_t tables[PANVK_BIFROST_DESC_TABLE_COUNT];
   uint64_t img_attrib_table;
   struct {
      uint64_t table;
      uint32_t limits[PANVK_BIFROST_DESC_TABLE_COUNT];
      uint32_t attrib_buf_idx_offset;
   } desc_copy;
   uint32_t set_desc_counts[MAX_SETS];
};

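/* Load a single field of pan_nir_desc_copy_info from the push-constant
 * buffer. The load's bit size and offset are derived from the struct layout
 * at compile time. */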
#define get_input_field(b, name)                                               \
   nir_load_push_constant(                                                     \
      b, 1, sizeof(((struct pan_nir_desc_copy_info *)0)->name) * 8,            \
      nir_imm_int(b, offsetof(struct pan_nir_desc_copy_info, name)))

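/* Fetch entry `idx` from an array stored in the push-constant buffer. The
 * index is dynamic, so we load every entry up front and then narrow the
 * lookup table down with a bcsel network driven by the index bits. */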
static nir_def *
get_array_entry(nir_builder *b, unsigned array_offset, unsigned array_size,
                unsigned array_stride, nir_def *idx)
{
   assert(array_size > 0);
   assert(array_stride == 4 || array_stride == 8);

   STACK_ARRAY(nir_def *, lut, array_size);

   /* First we populate a lookup table covering the whole array. */
   for (unsigned i = 0; i < array_size; i++) {
      lut[i] = nir_load_push_constant(
         b, 1, array_stride * 8,
         nir_imm_int(b, (i * array_stride) + array_offset));
   }

   /* Then we test each bit in the index, starting from the MSB of the
    * biggest valid index in the array, and select the entry accordingly. */
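   /* For instance, with array_size == 5 the first pass uses lut_stride == 4
    * and selects lut[4] over lut[0] when bit 2 of idx is set; the following
    * passes (stride 2, then 1) leave the selected entry in lut[0]. */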
   for (unsigned lut_stride = BITFIELD_BIT(util_last_bit(array_size - 1) - 1);
        lut_stride > 0; lut_stride >>= 1) {
      nir_def *bit_is_set = nir_i2b(b, nir_iand_imm(b, idx, lut_stride));

      for (unsigned i = 0; i < lut_stride && i + lut_stride < array_size; i++)
         lut[i] = nir_bcsel(b, bit_is_set, lut[i + lut_stride], lut[i]);
   }

   nir_def *result = lut[0];

   STACK_ARRAY_FINISH(lut);

   return result;
}

#define get_input_array_slot(b, name, index)                                   \
   get_array_entry(b, offsetof(struct pan_nir_desc_copy_info, name),           \
                   ARRAY_SIZE(((struct pan_nir_desc_copy_info *)0)->name),     \
                   sizeof(((struct pan_nir_desc_copy_info *)0)->name[0]),      \
                   index)

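/* Copy handles pack the source table/set index in their top 4 bits and the
 * descriptor index in the remaining 28 bits. */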
static void
extract_desc_info_from_handle(nir_builder *b, nir_def *handle, nir_def **table,
                              nir_def **desc_idx)
{
   *table = nir_ushr_imm(b, handle, 28);
   *desc_idx = nir_iand_imm(b, handle, 0xfffffff);
}

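/* Copy one descriptor from a descriptor set into a per-type descriptor
 * table, or zero-fill the destination slot when the source index is out of
 * bounds. */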
static void
set_to_table_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
                  nir_def *src_desc_idx, nir_def *table_ptr,
                  nir_def *dst_desc_idx, unsigned element_size)
{
   /* The last binding can have
    * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, so we need to
    * make sure we don't do an out-of-bounds access on the source set. */
   nir_def *dst_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));

   nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
   {
      nir_def *src_offset =
         nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
      nir_def *desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
                                      element_size, element_size / 4, 32);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_push_else(b, NULL);
   {
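      /* Out-of-bounds source: write a zeroed descriptor so the table never
       * exposes stale memory. */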
      nir_const_value v[] = {
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
      };

      nir_def *desc = nir_build_imm(b, element_size / 4, 32, v);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_pop_if(b, NULL);
}

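/* Image descriptors are split across two tables: each one is exposed to the
 * shader as a vertex attribute plus a pair of attribute buffer slots, so the
 * source descriptor feeds both the attribute table and the attribute buffer
 * table. */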
static void
set_to_table_img_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
                      nir_def *src_desc_idx, nir_def *attrib_table_ptr,
                      nir_def *attrib_buf_table_ptr, nir_def *dst_desc_idx)
{
   /* The last binding can have
    * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, so we need to
    * make sure we don't do an out-of-bounds access on the source set. */
   const unsigned element_size = pan_size(ATTRIBUTE_BUFFER) * 2;
   const unsigned attrib_buf_comps = element_size / 4;
   const unsigned attrib_comps = pan_size(ATTRIBUTE) / 4;
   nir_def *attrib_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, pan_size(ATTRIBUTE)));
   nir_def *attrib_buf_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));

   nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
   {
      nir_def *attr_buf_idx_offset =
         get_input_field(b, desc_copy.attrib_buf_idx_offset);
      nir_def *src_offset =
         nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
      nir_def *src_desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
                                          element_size, element_size / 4, 32);
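      /* Word 2 of the source descriptor appears to pack the attribute format
       * in its upper bits and attribute-buffer state in its low 10 bits, so
       * it gets split between the attribute and attribute-buffer descriptors
       * (see the BITFIELD_MASK(10) below). */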
      nir_def *fmt = nir_iand_imm(b, nir_channel(b, src_desc, 2), 0xfffffc00);

      /* Each image descriptor takes two attribute buffer slots, and we need
       * to add the attribute buffer offset so that images also work from
       * vertex shaders. */
      nir_def *buf_idx =
         nir_iadd(b, nir_imul_imm(b, dst_desc_idx, 2), attr_buf_idx_offset);

      nir_def *attrib_w1 = nir_ior(b, buf_idx, fmt);

      nir_def *attrib_desc = nir_vec2(b, attrib_w1, nir_imm_int(b, 0));

      nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
                       pan_size(ATTRIBUTE), attrib_desc,
                       nir_component_mask(attrib_comps));

      nir_def *attrib_buf_desc = nir_vec8(
         b, nir_channel(b, src_desc, 0), nir_channel(b, src_desc, 1),
         nir_iand_imm(b, nir_channel(b, src_desc, 2), BITFIELD_MASK(10)),
         nir_channel(b, src_desc, 3), nir_channel(b, src_desc, 4),
         nir_channel(b, src_desc, 5), nir_channel(b, src_desc, 6),
         nir_channel(b, src_desc, 7));
      nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
                       element_size, attrib_buf_desc,
                       nir_component_mask(attrib_buf_comps));
   }
   nir_push_else(b, NULL);
   {
      nir_const_value v[] = {
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
      };

      nir_def *desc =
         nir_build_imm(b, MAX2(attrib_buf_comps, attrib_comps), 32, v);

      nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
                       element_size, desc,
                       nir_component_mask(attrib_buf_comps));
      nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
                       pan_size(ATTRIBUTE), desc,
                       nir_component_mask(attrib_comps));
   }
   nir_pop_if(b, NULL);
}

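/* Process one entry of the copy table: decode the copy handle, then route
 * the descriptor to the UBO, image, texture or sampler table depending on
 * which limits[] range the copy index falls into. */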
static void
single_desc_copy(nir_builder *b, nir_def *desc_copy_idx)
{
   nir_def *desc_copy_offset = nir_imul_imm(b, desc_copy_idx, sizeof(uint32_t));
   nir_def *desc_copy_ptr = nir_iadd(b, get_input_field(b, desc_copy.table),
                                     nir_u2u64(b, desc_copy_offset));
   nir_def *src_copy_handle = nir_load_global(b, desc_copy_ptr, 4, 1, 32);

   nir_def *set_idx, *src_desc_idx;
   extract_desc_info_from_handle(b, src_copy_handle, &set_idx, &src_desc_idx);

   nir_def *set_ptr = get_input_array_slot(b, sets, set_idx);
   nir_def *set_desc_count = get_input_array_slot(b, set_desc_counts, set_idx);
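   /* The limits[] array holds cumulative counts (see the prefix-sum loop in
    * panvk_per_arch(meta_get_copy_desc_job)), so each *_end value is the
    * first copy index belonging to the next table. */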
   nir_def *ubo_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_UBO]);
   nir_def *img_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_IMG]);
   nir_def *tex_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_TEXTURE]);
   nir_def *sampler_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

   nir_push_if(b, nir_ult(b, desc_copy_idx, ubo_end));
   {
      nir_def *table_ptr =
         get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_UBO]);

      set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx, table_ptr,
                        desc_copy_idx, sizeof(struct mali_attribute_packed));
   }
   nir_push_else(b, NULL);
   {
      nir_push_if(b, nir_ult(b, desc_copy_idx, img_end));
      {
         nir_def *table_ptr =
            get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_IMG]);
         nir_def *attrib_table_ptr = get_input_field(b, img_attrib_table);
         nir_def *attrib_buf_table_ptr = table_ptr;

         set_to_table_img_copy(b, set_ptr, set_desc_count, src_desc_idx,
                               attrib_table_ptr, attrib_buf_table_ptr,
                               nir_isub(b, desc_copy_idx, ubo_end));
      }
      nir_push_else(b, NULL);
      {
         nir_push_if(b, nir_ult(b, desc_copy_idx, tex_end));
         {
            nir_def *table_ptr =
               get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]);

            set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                              table_ptr, nir_isub(b, desc_copy_idx, img_end),
                              sizeof(struct mali_texture_packed));
         }
         nir_push_else(b, NULL);
         {
            nir_push_if(b, nir_ult(b, desc_copy_idx, sampler_end));
            {
               nir_def *table_ptr =
                  get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

               set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                                 table_ptr, nir_isub(b, desc_copy_idx, tex_end),
                                 sizeof(struct mali_sampler_packed));
            }
            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);
}

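/* Lazily compile the descriptor-copy compute shader and return the GPU
 * address of its RENDERER_STATE descriptor. The shader is cached in the
 * vk_meta object cache, so compilation happens at most once per device;
 * returns 0 on compilation or allocation failure. */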
static uint64_t
panvk_meta_desc_copy_rsd(struct panvk_device *dev)
{
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   enum panvk_meta_object_key_type key = PANVK_META_OBJECT_KEY_COPY_DESC_SHADER;
   struct panvk_internal_shader *shader;

   VkShaderEXT shader_handle = (VkShaderEXT)vk_meta_lookup_object(
      &dev->meta, VK_OBJECT_TYPE_SHADER_EXT, &key, sizeof(key));
   if (shader_handle != VK_NULL_HANDLE)
      goto out;

   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_COMPUTE, GENX(pan_shader_get_compiler_options)(), "%s",
      "desc_copy");

   /* The workgroup size is actually overridden at execution time so that we
    * issue exactly the number of jobs we need. */
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;

   nir_def *desc_copy_id =
      nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
   single_desc_copy(&b, desc_copy_id);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = phys_dev->kmod.props.gpu_prod_id,
      .no_ubo_to_push = true,
   };

   pan_shader_preprocess(b.shader, inputs.gpu_id);

   VkResult result = panvk_per_arch(create_internal_shader)(
      dev, b.shader, &inputs, &shader);

   ralloc_free(b.shader);

   if (result != VK_SUCCESS)
      return 0;

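   /* push.count is expressed in 32-bit words, hence the DIV_ROUND_UP by 4. */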
   shader->info.push.count =
      DIV_ROUND_UP(sizeof(struct pan_nir_desc_copy_info), 4);

   shader->rsd = panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);
   if (!panvk_priv_mem_host_addr(shader->rsd)) {
      vk_shader_destroy(&dev->vk, &shader->vk, NULL);
      return 0;
   }

   pan_cast_and_pack(panvk_priv_mem_host_addr(shader->rsd), RENDERER_STATE,
                     cfg) {
      pan_shader_prepare_rsd(&shader->info,
                             panvk_priv_mem_dev_addr(shader->code_mem), &cfg);
   }

   shader_handle = (VkShaderEXT)vk_meta_cache_object(
      &dev->vk, &dev->meta, &key, sizeof(key), VK_OBJECT_TYPE_SHADER_EXT,
      (uint64_t)panvk_internal_shader_to_handle(shader));

out:
   shader = panvk_internal_shader_from_handle(shader_handle);
   return panvk_priv_mem_dev_addr(shader->rsd);
}

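/* Build a compute job that runs the descriptor-copy shader for the given
 * shader/descriptor-state pair. On success, *job_desc points at a filled
 * COMPUTE_JOB descriptor; it is left zeroed when there is nothing to copy. */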
VkResult
panvk_per_arch(meta_get_copy_desc_job)(
   struct panvk_cmd_buffer *cmdbuf, const struct panvk_shader *shader,
   const struct panvk_descriptor_state *desc_state,
   const struct panvk_shader_desc_state *shader_desc_state,
   uint32_t attrib_buf_idx_offset, struct panfrost_ptr *job_desc)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   *job_desc = (struct panfrost_ptr){0};

   if (!shader)
      return VK_SUCCESS;

   uint64_t copy_table = panvk_priv_mem_dev_addr(shader->desc_info.others.map);
   if (!copy_table)
      return VK_SUCCESS;

   struct pan_nir_desc_copy_info copy_info = {
      .img_attrib_table = shader_desc_state->img_attrib_table,
      .desc_copy =
         {
            .table = copy_table,
            .attrib_buf_idx_offset = attrib_buf_idx_offset,
         },
   };

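   /* Turn the per-table descriptor counts into a prefix sum: limits[i] is
    * the end of table i's range in the flat copy index space. */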
   for (uint32_t i = 0; i < ARRAY_SIZE(copy_info.desc_copy.limits); i++)
      copy_info.desc_copy.limits[i] =
         shader->desc_info.others.count[i] +
         (i > 0 ? copy_info.desc_copy.limits[i - 1] : 0);

   for (uint32_t i = 0; i < ARRAY_SIZE(desc_state->sets); i++) {
      const struct panvk_descriptor_set *set = desc_state->sets[i];

      if (!set)
         continue;

      copy_info.sets[i] = set->descs.dev;
      copy_info.set_desc_counts[i] = set->desc_count;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
      uint32_t desc_count = shader->desc_info.others.count[i];

      if (!desc_count)
         continue;

      copy_info.tables[i] = shader_desc_state->tables[i];
   }

   uint64_t desc_copy_rsd = panvk_meta_desc_copy_rsd(dev);
   if (!desc_copy_rsd)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   struct panfrost_ptr push_uniforms =
      panvk_cmd_alloc_dev_mem(cmdbuf, desc, sizeof(copy_info), 16);

   if (!push_uniforms.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   memcpy(push_uniforms.cpu, &copy_info, sizeof(copy_info));

   *job_desc = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB);
   if (!job_desc->gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Given the per-stage max descriptors limit, we should never
    * reach the workgroup dimension limit. */
   uint32_t copy_count =
      copy_info.desc_copy.limits[PANVK_BIFROST_DESC_TABLE_COUNT - 1];

   assert(copy_count - 1 < BITFIELD_MASK(10));

   panfrost_pack_work_groups_compute(
      pan_section_ptr(job_desc->cpu, COMPUTE_JOB, INVOCATION), 1, 1, 1,
      copy_count, 1, 1, false, false);

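   /* job_task_split is the sum of one log2 term per job dimension; x is
    * copy_count and y/z are both 1 here, hence the (1 + 1) operands. */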
   pan_section_pack(job_desc->cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(copy_count + 1) +
                           util_logbase2_ceil(1 + 1) +
                           util_logbase2_ceil(1 + 1);
   }

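   /* The copy shader doesn't use any thread-local storage, so a zero-filled
    * TLS descriptor is sufficient. */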
   struct pan_tls_info tlsinfo = {0};
   struct panfrost_ptr tls = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tls.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   GENX(pan_emit_tls)(&tlsinfo, tls.cpu);

   pan_section_pack(job_desc->cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = desc_copy_rsd;
      cfg.push_uniforms = push_uniforms.gpu;
      cfg.thread_storage = tls.gpu;
   }

   return VK_SUCCESS;
}