• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2021 Collabora, Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  */
24 
25 #include <stdio.h>
26 #include "pan_bo.h"
27 #include "pan_shader.h"
28 #include "pan_scoreboard.h"
29 #include "pan_encoder.h"
30 #include "pan_indirect_dispatch.h"
31 #include "pan_pool.h"
32 #include "pan_util.h"
33 #include "panfrost-quirks.h"
34 #include "compiler/nir/nir_builder.h"
35 #include "util/u_memory.h"
36 #include "util/macros.h"
37 
38 struct indirect_dispatch_inputs {
39         mali_ptr job;
40         mali_ptr indirect_dim;
41         mali_ptr num_wg_sysval[3];
42 };
43 
44 static nir_ssa_def *
get_input_data(nir_builder * b,unsigned offset,unsigned size)45 get_input_data(nir_builder *b, unsigned offset, unsigned size)
46 {
47         assert(!(offset & 0x3));
48         assert(size && !(size & 0x3));
49 
50         return nir_load_ubo(b, 1, size,
51                             nir_imm_int(b, 0),
52                             nir_imm_int(b, offset),
53                             .align_mul = 4,
54                             .align_offset = 0,
55                             .range_base = 0,
56                             .range = ~0);
57 }
58 
/*
 * Load member `name` of struct indirect_dispatch_inputs from the input UBO.
 * The byte offset and the width in bits are both derived from the structure
 * definition, so the shader stays in sync with the CPU-side layout.
 */
#define get_input_field(b, name) \
        get_input_data(b, \
                       offsetof(struct indirect_dispatch_inputs, name), \
                       8 * sizeof(((struct indirect_dispatch_inputs *)0)->name))
62 
63 static mali_ptr
get_rsd(const struct panfrost_device * dev)64 get_rsd(const struct panfrost_device *dev)
65 {
66         return dev->indirect_dispatch.descs->ptr.gpu;
67 }
68 
69 static mali_ptr
get_tls(const struct panfrost_device * dev)70 get_tls(const struct panfrost_device *dev)
71 {
72         return dev->indirect_dispatch.descs->ptr.gpu +
73                pan_size(RENDERER_STATE);
74 }
75 
76 static mali_ptr
get_ubos(struct pan_pool * pool,const struct indirect_dispatch_inputs * inputs)77 get_ubos(struct pan_pool *pool,
78          const struct indirect_dispatch_inputs *inputs)
79 {
80         struct panfrost_ptr inputs_buf =
81                 pan_pool_alloc_aligned(pool, ALIGN_POT(sizeof(*inputs), 16), 16);
82 
83         memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));
84 
85         struct panfrost_ptr ubos_buf =
86                 pan_pool_alloc_desc(pool, UNIFORM_BUFFER);
87 
88         pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
89                 cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
90                 cfg.pointer = inputs_buf.gpu;
91         }
92 
93         return ubos_buf.gpu;
94 }
95 
96 static mali_ptr
get_push_uniforms(struct pan_pool * pool,const struct indirect_dispatch_inputs * inputs)97 get_push_uniforms(struct pan_pool *pool,
98                   const struct indirect_dispatch_inputs *inputs)
99 {
100         const struct panfrost_device *dev = pool->dev;
101         struct panfrost_ptr push_consts_buf =
102                 pan_pool_alloc_aligned(pool,
103                                        ALIGN(dev->indirect_dispatch.push.count * 4, 16),
104                                        16);
105         uint32_t *out = push_consts_buf.cpu;
106         uint8_t *in = (uint8_t *)inputs;
107 
108         for (unsigned i = 0; i < dev->indirect_dispatch.push.count; ++i)
109                 memcpy(out + i, in +  dev->indirect_dispatch.push.words[i].offset, 4);
110 
111         return push_consts_buf.gpu;
112 }
113 
114 unsigned
GENX(pan_indirect_dispatch_emit)115 GENX(pan_indirect_dispatch_emit)(struct pan_pool *pool,
116                                  struct pan_scoreboard *scoreboard,
117                                  const struct pan_indirect_dispatch_info *dispatch_info)
118 {
119         struct panfrost_device *dev = pool->dev;
120         struct panfrost_ptr job =
121                 pan_pool_alloc_desc(pool, COMPUTE_JOB);
122         void *invocation =
123                 pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
124         struct indirect_dispatch_inputs inputs = {
125                 .job = dispatch_info->job,
126                 .indirect_dim = dispatch_info->indirect_dim,
127                 .num_wg_sysval = {
128                         dispatch_info->num_wg_sysval[0],
129                         dispatch_info->num_wg_sysval[1],
130                         dispatch_info->num_wg_sysval[2],
131                 },
132         };
133 
134         panfrost_pack_work_groups_compute(invocation,
135                                           1, 1, 1, 1, 1, 1,
136                                           false, false);
137 
138         pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
139                 cfg.job_task_split = 2;
140         }
141 
142         pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
143                 cfg.draw_descriptor_is_64b = true;
144                 cfg.state = get_rsd(dev);
145                 cfg.thread_storage = get_tls(pool->dev);
146                 cfg.uniform_buffers = get_ubos(pool, &inputs);
147                 cfg.push_uniforms = get_push_uniforms(pool, &inputs);
148         }
149 
150         return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
151                                 false, true, 0, 0, &job, false);
152 }
153 
154 void
GENX(pan_indirect_dispatch_init)155 GENX(pan_indirect_dispatch_init)(struct panfrost_device *dev)
156 {
157         nir_builder b =
158                 nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
159                                                GENX(pan_shader_get_compiler_options)(),
160                                                "%s", "indirect_dispatch");
161         b.shader->info.internal = true;
162         nir_variable_create(b.shader, nir_var_mem_ubo,
163                             glsl_uint_type(), "inputs");
164         b.shader->info.num_ubos++;
165 
166         nir_ssa_def *zero = nir_imm_int(&b, 0);
167         nir_ssa_def *one = nir_imm_int(&b, 1);
168         nir_ssa_def *num_wg = nir_load_global(&b, get_input_field(&b, indirect_dim), 4, 3, 32);
169         nir_ssa_def *num_wg_x = nir_channel(&b, num_wg, 0);
170         nir_ssa_def *num_wg_y = nir_channel(&b, num_wg, 1);
171         nir_ssa_def *num_wg_z = nir_channel(&b, num_wg, 2);
172 
173         nir_ssa_def *job_hdr_ptr = get_input_field(&b, job);
174         nir_ssa_def *num_wg_flat = nir_imul(&b, num_wg_x, nir_imul(&b, num_wg_y, num_wg_z));
175 
176         nir_push_if(&b, nir_ieq(&b, num_wg_flat, zero));
177         {
178                 nir_ssa_def *type_ptr = nir_iadd(&b, job_hdr_ptr, nir_imm_int64(&b, 4 * 4));
179                 nir_ssa_def *ntype = nir_imm_intN_t(&b, (MALI_JOB_TYPE_NULL << 1) | 1, 8);
180                 nir_store_global(&b, type_ptr, 1, ntype, 1);
181         }
182         nir_push_else(&b, NULL);
183         {
184                 nir_ssa_def *job_dim_ptr = nir_iadd(&b, job_hdr_ptr,
185                                 nir_imm_int64(&b, pan_section_offset(COMPUTE_JOB, INVOCATION)));
186                 nir_ssa_def *num_wg_x_m1 = nir_isub(&b, num_wg_x, one);
187                 nir_ssa_def *num_wg_y_m1 = nir_isub(&b, num_wg_y, one);
188                 nir_ssa_def *num_wg_z_m1 = nir_isub(&b, num_wg_z, one);
189                 nir_ssa_def *job_dim = nir_load_global(&b, job_dim_ptr, 8, 2, 32);
190                 nir_ssa_def *dims = nir_channel(&b, job_dim, 0);
191                 nir_ssa_def *split = nir_channel(&b, job_dim, 1);
192                 nir_ssa_def *num_wg_x_split = nir_iand_imm(&b, nir_ushr_imm(&b, split, 10), 0x3f);
193                 nir_ssa_def *num_wg_y_split = nir_iadd(&b, num_wg_x_split,
194                                 nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_x_m1)));
195                 nir_ssa_def *num_wg_z_split = nir_iadd(&b, num_wg_y_split,
196                                 nir_isub_imm(&b, 32, nir_uclz(&b, num_wg_y_m1)));
197                 split = nir_ior(&b, split,
198                                 nir_ior(&b,
199                                         nir_ishl(&b, num_wg_y_split, nir_imm_int(&b, 16)),
200                                         nir_ishl(&b, num_wg_z_split, nir_imm_int(&b, 22))));
201                 dims = nir_ior(&b, dims,
202                                nir_ior(&b, nir_ishl(&b, num_wg_x_m1, num_wg_x_split),
203                                        nir_ior(&b, nir_ishl(&b, num_wg_y_m1, num_wg_y_split),
204                                                nir_ishl(&b, num_wg_z_m1, num_wg_z_split))));
205 
206                 nir_store_global(&b, job_dim_ptr, 8, nir_vec2(&b, dims, split), 3);
207 
208                 nir_ssa_def *num_wg_x_ptr = get_input_field(&b, num_wg_sysval[0]);
209 
210                 nir_push_if(&b, nir_ine(&b, num_wg_x_ptr, nir_imm_int64(&b, 0)));
211                 {
212                         nir_store_global(&b, num_wg_x_ptr, 8, num_wg_x, 1);
213                         nir_store_global(&b, get_input_field(&b, num_wg_sysval[1]), 8, num_wg_y, 1);
214                         nir_store_global(&b, get_input_field(&b, num_wg_sysval[2]), 8, num_wg_z, 1);
215                 }
216                 nir_pop_if(&b, NULL);
217         }
218 
219         nir_pop_if(&b, NULL);
220 
221         struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
222         struct pan_shader_info shader_info;
223         struct util_dynarray binary;
224 
225         util_dynarray_init(&binary, NULL);
226         GENX(pan_shader_compile)(b.shader, &inputs, &binary, &shader_info);
227 
228         ralloc_free(b.shader);
229 
230         assert(!shader_info.tls_size);
231         assert(!shader_info.wls_size);
232         assert(!shader_info.sysvals.sysval_count);
233 
234         dev->indirect_dispatch.bin =
235                 panfrost_bo_create(dev, binary.size, PAN_BO_EXECUTE,
236                                 "Indirect dispatch shader");
237 
238         memcpy(dev->indirect_dispatch.bin->ptr.cpu, binary.data, binary.size);
239         util_dynarray_fini(&binary);
240 
241         dev->indirect_dispatch.push = shader_info.push;
242         dev->indirect_dispatch.descs =
243                 panfrost_bo_create(dev,
244                                    pan_size(RENDERER_STATE) +
245                                    pan_size(LOCAL_STORAGE),
246                                    0, "Indirect dispatch descriptors");
247 
248         mali_ptr address = dev->indirect_dispatch.bin->ptr.gpu;
249 
250 #if PAN_ARCH <= 5
251         address |= shader_info.midgard.first_tag;
252 #endif
253 
254         void *rsd = dev->indirect_dispatch.descs->ptr.cpu;
255         pan_pack(rsd, RENDERER_STATE, cfg) {
256                 pan_shader_prepare_rsd(&shader_info, address, &cfg);
257         }
258 
259         void *tsd = dev->indirect_dispatch.descs->ptr.cpu +
260                     pan_size(RENDERER_STATE);
261         pan_pack(tsd, LOCAL_STORAGE, ls) {
262                 ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
263         };
264 }
265 
266 void
GENX(pan_indirect_dispatch_cleanup)267 GENX(pan_indirect_dispatch_cleanup)(struct panfrost_device *dev)
268 {
269         panfrost_bo_unreference(dev->indirect_dispatch.bin);
270         panfrost_bo_unreference(dev->indirect_dispatch.descs);
271 }
272