/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <stdio.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "panfrost-quirks.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"

#define WORD(x) ((x) * 4)
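
/* Job and descriptor fields are addressed in 32-bit words below; WORD(n)
 * turns a word index into the byte offset expected by get_address_imm(),
 * e.g. WORD(8) selects bytes 32..35 of a descriptor.
 */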

#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);
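
/* These macros emit structured NIR control flow through the nir_builder
 * pointer named b in the enclosing scope. A minimal usage sketch (assuming
 * such a builder is in scope):
 *
 *     LOOP {
 *             IF (exit_condition)
 *                     BREAK;
 *             ENDIF
 *             ...
 *     }
 *
 * IF()/ELSE/ENDIF expand to full statements, hence the trailing semicolons
 * in their definitions.
 */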

#define MIN_MAX_JOBS 128

struct draw_data {
        nir_ssa_def *draw_buf;
        nir_ssa_def *draw_buf_stride;
        nir_ssa_def *index_buf;
        nir_ssa_def *restart_index;
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;
        nir_ssa_def *min_max_ctx;
};

struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};

struct jobs_data {
        nir_ssa_def *vertex_job;
        nir_ssa_def *tiler_job;
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;
};

struct varyings_data {
        nir_ssa_def *varying_bufs;
        nir_ssa_def *pos_ptr;
        nir_ssa_def *psiz_ptr;
        nir_variable *mem_ptr;
};

struct attribs_data {
        nir_ssa_def *attrib_count;
        nir_ssa_def *attrib_bufs;
        nir_ssa_def *attribs;
};

struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags;
        bool index_min_max_search;
        unsigned index_size;
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};

/* Describes an indirect draw (see glDrawArraysIndirect()) */

struct indirect_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        uint32_t start_instance;
};

struct indirect_indexed_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        int32_t index_bias;
        uint32_t start_instance;
};

/* Store the min/max index in a separate context. This is not supported yet, but
 * the DDK seems to put all min/max search jobs at the beginning of the job chain
 * when multiple indirect draws are issued to avoid the serialization caused by
 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
 * min/max and draw contexts would prevent such optimizations (draw contexts are
 * shared by all indirect draws in a batch).
 */

struct min_max_context {
        uint32_t min;
        uint32_t max;
};

/* Per-batch context shared by all indirect draws queued to a given batch. */

struct indirect_draw_context {
        /* Pointer to the top of the varying heap. */
        mali_ptr varying_mem;
};

/* Indirect draw shader inputs. These are stored in a UBO. */

struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to a uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
};

static nir_ssa_def *
get_input_data(nir_builder *b, unsigned offset, unsigned size)
{
        assert(!(offset & 0x3));
        assert(size && !(size & 0x3));

        return nir_load_ubo(b, 1, size,
                            nir_imm_int(b, 0),
                            nir_imm_int(b, offset),
                            .align_mul = 4,
                            .align_offset = 0,
                            .range_base = 0,
                            .range = ~0);
}

#define get_input_field(b, name) \
        get_input_data(b, offsetof(struct indirect_draw_inputs, name), \
                       sizeof(((struct indirect_draw_inputs *)0)->name) * 8)
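
/* Usage sketch: get_input_field(b, draw_buf) loads the 64-bit draw_buf
 * pointer from the inputs UBO, and get_input_field(b, draw_count) loads the
 * 32-bit counter; the load size in bits is derived from the field size with
 * sizeof(), so the macro works for any indirect_draw_inputs member.
 */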

static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
{
        return nir_iadd(b, base, nir_u2u64(b, offset));
}

static nir_ssa_def *
get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
{
        return get_address(b, base, nir_imm_int(b, offset));
}

static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}

static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}
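
/* Note that (1 << ncomps) - 1 forms a contiguous writemask, so
 * store_global() always writes components 0..ncomps-1 of value, and both
 * helpers assume 4-byte-aligned global addresses.
 */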

static nir_ssa_def *
get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.draw_ctx, offset),
                           1, size);
}

static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}

#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          val, \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

static nir_ssa_def *
get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
                     unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.min_max_ctx, offset),
                           1, size);
}

#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)

static void
update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, min));
        nir_global_atomic_umin(b, 32, addr, val);
}

static void
update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, max));
        nir_global_atomic_umax(b, 32, addr, val);
}

#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)

static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}

static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;

        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw(index_size=%d%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "");
        }

        nir_builder *b = &builder->b;
        b->shader->info.internal = true;
        nir_variable_create(b->shader, nir_var_mem_ubo,
                            glsl_uint_type(), "inputs");
        b->shader->info.num_ubos++;

        extract_inputs(builder);
}

static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_section_offset(TILER_JOB, DRAW);
        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}

static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}
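
/* Worked example (a sketch; the numbers just follow the code above): for
 * div = 6, r = 2, f0 = 2^34 + 3, fi = 0xaaaaaaab, ff = 1 and e = 0, so
 * *d = 0x2aaaaaab (bit 31 stripped) and *r_e = 2. The (d, r, e) triple is
 * what MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR attributes consume below.
 */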

static void
update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
                         nir_ssa_def *attrib_buf_ptr,
                         enum mali_attribute_type type,
                         nir_ssa_def *div1,
                         nir_ssa_def *div2)
{
        nir_builder *b = &builder->b;
        unsigned type_mask = BITFIELD_MASK(6);
        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
        nir_ssa_def *w0 = nir_channel(b, w01, 0);
        nir_ssa_def *w1 = nir_channel(b, w01, 1);

        /* Words 0 and 1 of the attribute descriptor contain the type,
         * pointer and the divisor exponent.
         */
        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
        w0 = nir_ior(b, w0, nir_imm_int(b, type));
        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));

        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);

        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
                /* If the divisor is not a power of two, the divisor numerator
                 * is passed in word 1 of the continuation attribute (word 5
                 * if we consider the attribute and its continuation as a
                 * single attribute).
                 */
                assert(div2);
                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
                             div2, 1);
        }
}

static void
zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
                       nir_ssa_def *attrib_buf_ptr)
{
        /* Stride is an unadorned 32-bit uint at word 2 */
        nir_builder *b = &builder->b;
        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
                        nir_imm_int(b, 0), 1);
}

static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}

/* x is a power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */

static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}

/* Based on panfrost_emit_vertex_data() */

static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);

#if PAN_ARCH <= 5
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
#endif

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                nir_ssa_def *attrib_buf_ptr =
                         get_address(b, builder->attribs.attrib_bufs,
                                     nir_imul_imm(b, attrib_idx,
                                                  2 * pan_size(ATTRIBUTE_BUFFER)));
                nir_ssa_def *attrib_ptr =
                         get_address(b, builder->attribs.attribs,
                                     nir_imul_imm(b, attrib_idx,
                                                  pan_size(ATTRIBUTE)));

                nir_ssa_def *r_e, *d;

#if PAN_ARCH <= 5
                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                        nir_ssa_def *r_p =
                                nir_bcsel(b, single_instance,
                                          nir_imm_int(b, 0x9f),
                                          builder->instance_size.packed);

                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF

                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                        split_div(b, builder->instance_size.padded,
                                  &r_e, &d);
                        nir_ssa_def *default_div =
                                nir_ior(b, single_instance,
                                        nir_ult(b,
                                                builder->instance_size.padded,
                                                nir_imm_int(b, 2)));
                        r_e = nir_bcsel(b, default_div,
                                        nir_imm_int(b, 0x3f), r_e);
                        d = nir_bcsel(b, default_div,
                                      nir_imm_int(b, (1u << 31) - 1), d);
                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                     2);
                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF
#endif

                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                        MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                        builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}
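
/* Summary of the divisor cases handled above: div == 0 with multiple
 * instances selects MALI_ATTRIBUTE_TYPE_1D_MODULUS with the packed instance
 * size; a power-of-two div selects 1D_POT_DIVISOR with its exponent; any
 * other non-zero div selects 1D_NPOT_DIVISOR with the split_div() triple;
 * and a single instance with a non-zero divisor just zeroes the stride.
 */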

static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}

/* Based on panfrost_emit_varying_descriptor() */

static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                pan_size(ATTRIBUTE_BUFFER));
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  pan_size(ATTRIBUTE_BUFFER));
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          pan_size(ATTRIBUTE_BUFFER));
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}

/* Based on panfrost_pack_work_groups_compute() */

static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}
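
/* Worked example (sketch): with instance_size.raw = 100 and
 * instance_count = 4, max_vertex = 99 and max_instance = 3, so
 * split = ufind_msb(99) + 1 = 7 and the invocation words are
 * { 99 | (3 << 7), (7 << 22) | (2 << 28) } = { 0x1e3, 0x21c00000 }.
 */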

/* Based on panfrost_padded_vertex_count() */

static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}
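
/* Worked example (sketch): val = 17 gives base = 17, which is >= 11, so it
 * is bumped to 18 and renormalized to base = 9, exp = 1. The padded count is
 * therefore 9 << 1 = 18 and *packed = 1 | ((9 >> 1) << 5) = 0x81; counts are
 * always padded to base * 2^exp with an odd base <= 9.
 */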

static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
        get_invocation(builder);
        update_job(builder, MALI_JOB_TYPE_VERTEX);
        update_job(builder, MALI_JOB_TYPE_TILER);
}


static void
set_null_job(struct indirect_draw_shader_builder *builder,
             nir_ssa_def *job_ptr)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
        nir_ssa_def *val = load_global(b, w4, 1, 32);

        /* Set job type to NULL (AKA NOOP) */
        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
        store_global(b, w4, val, 1);
}

static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* Unaligned start offset, we need to ignore any data that's
                 * outside the requested range. We also handle ranges covering
                 * less than two words here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4 bytes of data, we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}
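
/* A note on the fixup above: the min/max search job only scans whole 4-byte
 * words, so for 8/16-bit indices the (possibly partial) first and last words
 * of the range are re-read here, with out-of-range lanes neutralized to
 * UINT32_MAX for the min and 0 for the max before merging.
 */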

/* Patch a draw sequence */

static void
patch(struct indirect_draw_shader_builder *builder)
{
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        assert(builder->draw.vertex_count->num_components);

        nir_ssa_def *num_vertices =
                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);

        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
                /* If there's nothing to draw, turn the vertex/tiler jobs into
                 * null jobs.
                 */
                set_null_job(builder, builder->jobs.vertex_job);
                set_null_job(builder, builder->jobs.tiler_job);
        } ELSE {
                get_instance_size(builder);

                builder->instance_size.padded =
                        get_padded_count(b, builder->instance_size.raw,
                                         &builder->instance_size.packed);

                update_varyings(builder);
                update_jobs(builder);
                update_vertex_attribs(builder);

                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.first_vertex_sysval,
                                     builder->jobs.offset_start, 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_vertex_sysval,
                                     index_size ?
                                     builder->draw.index_bias :
                                     nir_imm_int(b, 0),
                                     1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_instance_sysval,
                                     builder->draw.start_instance, 1);
                } ENDIF
        } ENDIF
}

/* Search the min/max index in the range covered by the indirect draw call */

static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));


        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes; non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}
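
/* Each of the MIN_MAX_JOBS (128) threads scans a strided slice of the index
 * buffer: thread i visits words i, i + 128, i + 256, ... and the atomic
 * update_min()/update_max() calls merge its partial result into the shared
 * min_max_context.
 */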

static unsigned
get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
{
        if (!index_min_max_search) {
                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
                if (index_size)
                        flags |= (util_logbase2(index_size) + 1);
                return flags;
        }

        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
               util_logbase2(index_size);
}

static void
create_indirect_draw_shader(struct panfrost_device *dev,
                            unsigned flags, unsigned index_size,
                            bool index_min_max_search)
{
        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
        struct indirect_draw_shader_builder builder;
        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);

        nir_builder *b = &builder.b;

        if (index_min_max_search)
                get_index_min_max(&builder);
        else
                patch(&builder);

        struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
        struct pan_shader_info shader_info;
        struct util_dynarray binary;

        util_dynarray_init(&binary, NULL);
        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);

        assert(!shader_info.tls_size);
        assert(!shader_info.wls_size);
        assert(!shader_info.sysvals.sysval_count);

        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *draw_shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
                      (shader_id * pan_size(RENDERER_STATE));

        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (!draw_shader->rsd) {
                mali_ptr address =
                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
                                                binary.data, binary.size,
                                                PAN_ARCH >= 6 ? 128 : 64);

#if PAN_ARCH <= 5
                address |= shader_info.midgard.first_tag;
#endif

                util_dynarray_fini(&binary);

                pan_pack(state, RENDERER_STATE, cfg) {
                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
                }

                draw_shader->push = shader_info.push;
                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
                                   (shader_id * pan_size(RENDERER_STATE));
        }
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);

        ralloc_free(b->shader);
}

static mali_ptr
get_renderer_state(struct panfrost_device *dev, unsigned flags,
                   unsigned index_size, bool index_min_max_search)
{
        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *info =
                &dev->indirect_draw_shaders.shaders[shader_id];

        if (!info->rsd) {
                create_indirect_draw_shader(dev, flags, index_size,
                                            index_min_max_search);
                assert(info->rsd);
        }

        return info->rsd;
}

static mali_ptr
get_tls(const struct panfrost_device *dev)
{
        return dev->indirect_draw_shaders.states->ptr.gpu +
               (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
}

static mali_ptr
get_ubos(struct pan_pool *pool,
         const struct indirect_draw_inputs *inputs)
{
        struct panfrost_ptr inputs_buf =
                pan_pool_alloc_aligned(pool, sizeof(*inputs), 16);

        memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));

        struct panfrost_ptr ubos_buf =
                pan_pool_alloc_desc(pool, UNIFORM_BUFFER);

        pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
                cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
                cfg.pointer = inputs_buf.gpu;
        }

        return ubos_buf.gpu;
}
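
/* The UNIFORM_BUFFER descriptor counts entries in 16-byte units, hence the
 * DIV_ROUND_UP(sizeof(*inputs), 16) above; the inputs struct is uploaded
 * verbatim and read back with nir_load_ubo() in get_input_data().
 */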

static mali_ptr
get_push_uniforms(struct pan_pool *pool,
                  const struct pan_indirect_draw_shader *shader,
                  const struct indirect_draw_inputs *inputs)
{
        if (!shader->push.count)
                return 0;

        struct panfrost_ptr push_consts_buf =
                pan_pool_alloc_aligned(pool, shader->push.count * 4, 16);
        uint32_t *out = push_consts_buf.cpu;
        uint8_t *in = (uint8_t *)inputs;

        for (unsigned i = 0; i < shader->push.count; ++i)
                memcpy(out + i, in + shader->push.words[i].offset, 4);

        return push_consts_buf.gpu;
}
1206 
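/* Lazily allocates the device-wide GPU memory shared by all indirect draws:
 * a BO holding the renderer states plus the invariant thread storage
 * descriptor, and a growable heap backing the varyings. The shader lock
 * makes sure concurrent first draws only allocate once.
 */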
static void
panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
{
        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (dev->indirect_draw_shaders.states)
                goto out;

        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
                                  pan_size(RENDERER_STATE)) +
                                 pan_size(LOCAL_STORAGE);

        dev->indirect_draw_shaders.states =
                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");

        /* Prepare the thread storage descriptor now since it's invariant. */
        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
        pan_pack(tsd, LOCAL_STORAGE, ls) {
                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
        }

        /* FIXME: We currently allocate a 512M growable heap, so memory is
         * only committed as it is actually used. Two problems remain:
         * - growth happens 2M at a time, which might be more than we
         *   actually need
         * - the memory is attached to the device to speed up subsequent
         *   indirect draws, but that also means it is never shrunk
         */
        dev->indirect_draw_shaders.varying_heap =
                panfrost_bo_create(dev, 512 * 1024 * 1024,
                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
                                   "Indirect draw varying heap");

out:
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
}

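/* Emits a compute job scanning the index buffer for the minimum and maximum
 * index, recorded in the min_max_context consumed by the main patching job.
 * Returns the new job's index, or 0 for non-indexed draws where no search
 * is needed.
 */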
static unsigned
panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                   struct pan_scoreboard *scoreboard,
                                   const struct pan_indirect_draw_info *draw_info,
                                   const struct indirect_draw_inputs *inputs,
                                   struct indirect_draw_context *draw_ctx,
                                   mali_ptr ubos)
{
        struct panfrost_device *dev = pool->dev;
        unsigned index_size = draw_info->index_size;

        if (!index_size)
                return 0;

        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, true);
        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, true);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 7;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, false, 0, 0, &job, false);
}

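/* Entry point for indirect draws: emits the optional index min/max search
 * job plus a compute job that reads the indirect draw buffer on the GPU and
 * patches the already-recorded vertex/tiler jobs. Returns the patching job's
 * index so callers can chain those jobs after it.
 */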
unsigned
GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                                  struct pan_scoreboard *scoreboard,
                                  const struct pan_indirect_draw_info *draw_info,
                                  struct panfrost_ptr *ctx)
{
        struct panfrost_device *dev = pool->dev;

        /* Currently only tested on Bifrost, but the logic should be the same
         * on Midgard.
         */
        assert(pan_is_bifrost(dev));

        panfrost_indirect_draw_alloc_deps(dev);

        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, false);

        struct indirect_draw_context draw_ctx = {
                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
        };

        struct panfrost_ptr draw_ctx_ptr = *ctx;
        if (!draw_ctx_ptr.cpu) {
                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
                                                      sizeof(draw_ctx),
                                                      sizeof(mali_ptr));
        }

        struct indirect_draw_inputs inputs = {
                .draw_ctx = draw_ctx_ptr.gpu,
                .draw_buf = draw_info->draw_buf,
                .index_buf = draw_info->index_buf,
                .first_vertex_sysval = draw_info->first_vertex_sysval,
                .base_vertex_sysval = draw_info->base_vertex_sysval,
                .base_instance_sysval = draw_info->base_instance_sysval,
                .vertex_job = draw_info->vertex_job,
                .tiler_job = draw_info->tiler_job,
                .attrib_bufs = draw_info->attrib_bufs,
                .attribs = draw_info->attribs,
                .varying_bufs = draw_info->varying_bufs,
                .attrib_count = draw_info->attrib_count,
        };

        if (draw_info->index_size) {
                inputs.restart_index = draw_info->restart_index;

                struct panfrost_ptr min_max_ctx_ptr =
                        pan_pool_alloc_aligned(pool,
                                               sizeof(struct min_max_context),
                                               4);
                struct min_max_context *min_max_ctx = min_max_ctx_ptr.cpu;

                min_max_ctx->min = UINT32_MAX;
                min_max_ctx->max = 0;
                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
        }

        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, false);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        mali_ptr ubos = get_ubos(pool, &inputs);

        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, 1, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 2;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);
        }

        unsigned global_dep = draw_info->last_indirect_draw;
        unsigned local_dep =
                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
                                                   &inputs, &draw_ctx, ubos);

        if (!ctx->cpu) {
                *ctx = draw_ctx_ptr;
                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, true, local_dep, global_dep,
                                &job, false);
}

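/* One-time setup: record the binary pool and initialize the lock guarding
 * lazy shader compilation. No GPU memory is reserved here; see
 * panfrost_indirect_draw_alloc_deps().
 */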
void
GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
                                          struct pan_pool *bin_pool)
{
        /* We allocate the states and varying_heap BOs lazily to avoid
         * reserving memory when indirect draws are not used.
         */
        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
        dev->indirect_draw_shaders.bin_pool = bin_pool;
}

void
GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
{
        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
}
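
/* A minimal sketch of the expected call sequence, under the assumption that
 * the caller already owns a pool, a scoreboard, and the recorded vertex and
 * tiler jobs (illustrative only; names like bin_pool and draw_buf_gpu are
 * placeholders, and batching/submission details are omitted):
 *
 *    GENX(panfrost_init_indirect_draw_shaders)(dev, bin_pool);
 *
 *    struct panfrost_ptr ctx = { 0 };
 *    struct pan_indirect_draw_info info = {
 *            .draw_buf = draw_buf_gpu,  // GPU address of the draw parameters
 *            .index_size = 4,           // 0 for non-indexed draws
 *            // ... vertex_job, tiler_job, attribute/varying pointers ...
 *    };
 *    unsigned dep =
 *            GENX(panfrost_emit_indirect_draw)(pool, scoreboard, &info, &ctx);
 *    // Make the patched vertex/tiler jobs depend on 'dep' before submitting.
 *
 *    GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
 */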