/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <stdio.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"

#define WORD(x) ((x) * 4)

#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);
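
/* These macros emit structured control flow into the NIR shader being built.
 * For illustration (this example is not part of the original source), a
 * bounded loop over a local variable follows the pattern used throughout
 * this file:
 *
 *     LOOP {
 *             nir_ssa_def *i = nir_load_var(b, i_var);
 *             IF (nir_uge(b, i, count))
 *                     BREAK;
 *             ENDIF
 *             ... loop body ...
 *             nir_store_var(b, i_var, nir_iadd_imm(b, i, 1), 1);
 *     }
 */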

#define MIN_MAX_JOBS 128
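/* Number of threads launched for the min/max index search; each thread
 * strides through the index buffer by MIN_MAX_JOBS 32-bit words (see
 * get_index_min_max() and panfrost_emit_index_min_max_search()). */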

struct draw_data {
        nir_ssa_def *draw_buf;
        nir_ssa_def *draw_buf_stride;
        nir_ssa_def *index_buf;
        nir_ssa_def *restart_index;
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;
        nir_ssa_def *min_max_ctx;
};

struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};

struct jobs_data {
        nir_ssa_def *vertex_job;
        nir_ssa_def *tiler_job;
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;
};

struct varyings_data {
        nir_ssa_def *varying_bufs;
        nir_ssa_def *pos_ptr;
        nir_ssa_def *psiz_ptr;
        nir_variable *mem_ptr;
};

struct attribs_data {
        nir_ssa_def *attrib_count;
        nir_ssa_def *attrib_bufs;
        nir_ssa_def *attribs;
};

struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags;
        bool index_min_max_search;
        unsigned index_size;
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};

/* Describes an indirect draw (see glDrawArraysIndirect()) */

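/* Note: the field layout matches GL's DrawArraysIndirectCommand (and the
 * indexed variant below matches DrawElementsIndirectCommand), so
 * application-provided indirect buffers can be consumed directly. */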
struct indirect_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        uint32_t start_instance;
};

struct indirect_indexed_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        int32_t index_bias;
        uint32_t start_instance;
};

/* Store the min/max index in a separate context. This is not supported yet, but
 * the DDK seems to put all min/max search jobs at the beginning of the job chain
 * when multiple indirect draws are issued to avoid the serialization caused by
 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
 * min/max and draw contexts would prevent such optimizations (draw contexts are
 * shared by all indirect draws in a batch).
 */

struct min_max_context {
        uint32_t min;
        uint32_t max;
};

/* Per-batch context shared by all indirect draws queued to a given batch. */

struct indirect_draw_context {
        /* Pointer to the top of the varying heap. */
        mali_ptr varying_mem;
};

/* Indirect draw shader inputs. These are stored in FAU. */

struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to a uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
} PACKED;

#define get_input_field(b, name) \
        nir_load_push_constant(b, \
               1, sizeof(((struct indirect_draw_inputs *)0)->name) * 8, \
               nir_imm_int(b, 0), \
               .base = offsetof(struct indirect_draw_inputs, name))
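
/* For illustration (not in the original source): get_input_field(b, draw_buf)
 * expands to a single-component, 64-bit push-constant load at
 * offsetof(struct indirect_draw_inputs, draw_buf), i.e. the whole inputs
 * struct is exposed to the shader through push constants. */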

static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
{
        return nir_iadd(b, base, nir_u2u64(b, offset));
}

static nir_ssa_def *
get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
{
        return get_address(b, base, nir_imm_int(b, offset));
}

static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}

static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}

static nir_ssa_def *
get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.draw_ctx, offset),
                           1, size);
}

static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}

#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          val, \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

static nir_ssa_def *
get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
                     unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.min_max_ctx, offset),
                           1, size);
}

#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)

static void
update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, min));
        nir_global_atomic_umin(b, 32, addr, val);
}

static void
update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, max));
        nir_global_atomic_umax(b, 32, addr, val);
}

#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)

static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}

static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;

        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw(index_size=%d%s%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "",
                                                       flags & PAN_INDIRECT_DRAW_IDVS ?
                                                       ",idvs" : "");
        }

        extract_inputs(builder);
}

static void
update_dcd(struct indirect_draw_shader_builder *builder,
           nir_ssa_def *job_ptr,
           unsigned draw_offset)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}

static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_section_offset(TILER_JOB, DRAW);
        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        update_dcd(builder, job_ptr, draw_offset);

        if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
                assert(type == MALI_JOB_TYPE_TILER);

                update_dcd(builder, job_ptr,
                           pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
        }
}

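/* split_div() turns an arbitrary 32-bit divisor into the (exponent, mantissa)
 * pair consumed by the NPOT-divisor attribute mode: d is a fixed-point
 * reciprocal such that, roughly, idx / div == (idx * d) >> (32 + r), with e
 * flagging an extra rounding increment. A rough CPU-side sketch of the same
 * computation (illustration only, not part of the original source):
 *
 *     uint32_t r = util_logbase2(div);                // floor(log2(div))
 *     uint64_t f0 = (1ull << (r + 32)) + (div >> 1);
 *     uint32_t d = (uint32_t)(f0 / div) & ~(1u << 31);
 *     bool e = (f0 % div) > (div >> 1);
 */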
static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}

static void
update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
                         nir_ssa_def *attrib_buf_ptr,
                         enum mali_attribute_type type,
                         nir_ssa_def *div1,
                         nir_ssa_def *div2)
{
        nir_builder *b = &builder->b;
        unsigned type_mask = BITFIELD_MASK(6);
        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
        nir_ssa_def *w0 = nir_channel(b, w01, 0);
        nir_ssa_def *w1 = nir_channel(b, w01, 1);

        /* Words 0 and 1 of the attribute descriptor contain the type,
         * pointer and the divisor exponent.
         */
        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
        w0 = nir_ior(b, w0, nir_imm_int(b, type));
        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));

        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);

        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
                /* If the divisor is not a power of two, the divisor numerator
                 * is passed in word 1 of the continuation attribute (word 5
                 * if we consider the attribute and its continuation as a
                 * single attribute).
                 */
                assert(div2);
                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
                             div2, 1);
        }
}

static void
zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
                       nir_ssa_def *attrib_buf_ptr)
{
        /* Stride is an unadorned 32-bit uint at word 2 */
        nir_builder *b = &builder->b;
        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
                        nir_imm_int(b, 0), 1);
}

static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}

/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */

static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}

/* Based on panfrost_emit_vertex_data() */

static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);

#if PAN_ARCH <= 5
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
#endif

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                nir_ssa_def *attrib_buf_ptr =
                         get_address(b, builder->attribs.attrib_bufs,
                                     nir_imul_imm(b, attrib_idx,
                                                  2 * pan_size(ATTRIBUTE_BUFFER)));
                nir_ssa_def *attrib_ptr =
                         get_address(b, builder->attribs.attribs,
                                     nir_imul_imm(b, attrib_idx,
                                                  pan_size(ATTRIBUTE)));

                nir_ssa_def *r_e, *d;

#if PAN_ARCH <= 5
                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                        nir_ssa_def *r_p =
                                nir_bcsel(b, single_instance,
                                          nir_imm_int(b, 0x9f),
                                          builder->instance_size.packed);

                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF

                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                        split_div(b, builder->instance_size.padded,
                                  &r_e, &d);
                        nir_ssa_def *default_div =
                                nir_ior(b, single_instance,
                                        nir_ult(b,
                                                builder->instance_size.padded,
                                                nir_imm_int(b, 2)));
                        r_e = nir_bcsel(b, default_div,
                                        nir_imm_int(b, 0x3f), r_e);
                        d = nir_bcsel(b, default_div,
                                      nir_imm_int(b, (1u << 31) - 1), d);
                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                     2);
                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF
#endif

                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                        MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                        builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}

static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}

/* Based on panfrost_emit_varying_descriptor() */

static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                pan_size(ATTRIBUTE_BUFFER));
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  pan_size(ATTRIBUTE_BUFFER));
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          pan_size(ATTRIBUTE_BUFFER));
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}

/* Based on panfrost_pack_work_groups_compute() */

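/* The first invocation word packs (vertex count - 1) in the low bits with
 * (instance count - 1) shifted up by the chosen split; the second word
 * records the split position at bit 22. The 2 << 28 constant mirrors what
 * panfrost_pack_work_groups_compute() writes; the exact field semantics
 * stated here are an assumption. */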
static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}

static nir_ssa_def *
nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
{
        assert(pot != 0 && util_is_power_of_two_or_zero(pot));

        return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
}

/* Based on panfrost_padded_vertex_count() */

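/* Rounds val up to the next value of the form base << exp, where base ends up
 * odd and (after the >= 11 bump below) appears restricted to {1, 3, 5, 7, 9};
 * also returns the packed (exp | (base >> 1) << 5) encoding used for instance
 * divisors. Worked example (illustrative): val = 17 yields base = 9, exp = 1,
 * i.e. a padded count of 18. */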
static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}

static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
        get_invocation(builder);

        if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                update_job(builder, MALI_JOB_TYPE_VERTEX);

        update_job(builder, MALI_JOB_TYPE_TILER);
}

static void
set_null_job(struct indirect_draw_shader_builder *builder,
             nir_ssa_def *job_ptr)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
        nir_ssa_def *val = load_global(b, w4, 1, 32);

        /* Set job type to NULL (AKA NOOP) */
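        /* The job type field appears to live in bits [1:7] of word 4, which
         * is why the mask below preserves bit 0 and bits [8:31] and the type
         * is shifted left by one. */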
        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
        store_global(b, w4, val, 1);
}

static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* Unaligned start offset: we need to ignore any data that's
                 * outside the requested range. Ranges covering less than two
                 * words are also handled here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4 bytes of data; we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}

/* Patch a draw sequence */

static void
patch(struct indirect_draw_shader_builder *builder)
{
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        assert(builder->draw.vertex_count->num_components);

        nir_ssa_def *num_vertices =
                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);

        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
                /* If there's nothing to draw, turn the vertex/tiler jobs into
                 * null jobs.
                 */
                if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
                        set_null_job(builder, builder->jobs.vertex_job);

                set_null_job(builder, builder->jobs.tiler_job);
        } ELSE {
                get_instance_size(builder);

                nir_ssa_def *count = builder->instance_size.raw;

                /* IDVS requires padding to a multiple of 4 */
                if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
                        count = nir_align_pot(b, count, 4);

                builder->instance_size.padded =
                        get_padded_count(b, count,
                                         &builder->instance_size.packed);

                update_varyings(builder);
                update_jobs(builder);
                update_vertex_attribs(builder);

                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.first_vertex_sysval,
                                     builder->jobs.offset_start, 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_vertex_sysval,
                                     index_size ?
                                     builder->draw.index_bias :
                                     nir_imm_int(b, 0),
                                     1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_instance_sysval,
                                     builder->draw.start_instance, 1);
                } ENDIF
        } ENDIF
}

/* Search the min/max index in the range covered by the indirect draw call */

static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));

        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes; non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}

static unsigned
get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
{
        if (!index_min_max_search) {
                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
                if (index_size)
                        flags |= (util_logbase2(index_size) + 1);
                return flags;
        }

        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
               util_logbase2(index_size);
}

static void
create_indirect_draw_shader(struct panfrost_device *dev,
                            unsigned flags, unsigned index_size,
                            bool index_min_max_search)
{
        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
        struct indirect_draw_shader_builder builder;
        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);

        nir_builder *b = &builder.b;

        if (index_min_max_search)
                get_index_min_max(&builder);
        else
                patch(&builder);

        struct panfrost_compile_inputs inputs = {
                .gpu_id = dev->gpu_id,
                .fixed_sysval_ubo = -1,
                .no_ubo_to_push = true,
        };
        struct pan_shader_info shader_info;
        struct util_dynarray binary;

        util_dynarray_init(&binary, NULL);
        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);

        assert(!shader_info.tls_size);
        assert(!shader_info.wls_size);
        assert(!shader_info.sysvals.sysval_count);

        shader_info.push.count =
                DIV_ROUND_UP(sizeof(struct indirect_draw_inputs), 4);

        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *draw_shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
                      (shader_id * pan_size(RENDERER_STATE));

        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (!draw_shader->rsd) {
                mali_ptr address =
                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
                                                binary.data, binary.size,
                                                PAN_ARCH >= 6 ? 128 : 64);

                util_dynarray_fini(&binary);

                pan_pack(state, RENDERER_STATE, cfg) {
                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
                }

                draw_shader->push = shader_info.push;
                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
                                   (shader_id * pan_size(RENDERER_STATE));
        }
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);

        ralloc_free(b->shader);
}
1163 
1164 static mali_ptr
get_renderer_state(struct panfrost_device * dev,unsigned flags,unsigned index_size,bool index_min_max_search)1165 get_renderer_state(struct panfrost_device *dev, unsigned flags,
1166                    unsigned index_size, bool index_min_max_search)
1167 {
1168         unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
1169         struct pan_indirect_draw_shader *info =
1170                 &dev->indirect_draw_shaders.shaders[shader_id];
1171 
1172         if (!info->rsd) {
1173                 create_indirect_draw_shader(dev, flags, index_size,
1174                                             index_min_max_search);
1175                 assert(info->rsd);
1176         }
1177 
1178         return info->rsd;
1179 }
1180 
1181 static mali_ptr
get_tls(const struct panfrost_device * dev)1182 get_tls(const struct panfrost_device *dev)
1183 {
1184         return dev->indirect_draw_shaders.states->ptr.gpu +
1185                (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
1186 }
1187 
1188 static void
panfrost_indirect_draw_alloc_deps(struct panfrost_device * dev)1189 panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
1190 {
1191         pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
1192         if (dev->indirect_draw_shaders.states)
1193                 goto out;
1194 
        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
                                  pan_size(RENDERER_STATE)) +
                                 pan_size(LOCAL_STORAGE);

        dev->indirect_draw_shaders.states =
                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");

        /* Prepare the thread storage descriptor now since it's invariant. */
        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
        pan_pack(tsd, LOCAL_STORAGE, ls) {
                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
        }

        /* FIXME: We currently allocate a 512MB growable heap, so only the
         * pages that are actually touched get backed by memory, but two
         * problems remain:
         * - allocation happens 2MB at a time, which might be more than we
         *   actually need
         * - the heap is attached to the device to speed up subsequent
         *   indirect draws, which also means it is never shrunk
         */
        dev->indirect_draw_shaders.varying_heap =
                panfrost_bo_create(dev, 512 * 1024 * 1024,
                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
                                   "Indirect draw varying heap");

out:
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
}

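/* Emit a compute job searching the index buffer for the minimum and maximum
 * indices used by the draw; the result is written to the min_max_context
 * consumed by the main patching job. Returns the scoreboard index of the
 * search job, or 0 for non-indexed draws.
 */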
static unsigned
panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                   struct pan_scoreboard *scoreboard,
                                   const struct pan_indirect_draw_info *draw_info,
                                   const struct indirect_draw_inputs *inputs,
                                   struct indirect_draw_context *draw_ctx)
{
        struct panfrost_device *dev = pool->dev;
        unsigned index_size = draw_info->index_size;

        if (!index_size)
                return 0;

        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, true);
        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
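
        /* Split the search across MIN_MAX_JOBS invocations, each of which is
         * expected to scan its slice of the index buffer and fold the result
         * into the shared min_max_context.
         */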
        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 7;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.push_uniforms =
                        pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16);
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, false, 0, 0, &job, false);
}

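/* Emit the compute job that reads the indirect draw buffer and patches the
 * vertex/tiler jobs passed in draw_info, plus an index min/max search job
 * for indexed draws. Returns the scoreboard index of the patching job.
 */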
unsigned
GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                                  struct pan_scoreboard *scoreboard,
                                  const struct pan_indirect_draw_info *draw_info,
                                  struct panfrost_ptr *ctx)
{
        struct panfrost_device *dev = pool->dev;

        /* Currently only tested on Bifrost, but the logic should be the same
         * on Midgard.
         */
        assert(pan_is_bifrost(dev));

        panfrost_indirect_draw_alloc_deps(dev);

        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, false);

        struct indirect_draw_context draw_ctx = {
                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
        };

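        /* The caller may pass a pre-allocated draw context so that several
         * indirect draws share the same varying memory bookkeeping; allocate
         * one on the fly otherwise.
         */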
        struct panfrost_ptr draw_ctx_ptr = *ctx;
        if (!draw_ctx_ptr.cpu) {
                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
                                                      sizeof(draw_ctx),
                                                      sizeof(mali_ptr));
        }

        struct indirect_draw_inputs inputs = {
                .draw_ctx = draw_ctx_ptr.gpu,
                .draw_buf = draw_info->draw_buf,
                .index_buf = draw_info->index_buf,
                .first_vertex_sysval = draw_info->first_vertex_sysval,
                .base_vertex_sysval = draw_info->base_vertex_sysval,
                .base_instance_sysval = draw_info->base_instance_sysval,
                .vertex_job = draw_info->vertex_job,
                .tiler_job = draw_info->tiler_job,
                .attrib_bufs = draw_info->attrib_bufs,
                .attribs = draw_info->attribs,
                .varying_bufs = draw_info->varying_bufs,
                .attrib_count = draw_info->attrib_count,
        };

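        /* For indexed draws, allocate the context the min/max search job
         * updates, seeded with an empty range (min > max) so the first index
         * scanned narrows it.
         */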
        if (draw_info->index_size) {
                inputs.restart_index = draw_info->restart_index;

                struct panfrost_ptr min_max_ctx_ptr =
                        pan_pool_alloc_aligned(pool,
                                               sizeof(struct min_max_context),
                                               4);
                struct min_max_context *min_max_ctx = min_max_ctx_ptr.cpu;

                min_max_ctx->min = UINT32_MAX;
                min_max_ctx->max = 0;
                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
        }

        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, 1, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 2;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.push_uniforms =
                        pan_pool_upload_aligned(pool, &inputs, sizeof(inputs), 16);
        }

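        /* The patching job depends on this draw's min/max search job
         * (local dep) and on the previous indirect draw's job (global dep).
         */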
        unsigned global_dep = draw_info->last_indirect_draw;
        unsigned local_dep =
                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
                                                   &inputs, &draw_ctx);

        if (!ctx->cpu) {
                *ctx = draw_ctx_ptr;
                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, true, local_dep, global_dep,
                                &job, false);
}

void
GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
                                          struct pan_pool *bin_pool)
{
        /* We allocate the states and varying_heap BO lazily to avoid
         * reserving memory when indirect draws are not used.
         */
        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
        dev->indirect_draw_shaders.bin_pool = bin_pool;
}
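
/* Sketch of the expected call sequence (only the three entry points are
 * real; pool/scoreboard/draw_info setup is elided and the other names are
 * assumed):
 *
 *    GENX(panfrost_init_indirect_draw_shaders)(dev, bin_pool);
 *
 *    struct panfrost_ptr draw_ctx = { 0 };
 *    unsigned dep =
 *            GENX(panfrost_emit_indirect_draw)(pool, scoreboard,
 *                                              &draw_info, &draw_ctx);
 *    (make the patched vertex/tiler jobs depend on "dep")
 *
 *    GENX(panfrost_cleanup_indirect_draw_shaders)(dev);
 */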

void
GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
{
        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
}