1 /*
2 * Copyright (C) 2021 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 *
23 */
24
25 #include <stdio.h>
26 #include "pan_bo.h"
27 #include "pan_shader.h"
28 #include "pan_scoreboard.h"
29 #include "pan_encoder.h"
30 #include "pan_indirect_draw.h"
31 #include "pan_pool.h"
32 #include "pan_util.h"
33 #include "compiler/nir/nir_builder.h"
34 #include "util/u_memory.h"
35 #include "util/macros.h"
36
37 #define WORD(x) ((x) * 4)
38
39 #define LOOP \
40 for (nir_loop *l = nir_push_loop(b); l != NULL; \
41 nir_pop_loop(b, l), l = NULL)
42 #define BREAK nir_jump(b, nir_jump_break)
43 #define CONTINUE nir_jump(b, nir_jump_continue)
44
45 #define IF(cond) nir_push_if(b, cond);
46 #define ELSE nir_push_else(b, NULL);
47 #define ENDIF nir_pop_if(b, NULL);
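/* Usage sketch (not driver code): these macros expect a local `nir_builder *b`
 * in scope and compose as
 *
 *     LOOP {
 *             IF (nir_uge(b, idx, count))
 *                     BREAK;
 *             ENDIF
 *             ...
 *     }
 *
 * which emits a NIR loop with a conditional break, the pattern used by
 * update_vertex_attribs() and get_index_min_max() below.
 */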
48
49 #define MIN_MAX_JOBS 128
50
51 struct draw_data {
52 nir_ssa_def *draw_buf;
53 nir_ssa_def *draw_buf_stride;
54 nir_ssa_def *index_buf;
55 nir_ssa_def *restart_index;
56 nir_ssa_def *vertex_count;
57 nir_ssa_def *start_instance;
58 nir_ssa_def *instance_count;
59 nir_ssa_def *vertex_start;
60 nir_ssa_def *index_bias;
61 nir_ssa_def *draw_ctx;
62 nir_ssa_def *min_max_ctx;
63 };
64
65 struct instance_size {
66 nir_ssa_def *raw;
67 nir_ssa_def *padded;
68 nir_ssa_def *packed;
69 };
70
71 struct jobs_data {
72 nir_ssa_def *vertex_job;
73 nir_ssa_def *tiler_job;
74 nir_ssa_def *base_vertex_offset;
75 nir_ssa_def *first_vertex_sysval;
76 nir_ssa_def *base_vertex_sysval;
77 nir_ssa_def *base_instance_sysval;
78 nir_ssa_def *offset_start;
79 nir_ssa_def *invocation;
80 };
81
82 struct varyings_data {
83 nir_ssa_def *varying_bufs;
84 nir_ssa_def *pos_ptr;
85 nir_ssa_def *psiz_ptr;
86 nir_variable *mem_ptr;
87 };
88
89 struct attribs_data {
90 nir_ssa_def *attrib_count;
91 nir_ssa_def *attrib_bufs;
92 nir_ssa_def *attribs;
93 };
94
95 struct indirect_draw_shader_builder {
96 nir_builder b;
97 const struct panfrost_device *dev;
98 unsigned flags;
99 bool index_min_max_search;
100 unsigned index_size;
101 struct draw_data draw;
102 struct instance_size instance_size;
103 struct jobs_data jobs;
104 struct varyings_data varyings;
105 struct attribs_data attribs;
106 };
107
108 /* Describes an indirect draw (see glDrawArraysIndirect()) */
109
110 struct indirect_draw_info {
111 uint32_t count;
112 uint32_t instance_count;
113 uint32_t start;
114 uint32_t start_instance;
115 };
116
117 struct indirect_indexed_draw_info {
118 uint32_t count;
119 uint32_t instance_count;
120 uint32_t start;
121 int32_t index_bias;
122 uint32_t start_instance;
123 };
124
125 /* Store the min/max index in a separate context. This is not supported yet, but
126 * the DDK seems to put all min/max search jobs at the beginning of the job chain
127 * when multiple indirect draws are issued to avoid the serialization caused by
128 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
129 * min/max and draw contexts would prevent such optimizations (draw contexts are
130 * shared by all indirect draws in a batch).
131 */
132
133 struct min_max_context {
134 uint32_t min;
135 uint32_t max;
136 };
137
138 /* Per-batch context shared by all indirect draws queued to a given batch. */
139
140 struct indirect_draw_context {
141 /* Pointer to the top of the varying heap. */
142 mali_ptr varying_mem;
143 };
144
145 /* Indirect draw shader inputs. These are stored in the FAU (push constants). */
146
147 struct indirect_draw_inputs {
148 /* indirect_draw_context pointer */
149 mali_ptr draw_ctx;
150
151 /* min_max_context pointer */
152 mali_ptr min_max_ctx;
153
154 /* Pointer to an array of indirect_draw_info objects */
155 mali_ptr draw_buf;
156
157 /* Pointer to a uint32_t containing the number of draws to issue */
158 mali_ptr draw_count_ptr;
159
160 /* index buffer */
161 mali_ptr index_buf;
162
163 /* {base,first}_{vertex,instance} sysvals */
164 mali_ptr first_vertex_sysval;
165 mali_ptr base_vertex_sysval;
166 mali_ptr base_instance_sysval;
167
168 /* Pointers to various cmdstream structs that need to be patched */
169 mali_ptr vertex_job;
170 mali_ptr tiler_job;
171 mali_ptr attrib_bufs;
172 mali_ptr attribs;
173 mali_ptr varying_bufs;
174 uint32_t draw_count;
175 uint32_t draw_buf_stride;
176 uint32_t restart_index;
177 uint32_t attrib_count;
178 } PACKED;
179
180 #define get_input_field(b, name) \
181 nir_load_push_constant(b, \
182 1, sizeof(((struct indirect_draw_inputs *)0)->name) * 8, \
183 nir_imm_int(b, 0), \
184 .base = offsetof(struct indirect_draw_inputs, name))
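/* Example expansion: get_input_field(b, draw_buf_stride) becomes
 *
 *     nir_load_push_constant(b, 1, 32, nir_imm_int(b, 0),
 *                            .base = offsetof(struct indirect_draw_inputs,
 *                                             draw_buf_stride));
 *
 * i.e. a single-component load whose bit size is derived from the field type
 * and whose base is the field offset within the FAU-resident inputs.
 */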
185
186 static nir_ssa_def *
187 get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
188 {
189 return nir_iadd(b, base, nir_u2u64(b, offset));
190 }
191
192 static nir_ssa_def *
193 get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
194 {
195 return get_address(b, base, nir_imm_int(b, offset));
196 }
197
198 static nir_ssa_def *
199 load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
200 {
201 return nir_load_global(b, addr, 4, ncomps, bit_size);
202 }
203
204 static void
205 store_global(nir_builder *b, nir_ssa_def *addr,
206 nir_ssa_def *value, unsigned ncomps)
207 {
208 nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
209 }
210
211 static nir_ssa_def *
212 get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
213 unsigned offset, unsigned size)
214 {
215 nir_builder *b = &builder->b;
216 return load_global(b,
217 get_address_imm(b, builder->draw.draw_ctx, offset),
218 1, size);
219 }
220
221 static void
222 set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
223 unsigned offset, nir_ssa_def *value, unsigned size)
224 {
225 nir_builder *b = &builder->b;
226 store_global(b,
227 get_address_imm(b, builder->draw.draw_ctx, offset),
228 value, 1);
229 }
230
231 #define get_draw_ctx_field(builder, name) \
232 get_draw_ctx_data(builder, \
233 offsetof(struct indirect_draw_context, name), \
234 sizeof(((struct indirect_draw_context *)0)->name) * 8)
235
236 #define set_draw_ctx_field(builder, name, val) \
237 set_draw_ctx_data(builder, \
238 offsetof(struct indirect_draw_context, name), \
239 val, \
240 sizeof(((struct indirect_draw_context *)0)->name) * 8)
241
242 static nir_ssa_def *
243 get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
244 unsigned offset, unsigned size)
245 {
246 nir_builder *b = &builder->b;
247 return load_global(b,
248 get_address_imm(b, builder->draw.min_max_ctx, offset),
249 1, size);
250 }
251
252 #define get_min_max_ctx_field(builder, name) \
253 get_min_max_ctx_data(builder, \
254 offsetof(struct min_max_context, name), \
255 sizeof(((struct min_max_context *)0)->name) * 8)
256
257 static void
258 update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
259 {
260 nir_builder *b = &builder->b;
261 nir_ssa_def *addr =
262 get_address_imm(b,
263 builder->draw.min_max_ctx,
264 offsetof(struct min_max_context, min));
265 nir_global_atomic_umin(b, 32, addr, val);
266 }
267
268 static void
269 update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
270 {
271 nir_builder *b = &builder->b;
272 nir_ssa_def *addr =
273 get_address_imm(b,
274 builder->draw.min_max_ctx,
275 offsetof(struct min_max_context, max));
276 nir_global_atomic_umax(b, 32, addr, val);
277 }
278
279 #define get_draw_field(b, draw_ptr, field) \
280 load_global(b, \
281 get_address_imm(b, draw_ptr, \
282 offsetof(struct indirect_draw_info, field)), \
283 1, sizeof(((struct indirect_draw_info *)0)->field) * 8)
284
285 #define get_indexed_draw_field(b, draw_ptr, field) \
286 load_global(b, \
287 get_address_imm(b, draw_ptr, \
288 offsetof(struct indirect_indexed_draw_info, field)), \
289 1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)
290
291 static void
292 extract_inputs(struct indirect_draw_shader_builder *builder)
293 {
294 nir_builder *b = &builder->b;
295
296 builder->draw.draw_ctx = get_input_field(b, draw_ctx);
297 builder->draw.draw_buf = get_input_field(b, draw_buf);
298 builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);
299
300 if (builder->index_size) {
301 builder->draw.index_buf = get_input_field(b, index_buf);
302 builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
303 if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
304 builder->draw.restart_index =
305 get_input_field(b, restart_index);
306 }
307 }
308
309 if (builder->index_min_max_search)
310 return;
311
312 builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
313 builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
314 builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
315 builder->jobs.vertex_job = get_input_field(b, vertex_job);
316 builder->jobs.tiler_job = get_input_field(b, tiler_job);
317 builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
318 builder->attribs.attribs = get_input_field(b, attribs);
319 builder->attribs.attrib_count = get_input_field(b, attrib_count);
320 builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
321 builder->varyings.mem_ptr =
322 nir_local_variable_create(b->impl,
323 glsl_uint64_t_type(),
324 "var_mem_ptr");
325 nir_store_var(b, builder->varyings.mem_ptr,
326 get_draw_ctx_field(builder, varying_mem), 3);
327 }
328
329 static void
330 init_shader_builder(struct indirect_draw_shader_builder *builder,
331 const struct panfrost_device *dev,
332 unsigned flags, unsigned index_size,
333 bool index_min_max_search)
334 {
335 memset(builder, 0, sizeof(*builder));
336 builder->dev = dev;
337 builder->flags = flags;
338 builder->index_size = index_size;
339
340 builder->index_min_max_search = index_min_max_search;
341
342 if (index_min_max_search) {
343 builder->b =
344 nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
345 GENX(pan_shader_get_compiler_options)(),
346 "indirect_draw_min_max_index(index_size=%d)",
347 builder->index_size);
348 } else {
349 builder->b =
350 nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
351 GENX(pan_shader_get_compiler_options)(),
352 "indirect_draw(index_size=%d%s%s%s%s)",
353 builder->index_size,
354 flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
355 ",psiz" : "",
356 flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
357 ",primitive_restart" : "",
358 flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
359 ",update_primitive_size" : "",
360 flags & PAN_INDIRECT_DRAW_IDVS ?
361 ",idvs" : "");
362 }
363
364 extract_inputs(builder);
365 }
366
367 static void
368 update_dcd(struct indirect_draw_shader_builder *builder,
369 nir_ssa_def *job_ptr,
370 unsigned draw_offset)
371 {
372 nir_builder *b = &builder->b;
373 nir_ssa_def *draw_w01 =
374 load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
375 nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);
376
377 /* Update DRAW.{instance_size,offset_start} */
378 nir_ssa_def *instance_size =
379 nir_bcsel(b,
380 nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
381 nir_imm_int(b, 0), builder->instance_size.packed);
382 draw_w01 = nir_vec2(b,
383 nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
384 nir_ishl(b, instance_size, nir_imm_int(b, 16))),
385 builder->jobs.offset_start);
386 store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
387 draw_w01, 2);
388 }
389
390 static void
391 update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
392 {
393 nir_builder *b = &builder->b;
394 nir_ssa_def *job_ptr =
395 type == MALI_JOB_TYPE_VERTEX ?
396 builder->jobs.vertex_job : builder->jobs.tiler_job;
397
398 /* Update the invocation words. */
399 store_global(b, get_address_imm(b, job_ptr, WORD(8)),
400 builder->jobs.invocation, 2);
401
402 unsigned draw_offset =
403 type == MALI_JOB_TYPE_VERTEX ?
404 pan_section_offset(COMPUTE_JOB, DRAW) :
405 pan_section_offset(TILER_JOB, DRAW);
406 unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
407 unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
408 unsigned index_size = builder->index_size;
409
410 if (type == MALI_JOB_TYPE_TILER) {
411 /* Update PRIMITIVE.{base_vertex_offset,count} */
412 store_global(b,
413 get_address_imm(b, job_ptr, prim_offset + WORD(1)),
414 builder->jobs.base_vertex_offset, 1);
415 store_global(b,
416 get_address_imm(b, job_ptr, prim_offset + WORD(3)),
417 nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);
418
419 if (index_size) {
420 nir_ssa_def *addr =
421 get_address_imm(b, job_ptr, prim_offset + WORD(4));
422 nir_ssa_def *indices = load_global(b, addr, 1, 64);
423 nir_ssa_def *offset =
424 nir_imul_imm(b, builder->draw.vertex_start, index_size);
425
426 indices = get_address(b, indices, offset);
427 store_global(b, addr, indices, 2);
428 }
429
430 /* Update PRIMITIVE_SIZE.size_array */
431 if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
432 (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
433 store_global(b,
434 get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
435 builder->varyings.psiz_ptr, 2);
436 }
437
438 /* Update DRAW.position */
439 store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
440 builder->varyings.pos_ptr, 2);
441 }
442
443 update_dcd(builder, job_ptr, draw_offset);
444
445 if (builder->flags & PAN_INDIRECT_DRAW_IDVS) {
446 assert(type == MALI_JOB_TYPE_TILER);
447
448 update_dcd(builder, job_ptr,
449 pan_section_offset(INDEXED_VERTEX_JOB, VERTEX_DRAW));
450 }
451 }
452
453 static void
454 split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
455 {
456 /* TODO: Lower this 64-bit div to something GPU-friendly */
457 nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
458 nir_ssa_def *div64 = nir_u2u64(b, div);
459 nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
460 nir_ssa_def *f0 = nir_iadd(b,
461 nir_ishl(b, nir_imm_int64(b, 1),
462 nir_iadd_imm(b, r, 32)),
463 half_div64);
464 nir_ssa_def *fi = nir_idiv(b, f0, div64);
465 nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
466 nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
467 nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
468 *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
469 *r_e = nir_ior(b, r, e);
470 }
471
472 static void
473 update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
474 nir_ssa_def *attrib_buf_ptr,
475 enum mali_attribute_type type,
476 nir_ssa_def *div1,
477 nir_ssa_def *div2)
478 {
479 nir_builder *b = &builder->b;
480 unsigned type_mask = BITFIELD_MASK(6);
481 nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
482 nir_ssa_def *w0 = nir_channel(b, w01, 0);
483 nir_ssa_def *w1 = nir_channel(b, w01, 1);
484
485 /* Words 0 and 1 of the attribute buffer descriptor contain the type,
486 * the pointer and the divisor exponent.
487 */
488 w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
489 w0 = nir_ior(b, w0, nir_imm_int(b, type));
490 w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));
491
492 store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);
493
494 if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
495 /* If the divisor is not a power of two, the divisor numerator
496 * is passed in word 1 of the continuation attribute (word 5
497 * if we consider the attribute and its continuation as a
498 * single attribute).
499 */
500 assert(div2);
501 store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
502 div2, 1);
503 }
504 }
505
506 static void
507 zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
508 nir_ssa_def *attrib_buf_ptr)
509 {
510 /* Stride is an unadorned 32-bit uint at word 2 */
511 nir_builder *b = &builder->b;
512 store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
513 nir_imm_int(b, 0), 1);
514 }
515
516 static void
517 adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
518 nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
519 nir_ssa_def *instance_div)
520 {
521 nir_builder *b = &builder->b;
522 nir_ssa_def *zero = nir_imm_int(b, 0);
523 nir_ssa_def *two = nir_imm_int(b, 2);
524 nir_ssa_def *sub_cur_offset =
525 nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
526 nir_uge(b, builder->draw.instance_count, two));
527
528 nir_ssa_def *add_base_inst_offset =
529 nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
530 nir_ine(b, instance_div, zero));
531
532 IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
533 nir_ssa_def *offset =
534 load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
535 nir_ssa_def *stride =
536 load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);
537
538 /* Per-instance data needs to be offset in response to a
539 * delayed start in an indexed draw.
540 */
541
542 IF (add_base_inst_offset) {
543 offset = nir_iadd(b, offset,
544 nir_idiv(b,
545 nir_imul(b, stride,
546 builder->draw.start_instance),
547 instance_div));
548 } ENDIF
549
550 IF (sub_cur_offset) {
551 offset = nir_isub(b, offset,
552 nir_imul(b, stride,
553 builder->jobs.offset_start));
554 } ENDIF
555
556 store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
557 offset, 1);
558 } ENDIF
559 }
560
561 /* x is a power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */
562
563 static nir_ssa_def *
564 nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
565 {
566 return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
567 }
568
569 /* Based on panfrost_emit_vertex_data() */
570
571 static void
572 update_vertex_attribs(struct indirect_draw_shader_builder *builder)
573 {
574 nir_builder *b = &builder->b;
575 nir_variable *attrib_idx_var =
576 nir_local_variable_create(b->impl, glsl_uint_type(),
577 "attrib_idx");
578 nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);
579
580 #if PAN_ARCH <= 5
581 nir_ssa_def *single_instance =
582 nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
583 #endif
584
585 LOOP {
586 nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
587 IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
588 BREAK;
589 ENDIF
590
591 nir_ssa_def *attrib_buf_ptr =
592 get_address(b, builder->attribs.attrib_bufs,
593 nir_imul_imm(b, attrib_idx,
594 2 * pan_size(ATTRIBUTE_BUFFER)));
595 nir_ssa_def *attrib_ptr =
596 get_address(b, builder->attribs.attribs,
597 nir_imul_imm(b, attrib_idx,
598 pan_size(ATTRIBUTE)));
599
600 nir_ssa_def *r_e, *d;
601
602 #if PAN_ARCH <= 5
603 IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
604 nir_ssa_def *r_p =
605 nir_bcsel(b, single_instance,
606 nir_imm_int(b, 0x9f),
607 builder->instance_size.packed);
608
609 store_global(b,
610 get_address_imm(b, attrib_buf_ptr, WORD(4)),
611 nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);
612
613 nir_store_var(b, attrib_idx_var,
614 nir_iadd_imm(b, attrib_idx, 1), 1);
615 CONTINUE;
616 } ENDIF
617
618 IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
619 split_div(b, builder->instance_size.padded,
620 &r_e, &d);
621 nir_ssa_def *default_div =
622 nir_ior(b, single_instance,
623 nir_ult(b,
624 builder->instance_size.padded,
625 nir_imm_int(b, 2)));
626 r_e = nir_bcsel(b, default_div,
627 nir_imm_int(b, 0x3f), r_e);
628 d = nir_bcsel(b, default_div,
629 nir_imm_int(b, (1u << 31) - 1), d);
630 store_global(b,
631 get_address_imm(b, attrib_buf_ptr, WORD(1)),
632 nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
633 2);
634 nir_store_var(b, attrib_idx_var,
635 nir_iadd_imm(b, attrib_idx, 1), 1);
636 CONTINUE;
637 } ENDIF
638 #endif
639
640 nir_ssa_def *instance_div =
641 load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);
642
643 nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);
644
645 nir_ssa_def *multi_instance =
646 nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));
647
648 IF (nir_ine(b, div, nir_imm_int(b, 0))) {
649 IF (multi_instance) {
650 IF (nir_is_power_of_two_or_zero(b, div)) {
651 nir_ssa_def *exp =
652 nir_imax(b, nir_ufind_msb(b, div),
653 nir_imm_int(b, 0));
654 update_vertex_attrib_buf(builder, attrib_buf_ptr,
655 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
656 exp, NULL);
657 } ELSE {
658 split_div(b, div, &r_e, &d);
659 update_vertex_attrib_buf(builder, attrib_buf_ptr,
660 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
661 r_e, d);
662 } ENDIF
663 } ELSE {
664 /* Single instance with a non-0 divisor: all
665 * accesses should point to attribute 0 */
666 zero_attrib_buf_stride(builder, attrib_buf_ptr);
667 } ENDIF
668
669 adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
670 } ELSE IF (multi_instance) {
671 update_vertex_attrib_buf(builder, attrib_buf_ptr,
672 MALI_ATTRIBUTE_TYPE_1D_MODULUS,
673 builder->instance_size.packed, NULL);
674 } ENDIF ENDIF
675
676 nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
677 }
678 }
679
680 static nir_ssa_def *
681 update_varying_buf(struct indirect_draw_shader_builder *builder,
682 nir_ssa_def *varying_buf_ptr,
683 nir_ssa_def *vertex_count)
684 {
685 nir_builder *b = &builder->b;
686
687 nir_ssa_def *stride =
688 load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
689 nir_ssa_def *size = nir_imul(b, stride, vertex_count);
690 nir_ssa_def *aligned_size =
691 nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
692 nir_ssa_def *var_mem_ptr =
693 nir_load_var(b, builder->varyings.mem_ptr);
694 nir_ssa_def *w0 =
695 nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
696 nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
697 nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
698 store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
699 nir_vec4(b, w0, w1, stride, size), 4);
700
701 nir_store_var(b, builder->varyings.mem_ptr,
702 get_address(b, var_mem_ptr, aligned_size), 3);
703
704 return var_mem_ptr;
705 }
706
707 /* Based on panfrost_emit_varying_descriptor() */
708
709 static void
710 update_varyings(struct indirect_draw_shader_builder *builder)
711 {
712 nir_builder *b = &builder->b;
713 nir_ssa_def *vertex_count =
714 nir_imul(b, builder->instance_size.padded,
715 builder->draw.instance_count);
716 nir_ssa_def *buf_ptr =
717 get_address_imm(b, builder->varyings.varying_bufs,
718 PAN_VARY_GENERAL *
719 pan_size(ATTRIBUTE_BUFFER));
720 update_varying_buf(builder, buf_ptr, vertex_count);
721
722 buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
723 PAN_VARY_POSITION *
724 pan_size(ATTRIBUTE_BUFFER));
725 builder->varyings.pos_ptr =
726 update_varying_buf(builder, buf_ptr, vertex_count);
727
728 if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
729 buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
730 PAN_VARY_PSIZ *
731 pan_size(ATTRIBUTE_BUFFER));
732 builder->varyings.psiz_ptr =
733 update_varying_buf(builder, buf_ptr, vertex_count);
734 }
735
736 set_draw_ctx_field(builder, varying_mem,
737 nir_load_var(b, builder->varyings.mem_ptr));
738 }
739
740 /* Based on panfrost_pack_work_groups_compute() */
741
742 static void
743 get_invocation(struct indirect_draw_shader_builder *builder)
744 {
745 nir_builder *b = &builder->b;
746 nir_ssa_def *one = nir_imm_int(b, 1);
747 nir_ssa_def *max_vertex =
748 nir_usub_sat(b, builder->instance_size.raw, one);
749 nir_ssa_def *max_instance =
750 nir_usub_sat(b, builder->draw.instance_count, one);
751 nir_ssa_def *split =
752 nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
753 nir_imm_int(b, 32),
754 nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));
755
756 builder->jobs.invocation =
757 nir_vec2(b,
758 nir_ior(b, max_vertex,
759 nir_ishl(b, max_instance, split)),
760 nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
761 nir_imm_int(b, 2 << 28)));
762 }
763
764 static nir_ssa_def *
765 nir_align_pot(nir_builder *b, nir_ssa_def *val, unsigned pot)
766 {
767 assert(pot != 0 && util_is_power_of_two_or_zero(pot));
768
769 return nir_iand_imm(b, nir_iadd_imm(b, val, pot - 1), ~(pot - 1));
770 }
771
772 /* Based on panfrost_padded_vertex_count() */
773
774 static nir_ssa_def *
775 get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
776 {
777 nir_ssa_def *one = nir_imm_int(b, 1);
778 nir_ssa_def *zero = nir_imm_int(b, 0);
779 nir_ssa_def *eleven = nir_imm_int(b, 11);
780 nir_ssa_def *four = nir_imm_int(b, 4);
781
782 nir_ssa_def *exp =
783 nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
784 nir_ssa_def *base = nir_ushr(b, val, exp);
785
786 base = nir_iadd(b, base,
787 nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));
788
789 nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
790 exp = nir_iadd(b, exp, rshift);
791 base = nir_ushr(b, base, rshift);
792 base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
793 rshift = nir_imax(b, nir_find_lsb(b, base), zero);
794 exp = nir_iadd(b, exp, rshift);
795 base = nir_ushr(b, base, rshift);
796
797 *packed = nir_ior(b, exp,
798 nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
799 return nir_ishl(b, base, exp);
800 }
801
802 static void
803 update_jobs(struct indirect_draw_shader_builder *builder)
804 {
805 get_invocation(builder);
806
807 if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
808 update_job(builder, MALI_JOB_TYPE_VERTEX);
809
810 update_job(builder, MALI_JOB_TYPE_TILER);
811 }
812
813
814 static void
815 set_null_job(struct indirect_draw_shader_builder *builder,
816 nir_ssa_def *job_ptr)
817 {
818 nir_builder *b = &builder->b;
819 nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
820 nir_ssa_def *val = load_global(b, w4, 1, 32);
821
822 /* Set job type to NULL (AKA NOOP) */
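/* Bits 1-7 of word 4 hold the job type; the 0xffffff01 mask preserves
 * everything else. */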
823 val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
824 nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
825 store_global(b, w4, val, 1);
826 }
827
828 static void
829 get_instance_size(struct indirect_draw_shader_builder *builder)
830 {
831 nir_builder *b = &builder->b;
832
833 if (!builder->index_size) {
834 builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
835 builder->jobs.offset_start = builder->draw.vertex_start;
836 builder->instance_size.raw = builder->draw.vertex_count;
837 return;
838 }
839
840 unsigned index_size = builder->index_size;
841 nir_ssa_def *min = get_min_max_ctx_field(builder, min);
842 nir_ssa_def *max = get_min_max_ctx_field(builder, max);
843
844 /* We handle unaligned indices here to avoid the extra complexity in
845 * the min/max search job.
846 */
847 if (builder->index_size < 4) {
848 nir_variable *min_var =
849 nir_local_variable_create(b->impl, glsl_uint_type(), "min");
850 nir_store_var(b, min_var, min, 1);
851 nir_variable *max_var =
852 nir_local_variable_create(b->impl, glsl_uint_type(), "max");
853 nir_store_var(b, max_var, max, 1);
854
855 nir_ssa_def *base =
856 get_address(b, builder->draw.index_buf,
857 nir_imul_imm(b, builder->draw.vertex_start, index_size));
858 nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
859 nir_ssa_def *end =
860 nir_iadd(b, offset,
861 nir_imul_imm(b, builder->draw.vertex_count, index_size));
862 nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
863 unsigned shift = index_size * 8;
864 unsigned mask = (1 << shift) - 1;
865
866 base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));
867
868 /* If the start offset is unaligned, we need to ignore any data that
869 * falls outside the requested range. We also handle ranges covering
870 * less than two words here.
871 */
872 IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
873 min = nir_load_var(b, min_var);
874 max = nir_load_var(b, max_var);
875
876 nir_ssa_def *val = load_global(b, base, 1, 32);
877 for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
878 nir_ssa_def *oob =
879 nir_ior(b,
880 nir_ult(b, nir_imm_int(b, i), offset),
881 nir_uge(b, nir_imm_int(b, i), end));
882 nir_ssa_def *data = nir_iand_imm(b, val, mask);
883
884 min = nir_umin(b, min,
885 nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
886 max = nir_umax(b, max,
887 nir_bcsel(b, oob, nir_imm_int(b, 0), data));
888 val = nir_ushr_imm(b, val, shift);
889 }
890
891 nir_store_var(b, min_var, min, 1);
892 nir_store_var(b, max_var, max, 1);
893 } ENDIF
894
895 nir_ssa_def *remaining = nir_isub(b, end, aligned_end);
896
897 /* The last word contains less than 4 bytes of data; we need to
898 * discard anything falling outside the requested range.
899 */
900 IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
901 min = nir_load_var(b, min_var);
902 max = nir_load_var(b, max_var);
903
904 nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
905 for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
906 nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
907 nir_ssa_def *data = nir_iand_imm(b, val, mask);
908
909 min = nir_umin(b, min,
910 nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
911 max = nir_umax(b, max,
912 nir_bcsel(b, oob, nir_imm_int(b, 0), data));
913 val = nir_ushr_imm(b, val, shift);
914 }
915
916 nir_store_var(b, min_var, min, 1);
917 nir_store_var(b, max_var, max, 1);
918 } ENDIF
919
920 min = nir_load_var(b, min_var);
921 max = nir_load_var(b, max_var);
922 }
923
924 builder->jobs.base_vertex_offset = nir_ineg(b, min);
925 builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
926 builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
927 }
928
929 /* Patch a draw sequence */
930
931 static void
932 patch(struct indirect_draw_shader_builder *builder)
933 {
934 unsigned index_size = builder->index_size;
935 nir_builder *b = &builder->b;
936
937 nir_ssa_def *draw_ptr = builder->draw.draw_buf;
938
939 if (index_size) {
940 builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
941 builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
942 builder->draw.instance_count =
943 get_indexed_draw_field(b, draw_ptr, instance_count);
944 builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
945 builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
946 } else {
947 builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
948 builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
949 builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
950 builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
951 }
952
953 assert(builder->draw.vertex_count->num_components);
954
955 nir_ssa_def *num_vertices =
956 nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);
957
958 IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
959 /* If there's nothing to draw, turn the vertex/tiler jobs into
960 * null jobs.
961 */
962 if (!(builder->flags & PAN_INDIRECT_DRAW_IDVS))
963 set_null_job(builder, builder->jobs.vertex_job);
964
965 set_null_job(builder, builder->jobs.tiler_job);
966 } ELSE {
967 get_instance_size(builder);
968
969 nir_ssa_def *count = builder->instance_size.raw;
970
971 /* IDVS requires padding to a multiple of 4 */
972 if (builder->flags & PAN_INDIRECT_DRAW_IDVS)
973 count = nir_align_pot(b, count, 4);
974
975 builder->instance_size.padded =
976 get_padded_count(b, count,
977 &builder->instance_size.packed);
978
979 update_varyings(builder);
980 update_jobs(builder);
981 update_vertex_attribs(builder);
982
983 IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
984 store_global(b, builder->jobs.first_vertex_sysval,
985 builder->jobs.offset_start, 1);
986 } ENDIF
987
988 IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
989 store_global(b, builder->jobs.base_vertex_sysval,
990 index_size ?
991 builder->draw.index_bias :
992 nir_imm_int(b, 0),
993 1);
994 } ENDIF
995
996 IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
997 store_global(b, builder->jobs.base_instance_sysval,
998 builder->draw.start_instance, 1);
999 } ENDIF
1000 } ENDIF
1001 }
1002
1003 /* Search the min/max index in the range covered by the indirect draw call */
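/* The search is split across MIN_MAX_JOBS invocations: each one starts at its
 * own 32-bit word (thread_id words past the aligned start) and strides by
 * MIN_MAX_JOBS words, then folds its local min/max into the shared
 * min_max_context with global atomics.
 */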
1004
1005 static void
1006 get_index_min_max(struct indirect_draw_shader_builder *builder)
1007 {
1008 nir_ssa_def *restart_index = builder->draw.restart_index;
1009 unsigned index_size = builder->index_size;
1010 nir_builder *b = &builder->b;
1011
1012 nir_ssa_def *draw_ptr = builder->draw.draw_buf;
1013
1014 builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
1015 builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
1016
1017 nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
1018 nir_variable *min_var =
1019 nir_local_variable_create(b->impl, glsl_uint_type(), "min");
1020 nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
1021 nir_variable *max_var =
1022 nir_local_variable_create(b->impl, glsl_uint_type(), "max");
1023 nir_store_var(b, max_var, nir_imm_int(b, 0), 1);
1024
1025 nir_ssa_def *base =
1026 get_address(b, builder->draw.index_buf,
1027 nir_imul_imm(b, builder->draw.vertex_start, index_size));
1028
1029
1030 nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
1031 nir_ssa_def *end =
1032 nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));
1033
1034 base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));
1035
1036 /* Align on 4 bytes; non-aligned indices are handled in the indirect draw job. */
1037 start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
1038 end = nir_iand_imm(b, end, ~3);
1039
1040 /* Add the job offset. */
1041 start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));
1042
1043 nir_variable *offset_var =
1044 nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
1045 nir_store_var(b, offset_var, start, 1);
1046
1047 LOOP {
1048 nir_ssa_def *offset = nir_load_var(b, offset_var);
1049 IF (nir_uge(b, offset, end))
1050 BREAK;
1051 ENDIF
1052
1053 nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
1054 nir_ssa_def *old_min = nir_load_var(b, min_var);
1055 nir_ssa_def *old_max = nir_load_var(b, max_var);
1056 nir_ssa_def *new_min;
1057 nir_ssa_def *new_max;
1058
1059 /* TODO: use 8/16 bit arithmetic when index_size < 4. */
1060 for (unsigned i = 0; i < 4; i += index_size) {
1061 nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
1062 data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
1063 new_min = nir_umin(b, old_min, data);
1064 new_max = nir_umax(b, old_max, data);
1065 if (restart_index) {
1066 new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
1067 new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
1068 }
1069 old_min = new_min;
1070 old_max = new_max;
1071 }
1072
1073 nir_store_var(b, min_var, new_min, 1);
1074 nir_store_var(b, max_var, new_max, 1);
1075 nir_store_var(b, offset_var,
1076 nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
1077 }
1078
1079 IF (nir_ult(b, start, end))
1080 update_min(builder, nir_load_var(b, min_var));
1081 update_max(builder, nir_load_var(b, max_var));
1082 ENDIF
1083 }
1084
1085 static unsigned
1086 get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
1087 {
1088 if (!index_min_max_search) {
1089 flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
1090 flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
1091 if (index_size)
1092 flags |= (util_logbase2(index_size) + 1);
1093 return flags;
1094 }
1095
1096 return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
1097 PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
1098 PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
1099 util_logbase2(index_size);
1100 }
1101
1102 static void
1103 create_indirect_draw_shader(struct panfrost_device *dev,
1104 unsigned flags, unsigned index_size,
1105 bool index_min_max_search)
1106 {
1107 assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
1108 struct indirect_draw_shader_builder builder;
1109 init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);
1110
1111 nir_builder *b = &builder.b;
1112
1113 if (index_min_max_search)
1114 get_index_min_max(&builder);
1115 else
1116 patch(&builder);
1117
1118 struct panfrost_compile_inputs inputs = {
1119 .gpu_id = dev->gpu_id,
1120 .fixed_sysval_ubo = -1,
1121 .no_ubo_to_push = true,
1122 };
1123 struct pan_shader_info shader_info;
1124 struct util_dynarray binary;
1125
1126 util_dynarray_init(&binary, NULL);
1127 GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);
1128
1129 assert(!shader_info.tls_size);
1130 assert(!shader_info.wls_size);
1131 assert(!shader_info.sysvals.sysval_count);
1132
1133 shader_info.push.count =
1134 DIV_ROUND_UP(sizeof(struct indirect_draw_inputs), 4);
1135
1136 unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
1137 struct pan_indirect_draw_shader *draw_shader =
1138 &dev->indirect_draw_shaders.shaders[shader_id];
1139 void *state = dev->indirect_draw_shaders.states->ptr.cpu +
1140 (shader_id * pan_size(RENDERER_STATE));
1141
1142 pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
1143 if (!draw_shader->rsd) {
1144 mali_ptr address =
1145 pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
1146 binary.data, binary.size,
1147 PAN_ARCH >= 6 ? 128 : 64);
1148
1149 util_dynarray_fini(&binary);
1150
1151 pan_pack(state, RENDERER_STATE, cfg) {
1152 pan_shader_prepare_rsd(&shader_info, address, &cfg);
1153 }
1154
1155 draw_shader->push = shader_info.push;
1156 draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
1157 (shader_id * pan_size(RENDERER_STATE));
1158 }
1159 pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
1160
1161 ralloc_free(b->shader);
1162 }
1163
1164 static mali_ptr
1165 get_renderer_state(struct panfrost_device *dev, unsigned flags,
1166 unsigned index_size, bool index_min_max_search)
1167 {
1168 unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
1169 struct pan_indirect_draw_shader *info =
1170 &dev->indirect_draw_shaders.shaders[shader_id];
1171
1172 if (!info->rsd) {
1173 create_indirect_draw_shader(dev, flags, index_size,
1174 index_min_max_search);
1175 assert(info->rsd);
1176 }
1177
1178 return info->rsd;
1179 }
1180
1181 static mali_ptr
1182 get_tls(const struct panfrost_device *dev)
1183 {
1184 return dev->indirect_draw_shaders.states->ptr.gpu +
1185 (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
1186 }
1187
1188 static void
1189 panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
1190 {
1191 pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
1192 if (dev->indirect_draw_shaders.states)
1193 goto out;
1194
1195 unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
1196 pan_size(RENDERER_STATE)) +
1197 pan_size(LOCAL_STORAGE);
1198
1199 dev->indirect_draw_shaders.states =
1200 panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");
1201
1202 /* Prepare the thread storage descriptor now since it's invariant. */
1203 void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
1204 (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
1205 pan_pack(tsd, LOCAL_STORAGE, ls) {
1206 ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
1207 };
1208
1209 /* FIXME: We currently allocate 512M of growable memory, meaning that only
1210 * the pages we actually touch get allocated. The problems are:
1211 * - allocation happens 2M at a time, which might be more than we
1212 * actually need
1213 * - the memory is attached to the device to speed up subsequent
1214 * indirect draws, but that also means it's never shrunk
1215 */
1216 dev->indirect_draw_shaders.varying_heap =
1217 panfrost_bo_create(dev, 512 * 1024 * 1024,
1218 PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
1219 "Indirect draw varying heap");
1220
1221 out:
1222 pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
1223 }
1224
1225 static unsigned
1226 panfrost_emit_index_min_max_search(struct pan_pool *pool,
1227 struct pan_scoreboard *scoreboard,
1228 const struct pan_indirect_draw_info *draw_info,
1229 const struct indirect_draw_inputs *inputs,
1230 struct indirect_draw_context *draw_ctx)
1231 {
1232 struct panfrost_device *dev = pool->dev;
1233 unsigned index_size = draw_info->index_size;
1234
1235 if (!index_size)
1236 return 0;
1237
1238 mali_ptr rsd =
1239 get_renderer_state(dev, draw_info->flags,
1240 draw_info->index_size, true);
1241 struct panfrost_ptr job =
1242 pan_pool_alloc_desc(pool, COMPUTE_JOB);
1243 void *invocation =
1244 pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
1245 panfrost_pack_work_groups_compute(invocation,
1246 1, 1, 1, MIN_MAX_JOBS, 1, 1,
1247 false, false);
1248
1249 pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
1250 cfg.job_task_split = 7;
1251 }
1252
1253 pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
1254 cfg.state = rsd;
1255 cfg.thread_storage = get_tls(pool->dev);
1256 cfg.push_uniforms =
1257 pan_pool_upload_aligned(pool, inputs, sizeof(*inputs), 16);
1258 }
1259
1260 return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
1261 false, false, 0, 0, &job, false);
1262 }
1263
1264 unsigned
1265 GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
1266 struct pan_scoreboard *scoreboard,
1267 const struct pan_indirect_draw_info *draw_info,
1268 struct panfrost_ptr *ctx)
1269 {
1270 struct panfrost_device *dev = pool->dev;
1271
1272 /* Currently only tested on Bifrost, but the logic should be the same
1273 * on Midgard.
1274 */
1275 assert(pan_is_bifrost(dev));
1276
1277 panfrost_indirect_draw_alloc_deps(dev);
1278
1279 struct panfrost_ptr job =
1280 pan_pool_alloc_desc(pool, COMPUTE_JOB);
1281 mali_ptr rsd =
1282 get_renderer_state(dev, draw_info->flags,
1283 draw_info->index_size, false);
1284
1285 struct indirect_draw_context draw_ctx = {
1286 .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
1287 };
1288
1289 struct panfrost_ptr draw_ctx_ptr = *ctx;
1290 if (!draw_ctx_ptr.cpu) {
1291 draw_ctx_ptr = pan_pool_alloc_aligned(pool,
1292 sizeof(draw_ctx),
1293 sizeof(mali_ptr));
1294 }
1295
1296 struct indirect_draw_inputs inputs = {
1297 .draw_ctx = draw_ctx_ptr.gpu,
1298 .draw_buf = draw_info->draw_buf,
1299 .index_buf = draw_info->index_buf,
1300 .first_vertex_sysval = draw_info->first_vertex_sysval,
1301 .base_vertex_sysval = draw_info->base_vertex_sysval,
1302 .base_instance_sysval = draw_info->base_instance_sysval,
1303 .vertex_job = draw_info->vertex_job,
1304 .tiler_job = draw_info->tiler_job,
1305 .attrib_bufs = draw_info->attrib_bufs,
1306 .attribs = draw_info->attribs,
1307 .varying_bufs = draw_info->varying_bufs,
1308 .attrib_count = draw_info->attrib_count,
1309 };
1310
1311 if (draw_info->index_size) {
1312 inputs.restart_index = draw_info->restart_index;
1313
1314 struct panfrost_ptr min_max_ctx_ptr =
1315 pan_pool_alloc_aligned(pool,
1316 sizeof(struct min_max_context),
1317 4);
1318 struct min_max_context *ctx = min_max_ctx_ptr.cpu;
1319
1320 ctx->min = UINT32_MAX;
1321 ctx->max = 0;
1322 inputs.min_max_ctx = min_max_ctx_ptr.gpu;
1323 }
1324
1325 void *invocation =
1326 pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
1327 panfrost_pack_work_groups_compute(invocation,
1328 1, 1, 1, 1, 1, 1,
1329 false, false);
1330
1331 pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
1332 cfg.job_task_split = 2;
1333 }
1334
1335 pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
1336 cfg.state = rsd;
1337 cfg.thread_storage = get_tls(pool->dev);
1338 cfg.push_uniforms =
1339 pan_pool_upload_aligned(pool, &inputs, sizeof(inputs), 16);
1340 }
1341
1342 unsigned global_dep = draw_info->last_indirect_draw;
1343 unsigned local_dep =
1344 panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
1345 &inputs, &draw_ctx);
1346
1347 if (!ctx->cpu) {
1348 *ctx = draw_ctx_ptr;
1349 memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
1350 }
1351
1352 return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
1353 false, true, local_dep, global_dep,
1354 &job, false);
1355 }
1356
1357 void
1358 GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
1359 struct pan_pool *bin_pool)
1360 {
1361 /* We allocate the states and varying_heap BO lazily to avoid
1362 * reserving memory when indirect draws are not used.
1363 */
1364 pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
1365 dev->indirect_draw_shaders.bin_pool = bin_pool;
1366 }
1367
1368 void
1369 GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
1370 {
1371 panfrost_bo_unreference(dev->indirect_draw_shaders.states);
1372 panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
1373 pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
1374 }
1375