/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <stdio.h>
#include "pan_bo.h"
#include "pan_shader.h"
#include "pan_scoreboard.h"
#include "pan_encoder.h"
#include "pan_indirect_draw.h"
#include "pan_pool.h"
#include "pan_util.h"
#include "panfrost-quirks.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_memory.h"
#include "util/macros.h"

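/* Descriptors are patched at 32-bit word granularity; WORD(x) is the byte
 * offset of 32-bit word x within a descriptor.
 */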
#define WORD(x) ((x) * 4)

#define LOOP \
        for (nir_loop *l = nir_push_loop(b); l != NULL; \
             nir_pop_loop(b, l), l = NULL)
#define BREAK nir_jump(b, nir_jump_break)
#define CONTINUE nir_jump(b, nir_jump_continue)

#define IF(cond) nir_push_if(b, cond);
#define ELSE nir_push_else(b, NULL);
#define ENDIF nir_pop_if(b, NULL);

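/* Number of threads the index min/max search is spread across. Each thread
 * starts at its own 32-bit word and strides through the index buffer by
 * MIN_MAX_JOBS words (see get_index_min_max()).
 */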
#define MIN_MAX_JOBS 128

struct draw_data {
        nir_ssa_def *draw_buf;
        nir_ssa_def *draw_buf_stride;
        nir_ssa_def *index_buf;
        nir_ssa_def *restart_index;
        nir_ssa_def *vertex_count;
        nir_ssa_def *start_instance;
        nir_ssa_def *instance_count;
        nir_ssa_def *vertex_start;
        nir_ssa_def *index_bias;
        nir_ssa_def *draw_ctx;
        nir_ssa_def *min_max_ctx;
};

struct instance_size {
        nir_ssa_def *raw;
        nir_ssa_def *padded;
        nir_ssa_def *packed;
};

struct jobs_data {
        nir_ssa_def *vertex_job;
        nir_ssa_def *tiler_job;
        nir_ssa_def *base_vertex_offset;
        nir_ssa_def *first_vertex_sysval;
        nir_ssa_def *base_vertex_sysval;
        nir_ssa_def *base_instance_sysval;
        nir_ssa_def *offset_start;
        nir_ssa_def *invocation;
};

struct varyings_data {
        nir_ssa_def *varying_bufs;
        nir_ssa_def *pos_ptr;
        nir_ssa_def *psiz_ptr;
        nir_variable *mem_ptr;
};

struct attribs_data {
        nir_ssa_def *attrib_count;
        nir_ssa_def *attrib_bufs;
        nir_ssa_def *attribs;
};

struct indirect_draw_shader_builder {
        nir_builder b;
        const struct panfrost_device *dev;
        unsigned flags;
        bool index_min_max_search;
        unsigned index_size;
        struct draw_data draw;
        struct instance_size instance_size;
        struct jobs_data jobs;
        struct varyings_data varyings;
        struct attribs_data attribs;
};

/* Describes an indirect draw (see glDrawArraysIndirect()) */

struct indirect_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        uint32_t start_instance;
};

struct indirect_indexed_draw_info {
        uint32_t count;
        uint32_t instance_count;
        uint32_t start;
        int32_t index_bias;
        uint32_t start_instance;
};

/* Store the min/max index in a separate context. This is not supported yet, but
 * the DDK seems to put all min/max search jobs at the beginning of the job chain
 * when multiple indirect draws are issued, to avoid the serialization caused by
 * the draw patching jobs which have the suppress_prefetch flag set. Merging the
 * min/max and draw contexts would prevent such optimizations (draw contexts are
 * shared by all indirect draws in a batch).
 */

struct min_max_context {
        uint32_t min;
        uint32_t max;
};

/* Per-batch context shared by all indirect draws queued to a given batch. */

struct indirect_draw_context {
        /* Pointer to the top of the varying heap. */
        mali_ptr varying_mem;
};

/* Indirect draw shader inputs. Those are stored in a UBO. */

struct indirect_draw_inputs {
        /* indirect_draw_context pointer */
        mali_ptr draw_ctx;

        /* min_max_context pointer */
        mali_ptr min_max_ctx;

        /* Pointer to an array of indirect_draw_info objects */
        mali_ptr draw_buf;

        /* Pointer to a uint32_t containing the number of draws to issue */
        mali_ptr draw_count_ptr;

        /* index buffer */
        mali_ptr index_buf;

        /* {base,first}_{vertex,instance} sysvals */
        mali_ptr first_vertex_sysval;
        mali_ptr base_vertex_sysval;
        mali_ptr base_instance_sysval;

        /* Pointers to various cmdstream structs that need to be patched */
        mali_ptr vertex_job;
        mali_ptr tiler_job;
        mali_ptr attrib_bufs;
        mali_ptr attribs;
        mali_ptr varying_bufs;
        uint32_t draw_count;
        uint32_t draw_buf_stride;
        uint32_t restart_index;
        uint32_t attrib_count;
};

static nir_ssa_def *
get_input_data(nir_builder *b, unsigned offset, unsigned size)
{
        assert(!(offset & 0x3));
        assert(size && !(size & 0x3));

        return nir_load_ubo(b, 1, size,
                            nir_imm_int(b, 0),
                            nir_imm_int(b, offset),
                            .align_mul = 4,
                            .align_offset = 0,
                            .range_base = 0,
                            .range = ~0);
}

#define get_input_field(b, name) \
        get_input_data(b, offsetof(struct indirect_draw_inputs, name), \
                       sizeof(((struct indirect_draw_inputs *)0)->name) * 8)

static nir_ssa_def *
get_address(nir_builder *b, nir_ssa_def *base, nir_ssa_def *offset)
{
        return nir_iadd(b, base, nir_u2u64(b, offset));
}

static nir_ssa_def *
get_address_imm(nir_builder *b, nir_ssa_def *base, unsigned offset)
{
        return get_address(b, base, nir_imm_int(b, offset));
}

static nir_ssa_def *
load_global(nir_builder *b, nir_ssa_def *addr, unsigned ncomps, unsigned bit_size)
{
        return nir_load_global(b, addr, 4, ncomps, bit_size);
}

static void
store_global(nir_builder *b, nir_ssa_def *addr,
             nir_ssa_def *value, unsigned ncomps)
{
        nir_store_global(b, addr, 4, value, (1 << ncomps) - 1);
}

static nir_ssa_def *
get_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.draw_ctx, offset),
                           1, size);
}

static void
set_draw_ctx_data(struct indirect_draw_shader_builder *builder,
                  unsigned offset, nir_ssa_def *value, unsigned size)
{
        nir_builder *b = &builder->b;
        store_global(b,
                     get_address_imm(b, builder->draw.draw_ctx, offset),
                     value, 1);
}

#define get_draw_ctx_field(builder, name) \
        get_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

#define set_draw_ctx_field(builder, name, val) \
        set_draw_ctx_data(builder, \
                          offsetof(struct indirect_draw_context, name), \
                          val, \
                          sizeof(((struct indirect_draw_context *)0)->name) * 8)

static nir_ssa_def *
get_min_max_ctx_data(struct indirect_draw_shader_builder *builder,
                     unsigned offset, unsigned size)
{
        nir_builder *b = &builder->b;
        return load_global(b,
                           get_address_imm(b, builder->draw.min_max_ctx, offset),
                           1, size);
}

#define get_min_max_ctx_field(builder, name) \
        get_min_max_ctx_data(builder, \
                             offsetof(struct min_max_context, name), \
                             sizeof(((struct min_max_context *)0)->name) * 8)

static void
update_min(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, min));
        nir_global_atomic_umin(b, 32, addr, val);
}

static void
update_max(struct indirect_draw_shader_builder *builder, nir_ssa_def *val)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *addr =
                get_address_imm(b,
                                builder->draw.min_max_ctx,
                                offsetof(struct min_max_context, max));
        nir_global_atomic_umax(b, 32, addr, val);
}

#define get_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_draw_info, field)), \
                    1, sizeof(((struct indirect_draw_info *)0)->field) * 8)

#define get_indexed_draw_field(b, draw_ptr, field) \
        load_global(b, \
                    get_address_imm(b, draw_ptr, \
                                    offsetof(struct indirect_indexed_draw_info, field)), \
                    1, sizeof(((struct indirect_indexed_draw_info *)0)->field) * 8)

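/* Load the shader inputs (the indirect_draw_inputs struct, bound as the
 * single UBO declared in init_shader_builder()) into SSA values. The min/max
 * search variant only needs the draw/index buffer and min/max context fields.
 */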
static void
extract_inputs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        builder->draw.draw_ctx = get_input_field(b, draw_ctx);
        builder->draw.draw_buf = get_input_field(b, draw_buf);
        builder->draw.draw_buf_stride = get_input_field(b, draw_buf_stride);

        if (builder->index_size) {
                builder->draw.index_buf = get_input_field(b, index_buf);
                builder->draw.min_max_ctx = get_input_field(b, min_max_ctx);
                if (builder->flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) {
                        builder->draw.restart_index =
                                get_input_field(b, restart_index);
                }
        }

        if (builder->index_min_max_search)
                return;

        builder->jobs.first_vertex_sysval = get_input_field(b, first_vertex_sysval);
        builder->jobs.base_vertex_sysval = get_input_field(b, base_vertex_sysval);
        builder->jobs.base_instance_sysval = get_input_field(b, base_instance_sysval);
        builder->jobs.vertex_job = get_input_field(b, vertex_job);
        builder->jobs.tiler_job = get_input_field(b, tiler_job);
        builder->attribs.attrib_bufs = get_input_field(b, attrib_bufs);
        builder->attribs.attribs = get_input_field(b, attribs);
        builder->attribs.attrib_count = get_input_field(b, attrib_count);
        builder->varyings.varying_bufs = get_input_field(b, varying_bufs);
        builder->varyings.mem_ptr =
                nir_local_variable_create(b->impl,
                                          glsl_uint64_t_type(),
                                          "var_mem_ptr");
        nir_store_var(b, builder->varyings.mem_ptr,
                      get_draw_ctx_field(builder, varying_mem), 3);
}

static void
init_shader_builder(struct indirect_draw_shader_builder *builder,
                    const struct panfrost_device *dev,
                    unsigned flags, unsigned index_size,
                    bool index_min_max_search)
{
        memset(builder, 0, sizeof(*builder));
        builder->dev = dev;
        builder->flags = flags;
        builder->index_size = index_size;

        builder->index_min_max_search = index_min_max_search;

        if (index_min_max_search) {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw_min_max_index(index_size=%d)",
                                                       builder->index_size);
        } else {
                builder->b =
                        nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
                                                       GENX(pan_shader_get_compiler_options)(),
                                                       "indirect_draw(index_size=%d%s%s%s)",
                                                       builder->index_size,
                                                       flags & PAN_INDIRECT_DRAW_HAS_PSIZ ?
                                                       ",psiz" : "",
                                                       flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART ?
                                                       ",primitive_restart" : "",
                                                       flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE ?
                                                       ",update_primitive_size" : "");
        }

        nir_builder *b = &builder->b;
        b->shader->info.internal = true;
        nir_variable_create(b->shader, nir_var_mem_ubo,
                            glsl_uint_type(), "inputs");
        b->shader->info.num_ubos++;

        extract_inputs(builder);
}

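/* Patch a vertex or tiler job with the values computed at draw time: the
 * invocation words, the PRIMITIVE index count/base offset, the position and
 * point size varying pointers, and the packed instance size and offset_start
 * in the DRAW section.
 */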
static void
update_job(struct indirect_draw_shader_builder *builder, enum mali_job_type type)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *job_ptr =
                type == MALI_JOB_TYPE_VERTEX ?
                builder->jobs.vertex_job : builder->jobs.tiler_job;

        /* Update the invocation words. */
        store_global(b, get_address_imm(b, job_ptr, WORD(8)),
                     builder->jobs.invocation, 2);

        unsigned draw_offset =
                type == MALI_JOB_TYPE_VERTEX ?
                pan_section_offset(COMPUTE_JOB, DRAW) :
                pan_section_offset(TILER_JOB, DRAW);
        unsigned prim_offset = pan_section_offset(TILER_JOB, PRIMITIVE);
        unsigned psiz_offset = pan_section_offset(TILER_JOB, PRIMITIVE_SIZE);
        unsigned index_size = builder->index_size;

        if (type == MALI_JOB_TYPE_TILER) {
                /* Update PRIMITIVE.{base_vertex_offset,count} */
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(1)),
                             builder->jobs.base_vertex_offset, 1);
                store_global(b,
                             get_address_imm(b, job_ptr, prim_offset + WORD(3)),
                             nir_iadd_imm(b, builder->draw.vertex_count, -1), 1);

                if (index_size) {
                        nir_ssa_def *addr =
                                get_address_imm(b, job_ptr, prim_offset + WORD(4));
                        nir_ssa_def *indices = load_global(b, addr, 1, 64);
                        nir_ssa_def *offset =
                                nir_imul_imm(b, builder->draw.vertex_start, index_size);

                        indices = get_address(b, indices, offset);
                        store_global(b, addr, indices, 2);
                }

                /* Update PRIMITIVE_SIZE.size_array */
                if ((builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) &&
                    (builder->flags & PAN_INDIRECT_DRAW_UPDATE_PRIM_SIZE)) {
                        store_global(b,
                                     get_address_imm(b, job_ptr, psiz_offset + WORD(0)),
                                     builder->varyings.psiz_ptr, 2);
                }

                /* Update DRAW.position */
                store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(4)),
                             builder->varyings.pos_ptr, 2);
        }

        nir_ssa_def *draw_w01 =
                load_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)), 2, 32);
        nir_ssa_def *draw_w0 = nir_channel(b, draw_w01, 0);

        /* Update DRAW.{instance_size,offset_start} */
        nir_ssa_def *instance_size =
                nir_bcsel(b,
                          nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2)),
                          nir_imm_int(b, 0), builder->instance_size.packed);
        draw_w01 = nir_vec2(b,
                            nir_ior(b, nir_iand_imm(b, draw_w0, 0xffff),
                                    nir_ishl(b, instance_size, nir_imm_int(b, 16))),
                            builder->jobs.offset_start);
        store_global(b, get_address_imm(b, job_ptr, draw_offset + WORD(0)),
                     draw_w01, 2);
}

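/* Split a non-power-of-two divisor into the (shift | extra-flag, magic
 * dividend) pair used by 1D_NPOT_DIVISOR attribute buffers. This mirrors the
 * magic-divisor computation the CPU-side command stream code does for static
 * divisors, but has to be done on the GPU here because the padded instance
 * size is only known at draw time.
 */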
static void
split_div(nir_builder *b, nir_ssa_def *div, nir_ssa_def **r_e, nir_ssa_def **d)
{
        /* TODO: Lower this 64bit div to something GPU-friendly */
        nir_ssa_def *r = nir_imax(b, nir_ufind_msb(b, div), nir_imm_int(b, 0));
        nir_ssa_def *div64 = nir_u2u64(b, div);
        nir_ssa_def *half_div64 = nir_u2u64(b, nir_ushr_imm(b, div, 1));
        nir_ssa_def *f0 = nir_iadd(b,
                                   nir_ishl(b, nir_imm_int64(b, 1),
                                            nir_iadd_imm(b, r, 32)),
                                   half_div64);
        nir_ssa_def *fi = nir_idiv(b, f0, div64);
        nir_ssa_def *ff = nir_isub(b, f0, nir_imul(b, fi, div64));
        nir_ssa_def *e = nir_bcsel(b, nir_ult(b, half_div64, ff),
                                   nir_imm_int(b, 1 << 5), nir_imm_int(b, 0));
        *d = nir_iand_imm(b, nir_u2u32(b, fi), ~(1 << 31));
        *r_e = nir_ior(b, r, e);
}

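/* Rewrite the type and divisor fields (words 0/1) of an attribute buffer
 * descriptor, plus the NPOT divisor numerator stored in the continuation
 * descriptor when the divisor is not a power of two.
 */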
static void
update_vertex_attrib_buf(struct indirect_draw_shader_builder *builder,
                         nir_ssa_def *attrib_buf_ptr,
                         enum mali_attribute_type type,
                         nir_ssa_def *div1,
                         nir_ssa_def *div2)
{
        nir_builder *b = &builder->b;
        unsigned type_mask = BITFIELD_MASK(6);
        nir_ssa_def *w01 = load_global(b, attrib_buf_ptr, 2, 32);
        nir_ssa_def *w0 = nir_channel(b, w01, 0);
        nir_ssa_def *w1 = nir_channel(b, w01, 1);

        /* Words 0 and 1 of the attribute descriptor contain the type,
         * the pointer and the divisor exponent.
         */
        w0 = nir_iand_imm(b, nir_channel(b, w01, 0), ~type_mask);
        w0 = nir_ior(b, w0, nir_imm_int(b, type));
        w1 = nir_ior(b, w1, nir_ishl(b, div1, nir_imm_int(b, 24)));

        store_global(b, attrib_buf_ptr, nir_vec2(b, w0, w1), 2);

        if (type == MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR) {
                /* If the divisor is not a power of two, the divisor numerator
                 * is passed in word 1 of the continuation attribute (word 5
                 * if we consider the attribute and its continuation as a
                 * single attribute).
                 */
                assert(div2);
                store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(5)),
                             div2, 1);
        }
}

static void
zero_attrib_buf_stride(struct indirect_draw_shader_builder *builder,
                       nir_ssa_def *attrib_buf_ptr)
{
        /* Stride is an unadorned 32-bit uint at word 2 */
        nir_builder *b = &builder->b;
        store_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)),
                     nir_imm_int(b, 0), 1);
}

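/* Adjust per-instance attribute offsets: add the contribution of a non-zero
 * start_instance, and subtract the offset_start contribution so per-instance
 * data is not shifted by a delayed start (see the comment in the body).
 */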
static void
adjust_attrib_offset(struct indirect_draw_shader_builder *builder,
                     nir_ssa_def *attrib_ptr, nir_ssa_def *attrib_buf_ptr,
                     nir_ssa_def *instance_div)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *two = nir_imm_int(b, 2);
        nir_ssa_def *sub_cur_offset =
                nir_iand(b, nir_ine(b, builder->jobs.offset_start, zero),
                         nir_uge(b, builder->draw.instance_count, two));

        nir_ssa_def *add_base_inst_offset =
                nir_iand(b, nir_ine(b, builder->draw.start_instance, zero),
                         nir_ine(b, instance_div, zero));

        IF (nir_ior(b, sub_cur_offset, add_base_inst_offset)) {
                nir_ssa_def *offset =
                        load_global(b, get_address_imm(b, attrib_ptr, WORD(1)), 1, 32);
                nir_ssa_def *stride =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(2)), 1, 32);

                /* Per-instance data needs to be offset in response to a
                 * delayed start in an indexed draw.
                 */

                IF (add_base_inst_offset) {
                        offset = nir_iadd(b, offset,
                                          nir_idiv(b,
                                                   nir_imul(b, stride,
                                                            builder->draw.start_instance),
                                                   instance_div));
                } ENDIF

                IF (sub_cur_offset) {
                        offset = nir_isub(b, offset,
                                          nir_imul(b, stride,
                                                   builder->jobs.offset_start));
                } ENDIF

                store_global(b, get_address_imm(b, attrib_ptr, WORD(1)),
                             offset, 1);
        } ENDIF
}

/* x is power of two or zero <===> x has 0 (zero) or 1 (POT) bits set */

static nir_ssa_def *
nir_is_power_of_two_or_zero(nir_builder *b, nir_ssa_def *x)
{
        return nir_ult(b, nir_bit_count(b, x), nir_imm_int(b, 2));
}

/* Based on panfrost_emit_vertex_data() */

static void
update_vertex_attribs(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_variable *attrib_idx_var =
                nir_local_variable_create(b->impl, glsl_uint_type(),
                                          "attrib_idx");
        nir_store_var(b, attrib_idx_var, nir_imm_int(b, 0), 1);

#if PAN_ARCH <= 5
        nir_ssa_def *single_instance =
                nir_ult(b, builder->draw.instance_count, nir_imm_int(b, 2));
#endif

        LOOP {
                nir_ssa_def *attrib_idx = nir_load_var(b, attrib_idx_var);
                IF (nir_uge(b, attrib_idx, builder->attribs.attrib_count))
                        BREAK;
                ENDIF

                nir_ssa_def *attrib_buf_ptr =
                        get_address(b, builder->attribs.attrib_bufs,
                                    nir_imul_imm(b, attrib_idx,
                                                 2 * pan_size(ATTRIBUTE_BUFFER)));
                nir_ssa_def *attrib_ptr =
                        get_address(b, builder->attribs.attribs,
                                    nir_imul_imm(b, attrib_idx,
                                                 pan_size(ATTRIBUTE)));

                nir_ssa_def *r_e, *d;

#if PAN_ARCH <= 5
                IF (nir_ieq_imm(b, attrib_idx, PAN_VERTEX_ID)) {
                        nir_ssa_def *r_p =
                                nir_bcsel(b, single_instance,
                                          nir_imm_int(b, 0x9f),
                                          builder->instance_size.packed);

                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(4)),
                                     nir_ishl(b, r_p, nir_imm_int(b, 24)), 1);

                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF

                IF (nir_ieq_imm(b, attrib_idx, PAN_INSTANCE_ID)) {
                        split_div(b, builder->instance_size.padded,
                                  &r_e, &d);
                        nir_ssa_def *default_div =
                                nir_ior(b, single_instance,
                                        nir_ult(b,
                                                builder->instance_size.padded,
                                                nir_imm_int(b, 2)));
                        r_e = nir_bcsel(b, default_div,
                                        nir_imm_int(b, 0x3f), r_e);
                        d = nir_bcsel(b, default_div,
                                      nir_imm_int(b, (1u << 31) - 1), d);
                        store_global(b,
                                     get_address_imm(b, attrib_buf_ptr, WORD(1)),
                                     nir_vec2(b, nir_ishl(b, r_e, nir_imm_int(b, 24)), d),
                                     2);
                        nir_store_var(b, attrib_idx_var,
                                      nir_iadd_imm(b, attrib_idx, 1), 1);
                        CONTINUE;
                } ENDIF
#endif

                nir_ssa_def *instance_div =
                        load_global(b, get_address_imm(b, attrib_buf_ptr, WORD(7)), 1, 32);

                nir_ssa_def *div = nir_imul(b, instance_div, builder->instance_size.padded);

                nir_ssa_def *multi_instance =
                        nir_uge(b, builder->draw.instance_count, nir_imm_int(b, 2));

                IF (nir_ine(b, div, nir_imm_int(b, 0))) {
                        IF (multi_instance) {
                                IF (nir_is_power_of_two_or_zero(b, div)) {
                                        nir_ssa_def *exp =
                                                nir_imax(b, nir_ufind_msb(b, div),
                                                         nir_imm_int(b, 0));
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR,
                                                                 exp, NULL);
                                } ELSE {
                                        split_div(b, div, &r_e, &d);
                                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                                 MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR,
                                                                 r_e, d);
                                } ENDIF
                        } ELSE {
                                /* Single instance with a non-0 divisor: all
                                 * accesses should point to attribute 0 */
                                zero_attrib_buf_stride(builder, attrib_buf_ptr);
                        } ENDIF

                        adjust_attrib_offset(builder, attrib_ptr, attrib_buf_ptr, instance_div);
                } ELSE IF (multi_instance) {
                        update_vertex_attrib_buf(builder, attrib_buf_ptr,
                                                 MALI_ATTRIBUTE_TYPE_1D_MODULUS,
                                                 builder->instance_size.packed, NULL);
                } ENDIF ENDIF

                nir_store_var(b, attrib_idx_var, nir_iadd_imm(b, attrib_idx, 1), 1);
        }
}

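/* Point a varying buffer at freshly allocated space in the varying heap,
 * sized for the actual vertex count, and return the buffer base address
 * (used to patch the position/point size pointers in the tiler job).
 */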
static nir_ssa_def *
update_varying_buf(struct indirect_draw_shader_builder *builder,
                   nir_ssa_def *varying_buf_ptr,
                   nir_ssa_def *vertex_count)
{
        nir_builder *b = &builder->b;

        nir_ssa_def *stride =
                load_global(b, get_address_imm(b, varying_buf_ptr, WORD(2)), 1, 32);
        nir_ssa_def *size = nir_imul(b, stride, vertex_count);
        nir_ssa_def *aligned_size =
                nir_iand_imm(b, nir_iadd_imm(b, size, 63), ~63);
        nir_ssa_def *var_mem_ptr =
                nir_load_var(b, builder->varyings.mem_ptr);
        nir_ssa_def *w0 =
                nir_ior(b, nir_unpack_64_2x32_split_x(b, var_mem_ptr),
                        nir_imm_int(b, MALI_ATTRIBUTE_TYPE_1D));
        nir_ssa_def *w1 = nir_unpack_64_2x32_split_y(b, var_mem_ptr);
        store_global(b, get_address_imm(b, varying_buf_ptr, WORD(0)),
                     nir_vec4(b, w0, w1, stride, size), 4);

        nir_store_var(b, builder->varyings.mem_ptr,
                      get_address(b, var_mem_ptr, aligned_size), 3);

        return var_mem_ptr;
}

/* Based on panfrost_emit_varying_descriptor() */

static void
update_varyings(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *vertex_count =
                nir_imul(b, builder->instance_size.padded,
                         builder->draw.instance_count);
        nir_ssa_def *buf_ptr =
                get_address_imm(b, builder->varyings.varying_bufs,
                                PAN_VARY_GENERAL *
                                pan_size(ATTRIBUTE_BUFFER));
        update_varying_buf(builder, buf_ptr, vertex_count);

        buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                  PAN_VARY_POSITION *
                                  pan_size(ATTRIBUTE_BUFFER));
        builder->varyings.pos_ptr =
                update_varying_buf(builder, buf_ptr, vertex_count);

        if (builder->flags & PAN_INDIRECT_DRAW_HAS_PSIZ) {
                buf_ptr = get_address_imm(b, builder->varyings.varying_bufs,
                                          PAN_VARY_PSIZ *
                                          pan_size(ATTRIBUTE_BUFFER));
                builder->varyings.psiz_ptr =
                        update_varying_buf(builder, buf_ptr, vertex_count);
        }

        set_draw_ctx_field(builder, varying_mem,
                           nir_load_var(b, builder->varyings.mem_ptr));
}

/* Based on panfrost_pack_work_groups_compute() */

static void
get_invocation(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *max_vertex =
                nir_usub_sat(b, builder->instance_size.raw, one);
        nir_ssa_def *max_instance =
                nir_usub_sat(b, builder->draw.instance_count, one);
        nir_ssa_def *split =
                nir_bcsel(b, nir_ieq_imm(b, max_instance, 0),
                          nir_imm_int(b, 32),
                          nir_iadd_imm(b, nir_ufind_msb(b, max_vertex), 1));

        builder->jobs.invocation =
                nir_vec2(b,
                         nir_ior(b, max_vertex,
                                 nir_ishl(b, max_instance, split)),
                         nir_ior(b, nir_ishl(b, split, nir_imm_int(b, 22)),
                                 nir_imm_int(b, 2 << 28)));
}

/* Based on panfrost_padded_vertex_count() */

static nir_ssa_def *
get_padded_count(nir_builder *b, nir_ssa_def *val, nir_ssa_def **packed)
{
        nir_ssa_def *one = nir_imm_int(b, 1);
        nir_ssa_def *zero = nir_imm_int(b, 0);
        nir_ssa_def *eleven = nir_imm_int(b, 11);
        nir_ssa_def *four = nir_imm_int(b, 4);

        nir_ssa_def *exp =
                nir_usub_sat(b, nir_imax(b, nir_ufind_msb(b, val), zero), four);
        nir_ssa_def *base = nir_ushr(b, val, exp);

        base = nir_iadd(b, base,
                        nir_bcsel(b, nir_ine(b, val, nir_ishl(b, base, exp)), one, zero));

        nir_ssa_def *rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);
        base = nir_iadd(b, base, nir_bcsel(b, nir_uge(b, base, eleven), one, zero));
        rshift = nir_imax(b, nir_find_lsb(b, base), zero);
        exp = nir_iadd(b, exp, rshift);
        base = nir_ushr(b, base, rshift);

        *packed = nir_ior(b, exp,
                          nir_ishl(b, nir_ushr_imm(b, base, 1), nir_imm_int(b, 5)));
        return nir_ishl(b, base, exp);
}

static void
update_jobs(struct indirect_draw_shader_builder *builder)
{
        get_invocation(builder);
        update_job(builder, MALI_JOB_TYPE_VERTEX);
        update_job(builder, MALI_JOB_TYPE_TILER);
}

static void
set_null_job(struct indirect_draw_shader_builder *builder,
             nir_ssa_def *job_ptr)
{
        nir_builder *b = &builder->b;
        nir_ssa_def *w4 = get_address_imm(b, job_ptr, WORD(4));
        nir_ssa_def *val = load_global(b, w4, 1, 32);

        /* Set job type to NULL (AKA NOOP) */
        val = nir_ior(b, nir_iand_imm(b, val, 0xffffff01),
                      nir_imm_int(b, MALI_JOB_TYPE_NULL << 1));
        store_global(b, w4, val, 1);
}

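/* Compute the number of vertices per instance. For non-indexed draws this is
 * just the vertex count; for indexed draws it is derived from the min/max
 * search results, with the partially covered words at both ends of the index
 * range re-scanned here to cope with unaligned index buffers.
 */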
static void
get_instance_size(struct indirect_draw_shader_builder *builder)
{
        nir_builder *b = &builder->b;

        if (!builder->index_size) {
                builder->jobs.base_vertex_offset = nir_imm_int(b, 0);
                builder->jobs.offset_start = builder->draw.vertex_start;
                builder->instance_size.raw = builder->draw.vertex_count;
                return;
        }

        unsigned index_size = builder->index_size;
        nir_ssa_def *min = get_min_max_ctx_field(builder, min);
        nir_ssa_def *max = get_min_max_ctx_field(builder, max);

        /* We handle unaligned indices here to avoid the extra complexity in
         * the min/max search job.
         */
        if (builder->index_size < 4) {
                nir_variable *min_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "min");
                nir_store_var(b, min_var, min, 1);
                nir_variable *max_var =
                        nir_local_variable_create(b->impl, glsl_uint_type(), "max");
                nir_store_var(b, max_var, max, 1);

                nir_ssa_def *base =
                        get_address(b, builder->draw.index_buf,
                                    nir_imul_imm(b, builder->draw.vertex_start, index_size));
                nir_ssa_def *offset = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
                nir_ssa_def *end =
                        nir_iadd(b, offset,
                                 nir_imul_imm(b, builder->draw.vertex_count, index_size));
                nir_ssa_def *aligned_end = nir_iand_imm(b, end, ~3);
                unsigned shift = index_size * 8;
                unsigned mask = (1 << shift) - 1;

                base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

                /* The start offset is unaligned, so we need to ignore any data
                 * outside the requested range. We also handle ranges covering
                 * less than two words here.
                 */
                IF (nir_ior(b, nir_ine(b, offset, nir_imm_int(b, 0)), nir_ieq(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, base, 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob =
                                        nir_ior(b,
                                                nir_ult(b, nir_imm_int(b, i), offset),
                                                nir_uge(b, nir_imm_int(b, i), end));
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                nir_ssa_def *remaining = nir_isub(b, end, aligned_end);

                /* The last word contains less than 4 bytes of data; we need to
                 * discard anything falling outside the requested range.
                 */
                IF (nir_iand(b, nir_ine(b, end, aligned_end), nir_ine(b, aligned_end, nir_imm_int(b, 0)))) {
                        min = nir_load_var(b, min_var);
                        max = nir_load_var(b, max_var);

                        nir_ssa_def *val = load_global(b, get_address(b, base, aligned_end), 1, 32);
                        for (unsigned i = 0; i < sizeof(uint32_t); i += index_size) {
                                nir_ssa_def *oob = nir_uge(b, nir_imm_int(b, i), remaining);
                                nir_ssa_def *data = nir_iand_imm(b, val, mask);

                                min = nir_umin(b, min,
                                               nir_bcsel(b, oob, nir_imm_int(b, UINT32_MAX), data));
                                max = nir_umax(b, max,
                                               nir_bcsel(b, oob, nir_imm_int(b, 0), data));
                                val = nir_ushr_imm(b, val, shift);
                        }

                        nir_store_var(b, min_var, min, 1);
                        nir_store_var(b, max_var, max, 1);
                } ENDIF

                min = nir_load_var(b, min_var);
                max = nir_load_var(b, max_var);
        }

        builder->jobs.base_vertex_offset = nir_ineg(b, min);
        builder->jobs.offset_start = nir_iadd(b, min, builder->draw.index_bias);
        builder->instance_size.raw = nir_iadd_imm(b, nir_usub_sat(b, max, min), 1);
}

/* Patch a draw sequence */

static void
patch(struct indirect_draw_shader_builder *builder)
{
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        if (index_size) {
                builder->draw.vertex_count = get_indexed_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_indexed_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count =
                        get_indexed_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_indexed_draw_field(b, draw_ptr, start);
                builder->draw.index_bias = get_indexed_draw_field(b, draw_ptr, index_bias);
        } else {
                builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
                builder->draw.start_instance = get_draw_field(b, draw_ptr, start_instance);
                builder->draw.instance_count = get_draw_field(b, draw_ptr, instance_count);
                builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);
        }

        assert(builder->draw.vertex_count->num_components);

        nir_ssa_def *num_vertices =
                nir_imul(b, builder->draw.vertex_count, builder->draw.instance_count);

        IF (nir_ieq(b, num_vertices, nir_imm_int(b, 0))) {
                /* If there's nothing to draw, turn the vertex/tiler jobs into
                 * null jobs.
                 */
                set_null_job(builder, builder->jobs.vertex_job);
                set_null_job(builder, builder->jobs.tiler_job);
        } ELSE {
                get_instance_size(builder);

                builder->instance_size.padded =
                        get_padded_count(b, builder->instance_size.raw,
                                         &builder->instance_size.packed);

                update_varyings(builder);
                update_jobs(builder);
                update_vertex_attribs(builder);

                IF (nir_ine(b, builder->jobs.first_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.first_vertex_sysval,
                                     builder->jobs.offset_start, 1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_vertex_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_vertex_sysval,
                                     index_size ?
                                     builder->draw.index_bias :
                                     nir_imm_int(b, 0),
                                     1);
                } ENDIF

                IF (nir_ine(b, builder->jobs.base_instance_sysval, nir_imm_int64(b, 0))) {
                        store_global(b, builder->jobs.base_instance_sysval,
                                     builder->draw.start_instance, 1);
                } ENDIF
        } ENDIF
}

/* Search the min/max index in the range covered by the indirect draw call */

static void
get_index_min_max(struct indirect_draw_shader_builder *builder)
{
        nir_ssa_def *restart_index = builder->draw.restart_index;
        unsigned index_size = builder->index_size;
        nir_builder *b = &builder->b;

        nir_ssa_def *draw_ptr = builder->draw.draw_buf;

        builder->draw.vertex_count = get_draw_field(b, draw_ptr, count);
        builder->draw.vertex_start = get_draw_field(b, draw_ptr, start);

        nir_ssa_def *thread_id = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
        nir_variable *min_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "min");
        nir_store_var(b, min_var, nir_imm_int(b, UINT32_MAX), 1);
        nir_variable *max_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "max");
        nir_store_var(b, max_var, nir_imm_int(b, 0), 1);

        nir_ssa_def *base =
                get_address(b, builder->draw.index_buf,
                            nir_imul_imm(b, builder->draw.vertex_start, index_size));

        nir_ssa_def *start = nir_iand_imm(b, nir_unpack_64_2x32_split_x(b, base), 3);
        nir_ssa_def *end =
                nir_iadd(b, start, nir_imul_imm(b, builder->draw.vertex_count, index_size));

        base = nir_iand(b, base, nir_imm_int64(b, ~3ULL));

        /* Align on 4 bytes, non-aligned indices are handled in the indirect draw job. */
        start = nir_iand_imm(b, nir_iadd_imm(b, start, 3), ~3);
        end = nir_iand_imm(b, end, ~3);

        /* Add the job offset. */
        start = nir_iadd(b, start, nir_imul_imm(b, thread_id, sizeof(uint32_t)));

        nir_variable *offset_var =
                nir_local_variable_create(b->impl, glsl_uint_type(), "offset");
        nir_store_var(b, offset_var, start, 1);

        LOOP {
                nir_ssa_def *offset = nir_load_var(b, offset_var);
                IF (nir_uge(b, offset, end))
                        BREAK;
                ENDIF

                nir_ssa_def *val = load_global(b, get_address(b, base, offset), 1, 32);
                nir_ssa_def *old_min = nir_load_var(b, min_var);
                nir_ssa_def *old_max = nir_load_var(b, max_var);
                nir_ssa_def *new_min;
                nir_ssa_def *new_max;

                /* TODO: use 8/16 bit arithmetic when index_size < 4. */
                for (unsigned i = 0; i < 4; i += index_size) {
                        nir_ssa_def *data = nir_ushr_imm(b, val, i * 8);
                        data = nir_iand_imm(b, data, (1ULL << (index_size * 8)) - 1);
                        new_min = nir_umin(b, old_min, data);
                        new_max = nir_umax(b, old_max, data);
                        if (restart_index) {
                                new_min = nir_bcsel(b, nir_ine(b, restart_index, data), new_min, old_min);
                                new_max = nir_bcsel(b, nir_ine(b, restart_index, data), new_max, old_max);
                        }
                        old_min = new_min;
                        old_max = new_max;
                }

                nir_store_var(b, min_var, new_min, 1);
                nir_store_var(b, max_var, new_max, 1);
                nir_store_var(b, offset_var,
                              nir_iadd_imm(b, offset, MIN_MAX_JOBS * sizeof(uint32_t)), 1);
        }

        IF (nir_ult(b, start, end))
                update_min(builder, nir_load_var(b, min_var));
                update_max(builder, nir_load_var(b, max_var));
        ENDIF
}

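/* Map (flags, index_size, index_min_max_search) to an entry in the indirect
 * draw shader cache. Draw-patching shaders are indexed by their flags with
 * the index size encoded in the low bits; min/max search shaders get
 * dedicated IDs per index size, with or without primitive restart.
 */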
static unsigned
get_shader_id(unsigned flags, unsigned index_size, bool index_min_max_search)
{
        if (!index_min_max_search) {
                flags &= PAN_INDIRECT_DRAW_FLAGS_MASK;
                flags &= ~PAN_INDIRECT_DRAW_INDEX_SIZE_MASK;
                if (index_size)
                        flags |= (util_logbase2(index_size) + 1);
                return flags;
        }

        return ((flags & PAN_INDIRECT_DRAW_PRIMITIVE_RESTART) ?
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX_PRIM_RESTART :
                PAN_INDIRECT_DRAW_MIN_MAX_SEARCH_1B_INDEX) +
               util_logbase2(index_size);
}

static void
create_indirect_draw_shader(struct panfrost_device *dev,
                            unsigned flags, unsigned index_size,
                            bool index_min_max_search)
{
        assert(flags < PAN_INDIRECT_DRAW_NUM_SHADERS);
        struct indirect_draw_shader_builder builder;
        init_shader_builder(&builder, dev, flags, index_size, index_min_max_search);

        nir_builder *b = &builder.b;

        if (index_min_max_search)
                get_index_min_max(&builder);
        else
                patch(&builder);

        struct panfrost_compile_inputs inputs = { .gpu_id = dev->gpu_id };
        struct pan_shader_info shader_info;
        struct util_dynarray binary;

        util_dynarray_init(&binary, NULL);
        GENX(pan_shader_compile)(b->shader, &inputs, &binary, &shader_info);

        assert(!shader_info.tls_size);
        assert(!shader_info.wls_size);
        assert(!shader_info.sysvals.sysval_count);

        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *draw_shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        void *state = dev->indirect_draw_shaders.states->ptr.cpu +
                      (shader_id * pan_size(RENDERER_STATE));

        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (!draw_shader->rsd) {
                mali_ptr address =
                        pan_pool_upload_aligned(dev->indirect_draw_shaders.bin_pool,
                                                binary.data, binary.size,
                                                PAN_ARCH >= 6 ? 128 : 64);

#if PAN_ARCH <= 5
                address |= shader_info.midgard.first_tag;
#endif

                util_dynarray_fini(&binary);

                pan_pack(state, RENDERER_STATE, cfg) {
                        pan_shader_prepare_rsd(&shader_info, address, &cfg);
                }

                draw_shader->push = shader_info.push;
                draw_shader->rsd = dev->indirect_draw_shaders.states->ptr.gpu +
                                   (shader_id * pan_size(RENDERER_STATE));
        }
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);

        ralloc_free(b->shader);
}

static mali_ptr
get_renderer_state(struct panfrost_device *dev, unsigned flags,
                   unsigned index_size, bool index_min_max_search)
{
        unsigned shader_id = get_shader_id(flags, index_size, index_min_max_search);
        struct pan_indirect_draw_shader *info =
                &dev->indirect_draw_shaders.shaders[shader_id];

        if (!info->rsd) {
                create_indirect_draw_shader(dev, flags, index_size,
                                            index_min_max_search);
                assert(info->rsd);
        }

        return info->rsd;
}

static mali_ptr
get_tls(const struct panfrost_device *dev)
{
        return dev->indirect_draw_shaders.states->ptr.gpu +
               (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
}

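/* Upload the indirect_draw_inputs struct and wrap it in a uniform buffer
 * descriptor, matching the single UBO the shader reads its inputs from.
 */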
static mali_ptr
get_ubos(struct pan_pool *pool,
         const struct indirect_draw_inputs *inputs)
{
        struct panfrost_ptr inputs_buf =
                pan_pool_alloc_aligned(pool, sizeof(*inputs), 16);

        memcpy(inputs_buf.cpu, inputs, sizeof(*inputs));

        struct panfrost_ptr ubos_buf =
                pan_pool_alloc_desc(pool, UNIFORM_BUFFER);

        pan_pack(ubos_buf.cpu, UNIFORM_BUFFER, cfg) {
                cfg.entries = DIV_ROUND_UP(sizeof(*inputs), 16);
                cfg.pointer = inputs_buf.gpu;
        }

        return ubos_buf.gpu;
}

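/* Build the push constant buffer, copying only the 32-bit words of the
 * inputs struct that the compiler decided to push (described by
 * shader->push).
 */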
static mali_ptr
get_push_uniforms(struct pan_pool *pool,
                  const struct pan_indirect_draw_shader *shader,
                  const struct indirect_draw_inputs *inputs)
{
        if (!shader->push.count)
                return 0;

        struct panfrost_ptr push_consts_buf =
                pan_pool_alloc_aligned(pool, shader->push.count * 4, 16);
        uint32_t *out = push_consts_buf.cpu;
        uint8_t *in = (uint8_t *)inputs;

        for (unsigned i = 0; i < shader->push.count; ++i)
                memcpy(out + i, in + shader->push.words[i].offset, 4);

        return push_consts_buf.gpu;
}

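/* Lazily allocate the BOs shared by all indirect draws: the renderer state
 * descriptors (one per shader variant, plus a thread storage descriptor) and
 * the varying heap.
 */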
static void
panfrost_indirect_draw_alloc_deps(struct panfrost_device *dev)
{
        pthread_mutex_lock(&dev->indirect_draw_shaders.lock);
        if (dev->indirect_draw_shaders.states)
                goto out;

        unsigned state_bo_size = (PAN_INDIRECT_DRAW_NUM_SHADERS *
                                  pan_size(RENDERER_STATE)) +
                                 pan_size(LOCAL_STORAGE);

        dev->indirect_draw_shaders.states =
                panfrost_bo_create(dev, state_bo_size, 0, "Indirect draw states");

        /* Prepare the thread storage descriptor now since it's invariant. */
        void *tsd = dev->indirect_draw_shaders.states->ptr.cpu +
                    (PAN_INDIRECT_DRAW_NUM_SHADERS * pan_size(RENDERER_STATE));
        pan_pack(tsd, LOCAL_STORAGE, ls) {
                ls.wls_instances = MALI_LOCAL_STORAGE_NO_WORKGROUP_MEM;
        };

        /* FIXME: We currently allocate 512M of growable memory, which means we
         * only commit what we really use. The remaining problems are:
         * - allocation happens 2M at a time, which might be more than we
         *   actually need
         * - the memory is attached to the device to speed up subsequent
         *   indirect draws, but that also means it's never shrunk
         */
        dev->indirect_draw_shaders.varying_heap =
                panfrost_bo_create(dev, 512 * 1024 * 1024,
                                   PAN_BO_INVISIBLE | PAN_BO_GROWABLE,
                                   "Indirect draw varying heap");

out:
        pthread_mutex_unlock(&dev->indirect_draw_shaders.lock);
}

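/* Emit a compute job running the index min/max search shader across
 * MIN_MAX_JOBS threads. Returns the job index so the indirect draw job can
 * depend on it, or 0 (no dependency) for non-indexed draws.
 */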
static unsigned
panfrost_emit_index_min_max_search(struct pan_pool *pool,
                                   struct pan_scoreboard *scoreboard,
                                   const struct pan_indirect_draw_info *draw_info,
                                   const struct indirect_draw_inputs *inputs,
                                   struct indirect_draw_context *draw_ctx,
                                   mali_ptr ubos)
{
        struct panfrost_device *dev = pool->dev;
        unsigned index_size = draw_info->index_size;

        if (!index_size)
                return 0;

        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, true);
        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, true);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, MIN_MAX_JOBS, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 7;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, inputs);
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, false, 0, 0, &job, false);
}

unsigned
GENX(panfrost_emit_indirect_draw)(struct pan_pool *pool,
                                  struct pan_scoreboard *scoreboard,
                                  const struct pan_indirect_draw_info *draw_info,
                                  struct panfrost_ptr *ctx)
{
        struct panfrost_device *dev = pool->dev;

        /* Currently only tested on Bifrost, but the logic should be the same
         * on Midgard.
         */
        assert(pan_is_bifrost(dev));

        panfrost_indirect_draw_alloc_deps(dev);

        struct panfrost_ptr job =
                pan_pool_alloc_desc(pool, COMPUTE_JOB);
        mali_ptr rsd =
                get_renderer_state(dev, draw_info->flags,
                                   draw_info->index_size, false);

        struct indirect_draw_context draw_ctx = {
                .varying_mem = dev->indirect_draw_shaders.varying_heap->ptr.gpu,
        };

        struct panfrost_ptr draw_ctx_ptr = *ctx;
        if (!draw_ctx_ptr.cpu) {
                draw_ctx_ptr = pan_pool_alloc_aligned(pool,
                                                      sizeof(draw_ctx),
                                                      sizeof(mali_ptr));
        }

        struct indirect_draw_inputs inputs = {
                .draw_ctx = draw_ctx_ptr.gpu,
                .draw_buf = draw_info->draw_buf,
                .index_buf = draw_info->index_buf,
                .first_vertex_sysval = draw_info->first_vertex_sysval,
                .base_vertex_sysval = draw_info->base_vertex_sysval,
                .base_instance_sysval = draw_info->base_instance_sysval,
                .vertex_job = draw_info->vertex_job,
                .tiler_job = draw_info->tiler_job,
                .attrib_bufs = draw_info->attrib_bufs,
                .attribs = draw_info->attribs,
                .varying_bufs = draw_info->varying_bufs,
                .attrib_count = draw_info->attrib_count,
        };

        if (draw_info->index_size) {
                inputs.restart_index = draw_info->restart_index;

                struct panfrost_ptr min_max_ctx_ptr =
                        pan_pool_alloc_aligned(pool,
                                               sizeof(struct min_max_context),
                                               4);
                struct min_max_context *ctx = min_max_ctx_ptr.cpu;

                ctx->min = UINT32_MAX;
                ctx->max = 0;
                inputs.min_max_ctx = min_max_ctx_ptr.gpu;
        }

        unsigned shader_id =
                get_shader_id(draw_info->flags, draw_info->index_size, false);
        const struct pan_indirect_draw_shader *shader =
                &dev->indirect_draw_shaders.shaders[shader_id];
        mali_ptr ubos = get_ubos(pool, &inputs);

        void *invocation =
                pan_section_ptr(job.cpu, COMPUTE_JOB, INVOCATION);
        panfrost_pack_work_groups_compute(invocation,
                                          1, 1, 1, 1, 1, 1,
                                          false, false);

        pan_section_pack(job.cpu, COMPUTE_JOB, PARAMETERS, cfg) {
                cfg.job_task_split = 2;
        }

        pan_section_pack(job.cpu, COMPUTE_JOB, DRAW, cfg) {
                cfg.draw_descriptor_is_64b = true;
                cfg.state = rsd;
                cfg.thread_storage = get_tls(pool->dev);
                cfg.uniform_buffers = ubos;
                cfg.push_uniforms = get_push_uniforms(pool, shader, &inputs);
        }

        unsigned global_dep = draw_info->last_indirect_draw;
        unsigned local_dep =
                panfrost_emit_index_min_max_search(pool, scoreboard, draw_info,
                                                   &inputs, &draw_ctx, ubos);

        if (!ctx->cpu) {
                *ctx = draw_ctx_ptr;
                memcpy(ctx->cpu, &draw_ctx, sizeof(draw_ctx));
        }

        return panfrost_add_job(pool, scoreboard, MALI_JOB_TYPE_COMPUTE,
                                false, true, local_dep, global_dep,
                                &job, false);
}

void
GENX(panfrost_init_indirect_draw_shaders)(struct panfrost_device *dev,
                                          struct pan_pool *bin_pool)
{
        /* We allocate the states and varying_heap BO lazily to avoid
         * reserving memory when indirect draws are not used.
         */
        pthread_mutex_init(&dev->indirect_draw_shaders.lock, NULL);
        dev->indirect_draw_shaders.bin_pool = bin_pool;
}

void
GENX(panfrost_cleanup_indirect_draw_shaders)(struct panfrost_device *dev)
{
        panfrost_bo_unreference(dev->indirect_draw_shaders.states);
        panfrost_bo_unreference(dev->indirect_draw_shaders.varying_heap);
        pthread_mutex_destroy(&dev->indirect_draw_shaders.lock);
}