1 /*
2 * Copyright © 2016-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "broadcom/common/v3d_device_info.h"
25 #include "v3d_compiler.h"
26 #include "compiler/nir/nir_schedule.h"
27 #include "compiler/nir/nir_builder.h"
28 #include "util/perf/cpu_trace.h"
29
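/**
 * Returns the number of sources read by the instruction: branches read
 * none, and ALU instructions report the source count of the add op unless
 * it is a NOP, in which case the mul op's count is used.
 */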
30 int
31 vir_get_nsrc(struct qinst *inst)
32 {
33 switch (inst->qpu.type) {
34 case V3D_QPU_INSTR_TYPE_BRANCH:
35 return 0;
36 case V3D_QPU_INSTR_TYPE_ALU:
37 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
38 return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
39 else
40 return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
41 }
42
43 return 0;
44 }
45
46 /**
47 * Returns whether the instruction has any side effects that must be
48 * preserved.
49 */
50 bool
51 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
52 {
53 switch (inst->qpu.type) {
54 case V3D_QPU_INSTR_TYPE_BRANCH:
55 return true;
56 case V3D_QPU_INSTR_TYPE_ALU:
57 switch (inst->qpu.alu.add.op) {
58 case V3D_QPU_A_SETREVF:
59 case V3D_QPU_A_SETMSF:
60 case V3D_QPU_A_VPMSETUP:
61 case V3D_QPU_A_STVPMV:
62 case V3D_QPU_A_STVPMD:
63 case V3D_QPU_A_STVPMP:
64 case V3D_QPU_A_VPMWT:
65 case V3D_QPU_A_TMUWT:
66 return true;
67 default:
68 break;
69 }
70
71 switch (inst->qpu.alu.mul.op) {
72 case V3D_QPU_M_MULTOP:
73 return true;
74 default:
75 break;
76 }
77 }
78
79 if (inst->qpu.sig.ldtmu ||
80 inst->qpu.sig.ldvary ||
81 inst->qpu.sig.ldtlbu ||
82 inst->qpu.sig.ldtlb ||
83 inst->qpu.sig.wrtmuc ||
84 inst->qpu.sig.thrsw) {
85 return true;
86 }
87
88 /* ldunifa works like ldunif: it reads an element and advances the
89 * pointer, so each read has a side effect (we don't care for ldunif
90 * because we reconstruct the uniform stream buffer after compiling
91 * with the surviving uniforms), so allowing DCE to remove
92 * one would break follow-up loads. We could fix this by emitting a
93 * unifa for each ldunifa, but each unifa requires 3 delay slots
94 * before a ldunifa, so that would be quite expensive.
95 */
96 if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
97 return true;
98
99 return false;
100 }
101
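/**
 * Returns whether the instruction is a MOV/FMOV that copies its source
 * unmodified: no output pack, no input unpack and no condition flags.
 */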
102 bool
103 vir_is_raw_mov(struct qinst *inst)
104 {
105 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
106 (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
107 inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
108 return false;
109 }
110
111 if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
112 inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
113 return false;
114 }
115
116 if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
117 inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
118 inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
119 inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
120 return false;
121 }
122
123 if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
124 inst->qpu.flags.mc != V3D_QPU_COND_NONE)
125 return false;
126
127 return true;
128 }
129
130 bool
131 vir_is_add(struct qinst *inst)
132 {
133 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
134 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
135 }
136
137 bool
138 vir_is_mul(struct qinst *inst)
139 {
140 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
141 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
142 }
143
144 bool
145 vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
146 {
147 if (inst->dst.file == QFILE_MAGIC)
148 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
149
150 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
151 inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
152 return true;
153 }
154
155 return false;
156 }
157
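/**
 * The SFU magic waddrs (RECIP, RSQRT, EXP, LOG, SIN) return their result
 * in r4 on devices with accumulators, so writes to them clobber r4
 * implicitly.
 */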
158 bool
159 vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
160 struct qinst *inst)
161 {
162 if (!devinfo->has_accumulators)
163 return false;
164
165 switch (inst->dst.file) {
166 case QFILE_MAGIC:
167 switch (inst->dst.index) {
168 case V3D_QPU_WADDR_RECIP:
169 case V3D_QPU_WADDR_RSQRT:
170 case V3D_QPU_WADDR_EXP:
171 case V3D_QPU_WADDR_LOG:
172 case V3D_QPU_WADDR_SIN:
173 return true;
174 }
175 break;
176 default:
177 break;
178 }
179
180 return false;
181 }
182
183 void
184 vir_set_unpack(struct qinst *inst, int src,
185 enum v3d_qpu_input_unpack unpack)
186 {
187 assert(src == 0 || src == 1);
188
189 if (vir_is_add(inst)) {
190 if (src == 0)
191 inst->qpu.alu.add.a.unpack = unpack;
192 else
193 inst->qpu.alu.add.b.unpack = unpack;
194 } else {
195 assert(vir_is_mul(inst));
196 if (src == 0)
197 inst->qpu.alu.mul.a.unpack = unpack;
198 else
199 inst->qpu.alu.mul.b.unpack = unpack;
200 }
201 }
202
203 void
204 vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
205 {
206 if (vir_is_add(inst)) {
207 inst->qpu.alu.add.output_pack = pack;
208 } else {
209 assert(vir_is_mul(inst));
210 inst->qpu.alu.mul.output_pack = pack;
211 }
212 }
213
214 void
215 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
216 {
217 if (vir_is_add(inst)) {
218 inst->qpu.flags.ac = cond;
219 } else {
220 assert(vir_is_mul(inst));
221 inst->qpu.flags.mc = cond;
222 }
223 }
224
225 enum v3d_qpu_cond
226 vir_get_cond(struct qinst *inst)
227 {
228 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
229
230 if (vir_is_add(inst))
231 return inst->qpu.flags.ac;
232 else if (vir_is_mul(inst))
233 return inst->qpu.flags.mc;
234 else /* NOP */
235 return V3D_QPU_COND_NONE;
236 }
237
238 void
239 vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
240 {
241 c->flags_temp = -1;
242 if (vir_is_add(inst)) {
243 inst->qpu.flags.apf = pf;
244 } else {
245 assert(vir_is_mul(inst));
246 inst->qpu.flags.mpf = pf;
247 }
248 }
249
250 void
251 vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
252 {
253 c->flags_temp = -1;
254 if (vir_is_add(inst)) {
255 inst->qpu.flags.auf = uf;
256 } else {
257 assert(vir_is_mul(inst));
258 inst->qpu.flags.muf = uf;
259 }
260 }
261
262 #if 0
263 uint8_t
264 vir_channels_written(struct qinst *inst)
265 {
266 if (vir_is_mul(inst)) {
267 switch (inst->dst.pack) {
268 case QPU_PACK_MUL_NOP:
269 case QPU_PACK_MUL_8888:
270 return 0xf;
271 case QPU_PACK_MUL_8A:
272 return 0x1;
273 case QPU_PACK_MUL_8B:
274 return 0x2;
275 case QPU_PACK_MUL_8C:
276 return 0x4;
277 case QPU_PACK_MUL_8D:
278 return 0x8;
279 }
280 } else {
281 switch (inst->dst.pack) {
282 case QPU_PACK_A_NOP:
283 case QPU_PACK_A_8888:
284 case QPU_PACK_A_8888_SAT:
285 case QPU_PACK_A_32_SAT:
286 return 0xf;
287 case QPU_PACK_A_8A:
288 case QPU_PACK_A_8A_SAT:
289 return 0x1;
290 case QPU_PACK_A_8B:
291 case QPU_PACK_A_8B_SAT:
292 return 0x2;
293 case QPU_PACK_A_8C:
294 case QPU_PACK_A_8C_SAT:
295 return 0x4;
296 case QPU_PACK_A_8D:
297 case QPU_PACK_A_8D_SAT:
298 return 0x8;
299 case QPU_PACK_A_16A:
300 case QPU_PACK_A_16A_SAT:
301 return 0x3;
302 case QPU_PACK_A_16B:
303 case QPU_PACK_A_16B_SAT:
304 return 0xc;
305 }
306 }
307 unreachable("Bad pack field");
308 }
309 #endif
310
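/**
 * Allocates a new temporary register, growing the defs and spillable
 * arrays if needed. New temporaries are marked as spillable by default.
 */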
311 struct qreg
312 vir_get_temp(struct v3d_compile *c)
313 {
314 struct qreg reg;
315
316 reg.file = QFILE_TEMP;
317 reg.index = c->num_temps++;
318
319 if (c->num_temps > c->defs_array_size) {
320 uint32_t old_size = c->defs_array_size;
321 c->defs_array_size = MAX2(old_size * 2, 16);
322
323 c->defs = reralloc(c, c->defs, struct qinst *,
324 c->defs_array_size);
325 memset(&c->defs[old_size], 0,
326 sizeof(c->defs[0]) * (c->defs_array_size - old_size));
327
328 c->spillable = reralloc(c, c->spillable,
329 BITSET_WORD,
330 BITSET_WORDS(c->defs_array_size));
331 for (int i = old_size; i < c->defs_array_size; i++)
332 BITSET_SET(c->spillable, i);
333 }
334
335 return reg;
336 }
337
338 struct qinst *
339 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
340 {
341 struct qinst *inst = calloc(1, sizeof(*inst));
342
343 inst->qpu = v3d_qpu_nop();
344 inst->qpu.alu.add.op = op;
345
346 inst->dst = dst;
347 inst->src[0] = src0;
348 inst->src[1] = src1;
349 inst->uniform = ~0;
350
351 inst->ip = -1;
352
353 return inst;
354 }
355
356 struct qinst *
357 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
358 {
359 struct qinst *inst = calloc(1, sizeof(*inst));
360
361 inst->qpu = v3d_qpu_nop();
362 inst->qpu.alu.mul.op = op;
363
364 inst->dst = dst;
365 inst->src[0] = src0;
366 inst->src[1] = src1;
367 inst->uniform = ~0;
368
369 inst->ip = -1;
370
371 return inst;
372 }
373
374 struct qinst *
375 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
376 {
377 struct qinst *inst = calloc(1, sizeof(*inst));
378
379 inst->qpu = v3d_qpu_nop();
380 inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
381 inst->qpu.branch.cond = cond;
382 inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
383 inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
384 inst->qpu.branch.ub = true;
385 inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
386
387 inst->dst = vir_nop_reg();
388 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
389
390 inst->ip = -1;
391
392 return inst;
393 }
394
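/* Inserts the instruction at the current cursor position, moves the cursor
 * past it and invalidates any cached liveness information.
 */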
395 static void
396 vir_emit(struct v3d_compile *c, struct qinst *inst)
397 {
398 inst->ip = -1;
399
400 switch (c->cursor.mode) {
401 case vir_cursor_add:
402 list_add(&inst->link, c->cursor.link);
403 break;
404 case vir_cursor_addtail:
405 list_addtail(&inst->link, c->cursor.link);
406 break;
407 }
408
409 c->cursor = vir_after_inst(inst);
410 c->live_intervals_valid = false;
411 }
412
413 /* Updates inst to write to a new temporary, emits it, and notes the def. */
414 struct qreg
415 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
416 {
417 assert(inst->dst.file == QFILE_NULL);
418
419 /* If we're emitting an instruction that's a def, it had better be
420 * writing a register.
421 */
422 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
423 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
424 v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
425 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
426 v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
427 }
428
429 inst->dst = vir_get_temp(c);
430
431 if (inst->dst.file == QFILE_TEMP)
432 c->defs[inst->dst.index] = inst;
433
434 vir_emit(c, inst);
435
436 return inst->dst;
437 }
438
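/* Emits an instruction that writes to an already-existing register (or to
 * no temporary at all); since the destination may now have multiple
 * writers, any def recorded for it is cleared.
 */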
439 struct qinst *
440 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
441 {
442 if (inst->dst.file == QFILE_TEMP)
443 c->defs[inst->dst.index] = NULL;
444
445 vir_emit(c, inst);
446
447 return inst;
448 }
449
450 struct qblock *
451 vir_new_block(struct v3d_compile *c)
452 {
453 struct qblock *block = rzalloc(c, struct qblock);
454
455 list_inithead(&block->instructions);
456
457 block->predecessors = _mesa_set_create(block,
458 _mesa_hash_pointer,
459 _mesa_key_pointer_equal);
460
461 block->index = c->next_block_index++;
462
463 return block;
464 }
465
466 void
467 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
468 {
469 c->cur_block = block;
470 c->cursor = vir_after_block(block);
471 list_addtail(&block->link, &c->blocks);
472 }
473
474 struct qblock *
475 vir_entry_block(struct v3d_compile *c)
476 {
477 return list_first_entry(&c->blocks, struct qblock, link);
478 }
479
480 struct qblock *
481 vir_exit_block(struct v3d_compile *c)
482 {
483 return list_last_entry(&c->blocks, struct qblock, link);
484 }
485
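/* Records a CFG edge: adds the predecessor to the successor's predecessor
 * set and stores the successor in one of the predecessor's two successor
 * slots.
 */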
486 void
487 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
488 {
489 _mesa_set_add(successor->predecessors, predecessor);
490 if (predecessor->successors[0]) {
491 assert(!predecessor->successors[1]);
492 predecessor->successors[1] = successor;
493 } else {
494 predecessor->successors[0] = successor;
495 }
496 }
497
498 const struct v3d_compiler *
499 v3d_compiler_init(const struct v3d_device_info *devinfo,
500 uint32_t max_inline_uniform_buffers)
501 {
502 struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
503 if (!compiler)
504 return NULL;
505
506 compiler->devinfo = devinfo;
507 compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
508
509 if (!vir_init_reg_sets(compiler)) {
510 ralloc_free(compiler);
511 return NULL;
512 }
513
514 return compiler;
515 }
516
517 void
518 v3d_compiler_free(const struct v3d_compiler *compiler)
519 {
520 ralloc_free((void *)compiler);
521 }
522
523 struct v3d_compiler_strategy {
524 const char *name;
525 uint32_t max_threads;
526 uint32_t min_threads;
527 bool disable_general_tmu_sched;
528 bool disable_gcm;
529 bool disable_loop_unrolling;
530 bool disable_ubo_load_sorting;
531 bool move_buffer_loads;
532 bool disable_tmu_pipelining;
533 uint32_t max_tmu_spills;
534 };
535
536 static struct v3d_compile *
537 vir_compile_init(const struct v3d_compiler *compiler,
538 struct v3d_key *key,
539 nir_shader *s,
540 void (*debug_output)(const char *msg,
541 void *debug_output_data),
542 void *debug_output_data,
543 int program_id, int variant_id,
544 uint32_t compile_strategy_idx,
545 const struct v3d_compiler_strategy *strategy,
546 bool fallback_scheduler)
547 {
548 struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
549
550 c->compiler = compiler;
551 c->devinfo = compiler->devinfo;
552 c->key = key;
553 c->program_id = program_id;
554 c->variant_id = variant_id;
555 c->compile_strategy_idx = compile_strategy_idx;
556 c->threads = strategy->max_threads;
557 c->debug_output = debug_output;
558 c->debug_output_data = debug_output_data;
559 c->compilation_result = V3D_COMPILATION_SUCCEEDED;
560 c->min_threads_for_reg_alloc = strategy->min_threads;
561 c->max_tmu_spills = strategy->max_tmu_spills;
562 c->fallback_scheduler = fallback_scheduler;
563 c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
564 c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
565 c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
566 c->move_buffer_loads = strategy->move_buffer_loads;
567 c->disable_gcm = strategy->disable_gcm;
568 c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
569 ? true : strategy->disable_loop_unrolling;
570
571
572 s = nir_shader_clone(c, s);
573 c->s = s;
574
575 list_inithead(&c->blocks);
576 vir_set_emit_block(c, vir_new_block(c));
577
578 c->output_position_index = -1;
579 c->output_sample_mask_index = -1;
580
581 c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
582 _mesa_key_pointer_equal);
583
584 c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
585 c->flags_temp = -1;
586
587 return c;
588 }
589
590 static int
591 type_size_vec4(const struct glsl_type *type, bool bindless)
592 {
593 return glsl_count_attribute_slots(type, false);
594 }
595
596 static enum nir_lower_tex_packing
597 lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
598 {
599 struct v3d_compile *c = (struct v3d_compile *) data;
600
601 int sampler_index = nir_tex_instr_need_sampler(tex) ?
602 tex->sampler_index : tex->backend_flags;
603
604 assert(sampler_index < c->key->num_samplers_used);
605 return c->key->sampler[sampler_index].return_size == 16 ?
606 nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
607 }
608
609 static bool
610 v3d_nir_lower_null_pointers_cb(nir_builder *b,
611 nir_intrinsic_instr *intr,
612 void *_state)
613 {
614 uint32_t buffer_src_idx;
615
616 switch (intr->intrinsic) {
617 case nir_intrinsic_load_ubo:
618 case nir_intrinsic_load_ssbo:
619 buffer_src_idx = 0;
620 break;
621 case nir_intrinsic_store_ssbo:
622 buffer_src_idx = 1;
623 break;
624 default:
625 return false;
626 }
627
628 /* If the index is constant we are good */
629 nir_src *src = &intr->src[buffer_src_idx];
630 if (nir_src_is_const(*src))
631 return false;
632
633 /* Otherwise, see if it comes from a bcsel including a null pointer */
634 if (src->ssa->parent_instr->type != nir_instr_type_alu)
635 return false;
636
637 nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
638 if (alu->op != nir_op_bcsel)
639 return false;
640
641 /* A null pointer is specified using block index 0xffffffff */
642 int32_t null_src_idx = -1;
643 for (int i = 1; i < 3; i++) {
644 /* FIXME: since we are running this before optimization, maybe
645 * we also need to handle the case where we have a chain of
646 * bcsels that we would need to recurse into?
647 */
648 if (!nir_src_is_const(alu->src[i].src))
649 continue;
650 if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
651 continue;
652
653 /* One of the bcsel srcs is a null pointer reference */
654 null_src_idx = i;
655 break;
656 }
657
658 if (null_src_idx < 0)
659 return false;
660
661 assert(null_src_idx == 1 || null_src_idx == 2);
662 int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;
663
664 /* Rewrite the null pointer reference so we use the same buffer index
665 * as the other bcsel branch. This will allow optimization to remove
666 * the bcsel and we should then end up with a constant buffer index
667 * like we need.
668 */
669 b->cursor = nir_before_instr(&alu->instr);
670 nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
671 nir_src_rewrite(&alu->src[null_src_idx].src, copy);
672
673 return true;
674 }
675
676 static bool
677 v3d_nir_lower_null_pointers(nir_shader *s)
678 {
679 return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
680 nir_metadata_control_flow, NULL);
681 }
682
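/**
 * Callback for nir_lower_bit_size: ALU ops not listed below only operate
 * on 32-bit (or 1-bit boolean) sources, so request lowering to 32 bits for
 * smaller bit sizes.
 */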
683 static unsigned
684 lower_bit_size_cb(const nir_instr *instr, void *_data)
685 {
686 if (instr->type != nir_instr_type_alu)
687 return 0;
688
689 nir_alu_instr *alu = nir_instr_as_alu(instr);
690
691 switch (alu->op) {
692 case nir_op_mov:
693 case nir_op_vec2:
694 case nir_op_vec3:
695 case nir_op_vec4:
696 case nir_op_vec5:
697 case nir_op_vec8:
698 case nir_op_vec16:
699 case nir_op_b2i8:
700 case nir_op_b2f16:
701 case nir_op_b2i16:
702 case nir_op_b2f32:
703 case nir_op_b2i32:
704 case nir_op_f2f16:
705 case nir_op_f2f16_rtne:
706 case nir_op_f2f16_rtz:
707 case nir_op_f2f32:
708 case nir_op_f2i32:
709 case nir_op_f2u32:
710 case nir_op_i2i8:
711 case nir_op_i2i16:
712 case nir_op_i2f16:
713 case nir_op_i2f32:
714 case nir_op_i2i32:
715 case nir_op_u2u8:
716 case nir_op_u2u16:
717 case nir_op_u2f16:
718 case nir_op_u2f32:
719 case nir_op_u2u32:
720 case nir_op_pack_32_2x16_split:
721 case nir_op_pack_32_4x8_split:
722 case nir_op_pack_half_2x16_split:
723 return 0;
724
725 /* We need to handle the remaining ops here, as they only work with 32 bits */
726 default:
727 if (alu->src[0].src.ssa->bit_size != 1 && alu->src[0].src.ssa->bit_size < 32)
728 return 32;
729 return 0;
730 }
731 }
732
733 static void
734 v3d_lower_nir(struct v3d_compile *c)
735 {
736 struct nir_lower_tex_options tex_options = {
737 .lower_txd = true,
738 .lower_tg4_offsets = true,
739 .lower_tg4_broadcom_swizzle = true,
740
741 .lower_rect = false, /* XXX: Use this on V3D 3.x */
742 .lower_txp = ~0,
743 /* Apply swizzles to all samplers. */
744 .swizzle_result = ~0,
745 .lower_invalid_implicit_lod = true,
746 };
747
748 /* Lower the format swizzle and (for 32-bit returns)
749 * ARB_texture_swizzle-style swizzle.
750 */
751 assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
752 for (int i = 0; i < c->key->num_tex_used; i++) {
753 for (int j = 0; j < 4; j++)
754 tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
755 }
756
757 tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
758 tex_options.lower_tex_packing_data = c;
759
760 NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
761 NIR_PASS(_, c->s, nir_lower_system_values);
762
763 if (c->s->info.zero_initialize_shared_memory &&
764 c->s->info.shared_size > 0) {
765 /* All our BOs allocate full pages, so the underlying allocation
766 * for shared memory will always be a multiple of 4KB. This
767 * ensures that we can do an exact number of full chunk_size
768 * writes to initialize the memory independently of the actual
769 * shared_size used by the shader, which is a requirement of
770 * the initialization pass.
771 */
772 const unsigned chunk_size = 16; /* max single store size */
773 NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
774 align(c->s->info.shared_size, chunk_size), chunk_size);
775 }
776
777 NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
778 NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
779 NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
780 NIR_PASS(_, c->s, nir_lower_bit_size, lower_bit_size_cb, NULL);
781 }
782
783 static void
784 v3d_set_prog_data_uniforms(struct v3d_compile *c,
785 struct v3d_prog_data *prog_data)
786 {
787 int count = c->num_uniforms;
788 struct v3d_uniform_list *ulist = &prog_data->uniforms;
789
790 ulist->count = count;
791 ulist->data = ralloc_array(prog_data, uint32_t, count);
792 memcpy(ulist->data, c->uniform_data,
793 count * sizeof(*ulist->data));
794 ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
795 memcpy(ulist->contents, c->uniform_contents,
796 count * sizeof(*ulist->contents));
797 }
798
799 static void
800 v3d_vs_set_prog_data(struct v3d_compile *c,
801 struct v3d_vs_prog_data *prog_data)
802 {
803 /* The vertex data gets format converted by the VPM so that
804 * each attribute channel takes up a VPM column. Precompute
805 * the sizes for the shader record.
806 */
807 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
808 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
809 prog_data->vpm_input_size += c->vattr_sizes[i];
810 }
811
812 memset(prog_data->driver_location_map, -1,
813 sizeof(prog_data->driver_location_map));
814
815 nir_foreach_shader_in_variable(var, c->s) {
816 prog_data->driver_location_map[var->data.location] =
817 var->data.driver_location;
818 }
819
820 prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
821 SYSTEM_VALUE_VERTEX_ID) ||
822 BITSET_TEST(c->s->info.system_values_read,
823 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
824
825 prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
826 SYSTEM_VALUE_BASE_INSTANCE);
827
828 prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
829 SYSTEM_VALUE_INSTANCE_ID) ||
830 BITSET_TEST(c->s->info.system_values_read,
831 SYSTEM_VALUE_INSTANCE_INDEX);
832
833 if (prog_data->uses_vid)
834 prog_data->vpm_input_size++;
835 if (prog_data->uses_biid)
836 prog_data->vpm_input_size++;
837 if (prog_data->uses_iid)
838 prog_data->vpm_input_size++;
839
840 prog_data->writes_psiz =
841 c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
842
843 /* Input/output segment size are in sectors (8 rows of 32 bits per
844 * channel).
845 */
846 prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
847 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
848
849 /* Set us up for shared input/output segments. This is apparently
850 * necessary for our VCM setup to avoid varying corruption.
851 *
852 * FIXME: initial testing on V3D 7.1 seems to work fine when using
853 * separate segments. So we could try to reevaluate in the future, if
854 * there is any advantage of using separate segments.
855 */
856 prog_data->separate_segments = false;
857 prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
858 prog_data->vpm_input_size);
859 prog_data->vpm_input_size = 0;
860
861 /* Compute VCM cache size. We set up our program to take up less than
862 * half of the VPM, so that any set of bin and render programs won't
863 * run out of space. We need space for at least one input segment,
864 * and then allocate the rest to output segments (one for the current
865 * program, the rest to VCM). The valid range of the VCM cache size
866 * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
867 * batches.
868 */
869 assert(c->devinfo->vpm_size);
870 int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
871 int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
872 int half_vpm = vpm_size_in_sectors / 2;
873 int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
874 int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
875 assert(vpm_output_batches >= 2);
876 prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
877 }
878
879 static void
880 v3d_gs_set_prog_data(struct v3d_compile *c,
881 struct v3d_gs_prog_data *prog_data)
882 {
883 prog_data->num_inputs = c->num_inputs;
884 memcpy(prog_data->input_slots, c->input_slots,
885 c->num_inputs * sizeof(*c->input_slots));
886
887 /* gl_PrimitiveIdIn is written by the GBG into the first word of the
888 * VPM output header automatically and the shader will overwrite
889 * it after reading it if necessary, so it doesn't add to the VPM
890 * size requirements.
891 */
892 prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
893 SYSTEM_VALUE_PRIMITIVE_ID);
894
895 /* Output segment size is in sectors (8 rows of 32 bits per channel) */
896 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
897
898 /* Compute SIMD dispatch width and update VPM output size accordingly
899 * to ensure we can fit our program in memory. Available widths are
900 * 16, 8, 4, 1.
901 *
902 * Notice that at draw time we will have to consider VPM memory
903 * requirements from other stages and choose a smaller dispatch
904 * width if needed to fit the program in VPM memory.
905 */
906 prog_data->simd_width = 16;
907 while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
908 prog_data->simd_width == 2) {
909 prog_data->simd_width >>= 1;
910 prog_data->vpm_output_size =
911 align(prog_data->vpm_output_size, 2) / 2;
912 }
913 assert(prog_data->vpm_output_size <= 16);
914 assert(prog_data->simd_width != 2);
915
916 prog_data->out_prim_type = c->s->info.gs.output_primitive;
917 prog_data->num_invocations = c->s->info.gs.invocations;
918
919 prog_data->writes_psiz =
920 c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
921 }
922
923 static void
924 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
925 struct v3d_fs_prog_data *prog_data)
926 {
927 prog_data->num_inputs = c->num_inputs;
928 memcpy(prog_data->input_slots, c->input_slots,
929 c->num_inputs * sizeof(*c->input_slots));
930
931 STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
932 (V3D_MAX_FS_INPUTS - 1) / 24);
933 for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
934 if (BITSET_TEST(c->flat_shade_flags, i))
935 prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
936
937 if (BITSET_TEST(c->noperspective_flags, i))
938 prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
939
940 if (BITSET_TEST(c->centroid_flags, i))
941 prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
942 }
943 }
944
945 static void
946 v3d_fs_set_prog_data(struct v3d_compile *c,
947 struct v3d_fs_prog_data *prog_data)
948 {
949 v3d_set_fs_prog_data_inputs(c, prog_data);
950 prog_data->writes_z = c->writes_z;
951 prog_data->writes_z_from_fep = c->writes_z_from_fep;
952 prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
953 prog_data->uses_center_w = c->uses_center_w;
954 prog_data->uses_implicit_point_line_varyings =
955 c->uses_implicit_point_line_varyings;
956 prog_data->lock_scoreboard_on_first_thrsw =
957 c->lock_scoreboard_on_first_thrsw;
958 prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
959 prog_data->uses_pid = c->fs_uses_primitive_id;
960 }
961
962 static void
963 v3d_cs_set_prog_data(struct v3d_compile *c,
964 struct v3d_compute_prog_data *prog_data)
965 {
966 prog_data->shared_size = c->s->info.shared_size;
967
968 prog_data->local_size[0] = c->s->info.workgroup_size[0];
969 prog_data->local_size[1] = c->s->info.workgroup_size[1];
970 prog_data->local_size[2] = c->s->info.workgroup_size[2];
971
972 prog_data->has_subgroups = c->has_subgroups;
973 }
974
975 static void
976 v3d_set_prog_data(struct v3d_compile *c,
977 struct v3d_prog_data *prog_data)
978 {
979 prog_data->threads = c->threads;
980 prog_data->single_seg = !c->last_thrsw;
981 prog_data->spill_size = c->spill_size;
982 prog_data->tmu_spills = c->spills;
983 prog_data->tmu_fills = c->fills;
984 prog_data->tmu_count = c->tmu.total_count;
985 prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
986 prog_data->compile_strategy_idx = c->compile_strategy_idx;
987 prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
988 prog_data->has_control_barrier = c->s->info.uses_control_barrier;
989 prog_data->has_global_address = c->has_global_address;
990
991 v3d_set_prog_data_uniforms(c, prog_data);
992
993 switch (c->s->info.stage) {
994 case MESA_SHADER_VERTEX:
995 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
996 break;
997 case MESA_SHADER_GEOMETRY:
998 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
999 break;
1000 case MESA_SHADER_FRAGMENT:
1001 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
1002 break;
1003 case MESA_SHADER_COMPUTE:
1004 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
1005 break;
1006 default:
1007 unreachable("unsupported shader stage");
1008 }
1009 }
1010
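/* Copies the generated QPU instructions into a malloc'ed buffer owned by
 * the caller and destroys the compile context.
 */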
1011 static uint64_t *
1012 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
1013 {
1014 *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
1015
1016 uint64_t *qpu_insts = malloc(*final_assembly_size);
1017 if (!qpu_insts)
1018 return NULL;
1019
1020 memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
1021
1022 vir_compile_destroy(c);
1023
1024 return qpu_insts;
1025 }
1026
1027 static void
1028 v3d_nir_lower_vs_early(struct v3d_compile *c)
1029 {
1030 /* Split our I/O vars and dead code eliminate the unused
1031 * components.
1032 */
1033 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1034 nir_var_shader_in | nir_var_shader_out);
1035 uint64_t used_outputs[4] = {0};
1036 for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
1037 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
1038 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
1039 used_outputs[comp] |= 1ull << slot;
1040 }
1041 NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1042 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1043 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1044 v3d_optimize_nir(c, c->s);
1045 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1046
1047 /* This must go before nir_lower_io */
1048 if (c->vs_key->per_vertex_point_size)
1049 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1050
1051 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1052 type_size_vec4,
1053 (nir_lower_io_options)0);
1054 /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1055 * on the code it generated.
1056 */
1057 NIR_PASS(_, c->s, nir_opt_dce);
1058 NIR_PASS(_, c->s, nir_opt_constant_folding);
1059 }
1060
1061 static void
1062 v3d_nir_lower_gs_early(struct v3d_compile *c)
1063 {
1064 /* Split our I/O vars and dead code eliminate the unused
1065 * components.
1066 */
1067 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1068 nir_var_shader_in | nir_var_shader_out);
1069 uint64_t used_outputs[4] = {0};
1070 for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
1071 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
1072 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
1073 used_outputs[comp] |= 1ull << slot;
1074 }
1075 NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1076 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1077 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1078 v3d_optimize_nir(c, c->s);
1079 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1080
1081 /* This must go before nir_lower_io */
1082 if (c->gs_key->per_vertex_point_size)
1083 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1084
1085 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1086 type_size_vec4,
1087 (nir_lower_io_options)0);
1088 /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1089 * on the code it generated.
1090 */
1091 NIR_PASS(_, c->s, nir_opt_dce);
1092 NIR_PASS(_, c->s, nir_opt_constant_folding);
1093 }
1094
1095 static void
1096 v3d_fixup_fs_output_types(struct v3d_compile *c)
1097 {
1098 nir_foreach_shader_out_variable(var, c->s) {
1099 uint32_t mask = 0;
1100
1101 switch (var->data.location) {
1102 case FRAG_RESULT_COLOR:
1103 mask = ~0;
1104 break;
1105 case FRAG_RESULT_DATA0:
1106 case FRAG_RESULT_DATA1:
1107 case FRAG_RESULT_DATA2:
1108 case FRAG_RESULT_DATA3:
1109 mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
1110 break;
1111 }
1112
1113 if (c->fs_key->int_color_rb & mask) {
1114 var->type =
1115 glsl_vector_type(GLSL_TYPE_INT,
1116 glsl_get_components(var->type));
1117 } else if (c->fs_key->uint_color_rb & mask) {
1118 var->type =
1119 glsl_vector_type(GLSL_TYPE_UINT,
1120 glsl_get_components(var->type));
1121 }
1122 }
1123 }
1124
1125 static void
1126 v3d_nir_lower_fs_early(struct v3d_compile *c)
1127 {
1128 if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
1129 v3d_fixup_fs_output_types(c);
1130
1131 NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);
1132
1133 if (c->fs_key->line_smoothing) {
1134 NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
1135 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1136 /* The lowering pass can introduce new sysval reads */
1137 nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
1138 }
1139 }
1140
1141 static void
1142 v3d_nir_lower_gs_late(struct v3d_compile *c)
1143 {
1144 if (c->key->ucp_enables) {
1145 NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
1146 true, NULL);
1147 }
1148
1149 /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
1150 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1151 }
1152
1153 static void
1154 v3d_nir_lower_vs_late(struct v3d_compile *c)
1155 {
1156 if (c->key->ucp_enables) {
1157 NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
1158 false, true, NULL);
1159 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1160 nir_var_shader_out, NULL, NULL);
1161 }
1162
1163 /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1164 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1165 }
1166
1167 static void
1168 v3d_nir_lower_fs_late(struct v3d_compile *c)
1169 {
1170 /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
1171 * Vulkan allows it, in which case the SPIR-V compiler will declare
1172 * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
1173 * the last parameter to always operate with a compact array in both
1174 * OpenGL and Vulkan so we don't have to care about the API we
1175 * are using.
1176 */
1177 if (c->key->ucp_enables)
1178 NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true, false);
1179
1180 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
1181 }
1182
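/* Returns the maximum number of temporaries live at any instruction, i.e.
 * the peak register pressure of the program.
 */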
1183 static uint32_t
1184 vir_get_max_temps(struct v3d_compile *c)
1185 {
1186 int max_ip = 0;
1187 vir_for_each_inst_inorder(inst, c)
1188 max_ip++;
1189
1190 uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1191
1192 for (int t = 0; t < c->num_temps; t++) {
1193 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1194 i < max_ip); i++) {
1195 if (i > max_ip)
1196 break;
1197 pressure[i]++;
1198 }
1199 }
1200
1201 uint32_t max_temps = 0;
1202 for (int i = 0; i < max_ip; i++)
1203 max_temps = MAX2(max_temps, pressure[i]);
1204
1205 ralloc_free(pressure);
1206
1207 return max_temps;
1208 }
1209
1210 enum v3d_dependency_class {
1211 V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
1212 };
1213
1214 static bool
1215 v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
1216 nir_schedule_dependency *dep,
1217 void *user_data)
1218 {
1219 struct v3d_compile *c = user_data;
1220
1221 switch (intr->intrinsic) {
1222 case nir_intrinsic_store_output:
1223 /* Writing to location 0 overwrites the value passed in for
1224 * gl_PrimitiveID on geometry shaders
1225 */
1226 if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
1227 nir_intrinsic_base(intr) != 0)
1228 break;
1229
1230 nir_const_value *const_value =
1231 nir_src_as_const_value(intr->src[1]);
1232
1233 if (const_value == NULL)
1234 break;
1235
1236 uint64_t offset =
1237 nir_const_value_as_uint(*const_value,
1238 nir_src_bit_size(intr->src[1]));
1239 if (offset != 0)
1240 break;
1241
1242 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1243 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
1244 return true;
1245
1246 case nir_intrinsic_load_primitive_id:
1247 if (c->s->info.stage != MESA_SHADER_GEOMETRY)
1248 break;
1249
1250 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1251 dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
1252 return true;
1253
1254 default:
1255 break;
1256 }
1257
1258 return false;
1259 }
1260
1261 static unsigned
1262 v3d_instr_delay_cb(nir_instr *instr, void *data)
1263 {
1264 struct v3d_compile *c = (struct v3d_compile *) data;
1265
1266 switch (instr->type) {
1267 case nir_instr_type_undef:
1268 case nir_instr_type_load_const:
1269 case nir_instr_type_alu:
1270 case nir_instr_type_deref:
1271 case nir_instr_type_jump:
1272 case nir_instr_type_parallel_copy:
1273 case nir_instr_type_call:
1274 case nir_instr_type_phi:
1275 return 1;
1276
1277 /* We should not use very large delays for TMU instructions. Typically,
1278 * thread switches will be sufficient to hide all or most of the latency,
1279 * so we typically only need a little bit of extra room. If we over-estimate
1280 * the latency here we may end up unnecessarily delaying the critical path in
1281 * the shader, which would have a negative effect in performance, so here
1282 * we are trying to strike a balance based on empirical testing.
1283 */
1284 case nir_instr_type_intrinsic: {
1285 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1286 if (!c->disable_general_tmu_sched) {
1287 switch (intr->intrinsic) {
1288 case nir_intrinsic_decl_reg:
1289 case nir_intrinsic_load_reg:
1290 case nir_intrinsic_store_reg:
1291 return 0;
1292 case nir_intrinsic_load_ssbo:
1293 case nir_intrinsic_load_scratch:
1294 case nir_intrinsic_load_shared:
1295 case nir_intrinsic_image_load:
1296 return 3;
1297 case nir_intrinsic_load_ubo:
1298 if (nir_src_is_divergent(&intr->src[1]))
1299 return 3;
1300 FALLTHROUGH;
1301 default:
1302 return 1;
1303 }
1304 } else {
1305 switch (intr->intrinsic) {
1306 case nir_intrinsic_decl_reg:
1307 case nir_intrinsic_load_reg:
1308 case nir_intrinsic_store_reg:
1309 return 0;
1310 default:
1311 return 1;
1312 }
1313 }
1314 break;
1315 }
1316
1317 case nir_instr_type_tex:
1318 return 5;
1319
1320 case nir_instr_type_debug_info:
1321 return 0;
1322 }
1323
1324 return 0;
1325 }
1326
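/* Callback for nir_lower_wrmasks: request splitting of write masks for
 * stores to memory (SSBO, shared, global and scratch).
 */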
1327 static bool
1328 should_split_wrmask(const nir_instr *instr, const void *data)
1329 {
1330 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1331 switch (intr->intrinsic) {
1332 case nir_intrinsic_store_ssbo:
1333 case nir_intrinsic_store_shared:
1334 case nir_intrinsic_store_global:
1335 case nir_intrinsic_store_scratch:
1336 return true;
1337 default:
1338 return false;
1339 }
1340 }
1341
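/* Returns the intrinsic if the instruction is a load_ubo with a constant
 * offset (the UBO index is expected to be constant already), NULL
 * otherwise.
 */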
1342 static nir_intrinsic_instr *
1343 nir_instr_as_constant_ubo_load(nir_instr *inst)
1344 {
1345 if (inst->type != nir_instr_type_intrinsic)
1346 return NULL;
1347
1348 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1349 if (intr->intrinsic != nir_intrinsic_load_ubo)
1350 return NULL;
1351
1352 assert(nir_src_is_const(intr->src[0]));
1353 if (!nir_src_is_const(intr->src[1]))
1354 return NULL;
1355
1356 return intr;
1357 }
1358
1359 static bool
1360 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1361 {
1362 bool progress = false;
1363
1364 nir_instr *ref_inst = &ref->instr;
1365 uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1366 uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1367
1368 /* Go through all instructions after ref searching for constant UBO
1369 * loads for the same UBO index.
1370 */
1371 bool seq_break = false;
1372 nir_instr *inst = &ref->instr;
1373 nir_instr *next_inst = NULL;
1374 while (true) {
1375 inst = next_inst ? next_inst : nir_instr_next(inst);
1376 if (!inst)
1377 break;
1378
1379 next_inst = NULL;
1380
1381 if (inst->type != nir_instr_type_intrinsic)
1382 continue;
1383
1384 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1385 if (intr->intrinsic != nir_intrinsic_load_ubo)
1386 continue;
1387
1388 /* We only produce unifa sequences for non-divergent loads */
1389 if (nir_src_is_divergent(&intr->src[1]))
1390 continue;
1391
1392 /* If there are any UBO loads that are not constant or that
1393 * use a different UBO index in between the reference load and
1394 * any other constant load for the same index, they would break
1395 * the unifa sequence. We will flag that so we can then move
1396 * all constant UBO loads for the reference index before these
1397 * and not just the ones that are not ordered to avoid breaking
1398 * the sequence and reduce unifa writes.
1399 */
1400 if (!nir_src_is_const(intr->src[1])) {
1401 seq_break = true;
1402 continue;
1403 }
1404 uint32_t offset = nir_src_as_uint(intr->src[1]);
1405
1406 assert(nir_src_is_const(intr->src[0]));
1407 uint32_t index = nir_src_as_uint(intr->src[0]);
1408 if (index != ref_index) {
1409 seq_break = true;
1410 continue;
1411 }
1412
1413 /* Only move loads with an offset that is close enough to the
1414 * reference offset, since otherwise we would not be able to
1415 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1416 */
1417 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
1418 continue;
1419
1420 /* We will move this load if its offset is smaller than ref's
1421 * (in which case we will move it before ref) or if the offset
1422 * is larger than ref's but there are sequence breakers
1423 * in between (in which case we will move it after ref and
1424 * before the sequence breakers).
1425 */
1426 if (!seq_break && offset >= ref_offset)
1427 continue;
1428
1429 /* Find where exactly we want to move this load:
1430 *
1431 * If we are moving it before ref, we want to check any other
1432 * UBO loads we placed before ref and make sure we insert this
1433 * one properly ordered with them. Likewise, if we are moving
1434 * it after ref.
1435 */
1436 nir_instr *pos = ref_inst;
1437 nir_instr *tmp = pos;
1438 do {
1439 if (offset < ref_offset)
1440 tmp = nir_instr_prev(tmp);
1441 else
1442 tmp = nir_instr_next(tmp);
1443
1444 if (!tmp || tmp == inst)
1445 break;
1446
1447 /* Ignore non-unifa UBO loads */
1448 if (tmp->type != nir_instr_type_intrinsic)
1449 continue;
1450
1451 nir_intrinsic_instr *tmp_intr =
1452 nir_instr_as_intrinsic(tmp);
1453 if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1454 continue;
1455
1456 if (nir_src_is_divergent(&tmp_intr->src[1]))
1457 continue;
1458
1459 /* Stop if we find a unifa UBO load that breaks the
1460 * sequence.
1461 */
1462 if (!nir_src_is_const(tmp_intr->src[1]))
1463 break;
1464
1465 if (nir_src_as_uint(tmp_intr->src[0]) != index)
1466 break;
1467
1468 uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1469 if (offset < ref_offset) {
1470 if (tmp_offset < offset ||
1471 tmp_offset >= ref_offset) {
1472 break;
1473 } else {
1474 pos = tmp;
1475 }
1476 } else {
1477 if (tmp_offset > offset ||
1478 tmp_offset <= ref_offset) {
1479 break;
1480 } else {
1481 pos = tmp;
1482 }
1483 }
1484 } while (true);
1485
1486 /* We can't move the UBO load before the instruction that
1487 * defines its constant offset. If that instruction is placed
1488 * in between the new location (pos) and the current location
1489 * of this load, we will have to move that instruction too.
1490 *
1491 * We don't care about the UBO index definition because that
1492 * is optimized to be reused by all UBO loads for the same
1493 * index and therefore is certain to be defined before the
1494 * first UBO load that uses it.
1495 */
1496 nir_instr *offset_inst = NULL;
1497 tmp = inst;
1498 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1499 if (pos == tmp) {
1500 /* We reached the target location without
1501 * finding the instruction that defines the
1502 * offset, so that instruction must be before
1503 * the new position and we don't have to fix it.
1504 */
1505 break;
1506 }
1507 if (intr->src[1].ssa->parent_instr == tmp) {
1508 offset_inst = tmp;
1509 break;
1510 }
1511 }
1512
1513 if (offset_inst) {
1514 exec_node_remove(&offset_inst->node);
1515 exec_node_insert_node_before(&pos->node,
1516 &offset_inst->node);
1517 }
1518
1519 /* Since we are moving the instruction before its current
1520 * location, grab its successor before the move so that
1521 * we can continue the next iteration of the main loop from
1522 * that instruction.
1523 */
1524 next_inst = nir_instr_next(inst);
1525
1526 /* Move this load to the selected location */
1527 exec_node_remove(&inst->node);
1528 if (offset < ref_offset)
1529 exec_node_insert_node_before(&pos->node, &inst->node);
1530 else
1531 exec_node_insert_after(&pos->node, &inst->node);
1532
1533 progress = true;
1534 }
1535
1536 return progress;
1537 }
1538
1539 static bool
1540 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1541 nir_block *block)
1542 {
1543 bool progress = false;
1544 bool local_progress;
1545 do {
1546 local_progress = false;
1547 nir_foreach_instr_safe(inst, block) {
1548 nir_intrinsic_instr *intr =
1549 nir_instr_as_constant_ubo_load(inst);
1550 if (intr) {
1551 local_progress |=
1552 v3d_nir_sort_constant_ubo_load(block, intr);
1553 }
1554 }
1555 progress |= local_progress;
1556 } while (local_progress);
1557
1558 return progress;
1559 }
1560
1561 /**
1562 * Sorts constant UBO loads in each block by offset to maximize chances of
1563 * skipping unifa writes when converting to VIR. This can increase register
1564 * pressure.
1565 */
1566 static bool
1567 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1568 {
1569 nir_foreach_function_impl(impl, s) {
1570 nir_foreach_block(block, impl) {
1571 c->sorted_any_ubo_loads |=
1572 v3d_nir_sort_constant_ubo_loads_block(c, block);
1573 }
1574 nir_metadata_preserve(impl,
1575 nir_metadata_control_flow);
1576 }
1577 return c->sorted_any_ubo_loads;
1578 }
1579
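/* Replaces load_num_subgroups with a constant computed from the workgroup
 * size, since subgroups always have V3D_CHANNELS invocations.
 */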
1580 static void
1581 lower_load_num_subgroups(struct v3d_compile *c,
1582 nir_builder *b,
1583 nir_intrinsic_instr *intr)
1584 {
1585 assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1586 assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1587
1588 b->cursor = nir_after_instr(&intr->instr);
1589 uint32_t num_subgroups =
1590 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1591 c->s->info.workgroup_size[1] *
1592 c->s->info.workgroup_size[2], V3D_CHANNELS);
1593 nir_def *result = nir_imm_int(b, num_subgroups);
1594 nir_def_replace(&intr->def, result);
1595 }
1596
1597 static bool
1598 lower_subgroup_intrinsics(struct v3d_compile *c,
1599 nir_block *block, nir_builder *b)
1600 {
1601 bool progress = false;
1602 nir_foreach_instr_safe(inst, block) {
1603 if (inst->type != nir_instr_type_intrinsic)
1604 continue;
1605
1606 nir_intrinsic_instr *intr =
1607 nir_instr_as_intrinsic(inst);
1608 if (!intr)
1609 continue;
1610
1611 switch (intr->intrinsic) {
1612 case nir_intrinsic_load_num_subgroups:
1613 lower_load_num_subgroups(c, b, intr);
1614 progress = true;
1615 FALLTHROUGH;
1616 case nir_intrinsic_load_subgroup_id:
1617 case nir_intrinsic_load_subgroup_size:
1618 case nir_intrinsic_load_subgroup_invocation:
1619 case nir_intrinsic_elect:
1620 case nir_intrinsic_ballot:
1621 case nir_intrinsic_inverse_ballot:
1622 case nir_intrinsic_ballot_bitfield_extract:
1623 case nir_intrinsic_ballot_bit_count_reduce:
1624 case nir_intrinsic_ballot_find_lsb:
1625 case nir_intrinsic_ballot_find_msb:
1626 case nir_intrinsic_ballot_bit_count_exclusive:
1627 case nir_intrinsic_ballot_bit_count_inclusive:
1628 case nir_intrinsic_reduce:
1629 case nir_intrinsic_inclusive_scan:
1630 case nir_intrinsic_exclusive_scan:
1631 case nir_intrinsic_read_invocation:
1632 case nir_intrinsic_read_first_invocation:
1633 case nir_intrinsic_load_subgroup_eq_mask:
1634 case nir_intrinsic_load_subgroup_ge_mask:
1635 case nir_intrinsic_load_subgroup_gt_mask:
1636 case nir_intrinsic_load_subgroup_le_mask:
1637 case nir_intrinsic_load_subgroup_lt_mask:
1638 case nir_intrinsic_shuffle:
1639 case nir_intrinsic_shuffle_xor:
1640 case nir_intrinsic_shuffle_up:
1641 case nir_intrinsic_shuffle_down:
1642 case nir_intrinsic_vote_all:
1643 case nir_intrinsic_vote_any:
1644 case nir_intrinsic_vote_feq:
1645 case nir_intrinsic_vote_ieq:
1646 case nir_intrinsic_quad_broadcast:
1647 case nir_intrinsic_quad_swap_horizontal:
1648 case nir_intrinsic_quad_swap_vertical:
1649 case nir_intrinsic_quad_swap_diagonal:
1650 c->has_subgroups = true;
1651 break;
1652 default:
1653 break;
1654 }
1655 }
1656
1657 return progress;
1658 }
1659
1660 static bool
1661 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1662 {
1663 bool progress = false;
1664 nir_foreach_function_impl(impl, s) {
1665 nir_builder b = nir_builder_create(impl);
1666
1667 nir_foreach_block(block, impl)
1668 progress |= lower_subgroup_intrinsics(c, block, &b);
1669
1670 nir_metadata_preserve(impl,
1671 nir_metadata_control_flow);
1672 }
1673 return progress;
1674 }
1675
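/* Callback for nir_lower_robust_access: only lower the access types for
 * which the key requests robustness.
 */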
1676 static bool
1677 should_lower_robustness(const nir_intrinsic_instr *intr, const void *data)
1678 {
1679 const struct v3d_key *key = data;
1680
1681 switch (intr->intrinsic) {
1682 case nir_intrinsic_load_ubo:
1683 return key->robust_uniform_access;
1684
1685 case nir_intrinsic_load_ssbo:
1686 case nir_intrinsic_store_ssbo:
1687 case nir_intrinsic_ssbo_atomic:
1688 case nir_intrinsic_ssbo_atomic_swap:
1689 return key->robust_storage_access;
1690
1691 case nir_intrinsic_image_load:
1692 case nir_intrinsic_image_store:
1693 case nir_intrinsic_image_atomic:
1694 case nir_intrinsic_image_atomic_swap:
1695 return key->robust_image_access;
1696
1697 default:
1698 return false;
1699 }
1700 }
1701
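/* Runs the stage-specific and common NIR lowering/optimization pipeline
 * for the shader and then generates VIR from it with the current compile
 * strategy.
 */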
1702 static void
1703 v3d_attempt_compile(struct v3d_compile *c)
1704 {
1705 switch (c->s->info.stage) {
1706 case MESA_SHADER_VERTEX:
1707 c->vs_key = (struct v3d_vs_key *) c->key;
1708 break;
1709 case MESA_SHADER_GEOMETRY:
1710 c->gs_key = (struct v3d_gs_key *) c->key;
1711 break;
1712 case MESA_SHADER_FRAGMENT:
1713 c->fs_key = (struct v3d_fs_key *) c->key;
1714 break;
1715 case MESA_SHADER_COMPUTE:
1716 break;
1717 default:
1718 unreachable("unsupported shader stage");
1719 }
1720
1721 switch (c->s->info.stage) {
1722 case MESA_SHADER_VERTEX:
1723 v3d_nir_lower_vs_early(c);
1724 break;
1725 case MESA_SHADER_GEOMETRY:
1726 v3d_nir_lower_gs_early(c);
1727 break;
1728 case MESA_SHADER_FRAGMENT:
1729 v3d_nir_lower_fs_early(c);
1730 break;
1731 default:
1732 break;
1733 }
1734
1735 v3d_lower_nir(c);
1736
1737 switch (c->s->info.stage) {
1738 case MESA_SHADER_VERTEX:
1739 v3d_nir_lower_vs_late(c);
1740 break;
1741 case MESA_SHADER_GEOMETRY:
1742 v3d_nir_lower_gs_late(c);
1743 break;
1744 case MESA_SHADER_FRAGMENT:
1745 v3d_nir_lower_fs_late(c);
1746 break;
1747 default:
1748 break;
1749 }
1750
1751 NIR_PASS(_, c->s, v3d_nir_lower_io, c);
1752 NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
1753 NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
1754
1755 NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
1756 nir_lower_idiv_options idiv_options = {
1757 .allow_fp16 = true,
1758 };
1759 NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
1760 NIR_PASS(_, c->s, nir_lower_alu);
1761
1762 if (c->key->robust_uniform_access || c->key->robust_storage_access ||
1763 c->key->robust_image_access) {
1764 /* nir_lower_robust_access assumes constant buffer
1765 * indices on ubo/ssbo intrinsics so run copy propagation and
1766 * constant folding passes before we run the lowering to guarantee
1767 * this. We also want to run the lowering before v3d_optimize to
1768 * clean up redundant get_buffer_size calls produced in the pass.
1769 */
1770 NIR_PASS(_, c->s, nir_copy_prop);
1771 NIR_PASS(_, c->s, nir_opt_constant_folding);
1772
1773 NIR_PASS(_, c->s, nir_lower_robust_access,
1774 should_lower_robustness, c->key);
1775 }
1776
1777 NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
1778 nir_var_function_temp,
1779 0,
1780 glsl_get_natural_size_align_bytes,
1781 glsl_get_natural_size_align_bytes);
1782
1783 NIR_PASS(_, c->s, v3d_nir_lower_global_2x32);
1784 NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1785 NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
1786 NIR_PASS(_, c->s, v3d_nir_lower_scratch);
1787
1788 /* needs to run after load_store_bitsize */
1789 NIR_PASS(_, c->s, nir_lower_pack);
1790
1791 NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
1792
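/* A V3D subgroup is a single 16-channel QPU invocation (V3D_CHANNELS), so a
 * full ballot fits in one 32-bit scalar component.
 */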
1793 const nir_lower_subgroups_options subgroup_opts = {
1794 .subgroup_size = V3D_CHANNELS,
1795 .ballot_components = 1,
1796 .ballot_bit_size = 32,
1797 .lower_to_scalar = true,
1798 .lower_inverse_ballot = true,
1799 .lower_subgroup_masks = true,
1800 .lower_relative_shuffle = true,
1801 .lower_quad = true,
1802 };
1803 NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
1804
1805 v3d_optimize_nir(c, c->s);
1806
1807 /* Do late algebraic optimization to turn add(a, neg(b)) back into
1808 * subs, then the mandatory cleanup after algebraic. Note that it may
1809 * produce fnegs, and if so then we need to keep running to squash
1810 * fneg(fneg(a)).
1811 */
1812 bool more_late_algebraic = true;
1813 while (more_late_algebraic) {
1814 more_late_algebraic = false;
1815 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
1816 NIR_PASS(_, c->s, nir_opt_constant_folding);
1817 NIR_PASS(_, c->s, nir_copy_prop);
1818 NIR_PASS(_, c->s, nir_opt_dce);
1819 NIR_PASS(_, c->s, nir_opt_cse);
1820 }
1821
1822 NIR_PASS(_, c->s, nir_lower_bool_to_int32);
1823 NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
1824 NIR_PASS_V(c->s, nir_divergence_analysis);
1825 NIR_PASS(_, c->s, nir_convert_from_ssa, true);
1826
1827 struct nir_schedule_options schedule_options = {
1828 /* Schedule for about half our register space, to enable more
1829 * shaders to hit 4 threads.
1830 */
1831 .threshold = c->threads == 4 ? 24 : 48,
1832
1833 /* Vertex shaders share the same memory for inputs and outputs,
1834 * while fragment and geometry shaders do not.
1835 */
1836 .stages_with_shared_io_memory =
1837 (((1 << MESA_ALL_SHADER_STAGES) - 1) &
1838 ~((1 << MESA_SHADER_FRAGMENT) |
1839 (1 << MESA_SHADER_GEOMETRY))),
1840
1841 .fallback = c->fallback_scheduler,
1842
1843 .intrinsic_cb = v3d_intrinsic_dependency_cb,
1844 .intrinsic_cb_data = c,
1845
1846 .instr_delay_cb = v3d_instr_delay_cb,
1847 .instr_delay_cb_data = c,
1848 };
1849 NIR_PASS_V(c->s, nir_schedule, &schedule_options);
1850
1851 if (!c->disable_constant_ubo_load_sorting)
1852 NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
1853
1854 const nir_move_options buffer_opts = c->move_buffer_loads ?
1855 (nir_move_load_ubo | nir_move_load_ssbo) : 0;
1856 NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
1857 nir_move_const_undef |
1858 buffer_opts);
1859
1860 NIR_PASS_V(c->s, nir_trivialize_registers);
1861
1862 v3d_nir_to_vir(c);
1863 }
1864
1865 uint32_t
1866 v3d_prog_data_size(gl_shader_stage stage)
1867 {
1868 static const int prog_data_size[] = {
1869 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
1870 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
1871 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
1872 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
1873 };
1874
1875 assert(stage >= 0 &&
1876 stage < ARRAY_SIZE(prog_data_size) &&
1877 prog_data_size[stage]);
1878
1879 return prog_data_size[stage];
1880 }
1881
1882 int v3d_shaderdb_dump(struct v3d_compile *c,
1883 char **shaderdb_str)
1884 {
1885 if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
1886 return -1;
1887
1888 return asprintf(shaderdb_str,
1889 "%s shader: %d inst, %d threads, %d loops, "
1890 "%d uniforms, %d max-temps, %d:%d spills:fills, "
1891 "%d sfu-stalls, %d inst-and-stalls, %d nops",
1892 vir_get_stage_name(c),
1893 c->qpu_inst_count,
1894 c->threads,
1895 c->loops,
1896 c->num_uniforms,
1897 vir_get_max_temps(c),
1898 c->spills,
1899 c->fills,
1900 c->qpu_inst_stalled_count,
1901 c->qpu_inst_count + c->qpu_inst_stalled_count,
1902 c->nop_count);
1903 }
1904
1905 /* This is a list of incremental changes to the compilation strategy
1906 * that will be used to try to compile the shader successfully. The
1907 * default strategy is to enable all optimizations, which will have the
1908 * highest register pressure but is expected to produce the best code.
1909 * Subsequent strategies incrementally disable specific optimizations
1910 * that are known to contribute to register pressure, in order to be
1911 * able to compile the shader successfully while meeting thread count
1912 * requirements.
1913 *
1914 * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
1915 * cover previous hardware (meaning that we are not limiting register
1916 * allocation to any particular thread count). This is fine because
1917 * v3d_nir_to_vir will cap this to the actual minimum.
1918 */
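/* Each entry lists: strategy name, max/min thread counts, flags that disable
 * general TMU scheduling, GCM, loop unrolling and UBO load sorting, a flag
 * that enables moving buffer loads, a flag that disables TMU pipelining, and
 * finally the maximum number of TMU spills allowed (-1 effectively meaning
 * unlimited).
 */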
1919 static const struct v3d_compiler_strategy strategies[] = {
1920 /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 },
1921 /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 },
1922 /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 },
1923 /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 },
1924 /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 },
1925 /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 },
1926 /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 },
1927 /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 },
1928 /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 },
1929 /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 },
1930 /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 },
1931 /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 },
1932 /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 }
1933 };
1934
1935 /**
1936 * If a particular optimization didn't make any progress during a compile
1937 * attempt, disabling it alone won't allow us to compile the shader successfully,
1938 * since we'll end up with the same code. Detect these scenarios so we can
1939 * avoid wasting time with useless compiles. We should also consider if the
1940 * strategy changes other aspects of the compilation process though, like
1941 * spilling, and not skip it in that case.
1942 */
1943 static bool
1944 skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
1945 {
1946 /* We decide if we can skip a strategy based on the optimizations that
1947 * were active in the previous strategy, so we should only be calling this
1948 * for strategies after the first.
1949 */
1950 assert(idx > 0);
1951
1952 /* Don't skip a strategy that changes spilling behavior */
1953 if (strategies[idx].max_tmu_spills !=
1954 strategies[idx - 1].max_tmu_spills) {
1955 return false;
1956 }
1957
1958 switch (idx) {
1959 /* General TMU sched.: skip if we didn't emit any TMU loads */
1960 case 1:
1961 case 7:
1962 return !c->has_general_tmu_load;
1963 /* Global code motion: skip if nir_opt_gcm didn't make any progress */
1964 case 2:
1965 case 8:
1966 return !c->gcm_progress;
1967 /* Loop unrolling: skip if we didn't unroll any loops */
1968 case 3:
1969 case 9:
1970 return !c->unrolled_any_loops;
1971 /* UBO load sorting: skip if we didn't sort any loads */
1972 case 4:
1973 return !c->sorted_any_ubo_loads;
1974 /* Move buffer loads: we assume any shader with difficult RA
1975 * most likely has UBO / SSBO loads so we never try to skip.
1976 * For now, we only try this for 2-thread compiles since it
1977 * is expected to impact instruction counts and latency.
1978 */
1979 case 10:
1980 assert(c->threads < 4);
1981 return false;
1982 /* TMU pipelining: skip if we didn't pipeline any TMU ops */
1983 case 5:
1984 case 11:
1985 return !c->pipelined_any_tmu;
1986 /* Lower thread count: skip if we already tried fewer than 4 threads */
1987 case 6:
1988 return c->threads < 4;
1989 default:
1990 return false;
1991 };
1992 }
1993
1994 static inline void
1995 set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
1996 {
1997 if (*best)
1998 vir_compile_destroy(*best);
1999 *best = c;
2000 }
2001
2002 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
2003 struct v3d_key *key,
2004 struct v3d_prog_data **out_prog_data,
2005 nir_shader *s,
2006 void (*debug_output)(const char *msg,
2007 void *debug_output_data),
2008 void *debug_output_data,
2009 int program_id, int variant_id,
2010 uint32_t *final_assembly_size)
2011 {
2012 struct v3d_compile *c = NULL;
2013 uint32_t best_spill_fill_count = UINT32_MAX;
2014 struct v3d_compile *best_c = NULL;
2015
2016 MESA_TRACE_FUNC();
2017
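/* Try each strategy in order, falling back to the next one whenever register
 * allocation fails or the compile needed spills, while keeping track of the
 * successful compile with the fewest spills + fills in case none of the
 * strategies manages to compile without spilling.
 */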
2018 for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
2019 /* Fallback strategy */
2020 if (strat > 0) {
2021 assert(c);
2022 if (skip_compile_strategy(c, strat))
2023 continue;
2024
2025 char *debug_msg;
2026 int ret = asprintf(&debug_msg,
2027 "Falling back to strategy '%s' "
2028 "for %s prog %d/%d",
2029 strategies[strat].name,
2030 vir_get_stage_name(c),
2031 c->program_id, c->variant_id);
2032
2033 if (ret >= 0) {
2034 if (V3D_DBG(PERF))
2035 fprintf(stderr, "%s\n", debug_msg);
2036
2037 c->debug_output(debug_msg, c->debug_output_data);
2038 free(debug_msg);
2039 }
2040
2041 if (c != best_c)
2042 vir_compile_destroy(c);
2043 }
2044
2045 c = vir_compile_init(compiler, key, s,
2046 debug_output, debug_output_data,
2047 program_id, variant_id,
2048 strat, &strategies[strat],
2049 strat == ARRAY_SIZE(strategies) - 1);
2050
2051 v3d_attempt_compile(c);
2052
2053 /* Broken shader or driver bug */
2054 if (c->compilation_result == V3D_COMPILATION_FAILED)
2055 break;
2056
2057 /* If we compiled without spills, choose this.
2058 * Otherwise if this is a 4-thread compile, choose this (these
2059 * have a very low cap on the allowed TMU spills so we assume
2060 * it will be better than a 2-thread compile without spills).
2061 * Otherwise, keep going while tracking the strategy with the
2062 * lowest spill count.
2063 */
2064 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
2065 if (c->spills == 0 ||
2066 strategies[strat].min_threads == 4 ||
2067 V3D_DBG(OPT_COMPILE_TIME)) {
2068 set_best_compile(&best_c, c);
2069 break;
2070 } else if (c->spills + c->fills <
2071 best_spill_fill_count) {
2072 set_best_compile(&best_c, c);
2073 best_spill_fill_count = c->spills + c->fills;
2074 }
2075
2076 if (V3D_DBG(PERF)) {
2077 char *debug_msg;
2078 int ret = asprintf(&debug_msg,
2079 "Compiled %s prog %d/%d with %d "
2080 "spills and %d fills. Will try "
2081 "more strategies.",
2082 vir_get_stage_name(c),
2083 c->program_id, c->variant_id,
2084 c->spills, c->fills);
2085 if (ret >= 0) {
2086 fprintf(stderr, "%s\n", debug_msg);
2087 c->debug_output(debug_msg, c->debug_output_data);
2088 free(debug_msg);
2089 }
2090 }
2091 }
2092
2093 /* Only try the next strategy if we failed to register allocate
2094 * or we had to spill.
2095 */
2096 assert(c->compilation_result ==
2097 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
2098 c->spills > 0);
2099 }
2100
2101 /* If the best strategy was not the last, choose that */
2102 if (best_c && c != best_c)
2103 set_best_compile(&c, best_c);
2104
2105 if (V3D_DBG(PERF) &&
2106 c->compilation_result !=
2107 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
2108 c->spills > 0) {
2109 char *debug_msg;
2110 int ret = asprintf(&debug_msg,
2111 "Compiled %s prog %d/%d with %d "
2112 "spills and %d fills",
2113 vir_get_stage_name(c),
2114 c->program_id, c->variant_id,
2115 c->spills, c->fills);
2116 fprintf(stderr, "%s\n", debug_msg);
2117
2118 if (ret >= 0) {
2119 c->debug_output(debug_msg, c->debug_output_data);
2120 free(debug_msg);
2121 }
2122 }
2123
2124 if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
2125 fprintf(stderr, "Failed to compile %s prog %d/%d "
2126 "with any strategy.\n",
2127 vir_get_stage_name(c), c->program_id, c->variant_id);
2128
2129 vir_compile_destroy(c);
2130 return NULL;
2131 }
2132
2133 struct v3d_prog_data *prog_data;
2134
2135 prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
2136
2137 v3d_set_prog_data(c, prog_data);
2138
2139 *out_prog_data = prog_data;
2140
2141 char *shaderdb;
2142 int ret = v3d_shaderdb_dump(c, &shaderdb);
2143 if (ret >= 0) {
2144 if (V3D_DBG(SHADERDB))
2145 fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
2146
2147 c->debug_output(shaderdb, c->debug_output_data);
2148 free(shaderdb);
2149 }
2150
2151 return v3d_return_qpu_insts(c, final_assembly_size);
2152 }
2153
2154 void
2155 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
2156 {
2157 if (qinst->dst.file == QFILE_TEMP)
2158 c->defs[qinst->dst.index] = NULL;
2159
2160 assert(&qinst->link != c->cursor.link);
2161
2162 list_del(&qinst->link);
2163 free(qinst);
2164
2165 c->live_intervals_valid = false;
2166 }
2167
2168 struct qreg
2169 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
2170 {
2171 /* XXX
2172 int pack = reg.pack;
2173
2174 while (reg.file == QFILE_TEMP &&
2175 c->defs[reg.index] &&
2176 (c->defs[reg.index]->op == QOP_MOV ||
2177 c->defs[reg.index]->op == QOP_FMOV) &&
2178 !c->defs[reg.index]->dst.pack &&
2179 !c->defs[reg.index]->src[0].pack) {
2180 reg = c->defs[reg.index]->src[0];
2181 }
2182
2183 reg.pack = pack;
2184 */
2185 return reg;
2186 }
2187
2188 void
2189 vir_compile_destroy(struct v3d_compile *c)
2190 {
2191 /* Defuse the assert that we aren't removing the cursor's instruction.
2192 */
2193 c->cursor.link = NULL;
2194
2195 vir_for_each_block(block, c) {
2196 while (!list_is_empty(&block->instructions)) {
2197 struct qinst *qinst =
2198 list_first_entry(&block->instructions,
2199 struct qinst, link);
2200 vir_remove_instruction(c, qinst);
2201 }
2202 }
2203
2204 ralloc_free(c);
2205 }
2206
2207 uint32_t
2208 vir_get_uniform_index(struct v3d_compile *c,
2209 enum quniform_contents contents,
2210 uint32_t data)
2211 {
2212 for (int i = 0; i < c->num_uniforms; i++) {
2213 if (c->uniform_contents[i] == contents &&
2214 c->uniform_data[i] == data) {
2215 return i;
2216 }
2217 }
2218
2219 uint32_t uniform = c->num_uniforms++;
2220
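/* Grow the uniform arrays on demand: at least 16 entries to start with, and
 * at least double the previous size after that.
 */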
2221 if (uniform >= c->uniform_array_size) {
2222 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
2223 c->uniform_array_size * 2);
2224
2225 c->uniform_data = reralloc(c, c->uniform_data,
2226 uint32_t,
2227 c->uniform_array_size);
2228 c->uniform_contents = reralloc(c, c->uniform_contents,
2229 enum quniform_contents,
2230 c->uniform_array_size);
2231 }
2232
2233 c->uniform_contents[uniform] = contents;
2234 c->uniform_data[uniform] = data;
2235
2236 return uniform;
2237 }
2238
2239 /* Looks back into the current block to find the ldunif that wrote the uniform
2240 * at the requested index. If it finds it, it returns true and writes the
2241 * destination register of the ldunif instruction to 'unif'.
2242 *
2243 * This can impact register pressure and end up leading to worse code, so we
2244 * limit the number of instructions we are willing to look back through to
2245 * strike a good balance.
2246 */
2247 static bool
2248 try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
2249 {
2250 uint32_t count = 20;
2251 struct qinst *prev_inst = NULL;
2252 assert(c->cur_block);
2253
2254 #if MESA_DEBUG
2255 /* We can only reuse a uniform if it was emitted in the same block,
2256 * so callers must make sure the current instruction is being emitted
2257 * in the current block.
2258 */
2259 bool found = false;
2260 vir_for_each_inst(inst, c->cur_block) {
2261 if (&inst->link == c->cursor.link) {
2262 found = true;
2263 break;
2264 }
2265 }
2266
2267 assert(found || &c->cur_block->instructions == c->cursor.link);
2268 #endif
2269
2270 list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
2271 &c->cur_block->instructions, link) {
2272 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
2273 inst->uniform == index) {
2274 prev_inst = inst;
2275 break;
2276 }
2277
2278 if (--count == 0)
2279 break;
2280 }
2281
2282 if (!prev_inst)
2283 return false;
2284
2285 /* Only reuse the ldunif result if it was written to a temp register,
2286 * otherwise there may be special restrictions (for example, ldunif
2287 * may write directly to unifa, which is a write-only register).
2288 */
2289 if (prev_inst->dst.file != QFILE_TEMP)
2290 return false;
2291
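/* Make sure the ldunif destination has not been overwritten by a later
 * instruction in the block, otherwise we cannot reuse it.
 */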
2292 list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
2293 &c->cur_block->instructions, link) {
2294 if (inst->dst.file == prev_inst->dst.file &&
2295 inst->dst.index == prev_inst->dst.index) {
2296 return false;
2297 }
2298 }
2299
2300 *unif = prev_inst->dst;
2301 return true;
2302 }
2303
2304 struct qreg
2305 vir_uniform(struct v3d_compile *c,
2306 enum quniform_contents contents,
2307 uint32_t data)
2308 {
2309 const int num_uniforms = c->num_uniforms;
2310 const int index = vir_get_uniform_index(c, contents, data);
2311
2312 /* If this is not the first time we see this uniform try to reuse the
2313 * result of the last ldunif that loaded it.
2314 */
2315 const bool is_new_uniform = num_uniforms != c->num_uniforms;
2316 if (!is_new_uniform && !c->disable_ldunif_opt) {
2317 struct qreg ldunif_dst;
2318 if (try_opt_ldunif(c, index, &ldunif_dst))
2319 return ldunif_dst;
2320 }
2321
2322 struct qinst *inst = vir_NOP(c);
2323 inst->qpu.sig.ldunif = true;
2324 inst->uniform = index;
2325 inst->dst = vir_get_temp(c);
2326 c->defs[inst->dst.index] = inst;
2327 return inst->dst;
2328 }
2329
2330 #define OPTPASS(func) \
2331 do { \
2332 bool stage_progress = func(c); \
2333 if (stage_progress) { \
2334 progress = true; \
2335 if (print_opt_debug) { \
2336 fprintf(stderr, \
2337 "VIR opt pass %2d: %s progress\n", \
2338 pass, #func); \
2339 } \
2340 /*XXX vir_validate(c);*/ \
2341 } \
2342 } while (0)
2343
2344 void
2345 vir_optimize(struct v3d_compile *c)
2346 {
2347 bool print_opt_debug = false;
2348 int pass = 1;
2349
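/* Run the VIR-level optimization passes repeatedly until none of them makes
 * further progress.
 */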
2350 while (true) {
2351 bool progress = false;
2352
2353 OPTPASS(vir_opt_copy_propagate);
2354 OPTPASS(vir_opt_redundant_flags);
2355 OPTPASS(vir_opt_dead_code);
2356 OPTPASS(vir_opt_small_immediates);
2357 OPTPASS(vir_opt_constant_alu);
2358
2359 if (!progress)
2360 break;
2361
2362 pass++;
2363 }
2364 }
2365
2366 const char *
2367 vir_get_stage_name(struct v3d_compile *c)
2368 {
2369 if (c->vs_key && c->vs_key->is_coord)
2370 return "MESA_SHADER_VERTEX_BIN";
2371 else if (c->gs_key && c->gs_key->is_coord)
2372 return "MESA_SHADER_GEOMETRY_BIN";
2373 else
2374 return gl_shader_stage_name(c->s->info.stage);
2375 }
2376
2377 static inline uint32_t
2378 compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
2379 {
2380 assert(devinfo->vpm_size > 0);
2381 const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
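/* For example, with V3D_CHANNELS == 16 this gives 16 * 4 * 8 = 512 bytes
 * per sector.
 */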
2382 return devinfo->vpm_size / sector_size;
2383 }
2384
2385 /* Computes various parameters affecting VPM memory configuration for programs
2386 * involving geometry shaders to ensure the program fits in memory and honors
2387 * requirements described in section "VPM usage" of the programming manual.
2388 */
2389 static bool
2390 compute_vpm_config_gs(struct v3d_device_info *devinfo,
2391 struct v3d_vs_prog_data *vs,
2392 struct v3d_gs_prog_data *gs,
2393 struct vpm_config *vpm_cfg_out)
2394 {
2395 const uint32_t A = vs->separate_segments ? 1 : 0;
2396 const uint32_t Ad = vs->vpm_input_size;
2397 const uint32_t Vd = vs->vpm_output_size;
2398
2399 const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
2400
2401 /* Try to fit program into our VPM memory budget by adjusting
2402 * configurable parameters iteratively. We do this in two phases:
2403 * the first phase tries to fit the program into the total available
2404 * VPM memory. If we succeed at that, then the second phase attempts
2405 * to fit the program into half of that budget so we can run bin and
2406 * render programs in parallel.
2407 */
2408 struct vpm_config vpm_cfg[2];
2409 struct vpm_config *final_vpm_cfg = NULL;
2410 uint32_t phase = 0;
2411
2412 vpm_cfg[phase].As = 1;
2413 vpm_cfg[phase].Gs = 1;
2414 vpm_cfg[phase].Gd = gs->vpm_output_size;
2415 vpm_cfg[phase].gs_width = gs->simd_width;
2416
2417 /* While there is a requirement that Vc >= [Vn / 16], this is
2418 * always the case when tessellation is not present because in that
2419 * case Vn can only be 6 at most (when input primitive is triangles
2420 * with adjacency).
2421 *
2422 * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
2423 * and Broadcom has not found it worth it to increase it beyond this
2424 * in general. Increasing Vc also increases VPM memory pressure which
2425 * can turn up being detrimental for performance in some scenarios.
2426 */
2427 vpm_cfg[phase].Vc = 2;
2428
2429 /* Gv is a constraint on the hardware to not exceed the
2430 * specified number of vertex segments per GS batch. If adding a
2431 * new primitive to a GS batch would result in a range of more
2432 * than Gv vertex segments being referenced by the batch, then
2433 * the hardware will flush the batch and start a new one. This
2434 * means that we can choose any value we want, we just need to
2435 * be aware that larger values improve GS batch utilization
2436 * at the expense of more VPM memory pressure (which can affect
2437 * other performance aspects, such as GS dispatch width).
2438 * We start with the largest value, and will reduce it if we
2439 * find that total memory pressure is too high.
2440 */
2441 vpm_cfg[phase].Gv = 3;
2442 do {
2443 /* When a GS is present without a TES, we need to satisfy
2444 * Ve >= Gv. We go with the smallest value of Ve to avoid
2445 * increasing memory pressure.
2446 */
2447 vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
2448
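/* Total VPM usage: VS input segments (A * As * Ad, only when separate
 * input/output segments are used), VS output segments ((Vc + Ve) * Vd)
 * and GS output segments (Gs * Gd).
 */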
2449 uint32_t vpm_sectors =
2450 A * vpm_cfg[phase].As * Ad +
2451 (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
2452 vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
2453
2454 /* Ideally we want to use no more than half of the available
2455 * memory so we can execute a bin and render program in parallel
2456 * without stalls. If we achieved that then we are done.
2457 */
2458 if (vpm_sectors <= vpm_size / 2) {
2459 final_vpm_cfg = &vpm_cfg[phase];
2460 break;
2461 }
2462
2463 /* At the very least, we should not allocate more than the
2464 * total available VPM memory. If we have a configuration that
2465 * succeeds at this we save it and continue to see if we can
2466 * meet the half-memory-use criteria too.
2467 */
2468 if (phase == 0 && vpm_sectors <= vpm_size) {
2469 vpm_cfg[1] = vpm_cfg[0];
2470 phase = 1;
2471 }
2472
2473 /* Try lowering Gv */
2474 if (vpm_cfg[phase].Gv > 0) {
2475 vpm_cfg[phase].Gv--;
2476 continue;
2477 }
2478
2479 /* Try lowering GS dispatch width */
2480 if (vpm_cfg[phase].gs_width > 1) {
2481 do {
2482 vpm_cfg[phase].gs_width >>= 1;
2483 vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
2484 } while (vpm_cfg[phase].gs_width == 2);
2485
2486 /* Reset Gv to max after dropping dispatch width */
2487 vpm_cfg[phase].Gv = 3;
2488 continue;
2489 }
2490
2491 /* We ran out of options to reduce memory pressure. If we
2492 * are at phase 1 we have at least a valid configuration, so
2493 * we use that.
2494 */
2495 if (phase == 1)
2496 final_vpm_cfg = &vpm_cfg[0];
2497 break;
2498 } while (true);
2499
2500 if (!final_vpm_cfg)
2501 return false;
2502
2503 assert(final_vpm_cfg);
2504 assert(final_vpm_cfg->Gd <= 16);
2505 assert(final_vpm_cfg->Gv < 4);
2506 assert(final_vpm_cfg->Ve < 4);
2507 assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
2508 assert(final_vpm_cfg->gs_width == 1 ||
2509 final_vpm_cfg->gs_width == 4 ||
2510 final_vpm_cfg->gs_width == 8 ||
2511 final_vpm_cfg->gs_width == 16);
2512
2513 *vpm_cfg_out = *final_vpm_cfg;
2514 return true;
2515 }
2516
2517 bool
2518 v3d_compute_vpm_config(struct v3d_device_info *devinfo,
2519 struct v3d_vs_prog_data *vs_bin,
2520 struct v3d_vs_prog_data *vs,
2521 struct v3d_gs_prog_data *gs_bin,
2522 struct v3d_gs_prog_data *gs,
2523 struct vpm_config *vpm_cfg_bin,
2524 struct vpm_config *vpm_cfg)
2525 {
2526 assert(vs && vs_bin);
2527 assert((gs != NULL) == (gs_bin != NULL));
2528
2529 if (!gs) {
2530 vpm_cfg_bin->As = 1;
2531 vpm_cfg_bin->Ve = 0;
2532 vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
2533
2534 vpm_cfg->As = 1;
2535 vpm_cfg->Ve = 0;
2536 vpm_cfg->Vc = vs->vcm_cache_size;
2537 } else {
2538 if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
2539 return false;
2540
2541 if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
2542 return false;
2543 }
2544
2545 return true;
2546 }
2547
2548 static inline uint32_t
2549 compute_prog_score(struct v3d_prog_data *p, uint32_t qpu_size)
2550 {
2551 const uint32_t inst_count = qpu_size / sizeof(uint64_t);
2552 const uint32_t tmu_count = p->tmu_count + p->tmu_spills + p->tmu_fills;
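/* Heuristic: weigh each TMU access (including spills and fills) as roughly
 * four instructions' worth of cost.
 */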
2553 return inst_count + 4 * tmu_count;
2554 }
2555
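/* Accumulates heuristic geometry and render scores for a draw call: the
 * vertex shader cost is weighted by the number of vertices processed, while
 * the fragment shader cost is counted once. Callers can use the accumulated
 * scores to decide whether double-buffer mode is likely to pay off.
 */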
2556 void
2557 v3d_update_double_buffer_score(uint32_t vertex_count,
2558 uint32_t vs_qpu_size,
2559 uint32_t fs_qpu_size,
2560 struct v3d_prog_data *vs,
2561 struct v3d_prog_data *fs,
2562 struct v3d_double_buffer_score *score)
2563 {
2564 score->geom += vertex_count * compute_prog_score(vs, vs_qpu_size);
2565 score->render += compute_prog_score(fs, fs_qpu_size);
2566 }
2567