1 /*
2 * Copyright © 2016-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "broadcom/common/v3d_device_info.h"
25 #include "v3d_compiler.h"
26 #include "compiler/nir/nir_schedule.h"
27 #include "compiler/nir/nir_builder.h"
28
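/* Returns the number of sources consumed by an instruction: branches take
 * none, and for ALU instructions it is the source count of the active add
 * op (or of the mul op when the add op is a NOP).
 */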
29 int
30 vir_get_nsrc(struct qinst *inst)
31 {
32 switch (inst->qpu.type) {
33 case V3D_QPU_INSTR_TYPE_BRANCH:
34 return 0;
35 case V3D_QPU_INSTR_TYPE_ALU:
36 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
37 return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
38 else
39 return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
40 }
41
42 return 0;
43 }
44
45 /**
46 * Returns whether the instruction has any side effects that must be
47 * preserved.
48 */
49 bool
50 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
51 {
52 switch (inst->qpu.type) {
53 case V3D_QPU_INSTR_TYPE_BRANCH:
54 return true;
55 case V3D_QPU_INSTR_TYPE_ALU:
56 switch (inst->qpu.alu.add.op) {
57 case V3D_QPU_A_SETREVF:
58 case V3D_QPU_A_SETMSF:
59 case V3D_QPU_A_VPMSETUP:
60 case V3D_QPU_A_STVPMV:
61 case V3D_QPU_A_STVPMD:
62 case V3D_QPU_A_STVPMP:
63 case V3D_QPU_A_VPMWT:
64 case V3D_QPU_A_TMUWT:
65 return true;
66 default:
67 break;
68 }
69
70 switch (inst->qpu.alu.mul.op) {
71 case V3D_QPU_M_MULTOP:
72 return true;
73 default:
74 break;
75 }
76 }
77
78 if (inst->qpu.sig.ldtmu ||
79 inst->qpu.sig.ldvary ||
80 inst->qpu.sig.ldtlbu ||
81 inst->qpu.sig.ldtlb ||
82 inst->qpu.sig.wrtmuc ||
83 inst->qpu.sig.thrsw) {
84 return true;
85 }
86
87 /* ldunifa works like ldunif: it reads an element and advances the
88 * pointer, so each read has a side effect (this is not a problem for
89 * ldunif because we reconstruct the uniform stream buffer after
90 * compiling with the surviving uniforms), so allowing DCE to remove
91 * one would break follow-up loads. We could fix this by emitting a
92 * unifa for each ldunifa, but each unifa requires 3 delay slots
93 * before a ldunifa, so that would be quite expensive.
94 */
95 if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
96 return true;
97
98 return false;
99 }
100
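/* Returns whether the instruction is a plain MOV or FMOV in the mul unit
 * with no output pack, no input unpack and no condition flags, i.e. a raw
 * copy of its source.
 */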
101 bool
102 vir_is_raw_mov(struct qinst *inst)
103 {
104 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
105 (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
106 inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
107 return false;
108 }
109
110 if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
111 inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
112 return false;
113 }
114
115 if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
116 inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
117 inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
118 inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
119 return false;
120 }
121
122 if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
123 inst->qpu.flags.mc != V3D_QPU_COND_NONE)
124 return false;
125
126 return true;
127 }
128
129 bool
130 vir_is_add(struct qinst *inst)
131 {
132 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
133 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
134 }
135
136 bool
137 vir_is_mul(struct qinst *inst)
138 {
139 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
140 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
141 }
142
143 bool
144 vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
145 {
146 if (inst->dst.file == QFILE_MAGIC)
147 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
148
149 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
150 inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
151 return true;
152 }
153
154 return false;
155 }
156
157 bool
158 vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
159 struct qinst *inst)
160 {
161 if (!devinfo->has_accumulators)
162 return false;
163
164 switch (inst->dst.file) {
165 case QFILE_MAGIC:
166 switch (inst->dst.index) {
167 case V3D_QPU_WADDR_RECIP:
168 case V3D_QPU_WADDR_RSQRT:
169 case V3D_QPU_WADDR_EXP:
170 case V3D_QPU_WADDR_LOG:
171 case V3D_QPU_WADDR_SIN:
172 return true;
173 }
174 break;
175 default:
176 break;
177 }
178
179 return false;
180 }
181
182 void
183 vir_set_unpack(struct qinst *inst, int src,
184 enum v3d_qpu_input_unpack unpack)
185 {
186 assert(src == 0 || src == 1);
187
188 if (vir_is_add(inst)) {
189 if (src == 0)
190 inst->qpu.alu.add.a.unpack = unpack;
191 else
192 inst->qpu.alu.add.b.unpack = unpack;
193 } else {
194 assert(vir_is_mul(inst));
195 if (src == 0)
196 inst->qpu.alu.mul.a.unpack = unpack;
197 else
198 inst->qpu.alu.mul.b.unpack = unpack;
199 }
200 }
201
202 void
203 vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
204 {
205 if (vir_is_add(inst)) {
206 inst->qpu.alu.add.output_pack = pack;
207 } else {
208 assert(vir_is_mul(inst));
209 inst->qpu.alu.mul.output_pack = pack;
210 }
211 }
212
213 void
214 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
215 {
216 if (vir_is_add(inst)) {
217 inst->qpu.flags.ac = cond;
218 } else {
219 assert(vir_is_mul(inst));
220 inst->qpu.flags.mc = cond;
221 }
222 }
223
224 enum v3d_qpu_cond
225 vir_get_cond(struct qinst *inst)
226 {
227 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
228
229 if (vir_is_add(inst))
230 return inst->qpu.flags.ac;
231 else if (vir_is_mul(inst))
232 return inst->qpu.flags.mc;
233 else /* NOP */
234 return V3D_QPU_COND_NONE;
235 }
236
237 void
238 vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
239 {
240 c->flags_temp = -1;
241 if (vir_is_add(inst)) {
242 inst->qpu.flags.apf = pf;
243 } else {
244 assert(vir_is_mul(inst));
245 inst->qpu.flags.mpf = pf;
246 }
247 }
248
249 void
250 vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
251 {
252 c->flags_temp = -1;
253 if (vir_is_add(inst)) {
254 inst->qpu.flags.auf = uf;
255 } else {
256 assert(vir_is_mul(inst));
257 inst->qpu.flags.muf = uf;
258 }
259 }
260
261 #if 0
262 uint8_t
263 vir_channels_written(struct qinst *inst)
264 {
265 if (vir_is_mul(inst)) {
266 switch (inst->dst.pack) {
267 case QPU_PACK_MUL_NOP:
268 case QPU_PACK_MUL_8888:
269 return 0xf;
270 case QPU_PACK_MUL_8A:
271 return 0x1;
272 case QPU_PACK_MUL_8B:
273 return 0x2;
274 case QPU_PACK_MUL_8C:
275 return 0x4;
276 case QPU_PACK_MUL_8D:
277 return 0x8;
278 }
279 } else {
280 switch (inst->dst.pack) {
281 case QPU_PACK_A_NOP:
282 case QPU_PACK_A_8888:
283 case QPU_PACK_A_8888_SAT:
284 case QPU_PACK_A_32_SAT:
285 return 0xf;
286 case QPU_PACK_A_8A:
287 case QPU_PACK_A_8A_SAT:
288 return 0x1;
289 case QPU_PACK_A_8B:
290 case QPU_PACK_A_8B_SAT:
291 return 0x2;
292 case QPU_PACK_A_8C:
293 case QPU_PACK_A_8C_SAT:
294 return 0x4;
295 case QPU_PACK_A_8D:
296 case QPU_PACK_A_8D_SAT:
297 return 0x8;
298 case QPU_PACK_A_16A:
299 case QPU_PACK_A_16A_SAT:
300 return 0x3;
301 case QPU_PACK_A_16B:
302 case QPU_PACK_A_16B_SAT:
303 return 0xc;
304 }
305 }
306 unreachable("Bad pack field");
307 }
308 #endif
309
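/* Allocates a fresh temporary register, growing the c->defs and
 * c->spillable bookkeeping arrays as needed. New temps start out marked as
 * spillable.
 */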
310 struct qreg
311 vir_get_temp(struct v3d_compile *c)
312 {
313 struct qreg reg;
314
315 reg.file = QFILE_TEMP;
316 reg.index = c->num_temps++;
317
318 if (c->num_temps > c->defs_array_size) {
319 uint32_t old_size = c->defs_array_size;
320 c->defs_array_size = MAX2(old_size * 2, 16);
321
322 c->defs = reralloc(c, c->defs, struct qinst *,
323 c->defs_array_size);
324 memset(&c->defs[old_size], 0,
325 sizeof(c->defs[0]) * (c->defs_array_size - old_size));
326
327 c->spillable = reralloc(c, c->spillable,
328 BITSET_WORD,
329 BITSET_WORDS(c->defs_array_size));
330 for (int i = old_size; i < c->defs_array_size; i++)
331 BITSET_SET(c->spillable, i);
332 }
333
334 return reg;
335 }
336
337 struct qinst *
338 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
339 {
340 struct qinst *inst = calloc(1, sizeof(*inst));
341
342 inst->qpu = v3d_qpu_nop();
343 inst->qpu.alu.add.op = op;
344
345 inst->dst = dst;
346 inst->src[0] = src0;
347 inst->src[1] = src1;
348 inst->uniform = ~0;
349
350 inst->ip = -1;
351
352 return inst;
353 }
354
355 struct qinst *
356 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
357 {
358 struct qinst *inst = calloc(1, sizeof(*inst));
359
360 inst->qpu = v3d_qpu_nop();
361 inst->qpu.alu.mul.op = op;
362
363 inst->dst = dst;
364 inst->src[0] = src0;
365 inst->src[1] = src1;
366 inst->uniform = ~0;
367
368 inst->ip = -1;
369
370 return inst;
371 }
372
373 struct qinst *
374 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
375 {
376 struct qinst *inst = calloc(1, sizeof(*inst));
377
378 inst->qpu = v3d_qpu_nop();
379 inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
380 inst->qpu.branch.cond = cond;
381 inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
382 inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
383 inst->qpu.branch.ub = true;
384 inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
385
386 inst->dst = vir_nop_reg();
387 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
388
389 inst->ip = -1;
390
391 return inst;
392 }
393
394 static void
395 vir_emit(struct v3d_compile *c, struct qinst *inst)
396 {
397 inst->ip = -1;
398
399 switch (c->cursor.mode) {
400 case vir_cursor_add:
401 list_add(&inst->link, c->cursor.link);
402 break;
403 case vir_cursor_addtail:
404 list_addtail(&inst->link, c->cursor.link);
405 break;
406 }
407
408 c->cursor = vir_after_inst(inst);
409 c->live_intervals_valid = false;
410 }
411
412 /* Updates inst to write to a new temporary, emits it, and notes the def. */
413 struct qreg
414 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
415 {
416 assert(inst->dst.file == QFILE_NULL);
417
418 /* If we're emitting an instruction that's a def, it had better be
419 * writing a register.
420 */
421 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
422 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
423 v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
424 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
425 v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
426 }
427
428 inst->dst = vir_get_temp(c);
429
430 if (inst->dst.file == QFILE_TEMP)
431 c->defs[inst->dst.index] = inst;
432
433 vir_emit(c, inst);
434
435 return inst->dst;
436 }
437
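/* Emits an instruction whose destination was set up by the caller. If it
 * overwrites a temp, that temp no longer has a single defining instruction,
 * so its c->defs entry is cleared.
 */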
438 struct qinst *
439 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
440 {
441 if (inst->dst.file == QFILE_TEMP)
442 c->defs[inst->dst.index] = NULL;
443
444 vir_emit(c, inst);
445
446 return inst;
447 }
448
449 struct qblock *
450 vir_new_block(struct v3d_compile *c)
451 {
452 struct qblock *block = rzalloc(c, struct qblock);
453
454 list_inithead(&block->instructions);
455
456 block->predecessors = _mesa_set_create(block,
457 _mesa_hash_pointer,
458 _mesa_key_pointer_equal);
459
460 block->index = c->next_block_index++;
461
462 return block;
463 }
464
465 void
466 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
467 {
468 c->cur_block = block;
469 c->cursor = vir_after_block(block);
470 list_addtail(&block->link, &c->blocks);
471 }
472
473 struct qblock *
474 vir_entry_block(struct v3d_compile *c)
475 {
476 return list_first_entry(&c->blocks, struct qblock, link);
477 }
478
479 struct qblock *
480 vir_exit_block(struct v3d_compile *c)
481 {
482 return list_last_entry(&c->blocks, struct qblock, link);
483 }
484
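/* Records a CFG edge: adds the predecessor to the successor's predecessor
 * set and fills the first free successor slot (each block has at most two
 * successors).
 */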
485 void
486 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
487 {
488 _mesa_set_add(successor->predecessors, predecessor);
489 if (predecessor->successors[0]) {
490 assert(!predecessor->successors[1]);
491 predecessor->successors[1] = successor;
492 } else {
493 predecessor->successors[0] = successor;
494 }
495 }
496
497 const struct v3d_compiler *
498 v3d_compiler_init(const struct v3d_device_info *devinfo,
499 uint32_t max_inline_uniform_buffers)
500 {
501 struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
502 if (!compiler)
503 return NULL;
504
505 compiler->devinfo = devinfo;
506 compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
507
508 if (!vir_init_reg_sets(compiler)) {
509 ralloc_free(compiler);
510 return NULL;
511 }
512
513 return compiler;
514 }
515
516 void
517 v3d_compiler_free(const struct v3d_compiler *compiler)
518 {
519 ralloc_free((void *)compiler);
520 }
521
522 struct v3d_compiler_strategy {
523 const char *name;
524 uint32_t max_threads;
525 uint32_t min_threads;
526 bool disable_general_tmu_sched;
527 bool disable_gcm;
528 bool disable_loop_unrolling;
529 bool disable_ubo_load_sorting;
530 bool move_buffer_loads;
531 bool disable_tmu_pipelining;
532 uint32_t max_tmu_spills;
533 };
534
535 static struct v3d_compile *
536 vir_compile_init(const struct v3d_compiler *compiler,
537 struct v3d_key *key,
538 nir_shader *s,
539 void (*debug_output)(const char *msg,
540 void *debug_output_data),
541 void *debug_output_data,
542 int program_id, int variant_id,
543 uint32_t compile_strategy_idx,
544 const struct v3d_compiler_strategy *strategy,
545 bool fallback_scheduler)
546 {
547 struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
548
549 c->compiler = compiler;
550 c->devinfo = compiler->devinfo;
551 c->key = key;
552 c->program_id = program_id;
553 c->variant_id = variant_id;
554 c->compile_strategy_idx = compile_strategy_idx;
555 c->threads = strategy->max_threads;
556 c->debug_output = debug_output;
557 c->debug_output_data = debug_output_data;
558 c->compilation_result = V3D_COMPILATION_SUCCEEDED;
559 c->min_threads_for_reg_alloc = strategy->min_threads;
560 c->max_tmu_spills = strategy->max_tmu_spills;
561 c->fallback_scheduler = fallback_scheduler;
562 c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
563 c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
564 c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
565 c->move_buffer_loads = strategy->move_buffer_loads;
566 c->disable_gcm = strategy->disable_gcm;
567 c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
568 ? true : strategy->disable_loop_unrolling;
569
570
571 s = nir_shader_clone(c, s);
572 c->s = s;
573
574 list_inithead(&c->blocks);
575 vir_set_emit_block(c, vir_new_block(c));
576
577 c->output_position_index = -1;
578 c->output_sample_mask_index = -1;
579
580 c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
581 _mesa_key_pointer_equal);
582
583 c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
584 c->flags_temp = -1;
585
586 return c;
587 }
588
589 static int
590 type_size_vec4(const struct glsl_type *type, bool bindless)
591 {
592 return glsl_count_attribute_slots(type, false);
593 }
594
595 static enum nir_lower_tex_packing
596 lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
597 {
598 struct v3d_compile *c = (struct v3d_compile *) data;
599
600 int sampler_index = nir_tex_instr_need_sampler(tex) ?
601 tex->sampler_index : tex->backend_flags;
602
603 assert(sampler_index < c->key->num_samplers_used);
604 return c->key->sampler[sampler_index].return_size == 16 ?
605 nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
606 }
607
608 static bool
609 v3d_nir_lower_null_pointers_cb(nir_builder *b,
610 nir_intrinsic_instr *intr,
611 void *_state)
612 {
613 uint32_t buffer_src_idx;
614
615 switch (intr->intrinsic) {
616 case nir_intrinsic_load_ubo:
617 case nir_intrinsic_load_ssbo:
618 buffer_src_idx = 0;
619 break;
620 case nir_intrinsic_store_ssbo:
621 buffer_src_idx = 1;
622 break;
623 default:
624 return false;
625 }
626
627 /* If the index is constant we are good */
628 nir_src *src = &intr->src[buffer_src_idx];
629 if (nir_src_is_const(*src))
630 return false;
631
632 /* Otherwise, see if it comes from a bcsel including a null pointer */
633 if (src->ssa->parent_instr->type != nir_instr_type_alu)
634 return false;
635
636 nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
637 if (alu->op != nir_op_bcsel)
638 return false;
639
640 /* A null pointer is specified using block index 0xffffffff */
641 int32_t null_src_idx = -1;
642 for (int i = 1; i < 3; i++) {
643 /* FIXME: since we are running this before optimization, maybe
644 * we also need to handle the case where we have a bcsel
645 * chain that we would need to recurse into?
646 */
647 if (!nir_src_is_const(alu->src[i].src))
648 continue;
649 if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
650 continue;
651
652 /* One of the bcsel srcs is a null pointer reference */
653 null_src_idx = i;
654 break;
655 }
656
657 if (null_src_idx < 0)
658 return false;
659
660 assert(null_src_idx == 1 || null_src_idx == 2);
661 int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;
662
663 /* Rewrite the null pointer reference so we use the same buffer index
664 * as the other bcsel branch. This will allow optimization to remove
665 * the bcsel and we should then end up with a constant buffer index
666 * like we need.
667 */
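        /* Illustrative example: load_ssbo(bcsel(cond, idx, 0xffffffff), off)
         * is rewritten to load_ssbo(bcsel(cond, idx, mov(idx)), off), which
         * later copy propagation and constant folding collapse so the load
         * uses idx directly.
         */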
668 b->cursor = nir_before_instr(&alu->instr);
669 nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
670 nir_src_rewrite(&alu->src[null_src_idx].src, copy);
671
672 return true;
673 }
674
675 static bool
676 v3d_nir_lower_null_pointers(nir_shader *s)
677 {
678 return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
679 nir_metadata_block_index |
680 nir_metadata_dominance, NULL);
681 }
682
683 static void
684 v3d_lower_nir(struct v3d_compile *c)
685 {
686 struct nir_lower_tex_options tex_options = {
687 .lower_txd = true,
688 .lower_tg4_offsets = true,
689 .lower_tg4_broadcom_swizzle = true,
690
691 .lower_rect = false, /* XXX: Use this on V3D 3.x */
692 .lower_txp = ~0,
693 /* Apply swizzles to all samplers. */
694 .swizzle_result = ~0,
695 .lower_invalid_implicit_lod = true,
696 };
697
698 /* Lower the format swizzle and (for 32-bit returns)
699 * ARB_texture_swizzle-style swizzle.
700 */
701 assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
702 for (int i = 0; i < c->key->num_tex_used; i++) {
703 for (int j = 0; j < 4; j++)
704 tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
705 }
706
707 tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
708 tex_options.lower_tex_packing_data = c;
709
710 NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
711 NIR_PASS(_, c->s, nir_lower_system_values);
712
713 if (c->s->info.zero_initialize_shared_memory &&
714 c->s->info.shared_size > 0) {
715 /* All our BOs allocate full pages, so the underlying allocation
716 * for shared memory will always be a multiple of 4KB. This
717 * ensures that we can do an exact number of full chunk_size
718 * writes to initialize the memory independently of the actual
719 * shared_size used by the shader, which is a requirement of
720 * the initialization pass.
721 */
722 const unsigned chunk_size = 16; /* max single store size */
723 NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
724 align(c->s->info.shared_size, chunk_size), chunk_size);
725 }
726
727 NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
728
729 NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
730 nir_var_function_temp,
731 0,
732 glsl_get_natural_size_align_bytes);
733 NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
734 NIR_PASS(_, c->s, v3d_nir_lower_scratch);
735 NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
736 }
737
738 static void
739 v3d_set_prog_data_uniforms(struct v3d_compile *c,
740 struct v3d_prog_data *prog_data)
741 {
742 int count = c->num_uniforms;
743 struct v3d_uniform_list *ulist = &prog_data->uniforms;
744
745 ulist->count = count;
746 ulist->data = ralloc_array(prog_data, uint32_t, count);
747 memcpy(ulist->data, c->uniform_data,
748 count * sizeof(*ulist->data));
749 ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
750 memcpy(ulist->contents, c->uniform_contents,
751 count * sizeof(*ulist->contents));
752 }
753
754 static void
755 v3d_vs_set_prog_data(struct v3d_compile *c,
756 struct v3d_vs_prog_data *prog_data)
757 {
758 /* The vertex data gets format converted by the VPM so that
759 * each attribute channel takes up a VPM column. Precompute
760 * the sizes for the shader record.
761 */
762 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
763 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
764 prog_data->vpm_input_size += c->vattr_sizes[i];
765 }
766
767 memset(prog_data->driver_location_map, -1,
768 sizeof(prog_data->driver_location_map));
769
770 nir_foreach_shader_in_variable(var, c->s) {
771 prog_data->driver_location_map[var->data.location] =
772 var->data.driver_location;
773 }
774
775 prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
776 SYSTEM_VALUE_VERTEX_ID) ||
777 BITSET_TEST(c->s->info.system_values_read,
778 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
779
780 prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
781 SYSTEM_VALUE_BASE_INSTANCE);
782
783 prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
784 SYSTEM_VALUE_INSTANCE_ID) ||
785 BITSET_TEST(c->s->info.system_values_read,
786 SYSTEM_VALUE_INSTANCE_INDEX);
787
788 if (prog_data->uses_vid)
789 prog_data->vpm_input_size++;
790 if (prog_data->uses_biid)
791 prog_data->vpm_input_size++;
792 if (prog_data->uses_iid)
793 prog_data->vpm_input_size++;
794
795 /* Input/output segment size are in sectors (8 rows of 32 bits per
796 * channel).
797 */
798 prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
799 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
800
801 /* Set us up for shared input/output segments. This is apparently
802 * necessary for our VCM setup to avoid varying corruption.
803 *
804 * FIXME: initial testing on V3D 7.1 seems to work fine when using
805 * separate segments. So we could try to reevaluate in the future, if
806 * there is any advantage of using separate segments.
807 */
808 prog_data->separate_segments = false;
809 prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
810 prog_data->vpm_input_size);
811 prog_data->vpm_input_size = 0;
812
813 /* Compute VCM cache size. We set up our program to take up less than
814 * half of the VPM, so that any set of bin and render programs won't
815 * run out of space. We need space for at least one input segment,
816 * and then allocate the rest to output segments (one for the current
817 * program, the rest to VCM). The valid range of the VCM cache size
818 * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
819 * batches.
820 */
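        /* Worked example with illustrative numbers: for a 16 KB VPM and
         * V3D_CHANNELS = 16, sector_size = 16 * 4 * 8 = 512 bytes, so
         * vpm_size_in_sectors = 32 and half_vpm = 16. A shader with
         * vpm_output_size = 2 (and vpm_input_size = 0, since we use shared
         * segments above) ends up with 16 / 2 = 8 output batches and
         * vcm_cache_size = CLAMP(8 - 1, 2, 4) = 4.
         */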
821 assert(c->devinfo->vpm_size);
822 int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
823 int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
824 int half_vpm = vpm_size_in_sectors / 2;
825 int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
826 int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
827 assert(vpm_output_batches >= 2);
828 prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
829 }
830
831 static void
832 v3d_gs_set_prog_data(struct v3d_compile *c,
833 struct v3d_gs_prog_data *prog_data)
834 {
835 prog_data->num_inputs = c->num_inputs;
836 memcpy(prog_data->input_slots, c->input_slots,
837 c->num_inputs * sizeof(*c->input_slots));
838
839 /* gl_PrimitiveIdIn is written by the GBG into the first word of the
840 * VPM output header automatically and the shader will overwrite
841 * it after reading it if necessary, so it doesn't add to the VPM
842 * size requirements.
843 */
844 prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
845 SYSTEM_VALUE_PRIMITIVE_ID);
846
847 /* Output segment size is in sectors (8 rows of 32 bits per channel) */
848 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
849
850 /* Compute SIMD dispatch width and update VPM output size accordingly
851 * to ensure we can fit our program in memory. Available widths are
852 * 16, 8, 4, 1.
853 *
854 * Notice that at draw time we will have to consider VPM memory
855 * requirements from other stages and choose a smaller dispatch
856 * width if needed to fit the program in VPM memory.
857 */
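        /* Illustrative example: a vpm_output_size of 24 sectors does not fit
         * the 16-sector limit at 16-wide dispatch, so we drop to 8-wide,
         * which halves it to align(24, 2) / 2 = 12 sectors and fits.
         */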
858 prog_data->simd_width = 16;
859 while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
860 prog_data->simd_width == 2) {
861 prog_data->simd_width >>= 1;
862 prog_data->vpm_output_size =
863 align(prog_data->vpm_output_size, 2) / 2;
864 }
865 assert(prog_data->vpm_output_size <= 16);
866 assert(prog_data->simd_width != 2);
867
868 prog_data->out_prim_type = c->s->info.gs.output_primitive;
869 prog_data->num_invocations = c->s->info.gs.invocations;
870
871 prog_data->writes_psiz =
872 c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
873 }
874
875 static void
876 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
877 struct v3d_fs_prog_data *prog_data)
878 {
879 prog_data->num_inputs = c->num_inputs;
880 memcpy(prog_data->input_slots, c->input_slots,
881 c->num_inputs * sizeof(*c->input_slots));
882
883 STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
884 (V3D_MAX_FS_INPUTS - 1) / 24);
885 for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
886 if (BITSET_TEST(c->flat_shade_flags, i))
887 prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
888
889 if (BITSET_TEST(c->noperspective_flags, i))
890 prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
891
892 if (BITSET_TEST(c->centroid_flags, i))
893 prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
894 }
895 }
896
897 static void
898 v3d_fs_set_prog_data(struct v3d_compile *c,
899 struct v3d_fs_prog_data *prog_data)
900 {
901 v3d_set_fs_prog_data_inputs(c, prog_data);
902 prog_data->writes_z = c->writes_z;
903 prog_data->writes_z_from_fep = c->writes_z_from_fep;
904 prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
905 prog_data->uses_center_w = c->uses_center_w;
906 prog_data->uses_implicit_point_line_varyings =
907 c->uses_implicit_point_line_varyings;
908 prog_data->lock_scoreboard_on_first_thrsw =
909 c->lock_scoreboard_on_first_thrsw;
910 prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
911 prog_data->uses_pid = c->fs_uses_primitive_id;
912 }
913
914 static void
915 v3d_cs_set_prog_data(struct v3d_compile *c,
916 struct v3d_compute_prog_data *prog_data)
917 {
918 prog_data->shared_size = c->s->info.shared_size;
919
920 prog_data->local_size[0] = c->s->info.workgroup_size[0];
921 prog_data->local_size[1] = c->s->info.workgroup_size[1];
922 prog_data->local_size[2] = c->s->info.workgroup_size[2];
923
924 prog_data->has_subgroups = c->has_subgroups;
925 }
926
927 static void
928 v3d_set_prog_data(struct v3d_compile *c,
929 struct v3d_prog_data *prog_data)
930 {
931 prog_data->threads = c->threads;
932 prog_data->single_seg = !c->last_thrsw;
933 prog_data->spill_size = c->spill_size;
934 prog_data->tmu_spills = c->spills;
935 prog_data->tmu_fills = c->fills;
936 prog_data->tmu_count = c->tmu.total_count;
937 prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
938 prog_data->compile_strategy_idx = c->compile_strategy_idx;
939 prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
940 prog_data->has_control_barrier = c->s->info.uses_control_barrier;
941 prog_data->has_global_address = c->has_global_address;
942
943 v3d_set_prog_data_uniforms(c, prog_data);
944
945 switch (c->s->info.stage) {
946 case MESA_SHADER_VERTEX:
947 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
948 break;
949 case MESA_SHADER_GEOMETRY:
950 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
951 break;
952 case MESA_SHADER_FRAGMENT:
953 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
954 break;
955 case MESA_SHADER_COMPUTE:
956 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
957 break;
958 default:
959 unreachable("unsupported shader stage");
960 }
961 }
962
963 static uint64_t *
964 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
965 {
966 *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
967
968 uint64_t *qpu_insts = malloc(*final_assembly_size);
969 if (!qpu_insts)
970 return NULL;
971
972 memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
973
974 vir_compile_destroy(c);
975
976 return qpu_insts;
977 }
978
979 static void
980 v3d_nir_lower_vs_early(struct v3d_compile *c)
981 {
982 /* Split our I/O vars and dead code eliminate the unused
983 * components.
984 */
985 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
986 nir_var_shader_in | nir_var_shader_out);
987 uint64_t used_outputs[4] = {0};
988 for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
989 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
990 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
991 used_outputs[comp] |= 1ull << slot;
992 }
993 NIR_PASS(_, c->s, nir_remove_unused_io_vars,
994 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
995 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
996 v3d_optimize_nir(c, c->s);
997 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
998
999 /* This must go before nir_lower_io */
1000 if (c->vs_key->per_vertex_point_size)
1001 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1002
1003 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1004 type_size_vec4,
1005 (nir_lower_io_options)0);
1006 /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1007 * on the code it generated.
1008 */
1009 NIR_PASS(_, c->s, nir_opt_dce);
1010 NIR_PASS(_, c->s, nir_opt_constant_folding);
1011 }
1012
1013 static void
1014 v3d_nir_lower_gs_early(struct v3d_compile *c)
1015 {
1016 /* Split our I/O vars and dead code eliminate the unused
1017 * components.
1018 */
1019 NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1020 nir_var_shader_in | nir_var_shader_out);
1021 uint64_t used_outputs[4] = {0};
1022 for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
1023 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
1024 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
1025 used_outputs[comp] |= 1ull << slot;
1026 }
1027 NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1028 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1029 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1030 v3d_optimize_nir(c, c->s);
1031 NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1032
1033 /* This must go before nir_lower_io */
1034 if (c->gs_key->per_vertex_point_size)
1035 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1036
1037 NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1038 type_size_vec4,
1039 (nir_lower_io_options)0);
1040 /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1041 * on the code it generated.
1042 */
1043 NIR_PASS(_, c->s, nir_opt_dce);
1044 NIR_PASS(_, c->s, nir_opt_constant_folding);
1045 }
1046
1047 static void
1048 v3d_fixup_fs_output_types(struct v3d_compile *c)
1049 {
1050 nir_foreach_shader_out_variable(var, c->s) {
1051 uint32_t mask = 0;
1052
1053 switch (var->data.location) {
1054 case FRAG_RESULT_COLOR:
1055 mask = ~0;
1056 break;
1057 case FRAG_RESULT_DATA0:
1058 case FRAG_RESULT_DATA1:
1059 case FRAG_RESULT_DATA2:
1060 case FRAG_RESULT_DATA3:
1061 mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
1062 break;
1063 }
1064
1065 if (c->fs_key->int_color_rb & mask) {
1066 var->type =
1067 glsl_vector_type(GLSL_TYPE_INT,
1068 glsl_get_components(var->type));
1069 } else if (c->fs_key->uint_color_rb & mask) {
1070 var->type =
1071 glsl_vector_type(GLSL_TYPE_UINT,
1072 glsl_get_components(var->type));
1073 }
1074 }
1075 }
1076
1077 static void
1078 v3d_nir_lower_fs_early(struct v3d_compile *c)
1079 {
1080 if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
1081 v3d_fixup_fs_output_types(c);
1082
1083 NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);
1084
1085 if (c->fs_key->line_smoothing) {
1086 NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
1087 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1088 /* The lowering pass can introduce new sysval reads */
1089 nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
1090 }
1091 }
1092
1093 static void
1094 v3d_nir_lower_gs_late(struct v3d_compile *c)
1095 {
1096 if (c->key->ucp_enables) {
1097 NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
1098 false, NULL);
1099 }
1100
1101 /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
1102 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1103 }
1104
1105 static void
1106 v3d_nir_lower_vs_late(struct v3d_compile *c)
1107 {
1108 if (c->key->ucp_enables) {
1109 NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
1110 false, false, NULL);
1111 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1112 nir_var_shader_out, NULL, NULL);
1113 }
1114
1115 /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1116 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1117 }
1118
1119 static void
1120 v3d_nir_lower_fs_late(struct v3d_compile *c)
1121 {
1122 /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
1123 * Vulkan allows it, in which case the SPIR-V compiler will declare
1124 * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
1125 * the last parameter to always operate with a compact array in both
1126 * OpenGL and Vulkan so we don't have to care about the API we
1127 * are using.
1128 */
1129 if (c->key->ucp_enables)
1130 NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
1131
1132 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
1133 }
1134
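/* Returns the maximum number of temporaries simultaneously live at any
 * instruction, computed by accumulating per-instruction register pressure
 * from the temp_start/temp_end live ranges.
 */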
1135 static uint32_t
1136 vir_get_max_temps(struct v3d_compile *c)
1137 {
1138 int max_ip = 0;
1139 vir_for_each_inst_inorder(inst, c)
1140 max_ip++;
1141
1142 uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1143
1144 for (int t = 0; t < c->num_temps; t++) {
1145 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1146 i < max_ip); i++) {
1147 if (i > max_ip)
1148 break;
1149 pressure[i]++;
1150 }
1151 }
1152
1153 uint32_t max_temps = 0;
1154 for (int i = 0; i < max_ip; i++)
1155 max_temps = MAX2(max_temps, pressure[i]);
1156
1157 ralloc_free(pressure);
1158
1159 return max_temps;
1160 }
1161
1162 enum v3d_dependency_class {
1163 V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
1164 };
1165
1166 static bool
1167 v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
1168 nir_schedule_dependency *dep,
1169 void *user_data)
1170 {
1171 struct v3d_compile *c = user_data;
1172
1173 switch (intr->intrinsic) {
1174 case nir_intrinsic_store_output:
1175 /* Writing to location 0 overwrites the value passed in for
1176 * gl_PrimitiveID on geometry shaders
1177 */
1178 if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
1179 nir_intrinsic_base(intr) != 0)
1180 break;
1181
1182 nir_const_value *const_value =
1183 nir_src_as_const_value(intr->src[1]);
1184
1185 if (const_value == NULL)
1186 break;
1187
1188 uint64_t offset =
1189 nir_const_value_as_uint(*const_value,
1190 nir_src_bit_size(intr->src[1]));
1191 if (offset != 0)
1192 break;
1193
1194 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1195 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
1196 return true;
1197
1198 case nir_intrinsic_load_primitive_id:
1199 if (c->s->info.stage != MESA_SHADER_GEOMETRY)
1200 break;
1201
1202 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1203 dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
1204 return true;
1205
1206 default:
1207 break;
1208 }
1209
1210 return false;
1211 }
1212
1213 static unsigned
1214 v3d_instr_delay_cb(nir_instr *instr, void *data)
1215 {
1216 struct v3d_compile *c = (struct v3d_compile *) data;
1217
1218 switch (instr->type) {
1219 case nir_instr_type_undef:
1220 case nir_instr_type_load_const:
1221 case nir_instr_type_alu:
1222 case nir_instr_type_deref:
1223 case nir_instr_type_jump:
1224 case nir_instr_type_parallel_copy:
1225 case nir_instr_type_call:
1226 case nir_instr_type_phi:
1227 return 1;
1228
1229 /* We should not use very large delays for TMU instructions. Typically,
1230 * thread switches will be sufficient to hide all or most of the latency,
1231 * so we usually only need a little bit of extra room. If we over-estimate
1232 * the latency here we may end up unnecessarily delaying the critical path in
1233 * the shader, which would have a negative effect on performance, so here
1234 * we are trying to strike a balance based on empirical testing.
1235 */
1236 case nir_instr_type_intrinsic: {
1237 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1238 if (!c->disable_general_tmu_sched) {
1239 switch (intr->intrinsic) {
1240 case nir_intrinsic_decl_reg:
1241 case nir_intrinsic_load_reg:
1242 case nir_intrinsic_store_reg:
1243 return 0;
1244 case nir_intrinsic_load_ssbo:
1245 case nir_intrinsic_load_scratch:
1246 case nir_intrinsic_load_shared:
1247 case nir_intrinsic_image_load:
1248 return 3;
1249 case nir_intrinsic_load_ubo:
1250 if (nir_src_is_divergent(intr->src[1]))
1251 return 3;
1252 FALLTHROUGH;
1253 default:
1254 return 1;
1255 }
1256 } else {
1257 switch (intr->intrinsic) {
1258 case nir_intrinsic_decl_reg:
1259 case nir_intrinsic_load_reg:
1260 case nir_intrinsic_store_reg:
1261 return 0;
1262 default:
1263 return 1;
1264 }
1265 }
1266 break;
1267 }
1268
1269 case nir_instr_type_tex:
1270 return 5;
1271 }
1272
1273 return 0;
1274 }
1275
1276 static bool
1277 should_split_wrmask(const nir_instr *instr, const void *data)
1278 {
1279 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1280 switch (intr->intrinsic) {
1281 case nir_intrinsic_store_ssbo:
1282 case nir_intrinsic_store_shared:
1283 case nir_intrinsic_store_global:
1284 case nir_intrinsic_store_scratch:
1285 return true;
1286 default:
1287 return false;
1288 }
1289 }
1290
1291 static nir_intrinsic_instr *
1292 nir_instr_as_constant_ubo_load(nir_instr *inst)
1293 {
1294 if (inst->type != nir_instr_type_intrinsic)
1295 return NULL;
1296
1297 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1298 if (intr->intrinsic != nir_intrinsic_load_ubo)
1299 return NULL;
1300
1301 assert(nir_src_is_const(intr->src[0]));
1302 if (!nir_src_is_const(intr->src[1]))
1303 return NULL;
1304
1305 return intr;
1306 }
1307
1308 static bool
1309 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1310 {
1311 bool progress = false;
1312
1313 nir_instr *ref_inst = &ref->instr;
1314 uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1315 uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1316
1317 /* Go through all instructions after ref searching for constant UBO
1318 * loads for the same UBO index.
1319 */
1320 bool seq_break = false;
1321 nir_instr *inst = &ref->instr;
1322 nir_instr *next_inst = NULL;
1323 while (true) {
1324 inst = next_inst ? next_inst : nir_instr_next(inst);
1325 if (!inst)
1326 break;
1327
1328 next_inst = NULL;
1329
1330 if (inst->type != nir_instr_type_intrinsic)
1331 continue;
1332
1333 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1334 if (intr->intrinsic != nir_intrinsic_load_ubo)
1335 continue;
1336
1337 /* We only produce unifa sequences for non-divergent loads */
1338 if (nir_src_is_divergent(intr->src[1]))
1339 continue;
1340
1341 /* If there are any UBO loads that are not constant or that
1342 * use a different UBO index in between the reference load and
1343 * any other constant load for the same index, they would break
1344 * the unifa sequence. We will flag that so we can then move
1345 * all constant UBO loads for the reference index before these
1346 * and not just the ones that are not ordered to avoid breaking
1347 * the sequence and reduce unifa writes.
1348 */
1349 if (!nir_src_is_const(intr->src[1])) {
1350 seq_break = true;
1351 continue;
1352 }
1353 uint32_t offset = nir_src_as_uint(intr->src[1]);
1354
1355 assert(nir_src_is_const(intr->src[0]));
1356 uint32_t index = nir_src_as_uint(intr->src[0]);
1357 if (index != ref_index) {
1358 seq_break = true;
1359 continue;
1360 }
1361
1362 /* Only move loads with an offset that is close enough to the
1363 * reference offset, since otherwise we would not be able to
1364 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1365 */
1366 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
1367 continue;
1368
1369 /* We will move this load if its offset is smaller than ref's
1370 * (in which case we will move it before ref) or if the offset
1371 * is larger than ref's but there are sequence breakers
1372 * in between (in which case we will move it after ref and
1373 * before the sequence breakers).
1374 */
1375 if (!seq_break && offset >= ref_offset)
1376 continue;
1377
1378 /* Find where exactly we want to move this load:
1379 *
1380 * If we are moving it before ref, we want to check any other
1381 * UBO loads we placed before ref and make sure we insert this
1382 * one properly ordered with them. Likewise, if we are moving
1383 * it after ref.
1384 */
1385 nir_instr *pos = ref_inst;
1386 nir_instr *tmp = pos;
1387 do {
1388 if (offset < ref_offset)
1389 tmp = nir_instr_prev(tmp);
1390 else
1391 tmp = nir_instr_next(tmp);
1392
1393 if (!tmp || tmp == inst)
1394 break;
1395
1396 /* Ignore non-unifa UBO loads */
1397 if (tmp->type != nir_instr_type_intrinsic)
1398 continue;
1399
1400 nir_intrinsic_instr *tmp_intr =
1401 nir_instr_as_intrinsic(tmp);
1402 if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1403 continue;
1404
1405 if (nir_src_is_divergent(tmp_intr->src[1]))
1406 continue;
1407
1408 /* Stop if we find a unifa UBO load that breaks the
1409 * sequence.
1410 */
1411 if (!nir_src_is_const(tmp_intr->src[1]))
1412 break;
1413
1414 if (nir_src_as_uint(tmp_intr->src[0]) != index)
1415 break;
1416
1417 uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1418 if (offset < ref_offset) {
1419 if (tmp_offset < offset ||
1420 tmp_offset >= ref_offset) {
1421 break;
1422 } else {
1423 pos = tmp;
1424 }
1425 } else {
1426 if (tmp_offset > offset ||
1427 tmp_offset <= ref_offset) {
1428 break;
1429 } else {
1430 pos = tmp;
1431 }
1432 }
1433 } while (true);
1434
1435 /* We can't move the UBO load before the instruction that
1436 * defines its constant offset. If that instruction is placed
1437 * in between the new location (pos) and the current location
1438 * of this load, we will have to move that instruction too.
1439 *
1440 * We don't care about the UBO index definition because that
1441 * is optimized to be reused by all UBO loads for the same
1442 * index and therefore is certain to be defined before the
1443 * first UBO load that uses it.
1444 */
1445 nir_instr *offset_inst = NULL;
1446 tmp = inst;
1447 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1448 if (pos == tmp) {
1449 /* We reached the target location without
1450 * finding the instruction that defines the
1451 * offset, so that instruction must be before
1452 * the new position and we don't have to fix it.
1453 */
1454 break;
1455 }
1456 if (intr->src[1].ssa->parent_instr == tmp) {
1457 offset_inst = tmp;
1458 break;
1459 }
1460 }
1461
1462 if (offset_inst) {
1463 exec_node_remove(&offset_inst->node);
1464 exec_node_insert_node_before(&pos->node,
1465 &offset_inst->node);
1466 }
1467
1468 /* Since we are moving the instruction before its current
1469 * location, grab its successor before the move so that
1470 * we can continue the next iteration of the main loop from
1471 * that instruction.
1472 */
1473 next_inst = nir_instr_next(inst);
1474
1475 /* Move this load to the selected location */
1476 exec_node_remove(&inst->node);
1477 if (offset < ref_offset)
1478 exec_node_insert_node_before(&pos->node, &inst->node);
1479 else
1480 exec_node_insert_after(&pos->node, &inst->node);
1481
1482 progress = true;
1483 }
1484
1485 return progress;
1486 }
1487
1488 static bool
1489 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1490 nir_block *block)
1491 {
1492 bool progress = false;
1493 bool local_progress;
1494 do {
1495 local_progress = false;
1496 nir_foreach_instr_safe(inst, block) {
1497 nir_intrinsic_instr *intr =
1498 nir_instr_as_constant_ubo_load(inst);
1499 if (intr) {
1500 local_progress |=
1501 v3d_nir_sort_constant_ubo_load(block, intr);
1502 }
1503 }
1504 progress |= local_progress;
1505 } while (local_progress);
1506
1507 return progress;
1508 }
1509
1510 /**
1511 * Sorts constant UBO loads in each block by offset to maximize chances of
1512 * skipping unifa writes when converting to VIR. This can increase register
1513 * pressure.
1514 */
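/* Illustrative example: constant loads from the same UBO at nearby offsets
 * 32, 0 and 16 are reordered to 0, 16, 32 so that consecutive loads can
 * reuse the unifa pointer instead of writing it again.
 */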
1515 static bool
1516 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1517 {
1518 nir_foreach_function_impl(impl, s) {
1519 nir_foreach_block(block, impl) {
1520 c->sorted_any_ubo_loads |=
1521 v3d_nir_sort_constant_ubo_loads_block(c, block);
1522 }
1523 nir_metadata_preserve(impl,
1524 nir_metadata_block_index |
1525 nir_metadata_dominance);
1526 }
1527 return c->sorted_any_ubo_loads;
1528 }
1529
1530 static void
1531 lower_load_num_subgroups(struct v3d_compile *c,
1532 nir_builder *b,
1533 nir_intrinsic_instr *intr)
1534 {
1535 assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1536 assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1537
1538 b->cursor = nir_after_instr(&intr->instr);
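        /* One subgroup per V3D_CHANNELS invocations, rounded up: e.g. with
         * V3D_CHANNELS = 16, an 8x8x1 workgroup yields
         * DIV_ROUND_UP(64, 16) = 4 subgroups.
         */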
1539 uint32_t num_subgroups =
1540 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1541 c->s->info.workgroup_size[1] *
1542 c->s->info.workgroup_size[2], V3D_CHANNELS);
1543 nir_def *result = nir_imm_int(b, num_subgroups);
1544 nir_def_rewrite_uses(&intr->def, result);
1545 nir_instr_remove(&intr->instr);
1546 }
1547
1548 static bool
1549 lower_subgroup_intrinsics(struct v3d_compile *c,
1550 nir_block *block, nir_builder *b)
1551 {
1552 bool progress = false;
1553 nir_foreach_instr_safe(inst, block) {
1554 if (inst->type != nir_instr_type_intrinsic)
1555 continue;
1556
1557 nir_intrinsic_instr *intr =
1558 nir_instr_as_intrinsic(inst);
1559 if (!intr)
1560 continue;
1561
1562 switch (intr->intrinsic) {
1563 case nir_intrinsic_load_num_subgroups:
1564 lower_load_num_subgroups(c, b, intr);
1565 progress = true;
1566 FALLTHROUGH;
1567 case nir_intrinsic_load_subgroup_id:
1568 case nir_intrinsic_load_subgroup_size:
1569 case nir_intrinsic_load_subgroup_invocation:
1570 case nir_intrinsic_elect:
1571 case nir_intrinsic_ballot:
1572 case nir_intrinsic_inverse_ballot:
1573 case nir_intrinsic_ballot_bitfield_extract:
1574 case nir_intrinsic_ballot_bit_count_reduce:
1575 case nir_intrinsic_ballot_find_lsb:
1576 case nir_intrinsic_ballot_find_msb:
1577 case nir_intrinsic_ballot_bit_count_exclusive:
1578 case nir_intrinsic_ballot_bit_count_inclusive:
1579 case nir_intrinsic_reduce:
1580 case nir_intrinsic_inclusive_scan:
1581 case nir_intrinsic_exclusive_scan:
1582 case nir_intrinsic_read_invocation:
1583 case nir_intrinsic_read_first_invocation:
1584 case nir_intrinsic_load_subgroup_eq_mask:
1585 case nir_intrinsic_load_subgroup_ge_mask:
1586 case nir_intrinsic_load_subgroup_gt_mask:
1587 case nir_intrinsic_load_subgroup_le_mask:
1588 case nir_intrinsic_load_subgroup_lt_mask:
1589 case nir_intrinsic_shuffle:
1590 case nir_intrinsic_shuffle_xor:
1591 case nir_intrinsic_shuffle_up:
1592 case nir_intrinsic_shuffle_down:
1593 case nir_intrinsic_vote_all:
1594 case nir_intrinsic_vote_any:
1595 case nir_intrinsic_vote_feq:
1596 case nir_intrinsic_vote_ieq:
1597 case nir_intrinsic_quad_broadcast:
1598 case nir_intrinsic_quad_swap_horizontal:
1599 case nir_intrinsic_quad_swap_vertical:
1600 case nir_intrinsic_quad_swap_diagonal:
1601 c->has_subgroups = true;
1602 break;
1603 default:
1604 break;
1605 }
1606 }
1607
1608 return progress;
1609 }
1610
1611 static bool
1612 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1613 {
1614 bool progress = false;
1615 nir_foreach_function_impl(impl, s) {
1616 nir_builder b = nir_builder_create(impl);
1617
1618 nir_foreach_block(block, impl)
1619 progress |= lower_subgroup_intrinsics(c, block, &b);
1620
1621 nir_metadata_preserve(impl,
1622 nir_metadata_block_index |
1623 nir_metadata_dominance);
1624 }
1625 return progress;
1626 }
1627
1628 static void
1629 v3d_attempt_compile(struct v3d_compile *c)
1630 {
1631 switch (c->s->info.stage) {
1632 case MESA_SHADER_VERTEX:
1633 c->vs_key = (struct v3d_vs_key *) c->key;
1634 break;
1635 case MESA_SHADER_GEOMETRY:
1636 c->gs_key = (struct v3d_gs_key *) c->key;
1637 break;
1638 case MESA_SHADER_FRAGMENT:
1639 c->fs_key = (struct v3d_fs_key *) c->key;
1640 break;
1641 case MESA_SHADER_COMPUTE:
1642 break;
1643 default:
1644 unreachable("unsupported shader stage");
1645 }
1646
1647 switch (c->s->info.stage) {
1648 case MESA_SHADER_VERTEX:
1649 v3d_nir_lower_vs_early(c);
1650 break;
1651 case MESA_SHADER_GEOMETRY:
1652 v3d_nir_lower_gs_early(c);
1653 break;
1654 case MESA_SHADER_FRAGMENT:
1655 v3d_nir_lower_fs_early(c);
1656 break;
1657 default:
1658 break;
1659 }
1660
1661 v3d_lower_nir(c);
1662
1663 switch (c->s->info.stage) {
1664 case MESA_SHADER_VERTEX:
1665 v3d_nir_lower_vs_late(c);
1666 break;
1667 case MESA_SHADER_GEOMETRY:
1668 v3d_nir_lower_gs_late(c);
1669 break;
1670 case MESA_SHADER_FRAGMENT:
1671 v3d_nir_lower_fs_late(c);
1672 break;
1673 default:
1674 break;
1675 }
1676
1677 NIR_PASS(_, c->s, v3d_nir_lower_io, c);
1678 NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
1679 NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
1680
1681 NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
1682 nir_lower_idiv_options idiv_options = {
1683 .allow_fp16 = true,
1684 };
1685 NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
1686 NIR_PASS(_, c->s, nir_lower_alu);
1687
1688 if (c->key->robust_uniform_access || c->key->robust_storage_access ||
1689 c->key->robust_image_access) {
1690 /* nir_lower_robust_access assumes constant buffer
1691 * indices on ubo/ssbo intrinsics so run copy propagation and
1692 * constant folding passes before we run the lowering to warrant
1693 * this. We also want to run the lowering before v3d_optimize to
1694 * clean-up redundant get_buffer_size calls produced in the pass.
1695 */
1696 NIR_PASS(_, c->s, nir_copy_prop);
1697 NIR_PASS(_, c->s, nir_opt_constant_folding);
1698
1699 nir_lower_robust_access_options opts = {
1700 .lower_image = c->key->robust_image_access,
1701 .lower_ssbo = c->key->robust_storage_access,
1702 .lower_ubo = c->key->robust_uniform_access,
1703 };
1704
1705 NIR_PASS(_, c->s, nir_lower_robust_access, &opts);
1706 }
1707
1708 NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1709
1710 NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
1711
1712 NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
1713
1714 const nir_lower_subgroups_options subgroup_opts = {
1715 .subgroup_size = V3D_CHANNELS,
1716 .ballot_components = 1,
1717 .ballot_bit_size = 32,
1718 .lower_to_scalar = true,
1719 .lower_inverse_ballot = true,
1720 .lower_subgroup_masks = true,
1721 .lower_relative_shuffle = true,
1722 .lower_quad = true,
1723 };
1724 NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
1725
1726 v3d_optimize_nir(c, c->s);
1727
1728 /* Do late algebraic optimization to turn add(a, neg(b)) back into
1729 * subs, then the mandatory cleanup after algebraic. Note that it may
1730 * produce fnegs, and if so then we need to keep running to squash
1731 * fneg(fneg(a)).
1732 */
1733 bool more_late_algebraic = true;
1734 while (more_late_algebraic) {
1735 more_late_algebraic = false;
1736 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
1737 NIR_PASS(_, c->s, nir_opt_constant_folding);
1738 NIR_PASS(_, c->s, nir_copy_prop);
1739 NIR_PASS(_, c->s, nir_opt_dce);
1740 NIR_PASS(_, c->s, nir_opt_cse);
1741 }
1742
1743 NIR_PASS(_, c->s, nir_lower_bool_to_int32);
1744 NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
1745 NIR_PASS_V(c->s, nir_divergence_analysis);
1746 NIR_PASS(_, c->s, nir_convert_from_ssa, true);
1747
1748 struct nir_schedule_options schedule_options = {
1749 /* Schedule for about half our register space, to enable more
1750 * shaders to hit 4 threads.
1751 */
1752 .threshold = c->threads == 4 ? 24 : 48,
1753
1754 /* Vertex shaders share the same memory for inputs and outputs,
1755 * fragment and geometry shaders do not.
1756 */
1757 .stages_with_shared_io_memory =
1758 (((1 << MESA_ALL_SHADER_STAGES) - 1) &
1759 ~((1 << MESA_SHADER_FRAGMENT) |
1760 (1 << MESA_SHADER_GEOMETRY))),
1761
1762 .fallback = c->fallback_scheduler,
1763
1764 .intrinsic_cb = v3d_intrinsic_dependency_cb,
1765 .intrinsic_cb_data = c,
1766
1767 .instr_delay_cb = v3d_instr_delay_cb,
1768 .instr_delay_cb_data = c,
1769 };
1770 NIR_PASS_V(c->s, nir_schedule, &schedule_options);
1771
1772 if (!c->disable_constant_ubo_load_sorting)
1773 NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
1774
1775 const nir_move_options buffer_opts = c->move_buffer_loads ?
1776 (nir_move_load_ubo | nir_move_load_ssbo) : 0;
1777 NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
1778 nir_move_const_undef |
1779 buffer_opts);
1780
1781 NIR_PASS_V(c->s, nir_trivialize_registers);
1782
1783 v3d_nir_to_vir(c);
1784 }
1785
1786 uint32_t
1787 v3d_prog_data_size(gl_shader_stage stage)
1788 {
1789 static const int prog_data_size[] = {
1790 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
1791 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
1792 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
1793 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
1794 };
1795
1796 assert(stage >= 0 &&
1797 stage < ARRAY_SIZE(prog_data_size) &&
1798 prog_data_size[stage]);
1799
1800 return prog_data_size[stage];
1801 }
1802
1803 int v3d_shaderdb_dump(struct v3d_compile *c,
1804 char **shaderdb_str)
1805 {
1806 if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
1807 return -1;
1808
1809 return asprintf(shaderdb_str,
1810 "%s shader: %d inst, %d threads, %d loops, "
1811 "%d uniforms, %d max-temps, %d:%d spills:fills, "
1812 "%d sfu-stalls, %d inst-and-stalls, %d nops",
1813 vir_get_stage_name(c),
1814 c->qpu_inst_count,
1815 c->threads,
1816 c->loops,
1817 c->num_uniforms,
1818 vir_get_max_temps(c),
1819 c->spills,
1820 c->fills,
1821 c->qpu_inst_stalled_count,
1822 c->qpu_inst_count + c->qpu_inst_stalled_count,
1823 c->nop_count);
1824 }
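
/* Example of the string this builds, with purely illustrative numbers:
 *
 *   MESA_SHADER_FRAGMENT shader: 120 inst, 4 threads, 0 loops, 12 uniforms,
 *   23 max-temps, 0:0 spills:fills, 2 sfu-stalls, 122 inst-and-stalls, 5 nops
 *
 * where "inst-and-stalls" is qpu_inst_count plus qpu_inst_stalled_count.
 */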
1825
1826 /* This is a list of incremental changes to the compilation strategy
1827 * that will be used to try to compile the shader successfully. The
1828  * default strategy enables all optimizations, which gives the highest
1829  * register pressure but is expected to produce the best code.
1830  * Subsequent strategies incrementally disable specific optimizations
1831  * that are known to contribute to register pressure, so that we can
1832  * compile the shader successfully while still meeting thread count
1833  * requirements.
1834  *
1835  * V3D 4.1+ has a minimum thread count of 2, but we can use 1 here to
1836  * also cover previous hardware (meaning that we are not limiting
1837 * register allocation to any particular thread count). This is fine
1838 * because v3d_nir_to_vir will cap this to the actual minimum.
1839 */
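/* For readability: the positional initializers below are assumed to map to
 * the fields of struct v3d_compiler_strategy (see v3d_compiler.h) in this
 * order: name, max_threads, min_threads, disable_general_tmu_sched,
 * disable_gcm, disable_loop_unrolling, disable_ubo_load_sorting,
 * move_buffer_loads, disable_tmu_pipelining, max_tmu_spills.
 */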
1840 static const struct v3d_compiler_strategy strategies[] = {
1841 /*0*/ { "default", 4, 4, false, false, false, false, false, false, 0 },
1842 /*1*/ { "disable general TMU sched", 4, 4, true, false, false, false, false, false, 0 },
1843 /*2*/ { "disable gcm", 4, 4, true, true, false, false, false, false, 0 },
1844 /*3*/ { "disable loop unrolling", 4, 4, true, true, true, false, false, false, 0 },
1845 /*4*/ { "disable UBO load sorting", 4, 4, true, true, true, true, false, false, 0 },
1846 /*5*/ { "disable TMU pipelining", 4, 4, true, true, true, true, false, true, 0 },
1847 /*6*/ { "lower thread count", 2, 1, false, false, false, false, false, false, -1 },
1848 /*7*/ { "disable general TMU sched (2t)", 2, 1, true, false, false, false, false, false, -1 },
1849 /*8*/ { "disable gcm (2t)", 2, 1, true, true, false, false, false, false, -1 },
1850 /*9*/ { "disable loop unrolling (2t)", 2, 1, true, true, true, false, false, false, -1 },
1851 /*10*/ { "Move buffer loads (2t)", 2, 1, true, true, true, true, true, false, -1 },
1852 /*11*/ { "disable TMU pipelining (2t)", 2, 1, true, true, true, true, true, true, -1 },
1853 /*12*/ { "fallback scheduler", 2, 1, true, true, true, true, true, true, -1 }
1854 };
1855
1856 /**
1857  * If a particular optimization didn't make any progress during a compile
1858  * attempt, disabling it alone won't allow us to compile the shader
1859  * successfully, since we'll end up with the same code. Detect these
1860  * scenarios so we can avoid wasting time with useless compiles. We should
1861  * still consider whether the strategy changes other aspects of the
1862  * compilation process, like spilling, and not skip it in that case.
1863 */
1864 static bool
1865 skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
1866 {
1867 /* We decide if we can skip a strategy based on the optimizations that
1868 * were active in the previous strategy, so we should only be calling this
1869 * for strategies after the first.
1870 */
1871 assert(idx > 0);
1872
1873 /* Don't skip a strategy that changes spilling behavior */
1874 if (strategies[idx].max_tmu_spills !=
1875 strategies[idx - 1].max_tmu_spills) {
1876 return false;
1877 }
1878
1879 switch (idx) {
1880 /* General TMU sched.: skip if we didn't emit any TMU loads */
1881 case 1:
1882 case 7:
1883 return !c->has_general_tmu_load;
1884 /* Global code motion: skip if nir_opt_gcm didn't make any progress */
1885 case 2:
1886 case 8:
1887 return !c->gcm_progress;
1888 /* Loop unrolling: skip if we didn't unroll any loops */
1889 case 3:
1890 case 9:
1891 return !c->unrolled_any_loops;
1892 /* UBO load sorting: skip if we didn't sort any loads */
1893 case 4:
1894 return !c->sorted_any_ubo_loads;
1895 /* Move buffer loads: we assume any shader with difficult RA
1896          * most likely has UBO / SSBO loads, so we never skip this strategy.
1897 * For now, we only try this for 2-thread compiles since it
1898 * is expected to impact instruction counts and latency.
1899 */
1900 case 10:
1901 assert(c->threads < 4);
1902 return false;
1903 /* TMU pipelining: skip if we didn't pipeline any TMU ops */
1904 case 5:
1905 case 11:
1906 return !c->pipelined_any_tmu;
1907         /* Lower thread count: skip if we already tried fewer than 4 threads */
1908 case 6:
1909 return c->threads < 4;
1910 default:
1911 return false;
1912 };
1913 }
1914
1915 static inline void
1916 set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
1917 {
1918 if (*best)
1919 vir_compile_destroy(*best);
1920 *best = c;
1921 }
1922
1923 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
1924 struct v3d_key *key,
1925 struct v3d_prog_data **out_prog_data,
1926 nir_shader *s,
1927 void (*debug_output)(const char *msg,
1928 void *debug_output_data),
1929 void *debug_output_data,
1930 int program_id, int variant_id,
1931 uint32_t *final_assembly_size)
1932 {
1933 struct v3d_compile *c = NULL;
1934
1935 uint32_t best_spill_fill_count = UINT32_MAX;
1936 struct v3d_compile *best_c = NULL;
1937 for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
1938 /* Fallback strategy */
1939 if (strat > 0) {
1940 assert(c);
1941 if (skip_compile_strategy(c, strat))
1942 continue;
1943
1944 char *debug_msg;
1945 int ret = asprintf(&debug_msg,
1946 "Falling back to strategy '%s' "
1947 "for %s prog %d/%d",
1948 strategies[strat].name,
1949 vir_get_stage_name(c),
1950 c->program_id, c->variant_id);
1951
1952 if (ret >= 0) {
1953 if (V3D_DBG(PERF))
1954 fprintf(stderr, "%s\n", debug_msg);
1955
1956 c->debug_output(debug_msg, c->debug_output_data);
1957 free(debug_msg);
1958 }
1959
1960 if (c != best_c)
1961 vir_compile_destroy(c);
1962 }
1963
1964 c = vir_compile_init(compiler, key, s,
1965 debug_output, debug_output_data,
1966 program_id, variant_id,
1967 strat, &strategies[strat],
1968 strat == ARRAY_SIZE(strategies) - 1);
1969
1970 v3d_attempt_compile(c);
1971
1972 /* Broken shader or driver bug */
1973 if (c->compilation_result == V3D_COMPILATION_FAILED)
1974 break;
1975
1976 /* If we compiled without spills, choose this.
1977 * Otherwise if this is a 4-thread compile, choose this (these
1978 * have a very low cap on the allowed TMU spills so we assume
1979 * it will be better than a 2-thread compile without spills).
1980 * Otherwise, keep going while tracking the strategy with the
1981 * lowest spill count.
1982 */
1983 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
1984 if (c->spills == 0 ||
1985 strategies[strat].min_threads == 4 ||
1986 V3D_DBG(OPT_COMPILE_TIME)) {
1987 set_best_compile(&best_c, c);
1988 break;
1989 } else if (c->spills + c->fills <
1990 best_spill_fill_count) {
1991 set_best_compile(&best_c, c);
1992 best_spill_fill_count = c->spills + c->fills;
1993 }
1994
1995 if (V3D_DBG(PERF)) {
1996 char *debug_msg;
1997 int ret = asprintf(&debug_msg,
1998 "Compiled %s prog %d/%d with %d "
1999 "spills and %d fills. Will try "
2000 "more strategies.",
2001 vir_get_stage_name(c),
2002 c->program_id, c->variant_id,
2003 c->spills, c->fills);
2004 if (ret >= 0) {
2005 fprintf(stderr, "%s\n", debug_msg);
2006 c->debug_output(debug_msg, c->debug_output_data);
2007 free(debug_msg);
2008 }
2009 }
2010 }
2011
2012                 /* Only try the next strategy if we failed to register
2013                  * allocate or we had to spill.
2014 */
2015 assert(c->compilation_result ==
2016 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
2017 c->spills > 0);
2018 }
2019
2020 /* If the best strategy was not the last, choose that */
2021 if (best_c && c != best_c)
2022 set_best_compile(&c, best_c);
2023
2024 if (V3D_DBG(PERF) &&
2025 c->compilation_result !=
2026 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
2027 c->spills > 0) {
2028 char *debug_msg;
2029 int ret = asprintf(&debug_msg,
2030 "Compiled %s prog %d/%d with %d "
2031 "spills and %d fills",
2032 vir_get_stage_name(c),
2033 c->program_id, c->variant_id,
2034 c->spills, c->fills);
2035 
2036                 if (ret >= 0) {
2037                         fprintf(stderr, "%s\n", debug_msg);
2038 c->debug_output(debug_msg, c->debug_output_data);
2039 free(debug_msg);
2040 }
2041 }
2042
2043 if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
2044 fprintf(stderr, "Failed to compile %s prog %d/%d "
2045 "with any strategy.\n",
2046 vir_get_stage_name(c), c->program_id, c->variant_id);
2047
2048 vir_compile_destroy(c);
2049 return NULL;
2050 }
2051
2052 struct v3d_prog_data *prog_data;
2053
2054 prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
2055
2056 v3d_set_prog_data(c, prog_data);
2057
2058 *out_prog_data = prog_data;
2059
2060 char *shaderdb;
2061 int ret = v3d_shaderdb_dump(c, &shaderdb);
2062 if (ret >= 0) {
2063 if (V3D_DBG(SHADERDB))
2064 fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
2065
2066 c->debug_output(shaderdb, c->debug_output_data);
2067 free(shaderdb);
2068 }
2069
2070 return v3d_return_qpu_insts(c, final_assembly_size);
2071 }
2072
2073 void
2074 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
2075 {
2076 if (qinst->dst.file == QFILE_TEMP)
2077 c->defs[qinst->dst.index] = NULL;
2078
2079 assert(&qinst->link != c->cursor.link);
2080
2081 list_del(&qinst->link);
2082 free(qinst);
2083
2084 c->live_intervals_valid = false;
2085 }
2086
2087 struct qreg
2088 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
2089 {
2090 /* XXX
2091 int pack = reg.pack;
2092
2093 while (reg.file == QFILE_TEMP &&
2094 c->defs[reg.index] &&
2095 (c->defs[reg.index]->op == QOP_MOV ||
2096 c->defs[reg.index]->op == QOP_FMOV) &&
2097 !c->defs[reg.index]->dst.pack &&
2098 !c->defs[reg.index]->src[0].pack) {
2099 reg = c->defs[reg.index]->src[0];
2100 }
2101
2102 reg.pack = pack;
2103 */
2104 return reg;
2105 }
2106
2107 void
2108 vir_compile_destroy(struct v3d_compile *c)
2109 {
2110 /* Defuse the assert that we aren't removing the cursor's instruction.
2111 */
2112 c->cursor.link = NULL;
2113
2114 vir_for_each_block(block, c) {
2115 while (!list_is_empty(&block->instructions)) {
2116 struct qinst *qinst =
2117 list_first_entry(&block->instructions,
2118 struct qinst, link);
2119 vir_remove_instruction(c, qinst);
2120 }
2121 }
2122
2123 ralloc_free(c);
2124 }
2125
2126 uint32_t
2127 vir_get_uniform_index(struct v3d_compile *c,
2128 enum quniform_contents contents,
2129 uint32_t data)
2130 {
2131 for (int i = 0; i < c->num_uniforms; i++) {
2132 if (c->uniform_contents[i] == contents &&
2133 c->uniform_data[i] == data) {
2134 return i;
2135 }
2136 }
2137
2138 uint32_t uniform = c->num_uniforms++;
2139
2140 if (uniform >= c->uniform_array_size) {
2141 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
2142 c->uniform_array_size * 2);
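                /* Growth sketch: starting from an empty array this yields
                 * sizes 16, 32, 64, ... as uniforms are added, so the
                 * rerallocs below are hit only O(log n) times.
                 */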
2143
2144 c->uniform_data = reralloc(c, c->uniform_data,
2145 uint32_t,
2146 c->uniform_array_size);
2147 c->uniform_contents = reralloc(c, c->uniform_contents,
2148 enum quniform_contents,
2149 c->uniform_array_size);
2150 }
2151
2152 c->uniform_contents[uniform] = contents;
2153 c->uniform_data[uniform] = data;
2154
2155 return uniform;
2156 }
2157
2158 /* Looks back into the current block to find the ldunif that wrote the uniform
2159 * at the requested index. If it finds it, it returns true and writes the
2160 * destination register of the ldunif instruction to 'unif'.
2161 *
2162 * This can impact register pressure and end up leading to worse code, so we
2163 * limit the number of instructions we are willing to look back through to
2164 * strike a good balance.
2165 */
2166 static bool
2167 try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
2168 {
2169 uint32_t count = 20;
2170 struct qinst *prev_inst = NULL;
2171 assert(c->cur_block);
2172
2173 #ifdef DEBUG
2174 /* We can only reuse a uniform if it was emitted in the same block,
2175 * so callers must make sure the current instruction is being emitted
2176 * in the current block.
2177 */
2178 bool found = false;
2179 vir_for_each_inst(inst, c->cur_block) {
2180 if (&inst->link == c->cursor.link) {
2181 found = true;
2182 break;
2183 }
2184 }
2185
2186 assert(found || &c->cur_block->instructions == c->cursor.link);
2187 #endif
2188
2189 list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
2190 &c->cur_block->instructions, link) {
2191 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
2192 inst->uniform == index) {
2193 prev_inst = inst;
2194 break;
2195 }
2196
2197 if (--count == 0)
2198 break;
2199 }
2200
2201 if (!prev_inst)
2202 return false;
2203
2204 /* Only reuse the ldunif result if it was written to a temp register,
2205 * otherwise there may be special restrictions (for example, ldunif
2206 * may write directly to unifa, which is a write-only register).
2207 */
2208 if (prev_inst->dst.file != QFILE_TEMP)
2209 return false;
2210
2211 list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
2212 &c->cur_block->instructions, link) {
2213 if (inst->dst.file == prev_inst->dst.file &&
2214 inst->dst.index == prev_inst->dst.index) {
2215 return false;
2216 }
2217 }
2218
2219 *unif = prev_inst->dst;
2220 return true;
2221 }
2222
2223 struct qreg
2224 vir_uniform(struct v3d_compile *c,
2225 enum quniform_contents contents,
2226 uint32_t data)
2227 {
2228 const int num_uniforms = c->num_uniforms;
2229 const int index = vir_get_uniform_index(c, contents, data);
2230
2231 /* If this is not the first time we see this uniform try to reuse the
2232 * result of the last ldunif that loaded it.
2233 */
2234 const bool is_new_uniform = num_uniforms != c->num_uniforms;
2235 if (!is_new_uniform && !c->disable_ldunif_opt) {
2236 struct qreg ldunif_dst;
2237 if (try_opt_ldunif(c, index, &ldunif_dst))
2238 return ldunif_dst;
2239 }
2240
2241 struct qinst *inst = vir_NOP(c);
2242 inst->qpu.sig.ldunif = true;
2243 inst->uniform = index;
2244 inst->dst = vir_get_temp(c);
2245 c->defs[inst->dst.index] = inst;
2246 return inst->dst;
2247 }
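
/* Hypothetical usage sketch (the variable name is illustrative, but
 * QUNIFORM_CONSTANT is one of the quniform_contents values): loading an
 * immediate through the uniform stream looks like
 *
 *     struct qreg four = vir_uniform(c, QUNIFORM_CONSTANT, 4);
 *
 * and a second call with the same arguments in the same block may simply
 * reuse the destination of the earlier ldunif thanks to try_opt_ldunif().
 */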
2248
2249 #define OPTPASS(func) \
2250 do { \
2251 bool stage_progress = func(c); \
2252 if (stage_progress) { \
2253 progress = true; \
2254 if (print_opt_debug) { \
2255 fprintf(stderr, \
2256 "VIR opt pass %2d: %s progress\n", \
2257 pass, #func); \
2258 } \
2259 /*XXX vir_validate(c);*/ \
2260 } \
2261 } while (0)
2262
2263 void
2264 vir_optimize(struct v3d_compile *c)
2265 {
2266 bool print_opt_debug = false;
2267 int pass = 1;
2268
2269 while (true) {
2270 bool progress = false;
2271
2272 OPTPASS(vir_opt_copy_propagate);
2273 OPTPASS(vir_opt_redundant_flags);
2274 OPTPASS(vir_opt_dead_code);
2275 OPTPASS(vir_opt_small_immediates);
2276 OPTPASS(vir_opt_constant_alu);
2277
2278 if (!progress)
2279 break;
2280
2281 pass++;
2282 }
2283 }
2284
2285 const char *
2286 vir_get_stage_name(struct v3d_compile *c)
2287 {
2288 if (c->vs_key && c->vs_key->is_coord)
2289 return "MESA_SHADER_VERTEX_BIN";
2290 else if (c->gs_key && c->gs_key->is_coord)
2291 return "MESA_SHADER_GEOMETRY_BIN";
2292 else
2293 return gl_shader_stage_name(c->s->info.stage);
2294 }
2295
2296 static inline uint32_t
2297 compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
2298 {
2299 assert(devinfo->vpm_size > 0);
2300 const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
2301 return devinfo->vpm_size / sector_size;
2302 }
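
/* Worked example (a sketch, assuming V3D_CHANNELS is 16): the sector size is
 * 16 * 4 * 8 = 512 bytes, so a hypothetical 16 KB VPM corresponds to
 * 16384 / 512 = 32 sectors.
 */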
2303
2304 /* Computes various parameters affecting VPM memory configuration for programs
2305 * involving geometry shaders to ensure the program fits in memory and honors
2306 * requirements described in section "VPM usage" of the programming manual.
2307 */
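/* Roughly, the single-letter parameters used below come straight from the
 * prog data: A is 1 when the VS uses separate input/output segments, Ad and
 * Vd are the VS VPM input/output sizes, Gd is the GS VPM output size and
 * gs_width is the GS dispatch width; As, Gs, Vc, Ve and Gv are the values
 * this function chooses to fit the memory budget.
 */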
2308 static bool
2309 compute_vpm_config_gs(struct v3d_device_info *devinfo,
2310 struct v3d_vs_prog_data *vs,
2311 struct v3d_gs_prog_data *gs,
2312 struct vpm_config *vpm_cfg_out)
2313 {
2314 const uint32_t A = vs->separate_segments ? 1 : 0;
2315 const uint32_t Ad = vs->vpm_input_size;
2316 const uint32_t Vd = vs->vpm_output_size;
2317
2318 const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
2319
2320 /* Try to fit program into our VPM memory budget by adjusting
2321 * configurable parameters iteratively. We do this in two phases:
2322 * the first phase tries to fit the program into the total available
2323 * VPM memory. If we succeed at that, then the second phase attempts
2324 * to fit the program into half of that budget so we can run bin and
2325 * render programs in parallel.
2326 */
2327 struct vpm_config vpm_cfg[2];
2328 struct vpm_config *final_vpm_cfg = NULL;
2329 uint32_t phase = 0;
2330
2331 vpm_cfg[phase].As = 1;
2332 vpm_cfg[phase].Gs = 1;
2333 vpm_cfg[phase].Gd = gs->vpm_output_size;
2334 vpm_cfg[phase].gs_width = gs->simd_width;
2335
2336 /* While there is a requirement that Vc >= [Vn / 16], this is
2337 * always the case when tessellation is not present because in that
2338 * case Vn can only be 6 at most (when input primitive is triangles
2339 * with adjacency).
2340 *
2341 * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
2342 * and Broadcom has not found it worth it to increase it beyond this
2343          * in general. Increasing Vc also increases VPM memory pressure, which
2344          * can end up being detrimental to performance in some scenarios.
2345 */
2346 vpm_cfg[phase].Vc = 2;
2347
2348 /* Gv is a constraint on the hardware to not exceed the
2349 * specified number of vertex segments per GS batch. If adding a
2350 * new primitive to a GS batch would result in a range of more
2351 * than Gv vertex segments being referenced by the batch, then
2352 * the hardware will flush the batch and start a new one. This
2353 * means that we can choose any value we want, we just need to
2354 * be aware that larger values improve GS batch utilization
2355 * at the expense of more VPM memory pressure (which can affect
2356 * other performance aspects, such as GS dispatch width).
2357 * We start with the largest value, and will reduce it if we
2358 * find that total memory pressure is too high.
2359 */
2360 vpm_cfg[phase].Gv = 3;
2361 do {
2362 /* When GS is present in absence of TES, then we need to satisfy
2363 * that Ve >= Gv. We go with the smallest value of Ve to avoid
2364 * increasing memory pressure.
2365 */
2366 vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
2367
2368 uint32_t vpm_sectors =
2369 A * vpm_cfg[phase].As * Ad +
2370 (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
2371 vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
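
                /* Illustrative numbers only: with A=1, As=1, Ad=4, Vd=8,
                 * Gs=1, Gd=8, Vc=2 and Ve=Gv=3 this evaluates to
                 * 1*1*4 + (2+3)*8 + 1*8 = 52 sectors.
                 */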
2372
2373 /* Ideally we want to use no more than half of the available
2374 * memory so we can execute a bin and render program in parallel
2375 * without stalls. If we achieved that then we are done.
2376 */
2377 if (vpm_sectors <= vpm_size / 2) {
2378 final_vpm_cfg = &vpm_cfg[phase];
2379 break;
2380 }
2381
2382 /* At the very least, we should not allocate more than the
2383 * total available VPM memory. If we have a configuration that
2384 * succeeds at this we save it and continue to see if we can
2385 * meet the half-memory-use criteria too.
2386 */
2387 if (phase == 0 && vpm_sectors <= vpm_size) {
2388 vpm_cfg[1] = vpm_cfg[0];
2389 phase = 1;
2390 }
2391
2392 /* Try lowering Gv */
2393 if (vpm_cfg[phase].Gv > 0) {
2394 vpm_cfg[phase].Gv--;
2395 continue;
2396 }
2397
2398 /* Try lowering GS dispatch width */
2399 if (vpm_cfg[phase].gs_width > 1) {
2400 do {
2401 vpm_cfg[phase].gs_width >>= 1;
2402 vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
2403 } while (vpm_cfg[phase].gs_width == 2);
2404
2405 /* Reset Gv to max after dropping dispatch width */
2406 vpm_cfg[phase].Gv = 3;
2407 continue;
2408 }
2409
2410 /* We ran out of options to reduce memory pressure. If we
2411                  * are at phase 1 we have at least a valid configuration, so
2412                  * we use that.
2413 */
2414 if (phase == 1)
2415 final_vpm_cfg = &vpm_cfg[0];
2416 break;
2417 } while (true);
2418
2419 if (!final_vpm_cfg)
2420 return false;
2421
2422 assert(final_vpm_cfg);
2423 assert(final_vpm_cfg->Gd <= 16);
2424 assert(final_vpm_cfg->Gv < 4);
2425 assert(final_vpm_cfg->Ve < 4);
2426 assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
2427 assert(final_vpm_cfg->gs_width == 1 ||
2428 final_vpm_cfg->gs_width == 4 ||
2429 final_vpm_cfg->gs_width == 8 ||
2430 final_vpm_cfg->gs_width == 16);
2431
2432 *vpm_cfg_out = *final_vpm_cfg;
2433 return true;
2434 }
2435
2436 bool
2437 v3d_compute_vpm_config(struct v3d_device_info *devinfo,
2438 struct v3d_vs_prog_data *vs_bin,
2439 struct v3d_vs_prog_data *vs,
2440 struct v3d_gs_prog_data *gs_bin,
2441 struct v3d_gs_prog_data *gs,
2442 struct vpm_config *vpm_cfg_bin,
2443 struct vpm_config *vpm_cfg)
2444 {
2445 assert(vs && vs_bin);
2446 assert((gs != NULL) == (gs_bin != NULL));
2447
2448 if (!gs) {
2449 vpm_cfg_bin->As = 1;
2450 vpm_cfg_bin->Ve = 0;
2451 vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
2452
2453 vpm_cfg->As = 1;
2454 vpm_cfg->Ve = 0;
2455 vpm_cfg->Vc = vs->vcm_cache_size;
2456 } else {
2457 if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
2458 return false;
2459
2460 if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
2461 return false;
2462 }
2463
2464 return true;
2465 }
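
/* A possible driver-side usage sketch (all names here are illustrative, not
 * part of this file):
 *
 *     struct vpm_config vpm_cfg_bin, vpm_cfg;
 *     if (!v3d_compute_vpm_config(devinfo,
 *                                 vs_bin_prog_data, vs_prog_data,
 *                                 gs_bin_prog_data, gs_prog_data,
 *                                 &vpm_cfg_bin, &vpm_cfg)) {
 *             // Bail out: the bin/render programs do not fit in the VPM.
 *     }
 */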
2466