1 /*
2 * Copyright © 2016-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "broadcom/common/v3d_device_info.h"
25 #include "v3d_compiler.h"
26 #include "util/u_prim.h"
27 #include "compiler/nir/nir_schedule.h"
28 #include "compiler/nir/nir_builder.h"
29
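/**
 * Returns the number of register sources read by the instruction: zero for
 * branches, otherwise the source count of the active ALU op (add or mul).
 */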
30 int
31 vir_get_nsrc(struct qinst *inst)
32 {
33 switch (inst->qpu.type) {
34 case V3D_QPU_INSTR_TYPE_BRANCH:
35 return 0;
36 case V3D_QPU_INSTR_TYPE_ALU:
37 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
38 return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
39 else
40 return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
41 }
42
43 return 0;
44 }
45
46 /**
47 * Returns whether the instruction has any side effects that must be
48 * preserved.
49 */
50 bool
51 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
52 {
53 switch (inst->qpu.type) {
54 case V3D_QPU_INSTR_TYPE_BRANCH:
55 return true;
56 case V3D_QPU_INSTR_TYPE_ALU:
57 switch (inst->qpu.alu.add.op) {
58 case V3D_QPU_A_SETREVF:
59 case V3D_QPU_A_SETMSF:
60 case V3D_QPU_A_VPMSETUP:
61 case V3D_QPU_A_STVPMV:
62 case V3D_QPU_A_STVPMD:
63 case V3D_QPU_A_STVPMP:
64 case V3D_QPU_A_VPMWT:
65 case V3D_QPU_A_TMUWT:
66 return true;
67 default:
68 break;
69 }
70
71 switch (inst->qpu.alu.mul.op) {
72 case V3D_QPU_M_MULTOP:
73 return true;
74 default:
75 break;
76 }
77 }
78
79 if (inst->qpu.sig.ldtmu ||
80 inst->qpu.sig.ldvary ||
81 inst->qpu.sig.ldtlbu ||
82 inst->qpu.sig.ldtlb ||
83 inst->qpu.sig.wrtmuc ||
84 inst->qpu.sig.thrsw) {
85 return true;
86 }
87
88 /* ldunifa works like ldunif: it reads an element and advances the
89 * pointer, so each read has a side effect (we don't care about ldunif
90 * because we reconstruct the uniform stream buffer after compiling
91 * with the surviving uniforms), and allowing DCE to remove
92 * one would break follow-up loads. We could fix this by emitting a
93 * unifa for each ldunifa, but each unifa requires 3 delay slots
94 * before a ldunifa, so that would be quite expensive.
95 */
96 if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
97 return true;
98
99 return false;
100 }
101
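/**
 * Returns whether the instruction is a raw MOV/FMOV, i.e. a copy with no
 * pack/unpack modifiers and no condition, so the value passes through
 * unchanged.
 */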
102 bool
103 vir_is_raw_mov(struct qinst *inst)
104 {
105 if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
106 (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
107 inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
108 return false;
109 }
110
111 if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
112 inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
113 return false;
114 }
115
116 if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
117 inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
118 inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
119 inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
120 return false;
121 }
122
123 if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
124 inst->qpu.flags.mc != V3D_QPU_COND_NONE)
125 return false;
126
127 return true;
128 }
129
130 bool
131 vir_is_add(struct qinst *inst)
132 {
133 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
134 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
135 }
136
137 bool
138 vir_is_mul(struct qinst *inst)
139 {
140 return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
141 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
142 }
143
144 bool
145 vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
146 {
147 if (inst->dst.file == QFILE_MAGIC)
148 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
149
150 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
151 inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
152 return true;
153 }
154
155 return false;
156 }
157
158 bool
159 vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
160 {
161 for (int i = 0; i < vir_get_nsrc(inst); i++) {
162 switch (inst->src[i].file) {
163 case QFILE_VPM:
164 return true;
165 default:
166 break;
167 }
168 }
169
170 if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
171 inst->qpu.sig.ldtlb ||
172 inst->qpu.sig.ldtlbu ||
173 inst->qpu.sig.ldvpm)) {
174 return true;
175 }
176
177 return false;
178 }
179
180 bool
181 vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
182 {
183 switch (inst->dst.file) {
184 case QFILE_MAGIC:
185 switch (inst->dst.index) {
186 case V3D_QPU_WADDR_RECIP:
187 case V3D_QPU_WADDR_RSQRT:
188 case V3D_QPU_WADDR_EXP:
189 case V3D_QPU_WADDR_LOG:
190 case V3D_QPU_WADDR_SIN:
191 return true;
192 }
193 break;
194 default:
195 break;
196 }
197
198 if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
199 return true;
200
201 return false;
202 }
203
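/**
 * Sets the input unpack mode on source 0 or 1 of the instruction's active
 * ALU operation (add or mul).
 */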
204 void
205 vir_set_unpack(struct qinst *inst, int src,
206 enum v3d_qpu_input_unpack unpack)
207 {
208 assert(src == 0 || src == 1);
209
210 if (vir_is_add(inst)) {
211 if (src == 0)
212 inst->qpu.alu.add.a_unpack = unpack;
213 else
214 inst->qpu.alu.add.b_unpack = unpack;
215 } else {
216 assert(vir_is_mul(inst));
217 if (src == 0)
218 inst->qpu.alu.mul.a_unpack = unpack;
219 else
220 inst->qpu.alu.mul.b_unpack = unpack;
221 }
222 }
223
224 void
225 vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
226 {
227 if (vir_is_add(inst)) {
228 inst->qpu.alu.add.output_pack = pack;
229 } else {
230 assert(vir_is_mul(inst));
231 inst->qpu.alu.mul.output_pack = pack;
232 }
233 }
234
235 void
236 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
237 {
238 if (vir_is_add(inst)) {
239 inst->qpu.flags.ac = cond;
240 } else {
241 assert(vir_is_mul(inst));
242 inst->qpu.flags.mc = cond;
243 }
244 }
245
246 enum v3d_qpu_cond
247 vir_get_cond(struct qinst *inst)
248 {
249 assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
250
251 if (vir_is_add(inst))
252 return inst->qpu.flags.ac;
253 else if (vir_is_mul(inst))
254 return inst->qpu.flags.mc;
255 else /* NOP */
256 return V3D_QPU_COND_NONE;
257 }
258
259 void
260 vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
261 {
262 c->flags_temp = -1;
263 if (vir_is_add(inst)) {
264 inst->qpu.flags.apf = pf;
265 } else {
266 assert(vir_is_mul(inst));
267 inst->qpu.flags.mpf = pf;
268 }
269 }
270
271 void
272 vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
273 {
274 c->flags_temp = -1;
275 if (vir_is_add(inst)) {
276 inst->qpu.flags.auf = uf;
277 } else {
278 assert(vir_is_mul(inst));
279 inst->qpu.flags.muf = uf;
280 }
281 }
282
283 #if 0
284 uint8_t
285 vir_channels_written(struct qinst *inst)
286 {
287 if (vir_is_mul(inst)) {
288 switch (inst->dst.pack) {
289 case QPU_PACK_MUL_NOP:
290 case QPU_PACK_MUL_8888:
291 return 0xf;
292 case QPU_PACK_MUL_8A:
293 return 0x1;
294 case QPU_PACK_MUL_8B:
295 return 0x2;
296 case QPU_PACK_MUL_8C:
297 return 0x4;
298 case QPU_PACK_MUL_8D:
299 return 0x8;
300 }
301 } else {
302 switch (inst->dst.pack) {
303 case QPU_PACK_A_NOP:
304 case QPU_PACK_A_8888:
305 case QPU_PACK_A_8888_SAT:
306 case QPU_PACK_A_32_SAT:
307 return 0xf;
308 case QPU_PACK_A_8A:
309 case QPU_PACK_A_8A_SAT:
310 return 0x1;
311 case QPU_PACK_A_8B:
312 case QPU_PACK_A_8B_SAT:
313 return 0x2;
314 case QPU_PACK_A_8C:
315 case QPU_PACK_A_8C_SAT:
316 return 0x4;
317 case QPU_PACK_A_8D:
318 case QPU_PACK_A_8D_SAT:
319 return 0x8;
320 case QPU_PACK_A_16A:
321 case QPU_PACK_A_16A_SAT:
322 return 0x3;
323 case QPU_PACK_A_16B:
324 case QPU_PACK_A_16B_SAT:
325 return 0xc;
326 }
327 }
328 unreachable("Bad pack field");
329 }
330 #endif
331
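/**
 * Allocates a new temporary register, growing the defs and spillable arrays
 * as needed and marking the new entries as spillable.
 */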
332 struct qreg
333 vir_get_temp(struct v3d_compile *c)
334 {
335 struct qreg reg;
336
337 reg.file = QFILE_TEMP;
338 reg.index = c->num_temps++;
339
340 if (c->num_temps > c->defs_array_size) {
341 uint32_t old_size = c->defs_array_size;
342 c->defs_array_size = MAX2(old_size * 2, 16);
343
344 c->defs = reralloc(c, c->defs, struct qinst *,
345 c->defs_array_size);
346 memset(&c->defs[old_size], 0,
347 sizeof(c->defs[0]) * (c->defs_array_size - old_size));
348
349 c->spillable = reralloc(c, c->spillable,
350 BITSET_WORD,
351 BITSET_WORDS(c->defs_array_size));
352 for (int i = old_size; i < c->defs_array_size; i++)
353 BITSET_SET(c->spillable, i);
354 }
355
356 return reg;
357 }
358
359 struct qinst *
360 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
361 {
362 struct qinst *inst = calloc(1, sizeof(*inst));
363
364 inst->qpu = v3d_qpu_nop();
365 inst->qpu.alu.add.op = op;
366
367 inst->dst = dst;
368 inst->src[0] = src0;
369 inst->src[1] = src1;
370 inst->uniform = ~0;
371
372 return inst;
373 }
374
375 struct qinst *
376 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
377 {
378 struct qinst *inst = calloc(1, sizeof(*inst));
379
380 inst->qpu = v3d_qpu_nop();
381 inst->qpu.alu.mul.op = op;
382
383 inst->dst = dst;
384 inst->src[0] = src0;
385 inst->src[1] = src1;
386 inst->uniform = ~0;
387
388 return inst;
389 }
390
391 struct qinst *
392 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
393 {
394 struct qinst *inst = calloc(1, sizeof(*inst));
395
396 inst->qpu = v3d_qpu_nop();
397 inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
398 inst->qpu.branch.cond = cond;
399 inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
400 inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
401 inst->qpu.branch.ub = true;
402 inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
403
404 inst->dst = vir_nop_reg();
405 inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
406
407 return inst;
408 }
409
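/* Inserts the instruction at the cursor (before or after it, depending on
 * the cursor mode), advances the cursor past it and invalidates the live
 * intervals.
 */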
410 static void
411 vir_emit(struct v3d_compile *c, struct qinst *inst)
412 {
413 switch (c->cursor.mode) {
414 case vir_cursor_add:
415 list_add(&inst->link, c->cursor.link);
416 break;
417 case vir_cursor_addtail:
418 list_addtail(&inst->link, c->cursor.link);
419 break;
420 }
421
422 c->cursor = vir_after_inst(inst);
423 c->live_intervals_valid = false;
424 }
425
426 /* Updates inst to write to a new temporary, emits it, and notes the def. */
427 struct qreg
428 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
429 {
430 assert(inst->dst.file == QFILE_NULL);
431
432 /* If we're emitting an instruction that's a def, it had better be
433 * writing a register.
434 */
435 if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
436 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
437 v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
438 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
439 v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
440 }
441
442 inst->dst = vir_get_temp(c);
443
444 if (inst->dst.file == QFILE_TEMP)
445 c->defs[inst->dst.index] = inst;
446
447 vir_emit(c, inst);
448
449 return inst->dst;
450 }
451
452 struct qinst *
453 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
454 {
455 if (inst->dst.file == QFILE_TEMP)
456 c->defs[inst->dst.index] = NULL;
457
458 vir_emit(c, inst);
459
460 return inst;
461 }
462
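/**
 * Allocates a new basic block with an empty instruction list, an empty
 * predecessor set and the next block index.
 */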
463 struct qblock *
464 vir_new_block(struct v3d_compile *c)
465 {
466 struct qblock *block = rzalloc(c, struct qblock);
467
468 list_inithead(&block->instructions);
469
470 block->predecessors = _mesa_set_create(block,
471 _mesa_hash_pointer,
472 _mesa_key_pointer_equal);
473
474 block->index = c->next_block_index++;
475
476 return block;
477 }
478
479 void
480 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
481 {
482 c->cur_block = block;
483 c->cursor = vir_after_block(block);
484 list_addtail(&block->link, &c->blocks);
485 }
486
487 struct qblock *
488 vir_entry_block(struct v3d_compile *c)
489 {
490 return list_first_entry(&c->blocks, struct qblock, link);
491 }
492
493 struct qblock *
494 vir_exit_block(struct v3d_compile *c)
495 {
496 return list_last_entry(&c->blocks, struct qblock, link);
497 }
498
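/**
 * Records a CFG edge: adds the predecessor to the successor's predecessor
 * set and stores the successor in the predecessor's first free successor
 * slot (each block has at most two successors).
 */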
499 void
500 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
501 {
502 _mesa_set_add(successor->predecessors, predecessor);
503 if (predecessor->successors[0]) {
504 assert(!predecessor->successors[1]);
505 predecessor->successors[1] = successor;
506 } else {
507 predecessor->successors[0] = successor;
508 }
509 }
510
511 const struct v3d_compiler *
512 v3d_compiler_init(const struct v3d_device_info *devinfo)
513 {
514 struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
515 if (!compiler)
516 return NULL;
517
518 compiler->devinfo = devinfo;
519
520 if (!vir_init_reg_sets(compiler)) {
521 ralloc_free(compiler);
522 return NULL;
523 }
524
525 return compiler;
526 }
527
528 void
529 v3d_compiler_free(const struct v3d_compiler *compiler)
530 {
531 ralloc_free((void *)compiler);
532 }
533
534 static struct v3d_compile *
535 vir_compile_init(const struct v3d_compiler *compiler,
536 struct v3d_key *key,
537 nir_shader *s,
538 void (*debug_output)(const char *msg,
539 void *debug_output_data),
540 void *debug_output_data,
541 int program_id, int variant_id,
542 uint32_t max_threads,
543 uint32_t min_threads_for_reg_alloc,
544 bool tmu_spilling_allowed,
545 bool disable_loop_unrolling,
546 bool disable_constant_ubo_load_sorting,
547 bool disable_tmu_pipelining,
548 bool fallback_scheduler)
549 {
550 struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
551
552 c->compiler = compiler;
553 c->devinfo = compiler->devinfo;
554 c->key = key;
555 c->program_id = program_id;
556 c->variant_id = variant_id;
557 c->threads = max_threads;
558 c->debug_output = debug_output;
559 c->debug_output_data = debug_output_data;
560 c->compilation_result = V3D_COMPILATION_SUCCEEDED;
561 c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
562 c->tmu_spilling_allowed = tmu_spilling_allowed;
563 c->fallback_scheduler = fallback_scheduler;
564 c->disable_tmu_pipelining = disable_tmu_pipelining;
565 c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
566 c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
567 ? true : disable_loop_unrolling;
568
569 s = nir_shader_clone(c, s);
570 c->s = s;
571
572 list_inithead(&c->blocks);
573 vir_set_emit_block(c, vir_new_block(c));
574
575 c->output_position_index = -1;
576 c->output_sample_mask_index = -1;
577
578 c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
579 _mesa_key_pointer_equal);
580
581 c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
582 c->flags_temp = -1;
583
584 return c;
585 }
586
587 static int
588 type_size_vec4(const struct glsl_type *type, bool bindless)
589 {
590 return glsl_count_attribute_slots(type, false);
591 }
592
593 static void
594 v3d_lower_nir(struct v3d_compile *c)
595 {
596 struct nir_lower_tex_options tex_options = {
597 .lower_txd = true,
598 .lower_tg4_broadcom_swizzle = true,
599
600 .lower_rect = false, /* XXX: Use this on V3D 3.x */
601 .lower_txp = ~0,
602 /* Apply swizzles to all samplers. */
603 .swizzle_result = ~0,
604 };
605
606 /* Lower the format swizzle and (for 32-bit returns)
607 * ARB_texture_swizzle-style swizzle.
608 */
609 assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
610 for (int i = 0; i < c->key->num_tex_used; i++) {
611 for (int j = 0; j < 4; j++)
612 tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
613 }
614
615 assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler));
616 for (int i = 0; i < c->key->num_samplers_used; i++) {
617 if (c->key->sampler[i].return_size == 16) {
618 tex_options.lower_tex_packing[i] =
619 nir_lower_tex_packing_16;
620 }
621 }
622
623 /* CS textures may not have return_size reflecting the shadow state. */
624 nir_foreach_uniform_variable(var, c->s) {
625 const struct glsl_type *type = glsl_without_array(var->type);
626 unsigned array_len = MAX2(glsl_get_length(var->type), 1);
627
628 if (!glsl_type_is_sampler(type) ||
629 !glsl_sampler_type_is_shadow(type))
630 continue;
631
632 for (int i = 0; i < array_len; i++) {
633 tex_options.lower_tex_packing[var->data.binding + i] =
634 nir_lower_tex_packing_16;
635 }
636 }
637
638 NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
639 NIR_PASS_V(c->s, nir_lower_system_values);
640 NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL);
641
642 NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
643 nir_var_function_temp,
644 0,
645 glsl_get_natural_size_align_bytes);
646 NIR_PASS_V(c->s, v3d_nir_lower_scratch);
647 }
648
649 static void
650 v3d_set_prog_data_uniforms(struct v3d_compile *c,
651 struct v3d_prog_data *prog_data)
652 {
653 int count = c->num_uniforms;
654 struct v3d_uniform_list *ulist = &prog_data->uniforms;
655
656 ulist->count = count;
657 ulist->data = ralloc_array(prog_data, uint32_t, count);
658 memcpy(ulist->data, c->uniform_data,
659 count * sizeof(*ulist->data));
660 ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
661 memcpy(ulist->contents, c->uniform_contents,
662 count * sizeof(*ulist->contents));
663 }
664
665 static void
666 v3d_vs_set_prog_data(struct v3d_compile *c,
667 struct v3d_vs_prog_data *prog_data)
668 {
669 /* The vertex data gets format converted by the VPM so that
670 * each attribute channel takes up a VPM column. Precompute
671 * the sizes for the shader record.
672 */
673 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
674 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
675 prog_data->vpm_input_size += c->vattr_sizes[i];
676 }
677
678 memset(prog_data->driver_location_map, -1,
679 sizeof(prog_data->driver_location_map));
680
681 nir_foreach_shader_in_variable(var, c->s) {
682 prog_data->driver_location_map[var->data.location] =
683 var->data.driver_location;
684 }
685
686 prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
687 SYSTEM_VALUE_VERTEX_ID) ||
688 BITSET_TEST(c->s->info.system_values_read,
689 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
690
691 prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
692 SYSTEM_VALUE_BASE_INSTANCE);
693
694 prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
695 SYSTEM_VALUE_INSTANCE_ID) ||
696 BITSET_TEST(c->s->info.system_values_read,
697 SYSTEM_VALUE_INSTANCE_INDEX);
698
699 if (prog_data->uses_vid)
700 prog_data->vpm_input_size++;
701 if (prog_data->uses_biid)
702 prog_data->vpm_input_size++;
703 if (prog_data->uses_iid)
704 prog_data->vpm_input_size++;
705
706 /* Input/output segment size are in sectors (8 rows of 32 bits per
707 * channel).
708 */
709 prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
710 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
711
712 /* Set us up for shared input/output segments. This is apparently
713 * necessary for our VCM setup to avoid varying corruption.
714 */
715 prog_data->separate_segments = false;
716 prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
717 prog_data->vpm_input_size);
718 prog_data->vpm_input_size = 0;
719
720 /* Compute VCM cache size. We set up our program to take up less than
721 * half of the VPM, so that any set of bin and render programs won't
722 * run out of space. We need space for at least one input segment,
723 * and then allocate the rest to output segments (one for the current
724 * program, the rest to VCM). The valid range of the VCM cache size
725 * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
726 * batches.
727 */
728 assert(c->devinfo->vpm_size);
729 int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
730 int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
731 int half_vpm = vpm_size_in_sectors / 2;
732 int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
733 int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
734 assert(vpm_output_batches >= 2);
735 prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
736 }
737
738 static void
739 v3d_gs_set_prog_data(struct v3d_compile *c,
740 struct v3d_gs_prog_data *prog_data)
741 {
742 prog_data->num_inputs = c->num_inputs;
743 memcpy(prog_data->input_slots, c->input_slots,
744 c->num_inputs * sizeof(*c->input_slots));
745
746 /* gl_PrimitiveIdIn is written by the GBG into the first word of the
747 * VPM output header automatically and the shader will overwrite
748 * it after reading it if necessary, so it doesn't add to the VPM
749 * size requirements.
750 */
751 prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
752 SYSTEM_VALUE_PRIMITIVE_ID);
753
754 /* Output segment size is in sectors (8 rows of 32 bits per channel) */
755 prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
756
757 /* Compute SIMD dispatch width and update VPM output size accordingly
758 * to ensure we can fit our program in memory. Available widths are
759 * 16, 8, 4, 1.
760 *
761 * Notice that at draw time we will have to consider VPM memory
762 * requirements from other stages and choose a smaller dispatch
763 * width if needed to fit the program in VPM memory.
764 */
765 prog_data->simd_width = 16;
766 while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
767 prog_data->simd_width == 2) {
768 prog_data->simd_width >>= 1;
769 prog_data->vpm_output_size =
770 align(prog_data->vpm_output_size, 2) / 2;
771 }
772 assert(prog_data->vpm_output_size <= 16);
773 assert(prog_data->simd_width != 2);
774
775 prog_data->out_prim_type = c->s->info.gs.output_primitive;
776 prog_data->num_invocations = c->s->info.gs.invocations;
777
778 prog_data->writes_psiz =
779 c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
780 }
781
782 static void
783 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
784 struct v3d_fs_prog_data *prog_data)
785 {
786 prog_data->num_inputs = c->num_inputs;
787 memcpy(prog_data->input_slots, c->input_slots,
788 c->num_inputs * sizeof(*c->input_slots));
789
790 STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
791 (V3D_MAX_FS_INPUTS - 1) / 24);
792 for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
793 if (BITSET_TEST(c->flat_shade_flags, i))
794 prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
795
796 if (BITSET_TEST(c->noperspective_flags, i))
797 prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
798
799 if (BITSET_TEST(c->centroid_flags, i))
800 prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
801 }
802 }
803
804 static void
805 v3d_fs_set_prog_data(struct v3d_compile *c,
806 struct v3d_fs_prog_data *prog_data)
807 {
808 v3d_set_fs_prog_data_inputs(c, prog_data);
809 prog_data->writes_z = c->writes_z;
810 prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
811 prog_data->uses_center_w = c->uses_center_w;
812 prog_data->uses_implicit_point_line_varyings =
813 c->uses_implicit_point_line_varyings;
814 prog_data->lock_scoreboard_on_first_thrsw =
815 c->lock_scoreboard_on_first_thrsw;
816 prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
817 prog_data->uses_pid = c->fs_uses_primitive_id;
818 }
819
820 static void
821 v3d_cs_set_prog_data(struct v3d_compile *c,
822 struct v3d_compute_prog_data *prog_data)
823 {
824 prog_data->shared_size = c->s->info.shared_size;
825
826 prog_data->local_size[0] = c->s->info.workgroup_size[0];
827 prog_data->local_size[1] = c->s->info.workgroup_size[1];
828 prog_data->local_size[2] = c->s->info.workgroup_size[2];
829
830 prog_data->has_subgroups = c->has_subgroups;
831 }
832
833 static void
834 v3d_set_prog_data(struct v3d_compile *c,
835 struct v3d_prog_data *prog_data)
836 {
837 prog_data->threads = c->threads;
838 prog_data->single_seg = !c->last_thrsw;
839 prog_data->spill_size = c->spill_size;
840 prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
841 prog_data->has_control_barrier = c->s->info.uses_control_barrier;
842
843 v3d_set_prog_data_uniforms(c, prog_data);
844
845 switch (c->s->info.stage) {
846 case MESA_SHADER_VERTEX:
847 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
848 break;
849 case MESA_SHADER_GEOMETRY:
850 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
851 break;
852 case MESA_SHADER_FRAGMENT:
853 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
854 break;
855 case MESA_SHADER_COMPUTE:
856 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
857 break;
858 default:
859 unreachable("unsupported shader stage");
860 }
861 }
862
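/* Copies the generated QPU instructions into a malloc'ed buffer owned by
 * the caller and, on success, destroys the compile context.
 */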
863 static uint64_t *
864 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
865 {
866 *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
867
868 uint64_t *qpu_insts = malloc(*final_assembly_size);
869 if (!qpu_insts)
870 return NULL;
871
872 memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
873
874 vir_compile_destroy(c);
875
876 return qpu_insts;
877 }
878
879 static void
880 v3d_nir_lower_vs_early(struct v3d_compile *c)
881 {
882 /* Split our I/O vars and dead code eliminate the unused
883 * components.
884 */
885 NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
886 nir_var_shader_in | nir_var_shader_out);
887 uint64_t used_outputs[4] = {0};
888 for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
889 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
890 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
891 used_outputs[comp] |= 1ull << slot;
892 }
893 NIR_PASS_V(c->s, nir_remove_unused_io_vars,
894 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
895 NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
896 v3d_optimize_nir(c, c->s);
897 NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
898
899 /* This must go before nir_lower_io */
900 if (c->vs_key->per_vertex_point_size)
901 NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
902
903 NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
904 type_size_vec4,
905 (nir_lower_io_options)0);
906 /* clean up nir_lower_io's deref_var remains and do a constant folding pass
907 * on the code it generated.
908 */
909 NIR_PASS_V(c->s, nir_opt_dce);
910 NIR_PASS_V(c->s, nir_opt_constant_folding);
911 }
912
913 static void
914 v3d_nir_lower_gs_early(struct v3d_compile *c)
915 {
916 /* Split our I/O vars and dead code eliminate the unused
917 * components.
918 */
919 NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
920 nir_var_shader_in | nir_var_shader_out);
921 uint64_t used_outputs[4] = {0};
922 for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
923 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
924 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
925 used_outputs[comp] |= 1ull << slot;
926 }
927 NIR_PASS_V(c->s, nir_remove_unused_io_vars,
928 nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
929 NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
930 v3d_optimize_nir(c, c->s);
931 NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
932
933 /* This must go before nir_lower_io */
934 if (c->gs_key->per_vertex_point_size)
935 NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
936
937 NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
938 type_size_vec4,
939 (nir_lower_io_options)0);
940 /* clean up nir_lower_io's deref_var remains */
941 NIR_PASS_V(c->s, nir_opt_dce);
942 }
943
944 static void
945 v3d_fixup_fs_output_types(struct v3d_compile *c)
946 {
947 nir_foreach_shader_out_variable(var, c->s) {
948 uint32_t mask = 0;
949
950 switch (var->data.location) {
951 case FRAG_RESULT_COLOR:
952 mask = ~0;
953 break;
954 case FRAG_RESULT_DATA0:
955 case FRAG_RESULT_DATA1:
956 case FRAG_RESULT_DATA2:
957 case FRAG_RESULT_DATA3:
958 mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
959 break;
960 }
961
962 if (c->fs_key->int_color_rb & mask) {
963 var->type =
964 glsl_vector_type(GLSL_TYPE_INT,
965 glsl_get_components(var->type));
966 } else if (c->fs_key->uint_color_rb & mask) {
967 var->type =
968 glsl_vector_type(GLSL_TYPE_UINT,
969 glsl_get_components(var->type));
970 }
971 }
972 }
973
974 static void
975 v3d_nir_lower_fs_early(struct v3d_compile *c)
976 {
977 if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
978 v3d_fixup_fs_output_types(c);
979
980 NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
981
982 if (c->fs_key->line_smoothing) {
983 v3d_nir_lower_line_smooth(c->s);
984 NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
985 /* The lowering pass can introduce new sysval reads */
986 nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
987 }
988 }
989
990 static void
991 v3d_nir_lower_gs_late(struct v3d_compile *c)
992 {
993 if (c->key->ucp_enables) {
994 NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
995 false, NULL);
996 }
997
998 /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
999 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
1000 }
1001
1002 static void
1003 v3d_nir_lower_vs_late(struct v3d_compile *c)
1004 {
1005 if (c->key->ucp_enables) {
1006 NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
1007 false, false, NULL);
1008 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1009 nir_var_shader_out);
1010 }
1011
1012 /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1013 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
1014 }
1015
1016 static void
1017 v3d_nir_lower_fs_late(struct v3d_compile *c)
1018 {
1019 /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
1020 * Vulkan allows it, in which case the SPIR-V compiler will declare
1021 * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
1022 * the last parameter to always operate with a compact array in both
1023 * OpenGL and Vulkan so we don't have to care about the API we
1024 * are using.
1025 */
1026 if (c->key->ucp_enables)
1027 NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
1028
1029 NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
1030 }
1031
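/**
 * Returns the maximum number of temporaries simultaneously live at any
 * instruction, computed from the temp_start/temp_end live intervals.
 */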
1032 static uint32_t
1033 vir_get_max_temps(struct v3d_compile *c)
1034 {
1035 int max_ip = 0;
1036 vir_for_each_inst_inorder(inst, c)
1037 max_ip++;
1038
1039 uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1040
1041 for (int t = 0; t < c->num_temps; t++) {
1042 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1043 i < max_ip); i++) {
1044 if (i > max_ip)
1045 break;
1046 pressure[i]++;
1047 }
1048 }
1049
1050 uint32_t max_temps = 0;
1051 for (int i = 0; i < max_ip; i++)
1052 max_temps = MAX2(max_temps, pressure[i]);
1053
1054 ralloc_free(pressure);
1055
1056 return max_temps;
1057 }
1058
1059 enum v3d_dependency_class {
1060 V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
1061 };
1062
1063 static bool
1064 v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
1065 nir_schedule_dependency *dep,
1066 void *user_data)
1067 {
1068 struct v3d_compile *c = user_data;
1069
1070 switch (intr->intrinsic) {
1071 case nir_intrinsic_store_output:
1072 /* Writing to location 0 overwrites the value passed in for
1073 * gl_PrimitiveID on geometry shaders
1074 */
1075 if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
1076 nir_intrinsic_base(intr) != 0)
1077 break;
1078
1079 nir_const_value *const_value =
1080 nir_src_as_const_value(intr->src[1]);
1081
1082 if (const_value == NULL)
1083 break;
1084
1085 uint64_t offset =
1086 nir_const_value_as_uint(*const_value,
1087 nir_src_bit_size(intr->src[1]));
1088 if (offset != 0)
1089 break;
1090
1091 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1092 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
1093 return true;
1094
1095 case nir_intrinsic_load_primitive_id:
1096 if (c->s->info.stage != MESA_SHADER_GEOMETRY)
1097 break;
1098
1099 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1100 dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
1101 return true;
1102
1103 default:
1104 break;
1105 }
1106
1107 return false;
1108 }
1109
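/* Callback for nir_lower_wrmasks: only split write masks for TMU general
 * memory stores (SSBO, shared, global and scratch).
 */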
1110 static bool
1111 should_split_wrmask(const nir_instr *instr, const void *data)
1112 {
1113 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1114 switch (intr->intrinsic) {
1115 case nir_intrinsic_store_ssbo:
1116 case nir_intrinsic_store_shared:
1117 case nir_intrinsic_store_global:
1118 case nir_intrinsic_store_scratch:
1119 return true;
1120 default:
1121 return false;
1122 }
1123 }
1124
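/* Returns the instruction as an intrinsic if it is a UBO load with a
 * constant offset, or NULL otherwise.
 */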
1125 static nir_intrinsic_instr *
1126 nir_instr_as_constant_ubo_load(nir_instr *inst)
1127 {
1128 if (inst->type != nir_instr_type_intrinsic)
1129 return NULL;
1130
1131 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1132 if (intr->intrinsic != nir_intrinsic_load_ubo)
1133 return NULL;
1134
1135 assert(nir_src_is_const(intr->src[0]));
1136 if (!nir_src_is_const(intr->src[1]))
1137 return NULL;
1138
1139 return intr;
1140 }
1141
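/* Moves other constant UBO loads for the same UBO index next to the
 * reference load, ordered by offset, so consecutive loads can skip
 * redundant unifa writes. Returns true if any instruction was moved.
 */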
1142 static bool
1143 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1144 {
1145 bool progress = false;
1146
1147 nir_instr *ref_inst = &ref->instr;
1148 uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1149 uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1150
1151 /* Go through all instructions after ref searching for constant UBO
1152 * loads for the same UBO index.
1153 */
1154 bool seq_break = false;
1155 nir_instr *inst = &ref->instr;
1156 nir_instr *next_inst = NULL;
1157 while (true) {
1158 inst = next_inst ? next_inst : nir_instr_next(inst);
1159 if (!inst)
1160 break;
1161
1162 next_inst = NULL;
1163
1164 if (inst->type != nir_instr_type_intrinsic)
1165 continue;
1166
1167 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1168 if (intr->intrinsic != nir_intrinsic_load_ubo)
1169 continue;
1170
1171 /* We only produce unifa sequences for non-divergent loads */
1172 if (nir_src_is_divergent(intr->src[1]))
1173 continue;
1174
1175 /* If there are any UBO loads that are not constant or that
1176 * use a different UBO index in between the reference load and
1177 * any other constant load for the same index, they would break
1178 * the unifa sequence. We will flag that so we can then move
1179 * all constant UBO loads for the reference index before these
1180 * and not just the ones that are not ordered to avoid breaking
1181 * the sequence and reduce unifa writes.
1182 */
1183 if (!nir_src_is_const(intr->src[1])) {
1184 seq_break = true;
1185 continue;
1186 }
1187 uint32_t offset = nir_src_as_uint(intr->src[1]);
1188
1189 assert(nir_src_is_const(intr->src[0]));
1190 uint32_t index = nir_src_as_uint(intr->src[0]);
1191 if (index != ref_index) {
1192 seq_break = true;
1193 continue;
1194 }
1195
1196 /* Only move loads with an offset that is close enough to the
1197 * reference offset, since otherwise we would not be able to
1198 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1199 */
1200 if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
1201 continue;
1202
1203 /* We will move this load if its offset is smaller than ref's
1204 * (in which case we will move it before ref) or if the offset
1205 * is larger than ref's but there are sequence breakers
1206 * in between (in which case we will move it after ref and
1207 * before the sequence breakers).
1208 */
1209 if (!seq_break && offset >= ref_offset)
1210 continue;
1211
1212 /* Find where exactly we want to move this load:
1213 *
1214 * If we are moving it before ref, we want to check any other
1215 * UBO loads we placed before ref and make sure we insert this
1216 * one properly ordered with them. Likewise, if we are moving
1217 * it after ref.
1218 */
1219 nir_instr *pos = ref_inst;
1220 nir_instr *tmp = pos;
1221 do {
1222 if (offset < ref_offset)
1223 tmp = nir_instr_prev(tmp);
1224 else
1225 tmp = nir_instr_next(tmp);
1226
1227 if (!tmp || tmp == inst)
1228 break;
1229
1230 /* Ignore non-unifa UBO loads */
1231 if (tmp->type != nir_instr_type_intrinsic)
1232 continue;
1233
1234 nir_intrinsic_instr *tmp_intr =
1235 nir_instr_as_intrinsic(tmp);
1236 if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1237 continue;
1238
1239 if (nir_src_is_divergent(tmp_intr->src[1]))
1240 continue;
1241
1242 /* Stop if we find a unifa UBO load that breaks the
1243 * sequence.
1244 */
1245 if (!nir_src_is_const(tmp_intr->src[1]))
1246 break;
1247
1248 if (nir_src_as_uint(tmp_intr->src[0]) != index)
1249 break;
1250
1251 uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1252 if (offset < ref_offset) {
1253 if (tmp_offset < offset ||
1254 tmp_offset >= ref_offset) {
1255 break;
1256 } else {
1257 pos = tmp;
1258 }
1259 } else {
1260 if (tmp_offset > offset ||
1261 tmp_offset <= ref_offset) {
1262 break;
1263 } else {
1264 pos = tmp;
1265 }
1266 }
1267 } while (true);
1268
1269 /* We can't move the UBO load before the instruction that
1270 * defines its constant offset. If that instruction is placed
1271 * in between the new location (pos) and the current location
1272 * of this load, we will have to move that instruction too.
1273 *
1274 * We don't care about the UBO index definition because that
1275 * is optimized to be reused by all UBO loads for the same
1276 * index and therefore is certain to be defined before the
1277 * first UBO load that uses it.
1278 */
1279 nir_instr *offset_inst = NULL;
1280 tmp = inst;
1281 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1282 if (pos == tmp) {
1283 /* We reached the target location without
1284 * finding the instruction that defines the
1285 * offset, so that instruction must be before
1286 * the new position and we don't have to fix it.
1287 */
1288 break;
1289 }
1290 if (intr->src[1].ssa->parent_instr == tmp) {
1291 offset_inst = tmp;
1292 break;
1293 }
1294 }
1295
1296 if (offset_inst) {
1297 exec_node_remove(&offset_inst->node);
1298 exec_node_insert_node_before(&pos->node,
1299 &offset_inst->node);
1300 }
1301
1302 /* Since we are moving the instruction before its current
1303 * location, grab its successor before the move so that
1304 * we can continue the next iteration of the main loop from
1305 * that instruction.
1306 */
1307 next_inst = nir_instr_next(inst);
1308
1309 /* Move this load to the selected location */
1310 exec_node_remove(&inst->node);
1311 if (offset < ref_offset)
1312 exec_node_insert_node_before(&pos->node, &inst->node);
1313 else
1314 exec_node_insert_after(&pos->node, &inst->node);
1315
1316 progress = true;
1317 }
1318
1319 return progress;
1320 }
1321
1322 static bool
1323 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1324 nir_block *block)
1325 {
1326 bool progress = false;
1327 bool local_progress;
1328 do {
1329 local_progress = false;
1330 nir_foreach_instr_safe(inst, block) {
1331 nir_intrinsic_instr *intr =
1332 nir_instr_as_constant_ubo_load(inst);
1333 if (intr) {
1334 local_progress |=
1335 v3d_nir_sort_constant_ubo_load(block, intr);
1336 }
1337 }
1338 progress |= local_progress;
1339 } while (local_progress);
1340
1341 return progress;
1342 }
1343
1344 /**
1345 * Sorts constant UBO loads in each block by offset to maximize chances of
1346 * skipping unifa writes when converting to VIR. This can increase register
1347 * pressure.
1348 */
1349 static bool
1350 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1351 {
1352 nir_foreach_function(function, s) {
1353 if (function->impl) {
1354 nir_foreach_block(block, function->impl) {
1355 c->sorted_any_ubo_loads |=
1356 v3d_nir_sort_constant_ubo_loads_block(c, block);
1357 }
1358 nir_metadata_preserve(function->impl,
1359 nir_metadata_block_index |
1360 nir_metadata_dominance);
1361 }
1362 }
1363 return c->sorted_any_ubo_loads;
1364 }
1365
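/* Replaces a load_num_subgroups intrinsic with an immediate computed from
 * the workgroup size divided by the V3D channel count (rounded up).
 */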
1366 static void
1367 lower_load_num_subgroups(struct v3d_compile *c,
1368 nir_builder *b,
1369 nir_intrinsic_instr *intr)
1370 {
1371 assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1372 assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1373
1374 b->cursor = nir_after_instr(&intr->instr);
1375 uint32_t num_subgroups =
1376 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1377 c->s->info.workgroup_size[1] *
1378 c->s->info.workgroup_size[2], V3D_CHANNELS);
1379 nir_ssa_def *result = nir_imm_int(b, num_subgroups);
1380 nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
1381 nir_instr_remove(&intr->instr);
1382 }
1383
1384 static bool
1385 lower_subgroup_intrinsics(struct v3d_compile *c,
1386 nir_block *block, nir_builder *b)
1387 {
1388 bool progress = false;
1389 nir_foreach_instr_safe(inst, block) {
1390 if (inst->type != nir_instr_type_intrinsic)
1391 continue;
1392
1393 nir_intrinsic_instr *intr =
1394 nir_instr_as_intrinsic(inst);
1395 if (!intr)
1396 continue;
1397
1398 switch (intr->intrinsic) {
1399 case nir_intrinsic_load_num_subgroups:
1400 lower_load_num_subgroups(c, b, intr);
1401 progress = true;
1402 FALLTHROUGH;
1403 case nir_intrinsic_load_subgroup_id:
1404 case nir_intrinsic_load_subgroup_size:
1405 case nir_intrinsic_load_subgroup_invocation:
1406 case nir_intrinsic_elect:
1407 c->has_subgroups = true;
1408 break;
1409 default:
1410 break;
1411 }
1412 }
1413
1414 return progress;
1415 }
1416
1417 static bool
1418 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1419 {
1420 bool progress = false;
1421 nir_foreach_function(function, s) {
1422 if (function->impl) {
1423 nir_builder b;
1424 nir_builder_init(&b, function->impl);
1425
1426 nir_foreach_block(block, function->impl)
1427 progress |= lower_subgroup_intrinsics(c, block, &b);
1428
1429 nir_metadata_preserve(function->impl,
1430 nir_metadata_block_index |
1431 nir_metadata_dominance);
1432 }
1433 }
1434 return progress;
1435 }
1436
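/* Runs a single compilation attempt with the current strategy: stage-specific
 * early and late NIR lowering, general lowering and optimization, scheduling,
 * constant UBO load sorting and finally NIR-to-VIR translation.
 */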
1437 static void
1438 v3d_attempt_compile(struct v3d_compile *c)
1439 {
1440 switch (c->s->info.stage) {
1441 case MESA_SHADER_VERTEX:
1442 c->vs_key = (struct v3d_vs_key *) c->key;
1443 break;
1444 case MESA_SHADER_GEOMETRY:
1445 c->gs_key = (struct v3d_gs_key *) c->key;
1446 break;
1447 case MESA_SHADER_FRAGMENT:
1448 c->fs_key = (struct v3d_fs_key *) c->key;
1449 break;
1450 case MESA_SHADER_COMPUTE:
1451 break;
1452 default:
1453 unreachable("unsupported shader stage");
1454 }
1455
1456 switch (c->s->info.stage) {
1457 case MESA_SHADER_VERTEX:
1458 v3d_nir_lower_vs_early(c);
1459 break;
1460 case MESA_SHADER_GEOMETRY:
1461 v3d_nir_lower_gs_early(c);
1462 break;
1463 case MESA_SHADER_FRAGMENT:
1464 v3d_nir_lower_fs_early(c);
1465 break;
1466 default:
1467 break;
1468 }
1469
1470 v3d_lower_nir(c);
1471
1472 switch (c->s->info.stage) {
1473 case MESA_SHADER_VERTEX:
1474 v3d_nir_lower_vs_late(c);
1475 break;
1476 case MESA_SHADER_GEOMETRY:
1477 v3d_nir_lower_gs_late(c);
1478 break;
1479 case MESA_SHADER_FRAGMENT:
1480 v3d_nir_lower_fs_late(c);
1481 break;
1482 default:
1483 break;
1484 }
1485
1486 NIR_PASS_V(c->s, v3d_nir_lower_io, c);
1487 NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
1488 NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
1489 nir_lower_idiv_options idiv_options = {
1490 .imprecise_32bit_lowering = true,
1491 .allow_fp16 = true,
1492 };
1493 NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
1494
1495 if (c->key->robust_buffer_access) {
1496 /* v3d_nir_lower_robust_buffer_access assumes constant buffer
1497 * indices on ubo/ssbo intrinsics, so run copy propagation and
1498 * constant folding passes before we run the lowering to guarantee
1499 * this. We also want to run the lowering before v3d_optimize to
1500 * clean up redundant get_buffer_size calls produced in the pass.
1501 */
1502 NIR_PASS_V(c->s, nir_copy_prop);
1503 NIR_PASS_V(c->s, nir_opt_constant_folding);
1504 NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
1505 }
1506
1507 NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1508
1509 NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);
1510
1511 v3d_optimize_nir(c, c->s);
1512
1513 /* Do late algebraic optimization to turn add(a, neg(b)) back into
1514 * subs, then the mandatory cleanup after algebraic. Note that it may
1515 * produce fnegs, and if so then we need to keep running to squash
1516 * fneg(fneg(a)).
1517 */
1518 bool more_late_algebraic = true;
1519 while (more_late_algebraic) {
1520 more_late_algebraic = false;
1521 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
1522 NIR_PASS_V(c->s, nir_opt_constant_folding);
1523 NIR_PASS_V(c->s, nir_copy_prop);
1524 NIR_PASS_V(c->s, nir_opt_dce);
1525 NIR_PASS_V(c->s, nir_opt_cse);
1526 }
1527
1528 NIR_PASS_V(c->s, nir_lower_bool_to_int32);
1529 nir_convert_to_lcssa(c->s, true, true);
1530 NIR_PASS_V(c->s, nir_divergence_analysis);
1531 NIR_PASS_V(c->s, nir_convert_from_ssa, true);
1532
1533 struct nir_schedule_options schedule_options = {
1534 /* Schedule for about half our register space, to enable more
1535 * shaders to hit 4 threads.
1536 */
1537 .threshold = 24,
1538
1539 /* Vertex shaders share the same memory for inputs and outputs,
1540 * fragment and geometry shaders do not.
1541 */
1542 .stages_with_shared_io_memory =
1543 (((1 << MESA_ALL_SHADER_STAGES) - 1) &
1544 ~((1 << MESA_SHADER_FRAGMENT) |
1545 (1 << MESA_SHADER_GEOMETRY))),
1546
1547 .fallback = c->fallback_scheduler,
1548
1549 .intrinsic_cb = v3d_intrinsic_dependency_cb,
1550 .intrinsic_cb_data = c,
1551 };
1552 NIR_PASS_V(c->s, nir_schedule, &schedule_options);
1553
1554 if (!c->disable_constant_ubo_load_sorting)
1555 NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
1556
1557 v3d_nir_to_vir(c);
1558 }
1559
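/**
 * Returns the size of the stage-specific prog_data structure for the given
 * shader stage.
 */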
1560 uint32_t
1561 v3d_prog_data_size(gl_shader_stage stage)
1562 {
1563 static const int prog_data_size[] = {
1564 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
1565 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
1566 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
1567 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
1568 };
1569
1570 assert(stage >= 0 &&
1571 stage < ARRAY_SIZE(prog_data_size) &&
1572 prog_data_size[stage]);
1573
1574 return prog_data_size[stage];
1575 }
1576
1577 int v3d_shaderdb_dump(struct v3d_compile *c,
1578 char **shaderdb_str)
1579 {
1580 if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
1581 return -1;
1582
1583 return asprintf(shaderdb_str,
1584 "%s shader: %d inst, %d threads, %d loops, "
1585 "%d uniforms, %d max-temps, %d:%d spills:fills, "
1586 "%d sfu-stalls, %d inst-and-stalls, %d nops",
1587 vir_get_stage_name(c),
1588 c->qpu_inst_count,
1589 c->threads,
1590 c->loops,
1591 c->num_uniforms,
1592 vir_get_max_temps(c),
1593 c->spills,
1594 c->fills,
1595 c->qpu_inst_stalled_count,
1596 c->qpu_inst_count + c->qpu_inst_stalled_count,
1597 c->nop_count);
1598 }
1599
1600 /* This is a list of incremental changes to the compilation strategy
1601 * that will be used to try to compile the shader successfully. The
1602 * default strategy is to enable all optimizations which will have
1603 * the highest register pressure but is expected to produce most
1604 * optimal code. Following strategies incrementally disable specific
1605 * optimizations that are known to contribute to register pressure
1606 * in order to be able to compile the shader successfully while meeting
1607 * thread count requirements.
1608 *
1609 * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
1610 * cover previous hardware (meaning that we are not limiting
1611 * register allocation to any particular thread count). This is fine
1612 * because v3d_nir_to_vir will cap this to the actual minimum.
1613 */
1614 struct v3d_compiler_strategy {
1615 const char *name;
1616 uint32_t max_threads;
1617 uint32_t min_threads;
1618 bool disable_loop_unrolling;
1619 bool disable_ubo_load_sorting;
1620 bool disable_tmu_pipelining;
1621 bool tmu_spilling_allowed;
1622 } static const strategies[] = {
1623 /*0*/ { "default", 4, 4, false, false, false, false },
1624 /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false },
1625 /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false },
1626 /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false },
1627 /*4*/ { "lower thread count", 2, 1, false, false, false, false },
1628 /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false },
1629 /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false },
1630 /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true },
1631 /*8*/ { "fallback scheduler", 2, 1, true, true, true, true }
1632 };
1633
1634 /**
1635 * If a particular optimization didn't make any progress during a compile
1636 * attempt, disabling it alone won't allow us to compile the shader successfully,
1637 * since we'll end up with the same code. Detect these scenarios so we can
1638 * avoid wasting time with useless compiles. We should also consider if the
1639 * strategy changes other aspects of the compilation process though, like
1640 * spilling, and not skip it in that case.
1641 */
1642 static bool
1643 skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
1644 {
1645 /* We decide if we can skip a strategy based on the optimizations that
1646 * were active in the previous strategy, so we should only be calling this
1647 * for strategies after the first.
1648 */
1649 assert(idx > 0);
1650
1651 /* Don't skip a strategy that changes spilling behavior */
1652 if (strategies[idx].tmu_spilling_allowed !=
1653 strategies[idx - 1].tmu_spilling_allowed) {
1654 return false;
1655 }
1656
1657 switch (idx) {
1658 /* Loop unrolling: skip if we didn't unroll any loops */
1659 case 1:
1660 case 5:
1661 return !c->unrolled_any_loops;
1662 /* UBO load sorting: skip if we didn't sort any loads */
1663 case 2:
1664 case 6:
1665 return !c->sorted_any_ubo_loads;
1666 /* TMU pipelining: skip if we didn't pipeline any TMU ops */
1667 case 3:
1668 case 7:
1669 return !c->pipelined_any_tmu;
1670 /* Lower thread count: skip if we already tried fewer than 4 threads */
1671 case 4:
1672 return c->threads < 4;
1673 default:
1674 return false;
1675 };
1676 }
1677 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
1678 struct v3d_key *key,
1679 struct v3d_prog_data **out_prog_data,
1680 nir_shader *s,
1681 void (*debug_output)(const char *msg,
1682 void *debug_output_data),
1683 void *debug_output_data,
1684 int program_id, int variant_id,
1685 uint32_t *final_assembly_size)
1686 {
1687 struct v3d_compile *c = NULL;
1688 for (int i = 0; i < ARRAY_SIZE(strategies); i++) {
1689 /* Fallback strategy */
1690 if (i > 0) {
1691 assert(c);
1692 if (skip_compile_strategy(c, i))
1693 continue;
1694
1695 char *debug_msg;
1696 int ret = asprintf(&debug_msg,
1697 "Falling back to strategy '%s' for %s",
1698 strategies[i].name,
1699 vir_get_stage_name(c));
1700
1701 if (ret >= 0) {
1702 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
1703 fprintf(stderr, "%s\n", debug_msg);
1704
1705 c->debug_output(debug_msg, c->debug_output_data);
1706 free(debug_msg);
1707 }
1708
1709 vir_compile_destroy(c);
1710 }
1711
1712 c = vir_compile_init(compiler, key, s,
1713 debug_output, debug_output_data,
1714 program_id, variant_id,
1715 strategies[i].max_threads,
1716 strategies[i].min_threads,
1717 strategies[i].tmu_spilling_allowed,
1718 strategies[i].disable_loop_unrolling,
1719 strategies[i].disable_ubo_load_sorting,
1720 strategies[i].disable_tmu_pipelining,
1721 i == ARRAY_SIZE(strategies) - 1);
1722
1723 v3d_attempt_compile(c);
1724
1725 if (i >= ARRAY_SIZE(strategies) - 1 ||
1726 c->compilation_result !=
1727 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) {
1728 break;
1729 }
1730 }
1731
1732 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&
1733 c->compilation_result !=
1734 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
1735 c->spills > 0) {
1736 char *debug_msg;
1737 int ret = asprintf(&debug_msg,
1738 "Compiled %s with %d spills and %d fills",
1739 vir_get_stage_name(c),
1740 c->spills, c->fills);
1741 fprintf(stderr, "%s\n", debug_msg);
1742
1743 if (ret >= 0) {
1744 c->debug_output(debug_msg, c->debug_output_data);
1745 free(debug_msg);
1746 }
1747 }
1748
1749 if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
1750 fprintf(stderr, "Failed to compile %s with any strategy.\n",
1751 vir_get_stage_name(c));
1752 }
1753
1754 struct v3d_prog_data *prog_data;
1755
1756 prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
1757
1758 v3d_set_prog_data(c, prog_data);
1759
1760 *out_prog_data = prog_data;
1761
1762 char *shaderdb;
1763 int ret = v3d_shaderdb_dump(c, &shaderdb);
1764 if (ret >= 0) {
1765 if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
1766 fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
1767
1768 c->debug_output(shaderdb, c->debug_output_data);
1769 free(shaderdb);
1770 }
1771
1772 return v3d_return_qpu_insts(c, final_assembly_size);
1773 }
1774
1775 void
1776 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
1777 {
1778 if (qinst->dst.file == QFILE_TEMP)
1779 c->defs[qinst->dst.index] = NULL;
1780
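         /* The compile cursor must not be left pointing at the instruction we
          * are about to free (vir_compile_destroy clears it before tearing
          * down whole blocks).
          */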
1781 assert(&qinst->link != c->cursor.link);
1782
1783 list_del(&qinst->link);
1784 free(qinst);
1785
1786 c->live_intervals_valid = false;
1787 }
1788
1789 struct qreg
1790 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
1791 {
1792 /* XXX
1793 int pack = reg.pack;
1794
1795 while (reg.file == QFILE_TEMP &&
1796 c->defs[reg.index] &&
1797 (c->defs[reg.index]->op == QOP_MOV ||
1798 c->defs[reg.index]->op == QOP_FMOV) &&
1799 !c->defs[reg.index]->dst.pack &&
1800 !c->defs[reg.index]->src[0].pack) {
1801 reg = c->defs[reg.index]->src[0];
1802 }
1803
1804 reg.pack = pack;
1805 */
1806 return reg;
1807 }
1808
1809 void
1810 vir_compile_destroy(struct v3d_compile *c)
1811 {
1812 /* Defuse the assert that we aren't removing the cursor's instruction.
1813 */
1814 c->cursor.link = NULL;
1815
1816 vir_for_each_block(block, c) {
1817 while (!list_is_empty(&block->instructions)) {
1818 struct qinst *qinst =
1819 list_first_entry(&block->instructions,
1820 struct qinst, link);
1821 vir_remove_instruction(c, qinst);
1822 }
1823 }
1824
1825 ralloc_free(c);
1826 }
1827
1828 uint32_t
1829 vir_get_uniform_index(struct v3d_compile *c,
1830 enum quniform_contents contents,
1831 uint32_t data)
1832 {
1833 for (int i = 0; i < c->num_uniforms; i++) {
1834 if (c->uniform_contents[i] == contents &&
1835 c->uniform_data[i] == data) {
1836 return i;
1837 }
1838 }
1839
1840 uint32_t uniform = c->num_uniforms++;
1841
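         /* Grow the uniform arrays when we run out of space: at least 16
          * entries, otherwise doubling the current size.
          */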
1842 if (uniform >= c->uniform_array_size) {
1843 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
1844 c->uniform_array_size * 2);
1845
1846 c->uniform_data = reralloc(c, c->uniform_data,
1847 uint32_t,
1848 c->uniform_array_size);
1849 c->uniform_contents = reralloc(c, c->uniform_contents,
1850 enum quniform_contents,
1851 c->uniform_array_size);
1852 }
1853
1854 c->uniform_contents[uniform] = contents;
1855 c->uniform_data[uniform] = data;
1856
1857 return uniform;
1858 }
1859
1860 /* Looks back into the current block to find the ldunif that wrote the uniform
1861 * at the requested index. If it finds it, it returns true and writes the
1862 * destination register of the ldunif instruction to 'unif'.
1863 *
1864 * This can impact register pressure and end up leading to worse code, so we
1865 * limit the number of instructions we are willing to look back through to
1866 * strike a good balance.
1867 */
1868 static bool
1869 try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
1870 {
1871 uint32_t count = 20;
1872 struct qinst *prev_inst = NULL;
1873 assert(c->cur_block);
1874
1875 #ifdef DEBUG
1876 /* We can only reuse a uniform if it was emitted in the same block,
1877 * so callers must make sure the current instruction is being emitted
1878 * in the current block.
1879 */
1880 bool found = false;
1881 vir_for_each_inst(inst, c->cur_block) {
1882 if (&inst->link == c->cursor.link) {
1883 found = true;
1884 break;
1885 }
1886 }
1887
1888 assert(found || &c->cur_block->instructions == c->cursor.link);
1889 #endif
1890
1891 list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
1892 &c->cur_block->instructions, link) {
1893 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
1894 inst->uniform == index) {
1895 prev_inst = inst;
1896 break;
1897 }
1898
1899 if (--count == 0)
1900 break;
1901 }
1902
1903 if (!prev_inst)
1904 return false;
1905
1906
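         /* Make sure no instruction after the ldunif we found overwrites its
          * destination register; if one does, the loaded value is stale and
          * cannot be reused.
          */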
1907 list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
1908 &c->cur_block->instructions, link) {
1909 if (inst->dst.file == prev_inst->dst.file &&
1910 inst->dst.index == prev_inst->dst.index) {
1911 return false;
1912 }
1913 }
1914
1915 *unif = prev_inst->dst;
1916 return true;
1917 }
1918
1919 struct qreg
1920 vir_uniform(struct v3d_compile *c,
1921 enum quniform_contents contents,
1922 uint32_t data)
1923 {
1924 const int num_uniforms = c->num_uniforms;
1925 const int index = vir_get_uniform_index(c, contents, data);
1926
1927         /* If this is not the first time we see this uniform, try to reuse the
1928 * result of the last ldunif that loaded it.
1929 */
1930 const bool is_new_uniform = num_uniforms != c->num_uniforms;
1931 if (!is_new_uniform && !c->disable_ldunif_opt) {
1932 struct qreg ldunif_dst;
1933 if (try_opt_ldunif(c, index, &ldunif_dst))
1934 return ldunif_dst;
1935 }
1936
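         /* If we couldn't reuse a previous ldunif, emit a new one: a NOP
          * carrying the ldunif signal that writes the uniform into a fresh
          * temporary.
          */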
1937 struct qinst *inst = vir_NOP(c);
1938 inst->qpu.sig.ldunif = true;
1939 inst->uniform = index;
1940 inst->dst = vir_get_temp(c);
1941 c->defs[inst->dst.index] = inst;
1942 return inst->dst;
1943 }
1944
1945 #define OPTPASS(func) \
1946 do { \
1947 bool stage_progress = func(c); \
1948 if (stage_progress) { \
1949 progress = true; \
1950 if (print_opt_debug) { \
1951 fprintf(stderr, \
1952 "VIR opt pass %2d: %s progress\n", \
1953 pass, #func); \
1954 } \
1955 /*XXX vir_validate(c);*/ \
1956 } \
1957 } while (0)
1958
1959 void
1960 vir_optimize(struct v3d_compile *c)
1961 {
1962 bool print_opt_debug = false;
1963 int pass = 1;
1964
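         /* Run the pass list repeatedly until no pass reports progress. */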
1965 while (true) {
1966 bool progress = false;
1967
1968 OPTPASS(vir_opt_copy_propagate);
1969 OPTPASS(vir_opt_redundant_flags);
1970 OPTPASS(vir_opt_dead_code);
1971 OPTPASS(vir_opt_small_immediates);
1972 OPTPASS(vir_opt_constant_alu);
1973
1974 if (!progress)
1975 break;
1976
1977 pass++;
1978 }
1979 }
1980
1981 const char *
1982 vir_get_stage_name(struct v3d_compile *c)
1983 {
1984 if (c->vs_key && c->vs_key->is_coord)
1985 return "MESA_SHADER_VERTEX_BIN";
1986 else if (c->gs_key && c->gs_key->is_coord)
1987 return "MESA_SHADER_GEOMETRY_BIN";
1988 else
1989 return gl_shader_stage_name(c->s->info.stage);
1990 }
1991
1992 static inline uint32_t
1993 compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
1994 {
1995 assert(devinfo->vpm_size > 0);
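         /* A sector holds 8 rows of V3D_CHANNELS 32-bit words. */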
1996 const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
1997 return devinfo->vpm_size / sector_size;
1998 }
1999
2000 /* Computes various parameters affecting VPM memory configuration for programs
2001 * involving geometry shaders to ensure the program fits in memory and honors
2002 * requirements described in section "VPM usage" of the programming manual.
2003 */
2004 static bool
2005 compute_vpm_config_gs(struct v3d_device_info *devinfo,
2006 struct v3d_vs_prog_data *vs,
2007 struct v3d_gs_prog_data *gs,
2008 struct vpm_config *vpm_cfg_out)
2009 {
2010 const uint32_t A = vs->separate_segments ? 1 : 0;
2011 const uint32_t Ad = vs->vpm_input_size;
2012 const uint32_t Vd = vs->vpm_output_size;
2013
2014 const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
2015
2016 /* Try to fit program into our VPM memory budget by adjusting
2017 * configurable parameters iteratively. We do this in two phases:
2018 * the first phase tries to fit the program into the total available
2019 * VPM memory. If we succeed at that, then the second phase attempts
2020 * to fit the program into half of that budget so we can run bin and
2021 * render programs in parallel.
2022 */
2023 struct vpm_config vpm_cfg[2];
2024 struct vpm_config *final_vpm_cfg = NULL;
2025 uint32_t phase = 0;
2026
2027 vpm_cfg[phase].As = 1;
2028 vpm_cfg[phase].Gs = 1;
2029 vpm_cfg[phase].Gd = gs->vpm_output_size;
2030 vpm_cfg[phase].gs_width = gs->simd_width;
2031
2032 /* While there is a requirement that Vc >= [Vn / 16], this is
2033 * always the case when tessellation is not present because in that
2034 * case Vn can only be 6 at most (when input primitive is triangles
2035 * with adjacency).
2036 *
2037 * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
2038 * and Broadcom has not found it worth it to increase it beyond this
2039          * in general. Increasing Vc also increases VPM memory pressure, which
2040          * can turn out to be detrimental to performance in some scenarios.
2041 */
2042 vpm_cfg[phase].Vc = 2;
2043
2044 /* Gv is a constraint on the hardware to not exceed the
2045 * specified number of vertex segments per GS batch. If adding a
2046 * new primitive to a GS batch would result in a range of more
2047 * than Gv vertex segments being referenced by the batch, then
2048 * the hardware will flush the batch and start a new one. This
2049          * means that we can choose any value we want; we just need to
2050 * be aware that larger values improve GS batch utilization
2051 * at the expense of more VPM memory pressure (which can affect
2052 * other performance aspects, such as GS dispatch width).
2053 * We start with the largest value, and will reduce it if we
2054 * find that total memory pressure is too high.
2055 */
2056 vpm_cfg[phase].Gv = 3;
2057 do {
2058                 /* When GS is present in the absence of TES, we need to satisfy
2059 * that Ve >= Gv. We go with the smallest value of Ve to avoid
2060 * increasing memory pressure.
2061 */
2062 vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
2063
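                 /* Total sectors needed: VS input segments (A * As * Ad),
                  * VS output segments ((Vc + Ve) * Vd) and GS output
                  * segments (Gs * Gd).
                  */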
2064 uint32_t vpm_sectors =
2065 A * vpm_cfg[phase].As * Ad +
2066 (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
2067 vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
2068
2069 /* Ideally we want to use no more than half of the available
2070 * memory so we can execute a bin and render program in parallel
2071 * without stalls. If we achieved that then we are done.
2072 */
2073 if (vpm_sectors <= vpm_size / 2) {
2074 final_vpm_cfg = &vpm_cfg[phase];
2075 break;
2076 }
2077
2078 /* At the very least, we should not allocate more than the
2079 * total available VPM memory. If we have a configuration that
2080                  * succeeds at this, we save it and continue to see if we can
2081                  * meet the half-memory-use criterion too.
2082 */
2083 if (phase == 0 && vpm_sectors <= vpm_size) {
2084 vpm_cfg[1] = vpm_cfg[0];
2085 phase = 1;
2086 }
2087
2088 /* Try lowering Gv */
2089 if (vpm_cfg[phase].Gv > 0) {
2090 vpm_cfg[phase].Gv--;
2091 continue;
2092 }
2093
2094 /* Try lowering GS dispatch width */
2095 if (vpm_cfg[phase].gs_width > 1) {
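                         /* Halve the dispatch width; a width of 2 is not
                          * valid (see the asserts below), so the loop skips
                          * from 4 straight to 1, halving Gd each step.
                          */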
2096 do {
2097 vpm_cfg[phase].gs_width >>= 1;
2098 vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
2099 } while (vpm_cfg[phase].gs_width == 2);
2100
2101 /* Reset Gv to max after dropping dispatch width */
2102 vpm_cfg[phase].Gv = 3;
2103 continue;
2104 }
2105
2106 /* We ran out of options to reduce memory pressure. If we
2107                  * are at phase 1, we have at least a valid configuration, so we
2108                  * use that.
2109 */
2110 if (phase == 1)
2111 final_vpm_cfg = &vpm_cfg[0];
2112 break;
2113 } while (true);
2114
2115 if (!final_vpm_cfg)
2116 return false;
2117
2118 assert(final_vpm_cfg);
2119 assert(final_vpm_cfg->Gd <= 16);
2120 assert(final_vpm_cfg->Gv < 4);
2121 assert(final_vpm_cfg->Ve < 4);
2122 assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
2123 assert(final_vpm_cfg->gs_width == 1 ||
2124 final_vpm_cfg->gs_width == 4 ||
2125 final_vpm_cfg->gs_width == 8 ||
2126 final_vpm_cfg->gs_width == 16);
2127
2128 *vpm_cfg_out = *final_vpm_cfg;
2129 return true;
2130 }
2131
2132 bool
2133 v3d_compute_vpm_config(struct v3d_device_info *devinfo,
2134 struct v3d_vs_prog_data *vs_bin,
2135 struct v3d_vs_prog_data *vs,
2136 struct v3d_gs_prog_data *gs_bin,
2137 struct v3d_gs_prog_data *gs,
2138 struct vpm_config *vpm_cfg_bin,
2139 struct vpm_config *vpm_cfg)
2140 {
2141 assert(vs && vs_bin);
2142 assert((gs != NULL) == (gs_bin != NULL));
2143
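         /* Without a GS the VPM configuration comes straight from the VS:
          * fixed As/Ve values and the VCM cache size chosen when the VS was
          * compiled.
          */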
2144 if (!gs) {
2145 vpm_cfg_bin->As = 1;
2146 vpm_cfg_bin->Ve = 0;
2147 vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
2148
2149 vpm_cfg->As = 1;
2150 vpm_cfg->Ve = 0;
2151 vpm_cfg->Vc = vs->vcm_cache_size;
2152 } else {
2153 if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
2154 return false;
2155
2156 if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
2157 return false;
2158 }
2159
2160 return true;
2161 }
2162