1 /*
2  * Copyright © 2016-2017 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "broadcom/common/v3d_device_info.h"
25 #include "v3d_compiler.h"
26 #include "compiler/nir/nir_schedule.h"
27 #include "compiler/nir/nir_builder.h"
28 #include "util/perf/cpu_trace.h"
29 
30 int
31 vir_get_nsrc(struct qinst *inst)
32 {
33         switch (inst->qpu.type) {
34         case V3D_QPU_INSTR_TYPE_BRANCH:
35                 return 0;
36         case V3D_QPU_INSTR_TYPE_ALU:
37                 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
38                         return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
39                 else
40                         return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
41         }
42 
43         return 0;
44 }
45 
46 /**
47  * Returns whether the instruction has any side effects that must be
48  * preserved.
49  */
50 bool
51 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
52 {
53         switch (inst->qpu.type) {
54         case V3D_QPU_INSTR_TYPE_BRANCH:
55                 return true;
56         case V3D_QPU_INSTR_TYPE_ALU:
57                 switch (inst->qpu.alu.add.op) {
58                 case V3D_QPU_A_SETREVF:
59                 case V3D_QPU_A_SETMSF:
60                 case V3D_QPU_A_VPMSETUP:
61                 case V3D_QPU_A_STVPMV:
62                 case V3D_QPU_A_STVPMD:
63                 case V3D_QPU_A_STVPMP:
64                 case V3D_QPU_A_VPMWT:
65                 case V3D_QPU_A_TMUWT:
66                         return true;
67                 default:
68                         break;
69                 }
70 
71                 switch (inst->qpu.alu.mul.op) {
72                 case V3D_QPU_M_MULTOP:
73                         return true;
74                 default:
75                         break;
76                 }
77         }
78 
79         if (inst->qpu.sig.ldtmu ||
80             inst->qpu.sig.ldvary ||
81             inst->qpu.sig.ldtlbu ||
82             inst->qpu.sig.ldtlb ||
83             inst->qpu.sig.wrtmuc ||
84             inst->qpu.sig.thrsw) {
85                 return true;
86         }
87 
88         /* ldunifa works like ldunif: it reads an element and advances the
89          * pointer, so each read has a side effect (we don't care for ldunif
90          * because we reconstruct the uniform stream buffer after compiling
91          * with the surviving uniforms), so allowing DCE to remove
92          * one would break follow-up loads. We could fix this by emitting a
93          * unifa for each ldunifa, but each unifa requires 3 delay slots
94          * before a ldunifa, so that would be quite expensive.
95          */
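        /* Editor's illustration (not part of the original source): with a
         * unifa stream holding { A, B, C }, two back-to-back ldunifa reads
         * return A and then B; if DCE were allowed to drop the first read,
         * the second one would silently return A instead of B.
         */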
96         if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
97                 return true;
98 
99         return false;
100 }
101 
102 bool
103 vir_is_raw_mov(struct qinst *inst)
104 {
105         if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
106             (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
107              inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
108                 return false;
109         }
110 
111         if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
112             inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
113                 return false;
114         }
115 
116         if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
117             inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
118             inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
119             inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
120                 return false;
121         }
122 
123         if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
124             inst->qpu.flags.mc != V3D_QPU_COND_NONE)
125                 return false;
126 
127         return true;
128 }
129 
130 bool
131 vir_is_add(struct qinst *inst)
132 {
133         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
134                 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
135 }
136 
137 bool
138 vir_is_mul(struct qinst *inst)
139 {
140         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
141                 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
142 }
143 
144 bool
145 vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
146 {
147         if (inst->dst.file == QFILE_MAGIC)
148                 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
149 
150         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
151             inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
152                 return true;
153         }
154 
155         return false;
156 }
157 
158 bool
159 vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
160                          struct qinst *inst)
161 {
162         if (!devinfo->has_accumulators)
163                 return false;
164 
165         switch (inst->dst.file) {
166         case QFILE_MAGIC:
167                 switch (inst->dst.index) {
168                 case V3D_QPU_WADDR_RECIP:
169                 case V3D_QPU_WADDR_RSQRT:
170                 case V3D_QPU_WADDR_EXP:
171                 case V3D_QPU_WADDR_LOG:
172                 case V3D_QPU_WADDR_SIN:
173                         return true;
174                 }
175                 break;
176         default:
177                 break;
178         }
179 
180         return false;
181 }
182 
183 void
184 vir_set_unpack(struct qinst *inst, int src,
185                enum v3d_qpu_input_unpack unpack)
186 {
187         assert(src == 0 || src == 1);
188 
189         if (vir_is_add(inst)) {
190                 if (src == 0)
191                         inst->qpu.alu.add.a.unpack = unpack;
192                 else
193                         inst->qpu.alu.add.b.unpack = unpack;
194         } else {
195                 assert(vir_is_mul(inst));
196                 if (src == 0)
197                         inst->qpu.alu.mul.a.unpack = unpack;
198                 else
199                         inst->qpu.alu.mul.b.unpack = unpack;
200         }
201 }
202 
203 void
204 vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
205 {
206         if (vir_is_add(inst)) {
207                 inst->qpu.alu.add.output_pack = pack;
208         } else {
209                 assert(vir_is_mul(inst));
210                 inst->qpu.alu.mul.output_pack = pack;
211         }
212 }
213 
214 void
215 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
216 {
217         if (vir_is_add(inst)) {
218                 inst->qpu.flags.ac = cond;
219         } else {
220                 assert(vir_is_mul(inst));
221                 inst->qpu.flags.mc = cond;
222         }
223 }
224 
225 enum v3d_qpu_cond
226 vir_get_cond(struct qinst *inst)
227 {
228         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
229 
230         if (vir_is_add(inst))
231                 return inst->qpu.flags.ac;
232         else if (vir_is_mul(inst))
233                 return inst->qpu.flags.mc;
234         else /* NOP */
235                 return V3D_QPU_COND_NONE;
236 }
237 
238 void
239 vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
240 {
241         c->flags_temp = -1;
242         if (vir_is_add(inst)) {
243                 inst->qpu.flags.apf = pf;
244         } else {
245                 assert(vir_is_mul(inst));
246                 inst->qpu.flags.mpf = pf;
247         }
248 }
249 
250 void
251 vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
252 {
253         c->flags_temp = -1;
254         if (vir_is_add(inst)) {
255                 inst->qpu.flags.auf = uf;
256         } else {
257                 assert(vir_is_mul(inst));
258                 inst->qpu.flags.muf = uf;
259         }
260 }
261 
262 #if 0
263 uint8_t
264 vir_channels_written(struct qinst *inst)
265 {
266         if (vir_is_mul(inst)) {
267                 switch (inst->dst.pack) {
268                 case QPU_PACK_MUL_NOP:
269                 case QPU_PACK_MUL_8888:
270                         return 0xf;
271                 case QPU_PACK_MUL_8A:
272                         return 0x1;
273                 case QPU_PACK_MUL_8B:
274                         return 0x2;
275                 case QPU_PACK_MUL_8C:
276                         return 0x4;
277                 case QPU_PACK_MUL_8D:
278                         return 0x8;
279                 }
280         } else {
281                 switch (inst->dst.pack) {
282                 case QPU_PACK_A_NOP:
283                 case QPU_PACK_A_8888:
284                 case QPU_PACK_A_8888_SAT:
285                 case QPU_PACK_A_32_SAT:
286                         return 0xf;
287                 case QPU_PACK_A_8A:
288                 case QPU_PACK_A_8A_SAT:
289                         return 0x1;
290                 case QPU_PACK_A_8B:
291                 case QPU_PACK_A_8B_SAT:
292                         return 0x2;
293                 case QPU_PACK_A_8C:
294                 case QPU_PACK_A_8C_SAT:
295                         return 0x4;
296                 case QPU_PACK_A_8D:
297                 case QPU_PACK_A_8D_SAT:
298                         return 0x8;
299                 case QPU_PACK_A_16A:
300                 case QPU_PACK_A_16A_SAT:
301                         return 0x3;
302                 case QPU_PACK_A_16B:
303                 case QPU_PACK_A_16B_SAT:
304                         return 0xc;
305                 }
306         }
307         unreachable("Bad pack field");
308 }
309 #endif
310 
311 struct qreg
312 vir_get_temp(struct v3d_compile *c)
313 {
314         struct qreg reg;
315 
316         reg.file = QFILE_TEMP;
317         reg.index = c->num_temps++;
318 
319         if (c->num_temps > c->defs_array_size) {
320                 uint32_t old_size = c->defs_array_size;
321                 c->defs_array_size = MAX2(old_size * 2, 16);
322 
323                 c->defs = reralloc(c, c->defs, struct qinst *,
324                                    c->defs_array_size);
325                 memset(&c->defs[old_size], 0,
326                        sizeof(c->defs[0]) * (c->defs_array_size - old_size));
327 
328                 c->spillable = reralloc(c, c->spillable,
329                                         BITSET_WORD,
330                                         BITSET_WORDS(c->defs_array_size));
331                 for (int i = old_size; i < c->defs_array_size; i++)
332                         BITSET_SET(c->spillable, i);
333         }
334 
335         return reg;
336 }
337 
338 struct qinst *
339 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
340 {
341         struct qinst *inst = calloc(1, sizeof(*inst));
342 
343         inst->qpu = v3d_qpu_nop();
344         inst->qpu.alu.add.op = op;
345 
346         inst->dst = dst;
347         inst->src[0] = src0;
348         inst->src[1] = src1;
349         inst->uniform = ~0;
350 
351         inst->ip = -1;
352 
353         return inst;
354 }
355 
356 struct qinst *
357 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
358 {
359         struct qinst *inst = calloc(1, sizeof(*inst));
360 
361         inst->qpu = v3d_qpu_nop();
362         inst->qpu.alu.mul.op = op;
363 
364         inst->dst = dst;
365         inst->src[0] = src0;
366         inst->src[1] = src1;
367         inst->uniform = ~0;
368 
369         inst->ip = -1;
370 
371         return inst;
372 }
373 
374 struct qinst *
375 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
376 {
377         struct qinst *inst = calloc(1, sizeof(*inst));
378 
379         inst->qpu = v3d_qpu_nop();
380         inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
381         inst->qpu.branch.cond = cond;
382         inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
383         inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
384         inst->qpu.branch.ub = true;
385         inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
386 
387         inst->dst = vir_nop_reg();
388         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
389 
390         inst->ip = -1;
391 
392         return inst;
393 }
394 
395 static void
396 vir_emit(struct v3d_compile *c, struct qinst *inst)
397 {
398         inst->ip = -1;
399 
400         switch (c->cursor.mode) {
401         case vir_cursor_add:
402                 list_add(&inst->link, c->cursor.link);
403                 break;
404         case vir_cursor_addtail:
405                 list_addtail(&inst->link, c->cursor.link);
406                 break;
407         }
408 
409         c->cursor = vir_after_inst(inst);
410         c->live_intervals_valid = false;
411 }
412 
413 /* Updates inst to write to a new temporary, emits it, and notes the def. */
414 struct qreg
415 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
416 {
417         assert(inst->dst.file == QFILE_NULL);
418 
419         /* If we're emitting an instruction that's a def, it had better be
420          * writing a register.
421          */
422         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
423                 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
424                        v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
425                 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
426                        v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
427         }
428 
429         inst->dst = vir_get_temp(c);
430 
431         if (inst->dst.file == QFILE_TEMP)
432                 c->defs[inst->dst.index] = inst;
433 
434         vir_emit(c, inst);
435 
436         return inst->dst;
437 }
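/* Editor's note (hedged sketch, not from the original file): a typical ALU
 * emit helper would pair the constructors above with vir_emit_def(), e.g.
 *
 *     struct qreg t = vir_emit_def(c, vir_add_inst(op, vir_nop_reg(), a, b));
 *
 * relying on the QFILE_NULL destination that vir_nop_reg() provides, while
 * vir_emit_nondef() below is used when the destination (a magic register or
 * an existing temp) has already been set on the instruction.
 */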
438 
439 struct qinst *
440 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
441 {
442         if (inst->dst.file == QFILE_TEMP)
443                 c->defs[inst->dst.index] = NULL;
444 
445         vir_emit(c, inst);
446 
447         return inst;
448 }
449 
450 struct qblock *
451 vir_new_block(struct v3d_compile *c)
452 {
453         struct qblock *block = rzalloc(c, struct qblock);
454 
455         list_inithead(&block->instructions);
456 
457         block->predecessors = _mesa_set_create(block,
458                                                _mesa_hash_pointer,
459                                                _mesa_key_pointer_equal);
460 
461         block->index = c->next_block_index++;
462 
463         return block;
464 }
465 
466 void
467 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
468 {
469         c->cur_block = block;
470         c->cursor = vir_after_block(block);
471         list_addtail(&block->link, &c->blocks);
472 }
473 
474 struct qblock *
475 vir_entry_block(struct v3d_compile *c)
476 {
477         return list_first_entry(&c->blocks, struct qblock, link);
478 }
479 
480 struct qblock *
481 vir_exit_block(struct v3d_compile *c)
482 {
483         return list_last_entry(&c->blocks, struct qblock, link);
484 }
485 
486 void
487 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
488 {
489         _mesa_set_add(successor->predecessors, predecessor);
490         if (predecessor->successors[0]) {
491                 assert(!predecessor->successors[1]);
492                 predecessor->successors[1] = successor;
493         } else {
494                 predecessor->successors[0] = successor;
495         }
496 }
497 
498 const struct v3d_compiler *
499 v3d_compiler_init(const struct v3d_device_info *devinfo,
500                   uint32_t max_inline_uniform_buffers)
501 {
502         struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
503         if (!compiler)
504                 return NULL;
505 
506         compiler->devinfo = devinfo;
507         compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
508 
509         if (!vir_init_reg_sets(compiler)) {
510                 ralloc_free(compiler);
511                 return NULL;
512         }
513 
514         return compiler;
515 }
516 
517 void
518 v3d_compiler_free(const struct v3d_compiler *compiler)
519 {
520         ralloc_free((void *)compiler);
521 }
522 
523 struct v3d_compiler_strategy {
524         const char *name;
525         uint32_t max_threads;
526         uint32_t min_threads;
527         bool disable_general_tmu_sched;
528         bool disable_gcm;
529         bool disable_loop_unrolling;
530         bool disable_ubo_load_sorting;
531         bool move_buffer_loads;
532         bool disable_tmu_pipelining;
533         uint32_t max_tmu_spills;
534 };
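/* Hypothetical example entry (editor's addition, not the driver's actual
 * strategy table): a "default" strategy could look like
 *
 *     { "default", 4, 4, false, false, false, false, false, false, 0 },
 *
 * i.e. max_threads = min_threads = 4, no scheduling features disabled, and
 * no TMU spills tolerated before falling back to a more conservative
 * strategy.
 */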
535 
536 static struct v3d_compile *
537 vir_compile_init(const struct v3d_compiler *compiler,
538                  struct v3d_key *key,
539                  nir_shader *s,
540                  void (*debug_output)(const char *msg,
541                                       void *debug_output_data),
542                  void *debug_output_data,
543                  int program_id, int variant_id,
544                  uint32_t compile_strategy_idx,
545                  const struct v3d_compiler_strategy *strategy,
546                  bool fallback_scheduler)
547 {
548         struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
549 
550         c->compiler = compiler;
551         c->devinfo = compiler->devinfo;
552         c->key = key;
553         c->program_id = program_id;
554         c->variant_id = variant_id;
555         c->compile_strategy_idx = compile_strategy_idx;
556         c->threads = strategy->max_threads;
557         c->debug_output = debug_output;
558         c->debug_output_data = debug_output_data;
559         c->compilation_result = V3D_COMPILATION_SUCCEEDED;
560         c->min_threads_for_reg_alloc = strategy->min_threads;
561         c->max_tmu_spills = strategy->max_tmu_spills;
562         c->fallback_scheduler = fallback_scheduler;
563         c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
564         c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
565         c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
566         c->move_buffer_loads = strategy->move_buffer_loads;
567         c->disable_gcm = strategy->disable_gcm;
568         c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
569                 ? true : strategy->disable_loop_unrolling;
570 
571 
572         s = nir_shader_clone(c, s);
573         c->s = s;
574 
575         list_inithead(&c->blocks);
576         vir_set_emit_block(c, vir_new_block(c));
577 
578         c->output_position_index = -1;
579         c->output_sample_mask_index = -1;
580 
581         c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
582                                             _mesa_key_pointer_equal);
583 
584         c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
585         c->flags_temp = -1;
586 
587         return c;
588 }
589 
590 static int
591 type_size_vec4(const struct glsl_type *type, bool bindless)
592 {
593         return glsl_count_attribute_slots(type, false);
594 }
595 
596 static enum nir_lower_tex_packing
597 lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
598 {
599    struct v3d_compile *c = (struct v3d_compile *) data;
600 
601    int sampler_index = nir_tex_instr_need_sampler(tex) ?
602       tex->sampler_index : tex->backend_flags;
603 
604    assert(sampler_index < c->key->num_samplers_used);
605    return c->key->sampler[sampler_index].return_size == 16 ?
606       nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
607 }
608 
609 static bool
610 v3d_nir_lower_null_pointers_cb(nir_builder *b,
611                                nir_intrinsic_instr *intr,
612                                void *_state)
613 {
614         uint32_t buffer_src_idx;
615 
616         switch (intr->intrinsic) {
617         case nir_intrinsic_load_ubo:
618         case nir_intrinsic_load_ssbo:
619                 buffer_src_idx = 0;
620                 break;
621         case nir_intrinsic_store_ssbo:
622                 buffer_src_idx = 1;
623                 break;
624         default:
625                 return false;
626         }
627 
628         /* If the index is constant we are good */
629         nir_src *src = &intr->src[buffer_src_idx];
630         if (nir_src_is_const(*src))
631                 return false;
632 
633         /* Otherwise, see if it comes from a bcsel including a null pointer */
634         if (src->ssa->parent_instr->type != nir_instr_type_alu)
635                 return false;
636 
637         nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
638         if (alu->op != nir_op_bcsel)
639                 return false;
640 
641         /* A null pointer is specified using block index 0xffffffff */
642         int32_t null_src_idx = -1;
643         for (int i = 1; i < 3; i++) {
644                  /* FIXME: since we are running this before optimization,
645                   * maybe we also need to handle the case where we have a
646                   * bcsel chain that we need to recurse into?
647                   */
648                 if (!nir_src_is_const(alu->src[i].src))
649                         continue;
650                 if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
651                         continue;
652 
653                 /* One of the bcsel srcs is a null pointer reference */
654                 null_src_idx = i;
655                 break;
656         }
657 
658         if (null_src_idx < 0)
659                 return false;
660 
661         assert(null_src_idx == 1 || null_src_idx == 2);
662         int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;
663 
664         /* Rewrite the null pointer reference so we use the same buffer index
665          * as the other bcsel branch. This will allow optimization to remove
666          * the bcsel and we should then end up with a constant buffer index
667          * like we need.
668          */
669         b->cursor = nir_before_instr(&alu->instr);
670         nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
671         nir_src_rewrite(&alu->src[null_src_idx].src, copy);
672 
673         return true;
674 }
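/* Editor's illustration (hedged, the NIR names are only an example): for
 *
 *     %idx = bcsel %cond, %real_index, 0xffffffff
 *     %val = @load_ssbo(%idx, %offset)
 *
 * the pass above rewrites the 0xffffffff operand to a copy of %real_index,
 * so later optimization can fold the bcsel away and leave the constant
 * buffer index the backend needs.
 */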
675 
676 static bool
677 v3d_nir_lower_null_pointers(nir_shader *s)
678 {
679         return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
680                                             nir_metadata_control_flow, NULL);
681 }
682 
683 static unsigned
684 lower_bit_size_cb(const nir_instr *instr, void *_data)
685 {
686         if (instr->type != nir_instr_type_alu)
687                 return 0;
688 
689         nir_alu_instr *alu = nir_instr_as_alu(instr);
690 
691         switch (alu->op) {
692         case nir_op_mov:
693         case nir_op_vec2:
694         case nir_op_vec3:
695         case nir_op_vec4:
696         case nir_op_vec5:
697         case nir_op_vec8:
698         case nir_op_vec16:
699         case nir_op_b2i8:
700         case nir_op_b2f16:
701         case nir_op_b2i16:
702         case nir_op_b2f32:
703         case nir_op_b2i32:
704         case nir_op_f2f16:
705         case nir_op_f2f16_rtne:
706         case nir_op_f2f16_rtz:
707         case nir_op_f2f32:
708         case nir_op_f2i32:
709         case nir_op_f2u32:
710         case nir_op_i2i8:
711         case nir_op_i2i16:
712         case nir_op_i2f16:
713         case nir_op_i2f32:
714         case nir_op_i2i32:
715         case nir_op_u2u8:
716         case nir_op_u2u16:
717         case nir_op_u2f16:
718         case nir_op_u2f32:
719         case nir_op_u2u32:
720         case nir_op_pack_32_2x16_split:
721         case nir_op_pack_32_4x8_split:
722         case nir_op_pack_half_2x16_split:
723                 return 0;
724 
725         /* All remaining opcodes need to be handled here, as they only work with 32 bits */
726         default:
727                 if (alu->src[0].src.ssa->bit_size != 1 && alu->src[0].src.ssa->bit_size < 32)
728                         return 32;
729                 return 0;
730         }
731 }
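/* Editor's note (hedged): returning 32 above asks nir_lower_bit_size to
 * widen the operation, e.g. a 16-bit iadd is rewritten to operate on 32-bit
 * values with conversions inserted around it, since these opcodes only work
 * on 32-bit (or 1-bit boolean) values on the QPU ALU.
 */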
732 
733 static void
734 v3d_lower_nir(struct v3d_compile *c)
735 {
736         struct nir_lower_tex_options tex_options = {
737                 .lower_txd = true,
738                 .lower_tg4_offsets = true,
739                 .lower_tg4_broadcom_swizzle = true,
740 
741                 .lower_rect = false, /* XXX: Use this on V3D 3.x */
742                 .lower_txp = ~0,
743                 /* Apply swizzles to all samplers. */
744                 .swizzle_result = ~0,
745                 .lower_invalid_implicit_lod = true,
746         };
747 
748         /* Lower the format swizzle and (for 32-bit returns)
749          * ARB_texture_swizzle-style swizzle.
750          */
751         assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
752         for (int i = 0; i < c->key->num_tex_used; i++) {
753                 for (int j = 0; j < 4; j++)
754                         tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
755         }
756 
757         tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
758         tex_options.lower_tex_packing_data = c;
759 
760         NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
761         NIR_PASS(_, c->s, nir_lower_system_values);
762 
763         if (c->s->info.zero_initialize_shared_memory &&
764             c->s->info.shared_size > 0) {
765                 /* All our BOs allocate full pages, so the underlying allocation
766                  * for shared memory will always be a multiple of 4KB. This
767                  * ensures that we can do an exact number of full chunk_size
768                  * writes to initialize the memory independently of the actual
769                  * shared_size used by the shader, which is a requirement of
770                  * the initialization pass.
771                  */
772                 const unsigned chunk_size = 16; /* max single store size */
773                 NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
774                          align(c->s->info.shared_size, chunk_size), chunk_size);
775         }
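        /* Editor's worked example (numbers are illustrative only): with
         * shared_size = 24 and chunk_size = 16, the pass is asked to clear
         * align(24, 16) = 32 bytes, i.e. exactly two full 16-byte stores,
         * which the 4KB-aligned BO backing guarantees is always in bounds.
         */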
776 
777         NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
778         NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
779         NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
780         NIR_PASS(_, c->s, nir_lower_bit_size, lower_bit_size_cb, NULL);
781 }
782 
783 static void
784 v3d_set_prog_data_uniforms(struct v3d_compile *c,
785                            struct v3d_prog_data *prog_data)
786 {
787         int count = c->num_uniforms;
788         struct v3d_uniform_list *ulist = &prog_data->uniforms;
789 
790         ulist->count = count;
791         ulist->data = ralloc_array(prog_data, uint32_t, count);
792         memcpy(ulist->data, c->uniform_data,
793                count * sizeof(*ulist->data));
794         ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
795         memcpy(ulist->contents, c->uniform_contents,
796                count * sizeof(*ulist->contents));
797 }
798 
799 static void
800 v3d_vs_set_prog_data(struct v3d_compile *c,
801                      struct v3d_vs_prog_data *prog_data)
802 {
803         /* The vertex data gets format converted by the VPM so that
804          * each attribute channel takes up a VPM column.  Precompute
805          * the sizes for the shader record.
806          */
807         for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
808                 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
809                 prog_data->vpm_input_size += c->vattr_sizes[i];
810         }
811 
812         memset(prog_data->driver_location_map, -1,
813                sizeof(prog_data->driver_location_map));
814 
815         nir_foreach_shader_in_variable(var, c->s) {
816                 prog_data->driver_location_map[var->data.location] =
817                         var->data.driver_location;
818         }
819 
820         prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
821                                           SYSTEM_VALUE_VERTEX_ID) ||
822                               BITSET_TEST(c->s->info.system_values_read,
823                                           SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
824 
825         prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
826                                            SYSTEM_VALUE_BASE_INSTANCE);
827 
828         prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
829                                           SYSTEM_VALUE_INSTANCE_ID) ||
830                               BITSET_TEST(c->s->info.system_values_read,
831                                           SYSTEM_VALUE_INSTANCE_INDEX);
832 
833         if (prog_data->uses_vid)
834                 prog_data->vpm_input_size++;
835         if (prog_data->uses_biid)
836                 prog_data->vpm_input_size++;
837         if (prog_data->uses_iid)
838                 prog_data->vpm_input_size++;
839 
840         prog_data->writes_psiz =
841             c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
842 
843         /* Input/output segment sizes are in sectors (8 rows of 32 bits per
844          * channel).
845          */
846         prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
847         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
848 
849         /* Set us up for shared input/output segments.  This is apparently
850          * necessary for our VCM setup to avoid varying corruption.
851          *
852          * FIXME: initial testing on V3D 7.1 seems to work fine when using
853          * separate segments. So we could try to reevaluate in the future, if
854          * there is any advantage of using separate segments.
855          */
856         prog_data->separate_segments = false;
857         prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
858                                           prog_data->vpm_input_size);
859         prog_data->vpm_input_size = 0;
860 
861         /* Compute VCM cache size.  We set up our program to take up less than
862          * half of the VPM, so that any set of bin and render programs won't
863          * run out of space.  We need space for at least one input segment,
864          * and then allocate the rest to output segments (one for the current
865          * program, the rest to VCM).  The valid range of the VCM cache size
866          * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
867          * batches.
868          */
869         assert(c->devinfo->vpm_size);
870         int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
871         int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
872         int half_vpm = vpm_size_in_sectors / 2;
873         int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
874         int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
875         assert(vpm_output_batches >= 2);
876         prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
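        /* Editor's worked example (assumed numbers, for illustration only):
         * with a 16 KB VPM and V3D_CHANNELS = 16, sector_size is
         * 16 * 4 * 8 = 512 bytes, so vpm_size_in_sectors = 32 and
         * half_vpm = 16. With vpm_input_size = 0 and vpm_output_size = 2
         * sectors, vpm_output_batches = 8 and vcm_cache_size clamps to 4.
         */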
877 }
878 
879 static void
880 v3d_gs_set_prog_data(struct v3d_compile *c,
881                      struct v3d_gs_prog_data *prog_data)
882 {
883         prog_data->num_inputs = c->num_inputs;
884         memcpy(prog_data->input_slots, c->input_slots,
885                c->num_inputs * sizeof(*c->input_slots));
886 
887         /* gl_PrimitiveIdIn is written by the GBG into the first word of the
888          * VPM output header automatically and the shader will overwrite
889          * it after reading it if necessary, so it doesn't add to the VPM
890          * size requirements.
891          */
892         prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
893                                           SYSTEM_VALUE_PRIMITIVE_ID);
894 
895         /* Output segment size is in sectors (8 rows of 32 bits per channel) */
896         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
897 
898         /* Compute SIMD dispatch width and update VPM output size accordingly
899          * to ensure we can fit our program in memory. Available widths are
900          * 16, 8, 4, 1.
901          *
902          * Notice that at draw time we will have to consider VPM memory
903          * requirements from other stages and choose a smaller dispatch
904          * width if needed to fit the program in VPM memory.
905          */
906         prog_data->simd_width = 16;
907         while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
908                prog_data->simd_width == 2) {
909                 prog_data->simd_width >>= 1;
910                 prog_data->vpm_output_size =
911                         align(prog_data->vpm_output_size, 2) / 2;
912         }
913         assert(prog_data->vpm_output_size <= 16);
914         assert(prog_data->simd_width != 2);
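        /* Editor's worked example (illustrative numbers): starting from
         * simd_width = 16 with vpm_output_size = 40 sectors, the loop halves
         * to 8 lanes (20 sectors) and then to 4 lanes (10 sectors), which
         * fits the 16-sector limit; a width of 2 is always halved again
         * because it is not an available dispatch width (16, 8, 4, 1).
         */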
915 
916         prog_data->out_prim_type = c->s->info.gs.output_primitive;
917         prog_data->num_invocations = c->s->info.gs.invocations;
918 
919         prog_data->writes_psiz =
920             c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
921 }
922 
923 static void
924 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
925                             struct v3d_fs_prog_data *prog_data)
926 {
927         prog_data->num_inputs = c->num_inputs;
928         memcpy(prog_data->input_slots, c->input_slots,
929                c->num_inputs * sizeof(*c->input_slots));
930 
931         STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
932                       (V3D_MAX_FS_INPUTS - 1) / 24);
933         for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
934                 if (BITSET_TEST(c->flat_shade_flags, i))
935                         prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
936 
937                 if (BITSET_TEST(c->noperspective_flags, i))
938                         prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
939 
940                 if (BITSET_TEST(c->centroid_flags, i))
941                         prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
942         }
943 }
944 
945 static void
946 v3d_fs_set_prog_data(struct v3d_compile *c,
947                      struct v3d_fs_prog_data *prog_data)
948 {
949         v3d_set_fs_prog_data_inputs(c, prog_data);
950         prog_data->writes_z = c->writes_z;
951         prog_data->writes_z_from_fep = c->writes_z_from_fep;
952         prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
953         prog_data->uses_center_w = c->uses_center_w;
954         prog_data->uses_implicit_point_line_varyings =
955                 c->uses_implicit_point_line_varyings;
956         prog_data->lock_scoreboard_on_first_thrsw =
957                 c->lock_scoreboard_on_first_thrsw;
958         prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
959         prog_data->uses_pid = c->fs_uses_primitive_id;
960 }
961 
962 static void
963 v3d_cs_set_prog_data(struct v3d_compile *c,
964                      struct v3d_compute_prog_data *prog_data)
965 {
966         prog_data->shared_size = c->s->info.shared_size;
967 
968         prog_data->local_size[0] = c->s->info.workgroup_size[0];
969         prog_data->local_size[1] = c->s->info.workgroup_size[1];
970         prog_data->local_size[2] = c->s->info.workgroup_size[2];
971 
972         prog_data->has_subgroups = c->has_subgroups;
973 }
974 
975 static void
976 v3d_set_prog_data(struct v3d_compile *c,
977                   struct v3d_prog_data *prog_data)
978 {
979         prog_data->threads = c->threads;
980         prog_data->single_seg = !c->last_thrsw;
981         prog_data->spill_size = c->spill_size;
982         prog_data->tmu_spills = c->spills;
983         prog_data->tmu_fills = c->fills;
984         prog_data->tmu_count = c->tmu.total_count;
985         prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
986         prog_data->compile_strategy_idx = c->compile_strategy_idx;
987         prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
988         prog_data->has_control_barrier = c->s->info.uses_control_barrier;
989         prog_data->has_global_address = c->has_global_address;
990 
991         v3d_set_prog_data_uniforms(c, prog_data);
992 
993         switch (c->s->info.stage) {
994         case MESA_SHADER_VERTEX:
995                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
996                 break;
997         case MESA_SHADER_GEOMETRY:
998                 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
999                 break;
1000         case MESA_SHADER_FRAGMENT:
1001                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
1002                 break;
1003         case MESA_SHADER_COMPUTE:
1004                 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
1005                 break;
1006         default:
1007                 unreachable("unsupported shader stage");
1008         }
1009 }
1010 
1011 static uint64_t *
1012 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
1013 {
1014         *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
1015 
1016         uint64_t *qpu_insts = malloc(*final_assembly_size);
1017         if (!qpu_insts)
1018                 return NULL;
1019 
1020         memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
1021 
1022         vir_compile_destroy(c);
1023 
1024         return qpu_insts;
1025 }
1026 
1027 static void
1028 v3d_nir_lower_vs_early(struct v3d_compile *c)
1029 {
1030         /* Split our I/O vars and dead code eliminate the unused
1031          * components.
1032          */
1033         NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1034                  nir_var_shader_in | nir_var_shader_out);
1035         uint64_t used_outputs[4] = {0};
1036         for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
1037                 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
1038                 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
1039                 used_outputs[comp] |= 1ull << slot;
1040         }
1041         NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1042                  nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1043         NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1044         v3d_optimize_nir(c, c->s);
1045         NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1046 
1047         /* This must go before nir_lower_io */
1048         if (c->vs_key->per_vertex_point_size)
1049                 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1050 
1051         NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1052                  type_size_vec4,
1053                  (nir_lower_io_options)0);
1054         /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1055          * on the code it generated.
1056          */
1057         NIR_PASS(_, c->s, nir_opt_dce);
1058         NIR_PASS(_, c->s, nir_opt_constant_folding);
1059 }
1060 
1061 static void
1062 v3d_nir_lower_gs_early(struct v3d_compile *c)
1063 {
1064         /* Split our I/O vars and dead code eliminate the unused
1065          * components.
1066          */
1067         NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1068                  nir_var_shader_in | nir_var_shader_out);
1069         uint64_t used_outputs[4] = {0};
1070         for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
1071                 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
1072                 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
1073                 used_outputs[comp] |= 1ull << slot;
1074         }
1075         NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1076                  nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1077         NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1078         v3d_optimize_nir(c, c->s);
1079         NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1080 
1081         /* This must go before nir_lower_io */
1082         if (c->gs_key->per_vertex_point_size)
1083                 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1084 
1085         NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1086                  type_size_vec4,
1087                  (nir_lower_io_options)0);
1088         /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1089          * on the code it generated.
1090          */
1091         NIR_PASS(_, c->s, nir_opt_dce);
1092         NIR_PASS(_, c->s, nir_opt_constant_folding);
1093 }
1094 
1095 static void
1096 v3d_fixup_fs_output_types(struct v3d_compile *c)
1097 {
1098         nir_foreach_shader_out_variable(var, c->s) {
1099                 uint32_t mask = 0;
1100 
1101                 switch (var->data.location) {
1102                 case FRAG_RESULT_COLOR:
1103                         mask = ~0;
1104                         break;
1105                 case FRAG_RESULT_DATA0:
1106                 case FRAG_RESULT_DATA1:
1107                 case FRAG_RESULT_DATA2:
1108                 case FRAG_RESULT_DATA3:
1109                         mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
1110                         break;
1111                 }
1112 
1113                 if (c->fs_key->int_color_rb & mask) {
1114                         var->type =
1115                                 glsl_vector_type(GLSL_TYPE_INT,
1116                                                  glsl_get_components(var->type));
1117                 } else if (c->fs_key->uint_color_rb & mask) {
1118                         var->type =
1119                                 glsl_vector_type(GLSL_TYPE_UINT,
1120                                                  glsl_get_components(var->type));
1121                 }
1122         }
1123 }
1124 
1125 static void
1126 v3d_nir_lower_fs_early(struct v3d_compile *c)
1127 {
1128         if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
1129                 v3d_fixup_fs_output_types(c);
1130 
1131         NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);
1132 
1133         if (c->fs_key->line_smoothing) {
1134                 NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
1135                 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1136                 /* The lowering pass can introduce new sysval reads */
1137                 nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
1138         }
1139 }
1140 
1141 static void
1142 v3d_nir_lower_gs_late(struct v3d_compile *c)
1143 {
1144         if (c->key->ucp_enables) {
1145                 NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
1146                          true, NULL);
1147         }
1148 
1149         /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
1150         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1151 }
1152 
1153 static void
1154 v3d_nir_lower_vs_late(struct v3d_compile *c)
1155 {
1156         if (c->key->ucp_enables) {
1157                 NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
1158                          false, true, NULL);
1159                 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1160                            nir_var_shader_out, NULL, NULL);
1161         }
1162 
1163         /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1164         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1165 }
1166 
1167 static void
1168 v3d_nir_lower_fs_late(struct v3d_compile *c)
1169 {
1170         /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
1171          * Vulkan allows it, in which case the SPIR-V compiler will declare
1172          * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
1173          * the last parameter to always operate with a compact array in both
1174          * OpenGL and Vulkan so we don't have to care about the API we
1175          * are using.
1176          */
1177         if (c->key->ucp_enables)
1178                 NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true, false);
1179 
1180         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
1181 }
1182 
1183 static uint32_t
1184 vir_get_max_temps(struct v3d_compile *c)
1185 {
1186         int max_ip = 0;
1187         vir_for_each_inst_inorder(inst, c)
1188                 max_ip++;
1189 
1190         uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1191 
1192         for (int t = 0; t < c->num_temps; t++) {
1193                 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1194                                                 i < max_ip); i++) {
1195                         if (i > max_ip)
1196                                 break;
1197                         pressure[i]++;
1198                 }
1199         }
1200 
1201         uint32_t max_temps = 0;
1202         for (int i = 0; i < max_ip; i++)
1203                 max_temps = MAX2(max_temps, pressure[i]);
1204 
1205         ralloc_free(pressure);
1206 
1207         return max_temps;
1208 }
1209 
1210 enum v3d_dependency_class {
1211         V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
1212 };
1213 
1214 static bool
1215 v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
1216                             nir_schedule_dependency *dep,
1217                             void *user_data)
1218 {
1219         struct v3d_compile *c = user_data;
1220 
1221         switch (intr->intrinsic) {
1222         case nir_intrinsic_store_output:
1223                 /* Writing to location 0 overwrites the value passed in for
1224                  * gl_PrimitiveID on geometry shaders
1225                  */
1226                 if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
1227                     nir_intrinsic_base(intr) != 0)
1228                         break;
1229 
1230                 nir_const_value *const_value =
1231                         nir_src_as_const_value(intr->src[1]);
1232 
1233                 if (const_value == NULL)
1234                         break;
1235 
1236                 uint64_t offset =
1237                         nir_const_value_as_uint(*const_value,
1238                                                 nir_src_bit_size(intr->src[1]));
1239                 if (offset != 0)
1240                         break;
1241 
1242                 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1243                 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
1244                 return true;
1245 
1246         case nir_intrinsic_load_primitive_id:
1247                 if (c->s->info.stage != MESA_SHADER_GEOMETRY)
1248                         break;
1249 
1250                 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1251                 dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
1252                 return true;
1253 
1254         default:
1255                 break;
1256         }
1257 
1258         return false;
1259 }
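/* Editor's note (hedged): the effect of the dependency class above is that
 * nir_schedule keeps geometry-shader reads of gl_PrimitiveIdIn (the
 * load_primitive_id intrinsic) and any store_output that writes the first
 * VPM word ordered with respect to each other, so the incoming primitive ID
 * cannot be clobbered before the shader has read it.
 */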
1260 
1261 static unsigned
1262 v3d_instr_delay_cb(nir_instr *instr, void *data)
1263 {
1264    struct v3d_compile *c = (struct v3d_compile *) data;
1265 
1266    switch (instr->type) {
1267    case nir_instr_type_undef:
1268    case nir_instr_type_load_const:
1269    case nir_instr_type_alu:
1270    case nir_instr_type_deref:
1271    case nir_instr_type_jump:
1272    case nir_instr_type_parallel_copy:
1273    case nir_instr_type_call:
1274    case nir_instr_type_phi:
1275       return 1;
1276 
1277    /* We should not use very large delays for TMU instructions. Typically,
1278     * thread switches will be sufficient to hide all or most of the latency,
1279     * so we usually only need a little bit of extra room. If we over-estimate
1280     * the latency here we may end up unnecessarily delaying the critical path in
1281     * the shader, which would have a negative effect on performance, so here
1282     * we are trying to strike a balance based on empirical testing.
1283     */
1284    case nir_instr_type_intrinsic: {
1285       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1286       if (!c->disable_general_tmu_sched) {
1287          switch (intr->intrinsic) {
1288          case nir_intrinsic_decl_reg:
1289          case nir_intrinsic_load_reg:
1290          case nir_intrinsic_store_reg:
1291             return 0;
1292          case nir_intrinsic_load_ssbo:
1293          case nir_intrinsic_load_scratch:
1294          case nir_intrinsic_load_shared:
1295          case nir_intrinsic_image_load:
1296             return 3;
1297          case nir_intrinsic_load_ubo:
1298             if (nir_src_is_divergent(&intr->src[1]))
1299                return 3;
1300             FALLTHROUGH;
1301          default:
1302             return 1;
1303          }
1304       } else {
1305          switch (intr->intrinsic) {
1306          case nir_intrinsic_decl_reg:
1307          case nir_intrinsic_load_reg:
1308          case nir_intrinsic_store_reg:
1309             return 0;
1310          default:
1311             return 1;
1312          }
1313       }
1314       break;
1315    }
1316 
1317    case nir_instr_type_tex:
1318       return 5;
1319 
1320    case nir_instr_type_debug_info:
1321       return 0;
1322    }
1323 
1324    return 0;
1325 }
1326 
1327 static bool
1328 should_split_wrmask(const nir_instr *instr, const void *data)
1329 {
1330         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1331         switch (intr->intrinsic) {
1332         case nir_intrinsic_store_ssbo:
1333         case nir_intrinsic_store_shared:
1334         case nir_intrinsic_store_global:
1335         case nir_intrinsic_store_scratch:
1336                 return true;
1337         default:
1338                 return false;
1339         }
1340 }
1341 
1342 static nir_intrinsic_instr *
1343 nir_instr_as_constant_ubo_load(nir_instr *inst)
1344 {
1345         if (inst->type != nir_instr_type_intrinsic)
1346                 return NULL;
1347 
1348         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1349         if (intr->intrinsic != nir_intrinsic_load_ubo)
1350                 return NULL;
1351 
1352         assert(nir_src_is_const(intr->src[0]));
1353         if (!nir_src_is_const(intr->src[1]))
1354                 return NULL;
1355 
1356         return intr;
1357 }
1358 
1359 static bool
1360 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1361 {
1362         bool progress = false;
1363 
1364         nir_instr *ref_inst = &ref->instr;
1365         uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1366         uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1367 
1368         /* Go through all instructions after ref searching for constant UBO
1369          * loads for the same UBO index.
1370          */
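        /* Editor's illustration (hedged, offsets are only an example): if the
         * block loads offsets 0, 32 and 8 from the same UBO index, the loads
         * at 8 and 32 are moved next to the reference load at offset 0 and
         * kept roughly in offset order, so a single unifa write can cover the
         * whole sequence with cheap skips instead of one unifa per load.
         */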
1371         bool seq_break = false;
1372         nir_instr *inst = &ref->instr;
1373         nir_instr *next_inst = NULL;
1374         while (true) {
1375                 inst = next_inst ? next_inst : nir_instr_next(inst);
1376                 if (!inst)
1377                         break;
1378 
1379                 next_inst = NULL;
1380 
1381                 if (inst->type != nir_instr_type_intrinsic)
1382                         continue;
1383 
1384                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1385                 if (intr->intrinsic != nir_intrinsic_load_ubo)
1386                         continue;
1387 
1388                 /* We only produce unifa sequences for non-divergent loads */
1389                 if (nir_src_is_divergent(&intr->src[1]))
1390                         continue;
1391 
1392                 /* If there are any UBO loads that are not constant or that
1393                  * use a different UBO index in between the reference load and
1394                  * any other constant load for the same index, they would break
1395                  * the unifa sequence. We flag that case so we can then move
1396                  * all constant UBO loads for the reference index before these
1397                  * breakers, not just the out-of-order ones, which avoids
1398                  * breaking the sequence and reduces unifa writes.
1399                  */
1400                 if (!nir_src_is_const(intr->src[1])) {
1401                         seq_break = true;
1402                         continue;
1403                 }
1404                 uint32_t offset = nir_src_as_uint(intr->src[1]);
1405 
1406                 assert(nir_src_is_const(intr->src[0]));
1407                 uint32_t index = nir_src_as_uint(intr->src[0]);
1408                 if (index != ref_index) {
1409                        seq_break = true;
1410                        continue;
1411                 }
1412 
1413                 /* Only move loads with an offset that is close enough to the
1414                  * reference offset, since otherwise we would not be able to
1415                  * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1416                  */
1417                 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
1418                         continue;
1419 
1420                 /* We will move this load if its offset is smaller than ref's
1421                  * (in which case we will move it before ref) or if the offset
1422                  * is larger than ref's but there are sequence breakers
1423                  * in between (in which case we will move it after ref and
1424                  * before the sequence breakers).
1425                  */
1426                 if (!seq_break && offset >= ref_offset)
1427                         continue;
1428 
1429                 /* Find where exactly we want to move this load:
1430                  *
1431                  * If we are moving it before ref, we want to check any other
1432                  * UBO loads we placed before ref and make sure we insert this
1433                  * one properly ordered with them. Likewise, if we are moving
1434                  * it after ref.
1435                  */
1436                 nir_instr *pos = ref_inst;
1437                 nir_instr *tmp = pos;
1438                 do {
1439                         if (offset < ref_offset)
1440                                 tmp = nir_instr_prev(tmp);
1441                         else
1442                                 tmp = nir_instr_next(tmp);
1443 
1444                         if (!tmp || tmp == inst)
1445                                 break;
1446 
1447                         /* Ignore non-unifa UBO loads */
1448                         if (tmp->type != nir_instr_type_intrinsic)
1449                                 continue;
1450 
1451                         nir_intrinsic_instr *tmp_intr =
1452                                 nir_instr_as_intrinsic(tmp);
1453                         if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1454                                 continue;
1455 
1456                         if (nir_src_is_divergent(&tmp_intr->src[1]))
1457                                 continue;
1458 
1459                         /* Stop if we find a unifa UBO load that breaks the
1460                          * sequence.
1461                          */
1462                         if (!nir_src_is_const(tmp_intr->src[1]))
1463                                 break;
1464 
1465                         if (nir_src_as_uint(tmp_intr->src[0]) != index)
1466                                 break;
1467 
1468                         uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1469                         if (offset < ref_offset) {
1470                                 if (tmp_offset < offset ||
1471                                     tmp_offset >= ref_offset) {
1472                                         break;
1473                                 } else {
1474                                         pos = tmp;
1475                                 }
1476                         } else {
1477                                 if (tmp_offset > offset ||
1478                                     tmp_offset <= ref_offset) {
1479                                         break;
1480                                 } else {
1481                                         pos = tmp;
1482                                 }
1483                         }
1484                 } while (true);
1485 
1486                 /* We can't move the UBO load before the instruction that
1487                  * defines its constant offset. If that instruction is placed
1488                  * in between the new location (pos) and the current location
1489                  * of this load, we will have to move that instruction too.
1490                  *
1491                  * We don't care about the UBO index definition because that
1492                  * is optimized to be reused by all UBO loads for the same
1493                  * index and therefore is certain to be defined before the
1494                  * first UBO load that uses it.
1495                  */
1496                 nir_instr *offset_inst = NULL;
1497                 tmp = inst;
1498                 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1499                         if (pos == tmp) {
1500                                 /* We reached the target location without
1501                                  * finding the instruction that defines the
1502                                  * offset, so that instruction must be before
1503                                  * the new position and we don't have to fix it.
1504                                  */
1505                                 break;
1506                         }
1507                         if (intr->src[1].ssa->parent_instr == tmp) {
1508                                 offset_inst = tmp;
1509                                 break;
1510                         }
1511                 }
1512 
1513                 if (offset_inst) {
1514                         exec_node_remove(&offset_inst->node);
1515                         exec_node_insert_node_before(&pos->node,
1516                                                      &offset_inst->node);
1517                 }
1518 
1519                 /* Since we are moving the instruction before its current
1520                  * location, grab its successor before the move so that
1521                  * we can continue the next iteration of the main loop from
1522                  * that instruction.
1523                  */
1524                 next_inst = nir_instr_next(inst);
1525 
1526                 /* Move this load to the selected location */
1527                 exec_node_remove(&inst->node);
1528                 if (offset < ref_offset)
1529                         exec_node_insert_node_before(&pos->node, &inst->node);
1530                 else
1531                         exec_node_insert_after(&pos->node, &inst->node);
1532 
1533                 progress = true;
1534         }
1535 
1536         return progress;
1537 }
1538 
1539 static bool
1540 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1541                                       nir_block *block)
1542 {
1543         bool progress = false;
1544         bool local_progress;
1545         do {
1546                 local_progress = false;
1547                 nir_foreach_instr_safe(inst, block) {
1548                         nir_intrinsic_instr *intr =
1549                                 nir_instr_as_constant_ubo_load(inst);
1550                         if (intr) {
1551                                 local_progress |=
1552                                         v3d_nir_sort_constant_ubo_load(block, intr);
1553                         }
1554                 }
1555                 progress |= local_progress;
1556         } while (local_progress);
1557 
1558         return progress;
1559 }
1560 
1561 /**
1562  * Sorts constant UBO loads in each block by offset to maximize chances of
1563  * skipping unifa writes when converting to VIR. This can increase register
1564  * pressure.
1565  */
1566 static bool
1567 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1568 {
1569         nir_foreach_function_impl(impl, s) {
1570                 nir_foreach_block(block, impl) {
1571                         c->sorted_any_ubo_loads |=
1572                                 v3d_nir_sort_constant_ubo_loads_block(c, block);
1573                 }
1574                 nir_metadata_preserve(impl,
1575                                       nir_metadata_control_flow);
1576         }
1577         return c->sorted_any_ubo_loads;
1578 }
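
/* Illustrative example (hypothetical shader, not taken from a real trace):
 * with three constant loads from the same UBO index at offsets 8, 0 and 4
 * (all within MAX_UNIFA_SKIP_DISTANCE of the first), the load at offset 8
 * acts as the reference and the loads at offsets 0 and 4 are moved before
 * it, leaving the block ordered 0, 4, 8 so ntq_emit_load_ubo_unifa can skip
 * the unifa writes for the second and third load.
 */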
1579 
1580 static void
1581 lower_load_num_subgroups(struct v3d_compile *c,
1582                          nir_builder *b,
1583                          nir_intrinsic_instr *intr)
1584 {
1585         assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1586         assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1587 
1588         b->cursor = nir_after_instr(&intr->instr);
1589         uint32_t num_subgroups =
1590                 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1591                              c->s->info.workgroup_size[1] *
1592                              c->s->info.workgroup_size[2], V3D_CHANNELS);
1593         nir_def *result = nir_imm_int(b, num_subgroups);
1594         nir_def_replace(&intr->def, result);
1595 }
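
/* Illustrative arithmetic (assuming a hypothetical 64x1x1 workgroup and
 * V3D_CHANNELS == 16): num_subgroups = DIV_ROUND_UP(64 * 1 * 1, 16) == 4,
 * so load_num_subgroups is folded to the constant 4 at compile time.
 */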
1596 
1597 static bool
1598 lower_subgroup_intrinsics(struct v3d_compile *c,
1599                           nir_block *block, nir_builder *b)
1600 {
1601         bool progress = false;
1602         nir_foreach_instr_safe(inst, block) {
1603                 if (inst->type != nir_instr_type_intrinsic)
1604                         continue;
1605 
1606                 nir_intrinsic_instr *intr =
1607                         nir_instr_as_intrinsic(inst);
1608                 if (!intr)
1609                         continue;
1610 
1611                 switch (intr->intrinsic) {
1612                 case nir_intrinsic_load_num_subgroups:
1613                         lower_load_num_subgroups(c, b, intr);
1614                         progress = true;
1615                         FALLTHROUGH;
1616                 case nir_intrinsic_load_subgroup_id:
1617                 case nir_intrinsic_load_subgroup_size:
1618                 case nir_intrinsic_load_subgroup_invocation:
1619                 case nir_intrinsic_elect:
1620                 case nir_intrinsic_ballot:
1621                 case nir_intrinsic_inverse_ballot:
1622                 case nir_intrinsic_ballot_bitfield_extract:
1623                 case nir_intrinsic_ballot_bit_count_reduce:
1624                 case nir_intrinsic_ballot_find_lsb:
1625                 case nir_intrinsic_ballot_find_msb:
1626                 case nir_intrinsic_ballot_bit_count_exclusive:
1627                 case nir_intrinsic_ballot_bit_count_inclusive:
1628                 case nir_intrinsic_reduce:
1629                 case nir_intrinsic_inclusive_scan:
1630                 case nir_intrinsic_exclusive_scan:
1631                 case nir_intrinsic_read_invocation:
1632                 case nir_intrinsic_read_first_invocation:
1633                 case nir_intrinsic_load_subgroup_eq_mask:
1634                 case nir_intrinsic_load_subgroup_ge_mask:
1635                 case nir_intrinsic_load_subgroup_gt_mask:
1636                 case nir_intrinsic_load_subgroup_le_mask:
1637                 case nir_intrinsic_load_subgroup_lt_mask:
1638                 case nir_intrinsic_shuffle:
1639                 case nir_intrinsic_shuffle_xor:
1640                 case nir_intrinsic_shuffle_up:
1641                 case nir_intrinsic_shuffle_down:
1642                 case nir_intrinsic_vote_all:
1643                 case nir_intrinsic_vote_any:
1644                 case nir_intrinsic_vote_feq:
1645                 case nir_intrinsic_vote_ieq:
1646                 case nir_intrinsic_quad_broadcast:
1647                 case nir_intrinsic_quad_swap_horizontal:
1648                 case nir_intrinsic_quad_swap_vertical:
1649                 case nir_intrinsic_quad_swap_diagonal:
1650                         c->has_subgroups = true;
1651                         break;
1652                 default:
1653                         break;
1654                 }
1655         }
1656 
1657         return progress;
1658 }
1659 
1660 static bool
1661 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1662 {
1663         bool progress = false;
1664         nir_foreach_function_impl(impl, s) {
1665                 nir_builder b = nir_builder_create(impl);
1666 
1667                 nir_foreach_block(block, impl)
1668                         progress |= lower_subgroup_intrinsics(c, block, &b);
1669 
1670                 nir_metadata_preserve(impl,
1671                                       nir_metadata_control_flow);
1672         }
1673         return progress;
1674 }
1675 
1676 static bool
1677 should_lower_robustness(const nir_intrinsic_instr *intr, const void *data)
1678 {
1679         const struct v3d_key *key = data;
1680 
1681         switch (intr->intrinsic) {
1682         case nir_intrinsic_load_ubo:
1683                 return key->robust_uniform_access;
1684 
1685         case nir_intrinsic_load_ssbo:
1686         case nir_intrinsic_store_ssbo:
1687         case nir_intrinsic_ssbo_atomic:
1688         case nir_intrinsic_ssbo_atomic_swap:
1689                 return key->robust_storage_access;
1690 
1691         case nir_intrinsic_image_load:
1692         case nir_intrinsic_image_store:
1693         case nir_intrinsic_image_atomic:
1694         case nir_intrinsic_image_atomic_swap:
1695                 return key->robust_image_access;
1696 
1697         default:
1698                 return false;
1699         }
1700 }
1701 
1702 static void
1703 v3d_attempt_compile(struct v3d_compile *c)
1704 {
1705         switch (c->s->info.stage) {
1706         case MESA_SHADER_VERTEX:
1707                 c->vs_key = (struct v3d_vs_key *) c->key;
1708                 break;
1709         case MESA_SHADER_GEOMETRY:
1710                 c->gs_key = (struct v3d_gs_key *) c->key;
1711                 break;
1712         case MESA_SHADER_FRAGMENT:
1713                 c->fs_key = (struct v3d_fs_key *) c->key;
1714                 break;
1715         case MESA_SHADER_COMPUTE:
1716                 break;
1717         default:
1718                 unreachable("unsupported shader stage");
1719         }
1720 
1721         switch (c->s->info.stage) {
1722         case MESA_SHADER_VERTEX:
1723                 v3d_nir_lower_vs_early(c);
1724                 break;
1725         case MESA_SHADER_GEOMETRY:
1726                 v3d_nir_lower_gs_early(c);
1727                 break;
1728         case MESA_SHADER_FRAGMENT:
1729                 v3d_nir_lower_fs_early(c);
1730                 break;
1731         default:
1732                 break;
1733         }
1734 
1735         v3d_lower_nir(c);
1736 
1737         switch (c->s->info.stage) {
1738         case MESA_SHADER_VERTEX:
1739                 v3d_nir_lower_vs_late(c);
1740                 break;
1741         case MESA_SHADER_GEOMETRY:
1742                 v3d_nir_lower_gs_late(c);
1743                 break;
1744         case MESA_SHADER_FRAGMENT:
1745                 v3d_nir_lower_fs_late(c);
1746                 break;
1747         default:
1748                 break;
1749         }
1750 
1751         NIR_PASS(_, c->s, v3d_nir_lower_io, c);
1752         NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
1753         NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
1754 
1755         NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
1756         nir_lower_idiv_options idiv_options = {
1757                 .allow_fp16 = true,
1758         };
1759         NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
1760         NIR_PASS(_, c->s, nir_lower_alu);
1761 
1762         if (c->key->robust_uniform_access || c->key->robust_storage_access ||
1763             c->key->robust_image_access) {
1764                 /* nir_lower_robust_access assumes constant buffer
1765                  * indices on ubo/ssbo intrinsics, so run the copy propagation
1766                  * and constant folding passes before the lowering to ensure
1767                  * this. We also want to run the lowering before v3d_optimize to
1768                  * clean up redundant get_buffer_size calls produced in the pass.
1769                  */
1770                 NIR_PASS(_, c->s, nir_copy_prop);
1771                 NIR_PASS(_, c->s, nir_opt_constant_folding);
1772 
1773                 NIR_PASS(_, c->s, nir_lower_robust_access,
1774                          should_lower_robustness, c->key);
1775         }
1776 
1777         NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
1778                  nir_var_function_temp,
1779                  0,
1780                  glsl_get_natural_size_align_bytes,
1781                  glsl_get_natural_size_align_bytes);
1782 
1783         NIR_PASS(_, c->s, v3d_nir_lower_global_2x32);
1784         NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1785         NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
1786         NIR_PASS(_, c->s, v3d_nir_lower_scratch);
1787 
1788         /* needs to run after load_store_bitsize */
1789         NIR_PASS(_, c->s, nir_lower_pack);
1790 
1791         NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
1792 
1793         const nir_lower_subgroups_options subgroup_opts = {
1794                 .subgroup_size = V3D_CHANNELS,
1795                 .ballot_components = 1,
1796                 .ballot_bit_size = 32,
1797                 .lower_to_scalar = true,
1798                 .lower_inverse_ballot = true,
1799                 .lower_subgroup_masks = true,
1800                 .lower_relative_shuffle = true,
1801                 .lower_quad = true,
1802         };
1803         NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
1804 
1805         v3d_optimize_nir(c, c->s);
1806 
1807         /* Do late algebraic optimization to turn add(a, neg(b)) back into
1808          * subs, then the mandatory cleanup after algebraic.  Note that it may
1809          * produce fnegs, and if so then we need to keep running to squash
1810          * fneg(fneg(a)).
1811          */
1812         bool more_late_algebraic = true;
1813         while (more_late_algebraic) {
1814                 more_late_algebraic = false;
1815                 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
1816                 NIR_PASS(_, c->s, nir_opt_constant_folding);
1817                 NIR_PASS(_, c->s, nir_copy_prop);
1818                 NIR_PASS(_, c->s, nir_opt_dce);
1819                 NIR_PASS(_, c->s, nir_opt_cse);
1820         }
1821 
1822         NIR_PASS(_, c->s, nir_lower_bool_to_int32);
1823         NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
1824         NIR_PASS_V(c->s, nir_divergence_analysis);
1825         NIR_PASS(_, c->s, nir_convert_from_ssa, true);
1826 
1827         struct nir_schedule_options schedule_options = {
1828                 /* Schedule for about half our register space, to enable more
1829                  * shaders to hit 4 threads.
1830                  */
1831                 .threshold = c->threads == 4 ? 24 : 48,
1832 
1833                 /* Vertex shaders share the same memory for inputs and outputs,
1834                  * fragment and geometry shaders do not.
1835                  */
1836                 .stages_with_shared_io_memory =
1837                 (((1 << MESA_ALL_SHADER_STAGES) - 1) &
1838                  ~((1 << MESA_SHADER_FRAGMENT) |
1839                    (1 << MESA_SHADER_GEOMETRY))),
1840 
1841                 .fallback = c->fallback_scheduler,
1842 
1843                 .intrinsic_cb = v3d_intrinsic_dependency_cb,
1844                 .intrinsic_cb_data = c,
1845 
1846                 .instr_delay_cb = v3d_instr_delay_cb,
1847                 .instr_delay_cb_data = c,
1848         };
1849         NIR_PASS_V(c->s, nir_schedule, &schedule_options);
1850 
1851         if (!c->disable_constant_ubo_load_sorting)
1852                 NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
1853 
1854         const nir_move_options buffer_opts = c->move_buffer_loads ?
1855                 (nir_move_load_ubo | nir_move_load_ssbo) : 0;
1856         NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
1857                                         nir_move_const_undef |
1858                                         buffer_opts);
1859 
1860         NIR_PASS_V(c->s, nir_trivialize_registers);
1861 
1862         v3d_nir_to_vir(c);
1863 }
1864 
1865 uint32_t
1866 v3d_prog_data_size(gl_shader_stage stage)
1867 {
1868         static const int prog_data_size[] = {
1869                 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
1870                 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
1871                 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
1872                 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
1873         };
1874 
1875         assert(stage >= 0 &&
1876                stage < ARRAY_SIZE(prog_data_size) &&
1877                prog_data_size[stage]);
1878 
1879         return prog_data_size[stage];
1880 }
1881 
1882 int v3d_shaderdb_dump(struct v3d_compile *c,
1883 		      char **shaderdb_str)
1884 {
1885         if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
1886                 return -1;
1887 
1888         return asprintf(shaderdb_str,
1889                         "%s shader: %d inst, %d threads, %d loops, "
1890                         "%d uniforms, %d max-temps, %d:%d spills:fills, "
1891                         "%d sfu-stalls, %d inst-and-stalls, %d nops",
1892                         vir_get_stage_name(c),
1893                         c->qpu_inst_count,
1894                         c->threads,
1895                         c->loops,
1896                         c->num_uniforms,
1897                         vir_get_max_temps(c),
1898                         c->spills,
1899                         c->fills,
1900                         c->qpu_inst_stalled_count,
1901                         c->qpu_inst_count + c->qpu_inst_stalled_count,
1902                         c->nop_count);
1903 }
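
/* Illustrative output (hypothetical numbers):
 *
 *   MESA_SHADER_FRAGMENT shader: 120 inst, 4 threads, 0 loops, 12 uniforms,
 *   18 max-temps, 0:0 spills:fills, 3 sfu-stalls, 123 inst-and-stalls, 5 nops
 */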
1904 
1905 /* This is a list of incremental changes to the compilation strategy
1906  * that will be used to try to compile the shader successfully. The
1907  * default strategy enables all optimizations, which gives the
1908  * highest register pressure but is expected to produce the best
1909  * code. Subsequent strategies incrementally disable specific
1910  * optimizations that are known to contribute to register pressure
1911  * so that the shader can be compiled successfully while meeting
1912  * thread count requirements.
1913  *
1914  * V3D 4.1+ has a min thread count of 2, but we can use 1 here to
1915  * cover previous hardware as well (meaning that we are not limiting
1916  * register allocation to any particular thread count). This is fine
1917  * because v3d_nir_to_vir will cap this to the actual minimum.
1918  */
1919 static const struct v3d_compiler_strategy strategies[] = {
1920         /*0*/  { "default",                        4, 4, false, false, false, false, false, false,  0 },
1921         /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false, false, false,  0 },
1922         /*2*/  { "disable gcm",                    4, 4, true,  true,  false, false, false, false,  0 },
1923         /*3*/  { "disable loop unrolling",         4, 4, true,  true,  true,  false, false, false,  0 },
1924         /*4*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  true,  false, false,  0 },
1925         /*5*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,  false, true,   0 },
1926         /*6*/  { "lower thread count",             2, 1, false, false, false, false, false, false, -1 },
1927         /*7*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, false, false, -1 },
1928         /*8*/  { "disable gcm (2t)",               2, 1, true,  true,  false, false, false, false, -1 },
1929         /*9*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  true,  false, false, false, -1 },
1930         /*10*/ { "Move buffer loads (2t)",         2, 1, true,  true,  true,  true,  true,  false, -1 },
1931         /*11*/ { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  true,  true,  -1 },
1932         /*12*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  true,  true,  -1 }
1933 };
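
/* Reading the table (inferred here from the strategy names and from the
 * strategies[idx].name/min_threads/max_tmu_spills uses below, not quoted
 * from the struct definition): the two integers are the max/min thread
 * counts for register allocation, the booleans disable general TMU
 * scheduling, GCM, loop unrolling and UBO load sorting, enable moving
 * buffer loads and disable TMU pipelining, and the final integer caps TMU
 * spills (-1 meaning no cap).
 */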
1934 
1935 /**
1936  * If a particular optimization didn't make any progress during a compile
1937  * attempt, disabling it alone won't allow us to compile the shader successfully,
1938  * since we'll end up with the same code. Detect these scenarios so we can
1939  * avoid wasting time with useless compiles. We should also consider if the
1940  * strategy changes other aspects of the compilation process though, like
1941  * spilling, and not skip it in that case.
1942  */
1943 static bool
1944 skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
1945 {
1946    /* We decide if we can skip a strategy based on the optimizations that
1947     * were active in the previous strategy, so we should only be calling this
1948     * for strategies after the first.
1949     */
1950    assert(idx > 0);
1951 
1952    /* Don't skip a strategy that changes spilling behavior */
1953    if (strategies[idx].max_tmu_spills !=
1954        strategies[idx - 1].max_tmu_spills) {
1955            return false;
1956    }
1957 
1958    switch (idx) {
1959    /* General TMU sched.: skip if we didn't emit any TMU loads */
1960    case 1:
1961    case 7:
1962            return !c->has_general_tmu_load;
1963    /* Global code motion: skip if nir_opt_gcm didn't make any progress */
1964    case 2:
1965    case 8:
1966            return !c->gcm_progress;
1967    /* Loop unrolling: skip if we didn't unroll any loops */
1968    case 3:
1969    case 9:
1970            return !c->unrolled_any_loops;
1971    /* UBO load sorting: skip if we didn't sort any loads */
1972    case 4:
1973            return !c->sorted_any_ubo_loads;
1974    /* Move buffer loads: we assume any shader with difficult RA
1975     * most likely has UBO / SSBO loads so we never try to skip.
1976     * For now, we only try this for 2-thread compiles since it
1977     * is expected to impact instruction counts and latency.
1978     */
1979    case 10:
1980           assert(c->threads < 4);
1981           return false;
1982    /* TMU pipelining: skip if we didn't pipeline any TMU ops */
1983    case 5:
1984    case 11:
1985            return !c->pipelined_any_tmu;
1986    /* Lower thread count: skip if we already tried fewer than 4 threads */
1987    case 6:
1988           return c->threads < 4;
1989    default:
1990            return false;
1991    };
1992 }
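
/* Example walk (hypothetical shader): if the default strategy (idx 0) fails
 * register allocation and the shader emitted no general TMU loads, idx 1 is
 * skipped and compilation retries with idx 2 ("disable gcm"). Idx 6 is never
 * skipped on this path because it changes max_tmu_spills relative to idx 5.
 */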
1993 
1994 static inline void
1995 set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
1996 {
1997    if (*best)
1998       vir_compile_destroy(*best);
1999    *best = c;
2000 }
2001 
2002 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
2003                       struct v3d_key *key,
2004                       struct v3d_prog_data **out_prog_data,
2005                       nir_shader *s,
2006                       void (*debug_output)(const char *msg,
2007                                            void *debug_output_data),
2008                       void *debug_output_data,
2009                       int program_id, int variant_id,
2010                       uint32_t *final_assembly_size)
2011 {
2012         struct v3d_compile *c = NULL;
2013         uint32_t best_spill_fill_count = UINT32_MAX;
2014         struct v3d_compile *best_c = NULL;
2015 
2016         MESA_TRACE_FUNC();
2017 
2018         for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
2019                 /* Fallback strategy */
2020                 if (strat > 0) {
2021                         assert(c);
2022                         if (skip_compile_strategy(c, strat))
2023                                 continue;
2024 
2025                         char *debug_msg;
2026                         int ret = asprintf(&debug_msg,
2027                                            "Falling back to strategy '%s' "
2028                                            "for %s prog %d/%d",
2029                                            strategies[strat].name,
2030                                            vir_get_stage_name(c),
2031                                            c->program_id, c->variant_id);
2032 
2033                         if (ret >= 0) {
2034                                 if (V3D_DBG(PERF))
2035                                         fprintf(stderr, "%s\n", debug_msg);
2036 
2037                                 c->debug_output(debug_msg, c->debug_output_data);
2038                                 free(debug_msg);
2039                         }
2040 
2041                         if (c != best_c)
2042                                 vir_compile_destroy(c);
2043                 }
2044 
2045                 c = vir_compile_init(compiler, key, s,
2046                                      debug_output, debug_output_data,
2047                                      program_id, variant_id,
2048                                      strat, &strategies[strat],
2049                                      strat == ARRAY_SIZE(strategies) - 1);
2050 
2051                 v3d_attempt_compile(c);
2052 
2053                 /* Broken shader or driver bug */
2054                 if (c->compilation_result == V3D_COMPILATION_FAILED)
2055                         break;
2056 
2057                 /* If we compiled without spills, choose this.
2058                  * Otherwise if this is a 4-thread compile, choose this (these
2059                  * have a very low cap on the allowed TMU spills so we assume
2060                  * it will be better than a 2-thread compile without spills).
2061                  * Otherwise, keep going while tracking the strategy with the
2062                  * lowest spill count.
2063                  */
2064                 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
2065                         if (c->spills == 0 ||
2066                             strategies[strat].min_threads == 4 ||
2067                             V3D_DBG(OPT_COMPILE_TIME)) {
2068                                 set_best_compile(&best_c, c);
2069                                 break;
2070                         } else if (c->spills + c->fills <
2071                                    best_spill_fill_count) {
2072                                 set_best_compile(&best_c, c);
2073                                 best_spill_fill_count = c->spills + c->fills;
2074                         }
2075 
2076                         if (V3D_DBG(PERF)) {
2077                                 char *debug_msg;
2078                                 int ret = asprintf(&debug_msg,
2079                                                    "Compiled %s prog %d/%d with %d "
2080                                                    "spills and %d fills. Will try "
2081                                                    "more strategies.",
2082                                                    vir_get_stage_name(c),
2083                                                    c->program_id, c->variant_id,
2084                                                    c->spills, c->fills);
2085                                 if (ret >= 0) {
2086                                         fprintf(stderr, "%s\n", debug_msg);
2087                                         c->debug_output(debug_msg, c->debug_output_data);
2088                                         free(debug_msg);
2089                                 }
2090                         }
2091                 }
2092 
2093                 /* Only try the next strategy if we failed to register allocate
2094                  * or we had to spill.
2095                  */
2096                 assert(c->compilation_result ==
2097                        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
2098                        c->spills > 0);
2099         }
2100 
2101         /* If the best strategy was not the last, choose that */
2102         if (best_c && c != best_c)
2103                 set_best_compile(&c, best_c);
2104 
2105         if (V3D_DBG(PERF) &&
2106             c->compilation_result !=
2107             V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
2108             c->spills > 0) {
2109                 char *debug_msg;
2110                 int ret = asprintf(&debug_msg,
2111                                    "Compiled %s prog %d/%d with %d "
2112                                    "spills and %d fills",
2113                                    vir_get_stage_name(c),
2114                                    c->program_id, c->variant_id,
2115                                    c->spills, c->fills);
2116                 fprintf(stderr, "%s\n", debug_msg);
2117 
2118                 if (ret >= 0) {
2119                         c->debug_output(debug_msg, c->debug_output_data);
2120                         free(debug_msg);
2121                 }
2122         }
2123 
2124         if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
2125                 fprintf(stderr, "Failed to compile %s prog %d/%d "
2126                         "with any strategy.\n",
2127                         vir_get_stage_name(c), c->program_id, c->variant_id);
2128 
2129                 vir_compile_destroy(c);
2130                 return NULL;
2131         }
2132 
2133         struct v3d_prog_data *prog_data;
2134 
2135         prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
2136 
2137         v3d_set_prog_data(c, prog_data);
2138 
2139         *out_prog_data = prog_data;
2140 
2141         char *shaderdb;
2142         int ret = v3d_shaderdb_dump(c, &shaderdb);
2143         if (ret >= 0) {
2144                 if (V3D_DBG(SHADERDB))
2145                         fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
2146 
2147                 c->debug_output(shaderdb, c->debug_output_data);
2148                 free(shaderdb);
2149         }
2150 
2151         return v3d_return_qpu_insts(c, final_assembly_size);
2152 }
2153 
2154 void
2155 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
2156 {
2157         if (qinst->dst.file == QFILE_TEMP)
2158                 c->defs[qinst->dst.index] = NULL;
2159 
2160         assert(&qinst->link != c->cursor.link);
2161 
2162         list_del(&qinst->link);
2163         free(qinst);
2164 
2165         c->live_intervals_valid = false;
2166 }
2167 
2168 struct qreg
2169 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
2170 {
2171         /* XXX
2172         int pack = reg.pack;
2173 
2174         while (reg.file == QFILE_TEMP &&
2175                c->defs[reg.index] &&
2176                (c->defs[reg.index]->op == QOP_MOV ||
2177                 c->defs[reg.index]->op == QOP_FMOV) &&
2178                !c->defs[reg.index]->dst.pack &&
2179                !c->defs[reg.index]->src[0].pack) {
2180                 reg = c->defs[reg.index]->src[0];
2181         }
2182 
2183         reg.pack = pack;
2184         */
2185         return reg;
2186 }
2187 
2188 void
2189 vir_compile_destroy(struct v3d_compile *c)
2190 {
2191         /* Defuse the assert that we aren't removing the cursor's instruction.
2192          */
2193         c->cursor.link = NULL;
2194 
2195         vir_for_each_block(block, c) {
2196                 while (!list_is_empty(&block->instructions)) {
2197                         struct qinst *qinst =
2198                                 list_first_entry(&block->instructions,
2199                                                  struct qinst, link);
2200                         vir_remove_instruction(c, qinst);
2201                 }
2202         }
2203 
2204         ralloc_free(c);
2205 }
2206 
2207 uint32_t
2208 vir_get_uniform_index(struct v3d_compile *c,
2209                       enum quniform_contents contents,
2210                       uint32_t data)
2211 {
2212         for (int i = 0; i < c->num_uniforms; i++) {
2213                 if (c->uniform_contents[i] == contents &&
2214                     c->uniform_data[i] == data) {
2215                         return i;
2216                 }
2217         }
2218 
2219         uint32_t uniform = c->num_uniforms++;
2220 
2221         if (uniform >= c->uniform_array_size) {
2222                 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
2223                                              c->uniform_array_size * 2);
2224 
2225                 c->uniform_data = reralloc(c, c->uniform_data,
2226                                            uint32_t,
2227                                            c->uniform_array_size);
2228                 c->uniform_contents = reralloc(c, c->uniform_contents,
2229                                                enum quniform_contents,
2230                                                c->uniform_array_size);
2231         }
2232 
2233         c->uniform_contents[uniform] = contents;
2234         c->uniform_data[uniform] = data;
2235 
2236         return uniform;
2237 }
2238 
2239 /* Looks back into the current block to find the ldunif that wrote the uniform
2240  * at the requested index. If it finds it, it returns true and writes the
2241  * destination register of the ldunif instruction to 'unif'.
2242  *
2243  * This can impact register pressure and end up leading to worse code, so we
2244  * limit the number of instructions we are willing to look back through to
2245  * strike a good balance.
2246  */
2247 static bool
2248 try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
2249 {
2250         uint32_t count = 20;
2251         struct qinst *prev_inst = NULL;
2252         assert(c->cur_block);
2253 
2254 #if MESA_DEBUG
2255         /* We can only reuse a uniform if it was emitted in the same block,
2256          * so callers must make sure the current instruction is being emitted
2257          * in the current block.
2258          */
2259         bool found = false;
2260         vir_for_each_inst(inst, c->cur_block) {
2261                 if (&inst->link == c->cursor.link) {
2262                         found = true;
2263                         break;
2264                 }
2265         }
2266 
2267         assert(found || &c->cur_block->instructions == c->cursor.link);
2268 #endif
2269 
2270         list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
2271                                      &c->cur_block->instructions, link) {
2272                 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
2273                     inst->uniform == index) {
2274                         prev_inst = inst;
2275                         break;
2276                 }
2277 
2278                 if (--count == 0)
2279                         break;
2280         }
2281 
2282         if (!prev_inst)
2283                 return false;
2284 
2285         /* Only reuse the ldunif result if it was written to a temp register,
2286          * otherwise there may be special restrictions (for example, ldunif
2287          * may write directly to unifa, which is a write-only register).
2288          */
2289         if (prev_inst->dst.file != QFILE_TEMP)
2290                 return false;
2291 
2292         list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
2293                                  &c->cur_block->instructions, link) {
2294                 if (inst->dst.file == prev_inst->dst.file &&
2295                     inst->dst.index == prev_inst->dst.index) {
2296                         return false;
2297                 }
2298         }
2299 
2300         *unif = prev_inst->dst;
2301         return true;
2302 }
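
/* Illustrative effect (hypothetical block): if an ldunif for uniform index 5
 * was emitted into a temp within the last ~20 instructions of the current
 * block and that temp has not been rewritten since, vir_uniform() for the
 * same index returns that temp instead of emitting a second ldunif.
 */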
2303 
2304 struct qreg
2305 vir_uniform(struct v3d_compile *c,
2306             enum quniform_contents contents,
2307             uint32_t data)
2308 {
2309         const int num_uniforms = c->num_uniforms;
2310         const int index = vir_get_uniform_index(c, contents, data);
2311 
2312         /* If this is not the first time we see this uniform, try to reuse the
2313          * result of the last ldunif that loaded it.
2314          */
2315         const bool is_new_uniform = num_uniforms != c->num_uniforms;
2316         if (!is_new_uniform && !c->disable_ldunif_opt) {
2317                 struct qreg ldunif_dst;
2318                 if (try_opt_ldunif(c, index, &ldunif_dst))
2319                         return ldunif_dst;
2320         }
2321 
2322         struct qinst *inst = vir_NOP(c);
2323         inst->qpu.sig.ldunif = true;
2324         inst->uniform = index;
2325         inst->dst = vir_get_temp(c);
2326         c->defs[inst->dst.index] = inst;
2327         return inst->dst;
2328 }
2329 
2330 #define OPTPASS(func)                                                   \
2331         do {                                                            \
2332                 bool stage_progress = func(c);                          \
2333                 if (stage_progress) {                                   \
2334                         progress = true;                                \
2335                         if (print_opt_debug) {                          \
2336                                 fprintf(stderr,                         \
2337                                         "VIR opt pass %2d: %s progress\n", \
2338                                         pass, #func);                   \
2339                         }                                               \
2340                         /*XXX vir_validate(c);*/                        \
2341                 }                                                       \
2342         } while (0)
2343 
2344 void
2345 vir_optimize(struct v3d_compile *c)
2346 {
2347         bool print_opt_debug = false;
2348         int pass = 1;
2349 
2350         while (true) {
2351                 bool progress = false;
2352 
2353                 OPTPASS(vir_opt_copy_propagate);
2354                 OPTPASS(vir_opt_redundant_flags);
2355                 OPTPASS(vir_opt_dead_code);
2356                 OPTPASS(vir_opt_small_immediates);
2357                 OPTPASS(vir_opt_constant_alu);
2358 
2359                 if (!progress)
2360                         break;
2361 
2362                 pass++;
2363         }
2364 }
2365 
2366 const char *
2367 vir_get_stage_name(struct v3d_compile *c)
2368 {
2369         if (c->vs_key && c->vs_key->is_coord)
2370                 return "MESA_SHADER_VERTEX_BIN";
2371         else if (c->gs_key && c->gs_key->is_coord)
2372                 return "MESA_SHADER_GEOMETRY_BIN";
2373         else
2374                 return gl_shader_stage_name(c->s->info.stage);
2375 }
2376 
2377 static inline uint32_t
2378 compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
2379 {
2380    assert(devinfo->vpm_size > 0);
2381    const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
2382    return devinfo->vpm_size / sector_size;
2383 }
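
/* Illustrative arithmetic (assuming V3D_CHANNELS == 16 and a hypothetical
 * vpm_size of 16384 bytes): sector_size = 16 * 4 * 8 = 512 bytes, so the
 * VPM holds 16384 / 512 = 32 sectors.
 */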
2384 
2385 /* Computes various parameters affecting VPM memory configuration for programs
2386  * involving geometry shaders to ensure the program fits in memory and honors
2387  * requirements described in section "VPM usage" of the programming manual.
2388  */
2389 static bool
2390 compute_vpm_config_gs(struct v3d_device_info *devinfo,
2391                       struct v3d_vs_prog_data *vs,
2392                       struct v3d_gs_prog_data *gs,
2393                       struct vpm_config *vpm_cfg_out)
2394 {
2395    const uint32_t A = vs->separate_segments ? 1 : 0;
2396    const uint32_t Ad = vs->vpm_input_size;
2397    const uint32_t Vd = vs->vpm_output_size;
2398 
2399    const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
2400 
2401    /* Try to fit program into our VPM memory budget by adjusting
2402     * configurable parameters iteratively. We do this in two phases:
2403     * the first phase tries to fit the program into the total available
2404     * VPM memory. If we succeed at that, then the second phase attempts
2405     * to fit the program into half of that budget so we can run bin and
2406     * render programs in parallel.
2407     */
2408    struct vpm_config vpm_cfg[2];
2409    struct vpm_config *final_vpm_cfg = NULL;
2410    uint32_t phase = 0;
2411 
2412    vpm_cfg[phase].As = 1;
2413    vpm_cfg[phase].Gs = 1;
2414    vpm_cfg[phase].Gd = gs->vpm_output_size;
2415    vpm_cfg[phase].gs_width = gs->simd_width;
2416 
2417    /* While there is a requirement that Vc >= [Vn / 16], this is
2418     * always the case when tessellation is not present because in that
2419     * case Vn can only be 6 at most (when input primitive is triangles
2420     * with adjacency).
2421     *
2422     * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
2423     * and Broadcom has not found it worth it to increase it beyond this
2424     * in general. Increasing Vc also increases VPM memory pressure which
2425     * can turn up being detrimental for performance in some scenarios.
2426     */
2427    vpm_cfg[phase].Vc = 2;
2428 
2429    /* Gv is a constraint on the hardware to not exceed the
2430     * specified number of vertex segments per GS batch. If adding a
2431     * new primitive to a GS batch would result in a range of more
2432     * than Gv vertex segments being referenced by the batch, then
2433     * the hardware will flush the batch and start a new one. This
2434     * means that we can choose any value we want, we just need to
2435     * be aware that larger values improve GS batch utilization
2436     * at the expense of more VPM memory pressure (which can affect
2437     * other performance aspects, such as GS dispatch width).
2438     * We start with the largest value, and will reduce it if we
2439     * find that total memory pressure is too high.
2440     */
2441    vpm_cfg[phase].Gv = 3;
2442    do {
2443       /* When GS is present in absence of TES, then we need to satisfy
2444        * that Ve >= Gv. We go with the smallest value of Ve to avoid
2445        * increasing memory pressure.
2446        */
2447       vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
2448 
2449       uint32_t vpm_sectors =
2450          A * vpm_cfg[phase].As * Ad +
2451          (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
2452          vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
2453 
2454       /* Ideally we want to use no more than half of the available
2455        * memory so we can execute a bin and render program in parallel
2456        * without stalls. If we achieved that then we are done.
2457        */
2458       if (vpm_sectors <= vpm_size / 2) {
2459          final_vpm_cfg = &vpm_cfg[phase];
2460          break;
2461       }
2462 
2463       /* At the very least, we should not allocate more than the
2464        * total available VPM memory. If we have a configuration that
2465        * succeeds at this we save it and continue to see if we can
2466        * meet the half-memory-use criterion too.
2467        */
2468       if (phase == 0 && vpm_sectors <= vpm_size) {
2469          vpm_cfg[1] = vpm_cfg[0];
2470          phase = 1;
2471       }
2472 
2473       /* Try lowering Gv */
2474       if (vpm_cfg[phase].Gv > 0) {
2475          vpm_cfg[phase].Gv--;
2476          continue;
2477       }
2478 
2479       /* Try lowering GS dispatch width */
2480       if (vpm_cfg[phase].gs_width > 1) {
2481          do {
2482             vpm_cfg[phase].gs_width >>= 1;
2483             vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
2484          } while (vpm_cfg[phase].gs_width == 2);
2485 
2486          /* Reset Gv to max after dropping dispatch width */
2487          vpm_cfg[phase].Gv = 3;
2488          continue;
2489       }
2490 
2491       /* We ran out of options to reduce memory pressure. If we
2492        * are at phase 1, we have at least a valid configuration, so
2493        * we use that.
2494        */
2495       if (phase == 1)
2496          final_vpm_cfg = &vpm_cfg[0];
2497       break;
2498    } while (true);
2499 
2500    if (!final_vpm_cfg)
2501       return false;
2502 
2503    assert(final_vpm_cfg);
2504    assert(final_vpm_cfg->Gd <= 16);
2505    assert(final_vpm_cfg->Gv < 4);
2506    assert(final_vpm_cfg->Ve < 4);
2507    assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
2508    assert(final_vpm_cfg->gs_width == 1 ||
2509           final_vpm_cfg->gs_width == 4 ||
2510           final_vpm_cfg->gs_width == 8 ||
2511           final_vpm_cfg->gs_width == 16);
2512 
2513    *vpm_cfg_out = *final_vpm_cfg;
2514    return true;
2515 }
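
/* Illustrative walk (hypothetical sizes): with A = 0, Vd = 4, Gd = 8, Vc = 2
 * and Gv = 3 (so Ve = 3), the first iteration needs (2 + 3) * 4 + 1 * 8 = 28
 * sectors. With 32 sectors of VPM that misses the 16-sector half-memory
 * target but fits in total memory, so phase 1 is entered and Gv is lowered,
 * re-evaluating until either the 16-sector target is met or the options run
 * out and the saved total-memory configuration is used.
 */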
2516 
2517 bool
2518 v3d_compute_vpm_config(struct v3d_device_info *devinfo,
2519                        struct v3d_vs_prog_data *vs_bin,
2520                        struct v3d_vs_prog_data *vs,
2521                        struct v3d_gs_prog_data *gs_bin,
2522                        struct v3d_gs_prog_data *gs,
2523                        struct vpm_config *vpm_cfg_bin,
2524                        struct vpm_config *vpm_cfg)
2525 {
2526    assert(vs && vs_bin);
2527    assert((gs != NULL) == (gs_bin != NULL));
2528 
2529    if (!gs) {
2530       vpm_cfg_bin->As = 1;
2531       vpm_cfg_bin->Ve = 0;
2532       vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
2533 
2534       vpm_cfg->As = 1;
2535       vpm_cfg->Ve = 0;
2536       vpm_cfg->Vc = vs->vcm_cache_size;
2537    } else {
2538       if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
2539          return false;
2540 
2541       if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
2542          return false;
2543    }
2544 
2545    return true;
2546 }
2547 
2548 static inline uint32_t
2549 compute_prog_score(struct v3d_prog_data *p, uint32_t qpu_size)
2550 {
2551         const uint32_t inst_count = qpu_size / sizeof(uint64_t);
2552         const uint32_t tmu_count = p->tmu_count + p->tmu_spills + p->tmu_fills;
2553         return inst_count + 4 * tmu_count;
2554 }
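
/* Illustrative arithmetic (hypothetical numbers): a 512-byte QPU program is
 * 512 / 8 = 64 instructions; with tmu_count + tmu_spills + tmu_fills == 10
 * the score is 64 + 4 * 10 = 104.
 */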
2555 
2556 void
2557 v3d_update_double_buffer_score(uint32_t vertex_count,
2558                                uint32_t vs_qpu_size,
2559                                uint32_t fs_qpu_size,
2560                                struct v3d_prog_data *vs,
2561                                struct v3d_prog_data *fs,
2562                                struct v3d_double_buffer_score *score)
2563 {
2564         score->geom += vertex_count * compute_prog_score(vs, vs_qpu_size);
2565         score->render += compute_prog_score(fs, fs_qpu_size);
2566 }
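
/* Illustrative arithmetic (hypothetical numbers): drawing 300 vertices with
 * a vertex shader scoring 104 and a fragment shader scoring 80 adds
 * 300 * 104 = 31200 to score->geom and 80 to score->render.
 */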
2567