1 /*
2  * Copyright © 2016-2017 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "broadcom/common/v3d_device_info.h"
25 #include "v3d_compiler.h"
26 #include "compiler/nir/nir_schedule.h"
27 #include "compiler/nir/nir_builder.h"
28 
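/* Returns the number of sources the instruction reads: branches read none,
 * and ALU instructions read whatever the non-NOP add or mul op consumes.
 */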
29 int
30 vir_get_nsrc(struct qinst *inst)
31 {
32         switch (inst->qpu.type) {
33         case V3D_QPU_INSTR_TYPE_BRANCH:
34                 return 0;
35         case V3D_QPU_INSTR_TYPE_ALU:
36                 if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
37                         return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
38                 else
39                         return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
40         }
41 
42         return 0;
43 }
44 
45 /**
46  * Returns whether the instruction has any side effects that must be
47  * preserved.
48  */
49 bool
50 vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
51 {
52         switch (inst->qpu.type) {
53         case V3D_QPU_INSTR_TYPE_BRANCH:
54                 return true;
55         case V3D_QPU_INSTR_TYPE_ALU:
56                 switch (inst->qpu.alu.add.op) {
57                 case V3D_QPU_A_SETREVF:
58                 case V3D_QPU_A_SETMSF:
59                 case V3D_QPU_A_VPMSETUP:
60                 case V3D_QPU_A_STVPMV:
61                 case V3D_QPU_A_STVPMD:
62                 case V3D_QPU_A_STVPMP:
63                 case V3D_QPU_A_VPMWT:
64                 case V3D_QPU_A_TMUWT:
65                         return true;
66                 default:
67                         break;
68                 }
69 
70                 switch (inst->qpu.alu.mul.op) {
71                 case V3D_QPU_M_MULTOP:
72                         return true;
73                 default:
74                         break;
75                 }
76         }
77 
78         if (inst->qpu.sig.ldtmu ||
79             inst->qpu.sig.ldvary ||
80             inst->qpu.sig.ldtlbu ||
81             inst->qpu.sig.ldtlb ||
82             inst->qpu.sig.wrtmuc ||
83             inst->qpu.sig.thrsw) {
84                 return true;
85         }
86 
87         /* ldunifa works like ldunif: it reads an element and advances the
88          * pointer, so each read has a side effect and allowing DCE to remove
89          * one would break follow-up loads (we don't care about ldunif because
90          * we reconstruct the uniform stream buffer after compiling with the
91          * surviving uniforms). We could fix this by emitting a unifa for each
92          * ldunifa, but each unifa requires 3 delay slots before a ldunifa, so
93          * that would be quite expensive.
94          */
95         if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
96                 return true;
97 
98         return false;
99 }
100 
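/* Returns whether the instruction is a plain MOV/FMOV with no output pack,
 * no input unpack and no condition, i.e. a raw copy of its source.
 */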
101 bool
102 vir_is_raw_mov(struct qinst *inst)
103 {
104         if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
105             (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
106              inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
107                 return false;
108         }
109 
110         if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
111             inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
112                 return false;
113         }
114 
115         if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
116             inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
117             inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
118             inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
119                 return false;
120         }
121 
122         if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
123             inst->qpu.flags.mc != V3D_QPU_COND_NONE)
124                 return false;
125 
126         return true;
127 }
128 
129 bool
130 vir_is_add(struct qinst *inst)
131 {
132         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
133                 inst->qpu.alu.add.op != V3D_QPU_A_NOP);
134 }
135 
136 bool
137 vir_is_mul(struct qinst *inst)
138 {
139         return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
140                 inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
141 }
142 
143 bool
144 vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
145 {
146         if (inst->dst.file == QFILE_MAGIC)
147                 return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
148 
149         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
150             inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
151                 return true;
152         }
153 
154         return false;
155 }
156 
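/* The SFU magic waddrs (RECIP, RSQRT, EXP, LOG, SIN) return their result
 * through the r4 accumulator, so writes to them implicitly clobber r4 on
 * devices that have accumulators.
 */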
157 bool
158 vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
159                          struct qinst *inst)
160 {
161         if (!devinfo->has_accumulators)
162                 return false;
163 
164         switch (inst->dst.file) {
165         case QFILE_MAGIC:
166                 switch (inst->dst.index) {
167                 case V3D_QPU_WADDR_RECIP:
168                 case V3D_QPU_WADDR_RSQRT:
169                 case V3D_QPU_WADDR_EXP:
170                 case V3D_QPU_WADDR_LOG:
171                 case V3D_QPU_WADDR_SIN:
172                         return true;
173                 }
174                 break;
175         default:
176                 break;
177         }
178 
179         return false;
180 }
181 
182 void
183 vir_set_unpack(struct qinst *inst, int src,
184                enum v3d_qpu_input_unpack unpack)
185 {
186         assert(src == 0 || src == 1);
187 
188         if (vir_is_add(inst)) {
189                 if (src == 0)
190                         inst->qpu.alu.add.a.unpack = unpack;
191                 else
192                         inst->qpu.alu.add.b.unpack = unpack;
193         } else {
194                 assert(vir_is_mul(inst));
195                 if (src == 0)
196                         inst->qpu.alu.mul.a.unpack = unpack;
197                 else
198                         inst->qpu.alu.mul.b.unpack = unpack;
199         }
200 }
201 
202 void
203 vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
204 {
205         if (vir_is_add(inst)) {
206                 inst->qpu.alu.add.output_pack = pack;
207         } else {
208                 assert(vir_is_mul(inst));
209                 inst->qpu.alu.mul.output_pack = pack;
210         }
211 }
212 
213 void
214 vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
215 {
216         if (vir_is_add(inst)) {
217                 inst->qpu.flags.ac = cond;
218         } else {
219                 assert(vir_is_mul(inst));
220                 inst->qpu.flags.mc = cond;
221         }
222 }
223 
224 enum v3d_qpu_cond
225 vir_get_cond(struct qinst *inst)
226 {
227         assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
228 
229         if (vir_is_add(inst))
230                 return inst->qpu.flags.ac;
231         else if (vir_is_mul(inst))
232                 return inst->qpu.flags.mc;
233         else /* NOP */
234                 return V3D_QPU_COND_NONE;
235 }
236 
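/* Both flag setters below also reset c->flags_temp, invalidating any record
 * of which temporary the currently set flags were generated from.
 */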
237 void
238 vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
239 {
240         c->flags_temp = -1;
241         if (vir_is_add(inst)) {
242                 inst->qpu.flags.apf = pf;
243         } else {
244                 assert(vir_is_mul(inst));
245                 inst->qpu.flags.mpf = pf;
246         }
247 }
248 
249 void
250 vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
251 {
252         c->flags_temp = -1;
253         if (vir_is_add(inst)) {
254                 inst->qpu.flags.auf = uf;
255         } else {
256                 assert(vir_is_mul(inst));
257                 inst->qpu.flags.muf = uf;
258         }
259 }
260 
261 #if 0
262 uint8_t
263 vir_channels_written(struct qinst *inst)
264 {
265         if (vir_is_mul(inst)) {
266                 switch (inst->dst.pack) {
267                 case QPU_PACK_MUL_NOP:
268                 case QPU_PACK_MUL_8888:
269                         return 0xf;
270                 case QPU_PACK_MUL_8A:
271                         return 0x1;
272                 case QPU_PACK_MUL_8B:
273                         return 0x2;
274                 case QPU_PACK_MUL_8C:
275                         return 0x4;
276                 case QPU_PACK_MUL_8D:
277                         return 0x8;
278                 }
279         } else {
280                 switch (inst->dst.pack) {
281                 case QPU_PACK_A_NOP:
282                 case QPU_PACK_A_8888:
283                 case QPU_PACK_A_8888_SAT:
284                 case QPU_PACK_A_32_SAT:
285                         return 0xf;
286                 case QPU_PACK_A_8A:
287                 case QPU_PACK_A_8A_SAT:
288                         return 0x1;
289                 case QPU_PACK_A_8B:
290                 case QPU_PACK_A_8B_SAT:
291                         return 0x2;
292                 case QPU_PACK_A_8C:
293                 case QPU_PACK_A_8C_SAT:
294                         return 0x4;
295                 case QPU_PACK_A_8D:
296                 case QPU_PACK_A_8D_SAT:
297                         return 0x8;
298                 case QPU_PACK_A_16A:
299                 case QPU_PACK_A_16A_SAT:
300                         return 0x3;
301                 case QPU_PACK_A_16B:
302                 case QPU_PACK_A_16B_SAT:
303                         return 0xc;
304                 }
305         }
306         unreachable("Bad pack field");
307 }
308 #endif
309 
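/* Allocates a new temporary register, growing the defs and spillable arrays
 * as needed.  New temps are considered spillable by default.
 */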
310 struct qreg
311 vir_get_temp(struct v3d_compile *c)
312 {
313         struct qreg reg;
314 
315         reg.file = QFILE_TEMP;
316         reg.index = c->num_temps++;
317 
318         if (c->num_temps > c->defs_array_size) {
319                 uint32_t old_size = c->defs_array_size;
320                 c->defs_array_size = MAX2(old_size * 2, 16);
321 
322                 c->defs = reralloc(c, c->defs, struct qinst *,
323                                    c->defs_array_size);
324                 memset(&c->defs[old_size], 0,
325                        sizeof(c->defs[0]) * (c->defs_array_size - old_size));
326 
327                 c->spillable = reralloc(c, c->spillable,
328                                         BITSET_WORD,
329                                         BITSET_WORDS(c->defs_array_size));
330                 for (int i = old_size; i < c->defs_array_size; i++)
331                         BITSET_SET(c->spillable, i);
332         }
333 
334         return reg;
335 }
336 
337 struct qinst *
338 vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
339 {
340         struct qinst *inst = calloc(1, sizeof(*inst));
341 
342         inst->qpu = v3d_qpu_nop();
343         inst->qpu.alu.add.op = op;
344 
345         inst->dst = dst;
346         inst->src[0] = src0;
347         inst->src[1] = src1;
348         inst->uniform = ~0;
349 
350         inst->ip = -1;
351 
352         return inst;
353 }
354 
355 struct qinst *
356 vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
357 {
358         struct qinst *inst = calloc(1, sizeof(*inst));
359 
360         inst->qpu = v3d_qpu_nop();
361         inst->qpu.alu.mul.op = op;
362 
363         inst->dst = dst;
364         inst->src[0] = src0;
365         inst->src[1] = src1;
366         inst->uniform = ~0;
367 
368         inst->ip = -1;
369 
370         return inst;
371 }
372 
373 struct qinst *
374 vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
375 {
376         struct qinst *inst = calloc(1, sizeof(*inst));
377 
378         inst->qpu = v3d_qpu_nop();
379         inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
380         inst->qpu.branch.cond = cond;
381         inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
382         inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
383         inst->qpu.branch.ub = true;
384         inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;
385 
386         inst->dst = vir_nop_reg();
387         inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);
388 
389         inst->ip = -1;
390 
391         return inst;
392 }
393 
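/* Inserts the instruction at the current cursor position, leaves the cursor
 * pointing right after it and invalidates the live intervals.
 */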
394 static void
395 vir_emit(struct v3d_compile *c, struct qinst *inst)
396 {
397         inst->ip = -1;
398 
399         switch (c->cursor.mode) {
400         case vir_cursor_add:
401                 list_add(&inst->link, c->cursor.link);
402                 break;
403         case vir_cursor_addtail:
404                 list_addtail(&inst->link, c->cursor.link);
405                 break;
406         }
407 
408         c->cursor = vir_after_inst(inst);
409         c->live_intervals_valid = false;
410 }
411 
412 /* Updates inst to write to a new temporary, emits it, and notes the def. */
413 struct qreg
414 vir_emit_def(struct v3d_compile *c, struct qinst *inst)
415 {
416         assert(inst->dst.file == QFILE_NULL);
417 
418         /* If we're emitting an instruction that's a def, it had better be
419          * writing a register.
420          */
421         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
422                 assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
423                        v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
424                 assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
425                        v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
426         }
427 
428         inst->dst = vir_get_temp(c);
429 
430         if (inst->dst.file == QFILE_TEMP)
431                 c->defs[inst->dst.index] = inst;
432 
433         vir_emit(c, inst);
434 
435         return inst->dst;
436 }
437 
438 struct qinst *
439 vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
440 {
441         if (inst->dst.file == QFILE_TEMP)
442                 c->defs[inst->dst.index] = NULL;
443 
444         vir_emit(c, inst);
445 
446         return inst;
447 }
448 
449 struct qblock *
450 vir_new_block(struct v3d_compile *c)
451 {
452         struct qblock *block = rzalloc(c, struct qblock);
453 
454         list_inithead(&block->instructions);
455 
456         block->predecessors = _mesa_set_create(block,
457                                                _mesa_hash_pointer,
458                                                _mesa_key_pointer_equal);
459 
460         block->index = c->next_block_index++;
461 
462         return block;
463 }
464 
465 void
466 vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
467 {
468         c->cur_block = block;
469         c->cursor = vir_after_block(block);
470         list_addtail(&block->link, &c->blocks);
471 }
472 
473 struct qblock *
474 vir_entry_block(struct v3d_compile *c)
475 {
476         return list_first_entry(&c->blocks, struct qblock, link);
477 }
478 
479 struct qblock *
480 vir_exit_block(struct v3d_compile *c)
481 {
482         return list_last_entry(&c->blocks, struct qblock, link);
483 }
484 
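/* Adds a CFG edge from predecessor to successor.  A block can have at most
 * two successors (e.g. the two sides of a conditional branch).
 */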
485 void
486 vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
487 {
488         _mesa_set_add(successor->predecessors, predecessor);
489         if (predecessor->successors[0]) {
490                 assert(!predecessor->successors[1]);
491                 predecessor->successors[1] = successor;
492         } else {
493                 predecessor->successors[0] = successor;
494         }
495 }
496 
497 const struct v3d_compiler *
498 v3d_compiler_init(const struct v3d_device_info *devinfo,
499                   uint32_t max_inline_uniform_buffers)
500 {
501         struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
502         if (!compiler)
503                 return NULL;
504 
505         compiler->devinfo = devinfo;
506         compiler->max_inline_uniform_buffers = max_inline_uniform_buffers;
507 
508         if (!vir_init_reg_sets(compiler)) {
509                 ralloc_free(compiler);
510                 return NULL;
511         }
512 
513         return compiler;
514 }
515 
516 void
517 v3d_compiler_free(const struct v3d_compiler *compiler)
518 {
519         ralloc_free((void *)compiler);
520 }
521 
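/* Describes one compile strategy: the thread limits for register allocation,
 * which optimization and scheduling features to disable, and the maximum
 * number of TMU spills to allow.
 */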
522 struct v3d_compiler_strategy {
523         const char *name;
524         uint32_t max_threads;
525         uint32_t min_threads;
526         bool disable_general_tmu_sched;
527         bool disable_gcm;
528         bool disable_loop_unrolling;
529         bool disable_ubo_load_sorting;
530         bool move_buffer_loads;
531         bool disable_tmu_pipelining;
532         uint32_t max_tmu_spills;
533 };
534 
535 static struct v3d_compile *
536 vir_compile_init(const struct v3d_compiler *compiler,
537                  struct v3d_key *key,
538                  nir_shader *s,
539                  void (*debug_output)(const char *msg,
540                                       void *debug_output_data),
541                  void *debug_output_data,
542                  int program_id, int variant_id,
543                  uint32_t compile_strategy_idx,
544                  const struct v3d_compiler_strategy *strategy,
545                  bool fallback_scheduler)
546 {
547         struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);
548 
549         c->compiler = compiler;
550         c->devinfo = compiler->devinfo;
551         c->key = key;
552         c->program_id = program_id;
553         c->variant_id = variant_id;
554         c->compile_strategy_idx = compile_strategy_idx;
555         c->threads = strategy->max_threads;
556         c->debug_output = debug_output;
557         c->debug_output_data = debug_output_data;
558         c->compilation_result = V3D_COMPILATION_SUCCEEDED;
559         c->min_threads_for_reg_alloc = strategy->min_threads;
560         c->max_tmu_spills = strategy->max_tmu_spills;
561         c->fallback_scheduler = fallback_scheduler;
562         c->disable_general_tmu_sched = strategy->disable_general_tmu_sched;
563         c->disable_tmu_pipelining = strategy->disable_tmu_pipelining;
564         c->disable_constant_ubo_load_sorting = strategy->disable_ubo_load_sorting;
565         c->move_buffer_loads = strategy->move_buffer_loads;
566         c->disable_gcm = strategy->disable_gcm;
567         c->disable_loop_unrolling = V3D_DBG(NO_LOOP_UNROLL)
568                 ? true : strategy->disable_loop_unrolling;
569 
570 
571         s = nir_shader_clone(c, s);
572         c->s = s;
573 
574         list_inithead(&c->blocks);
575         vir_set_emit_block(c, vir_new_block(c));
576 
577         c->output_position_index = -1;
578         c->output_sample_mask_index = -1;
579 
580         c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
581                                             _mesa_key_pointer_equal);
582 
583         c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
584         c->flags_temp = -1;
585 
586         return c;
587 }
588 
589 static int
590 type_size_vec4(const struct glsl_type *type, bool bindless)
591 {
592         return glsl_count_attribute_slots(type, false);
593 }
594 
595 static enum nir_lower_tex_packing
596 lower_tex_packing_cb(const nir_tex_instr *tex, const void *data)
597 {
598    struct v3d_compile *c = (struct v3d_compile *) data;
599 
600    int sampler_index = nir_tex_instr_need_sampler(tex) ?
601       tex->sampler_index : tex->backend_flags;
602 
603    assert(sampler_index < c->key->num_samplers_used);
604    return c->key->sampler[sampler_index].return_size == 16 ?
605       nir_lower_tex_packing_16 : nir_lower_tex_packing_none;
606 }
607 
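/* Looks for UBO/SSBO accesses whose buffer index comes from a bcsel with a
 * null (0xffffffff) pointer on one side and rewrites the null side to reuse
 * the other side's index, so later optimization can fold the bcsel away into
 * a constant buffer index.
 */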
608 static bool
609 v3d_nir_lower_null_pointers_cb(nir_builder *b,
610                                nir_intrinsic_instr *intr,
611                                void *_state)
612 {
613         uint32_t buffer_src_idx;
614 
615         switch (intr->intrinsic) {
616         case nir_intrinsic_load_ubo:
617         case nir_intrinsic_load_ssbo:
618                 buffer_src_idx = 0;
619                 break;
620         case nir_intrinsic_store_ssbo:
621                 buffer_src_idx = 1;
622                 break;
623         default:
624                 return false;
625         }
626 
627         /* If the index is constant we are good */
628         nir_src *src = &intr->src[buffer_src_idx];
629         if (nir_src_is_const(*src))
630                 return false;
631 
632         /* Otherwise, see if it comes from a bcsel including a null pointer */
633         if (src->ssa->parent_instr->type != nir_instr_type_alu)
634                 return false;
635 
636         nir_alu_instr *alu = nir_instr_as_alu(src->ssa->parent_instr);
637         if (alu->op != nir_op_bcsel)
638                 return false;
639 
640         /* A null pointer is specified using block index 0xffffffff */
641         int32_t null_src_idx = -1;
642         for (int i = 1; i < 3; i++) {
643                  /* FIXME: since we are running this before optimization, maybe
644                   * we also need to handle the case where we have a bcsel
645                   * chain that we need to recurse into?
646                   */
647                 if (!nir_src_is_const(alu->src[i].src))
648                         continue;
649                 if (nir_src_comp_as_uint(alu->src[i].src, 0) != 0xffffffff)
650                         continue;
651 
652                 /* One of the bcsel srcs is a null pointer reference */
653                 null_src_idx = i;
654                 break;
655         }
656 
657         if (null_src_idx < 0)
658                 return false;
659 
660         assert(null_src_idx == 1 || null_src_idx == 2);
661         int32_t copy_src_idx = null_src_idx == 1 ? 2 : 1;
662 
663         /* Rewrite the null pointer reference so we use the same buffer index
664          * as the other bcsel branch. This will allow optimization to remove
665          * the bcsel and we should then end up with a constant buffer index
666          * like we need.
667          */
668         b->cursor = nir_before_instr(&alu->instr);
669         nir_def *copy = nir_mov(b, alu->src[copy_src_idx].src.ssa);
670         nir_src_rewrite(&alu->src[null_src_idx].src, copy);
671 
672         return true;
673 }
674 
675 static bool
676 v3d_nir_lower_null_pointers(nir_shader *s)
677 {
678         return nir_shader_intrinsics_pass(s, v3d_nir_lower_null_pointers_cb,
679                                             nir_metadata_block_index |
680                                             nir_metadata_dominance, NULL);
681 }
682 
683 static void
684 v3d_lower_nir(struct v3d_compile *c)
685 {
686         struct nir_lower_tex_options tex_options = {
687                 .lower_txd = true,
688                 .lower_tg4_offsets = true,
689                 .lower_tg4_broadcom_swizzle = true,
690 
691                 .lower_rect = false, /* XXX: Use this on V3D 3.x */
692                 .lower_txp = ~0,
693                 /* Apply swizzles to all samplers. */
694                 .swizzle_result = ~0,
695                 .lower_invalid_implicit_lod = true,
696         };
697 
698         /* Lower the format swizzle and (for 32-bit returns)
699          * ARB_texture_swizzle-style swizzle.
700          */
701         assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
702         for (int i = 0; i < c->key->num_tex_used; i++) {
703                 for (int j = 0; j < 4; j++)
704                         tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
705         }
706 
707         tex_options.lower_tex_packing_cb = lower_tex_packing_cb;
708         tex_options.lower_tex_packing_data = c;
709 
710         NIR_PASS(_, c->s, nir_lower_tex, &tex_options);
711         NIR_PASS(_, c->s, nir_lower_system_values);
712 
713         if (c->s->info.zero_initialize_shared_memory &&
714             c->s->info.shared_size > 0) {
715                 /* All our BOs allocate full pages, so the underlying allocation
716                  * for shared memory will always be a multiple of 4KB. This
717                  * ensures that we can do an exact number of full chunk_size
718                  * writes to initialize the memory independently of the actual
719                  * shared_size used by the shader, which is a requirement of
720                  * the initialization pass.
721                  */
722                 const unsigned chunk_size = 16; /* max single store size */
723                 NIR_PASS(_, c->s, nir_zero_initialize_shared_memory,
724                          align(c->s->info.shared_size, chunk_size), chunk_size);
725         }
726 
727         NIR_PASS(_, c->s, nir_lower_compute_system_values, NULL);
728 
729         NIR_PASS(_, c->s, nir_lower_vars_to_scratch,
730                  nir_var_function_temp,
731                  0,
732                  glsl_get_natural_size_align_bytes);
733         NIR_PASS(_, c->s, nir_lower_is_helper_invocation);
734         NIR_PASS(_, c->s, v3d_nir_lower_scratch);
735         NIR_PASS(_, c->s, v3d_nir_lower_null_pointers);
736 }
737 
738 static void
739 v3d_set_prog_data_uniforms(struct v3d_compile *c,
740                            struct v3d_prog_data *prog_data)
741 {
742         int count = c->num_uniforms;
743         struct v3d_uniform_list *ulist = &prog_data->uniforms;
744 
745         ulist->count = count;
746         ulist->data = ralloc_array(prog_data, uint32_t, count);
747         memcpy(ulist->data, c->uniform_data,
748                count * sizeof(*ulist->data));
749         ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
750         memcpy(ulist->contents, c->uniform_contents,
751                count * sizeof(*ulist->contents));
752 }
753 
754 static void
755 v3d_vs_set_prog_data(struct v3d_compile *c,
756                      struct v3d_vs_prog_data *prog_data)
757 {
758         /* The vertex data gets format converted by the VPM so that
759          * each attribute channel takes up a VPM column.  Precompute
760          * the sizes for the shader record.
761          */
762         for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
763                 prog_data->vattr_sizes[i] = c->vattr_sizes[i];
764                 prog_data->vpm_input_size += c->vattr_sizes[i];
765         }
766 
767         memset(prog_data->driver_location_map, -1,
768                sizeof(prog_data->driver_location_map));
769 
770         nir_foreach_shader_in_variable(var, c->s) {
771                 prog_data->driver_location_map[var->data.location] =
772                         var->data.driver_location;
773         }
774 
775         prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
776                                           SYSTEM_VALUE_VERTEX_ID) ||
777                               BITSET_TEST(c->s->info.system_values_read,
778                                           SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
779 
780         prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
781                                            SYSTEM_VALUE_BASE_INSTANCE);
782 
783         prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
784                                           SYSTEM_VALUE_INSTANCE_ID) ||
785                               BITSET_TEST(c->s->info.system_values_read,
786                                           SYSTEM_VALUE_INSTANCE_INDEX);
787 
788         if (prog_data->uses_vid)
789                 prog_data->vpm_input_size++;
790         if (prog_data->uses_biid)
791                 prog_data->vpm_input_size++;
792         if (prog_data->uses_iid)
793                 prog_data->vpm_input_size++;
794 
795         /* Input/output segment size are in sectors (8 rows of 32 bits per
796          * channel).
797          */
798         prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
799         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
800 
801         /* Set us up for shared input/output segments.  This is apparently
802          * necessary for our VCM setup to avoid varying corruption.
803          *
804          * FIXME: initial testing on V3D 7.1 seems to work fine when using
805          * separate segments. So we could try to reevaluate in the future, if
806          * there is any advantage of using separate segments.
807          */
808         prog_data->separate_segments = false;
809         prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
810                                           prog_data->vpm_input_size);
811         prog_data->vpm_input_size = 0;
812 
813         /* Compute VCM cache size.  We set up our program to take up less than
814          * half of the VPM, so that any set of bin and render programs won't
815          * run out of space.  We need space for at least one input segment,
816          * and then allocate the rest to output segments (one for the current
817          * program, the rest to VCM).  The valid range of the VCM cache size
818          * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
819          * batches.
820          */
821         assert(c->devinfo->vpm_size);
822         int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
823         int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
824         int half_vpm = vpm_size_in_sectors / 2;
825         int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
826         int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
827         assert(vpm_output_batches >= 2);
828         prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
829 }
830 
831 static void
832 v3d_gs_set_prog_data(struct v3d_compile *c,
833                      struct v3d_gs_prog_data *prog_data)
834 {
835         prog_data->num_inputs = c->num_inputs;
836         memcpy(prog_data->input_slots, c->input_slots,
837                c->num_inputs * sizeof(*c->input_slots));
838 
839         /* gl_PrimitiveIdIn is written by the GBG into the first word of the
840          * VPM output header automatically and the shader will overwrite
841          * it after reading it if necessary, so it doesn't add to the VPM
842          * size requirements.
843          */
844         prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
845                                           SYSTEM_VALUE_PRIMITIVE_ID);
846 
847         /* Output segment size is in sectors (8 rows of 32 bits per channel) */
848         prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
849 
850         /* Compute SIMD dispatch width and update VPM output size accordingly
851          * to ensure we can fit our program in memory. Available widths are
852          * 16, 8, 4, 1.
853          *
854          * Notice that at draw time we will have to consider VPM memory
855          * requirements from other stages and choose a smaller dispatch
856          * width if needed to fit the program in VPM memory.
857          */
858         prog_data->simd_width = 16;
859         while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
860                prog_data->simd_width == 2) {
861                 prog_data->simd_width >>= 1;
862                 prog_data->vpm_output_size =
863                         align(prog_data->vpm_output_size, 2) / 2;
864         }
865         assert(prog_data->vpm_output_size <= 16);
866         assert(prog_data->simd_width != 2);
867 
868         prog_data->out_prim_type = c->s->info.gs.output_primitive;
869         prog_data->num_invocations = c->s->info.gs.invocations;
870 
871         prog_data->writes_psiz =
872             c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
873 }
874 
875 static void
876 v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
877                             struct v3d_fs_prog_data *prog_data)
878 {
879         prog_data->num_inputs = c->num_inputs;
880         memcpy(prog_data->input_slots, c->input_slots,
881                c->num_inputs * sizeof(*c->input_slots));
882 
883         STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
884                       (V3D_MAX_FS_INPUTS - 1) / 24);
885         for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
886                 if (BITSET_TEST(c->flat_shade_flags, i))
887                         prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
888 
889                 if (BITSET_TEST(c->noperspective_flags, i))
890                         prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
891 
892                 if (BITSET_TEST(c->centroid_flags, i))
893                         prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
894         }
895 }
896 
897 static void
898 v3d_fs_set_prog_data(struct v3d_compile *c,
899                      struct v3d_fs_prog_data *prog_data)
900 {
901         v3d_set_fs_prog_data_inputs(c, prog_data);
902         prog_data->writes_z = c->writes_z;
903         prog_data->writes_z_from_fep = c->writes_z_from_fep;
904         prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
905         prog_data->uses_center_w = c->uses_center_w;
906         prog_data->uses_implicit_point_line_varyings =
907                 c->uses_implicit_point_line_varyings;
908         prog_data->lock_scoreboard_on_first_thrsw =
909                 c->lock_scoreboard_on_first_thrsw;
910         prog_data->force_per_sample_msaa = c->s->info.fs.uses_sample_shading;
911         prog_data->uses_pid = c->fs_uses_primitive_id;
912 }
913 
914 static void
915 v3d_cs_set_prog_data(struct v3d_compile *c,
916                      struct v3d_compute_prog_data *prog_data)
917 {
918         prog_data->shared_size = c->s->info.shared_size;
919 
920         prog_data->local_size[0] = c->s->info.workgroup_size[0];
921         prog_data->local_size[1] = c->s->info.workgroup_size[1];
922         prog_data->local_size[2] = c->s->info.workgroup_size[2];
923 
924         prog_data->has_subgroups = c->has_subgroups;
925 }
926 
927 static void
928 v3d_set_prog_data(struct v3d_compile *c,
929                   struct v3d_prog_data *prog_data)
930 {
931         prog_data->threads = c->threads;
932         prog_data->single_seg = !c->last_thrsw;
933         prog_data->spill_size = c->spill_size;
934         prog_data->tmu_spills = c->spills;
935         prog_data->tmu_fills = c->fills;
936         prog_data->tmu_count = c->tmu.total_count;
937         prog_data->qpu_read_stalls = c->qpu_inst_stalled_count;
938         prog_data->compile_strategy_idx = c->compile_strategy_idx;
939         prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
940         prog_data->has_control_barrier = c->s->info.uses_control_barrier;
941         prog_data->has_global_address = c->has_global_address;
942 
943         v3d_set_prog_data_uniforms(c, prog_data);
944 
945         switch (c->s->info.stage) {
946         case MESA_SHADER_VERTEX:
947                 v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
948                 break;
949         case MESA_SHADER_GEOMETRY:
950                 v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
951                 break;
952         case MESA_SHADER_FRAGMENT:
953                 v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
954                 break;
955         case MESA_SHADER_COMPUTE:
956                 v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
957                 break;
958         default:
959                 unreachable("unsupported shader stage");
960         }
961 }
962 
963 static uint64_t *
964 v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
965 {
966         *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
967 
968         uint64_t *qpu_insts = malloc(*final_assembly_size);
969         if (!qpu_insts)
970                 return NULL;
971 
972         memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
973 
974         vir_compile_destroy(c);
975 
976         return qpu_insts;
977 }
978 
979 static void
980 v3d_nir_lower_vs_early(struct v3d_compile *c)
981 {
982         /* Split our I/O vars and dead code eliminate the unused
983          * components.
984          */
985         NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
986                  nir_var_shader_in | nir_var_shader_out);
987         uint64_t used_outputs[4] = {0};
988         for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
989                 int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
990                 int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
991                 used_outputs[comp] |= 1ull << slot;
992         }
993         NIR_PASS(_, c->s, nir_remove_unused_io_vars,
994                  nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
995         NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
996         v3d_optimize_nir(c, c->s);
997         NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
998 
999         /* This must go before nir_lower_io */
1000         if (c->vs_key->per_vertex_point_size)
1001                 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1002 
1003         NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1004                  type_size_vec4,
1005                  (nir_lower_io_options)0);
1006         /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1007          * on the code it generated.
1008          */
1009         NIR_PASS(_, c->s, nir_opt_dce);
1010         NIR_PASS(_, c->s, nir_opt_constant_folding);
1011 }
1012 
1013 static void
1014 v3d_nir_lower_gs_early(struct v3d_compile *c)
1015 {
1016         /* Split our I/O vars and dead code eliminate the unused
1017          * components.
1018          */
1019         NIR_PASS(_, c->s, nir_lower_io_to_scalar_early,
1020                  nir_var_shader_in | nir_var_shader_out);
1021         uint64_t used_outputs[4] = {0};
1022         for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
1023                 int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
1024                 int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
1025                 used_outputs[comp] |= 1ull << slot;
1026         }
1027         NIR_PASS(_, c->s, nir_remove_unused_io_vars,
1028                  nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
1029         NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1030         v3d_optimize_nir(c, c->s);
1031         NIR_PASS(_, c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
1032 
1033         /* This must go before nir_lower_io */
1034         if (c->gs_key->per_vertex_point_size)
1035                 NIR_PASS(_, c->s, nir_lower_point_size, 1.0f, 0.0f);
1036 
1037         NIR_PASS(_, c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1038                  type_size_vec4,
1039                  (nir_lower_io_options)0);
1040         /* clean up nir_lower_io's deref_var remains and do a constant folding pass
1041          * on the code it generated.
1042          */
1043         NIR_PASS(_, c->s, nir_opt_dce);
1044         NIR_PASS(_, c->s, nir_opt_constant_folding);
1045 }
1046 
1047 static void
1048 v3d_fixup_fs_output_types(struct v3d_compile *c)
1049 {
1050         nir_foreach_shader_out_variable(var, c->s) {
1051                 uint32_t mask = 0;
1052 
1053                 switch (var->data.location) {
1054                 case FRAG_RESULT_COLOR:
1055                         mask = ~0;
1056                         break;
1057                 case FRAG_RESULT_DATA0:
1058                 case FRAG_RESULT_DATA1:
1059                 case FRAG_RESULT_DATA2:
1060                 case FRAG_RESULT_DATA3:
1061                         mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
1062                         break;
1063                 }
1064 
1065                 if (c->fs_key->int_color_rb & mask) {
1066                         var->type =
1067                                 glsl_vector_type(GLSL_TYPE_INT,
1068                                                  glsl_get_components(var->type));
1069                 } else if (c->fs_key->uint_color_rb & mask) {
1070                         var->type =
1071                                 glsl_vector_type(GLSL_TYPE_UINT,
1072                                                  glsl_get_components(var->type));
1073                 }
1074         }
1075 }
1076 
1077 static void
1078 v3d_nir_lower_fs_early(struct v3d_compile *c)
1079 {
1080         if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
1081                 v3d_fixup_fs_output_types(c);
1082 
1083         NIR_PASS(_, c->s, v3d_nir_lower_logic_ops, c);
1084 
1085         if (c->fs_key->line_smoothing) {
1086                 NIR_PASS(_, c->s, v3d_nir_lower_line_smooth);
1087                 NIR_PASS(_, c->s, nir_lower_global_vars_to_local);
1088                 /* The lowering pass can introduce new sysval reads */
1089                 nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
1090         }
1091 }
1092 
1093 static void
1094 v3d_nir_lower_gs_late(struct v3d_compile *c)
1095 {
1096         if (c->key->ucp_enables) {
1097                 NIR_PASS(_, c->s, nir_lower_clip_gs, c->key->ucp_enables,
1098                          false, NULL);
1099         }
1100 
1101         /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
1102         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1103 }
1104 
1105 static void
1106 v3d_nir_lower_vs_late(struct v3d_compile *c)
1107 {
1108         if (c->key->ucp_enables) {
1109                 NIR_PASS(_, c->s, nir_lower_clip_vs, c->key->ucp_enables,
1110                          false, false, NULL);
1111                 NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1112                            nir_var_shader_out, NULL, NULL);
1113         }
1114 
1115         /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1116         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
1117 }
1118 
1119 static void
1120 v3d_nir_lower_fs_late(struct v3d_compile *c)
1121 {
1122         /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
1123          * Vulkan allows it, in which case the SPIR-V compiler will declare
1124          * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
1125          * the last parameter to always operate with a compact array in both
1126          * OpenGL and Vulkan so we don't have to care about the API we
1127          * are using.
1128          */
1129         if (c->key->ucp_enables)
1130                 NIR_PASS(_, c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
1131 
1132         NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL);
1133 }
1134 
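/* Returns the maximum number of temporaries that are simultaneously live at
 * any instruction, computed from the temp_start/temp_end intervals.
 */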
1135 static uint32_t
1136 vir_get_max_temps(struct v3d_compile *c)
1137 {
1138         int max_ip = 0;
1139         vir_for_each_inst_inorder(inst, c)
1140                 max_ip++;
1141 
1142         uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1143 
1144         for (int t = 0; t < c->num_temps; t++) {
1145                 for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1146                                                 i < max_ip); i++) {
1147                         if (i > max_ip)
1148                                 break;
1149                         pressure[i]++;
1150                 }
1151         }
1152 
1153         uint32_t max_temps = 0;
1154         for (int i = 0; i < max_ip; i++)
1155                 max_temps = MAX2(max_temps, pressure[i]);
1156 
1157         ralloc_free(pressure);
1158 
1159         return max_temps;
1160 }
1161 
1162 enum v3d_dependency_class {
1163         V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
1164 };
1165 
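/* nir_schedule dependency callback: in geometry shaders a store to VPM
 * output offset 0 overwrites the gl_PrimitiveID value delivered there, so
 * such stores must stay ordered against gl_PrimitiveID loads.
 */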
1166 static bool
1167 v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
1168                             nir_schedule_dependency *dep,
1169                             void *user_data)
1170 {
1171         struct v3d_compile *c = user_data;
1172 
1173         switch (intr->intrinsic) {
1174         case nir_intrinsic_store_output:
1175                 /* Writing to location 0 overwrites the value passed in for
1176                  * gl_PrimitiveID on geometry shaders
1177                  */
1178                 if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
1179                     nir_intrinsic_base(intr) != 0)
1180                         break;
1181 
1182                 nir_const_value *const_value =
1183                         nir_src_as_const_value(intr->src[1]);
1184 
1185                 if (const_value == NULL)
1186                         break;
1187 
1188                 uint64_t offset =
1189                         nir_const_value_as_uint(*const_value,
1190                                                 nir_src_bit_size(intr->src[1]));
1191                 if (offset != 0)
1192                         break;
1193 
1194                 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1195                 dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
1196                 return true;
1197 
1198         case nir_intrinsic_load_primitive_id:
1199                 if (c->s->info.stage != MESA_SHADER_GEOMETRY)
1200                         break;
1201 
1202                 dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
1203                 dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
1204                 return true;
1205 
1206         default:
1207                 break;
1208         }
1209 
1210         return false;
1211 }
1212 
1213 static unsigned
1214 v3d_instr_delay_cb(nir_instr *instr, void *data)
1215 {
1216    struct v3d_compile *c = (struct v3d_compile *) data;
1217 
1218    switch (instr->type) {
1219    case nir_instr_type_undef:
1220    case nir_instr_type_load_const:
1221    case nir_instr_type_alu:
1222    case nir_instr_type_deref:
1223    case nir_instr_type_jump:
1224    case nir_instr_type_parallel_copy:
1225    case nir_instr_type_call:
1226    case nir_instr_type_phi:
1227       return 1;
1228 
1229    /* We should not use very large delays for TMU instructions. Typically,
1230     * thread switches will be sufficient to hide all or most of the latency,
1231     * so we usually only need a little bit of extra room. If we over-estimate
1232     * the latency here we may end up unnecessarily delaying the critical path in
1233     * the shader, which would have a negative effect on performance, so here
1234     * we are trying to strike a balance based on empirical testing.
1235     */
1236    case nir_instr_type_intrinsic: {
1237       nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1238       if (!c->disable_general_tmu_sched) {
1239          switch (intr->intrinsic) {
1240          case nir_intrinsic_decl_reg:
1241          case nir_intrinsic_load_reg:
1242          case nir_intrinsic_store_reg:
1243             return 0;
1244          case nir_intrinsic_load_ssbo:
1245          case nir_intrinsic_load_scratch:
1246          case nir_intrinsic_load_shared:
1247          case nir_intrinsic_image_load:
1248             return 3;
1249          case nir_intrinsic_load_ubo:
1250             if (nir_src_is_divergent(intr->src[1]))
1251                return 3;
1252             FALLTHROUGH;
1253          default:
1254             return 1;
1255          }
1256       } else {
1257          switch (intr->intrinsic) {
1258          case nir_intrinsic_decl_reg:
1259          case nir_intrinsic_load_reg:
1260          case nir_intrinsic_store_reg:
1261             return 0;
1262          default:
1263             return 1;
1264          }
1265       }
1266       break;
1267    }
1268 
1269    case nir_instr_type_tex:
1270       return 5;
1271    }
1272 
1273    return 0;
1274 }
1275 
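/* Returns whether this store intrinsic should have its write mask split
 * (SSBO, shared, global and scratch stores).
 */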
1276 static bool
1277 should_split_wrmask(const nir_instr *instr, const void *data)
1278 {
1279         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
1280         switch (intr->intrinsic) {
1281         case nir_intrinsic_store_ssbo:
1282         case nir_intrinsic_store_shared:
1283         case nir_intrinsic_store_global:
1284         case nir_intrinsic_store_scratch:
1285                 return true;
1286         default:
1287                 return false;
1288         }
1289 }
1290 
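/* Returns the instruction as a constant-offset load_ubo intrinsic, or NULL
 * if it isn't one.  The UBO index is expected to already be constant.
 */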
1291 static nir_intrinsic_instr *
1292 nir_instr_as_constant_ubo_load(nir_instr *inst)
1293 {
1294         if (inst->type != nir_instr_type_intrinsic)
1295                 return NULL;
1296 
1297         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1298         if (intr->intrinsic != nir_intrinsic_load_ubo)
1299                 return NULL;
1300 
1301         assert(nir_src_is_const(intr->src[0]));
1302         if (!nir_src_is_const(intr->src[1]))
1303                 return NULL;
1304 
1305         return intr;
1306 }
1307 
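/* Tries to move constant-offset UBO loads for the same UBO index as ref
 * close to it, ordered by offset, so they can share unifa setup.  Returns
 * whether any progress was made.
 */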
1308 static bool
1309 v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
1310 {
1311         bool progress = false;
1312 
1313         nir_instr *ref_inst = &ref->instr;
1314         uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
1315         uint32_t ref_index = nir_src_as_uint(ref->src[0]);
1316 
1317         /* Go through all instructions after ref searching for constant UBO
1318          * loads for the same UBO index.
1319          */
1320         bool seq_break = false;
1321         nir_instr *inst = &ref->instr;
1322         nir_instr *next_inst = NULL;
1323         while (true) {
1324                 inst = next_inst ? next_inst : nir_instr_next(inst);
1325                 if (!inst)
1326                         break;
1327 
1328                 next_inst = NULL;
1329 
1330                 if (inst->type != nir_instr_type_intrinsic)
1331                         continue;
1332 
1333                 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
1334                 if (intr->intrinsic != nir_intrinsic_load_ubo)
1335                         continue;
1336 
1337                 /* We only produce unifa sequences for non-divergent loads */
1338                 if (nir_src_is_divergent(intr->src[1]))
1339                         continue;
1340 
1341                 /* If there are any UBO loads that are not constant or that
1342                  * use a different UBO index in between the reference load and
1343                  * any other constant load for the same index, they would break
1344                  * the unifa sequence. We will flag that so we can then move
1345                  * all constant UBO loads for the reference index before these
1346                  * (not just the ones that are out of order) to avoid breaking
1347                  * the sequence and reduce unifa writes.
1348                  */
1349                 if (!nir_src_is_const(intr->src[1])) {
1350                         seq_break = true;
1351                         continue;
1352                 }
1353                 uint32_t offset = nir_src_as_uint(intr->src[1]);
1354 
1355                 assert(nir_src_is_const(intr->src[0]));
1356                 uint32_t index = nir_src_as_uint(intr->src[0]);
1357                 if (index != ref_index) {
1358                        seq_break = true;
1359                        continue;
1360                 }
1361 
1362                 /* Only move loads with an offset that is close enough to the
1363                  * reference offset, since otherwise we would not be able to
1364                  * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
1365                  */
1366                 if (abs((int)(ref_offset - offset)) > MAX_UNIFA_SKIP_DISTANCE)
1367                         continue;
1368 
1369                 /* We will move this load if its offset is smaller than ref's
1370                  * (in which case we will move it before ref) or if the offset
1371                  * is larger than ref's but there are sequence breakers in
1372                  * between (in which case we will move it after ref and
1373                  * before the sequence breakers).
1374                  */
1375                 if (!seq_break && offset >= ref_offset)
1376                         continue;
1377 
1378                 /* Find where exactly we want to move this load:
1379                  *
1380                  * If we are moving it before ref, we want to check any other
1381                  * UBO loads we placed before ref and make sure we insert this
1382                  * one properly ordered with them. Likewise, if we are moving
1383                  * it after ref.
1384                  */
1385                 nir_instr *pos = ref_inst;
1386                 nir_instr *tmp = pos;
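                /* Scan from ref towards inst: backwards if the load is going
                 * before ref, forwards otherwise, advancing pos past every
                 * already-placed load whose offset keeps the sequence ordered.
                 */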
1387                 do {
1388                         if (offset < ref_offset)
1389                                 tmp = nir_instr_prev(tmp);
1390                         else
1391                                 tmp = nir_instr_next(tmp);
1392 
1393                         if (!tmp || tmp == inst)
1394                                 break;
1395 
1396                         /* Ignore non-unifa UBO loads */
1397                         if (tmp->type != nir_instr_type_intrinsic)
1398                                 continue;
1399 
1400                         nir_intrinsic_instr *tmp_intr =
1401                                 nir_instr_as_intrinsic(tmp);
1402                         if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
1403                                 continue;
1404 
1405                         if (nir_src_is_divergent(tmp_intr->src[1]))
1406                                 continue;
1407 
1408                         /* Stop if we find a unifa UBO load that breaks the
1409                          * sequence.
1410                          */
1411                         if (!nir_src_is_const(tmp_intr->src[1]))
1412                                 break;
1413 
1414                         if (nir_src_as_uint(tmp_intr->src[0]) != index)
1415                                 break;
1416 
1417                         uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
1418                         if (offset < ref_offset) {
1419                                 if (tmp_offset < offset ||
1420                                     tmp_offset >= ref_offset) {
1421                                         break;
1422                                 } else {
1423                                         pos = tmp;
1424                                 }
1425                         } else {
1426                                 if (tmp_offset > offset ||
1427                                     tmp_offset <= ref_offset) {
1428                                         break;
1429                                 } else {
1430                                         pos = tmp;
1431                                 }
1432                         }
1433                 } while (true);
1434 
1435                 /* We can't move the UBO load before the instruction that
1436                  * defines its constant offset. If that instruction is placed
1437                  * in between the new location (pos) and the current location
1438                  * of this load, we will have to move that instruction too.
1439                  *
1440                  * We don't care about the UBO index definition because that
1441                  * is optimized to be reused by all UBO loads for the same
1442                  * index and therefore is certain to be defined before the
1443                  * first UBO load that uses it.
1444                  */
1445                 nir_instr *offset_inst = NULL;
1446                 tmp = inst;
1447                 while ((tmp = nir_instr_prev(tmp)) != NULL) {
1448                         if (pos == tmp) {
1449                                 /* We reached the target location without
1450                                  * finding the instruction that defines the
1451                                  * offset, so that instruction must be before
1452                                  * the new position and we don't have to fix it.
1453                                  */
1454                                 break;
1455                         }
1456                         if (intr->src[1].ssa->parent_instr == tmp) {
1457                                 offset_inst = tmp;
1458                                 break;
1459                         }
1460                 }
1461 
1462                 if (offset_inst) {
1463                         exec_node_remove(&offset_inst->node);
1464                         exec_node_insert_node_before(&pos->node,
1465                                                      &offset_inst->node);
1466                 }
1467 
1468                 /* Since we are moving the instruction before its current
1469                  * location, grab its successor before the move so that
1470                  * we can continue the next iteration of the main loop from
1471                  * that instruction.
1472                  */
1473                 next_inst = nir_instr_next(inst);
1474 
1475                 /* Move this load to the selected location */
1476                 exec_node_remove(&inst->node);
1477                 if (offset < ref_offset)
1478                         exec_node_insert_node_before(&pos->node, &inst->node);
1479                 else
1480                         exec_node_insert_after(&pos->node, &inst->node);
1481 
1482                 progress = true;
1483         }
1484 
1485         return progress;
1486 }
1487 
1488 static bool
1489 v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
1490                                       nir_block *block)
1491 {
1492         bool progress = false;
1493         bool local_progress;
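        /* Re-scan the block until a full pass makes no progress, since
         * moving one load can enable further reordering.
         */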
1494         do {
1495                 local_progress = false;
1496                 nir_foreach_instr_safe(inst, block) {
1497                         nir_intrinsic_instr *intr =
1498                                 nir_instr_as_constant_ubo_load(inst);
1499                         if (intr) {
1500                                 local_progress |=
1501                                         v3d_nir_sort_constant_ubo_load(block, intr);
1502                         }
1503                 }
1504                 progress |= local_progress;
1505         } while (local_progress);
1506 
1507         return progress;
1508 }
1509 
1510 /**
1511  * Sorts constant UBO loads in each block by offset to maximize chances of
1512  * skipping unifa writes when converting to VIR. This can increase register
1513  * pressure.
1514  */
1515 static bool
1516 v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
1517 {
1518         nir_foreach_function_impl(impl, s) {
1519                 nir_foreach_block(block, impl) {
1520                         c->sorted_any_ubo_loads |=
1521                                 v3d_nir_sort_constant_ubo_loads_block(c, block);
1522                 }
1523                 nir_metadata_preserve(impl,
1524                                       nir_metadata_block_index |
1525                                       nir_metadata_dominance);
1526         }
1527         return c->sorted_any_ubo_loads;
1528 }
1529 
1530 static void
1531 lower_load_num_subgroups(struct v3d_compile *c,
1532                          nir_builder *b,
1533                          nir_intrinsic_instr *intr)
1534 {
1535         assert(c->s->info.stage == MESA_SHADER_COMPUTE);
1536         assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
1537 
1538         b->cursor = nir_after_instr(&intr->instr);
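        /* The number of subgroups is the total workgroup size divided by
         * the V3D subgroup size (V3D_CHANNELS), rounded up.
         */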
1539         uint32_t num_subgroups =
1540                 DIV_ROUND_UP(c->s->info.workgroup_size[0] *
1541                              c->s->info.workgroup_size[1] *
1542                              c->s->info.workgroup_size[2], V3D_CHANNELS);
1543         nir_def *result = nir_imm_int(b, num_subgroups);
1544         nir_def_rewrite_uses(&intr->def, result);
1545         nir_instr_remove(&intr->instr);
1546 }
1547 
1548 static bool
1549 lower_subgroup_intrinsics(struct v3d_compile *c,
1550                           nir_block *block, nir_builder *b)
1551 {
1552         bool progress = false;
1553         nir_foreach_instr_safe(inst, block) {
1554                 if (inst->type != nir_instr_type_intrinsic)
1555                         continue;
1556 
1557                 nir_intrinsic_instr *intr =
1558                         nir_instr_as_intrinsic(inst);
1559                 if (!intr)
1560                         continue;
1561 
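                /* load_num_subgroups is folded to a constant here; the other
                 * subgroup intrinsics merely flag that the shader uses
                 * subgroup operations.
                 */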
1562                 switch (intr->intrinsic) {
1563                 case nir_intrinsic_load_num_subgroups:
1564                         lower_load_num_subgroups(c, b, intr);
1565                         progress = true;
1566                         FALLTHROUGH;
1567                 case nir_intrinsic_load_subgroup_id:
1568                 case nir_intrinsic_load_subgroup_size:
1569                 case nir_intrinsic_load_subgroup_invocation:
1570                 case nir_intrinsic_elect:
1571                 case nir_intrinsic_ballot:
1572                 case nir_intrinsic_inverse_ballot:
1573                 case nir_intrinsic_ballot_bitfield_extract:
1574                 case nir_intrinsic_ballot_bit_count_reduce:
1575                 case nir_intrinsic_ballot_find_lsb:
1576                 case nir_intrinsic_ballot_find_msb:
1577                 case nir_intrinsic_ballot_bit_count_exclusive:
1578                 case nir_intrinsic_ballot_bit_count_inclusive:
1579                 case nir_intrinsic_reduce:
1580                 case nir_intrinsic_inclusive_scan:
1581                 case nir_intrinsic_exclusive_scan:
1582                 case nir_intrinsic_read_invocation:
1583                 case nir_intrinsic_read_first_invocation:
1584                 case nir_intrinsic_load_subgroup_eq_mask:
1585                 case nir_intrinsic_load_subgroup_ge_mask:
1586                 case nir_intrinsic_load_subgroup_gt_mask:
1587                 case nir_intrinsic_load_subgroup_le_mask:
1588                 case nir_intrinsic_load_subgroup_lt_mask:
1589                 case nir_intrinsic_shuffle:
1590                 case nir_intrinsic_shuffle_xor:
1591                 case nir_intrinsic_shuffle_up:
1592                 case nir_intrinsic_shuffle_down:
1593                 case nir_intrinsic_vote_all:
1594                 case nir_intrinsic_vote_any:
1595                 case nir_intrinsic_vote_feq:
1596                 case nir_intrinsic_vote_ieq:
1597                 case nir_intrinsic_quad_broadcast:
1598                 case nir_intrinsic_quad_swap_horizontal:
1599                 case nir_intrinsic_quad_swap_vertical:
1600                 case nir_intrinsic_quad_swap_diagonal:
1601                         c->has_subgroups = true;
1602                         break;
1603                 default:
1604                         break;
1605                 }
1606         }
1607 
1608         return progress;
1609 }
1610 
1611 static bool
1612 v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
1613 {
1614         bool progress = false;
1615         nir_foreach_function_impl(impl, s) {
1616                 nir_builder b = nir_builder_create(impl);
1617 
1618                 nir_foreach_block(block, impl)
1619                         progress |= lower_subgroup_intrinsics(c, block, &b);
1620 
1621                 nir_metadata_preserve(impl,
1622                                       nir_metadata_block_index |
1623                                       nir_metadata_dominance);
1624         }
1625         return progress;
1626 }
1627 
1628 static void
1629 v3d_attempt_compile(struct v3d_compile *c)
1630 {
1631         switch (c->s->info.stage) {
1632         case MESA_SHADER_VERTEX:
1633                 c->vs_key = (struct v3d_vs_key *) c->key;
1634                 break;
1635         case MESA_SHADER_GEOMETRY:
1636                 c->gs_key = (struct v3d_gs_key *) c->key;
1637                 break;
1638         case MESA_SHADER_FRAGMENT:
1639                 c->fs_key = (struct v3d_fs_key *) c->key;
1640                 break;
1641         case MESA_SHADER_COMPUTE:
1642                 break;
1643         default:
1644                 unreachable("unsupported shader stage");
1645         }
1646 
1647         switch (c->s->info.stage) {
1648         case MESA_SHADER_VERTEX:
1649                 v3d_nir_lower_vs_early(c);
1650                 break;
1651         case MESA_SHADER_GEOMETRY:
1652                 v3d_nir_lower_gs_early(c);
1653                 break;
1654         case MESA_SHADER_FRAGMENT:
1655                 v3d_nir_lower_fs_early(c);
1656                 break;
1657         default:
1658                 break;
1659         }
1660 
1661         v3d_lower_nir(c);
1662 
1663         switch (c->s->info.stage) {
1664         case MESA_SHADER_VERTEX:
1665                 v3d_nir_lower_vs_late(c);
1666                 break;
1667         case MESA_SHADER_GEOMETRY:
1668                 v3d_nir_lower_gs_late(c);
1669                 break;
1670         case MESA_SHADER_FRAGMENT:
1671                 v3d_nir_lower_fs_late(c);
1672                 break;
1673         default:
1674                 break;
1675         }
1676 
1677         NIR_PASS(_, c->s, v3d_nir_lower_io, c);
1678         NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
1679         NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
1680 
1681         NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
1682         nir_lower_idiv_options idiv_options = {
1683                 .allow_fp16 = true,
1684         };
1685         NIR_PASS(_, c->s, nir_lower_idiv, &idiv_options);
1686         NIR_PASS(_, c->s, nir_lower_alu);
1687 
1688         if (c->key->robust_uniform_access || c->key->robust_storage_access ||
1689             c->key->robust_image_access) {
1690                 /* nir_lower_robust_access assumes constant buffer
1691                  * indices on ubo/ssbo intrinsics, so run the copy propagation
1692                  * and constant folding passes before the lowering to guarantee
1693                  * this. We also want to run the lowering before v3d_optimize to
1694                  * clean up redundant get_buffer_size calls produced by the pass.
1695                  */
1696                 NIR_PASS(_, c->s, nir_copy_prop);
1697                 NIR_PASS(_, c->s, nir_opt_constant_folding);
1698 
1699                 nir_lower_robust_access_options opts = {
1700                    .lower_image = c->key->robust_image_access,
1701                    .lower_ssbo = c->key->robust_storage_access,
1702                    .lower_ubo = c->key->robust_uniform_access,
1703                 };
1704 
1705                 NIR_PASS(_, c->s, nir_lower_robust_access, &opts);
1706         }
1707 
1708         NIR_PASS(_, c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
1709 
1710         NIR_PASS(_, c->s, v3d_nir_lower_load_store_bitsize);
1711 
1712         NIR_PASS(_, c->s, v3d_nir_lower_subgroup_intrinsics, c);
1713 
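        /* Lower the remaining subgroup operations. A V3D subgroup is a
         * single QPU (V3D_CHANNELS lanes), so one 32-bit scalar ballot
         * component is sufficient.
         */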
1714         const nir_lower_subgroups_options subgroup_opts = {
1715                 .subgroup_size = V3D_CHANNELS,
1716                 .ballot_components = 1,
1717                 .ballot_bit_size = 32,
1718                 .lower_to_scalar = true,
1719                 .lower_inverse_ballot = true,
1720                 .lower_subgroup_masks = true,
1721                 .lower_relative_shuffle = true,
1722                 .lower_quad = true,
1723         };
1724         NIR_PASS(_, c->s, nir_lower_subgroups, &subgroup_opts);
1725 
1726         v3d_optimize_nir(c, c->s);
1727 
1728         /* Do late algebraic optimization to turn add(a, neg(b)) back into
1729          * subs, then the mandatory cleanup after algebraic.  Note that it may
1730          * produce fnegs, and if so then we need to keep running to squash
1731          * fneg(fneg(a)).
1732          */
1733         bool more_late_algebraic = true;
1734         while (more_late_algebraic) {
1735                 more_late_algebraic = false;
1736                 NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
1737                 NIR_PASS(_, c->s, nir_opt_constant_folding);
1738                 NIR_PASS(_, c->s, nir_copy_prop);
1739                 NIR_PASS(_, c->s, nir_opt_dce);
1740                 NIR_PASS(_, c->s, nir_opt_cse);
1741         }
1742 
1743         NIR_PASS(_, c->s, nir_lower_bool_to_int32);
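        /* Convert to LCSSA and compute divergence information before going
         * out of SSA, so later stages know which values are uniform across
         * the subgroup.
         */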
1744         NIR_PASS(_, c->s, nir_convert_to_lcssa, true, true);
1745         NIR_PASS_V(c->s, nir_divergence_analysis);
1746         NIR_PASS(_, c->s, nir_convert_from_ssa, true);
1747 
1748         struct nir_schedule_options schedule_options = {
1749                 /* Schedule for about half our register space, to enable more
1750                  * shaders to hit 4 threads.
1751                  */
1752                 .threshold = c->threads == 4 ? 24 : 48,
1753 
1754                 /* Vertex shaders share the same memory for inputs and
1755                  * outputs; fragment and geometry shaders do not.
1756                  */
1757                 .stages_with_shared_io_memory =
1758                 (((1 << MESA_ALL_SHADER_STAGES) - 1) &
1759                  ~((1 << MESA_SHADER_FRAGMENT) |
1760                    (1 << MESA_SHADER_GEOMETRY))),
1761 
1762                 .fallback = c->fallback_scheduler,
1763 
1764                 .intrinsic_cb = v3d_intrinsic_dependency_cb,
1765                 .intrinsic_cb_data = c,
1766 
1767                 .instr_delay_cb = v3d_instr_delay_cb,
1768                 .instr_delay_cb_data = c,
1769         };
1770         NIR_PASS_V(c->s, nir_schedule, &schedule_options);
1771 
1772         if (!c->disable_constant_ubo_load_sorting)
1773                 NIR_PASS(_, c->s, v3d_nir_sort_constant_ubo_loads, c);
1774 
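        /* Sink uniform/constant loads (and, for strategies that enable
         * buffer load moving, UBO/SSBO loads) closer to their uses to
         * shorten live ranges.
         */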
1775         const nir_move_options buffer_opts = c->move_buffer_loads ?
1776                 (nir_move_load_ubo | nir_move_load_ssbo) : 0;
1777         NIR_PASS(_, c->s, nir_opt_move, nir_move_load_uniform |
1778                                         nir_move_const_undef |
1779                                         buffer_opts);
1780 
1781         NIR_PASS_V(c->s, nir_trivialize_registers);
1782 
1783         v3d_nir_to_vir(c);
1784 }
1785 
1786 uint32_t
1787 v3d_prog_data_size(gl_shader_stage stage)
1788 {
1789         static const int prog_data_size[] = {
1790                 [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
1791                 [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
1792                 [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
1793                 [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
1794         };
1795 
1796         assert(stage >= 0 &&
1797                stage < ARRAY_SIZE(prog_data_size) &&
1798                prog_data_size[stage]);
1799 
1800         return prog_data_size[stage];
1801 }
1802 
1803 int v3d_shaderdb_dump(struct v3d_compile *c,
1804 		      char **shaderdb_str)
1805 {
1806         if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
1807                 return -1;
1808 
1809         return asprintf(shaderdb_str,
1810                         "%s shader: %d inst, %d threads, %d loops, "
1811                         "%d uniforms, %d max-temps, %d:%d spills:fills, "
1812                         "%d sfu-stalls, %d inst-and-stalls, %d nops",
1813                         vir_get_stage_name(c),
1814                         c->qpu_inst_count,
1815                         c->threads,
1816                         c->loops,
1817                         c->num_uniforms,
1818                         vir_get_max_temps(c),
1819                         c->spills,
1820                         c->fills,
1821                         c->qpu_inst_stalled_count,
1822                         c->qpu_inst_count + c->qpu_inst_stalled_count,
1823                         c->nop_count);
1824 }
1825 
1826 /* This is a list of incremental changes to the compilation strategy
1827  * that will be used to try to compile the shader successfully. The
1828  * default strategy enables all optimizations; it has the highest
1829  * register pressure but is expected to produce the best code.
1830  * Subsequent strategies incrementally disable specific optimizations
1831  * that are known to contribute to register pressure so that we can
1832  * still compile the shader successfully while meeting the thread
1833  * count requirements.
1834  *
1835  * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
1836  * cover previous hardware (meaning that we are not limiting register
1837  * allocation to any particular thread count). This is fine because
1838  * v3d_nir_to_vir will cap this to the actual minimum.
1839  */
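/* Columns after the strategy name: max threads, min threads, the flags that
 * disable (or, for buffer-load moving, enable) the optimizations named in
 * each description, and the max_tmu_spills budget consulted by
 * skip_compile_strategy() below.
 */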
1840 static const struct v3d_compiler_strategy strategies[] = {
1841         /*0*/  { "default",                        4, 4, false, false, false, false, false, false,  0 },
1842         /*1*/  { "disable general TMU sched",      4, 4, true,  false, false, false, false, false,  0 },
1843         /*2*/  { "disable gcm",                    4, 4, true,  true,  false, false, false, false,  0 },
1844         /*3*/  { "disable loop unrolling",         4, 4, true,  true,  true,  false, false, false,  0 },
1845         /*4*/  { "disable UBO load sorting",       4, 4, true,  true,  true,  true,  false, false,  0 },
1846         /*5*/  { "disable TMU pipelining",         4, 4, true,  true,  true,  true,  false, true,   0 },
1847         /*6*/  { "lower thread count",             2, 1, false, false, false, false, false, false, -1 },
1848         /*7*/  { "disable general TMU sched (2t)", 2, 1, true,  false, false, false, false, false, -1 },
1849         /*8*/  { "disable gcm (2t)",               2, 1, true,  true,  false, false, false, false, -1 },
1850         /*9*/  { "disable loop unrolling (2t)",    2, 1, true,  true,  true,  false, false, false, -1 },
1851         /*10*/ { "Move buffer loads (2t)",         2, 1, true,  true,  true,  true,  true,  false, -1 },
1852         /*11*/ { "disable TMU pipelining (2t)",    2, 1, true,  true,  true,  true,  true,  true,  -1 },
1853         /*12*/ { "fallback scheduler",             2, 1, true,  true,  true,  true,  true,  true,  -1 }
1854 };
1855 
1856 /**
1857  * If a particular optimization didn't make any progress during a compile
1858  * attempt, disabling it alone won't allow us to compile the shader
1859  * successfully, since we'll end up with the same code. Detect these
1860  * scenarios so we can avoid wasting time with useless compiles. We should
1861  * also consider whether the strategy changes other aspects of the
1862  * compilation process, like spilling, and not skip it in that case.
1863  */
1864 static bool
1865 skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
1866 {
1867    /* We decide if we can skip a strategy based on the optimizations that
1868     * were active in the previous strategy, so we should only be calling this
1869     * for strategies after the first.
1870     */
1871    assert(idx > 0);
1872 
1873    /* Don't skip a strategy that changes spilling behavior */
1874    if (strategies[idx].max_tmu_spills !=
1875        strategies[idx - 1].max_tmu_spills) {
1876            return false;
1877    }
1878 
1879    switch (idx) {
1880    /* General TMU sched.: skip if we didn't emit any TMU loads */
1881    case 1:
1882    case 7:
1883            return !c->has_general_tmu_load;
1884    /* Global code motion: skip if nir_opt_gcm didn't make any progress */
1885    case 2:
1886    case 8:
1887            return !c->gcm_progress;
1888    /* Loop unrolling: skip if we didn't unroll any loops */
1889    case 3:
1890    case 9:
1891            return !c->unrolled_any_loops;
1892    /* UBO load sorting: skip if we didn't sort any loads */
1893    case 4:
1894            return !c->sorted_any_ubo_loads;
1895    /* Move buffer loads: we assume any shader with difficult RA
1896     * most likely has UBO / SSBO loads so we never try to skip.
1897     * For now, we only try this for 2-thread compiles since it
1898     * is expected to impact instruction counts and latency.
1899     */
1900    case 10:
1901           assert(c->threads < 4);
1902           return false;
1903    /* TMU pipelining: skip if we didn't pipeline any TMU ops */
1904    case 5:
1905    case 11:
1906            return !c->pipelined_any_tmu;
1907    /* Lower thread count: skip if we already tried fewer than 4 threads */
1908    case 6:
1909           return c->threads < 4;
1910    default:
1911            return false;
1912    };
1913 }
1914 
1915 static inline void
1916 set_best_compile(struct v3d_compile **best, struct v3d_compile *c)
1917 {
1918    if (*best)
1919       vir_compile_destroy(*best);
1920    *best = c;
1921 }
1922 
1923 uint64_t *v3d_compile(const struct v3d_compiler *compiler,
1924                       struct v3d_key *key,
1925                       struct v3d_prog_data **out_prog_data,
1926                       nir_shader *s,
1927                       void (*debug_output)(const char *msg,
1928                                            void *debug_output_data),
1929                       void *debug_output_data,
1930                       int program_id, int variant_id,
1931                       uint32_t *final_assembly_size)
1932 {
1933         struct v3d_compile *c = NULL;
1934 
1935         uint32_t best_spill_fill_count = UINT32_MAX;
1936         struct v3d_compile *best_c = NULL;
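        /* Try each strategy in turn: stop at the first compile that needs no
         * spills (or any successful 4-thread compile), otherwise keep the
         * result with the fewest spills and fills as the fallback.
         */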
1937         for (int32_t strat = 0; strat < ARRAY_SIZE(strategies); strat++) {
1938                 /* Fallback strategy */
1939                 if (strat > 0) {
1940                         assert(c);
1941                         if (skip_compile_strategy(c, strat))
1942                                 continue;
1943 
1944                         char *debug_msg;
1945                         int ret = asprintf(&debug_msg,
1946                                            "Falling back to strategy '%s' "
1947                                            "for %s prog %d/%d",
1948                                            strategies[strat].name,
1949                                            vir_get_stage_name(c),
1950                                            c->program_id, c->variant_id);
1951 
1952                         if (ret >= 0) {
1953                                 if (V3D_DBG(PERF))
1954                                         fprintf(stderr, "%s\n", debug_msg);
1955 
1956                                 c->debug_output(debug_msg, c->debug_output_data);
1957                                 free(debug_msg);
1958                         }
1959 
1960                         if (c != best_c)
1961                                 vir_compile_destroy(c);
1962                 }
1963 
1964                 c = vir_compile_init(compiler, key, s,
1965                                      debug_output, debug_output_data,
1966                                      program_id, variant_id,
1967                                      strat, &strategies[strat],
1968                                      strat == ARRAY_SIZE(strategies) - 1);
1969 
1970                 v3d_attempt_compile(c);
1971 
1972                 /* Broken shader or driver bug */
1973                 if (c->compilation_result == V3D_COMPILATION_FAILED)
1974                         break;
1975 
1976                 /* If we compiled without spills, choose this.
1977                  * Otherwise if this is a 4-thread compile, choose this (these
1978                  * have a very low cap on the allowed TMU spills so we assume
1979                  * it will be better than a 2-thread compile without spills).
1980                  * Otherwise, keep going while tracking the strategy with the
1981                  * lowest spill count.
1982                  */
1983                 if (c->compilation_result == V3D_COMPILATION_SUCCEEDED) {
1984                         if (c->spills == 0 ||
1985                             strategies[strat].min_threads == 4 ||
1986                             V3D_DBG(OPT_COMPILE_TIME)) {
1987                                 set_best_compile(&best_c, c);
1988                                 break;
1989                         } else if (c->spills + c->fills <
1990                                    best_spill_fill_count) {
1991                                 set_best_compile(&best_c, c);
1992                                 best_spill_fill_count = c->spills + c->fills;
1993                         }
1994 
1995                         if (V3D_DBG(PERF)) {
1996                                 char *debug_msg;
1997                                 int ret = asprintf(&debug_msg,
1998                                                    "Compiled %s prog %d/%d with %d "
1999                                                    "spills and %d fills. Will try "
2000                                                    "more strategies.",
2001                                                    vir_get_stage_name(c),
2002                                                    c->program_id, c->variant_id,
2003                                                    c->spills, c->fills);
2004                                 if (ret >= 0) {
2005                                         fprintf(stderr, "%s\n", debug_msg);
2006                                         c->debug_output(debug_msg, c->debug_output_data);
2007                                         free(debug_msg);
2008                                 }
2009                         }
2010                 }
2011 
2012                 /* Only try the next strategy if we failed to register
2013                  * allocate or we had to spill.
2014                  */
2015                 assert(c->compilation_result ==
2016                        V3D_COMPILATION_FAILED_REGISTER_ALLOCATION ||
2017                        c->spills > 0);
2018         }
2019 
2020         /* If the best strategy was not the last, choose that */
2021         if (best_c && c != best_c)
2022                 set_best_compile(&c, best_c);
2023 
2024         if (V3D_DBG(PERF) &&
2025             c->compilation_result !=
2026             V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
2027             c->spills > 0) {
2028                 char *debug_msg;
2029                 int ret = asprintf(&debug_msg,
2030                                    "Compiled %s prog %d/%d with %d "
2031                                    "spills and %d fills",
2032                                    vir_get_stage_name(c),
2033                                    c->program_id, c->variant_id,
2034                                    c->spills, c->fills);
2035                 if (ret >= 0) {
2036                         fprintf(stderr, "%s\n", debug_msg);
2037 
2038                         c->debug_output(debug_msg, c->debug_output_data);
2039                         free(debug_msg);
2040                 }
2041         }
2042 
2043         if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
2044                 fprintf(stderr, "Failed to compile %s prog %d/%d "
2045                         "with any strategy.\n",
2046                         vir_get_stage_name(c), c->program_id, c->variant_id);
2047 
2048                 vir_compile_destroy(c);
2049                 return NULL;
2050         }
2051 
2052         struct v3d_prog_data *prog_data;
2053 
2054         prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
2055 
2056         v3d_set_prog_data(c, prog_data);
2057 
2058         *out_prog_data = prog_data;
2059 
2060         char *shaderdb;
2061         int ret = v3d_shaderdb_dump(c, &shaderdb);
2062         if (ret >= 0) {
2063                 if (V3D_DBG(SHADERDB))
2064                         fprintf(stderr, "SHADER-DB-%s - %s\n", s->info.name, shaderdb);
2065 
2066                 c->debug_output(shaderdb, c->debug_output_data);
2067                 free(shaderdb);
2068         }
2069 
2070         return v3d_return_qpu_insts(c, final_assembly_size);
2071 }
2072 
2073 void
2074 vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
2075 {
2076         if (qinst->dst.file == QFILE_TEMP)
2077                 c->defs[qinst->dst.index] = NULL;
2078 
2079         assert(&qinst->link != c->cursor.link);
2080 
2081         list_del(&qinst->link);
2082         free(qinst);
2083 
2084         c->live_intervals_valid = false;
2085 }
2086 
2087 struct qreg
2088 vir_follow_movs(struct v3d_compile *c, struct qreg reg)
2089 {
2090         /* XXX
2091         int pack = reg.pack;
2092 
2093         while (reg.file == QFILE_TEMP &&
2094                c->defs[reg.index] &&
2095                (c->defs[reg.index]->op == QOP_MOV ||
2096                 c->defs[reg.index]->op == QOP_FMOV) &&
2097                !c->defs[reg.index]->dst.pack &&
2098                !c->defs[reg.index]->src[0].pack) {
2099                 reg = c->defs[reg.index]->src[0];
2100         }
2101 
2102         reg.pack = pack;
2103         */
2104         return reg;
2105 }
2106 
2107 void
2108 vir_compile_destroy(struct v3d_compile *c)
2109 {
2110         /* Defuse the assert that we aren't removing the cursor's instruction.
2111          */
2112         c->cursor.link = NULL;
2113 
2114         vir_for_each_block(block, c) {
2115                 while (!list_is_empty(&block->instructions)) {
2116                         struct qinst *qinst =
2117                                 list_first_entry(&block->instructions,
2118                                                  struct qinst, link);
2119                         vir_remove_instruction(c, qinst);
2120                 }
2121         }
2122 
2123         ralloc_free(c);
2124 }
2125 
2126 uint32_t
2127 vir_get_uniform_index(struct v3d_compile *c,
2128                       enum quniform_contents contents,
2129                       uint32_t data)
2130 {
2131         for (int i = 0; i < c->num_uniforms; i++) {
2132                 if (c->uniform_contents[i] == contents &&
2133                     c->uniform_data[i] == data) {
2134                         return i;
2135                 }
2136         }
2137 
2138         uint32_t uniform = c->num_uniforms++;
2139 
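        /* Grow the uniform arrays on demand: at least 16 entries, and at
         * least double the previous size.
         */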
2140         if (uniform >= c->uniform_array_size) {
2141                 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
2142                                              c->uniform_array_size * 2);
2143 
2144                 c->uniform_data = reralloc(c, c->uniform_data,
2145                                            uint32_t,
2146                                            c->uniform_array_size);
2147                 c->uniform_contents = reralloc(c, c->uniform_contents,
2148                                                enum quniform_contents,
2149                                                c->uniform_array_size);
2150         }
2151 
2152         c->uniform_contents[uniform] = contents;
2153         c->uniform_data[uniform] = data;
2154 
2155         return uniform;
2156 }
2157 
2158 /* Looks back into the current block to find the ldunif that wrote the uniform
2159  * at the requested index. If it finds it, it returns true and writes the
2160  * destination register of the ldunif instruction to 'unif'.
2161  *
2162  * This can impact register pressure and end up leading to worse code, so we
2163  * limit the number of instructions we are willing to look back through to
2164  * strike a good balance.
2165  */
2166 static bool
2167 try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
2168 {
2169         uint32_t count = 20;
2170         struct qinst *prev_inst = NULL;
2171         assert(c->cur_block);
2172 
2173 #ifdef DEBUG
2174         /* We can only reuse a uniform if it was emitted in the same block,
2175          * so callers must make sure the current instruction is being emitted
2176          * in the current block.
2177          */
2178         bool found = false;
2179         vir_for_each_inst(inst, c->cur_block) {
2180                 if (&inst->link == c->cursor.link) {
2181                         found = true;
2182                         break;
2183                 }
2184         }
2185 
2186         assert(found || &c->cur_block->instructions == c->cursor.link);
2187 #endif
2188 
2189         list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
2190                                      &c->cur_block->instructions, link) {
2191                 if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
2192                     inst->uniform == index) {
2193                         prev_inst = inst;
2194                         break;
2195                 }
2196 
2197                 if (--count == 0)
2198                         break;
2199         }
2200 
2201         if (!prev_inst)
2202                 return false;
2203 
2204         /* Only reuse the ldunif result if it was written to a temp register,
2205          * otherwise there may be special restrictions (for example, ldunif
2206          * may write directly to unifa, which is a write-only register).
2207          */
2208         if (prev_inst->dst.file != QFILE_TEMP)
2209                 return false;
2210 
2211         list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
2212                                  &c->cur_block->instructions, link) {
2213                 if (inst->dst.file == prev_inst->dst.file &&
2214                     inst->dst.index == prev_inst->dst.index) {
2215                         return false;
2216                 }
2217         }
2218 
2219         *unif = prev_inst->dst;
2220         return true;
2221 }
2222 
2223 struct qreg
2224 vir_uniform(struct v3d_compile *c,
2225             enum quniform_contents contents,
2226             uint32_t data)
2227 {
2228         const int num_uniforms = c->num_uniforms;
2229         const int index = vir_get_uniform_index(c, contents, data);
2230 
2231         /* If this is not the first time we see this uniform try to reuse the
2232          * result of the last ldunif that loaded it.
2233          */
2234         const bool is_new_uniform = num_uniforms != c->num_uniforms;
2235         if (!is_new_uniform && !c->disable_ldunif_opt) {
2236                 struct qreg ldunif_dst;
2237                 if (try_opt_ldunif(c, index, &ldunif_dst))
2238                         return ldunif_dst;
2239         }
2240 
2241         struct qinst *inst = vir_NOP(c);
2242         inst->qpu.sig.ldunif = true;
2243         inst->uniform = index;
2244         inst->dst = vir_get_temp(c);
2245         c->defs[inst->dst.index] = inst;
2246         return inst->dst;
2247 }
2248 
2249 #define OPTPASS(func)                                                   \
2250         do {                                                            \
2251                 bool stage_progress = func(c);                          \
2252                 if (stage_progress) {                                   \
2253                         progress = true;                                \
2254                         if (print_opt_debug) {                          \
2255                                 fprintf(stderr,                         \
2256                                         "VIR opt pass %2d: %s progress\n", \
2257                                         pass, #func);                   \
2258                         }                                               \
2259                         /*XXX vir_validate(c);*/                        \
2260                 }                                                       \
2261         } while (0)
2262 
2263 void
2264 vir_optimize(struct v3d_compile *c)
2265 {
2266         bool print_opt_debug = false;
2267         int pass = 1;
2268 
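        /* Run the VIR peephole passes repeatedly until none of them makes
         * any further progress.
         */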
2269         while (true) {
2270                 bool progress = false;
2271 
2272                 OPTPASS(vir_opt_copy_propagate);
2273                 OPTPASS(vir_opt_redundant_flags);
2274                 OPTPASS(vir_opt_dead_code);
2275                 OPTPASS(vir_opt_small_immediates);
2276                 OPTPASS(vir_opt_constant_alu);
2277 
2278                 if (!progress)
2279                         break;
2280 
2281                 pass++;
2282         }
2283 }
2284 
2285 const char *
2286 vir_get_stage_name(struct v3d_compile *c)
2287 {
2288         if (c->vs_key && c->vs_key->is_coord)
2289                 return "MESA_SHADER_VERTEX_BIN";
2290         else if (c->gs_key && c->gs_key->is_coord)
2291                 return "MESA_SHADER_GEOMETRY_BIN";
2292         else
2293                 return gl_shader_stage_name(c->s->info.stage);
2294 }
2295 
2296 static inline uint32_t
2297 compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
2298 {
2299    assert(devinfo->vpm_size > 0);
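   /* A sector is 8 rows of V3D_CHANNELS 32-bit values (512 bytes). */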
2300    const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
2301    return devinfo->vpm_size / sector_size;
2302 }
2303 
2304 /* Computes various parameters affecting VPM memory configuration for programs
2305  * involving geometry shaders to ensure the program fits in memory and honors
2306  * requirements described in section "VPM usage" of the programming manual.
2307  */
2308 static bool
2309 compute_vpm_config_gs(struct v3d_device_info *devinfo,
2310                       struct v3d_vs_prog_data *vs,
2311                       struct v3d_gs_prog_data *gs,
2312                       struct vpm_config *vpm_cfg_out)
2313 {
2314    const uint32_t A = vs->separate_segments ? 1 : 0;
2315    const uint32_t Ad = vs->vpm_input_size;
2316    const uint32_t Vd = vs->vpm_output_size;
2317 
2318    const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
2319 
2320    /* Try to fit program into our VPM memory budget by adjusting
2321     * configurable parameters iteratively. We do this in two phases:
2322     * the first phase tries to fit the program into the total available
2323     * VPM memory. If we succeed at that, then the second phase attempts
2324     * to fit the program into half of that budget so we can run bin and
2325     * render programs in parallel.
2326     */
2327    struct vpm_config vpm_cfg[2];
2328    struct vpm_config *final_vpm_cfg = NULL;
2329    uint32_t phase = 0;
2330 
2331    vpm_cfg[phase].As = 1;
2332    vpm_cfg[phase].Gs = 1;
2333    vpm_cfg[phase].Gd = gs->vpm_output_size;
2334    vpm_cfg[phase].gs_width = gs->simd_width;
2335 
2336    /* While there is a requirement that Vc >= [Vn / 16], this is
2337     * always the case when tessellation is not present because in that
2338     * case Vn can only be 6 at most (when input primitive is triangles
2339     * with adjacency).
2340     *
2341     * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
2342     * and Broadcom has not found it worth increasing it beyond this in
2343     * general. Increasing Vc also increases VPM memory pressure, which
2344     * can end up being detrimental to performance in some scenarios.
2345     */
2346    vpm_cfg[phase].Vc = 2;
2347 
2348    /* Gv is a constraint on the hardware to not exceed the
2349     * specified number of vertex segments per GS batch. If adding a
2350     * new primitive to a GS batch would result in a range of more
2351     * than Gv vertex segments being referenced by the batch, then
2352     * the hardware will flush the batch and start a new one. This
2353     * means that we can choose any value we want, we just need to
2354     * be aware that larger values improve GS batch utilization
2355     * at the expense of more VPM memory pressure (which can affect
2356     * other performance aspects, such as GS dispatch width).
2357     * We start with the largest value, and will reduce it if we
2358     * find that total memory pressure is too high.
2359     */
2360    vpm_cfg[phase].Gv = 3;
2361    do {
2362       /* When GS is present in absence of TES, then we need to satisfy
2363        * that Ve >= Gv. We go with the smallest value of Ve to avoid
2364        * increasing memory pressure.
2365        */
2366       vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
2367 
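      /* Total VPM usage: VS input segments (A * As * Ad), VS output
       * segments ((Vc + Ve) * Vd) and GS output segments (Gs * Gd).
       */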
2368       uint32_t vpm_sectors =
2369          A * vpm_cfg[phase].As * Ad +
2370          (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
2371          vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
2372 
2373       /* Ideally we want to use no more than half of the available
2374        * memory so we can execute a bin and render program in parallel
2375        * without stalls. If we achieved that then we are done.
2376        */
2377       if (vpm_sectors <= vpm_size / 2) {
2378          final_vpm_cfg = &vpm_cfg[phase];
2379          break;
2380       }
2381 
2382       /* At the very least, we should not allocate more than the
2383        * total available VPM memory. If we have a configuration that
2384        * succeeds at this we save it and continue to see if we can
2385        * meet the half-memory-use criteria too.
2386        */
2387       if (phase == 0 && vpm_sectors <= vpm_size) {
2388          vpm_cfg[1] = vpm_cfg[0];
2389          phase = 1;
2390       }
2391 
2392       /* Try lowering Gv */
2393       if (vpm_cfg[phase].Gv > 0) {
2394          vpm_cfg[phase].Gv--;
2395          continue;
2396       }
2397 
2398       /* Try lowering GS dispatch width */
2399       if (vpm_cfg[phase].gs_width > 1) {
2400          do {
2401             vpm_cfg[phase].gs_width >>= 1;
2402             vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
2403          } while (vpm_cfg[phase].gs_width == 2);
2404 
2405          /* Reset Gv to max after dropping dispatch width */
2406          vpm_cfg[phase].Gv = 3;
2407          continue;
2408       }
2409 
2410       /* We ran out of options to reduce memory pressure. If we
2411        * are at phase 1 we have at least a valid configuration, so
2412        * we use that.
2413        */
2414       if (phase == 1)
2415          final_vpm_cfg = &vpm_cfg[0];
2416       break;
2417    } while (true);
2418 
2419    if (!final_vpm_cfg)
2420       return false;
2421 
2422    assert(final_vpm_cfg);
2423    assert(final_vpm_cfg->Gd <= 16);
2424    assert(final_vpm_cfg->Gv < 4);
2425    assert(final_vpm_cfg->Ve < 4);
2426    assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
2427    assert(final_vpm_cfg->gs_width == 1 ||
2428           final_vpm_cfg->gs_width == 4 ||
2429           final_vpm_cfg->gs_width == 8 ||
2430           final_vpm_cfg->gs_width == 16);
2431 
2432    *vpm_cfg_out = *final_vpm_cfg;
2433    return true;
2434 }
2435 
2436 bool
2437 v3d_compute_vpm_config(struct v3d_device_info *devinfo,
2438                        struct v3d_vs_prog_data *vs_bin,
2439                        struct v3d_vs_prog_data *vs,
2440                        struct v3d_gs_prog_data *gs_bin,
2441                        struct v3d_gs_prog_data *gs,
2442                        struct vpm_config *vpm_cfg_bin,
2443                        struct vpm_config *vpm_cfg)
2444 {
2445    assert(vs && vs_bin);
2446    assert((gs != NULL) == (gs_bin != NULL));
2447 
2448    if (!gs) {
2449       vpm_cfg_bin->As = 1;
2450       vpm_cfg_bin->Ve = 0;
2451       vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
2452 
2453       vpm_cfg->As = 1;
2454       vpm_cfg->Ve = 0;
2455       vpm_cfg->Vc = vs->vcm_cache_size;
2456    } else {
2457       if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
2458          return false;
2459 
2460       if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
2461          return false;
2462    }
2463 
2464    return true;
2465 }
2466