1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "compiler.h"
28 #include "bi_builder.h"
29 
30 /* Arguments common to worklist, passed by value for convenience */
31 
32 struct bi_worklist {
33         /* # of instructions in the block */
34         unsigned count;
35 
36         /* Instructions in the block */
37         bi_instr **instructions;
38 
39         /* Bitset of instructions in the block ready for scheduling */
40         BITSET_WORD *worklist;
41 
42         /* The backwards dependency graph. nr_dependencies is the number of
43          * unscheduled instructions that must still be scheduled after (before)
44          * this instruction. dependents are which instructions need to be
45          * scheduled before (after) this instruction. */
46         unsigned *dep_counts;
47         BITSET_WORD **dependents;
48 };
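/* Illustrative sketch (not part of the pass) of how these fields are used,
 * assuming a two-instruction block where I0 writes r0 and I1 reads it:
 *
 *      I0: r0 = ...
 *      I1: ... = r0
 *
 * Scheduling is bottom-up, so the graph is built backwards:
 * BITSET_TEST(dependents[1], 0) is set and dep_counts[0] == 1. Only I1
 * starts on the worklist; scheduling I1 drops dep_counts[0] to zero and
 * releases I0. */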
49 
50 /* State of a single tuple and clause under construction */
51 
52 struct bi_reg_state {
53         /* Number of register writes */
54         unsigned nr_writes;
55 
56         /* Register reads, expressed as (equivalence classes of)
57          * sources. Only 3 reads are allowed, but up to 2 may spill as
58          * "forced" for the next scheduled tuple, provided such a tuple
59          * can be constructed */
60         bi_index reads[5];
61         unsigned nr_reads;
62 
63         /* The previous tuple scheduled (= the next tuple executed in the
64          * program) may require certain writes, in order to bypass the register
65          * file and use a temporary passthrough for the value. Up to 2 such
66          * constraints are architecturally satisfiable */
67         unsigned forced_count;
68         bi_index forceds[2];
69 };
70 
71 struct bi_tuple_state {
72         /* Is this the last tuple in the clause */
73         bool last;
74 
75         /* Scheduled ADD instruction, or null if none */
76         bi_instr *add;
77 
78         /* Reads for previous (succeeding) tuple */
79         bi_index prev_reads[5];
80         unsigned nr_prev_reads;
81         bi_tuple *prev;
82 
83         /* Register slot state for current tuple */
84         struct bi_reg_state reg;
85 
86         /* Constants are shared in the tuple. If constant_count is nonzero, it
87          * is the number of constants in use. Otherwise, fau is the slot read from
88          * FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads zero,
89          * but within a tuple, that should be encoded as constant_count != 0
90          * and constants[0] = constants[1] = 0 */
91         unsigned constant_count;
92 
93         union {
94                 uint32_t constants[2];
95                 enum bir_fau fau;
96         };
97 
98         unsigned pcrel_idx;
99 };
100 
101 struct bi_const_state {
102         unsigned constant_count;
103         bool pcrel; /* applies to first const */
104         uint32_t constants[2];
105 
106         /* Index of the constant into the clause */
107         unsigned word_idx;
108 };
109 
110 struct bi_clause_state {
111         /* Has a message-passing instruction already been assigned? */
112         bool message;
113 
114         /* Indices already accessed, this needs to be tracked to avoid hazards
115          * around message-passing instructions */
116         unsigned access_count;
117         bi_index accesses[(BI_MAX_SRCS + 2) * 16];
118 
119         unsigned tuple_count;
120         struct bi_const_state consts[8];
121 };
122 
123 /* Determines message type by checking the table and a few special cases. Only
124  * case missing is tilebuffer instructions that access depth/stencil, which
125  * require a Z_STENCIL message (to implement
126  * ARM_shader_framebuffer_fetch_depth_stencil) */
127 
128 static enum bifrost_message_type
129 bi_message_type_for_instr(bi_instr *ins)
130 {
131         enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
132         bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
133 
134         if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
135                 return BIFROST_MESSAGE_Z_STENCIL;
136 
137         if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
138                 return BIFROST_MESSAGE_ATTRIBUTE;
139 
140         return msg;
141 }
142 
143 /* Attribute, texture, and UBO load (attribute message) instructions support
144  * bindless, so just check the message type */
145 
146 ASSERTED static bool
147 bi_supports_dtsel(bi_instr *ins)
148 {
149         switch (bi_message_type_for_instr(ins)) {
150         case BIFROST_MESSAGE_ATTRIBUTE:
151                 return ins->op != BI_OPCODE_LD_GCLK_U64;
152         case BIFROST_MESSAGE_TEX:
153                 return true;
154         default:
155                 return false;
156         }
157 }
158 
159 /* Adds an edge to the dependency graph */
160 
161 static void
162 bi_push_dependency(unsigned parent, unsigned child,
163                 BITSET_WORD **dependents, unsigned *dep_counts)
164 {
165         if (!BITSET_TEST(dependents[parent], child)) {
166                 BITSET_SET(dependents[parent], child);
167                 dep_counts[child]++;
168         }
169 }
170 
171 static void
172 add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
173                 BITSET_WORD **dependents, unsigned *dep_counts)
174 {
175         assert(index < 64);
176         util_dynarray_foreach(table + index, unsigned, parent)
177                 bi_push_dependency(*parent, child, dependents, dep_counts);
178 }
179 
180 static void
181 mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
182 {
183         assert(index < 64);
184         util_dynarray_append(&table[index], unsigned, parent);
185 }
186 
187 static bool
188 bi_is_sched_barrier(bi_instr *I)
189 {
190         switch (I->op) {
191         case BI_OPCODE_BARRIER:
192         case BI_OPCODE_DISCARD_F32:
193                 return true;
194         default:
195                 return false;
196         }
197 }
198 
199 static void
200 bi_create_dependency_graph(struct bi_worklist st, bool inorder)
201 {
202         struct util_dynarray last_read[64], last_write[64];
203 
204         for (unsigned i = 0; i < 64; ++i) {
205                 util_dynarray_init(&last_read[i], NULL);
206                 util_dynarray_init(&last_write[i], NULL);
207         }
208 
209         /* Initialize dependency graph */
210         for (unsigned i = 0; i < st.count; ++i) {
211                 st.dependents[i] =
212                         calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
213 
214                 st.dep_counts[i] = 0;
215         }
216 
217         unsigned prev_msg = ~0;
218 
219         /* Populate dependency graph */
220         for (signed i = st.count - 1; i >= 0; --i) {
221                 bi_instr *ins = st.instructions[i];
222 
223                 bi_foreach_src(ins, s) {
224                         if (ins->src[s].type != BI_INDEX_REGISTER) continue;
225                         unsigned count = bi_count_read_registers(ins, s);
226 
227                         for (unsigned c = 0; c < count; ++c)
228                                 add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
229                 }
230 
231                 /* Keep message-passing ops in order. (This pass only cares
232                  * about bundling; reordering of message-passing instructions
233                  * happens during earlier scheduling.) */
234 
235                 if (bi_message_type_for_instr(ins)) {
236                         if (prev_msg != ~0)
237                                 bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
238 
239                         prev_msg = i;
240                 }
241 
242                 /* Handle schedule barriers, adding all the deps */
243                 if (inorder || bi_is_sched_barrier(ins)) {
244                         for (unsigned j = 0; j < st.count; ++j) {
245                                 if (i == j) continue;
246 
247                                 bi_push_dependency(MAX2(i, j), MIN2(i, j),
248                                                 st.dependents, st.dep_counts);
249                         }
250                 }
251 
252                 bi_foreach_dest(ins, d) {
253                         if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
254                         unsigned dest = ins->dest[d].value;
255 
256                         unsigned count = bi_count_write_registers(ins, d);
257 
258                         for (unsigned c = 0; c < count; ++c) {
259                                 add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
260                                 add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
261                                 mark_access(last_write, dest + c, i);
262                         }
263                 }
264 
265                 bi_foreach_src(ins, s) {
266                         if (ins->src[s].type != BI_INDEX_REGISTER) continue;
267 
268                         unsigned count = bi_count_read_registers(ins, s);
269 
270                         for (unsigned c = 0; c < count; ++c)
271                                 mark_access(last_read, ins->src[s].value + c, i);
272                 }
273         }
274 
275         /* If there is a branch, all instructions depend on it, as interblock
276          * execution must be purely in-order */
277 
278         bi_instr *last = st.instructions[st.count - 1];
279         if (last->branch_target || last->op == BI_OPCODE_JUMP) {
280                 for (signed i = st.count - 2; i >= 0; --i)
281                         bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
282         }
283 
284         /* Free the intermediate structures */
285         for (unsigned i = 0; i < 64; ++i) {
286                 util_dynarray_fini(&last_read[i]);
287                 util_dynarray_fini(&last_write[i]);
288         }
289 }
290 
291 /* Scheduler pseudoinstruction lowerings to enable instruction pairings.
292  * Currently only supports CUBEFACE -> *CUBEFACE1/+CUBEFACE2
293  */
294 
295 static bi_instr *
296 bi_lower_cubeface(bi_context *ctx,
297                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
298 {
299         bi_instr *pinstr = tuple->add;
300         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
301         bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
302                         pinstr->src[0], pinstr->src[1], pinstr->src[2]);
303 
304         pinstr->op = BI_OPCODE_CUBEFACE2;
305         pinstr->dest[0] = pinstr->dest[1];
306         pinstr->dest[1] = bi_null();
307         pinstr->src[0] = cubeface1->dest[0];
308         pinstr->src[1] = bi_null();
309         pinstr->src[2] = bi_null();
310 
311         return cubeface1;
312 }
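/* A hedged illustration of the rewrite above, with hypothetical registers d0
 * and d1 standing in for the pseudoinstruction's two destinations:
 *
 *      CUBEFACE d0, d1, s0, s1, s2
 *
 * becomes the schedulable pair
 *
 *      *CUBEFACE1 d0, s0, s1, s2
 *      +CUBEFACE2 d1, d0
 */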
313 
314 /* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
315  * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
316  * arguments (rbase, address lo, address hi, rbase) */
317 
318 static bi_instr *
319 bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
320                 bi_tuple_state *tuple)
321 {
322         bi_instr *pinstr = tuple->add;
323         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
324         bi_instr *atom_c = bi_atom_c_return_i32(&b,
325                         pinstr->src[1], pinstr->src[2], pinstr->src[0],
326                         pinstr->atom_opc);
327 
328         if (bi_is_null(pinstr->dest[0]))
329                 atom_c->op = BI_OPCODE_ATOM_C_I32;
330 
331         pinstr->op = BI_OPCODE_ATOM_CX;
332         pinstr->src[3] = atom_c->src[2];
333 
334         return atom_c;
335 }
336 
337 static bi_instr *
338 bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
339                 bi_tuple_state *tuple)
340 {
341         bi_instr *pinstr = tuple->add;
342         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
343         bi_instr *atom_c = bi_atom_c1_return_i32(&b,
344                         pinstr->src[0], pinstr->src[1], pinstr->atom_opc);
345 
346         if (bi_is_null(pinstr->dest[0]))
347                 atom_c->op = BI_OPCODE_ATOM_C1_I32;
348 
349         pinstr->op = BI_OPCODE_ATOM_CX;
350         pinstr->src[2] = pinstr->src[1];
351         pinstr->src[1] = pinstr->src[0];
352         pinstr->src[3] = bi_dontcare();
353         pinstr->src[0] = bi_null();
354 
355         return atom_c;
356 }
357 
358 static bi_instr *
359 bi_lower_seg_add(bi_context *ctx,
360                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
361 {
362         bi_instr *pinstr = tuple->add;
363         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
364 
365         bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
366                         pinstr->preserve_null, pinstr->seg);
367 
368         pinstr->op = BI_OPCODE_SEG_ADD;
369         pinstr->src[0] = pinstr->src[1];
370         pinstr->src[1] = bi_null();
371 
372         assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
373         pinstr->dest[0].value += 1;
374 
375         return fma;
376 }
377 
378 static bi_instr *
379 bi_lower_dtsel(bi_context *ctx,
380                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
381 {
382         bi_instr *add = tuple->add;
383         bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
384 
385         bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
386                         add->src[0], add->table);
387         add->src[0] = dtsel->dest[0];
388 
389         assert(bi_supports_dtsel(add));
390         return dtsel;
391 }
392 
393 /* Flatten linked list to array for O(1) indexing */
394 
395 static bi_instr **
396 bi_flatten_block(bi_block *block, unsigned *len)
397 {
398         if (list_is_empty(&block->instructions))
399                 return NULL;
400 
401         *len = list_length(&block->instructions);
402         bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
403 
404         unsigned i = 0;
405 
406         bi_foreach_instr_in_block(block, ins)
407                 instructions[i++] = ins;
408 
409         return instructions;
410 }
411 
412 /* The worklist tracks instructions without outstanding dependencies. For
413  * debug, in-order scheduling may be forced by serializing the dependency
414  * graph. */
415 
416 static struct bi_worklist
417 bi_initialize_worklist(bi_block *block, bool inorder)
418 {
419         struct bi_worklist st = { };
420         st.instructions = bi_flatten_block(block, &st.count);
421 
422         if (!st.count)
423                 return st;
424 
425         st.dependents = calloc(st.count, sizeof(st.dependents[0]));
426         st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
427 
428         bi_create_dependency_graph(st, inorder);
429         st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
430 
431         for (unsigned i = 0; i < st.count; ++i) {
432                 if (st.dep_counts[i] == 0)
433                         BITSET_SET(st.worklist, i);
434         }
435 
436         return st;
437 }
438 
439 static void
440 bi_free_worklist(struct bi_worklist st)
441 {
442         free(st.dep_counts);
443         free(st.dependents);
444         free(st.instructions);
445         free(st.worklist);
446 }
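/* Minimal sketch of how these helpers fit together (the actual scheduling
 * loop lives later in the pass; the shape below is only illustrative):
 *
 *      struct bi_worklist st = bi_initialize_worklist(block, inorder);
 *      ... repeatedly pick a ready index i, BITSET_CLEAR(st.worklist, i),
 *      ... then call bi_update_worklist(st, i) to release its dependents
 *      bi_free_worklist(st);
 */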
447 
448 static void
449 bi_update_worklist(struct bi_worklist st, unsigned idx)
450 {
451         assert(st.dep_counts[idx] == 0);
452 
453         if (!st.dependents[idx])
454                 return;
455 
456         /* Iterate each dependent to remove one dependency (`done`),
457          * adding dependents to the worklist where possible. */
458 
459         unsigned i;
460         BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
461                 assert(st.dep_counts[i] != 0);
462                 unsigned new_deps = --st.dep_counts[i];
463 
464                 if (new_deps == 0)
465                         BITSET_SET(st.worklist, i);
466         }
467 
468         free(st.dependents[idx]);
469 }
470 
471 /* Scheduler predicates */
472 
473 /* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
474 static bool
475 bi_can_iaddc(bi_instr *ins)
476 {
477         return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
478                 ins->src[0].swizzle == BI_SWIZZLE_H01 &&
479                 ins->src[1].swizzle == BI_SWIZZLE_H01);
480 }
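/* For example, the fixup in bi_take_instr() below turns
 *
 *      +IADD.u32 d, a, b
 * into
 *      *IADDC.i32 d, a, b, #0
 *
 * with the carry-in source tied to zero. */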
481 
482 bool
483 bi_can_fma(bi_instr *ins)
484 {
485         /* +IADD.i32 -> *IADDC.i32 */
486         if (bi_can_iaddc(ins))
487                 return true;
488 
489         /* TODO: some additional fp16 constraints */
490         return bi_opcode_props[ins->op].fma;
491 }
492 
493 static bool
494 bi_impacted_fadd_widens(bi_instr *I)
495 {
496         enum bi_swizzle swz0 = I->src[0].swizzle;
497         enum bi_swizzle swz1 = I->src[1].swizzle;
498 
499         return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
500                 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
501                 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
502 }
503 
504 bool
505 bi_can_add(bi_instr *ins)
506 {
507         /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
508         if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
509                 return false;
510 
511         /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
512         if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
513                 return false;
514 
515         /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
516         if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
517                 return false;
518 
519         /* TODO: some additional fp16 constraints */
520         return bi_opcode_props[ins->op].add;
521 }
522 
523 /* Architecturally, no single instruction has a "not last" constraint. However,
524  * pseudoinstructions writing multiple destinations (expanding to multiple
525  * paired instructions) can run afoul of the "no two writes on the last clause"
526  * constraint, so we check for that here.
527  */
528 
529 static bool
530 bi_must_not_last(bi_instr *ins)
531 {
532         return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]);
533 }
534 
535 /* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
536  * treat it as a message-passing instruction for the purpose of scheduling
537  * despite it passing no logical message. Otherwise invalid encoding faults may
538  * be raised for unknown reasons (possibly an erratum).
539  */
540 
541 bool
542 bi_must_message(bi_instr *ins)
543 {
544         return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
545                 (ins->op == BI_OPCODE_DISCARD_F32);
546 }
547 
548 static bool
549 bi_fma_atomic(enum bi_opcode op)
550 {
551         switch (op) {
552         case BI_OPCODE_ATOM_C_I32:
553         case BI_OPCODE_ATOM_C_I64:
554         case BI_OPCODE_ATOM_C1_I32:
555         case BI_OPCODE_ATOM_C1_I64:
556         case BI_OPCODE_ATOM_C1_RETURN_I32:
557         case BI_OPCODE_ATOM_C1_RETURN_I64:
558         case BI_OPCODE_ATOM_C_RETURN_I32:
559         case BI_OPCODE_ATOM_C_RETURN_I64:
560         case BI_OPCODE_ATOM_POST_I32:
561         case BI_OPCODE_ATOM_POST_I64:
562         case BI_OPCODE_ATOM_PRE_I64:
563                 return true;
564         default:
565                 return false;
566         }
567 }
568 
569 bool
570 bi_reads_zero(bi_instr *ins)
571 {
572         return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
573 }
574 
575 bool
576 bi_reads_temps(bi_instr *ins, unsigned src)
577 {
578         switch (ins->op) {
579         /* Cannot permute a temporary */
580         case BI_OPCODE_CLPER_I32:
581         case BI_OPCODE_CLPER_V6_I32:
582                 return src != 0;
583         case BI_OPCODE_IMULD:
584                 return false;
585         default:
586                 return true;
587         }
588 }
589 
590 static bool
591 bi_impacted_t_modifiers(bi_instr *I, unsigned src)
592 {
593         enum bi_swizzle swizzle = I->src[src].swizzle;
594 
595         switch (I->op) {
596         case BI_OPCODE_F16_TO_F32:
597         case BI_OPCODE_F16_TO_S32:
598         case BI_OPCODE_F16_TO_U32:
599         case BI_OPCODE_MKVEC_V2I16:
600         case BI_OPCODE_S16_TO_F32:
601         case BI_OPCODE_S16_TO_S32:
602         case BI_OPCODE_U16_TO_F32:
603         case BI_OPCODE_U16_TO_U32:
604                 return (swizzle != BI_SWIZZLE_H00);
605 
606         case BI_OPCODE_BRANCH_F32:
607         case BI_OPCODE_LOGB_F32:
608         case BI_OPCODE_ILOGB_F32:
609         case BI_OPCODE_FADD_F32:
610         case BI_OPCODE_FCMP_F32:
611         case BI_OPCODE_FREXPE_F32:
612         case BI_OPCODE_FREXPM_F32:
613         case BI_OPCODE_FROUND_F32:
614                 return (swizzle != BI_SWIZZLE_H01);
615 
616         case BI_OPCODE_IADD_S32:
617         case BI_OPCODE_IADD_U32:
618         case BI_OPCODE_ISUB_S32:
619         case BI_OPCODE_ISUB_U32:
620         case BI_OPCODE_IADD_V4S8:
621         case BI_OPCODE_IADD_V4U8:
622         case BI_OPCODE_ISUB_V4S8:
623         case BI_OPCODE_ISUB_V4U8:
624                 return (src == 1) && (swizzle != BI_SWIZZLE_H01);
625 
626         case BI_OPCODE_S8_TO_F32:
627         case BI_OPCODE_S8_TO_S32:
628         case BI_OPCODE_U8_TO_F32:
629         case BI_OPCODE_U8_TO_U32:
630                 return (swizzle != BI_SWIZZLE_B0000);
631 
632         case BI_OPCODE_V2S8_TO_V2F16:
633         case BI_OPCODE_V2S8_TO_V2S16:
634         case BI_OPCODE_V2U8_TO_V2F16:
635         case BI_OPCODE_V2U8_TO_V2U16:
636                 return (swizzle != BI_SWIZZLE_B0022);
637 
638         case BI_OPCODE_IADD_V2S16:
639         case BI_OPCODE_IADD_V2U16:
640         case BI_OPCODE_ISUB_V2S16:
641         case BI_OPCODE_ISUB_V2U16:
642                 return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
643 
644 #if 0
645         /* Restriction on IADD in 64-bit clauses on G72 */
646         case BI_OPCODE_IADD_S64:
647         case BI_OPCODE_IADD_U64:
648                 return (src == 1) && (swizzle != BI_SWIZZLE_D0);
649 #endif
650 
651         default:
652                 return false;
653         }
654 }
655 
656 bool
657 bi_reads_t(bi_instr *ins, unsigned src)
658 {
659         /* Branch offset cannot come from passthrough */
660         if (bi_opcode_props[ins->op].branch)
661                 return src != 2;
662 
663         /* Table can never read passthrough */
664         if (bi_opcode_props[ins->op].table)
665                 return false;
666 
667         /* Staging register reads may happen before the succeeding register
668          * block encodes a write, so effectively there is no passthrough */
669         if (src == 0 && bi_opcode_props[ins->op].sr_read)
670                 return false;
671 
672         /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
673          * same-cycle temporaries. Check the list for these hazards. */
674         if (bi_impacted_t_modifiers(ins, src))
675                 return false;
676 
677         /* Descriptor must not come from a passthrough */
678         switch (ins->op) {
679         case BI_OPCODE_LD_CVT:
680         case BI_OPCODE_LD_TILE:
681         case BI_OPCODE_ST_CVT:
682         case BI_OPCODE_ST_TILE:
683         case BI_OPCODE_TEXC:
684                 return src != 2;
685         case BI_OPCODE_BLEND:
686                 return src != 2 && src != 3;
687 
688         /* Else, just check if we can read any temps */
689         default:
690                 return bi_reads_temps(ins, src);
691         }
692 }
693 
694 /* Counts the number of 64-bit constants required by a clause. TODO: We
695  * might want to account for merging; right now we overestimate, but
696  * that's probably fine most of the time */
697 
698 static unsigned
699 bi_nconstants(struct bi_clause_state *clause)
700 {
701         unsigned count_32 = 0;
702 
703         for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
704                 count_32 += clause->consts[i].constant_count;
705 
706         return DIV_ROUND_UP(count_32, 2);
707 }
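/* Worked example: tuples holding 1, 2 and 2 32-bit constants give
 * count_32 = 5, which rounds up to 3 64-bit constant slots. */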
708 
709 /* Would there be space for constants if we added one tuple? */
710 
711 static bool
712 bi_space_for_more_constants(struct bi_clause_state *clause)
713 {
714         return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
715 }
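/* Worked example: with 4 tuples already built, adding a 5th leaves room for
 * 13 - 5 = 8 64-bit constant slots, so the check passes as long as fewer
 * than 8 are already in use. */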
716 
717 /* Updates the FAU assignment for a tuple. A valid FAU assignment must be
718  * possible (as a precondition), though not necessarily on the selected unit;
719  * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
720  * bi_instr_schedulable */
721 
722 static bool
723 bi_update_fau(struct bi_clause_state *clause,
724                 struct bi_tuple_state *tuple,
725                 bi_instr *instr, bool fma, bool destructive)
726 {
727         /* Maintain our own constants, for nondestructive mode */
728         uint32_t copied_constants[2], copied_count;
729         unsigned *constant_count = &tuple->constant_count;
730         uint32_t *constants = tuple->constants;
731         enum bir_fau fau = tuple->fau;
732 
733         if (!destructive) {
734                 memcpy(copied_constants, tuple->constants,
735                                 (*constant_count) * sizeof(constants[0]));
736                 copied_count = tuple->constant_count;
737 
738                 constant_count = &copied_count;
739                 constants = copied_constants;
740         }
741 
742         bi_foreach_src(instr, s) {
743                 bi_index src = instr->src[s];
744 
745                 if (src.type == BI_INDEX_FAU) {
746                         bool no_constants = *constant_count == 0;
747                         bool no_other_fau = (fau == src.value) || !fau;
748                         bool mergable = no_constants && no_other_fau;
749 
750                         if (destructive) {
751                                 assert(mergable);
752                                 tuple->fau = src.value;
753                         } else if (!mergable) {
754                                 return false;
755                         }
756 
757                         fau = src.value;
758                 } else if (src.type == BI_INDEX_CONSTANT) {
759                         /* No need to reserve space if we have a fast 0 */
760                         if (src.value == 0 && fma && bi_reads_zero(instr))
761                                 continue;
762 
763                         /* If there is a branch target, #0 by convention is the
764                          * PC-relative offset to the target */
765                         bool pcrel = instr->branch_target && src.value == 0;
766                         bool found = false;
767 
768                         for (unsigned i = 0; i < *constant_count; ++i) {
769                                 found |= (constants[i] == src.value) &&
770                                         (i != tuple->pcrel_idx);
771                         }
772 
773                         /* pcrel constants are unique, so don't match */
774                         if (found && !pcrel)
775                                 continue;
776 
777                         bool no_fau = (*constant_count > 0) || !fau;
778                         bool mergable = no_fau && ((*constant_count) < 2);
779 
780                         if (destructive) {
781                                 assert(mergable);
782 
783                                 if (pcrel)
784                                         tuple->pcrel_idx = *constant_count;
785                         } else if (!mergable)
786                                 return false;
787 
788                         constants[(*constant_count)++] = src.value;
789                 }
790         }
791 
792         /* Constants per clause may be limited by tuple count */
793         bool room_for_constants = (*constant_count == 0) ||
794                 bi_space_for_more_constants(clause);
795 
796         if (destructive)
797                 assert(room_for_constants);
798         else if (!room_for_constants)
799                 return false;
800 
801         return true;
802 }
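/* Rough example of the merge rules above: a tuple may read one 64-bit FAU
 * slot or embed up to two 32-bit constants, but not both. So an instruction
 * sourcing FAU and another sourcing the constant 0x3f800000 cannot share a
 * tuple, while two instructions sourcing 0x3f800000 share a single constant
 * slot. */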
803 
804 /* Given an in-progress tuple, a candidate new instruction to add to the tuple,
805  * and a source (index) from that candidate, determine whether this source is
806  * "new", in the sense of requiring an additional read slot. That is, checks
807  * whether the specified source reads from the register file via a read slot
808  * (determined by its type and placement) and whether the source was already
809  * specified by a prior read slot (to avoid double counting) */
810 
811 static bool
812 bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
813 {
814         bi_index src = instr->src[src_idx];
815 
816         /* Only consider sources which come from the register file */
817         if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
818                 return false;
819 
820         /* Staging register reads bypass the usual register file mechanism */
821         if (src_idx == 0 && bi_opcode_props[instr->op].sr_read)
822                 return false;
823 
824         /* If a source is already read in the tuple, it is already counted */
825         for (unsigned t = 0; t < reg->nr_reads; ++t)
826                 if (bi_is_word_equiv(src, reg->reads[t]))
827                         return false;
828 
829         /* If a source is read in _this instruction_, it is already counted */
830         for (unsigned t = 0; t < src_idx; ++t)
831                 if (bi_is_word_equiv(src, instr->src[t]))
832                         return false;
833 
834         return true;
835 }
836 
837 /* Given two tuples in source order, count the number of register reads of the
838  * successor, determined as the number of unique words accessed that aren't
839  * written by the predecessor (since those are tempable).
840  */
841 
842 static unsigned
843 bi_count_succ_reads(bi_index t0, bi_index t1,
844                 bi_index *succ_reads, unsigned nr_succ_reads)
845 {
846         unsigned reads = 0;
847 
848         for (unsigned i = 0; i < nr_succ_reads; ++i) {
849                 bool unique = true;
850 
851                 for (unsigned j = 0; j < i; ++j)
852                         if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
853                                 unique = false;
854 
855                 if (!unique)
856                         continue;
857 
858                 if (bi_is_word_equiv(succ_reads[i], t0))
859                         continue;
860 
861                 if (bi_is_word_equiv(succ_reads[i], t1))
862                         continue;
863 
864                 reads++;
865         }
866 
867         return reads;
868 }
869 
870 /* Not all instructions can read from the staging passthrough (as determined by
871  * reads_t); check if a given pair of instructions has such a restriction. Note
872  * we also use this mechanism to prevent data races around staging register
873  * reads, so we allow the input source to potentially be vector-valued */
874 
875 static bool
876 bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
877 {
878         bi_foreach_src(add, s) {
879                 bi_index src = add->src[s];
880 
881                 if (src.type != BI_INDEX_REGISTER)
882                         continue;
883 
884                 unsigned count = bi_count_read_registers(add, s);
885                 bool read = false;
886 
887                 for (unsigned d = 0; d < count; ++d)
888                         read |= bi_is_equiv(fma, bi_register(src.value + d));
889 
890                 if (read && !bi_reads_t(add, s))
891                         return true;
892         }
893 
894         return false;
895 }
896 
897 /* Likewise for cross-tuple passthrough (reads_temps) */
898 
899 static bool
900 bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
901 {
902         bi_foreach_instr_in_tuple(succ, pins) {
903                 bi_foreach_src(pins, s) {
904                         if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
905                                         !bi_reads_temps(pins, s))
906                                 return true;
907                 }
908         }
909 
910         return false;
911 }
912 
913 /* Is a register written other than via the staging mechanism? ATEST is special,
914  * writing to both a staging register and a regular register (fixed packing).
915  * BLEND is special since it has to write r48 the normal way even if it never
916  * gets read. This depends on liveness analysis, as a register is not needed
917  * for a write that will be discarded after one tuple. */
918 
919 static unsigned
920 bi_write_count(bi_instr *instr, uint64_t live_after_temp)
921 {
922         if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
923                 return 1;
924 
925         unsigned count = 0;
926 
927         bi_foreach_dest(instr, d) {
928                 if (d == 0 && bi_opcode_props[instr->op].sr_write)
929                         continue;
930 
931                 if (bi_is_null(instr->dest[d]))
932                         continue;
933 
934                 assert(instr->dest[0].type == BI_INDEX_REGISTER);
935                 if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
936                         count++;
937         }
938 
939         return count;
940 }
941 
942 /* Instruction placement entails two questions: what subset of instructions in
943  * the block can legally be scheduled? and of those which is the best? That is,
944  * we seek to maximize a cost function on a subset of the worklist satisfying a
945  * particular predicate. The necessary predicate is determined entirely by
946  * Bifrost's architectural limitations and is described in the accompanying
947  * whitepaper. The cost function is a heuristic. */
948 
949 static bool
950 bi_instr_schedulable(bi_instr *instr,
951                 struct bi_clause_state *clause,
952                 struct bi_tuple_state *tuple,
953                 uint64_t live_after_temp,
954                 bool fma)
955 {
956         /* The units must match */
957         if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
958                 return false;
959 
960         /* There can only be one message-passing instruction per clause */
961         if (bi_must_message(instr) && clause->message)
962                 return false;
963 
964         /* Some instructions have placement requirements */
965         if (bi_opcode_props[instr->op].last && !tuple->last)
966                 return false;
967 
968         if (bi_must_not_last(instr) && tuple->last)
969                 return false;
970 
971         /* Message-passing instructions are not guaranteed to write within the
972          * same clause (most likely they will not), so if a later instruction
973          * in the clause accesses the destination, the message-passing
974          * instruction can't be scheduled */
975         if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
976                 unsigned nr = bi_count_write_registers(instr, 0);
977                 assert(instr->dest[0].type == BI_INDEX_REGISTER);
978                 unsigned reg = instr->dest[0].value;
979 
980                 for (unsigned i = 0; i < clause->access_count; ++i) {
981                         bi_index idx = clause->accesses[i];
982                         for (unsigned d = 0; d < nr; ++d) {
983                                 if (bi_is_equiv(bi_register(reg + d), idx))
984                                         return false;
985                         }
986                 }
987         }
988 
989         if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
990                 unsigned nr = bi_count_read_registers(instr, 0);
991                 assert(instr->src[0].type == BI_INDEX_REGISTER);
992                 unsigned reg = instr->src[0].value;
993 
994                 for (unsigned i = 0; i < clause->access_count; ++i) {
995                         bi_index idx = clause->accesses[i];
996                         for (unsigned d = 0; d < nr; ++d) {
997                                 if (bi_is_equiv(bi_register(reg + d), idx))
998                                         return false;
999                         }
1000                 }
1001         }
1002 
1003         /* If FAU is already assigned, we may not disrupt that. Do a
1004          * non-disruptive test update */
1005         if (!bi_update_fau(clause, tuple, instr, fma, false))
1006                 return false;
1007 
1008         /* If this choice of FMA would force a staging passthrough, the ADD
1009          * instruction must support such a passthrough */
1010         if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
1011                 return false;
1012 
1013         /* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */
1014         if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
1015                 return false;
1016 
1017         /* Register file writes are limited */
1018         unsigned total_writes = tuple->reg.nr_writes;
1019         total_writes += bi_write_count(instr, live_after_temp);
1020 
1021         /* Last tuple in a clause can only write a single value */
1022         if (tuple->last && total_writes > 1)
1023                 return false;
1024 
1025         /* Register file reads are limited, so count unique */
1026 
1027         unsigned unique_new_srcs = 0;
1028 
1029         bi_foreach_src(instr, s) {
1030                 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1031                         unique_new_srcs++;
1032         }
1033 
1034         unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
1035 
1036         bool can_spill_to_moves = (!tuple->add);
1037         can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
1038         can_spill_to_moves &= (clause->tuple_count < 7);
1039 
1040         /* However, we can get an extra 1 or 2 sources by inserting moves */
1041         if (total_srcs > (can_spill_to_moves ? 4 : 3))
1042                 return false;
1043 
1044         /* Count effective reads for the successor */
1045         unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
1046                         tuple->add ? tuple->add->dest[0] : bi_null(),
1047                         tuple->prev_reads, tuple->nr_prev_reads);
1048 
1049         /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
1050         if ((signed) total_writes > (4 - (signed) succ_reads))
1051                 return false;
1052 
1053         return true;
1054 }
1055 
1056 static signed
1057 bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
1058 {
1059         signed cost = 0;
1060 
1061         /* Instructions that can schedule to either FMA or to ADD should be
1062          * deprioritized since they're easier to reschedule elsewhere */
1063         if (bi_can_fma(instr) && bi_can_add(instr))
1064                 cost++;
1065 
1066         /* Message-passing instructions impose constraints on the registers
1067          * later in the clause, so schedule them as late within a clause as
1068          * possible (<==> prioritize them since we're backwards <==> decrease
1069          * cost) */
1070         if (bi_must_message(instr))
1071                 cost--;
1072 
1073         /* Last instructions are big constraints (XXX: no effect on shader-db) */
1074         if (bi_opcode_props[instr->op].last)
1075                 cost -= 2;
1076 
1077         return cost;
1078 }
1079 
1080 static unsigned
1081 bi_choose_index(struct bi_worklist st,
1082                 struct bi_clause_state *clause,
1083                 struct bi_tuple_state *tuple,
1084                 uint64_t live_after_temp,
1085                 bool fma)
1086 {
1087         unsigned i, best_idx = ~0;
1088         signed best_cost = INT_MAX;
1089 
1090         BITSET_FOREACH_SET(i, st.worklist, st.count) {
1091                 bi_instr *instr = st.instructions[i];
1092 
1093                 if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
1094                         continue;
1095 
1096                 signed cost = bi_instr_cost(instr, tuple);
1097 
1098                 /* Tie break in favour of later instructions, under the
1099                  * assumption this promotes temporary usage (reducing pressure
1100                  * on the register file). This is a side effect of a prepass
1101                  * that schedules for pressure. */
1102 
1103                 if (cost <= best_cost) {
1104                         best_idx = i;
1105                         best_cost = cost;
1106                 }
1107         }
1108 
1109         return best_idx;
1110 }
1111 
1112 static void
1113 bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
1114                 bi_instr *instr, uint64_t live_after_temp, bool fma)
1115 {
1116         bi_update_fau(clause, tuple, instr, fma, true);
1117 
1118         /* TODO: maybe opt a bit? or maybe doesn't matter */
1119         assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
1120         memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
1121         clause->access_count += BI_MAX_SRCS;
1122         memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
1123         clause->access_count += BI_MAX_DESTS;
1124         tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
1125 
1126         bi_foreach_src(instr, s) {
1127                 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1128                         tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
1129         }
1130 }
1131 
1132 /* Choose the best instruction and pop it off the worklist. Returns NULL if no
1133  * instruction is available. This function is destructive. */
1134 
1135 static bi_instr *
1136 bi_take_instr(bi_context *ctx, struct bi_worklist st,
1137                 struct bi_clause_state *clause,
1138                 struct bi_tuple_state *tuple,
1139                 uint64_t live_after_temp,
1140                 bool fma)
1141 {
1142         if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
1143                 return bi_lower_cubeface(ctx, clause, tuple);
1144         else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C_I32)
1145                 return bi_lower_atom_c(ctx, clause, tuple);
1146         else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C1_I32)
1147                 return bi_lower_atom_c1(ctx, clause, tuple);
1148         else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
1149                 return bi_lower_seg_add(ctx, clause, tuple);
1150         else if (tuple->add && tuple->add->table)
1151                 return bi_lower_dtsel(ctx, clause, tuple);
1152 
1153         /* TODO: Optimize these moves */
1154         if (!fma && tuple->nr_prev_reads > 3) {
1155                 /* Only spill by one source for now */
1156                 assert(tuple->nr_prev_reads == 4);
1157 
1158                 /* Pick a source to spill */
1159                 bi_index src = tuple->prev_reads[0];
1160 
1161                 /* Schedule the spill */
1162                 bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
1163                 bi_instr *mov = bi_mov_i32_to(&b, src, src);
1164                 bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
1165                 return mov;
1166         }
1167 
1168 #ifndef NDEBUG
1169         /* Don't pair instructions if debugging */
1170         if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
1171                 return NULL;
1172 #endif
1173 
1174         unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
1175 
1176         if (idx >= st.count)
1177                 return NULL;
1178 
1179         /* Update state to reflect taking the instruction */
1180         bi_instr *instr = st.instructions[idx];
1181 
1182         BITSET_CLEAR(st.worklist, idx);
1183         bi_update_worklist(st, idx);
1184         bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
1185 
1186         /* Fixups */
1187         if (instr->op == BI_OPCODE_IADD_U32 && fma) {
1188                 assert(bi_can_iaddc(instr));
1189                 instr->op = BI_OPCODE_IADDC_I32;
1190                 instr->src[2] = bi_zero();
1191         }
1192 
1193         return instr;
1194 }
1195 
1196 /* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
1197  * to a passthrough register. If except_zero is true, the zeroth (first) source
1198  * is skipped, so staging register reads are not accidentally encoded as
1199  * passthrough (which is impossible) */
1200 
1201 static void
1202 bi_use_passthrough(bi_instr *ins, bi_index old,
1203                 enum bifrost_packed_src new,
1204                 bool except_zero)
1205 {
1206         /* Optional for convenience */
1207         if (!ins || bi_is_null(old))
1208                 return;
1209 
1210         bi_foreach_src(ins, i) {
1211                 if (i == 0 && except_zero)
1212                         continue;
1213 
1214                 if (bi_is_word_equiv(ins->src[i], old)) {
1215                         ins->src[i].type = BI_INDEX_PASS;
1216                         ins->src[i].value = new;
1217                         ins->src[i].reg = false;
1218                         ins->src[i].offset = 0;
1219                 }
1220         }
1221 }
1222 
1223 /* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
1224  * intertuple passthroughs where necessary. Passthroughs are allowed as a
1225  * post-condition of scheduling. Note we rewrite ADD first, FMA second --
1226  * opposite the order of execution. This is deliberate -- if both FMA and ADD
1227  * write to the same logical register, the next executed tuple will get the
1228  * latter result. There's no interference issue under the assumption of correct
1229  * register allocation. */
1230 
1231 static void
1232 bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
1233 {
1234         bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
1235 
1236         if (prec.add) {
1237                 bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
1238                 bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
1239         }
1240 
1241         if (prec.fma) {
1242                 bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
1243                 bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
1244         }
1245 }
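/* For instance, if the preceding FMA writes register r4 and the succeeding
 * ADD reads r4, the read is rewritten to the BIFROST_SRC_PASS_FMA
 * passthrough instead of going through the register file (register number
 * hypothetical). */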
1246 
1247 static void
1248 bi_rewrite_fau_to_pass(bi_tuple *tuple)
1249 {
1250         bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1251                 if (ins->src[s].type != BI_INDEX_FAU) continue;
1252 
1253                 bi_index pass = bi_passthrough(ins->src[s].offset ?
1254                                 BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
1255 
1256                 ins->src[s] = bi_replace_index(ins->src[s], pass);
1257         }
1258 }
1259 
1260 static void
1261 bi_rewrite_zero(bi_instr *ins, bool fma)
1262 {
1263         bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
1264 
1265         bi_foreach_src(ins, s) {
1266                 bi_index src = ins->src[s];
1267 
1268                 if (src.type == BI_INDEX_CONSTANT && src.value == 0)
1269                         ins->src[s] = bi_replace_index(src, zero);
1270         }
1271 }
1272 
1273 /* Assumes #0 to {T, FAU} rewrite has already occurred */
1274 
1275 static void
1276 bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
1277 {
1278         bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1279                 if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
1280 
1281                 uint32_t cons = ins->src[s].value;
1282 
1283                 ASSERTED bool lo = (cons == (constant & 0xffffffff));
1284                 bool hi = (cons == (constant >> 32ull));
1285 
1286                 /* PC offsets always live in the upper half, set to zero by
1287                  * convention before pack time. (This is safe, since if you
1288                  * wanted to compare against zero, you would use a BRANCHZ
1289                  * instruction instead.) */
1290                 if (cons == 0 && ins->branch_target != NULL) {
1291                         assert(pcrel);
1292                         hi = true;
1293                         lo = false;
1294                 } else if (pcrel) {
1295                         hi = false;
1296                 }
1297 
1298                 assert(lo || hi);
1299 
1300                 ins->src[s] = bi_replace_index(ins->src[s],
1301                                 bi_passthrough(hi ?  BIFROST_SRC_FAU_HI :
1302                                         BIFROST_SRC_FAU_LO));
1303         }
1304 }
1305 
1306 /* Constructs a constant state given a tuple state. This has the
1307  * postcondition that pcrel applies to the first constant by convention,
1308  * and PC-relative constants will be #0 by convention here, so swap to
1309  * match if needed */
1310 
1311 static struct bi_const_state
1312 bi_get_const_state(struct bi_tuple_state *tuple)
1313 {
1314         struct bi_const_state consts = {
1315                 .constant_count = tuple->constant_count,
1316                 .constants[0] = tuple->constants[0],
1317                 .constants[1] = tuple->constants[1],
1318                 .pcrel = tuple->add && tuple->add->branch_target,
1319         };
1320 
1321         /* pcrel applies to the first constant by convention, and
1322          * PC-relative constants will be #0 by convention here, so swap
1323          * to match if needed */
1324         if (consts.pcrel && consts.constants[0]) {
1325                 assert(consts.constant_count == 2);
1326                 assert(consts.constants[1] == 0);
1327 
1328                 consts.constants[1] = consts.constants[0];
1329                 consts.constants[0] = 0;
1330         }
1331 
1332         return consts;
1333 }
1334 
1335 /* Merges constants in a clause, satisfying the following rules, assuming no
1336  * more than one tuple has pcrel:
1337  *
1338  * 1. If a tuple has two constants, they must be packed together. If one is
1339  * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
1340  * (PC << 32)]. Otherwise choose an arbitrary order.
1341  *
1342  * 2. If a tuple has one constant, it may be shared with an existing
1343  * pair that already contains that constant, or it may be combined with another
1344  * (distinct) tuple of a single constant.
1345  *
1346  * This guarantees a packing is possible. The next routine handles modification
1347  * related swapping, to satisfy format 12 and the lack of modification for
1348  * tuple count 5/8 in EC0.
1349  */
1350 
1351 static uint64_t
1352 bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
1353 {
1354         /* At this point in the constant merge algorithm, pcrel constants are
1355          * treated as zero, so pcrel implies at least one constant is zero */
1356         assert(!pcrel || (c0 == 0 || c1 == 0));
1357 
1358         /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
1359         uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
1360         uint32_t lo = (c0 == hi) ? c1 : c0;
1361 
1362         /* Merge in the selected order */
1363         return lo | (((uint64_t) hi) << 32ull);
1364 }
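/* Worked example: bi_merge_u32(0x10, 0x20, false) == 0x0000002000000010,
 * i.e. the larger constant occupies the high half. With pcrel = true the
 * zero placeholder is forced into the high half instead. */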
1365 
1366 static unsigned
1367 bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
1368                 uint64_t *merged, unsigned *pcrel_pair)
1369 {
1370         unsigned merge_count = 0;
1371 
1372         for (unsigned t = 0; t < tuple_count; ++t) {
1373                 if (consts[t].constant_count != 2) continue;
1374 
1375                 unsigned idx = ~0;
1376                 uint64_t val = bi_merge_u32(consts[t].constants[0],
1377                                 consts[t].constants[1], consts[t].pcrel);
1378 
1379                 /* Skip the pcrel pair if one is already assigned: this tuple
1380                  * is then not pcrel by uniqueness, so it's a mismatch */
1381                 for (unsigned s = 0; s < merge_count; ++s) {
1382                         if (merged[s] == val && (*pcrel_pair) != s) {
1383                                 idx = s;
1384                                 break;
1385                         }
1386                 }
1387 
1388                 if (idx == ~0) {
1389                         idx = merge_count++;
1390                         merged[idx] = val;
1391 
1392                         if (consts[t].pcrel)
1393                                 (*pcrel_pair) = idx;
1394                 }
1395 
1396                 consts[t].word_idx = idx;
1397         }
1398 
1399         return merge_count;
1400 }
1401 
1402 static unsigned
1403 bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
1404                 uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1405 {
1406         bool pending = false, pending_pcrel = false;
1407         uint32_t pending_single = 0;
1408 
1409         for (unsigned t = 0; t < tuple_count; ++t) {
1410                 if (consts[t].constant_count != 1) continue;
1411 
1412                 uint32_t val = consts[t].constants[0];
1413                 unsigned idx = ~0;
1414 
1415                 /* Try to match, but don't match pcrel with non-pcrel, even
1416                  * though we can merge a pcrel with a non-pcrel single */
1417                 for (unsigned i = 0; i < pair_count; ++i) {
1418                         bool lo = ((pairs[i] & 0xffffffff) == val);
1419                         bool hi = ((pairs[i] >> 32) == val);
1420                         bool match = (lo || hi);
1421                         match &= ((*pcrel_pair) != i);
1422                         if (match && !consts[t].pcrel) {
1423                                 idx = i;
1424                                 break;
1425                         }
1426                 }
1427 
1428                 if (idx == ~0) {
1429                         idx = pair_count;
1430 
1431                         if (pending && pending_single != val) {
1432                                 assert(!(pending_pcrel && consts[t].pcrel));
1433                                 bool pcrel = pending_pcrel || consts[t].pcrel;
1434 
1435                                 if (pcrel)
1436                                         *pcrel_pair = idx;
1437 
1438                                 pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1439 
1440                                 pending = pending_pcrel = false;
1441                         } else {
1442                                 pending = true;
1443                                 pending_pcrel = consts[t].pcrel;
1444                                 pending_single = val;
1445                         }
1446                 }
1447 
1448                 consts[t].word_idx = idx;
1449         }
1450 
1451         /* Shift so it works whether pending_pcrel is set or not */
1452         if (pending) {
1453                 if (pending_pcrel)
1454                         *pcrel_pair = pair_count;
1455 
1456                 pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
1457         }
1458 
1459         return pair_count;
1460 }
1461 
1462 static unsigned
1463 bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
1464 {
1465         unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
1466         return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
1467 }
1468 
1469 /* Swap two constants at word i and i+1 by swapping their actual positions and
1470  * swapping all references so the meaning of the clause is preserved */
1471 
1472 static void
1473 bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
1474 {
1475         uint64_t tmp_pair = pairs[i + 0];
1476         pairs[i + 0] = pairs[i + 1];
1477         pairs[i + 1] = tmp_pair;
1478 
1479         for (unsigned t = 0; t < 8; ++t) {
1480                 if (consts[t].word_idx == i)
1481                         consts[t].word_idx = (i + 1);
1482                 else if (consts[t].word_idx == (i + 1))
1483                         consts[t].word_idx = i;
1484         }
1485 }
1486 
1487 /* Given merged constants, one of which might be PC-relative, fix up the M
1488  * values so the PC-relative constant (if it exists) has the M1=4 modification
1489  * and other constants are used as-is (which might require swapping) */
1490 
1491 static unsigned
1492 bi_apply_constant_modifiers(struct bi_const_state *consts,
1493                 uint64_t *pairs, unsigned *pcrel_idx,
1494                 unsigned tuple_count, unsigned constant_count)
1495 {
1496         unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
1497 
1498         /* Clauses with these tuple counts lack an M field for the packed EC0,
1499          * so EC0 cannot be PC-relative, which might require swapping (and
1500          * possibly adding an unused constant) to fit */
1501 
1502         if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
1503                 constant_count = MAX2(constant_count, 2);
1504                 *pcrel_idx = 1;
1505                 bi_swap_constants(consts, pairs, 0);
1506         }
1507 
1508         /* EC0 might be packed free, after that constants are packed in pairs
1509          * (with clause format 12), with M1 values computed from the pair */
1510 
1511         for (unsigned i = start; i < constant_count; i += 2) {
1512                 bool swap = false;
1513                 bool last = (i + 1) == constant_count;
1514 
1515                 unsigned A1 = (pairs[i] >> 60);
1516                 unsigned B1 = (pairs[i + 1] >> 60);
1517 
1518                 if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
1519                         /* PC-relative constant must be E0, not E1 */
1520                         swap = (*pcrel_idx == (i + 1));
1521 
1522                         /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
1523                          * equivalent to A = (B + 4) mod 16 and that we can
1524                          * control A */
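                             /* e.g. B = 13 gives A = (13 + 4) & 0xF = 1, and
                              * indeed (1 - 13) mod 16 = 4 */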
1525                         unsigned B = swap ? A1 : B1;
1526                         unsigned A = (B + 4) & 0xF;
1527                         pairs[*pcrel_idx] |= ((uint64_t) A) << 60;
1528 
1529                         /* Swapped if swap set, identity if swap not set */
1530                         *pcrel_idx = i;
1531                 } else {
1532                         /* Compute M1 value if we don't swap */
1533                         unsigned M1 = (16 + A1 - B1) & 0xF;
1534 
1535                         /* For M1 = 0 or M1 >= 8, the constants are used
1536                          * unchanged. Otherwise 0 < (A1 - B1) % 16 < 8, which
1537                          * implies (B1 - A1) % 16 >= 8, so swapping lets the
1538                          * constants be used unchanged */
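                             /* e.g. A1 = 3, B1 = 5 gives M1 = 14 (>= 8, no
                              * swap); A1 = 5, B1 = 3 gives M1 = 2, so the two
                              * words are swapped and become usable with
                              * A1 = 3, B1 = 5 */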
1539                         swap = (M1 != 0) && (M1 < 8);
1540 
1541                         /* However, we can't swap the last constant, so we
1542                          * force M1 = 0 instead for this case */
1543                         if (last && swap) {
1544                                 pairs[i + 1] |= pairs[i] & (0xfull << 60);
1545                                 swap = false;
1546                         }
1547                 }
1548 
1549                 if (swap) {
1550                         assert(!last);
1551                         bi_swap_constants(consts, pairs, i);
1552                 }
1553         }
1554 
1555         return constant_count;
1556 }
1557 
1558 /* Schedule a single clause. If no instructions remain, return NULL. */
1559 
1560 static bi_clause *
1561 bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
1562 {
1563         struct bi_clause_state clause_state = { 0 };
1564         bi_clause *clause = rzalloc(ctx, bi_clause);
1565         bi_tuple *tuple = NULL;
1566 
1567         const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
1568 
1569         /* TODO: Decide flow control better */
1570         clause->flow_control = BIFROST_FLOW_NBTB;
1571 
1572         /* The last clause can only write one instruction, so initialize that */
1573         struct bi_reg_state reg_state = {};
1574         bi_index prev_reads[5] = { bi_null() };
1575         unsigned nr_prev_reads = 0;
1576 
1577         /* We need to track future liveness. The main *live set tracks what is
1578          * live at the current point in the program we are scheduling, but to
1579          * determine temp eligibility, we instead want what will be live after
1580          * the next tuple in the program. If you scheduled forwards, you'd need
1581          * a crystal ball for this. Luckily we schedule backwards, so we just
1582          * delay updates to live_after_temp by an extra tuple. */
1583         uint64_t live_after_temp = *live;
1584         uint64_t live_next_tuple = live_after_temp;
1585 
1586         do {
1587                 struct bi_tuple_state tuple_state = {
1588                         .last = (clause->tuple_count == 0),
1589                         .reg = reg_state,
1590                         .nr_prev_reads = nr_prev_reads,
1591                         .prev = tuple,
1592                         .pcrel_idx = ~0,
1593                 };
1594 
1595                 assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
1596                 memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
1597 
1598                 unsigned idx = max_tuples - clause->tuple_count - 1;
1599 
1600                 tuple = &clause->tuples[idx];
1601 
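                     /* Keep the staging registers read by the clause's message
                      * instruction live so they are not treated as dead when
                      * checking temporary (passthrough) eligibility below */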
1602                 if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
1603                         unsigned nr = bi_count_read_registers(clause->message, 0);
1604                         live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
1605                 }
1606 
1607                 /* Since we schedule backwards, we schedule ADD first */
1608                 tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
1609                 tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
1610                 tuple->add = tuple_state.add;
1611 
1612                 /* Update liveness from the new instructions */
1613                 if (tuple->add)
1614                         *live = bi_postra_liveness_ins(*live, tuple->add);
1615 
1616                 if (tuple->fma)
1617                         *live = bi_postra_liveness_ins(*live, tuple->fma);
1618 
1619                 /* Rotate in the new per-tuple liveness */
1620                 live_after_temp = live_next_tuple;
1621                 live_next_tuple = *live;
1622 
1623                 /* We may have a message, but only one per clause */
1624                 if (tuple->add && bi_must_message(tuple->add)) {
1625                         assert(!clause_state.message);
1626                         clause_state.message = true;
1627 
1628                         clause->message_type =
1629                                 bi_message_type_for_instr(tuple->add);
1630                         clause->message = tuple->add;
1631 
1632                         /* We don't need to set dependencies for blend shaders
1633                          * because the BLEND instruction in the fragment
1634                          * shader should have already done the wait */
1635                         if (!ctx->inputs->is_blend) {
1636                                 switch (tuple->add->op) {
1637                                 case BI_OPCODE_ATEST:
1638                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1639                                         break;
1640                                 case BI_OPCODE_LD_TILE:
1641                                 case BI_OPCODE_ST_TILE:
1642                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1643                                         break;
1644                                 case BI_OPCODE_BLEND:
1645                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1646                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1647                                         break;
1648                                 default:
1649                                         break;
1650                                 }
1651                         }
1652                 }
1653 
1654                 clause_state.consts[idx] = bi_get_const_state(&tuple_state);
1655 
1656                 /* Before merging constants, eliminate zeroes, otherwise the
1657                  * merging will fight over the #0 that never gets read (and is
1658                  * never marked as read by update_fau) */
1659                 if (tuple->fma && bi_reads_zero(tuple->fma))
1660                         bi_rewrite_zero(tuple->fma, true);
1661 
1662                 /* Rewrite away FAU, constant write is deferred */
1663                 if (!tuple_state.constant_count) {
1664                         tuple->fau_idx = tuple_state.fau;
1665                         bi_rewrite_fau_to_pass(tuple);
1666                 }
1667 
1668                 /* Use passthrough register for cross-stage accesses. Since
1669                  * there are just FMA and ADD stages, that means we rewrite to
1670                  * passthrough the sources of the ADD that read from the
1671                  * destination of the FMA */
1672 
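                     /* e.g. if the FMA writes r2 and this tuple's ADD reads
                      * r2, that ADD source becomes the STAGE passthrough */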
1673                 if (tuple->fma) {
1674                         bi_use_passthrough(tuple->add, tuple->fma->dest[0],
1675                                         BIFROST_SRC_STAGE, false);
1676                 }
1677 
1678                 /* Don't add an empty tuple, unless the worklist has nothing
1679                  * but a (pseudo)instruction failing to schedule due to a "not
1680                  * last instruction" constraint */
1681 
1682                 int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
1683                 bool not_last = (some_instruction > 0) &&
1684                         bi_must_not_last(st.instructions[some_instruction - 1]);
1685 
1686                 bool insert_empty = tuple_state.last && not_last;
1687 
1688                 if (!(tuple->fma || tuple->add || insert_empty))
1689                         break;
1690 
1691                 clause->tuple_count++;
1692 
1693                 /* Adding enough tuples might overflow the constants */
1694                 if (!bi_space_for_more_constants(&clause_state))
1695                         break;
1696 
1697 #ifndef NDEBUG
1698                 /* Don't schedule more than 1 tuple if debugging */
1699                 if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
1700                         break;
1701 #endif
1702 
1703                 /* Link through the register state */
1704                 STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
1705                 memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
1706                 nr_prev_reads = tuple_state.reg.nr_reads;
1707                 clause_state.tuple_count++;
1708         } while(clause->tuple_count < 8);
1709 
1710         /* Don't schedule an empty clause */
1711         if (!clause->tuple_count)
1712                 return NULL;
1713 
1714         /* Before merging, rewrite away any tuples that read only zero */
1715         for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1716                 bi_tuple *tuple = &clause->tuples[i];
1717                 struct bi_const_state *st = &clause_state.consts[i];
1718 
1719                 if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
1720                         continue;
1721 
1722                 bi_foreach_instr_in_tuple(tuple, ins)
1723                         bi_rewrite_zero(ins, false);
1724 
1725                 /* Constant has been demoted to FAU, so don't pack it separately */
1726                 st->constant_count = 0;
1727 
1728                 /* Default */
1729                 assert(tuple->fau_idx == BIR_FAU_ZERO);
1730         }
1731 
1732         uint64_t constant_pairs[8] = { 0 };
1733         unsigned pcrel_idx = ~0;
1734         unsigned constant_words =
1735                 bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
1736 
1737         constant_words = bi_apply_constant_modifiers(clause_state.consts,
1738                         constant_pairs, &pcrel_idx, clause->tuple_count,
1739                         constant_words);
1740 
1741         clause->pcrel_idx = pcrel_idx;
1742 
1743         for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1744                 bi_tuple *tuple = &clause->tuples[i];
1745 
1746                 /* If no constants, leave FAU as it is, possibly defaulting to 0 */
1747                 if (clause_state.consts[i].constant_count == 0)
1748                         continue;
1749 
1750                 /* FAU is already handled */
1751                 assert(!tuple->fau_idx);
1752 
1753                 unsigned word_idx = clause_state.consts[i].word_idx;
1754                 assert(word_idx <= 8);
1755 
1756                 /* We could try to merge regardless of bottom bits as well, but
1757                  * that's probably diminishing returns */
1758                 uint64_t pair = constant_pairs[word_idx];
1759                 unsigned lo = pair & 0xF;
1760 
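                     /* The low 4 bits of the constant ride along in the FAU
                      * index itself */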
1761                 tuple->fau_idx = bi_constant_field(word_idx) | lo;
1762                 bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
1763         }
1764 
1765         clause->constant_count = constant_words;
1766         memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
1767 
1768         /* Branches must be last, so this can be factored out */
1769         bi_instr *last = clause->tuples[max_tuples - 1].add;
1770         clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
1771         clause->block = block;
1772 
1773         /* TODO: scoreboard assignment post-sched */
1774         clause->dependencies |= (1 << 0);
1775 
1776         /* We emitted in reverse, into the back of the tuples array, so
1777          * move the tuples up front for easy indexing */
1778         memmove(clause->tuples,
1779                        clause->tuples + (max_tuples - clause->tuple_count),
1780                        clause->tuple_count * sizeof(clause->tuples[0]));
1781 
1782         /* Use passthrough register for cross-tuple accesses. Note this is
1783          * after the memmove, so this is forwards. Skip the first tuple since
1784          * there is nothing before it to passthrough */
1785 
1786         for (unsigned t = 1; t < clause->tuple_count; ++t)
1787                 bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);
1788 
1789         return clause;
1790 }
1791 
1792 static void
1793 bi_schedule_block(bi_context *ctx, bi_block *block)
1794 {
1795         list_inithead(&block->clauses);
1796 
1797         /* Copy list to dynamic array */
1798         struct bi_worklist st = bi_initialize_worklist(block,
1799                         bifrost_debug & BIFROST_DBG_INORDER);
1800 
1801         if (!st.count) {
1802                 bi_free_worklist(st);
1803                 return;
1804         }
1805 
1806         /* We need to track liveness during scheduling in order to determine whether we can use temporary (passthrough) registers */
1807         uint64_t live = block->reg_live_out;
1808 
1809         /* Schedule as many clauses as needed to fill the block */
1810         bi_clause *u = NULL;
1811         while((u = bi_schedule_clause(ctx, block, st, &live)))
1812                 list_add(&u->link, &block->clauses);
1813 
1814         /* Back-to-back bit affects only the last clause of a block,
1815          * the rest are implicitly true */
1816         if (!list_is_empty(&block->clauses)) {
1817                 bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
1818                 if (bi_reconverge_branches(block))
1819                         last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
1820         }
1821 
1822         /* Reorder instructions to match the new schedule. First remove
1823          * existing instructions and then recreate the list */
1824 
1825         bi_foreach_instr_in_block_safe(block, ins) {
1826                 list_del(&ins->link);
1827         }
1828 
1829         bi_foreach_clause_in_block(block, clause) {
1830                 for (unsigned i = 0; i < clause->tuple_count; ++i)  {
1831                         bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
1832                                 list_addtail(&ins->link, &block->instructions);
1833                         }
1834                 }
1835         }
1836 
1837         block->scheduled = true;
1838 
1839 #ifndef NDEBUG
1840         unsigned i;
1841         bool incomplete = false;
1842 
1843         BITSET_FOREACH_SET(i, st.worklist, st.count) {
1844                 bi_print_instr(st.instructions[i], stderr);
1845                 incomplete = true;
1846         }
1847 
1848         if (incomplete)
1849                 unreachable("The above instructions failed to schedule.");
1850 #endif
1851 
1852         bi_free_worklist(st);
1853 }
1854 
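     /* Checks whether ins->src[s] fits the constant/FAU state accumulated so
      * far for this instruction (roughly: at most two distinct 32-bit
      * constants, or reads from a single FAU word pair), updating *constants,
      * *cwords and *fau as needed. Returns false if the source must instead
      * be lowered to a move by bi_lower_fau */
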
1855 static bool
1856 bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
1857 {
1858         bi_index src = ins->src[s];
1859 
1860         /* Staging registers can't have FAU accesses */
1861         if (s == 0 && bi_opcode_props[ins->op].sr_read)
1862                 return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);
1863 
1864         if (src.type == BI_INDEX_CONSTANT) {
1865                 /* Allow fast zero */
1866                 if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
1867                         return true;
1868 
1869                 if (!bi_is_null(*fau))
1870                         return false;
1871 
1872                 /* Else, try to inline a constant */
1873                 for (unsigned i = 0; i < *cwords; ++i) {
1874                         if (src.value == constants[i])
1875                                 return true;
1876                 }
1877 
1878                 if (*cwords >= 2)
1879                         return false;
1880 
1881                 constants[(*cwords)++] = src.value;
1882         } else if (src.type == BI_INDEX_FAU) {
1883                 if (*cwords != 0)
1884                         return false;
1885 
1886                 /* Can only read from one pair of FAU words */
1887                 if (!bi_is_null(*fau) && (src.value != fau->value))
1888                         return false;
1889 
1890                 /* If there is a target, we'll need a PC-relative constant */
1891                 if (ins->branch_target)
1892                         return false;
1893 
1894                 *fau = src;
1895         }
1896 
1897         return true;
1898 }
1899 
1900 void
1901 bi_lower_fau(bi_context *ctx)
1902 {
1903         bi_foreach_instr_global_safe(ctx, ins) {
1904                 bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
1905 
1906                 uint32_t constants[2];
1907                 unsigned cwords = 0;
1908                 bi_index fau = bi_null();
1909 
1910                 /* ATEST must have the ATEST datum encoded, not any other
1911                  * uniform. See to it this is the case. */
1912                 if (ins->op == BI_OPCODE_ATEST)
1913                         fau = ins->src[2];
1914 
1915                 bi_foreach_src(ins, s) {
1916                         if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;
1917 
1918                         bi_index copy = bi_mov_i32(&b, ins->src[s]);
1919                         ins->src[s] = bi_replace_index(ins->src[s], copy);
1920                 }
1921         }
1922 }
1923 
1924 /* Only v7 allows specifying a dependency on the tilebuffer for the first
1925  * clause of a shader. v6 requires adding a NOP clause with the dependency. */
1926 
1927 static void
1928 bi_add_nop_for_atest(bi_context *ctx)
1929 {
1930         /* Only needed on v6 */
1931         if (ctx->arch >= 7)
1932                 return;
1933 
1934         if (list_is_empty(&ctx->blocks))
1935                 return;
1936 
1937         /* Fetch the first clause of the shader */
1938         bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
1939         bi_clause *clause = bi_next_clause(ctx, block, NULL);
1940 
1941         if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
1942                                                  (1 << BIFROST_SLOT_ELDEST_COLOUR))))
1943                 return;
1944 
1945         /* Add a NOP so we can wait for the dependencies required by the first
1946          * clause */
1947 
1948         bi_instr *I = rzalloc(ctx, bi_instr);
1949         I->op = BI_OPCODE_NOP;
1950         I->dest[0] = bi_null();
1951 
1952         bi_clause *new_clause = ralloc(ctx, bi_clause);
1953         *new_clause = (bi_clause) {
1954                 .flow_control = BIFROST_FLOW_NBTB,
1955                 .next_clause_prefetch = true,
1956                 .block = clause->block,
1957 
1958                 .tuple_count = 1,
1959                 .tuples[0] = { .fma = I, },
1960         };
1961 
1962         list_add(&new_clause->link, &clause->block->clauses);
1963 }
1964 
1965 void
1966 bi_schedule(bi_context *ctx)
1967 {
1968         /* Fed into both scheduling and DCE */
1969         bi_postra_liveness(ctx);
1970 
1971         bi_foreach_block(ctx, block) {
1972                 bi_schedule_block(ctx, block);
1973         }
1974 
1975         bi_opt_dce_post_ra(ctx);
1976         bi_add_nop_for_atest(ctx);
1977 }
1978