1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  *
23  * Authors (Collabora):
24  *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
25  */
26 
27 #include "compiler.h"
28 #include "bi_builder.h"
29 
30 /* Arguments common to worklist, passed by value for convenience */
31 
32 struct bi_worklist {
33         /* # of instructions in the block */
34         unsigned count;
35 
36         /* Instructions in the block */
37         bi_instr **instructions;
38 
39         /* Bitset of instructions in the block ready for scheduling */
40         BITSET_WORD *worklist;
41 
42         /* The backwards dependency graph. nr_dependencies is the number of
43          * unscheduled instructions that must still be scheduled after (before)
44          * this instruction. dependents are which instructions need to be
45          * scheduled before (after) this instruction. */
46         unsigned *dep_counts;
47         BITSET_WORD **dependents;
48 };
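
/* Typical flow (see bi_initialize_worklist / bi_update_worklist below): the
 * block is flattened to an array, the dependency graph is built, the worklist
 * is seeded with instructions that have no outstanding dependencies, and
 * instructions are then popped bottom-up, releasing their dependents as the
 * corresponding dependency counts reach zero. */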
49 
50 /* State of a single tuple and clause under construction */
51 
52 struct bi_reg_state {
53         /* Number of register writes */
54         unsigned nr_writes;
55 
56         /* Register reads, expressed as (equivalence classes of)
57          * sources. Only 3 reads are allowed, but up to 2 may spill as
58          * "forced" for the next scheduled tuple, provided such a tuple
59          * can be constructed */
60         bi_index reads[5];
61         unsigned nr_reads;
62 
63         /* The previous tuple scheduled (= the next tuple executed in the
64          * program) may require certain writes, in order to bypass the register
65          * file and use a temporary passthrough for the value. Up to 2 such
66          * constraints are architecturally satisfiable */
67         unsigned forced_count;
68         bi_index forceds[2];
69 };
70 
71 struct bi_tuple_state {
72         /* Is this the last tuple in the clause */
73         bool last;
74 
75         /* Scheduled ADD instruction, or null if none */
76         bi_instr *add;
77 
78         /* Reads for previous (succeeding) tuple */
79         bi_index prev_reads[5];
80         unsigned nr_prev_reads;
81         bi_tuple *prev;
82 
83         /* Register slot state for current tuple */
84         struct bi_reg_state reg;
85 
86         /* Constants are shared in the tuple. If constant_count is nonzero, it
87          * is the number of 32-bit constants used. Otherwise, fau is the slot read from
88          * FAU, or zero if none is assigned. Ordinarily FAU slot 0 reads zero,
89          * but within a tuple, that should be encoded as constant_count != 0
90          * and constants[0] = constants[1] = 0 */
91         unsigned constant_count;
92 
93         union {
94                 uint32_t constants[2];
95                 enum bir_fau fau;
96         };
97 
98         unsigned pcrel_idx;
99 };
100 
101 struct bi_const_state {
102         unsigned constant_count;
103         bool pcrel; /* applies to first const */
104         uint32_t constants[2];
105 
106         /* Index of the constant into the clause */
107         unsigned word_idx;
108 };
109 
110 enum bi_ftz_state {
111         /* No flush-to-zero state assigned yet */
112         BI_FTZ_STATE_NONE,
113 
114         /* Never flush-to-zero */
115         BI_FTZ_STATE_DISABLE,
116 
117         /* Always flush-to-zero */
118         BI_FTZ_STATE_ENABLE,
119 };
120 
121 struct bi_clause_state {
122         /* Has a message-passing instruction already been assigned? */
123         bool message;
124 
125         /* Indices already accessed; this needs to be tracked to avoid hazards
126          * around message-passing instructions */
127         unsigned access_count;
128         bi_index accesses[(BI_MAX_SRCS + BI_MAX_DESTS) * 16];
129 
130         unsigned tuple_count;
131         struct bi_const_state consts[8];
132 
133         /* Numerical state of the clause */
134         enum bi_ftz_state ftz;
135 };
136 
137 /* Determines message type by checking the table and a few special cases. The
138  * only missing case is tilebuffer instructions that access depth/stencil, which
139  * require a Z_STENCIL message (to implement
140  * ARM_shader_framebuffer_fetch_depth_stencil) */
141 
142 static enum bifrost_message_type
143 bi_message_type_for_instr(bi_instr *ins)
144 {
145         enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
146         bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);
147 
148         if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
149                 return BIFROST_MESSAGE_Z_STENCIL;
150 
151         if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
152                 return BIFROST_MESSAGE_ATTRIBUTE;
153 
154         return msg;
155 }
156 
157 /* Attribute, texture, and UBO load (attribute message) instructions support
158  * bindless, so just check the message type */
159 
160 ASSERTED static bool
161 bi_supports_dtsel(bi_instr *ins)
162 {
163         switch (bi_message_type_for_instr(ins)) {
164         case BIFROST_MESSAGE_ATTRIBUTE:
165                 return ins->op != BI_OPCODE_LD_GCLK_U64;
166         case BIFROST_MESSAGE_TEX:
167                 return true;
168         default:
169                 return false;
170         }
171 }
172 
173 /* Adds an edge to the dependency graph */
174 
175 static void
176 bi_push_dependency(unsigned parent, unsigned child,
177                 BITSET_WORD **dependents, unsigned *dep_counts)
178 {
179         if (!BITSET_TEST(dependents[parent], child)) {
180                 BITSET_SET(dependents[parent], child);
181                 dep_counts[child]++;
182         }
183 }
184 
185 static void
186 add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
187                 BITSET_WORD **dependents, unsigned *dep_counts)
188 {
189         assert(index < 64);
190         util_dynarray_foreach(table + index, unsigned, parent)
191                 bi_push_dependency(*parent, child, dependents, dep_counts);
192 }
193 
194 static void
195 mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
196 {
197         assert(index < 64);
198         util_dynarray_append(&table[index], unsigned, parent);
199 }
200 
201 static bool
202 bi_is_sched_barrier(bi_instr *I)
203 {
204         switch (I->op) {
205         case BI_OPCODE_BARRIER:
206         case BI_OPCODE_DISCARD_F32:
207                 return true;
208         default:
209                 return false;
210         }
211 }
212 
213 static void
214 bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
215 {
216         struct util_dynarray last_read[64], last_write[64];
217 
218         for (unsigned i = 0; i < 64; ++i) {
219                 util_dynarray_init(&last_read[i], NULL);
220                 util_dynarray_init(&last_write[i], NULL);
221         }
222 
223         /* Initialize dependency graph */
224         for (unsigned i = 0; i < st.count; ++i) {
225                 st.dependents[i] =
226                         calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
227 
228                 st.dep_counts[i] = 0;
229         }
230 
231         unsigned prev_msg = ~0;
232 
233         /* Populate dependency graph */
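        /* The walk is backwards, so the read/write tables describe later
         * instructions: every edge points from a later instruction (parent)
         * to an earlier one (child), matching the bottom-up scheduling
         * order. */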
234         for (signed i = st.count - 1; i >= 0; --i) {
235                 bi_instr *ins = st.instructions[i];
236 
237                 bi_foreach_src(ins, s) {
238                         if (ins->src[s].type != BI_INDEX_REGISTER) continue;
239                         unsigned count = bi_count_read_registers(ins, s);
240 
241                         for (unsigned c = 0; c < count; ++c)
242                                 add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
243                 }
244 
245                 /* Keep message-passing ops in order. (This pass only cares
246                  * about bundling; reordering of message-passing instructions
247                  * happens during earlier scheduling.) */
248 
249                 if (bi_message_type_for_instr(ins)) {
250                         if (prev_msg != ~0)
251                                 bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);
252 
253                         prev_msg = i;
254                 }
255 
256                 /* Handle schedule barriers, adding all the deps */
257                 if (inorder || bi_is_sched_barrier(ins)) {
258                         for (unsigned j = 0; j < st.count; ++j) {
259                                 if (i == j) continue;
260 
261                                 bi_push_dependency(MAX2(i, j), MIN2(i, j),
262                                                 st.dependents, st.dep_counts);
263                         }
264                 }
265 
266                 bi_foreach_dest(ins, d) {
267                         if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
268                         unsigned dest = ins->dest[d].value;
269 
270                         unsigned count = bi_count_write_registers(ins, d);
271 
272                         for (unsigned c = 0; c < count; ++c) {
273                                 add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
274                                 add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
275                                 mark_access(last_write, dest + c, i);
276                         }
277                 }
278 
279                 /* Blend shaders are allowed to clobber R0-R15. Treat these
280                  * registers like extra destinations for scheduling purposes.
281                  */
282                 if (ins->op == BI_OPCODE_BLEND && !is_blend) {
283                         for (unsigned c = 0; c < 16; ++c) {
284                                 add_dependency(last_read, c, i, st.dependents, st.dep_counts);
285                                 add_dependency(last_write, c, i, st.dependents, st.dep_counts);
286                                 mark_access(last_write, c, i);
287                         }
288                 }
289 
290                 bi_foreach_src(ins, s) {
291                         if (ins->src[s].type != BI_INDEX_REGISTER) continue;
292 
293                         unsigned count = bi_count_read_registers(ins, s);
294 
295                         for (unsigned c = 0; c < count; ++c)
296                                 mark_access(last_read, ins->src[s].value + c, i);
297                 }
298         }
299 
300         /* If there is a branch, all instructions depend on it, as interblock
301          * execution must be purely in-order */
302 
303         bi_instr *last = st.instructions[st.count - 1];
304         if (last->branch_target || last->op == BI_OPCODE_JUMP) {
305                 for (signed i = st.count - 2; i >= 0; --i)
306                         bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
307         }
308 
309         /* Free the intermediate structures */
310         for (unsigned i = 0; i < 64; ++i) {
311                 util_dynarray_fini(&last_read[i]);
312                 util_dynarray_fini(&last_write[i]);
313         }
314 }
315 
316 /* Scheduler pseudoinstruction lowerings to enable instruction pairings.
317  * Currently only supports CUBEFACE -> *CUBEFACE1/+CUBEFACE2
318  */
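
/* A sketch of the rewrite performed below (operand names are illustrative):
 *
 *    CUBEFACE     d0, d1, s0, s1, s2
 * becomes the pair
 *    *CUBEFACE1   d0, s0, s1, s2
 *    +CUBEFACE2   d1, d0
 *
 * with +CUBEFACE2 consuming the *CUBEFACE1 result. */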
319 
320 static bi_instr *
321 bi_lower_cubeface(bi_context *ctx,
322                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
323 {
324         bi_instr *pinstr = tuple->add;
325         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
326         bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
327                         pinstr->src[0], pinstr->src[1], pinstr->src[2]);
328 
329         pinstr->op = BI_OPCODE_CUBEFACE2;
330         pinstr->dest[0] = pinstr->dest[1];
331         pinstr->dest[1] = bi_null();
332         pinstr->src[0] = cubeface1->dest[0];
333         pinstr->src[1] = bi_null();
334         pinstr->src[2] = bi_null();
335 
336         return cubeface1;
337 }
338 
339 /* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
340  * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
341  * arguments (rbase, address lo, address hi, rbase) */
342 
343 static bi_instr *
344 bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
345                 bi_tuple_state *tuple)
346 {
347         bi_instr *pinstr = tuple->add;
348         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
349         bi_instr *atom_c = bi_atom_c_return_i32(&b,
350                         pinstr->src[1], pinstr->src[2], pinstr->src[0],
351                         pinstr->atom_opc);
352 
353         if (bi_is_null(pinstr->dest[0]))
354                 atom_c->op = BI_OPCODE_ATOM_C_I32;
355 
356         pinstr->op = BI_OPCODE_ATOM_CX;
357         pinstr->src[3] = atom_c->src[2];
358 
359         return atom_c;
360 }
361 
362 static bi_instr *
363 bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
364                 bi_tuple_state *tuple)
365 {
366         bi_instr *pinstr = tuple->add;
367         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
368         bi_instr *atom_c = bi_atom_c1_return_i32(&b,
369                         pinstr->src[0], pinstr->src[1], pinstr->atom_opc);
370 
371         if (bi_is_null(pinstr->dest[0]))
372                 atom_c->op = BI_OPCODE_ATOM_C1_I32;
373 
374         pinstr->op = BI_OPCODE_ATOM_CX;
375         pinstr->src[2] = pinstr->src[1];
376         pinstr->src[1] = pinstr->src[0];
377         pinstr->src[3] = bi_dontcare(&b);
378         pinstr->src[0] = bi_null();
379 
380         return atom_c;
381 }
382 
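/* Splits the 64-bit SEG_ADD pseudo instruction into two 32-bit halves: the
 * FMA produced here takes src[0] and writes dest[0], while the remaining ADD
 * takes src[1] and writes the next register (dest[0] + 1). */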
383 static bi_instr *
384 bi_lower_seg_add(bi_context *ctx,
385                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
386 {
387         bi_instr *pinstr = tuple->add;
388         bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
389 
390         bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
391                         pinstr->preserve_null, pinstr->seg);
392 
393         pinstr->op = BI_OPCODE_SEG_ADD;
394         pinstr->src[0] = pinstr->src[1];
395         pinstr->src[1] = bi_null();
396 
397         assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
398         pinstr->dest[0].value += 1;
399 
400         return fma;
401 }
402 
403 static bi_instr *
404 bi_lower_dtsel(bi_context *ctx,
405                 struct bi_clause_state *clause, struct bi_tuple_state *tuple)
406 {
407         bi_instr *add = tuple->add;
408         bi_builder b = bi_init_builder(ctx, bi_before_instr(add));
409 
410         bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
411                         add->src[0], add->table);
412         add->src[0] = dtsel->dest[0];
413 
414         assert(bi_supports_dtsel(add));
415         return dtsel;
416 }
417 
418 /* Flatten linked list to array for O(1) indexing */
419 
420 static bi_instr **
421 bi_flatten_block(bi_block *block, unsigned *len)
422 {
423         if (list_is_empty(&block->instructions))
424                 return NULL;
425 
426         *len = list_length(&block->instructions);
427         bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));
428 
429         unsigned i = 0;
430 
431         bi_foreach_instr_in_block(block, ins)
432                 instructions[i++] = ins;
433 
434         return instructions;
435 }
436 
437 /* The worklist tracks instructions without outstanding dependencies. For
438  * debugging, in-order scheduling can be forced by fully serializing the
439  * dependency graph. */
440 
441 static struct bi_worklist
442 bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
443 {
444         struct bi_worklist st = { };
445         st.instructions = bi_flatten_block(block, &st.count);
446 
447         if (!st.count)
448                 return st;
449 
450         st.dependents = calloc(st.count, sizeof(st.dependents[0]));
451         st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));
452 
453         bi_create_dependency_graph(st, inorder, is_blend);
454         st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));
455 
456         for (unsigned i = 0; i < st.count; ++i) {
457                 if (st.dep_counts[i] == 0)
458                         BITSET_SET(st.worklist, i);
459         }
460 
461         return st;
462 }
463 
464 static void
465 bi_free_worklist(struct bi_worklist st)
466 {
467         free(st.dep_counts);
468         free(st.dependents);
469         free(st.instructions);
470         free(st.worklist);
471 }
472 
473 static void
474 bi_update_worklist(struct bi_worklist st, unsigned idx)
475 {
476         assert(st.dep_counts[idx] == 0);
477 
478         if (!st.dependents[idx])
479                 return;
480 
481         /* Iterate each dependent, removing its dependency on the scheduled
482          * instruction and adding it to the worklist once none remain. */
483 
484         unsigned i;
485         BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
486                 assert(st.dep_counts[i] != 0);
487                 unsigned new_deps = --st.dep_counts[i];
488 
489                 if (new_deps == 0)
490                         BITSET_SET(st.worklist, i);
491         }
492 
493         free(st.dependents[idx]);
494 }
495 
496 /* Scheduler predicates */
497 
498 /* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
499 static bool
500 bi_can_iaddc(bi_instr *ins)
501 {
502         return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
503                 ins->src[0].swizzle == BI_SWIZZLE_H01 &&
504                 ins->src[1].swizzle == BI_SWIZZLE_H01);
505 }
506 
507 /*
508  * The encoding of *FADD.v2f16 only specifies a single abs flag. All abs
509  * encodings are permitted by swapping operands; however, this scheme fails if
510  * both operands are equal. Test for this case.
511  */
512 static bool
513 bi_impacted_abs(bi_instr *I)
514 {
515         return I->src[0].abs && I->src[1].abs &&
516                bi_is_word_equiv(I->src[0], I->src[1]);
517 }
518 
519 bool
520 bi_can_fma(bi_instr *ins)
521 {
522         /* +IADD.i32 -> *IADDC.i32 */
523         if (bi_can_iaddc(ins))
524                 return true;
525 
526         /* +MUX -> *CSEL */
527         if (bi_can_replace_with_csel(ins))
528                 return true;
529 
530         /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */
531         if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins))
532                 return false;
533 
534         /* TODO: some additional fp16 constraints */
535         return bi_opcode_props[ins->op].fma;
536 }
537 
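/* True for the widened-operand combinations that +FADD.f32 cannot encode:
 * both sources are 16-to-32 widens and at least one takes the high half
 * (H11); see bi_can_add below. */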
538 static bool
539 bi_impacted_fadd_widens(bi_instr *I)
540 {
541         enum bi_swizzle swz0 = I->src[0].swizzle;
542         enum bi_swizzle swz1 = I->src[1].swizzle;
543 
544         return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
545                 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
546                 (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
547 }
548 
549 bool
550 bi_can_add(bi_instr *ins)
551 {
552         /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
553         if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
554                 return false;
555 
556         /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
557         if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
558                 return false;
559 
560         /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
561         if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
562                return false;
563 
564         /* TODO: some additional fp16 constraints */
565         return bi_opcode_props[ins->op].add;
566 }
567 
568 /* Architecturally, no single instruction has a "not last" constraint. However,
569  * pseudoinstructions writing multiple destinations (expanding to multiple
570  * paired instructions) can run afoul of the "no two writes in the last tuple"
571  * constraint, so we check for that here.
572  *
573  * Exception to the exception: TEXC, which writes to multiple sets of staging
574  * registers. Staging registers bypass the usual register write mechanism so
575  * this restriction does not apply.
576  */
577 
578 static bool
579 bi_must_not_last(bi_instr *ins)
580 {
581         return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]) &&
582                (ins->op != BI_OPCODE_TEXC);
583 }
584 
585 /* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
586  * treat it as a message-passing instruction for the purpose of scheduling
587  * despite passing no logical message. Otherwise invalid encoding faults may
588  * be raised for unknown reasons (possibly an erratum).
589  */
590 
591 bool
592 bi_must_message(bi_instr *ins)
593 {
594         return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
595                 (ins->op == BI_OPCODE_DISCARD_F32);
596 }
597 
598 static bool
599 bi_fma_atomic(enum bi_opcode op)
600 {
601         switch (op) {
602         case BI_OPCODE_ATOM_C_I32:
603         case BI_OPCODE_ATOM_C_I64:
604         case BI_OPCODE_ATOM_C1_I32:
605         case BI_OPCODE_ATOM_C1_I64:
606         case BI_OPCODE_ATOM_C1_RETURN_I32:
607         case BI_OPCODE_ATOM_C1_RETURN_I64:
608         case BI_OPCODE_ATOM_C_RETURN_I32:
609         case BI_OPCODE_ATOM_C_RETURN_I64:
610         case BI_OPCODE_ATOM_POST_I32:
611         case BI_OPCODE_ATOM_POST_I64:
612         case BI_OPCODE_ATOM_PRE_I64:
613                 return true;
614         default:
615                 return false;
616         }
617 }
618 
619 bool
620 bi_reads_zero(bi_instr *ins)
621 {
622         return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
623 }
624 
625 bool
626 bi_reads_temps(bi_instr *ins, unsigned src)
627 {
628         switch (ins->op) {
629         /* Cannot permute a temporary */
630         case BI_OPCODE_CLPER_I32:
631         case BI_OPCODE_CLPER_OLD_I32:
632                 return src != 0;
633 
634         /* ATEST isn't supposed to be restricted, but in practice it always
635          * wants to source its coverage mask input (source 0) from register 60,
636          * which won't work properly if we put the input in a temp. This
637          * requires workarounds in both RA and clause scheduling.
638          */
639         case BI_OPCODE_ATEST:
640                 return src != 0;
641 
642         case BI_OPCODE_IMULD:
643                 return false;
644         default:
645                 return true;
646         }
647 }
648 
649 static bool
650 bi_impacted_t_modifiers(bi_instr *I, unsigned src)
651 {
652         enum bi_swizzle swizzle = I->src[src].swizzle;
653 
654         switch (I->op) {
655         case BI_OPCODE_F16_TO_F32:
656         case BI_OPCODE_F16_TO_S32:
657         case BI_OPCODE_F16_TO_U32:
658         case BI_OPCODE_MKVEC_V2I16:
659         case BI_OPCODE_S16_TO_F32:
660         case BI_OPCODE_S16_TO_S32:
661         case BI_OPCODE_U16_TO_F32:
662         case BI_OPCODE_U16_TO_U32:
663                 return (swizzle != BI_SWIZZLE_H00);
664 
665         case BI_OPCODE_BRANCH_F32:
666         case BI_OPCODE_LOGB_F32:
667         case BI_OPCODE_ILOGB_F32:
668         case BI_OPCODE_FADD_F32:
669         case BI_OPCODE_FCMP_F32:
670         case BI_OPCODE_FREXPE_F32:
671         case BI_OPCODE_FREXPM_F32:
672         case BI_OPCODE_FROUND_F32:
673                 return (swizzle != BI_SWIZZLE_H01);
674 
675         case BI_OPCODE_IADD_S32:
676         case BI_OPCODE_IADD_U32:
677         case BI_OPCODE_ISUB_S32:
678         case BI_OPCODE_ISUB_U32:
679         case BI_OPCODE_IADD_V4S8:
680         case BI_OPCODE_IADD_V4U8:
681         case BI_OPCODE_ISUB_V4S8:
682         case BI_OPCODE_ISUB_V4U8:
683                 return (src == 1) && (swizzle != BI_SWIZZLE_H01);
684 
685         case BI_OPCODE_S8_TO_F32:
686         case BI_OPCODE_S8_TO_S32:
687         case BI_OPCODE_U8_TO_F32:
688         case BI_OPCODE_U8_TO_U32:
689                 return (swizzle != BI_SWIZZLE_B0000);
690 
691         case BI_OPCODE_V2S8_TO_V2F16:
692         case BI_OPCODE_V2S8_TO_V2S16:
693         case BI_OPCODE_V2U8_TO_V2F16:
694         case BI_OPCODE_V2U8_TO_V2U16:
695                 return (swizzle != BI_SWIZZLE_B0022);
696 
697         case BI_OPCODE_IADD_V2S16:
698         case BI_OPCODE_IADD_V2U16:
699         case BI_OPCODE_ISUB_V2S16:
700         case BI_OPCODE_ISUB_V2U16:
701                 return (src == 1) && (swizzle >= BI_SWIZZLE_H11);
702 
703 #if 0
704         /* Restriction on IADD in 64-bit clauses on G72 */
705         case BI_OPCODE_IADD_S64:
706         case BI_OPCODE_IADD_U64:
707                 return (src == 1) && (swizzle != BI_SWIZZLE_D0);
708 #endif
709 
710         default:
711                 return false;
712         }
713 }
714 
715 bool
716 bi_reads_t(bi_instr *ins, unsigned src)
717 {
718         /* Branch offset cannot come from passthrough */
719         if (bi_opcode_props[ins->op].branch)
720                 return src != 2;
721 
722         /* Table can never read passthrough */
723         if (bi_opcode_props[ins->op].table)
724                 return false;
725 
726         /* Staging register reads may happen before the succeeding register
727          * block encodes a write, so effectively there is no passthrough */
728         if (bi_is_staging_src(ins, src))
729                 return false;
730 
731         /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
732          * same-cycle temporaries. Check the list for these hazards. */
733         if (bi_impacted_t_modifiers(ins, src))
734                 return false;
735 
736         /* Descriptor must not come from a passthrough */
737         switch (ins->op) {
738         case BI_OPCODE_LD_CVT:
739         case BI_OPCODE_LD_TILE:
740         case BI_OPCODE_ST_CVT:
741         case BI_OPCODE_ST_TILE:
742         case BI_OPCODE_TEXC:
743                 return src != 2;
744         case BI_OPCODE_BLEND:
745                 return src != 2 && src != 3;
746 
747         /* +JUMP can't read the offset from T */
748         case BI_OPCODE_JUMP:
749                 return false;
750 
751         /* Else, just check if we can read any temps */
752         default:
753                 return bi_reads_temps(ins, src);
754         }
755 }
756 
757 /* Counts the number of 64-bit constants required by a clause. TODO: We
758  * might want to account for merging, right now we overestimate, but
759  * that's probably fine most of the time */
760 
761 static unsigned
762 bi_nconstants(struct bi_clause_state *clause)
763 {
764         unsigned count_32 = 0;
765 
766         for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
767                 count_32 += clause->consts[i].constant_count;
768 
769         return DIV_ROUND_UP(count_32, 2);
770 }
771 
772 /* Would there be space for constants if we added one tuple? */
773 
774 static bool
775 bi_space_for_more_constants(struct bi_clause_state *clause)
776 {
777         return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
778 }
779 
780 /* Updates the FAU assignment for a tuple. A valid FAU assignment must be
781  * possible (as a precondition), though not necessarily on the selected unit;
782  * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
783  * bi_instr_schedulable */
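
/* In practice this is called twice per instruction: nondestructively from
 * bi_instr_schedulable to test whether the assignment would fit, and then
 * destructively from bi_pop_instr once the instruction is actually taken. */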
784 
785 static bool
786 bi_update_fau(struct bi_clause_state *clause,
787                 struct bi_tuple_state *tuple,
788                 bi_instr *instr, bool fma, bool destructive)
789 {
790         /* Maintain our own constants, for nondestructive mode */
791         uint32_t copied_constants[2], copied_count;
792         unsigned *constant_count = &tuple->constant_count;
793         uint32_t *constants = tuple->constants;
794         enum bir_fau fau = tuple->fau;
795 
796         if (!destructive) {
797                 memcpy(copied_constants, tuple->constants,
798                                 (*constant_count) * sizeof(constants[0]));
799                 copied_count = tuple->constant_count;
800 
801                 constant_count = &copied_count;
802                 constants = copied_constants;
803         }
804 
805         bi_foreach_src(instr, s) {
806                 bi_index src = instr->src[s];
807 
808                 if (src.type == BI_INDEX_FAU) {
809                         bool no_constants = *constant_count == 0;
810                         bool no_other_fau = (fau == src.value) || !fau;
811                         bool mergable = no_constants && no_other_fau;
812 
813                         if (destructive) {
814                                 assert(mergable);
815                                 tuple->fau = src.value;
816                         } else if (!mergable) {
817                                 return false;
818                         }
819 
820                         fau = src.value;
821                 } else if (src.type == BI_INDEX_CONSTANT) {
822                         /* No need to reserve space if we have a fast 0 */
823                         if (src.value == 0 && fma && bi_reads_zero(instr))
824                                 continue;
825 
826                         /* If there is a branch target, #0 by convention is the
827                          * PC-relative offset to the target */
828                         bool pcrel = instr->branch_target && src.value == 0;
829                         bool found = false;
830 
831                         for (unsigned i = 0; i < *constant_count; ++i) {
832                                 found |= (constants[i] == src.value) &&
833                                         (i != tuple->pcrel_idx);
834                         }
835 
836                         /* pcrel constants are unique, so don't match */
837                         if (found && !pcrel)
838                                 continue;
839 
840                         bool no_fau = (*constant_count > 0) || !fau;
841                         bool mergable = no_fau && ((*constant_count) < 2);
842 
843                         if (destructive) {
844                                 assert(mergable);
845 
846                                 if (pcrel)
847                                         tuple->pcrel_idx = *constant_count;
848                         } else if (!mergable)
849                                 return false;
850 
851                         constants[(*constant_count)++] = src.value;
852                 }
853         }
854 
855         /* Constants per clause may be limited by tuple count */
856         bool room_for_constants = (*constant_count == 0) ||
857                 bi_space_for_more_constants(clause);
858 
859         if (destructive)
860                 assert(room_for_constants);
861         else if (!room_for_constants)
862                 return false;
863 
864         return true;
865 }
866 
867 /* Given an in-progress tuple, a candidate new instruction to add to the tuple,
868  * and a source (index) from that candidate, determine whether this source is
869  * "new", in the sense of requiring an additional read slot. That is, checks
870  * whether the specified source reads from the register file via a read slot
871  * (determined by its type and placement) and whether the source was already
872  * specified by a prior read slot (to avoid double counting) */
873 
874 static bool
875 bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
876 {
877         bi_index src = instr->src[src_idx];
878 
879         /* Only consider sources which come from the register file */
880         if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
881                 return false;
882 
883         /* Staging register reads bypass the usual register file mechanism */
884         if (bi_is_staging_src(instr, src_idx))
885                 return false;
886 
887         /* If a source is already read in the tuple, it is already counted */
888         for (unsigned t = 0; t < reg->nr_reads; ++t)
889                 if (bi_is_word_equiv(src, reg->reads[t]))
890                         return false;
891 
892         /* If a source is read in _this instruction_, it is already counted */
893         for (unsigned t = 0; t < src_idx; ++t)
894                 if (bi_is_word_equiv(src, instr->src[t]))
895                         return false;
896 
897         return true;
898 }
899 
900 /* Given two tuples in source order, count the number of register reads of the
901  * successor, determined as the number of unique words accessed that aren't
902  * written by the predecessor (since those are tempable).
903  */
904 
905 static unsigned
906 bi_count_succ_reads(bi_index t0, bi_index t1,
907                 bi_index *succ_reads, unsigned nr_succ_reads)
908 {
909         unsigned reads = 0;
910 
911         for (unsigned i = 0; i < nr_succ_reads; ++i) {
912                 bool unique = true;
913 
914                 for (unsigned j = 0; j < i; ++j)
915                         if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
916                                 unique = false;
917 
918                 if (!unique)
919                         continue;
920 
921                 if (bi_is_word_equiv(succ_reads[i], t0))
922                         continue;
923 
924                 if (bi_is_word_equiv(succ_reads[i], t1))
925                         continue;
926 
927                 reads++;
928         }
929 
930         return reads;
931 }
932 
933 /* Not all instructions can read from the staging passthrough (as determined by
934  * reads_t), check if a given pair of instructions has such a restriction. Note
935  * we also use this mechanism to prevent data races around staging register
936  * reads, so we allow the input source to potentially be vector-valued */
937 
938 static bool
939 bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
940 {
941         bi_foreach_src(add, s) {
942                 bi_index src = add->src[s];
943 
944                 if (src.type != BI_INDEX_REGISTER)
945                         continue;
946 
947                 unsigned count = bi_count_read_registers(add, s);
948                 bool read = false;
949 
950                 for (unsigned d = 0; d < count; ++d)
951                         read |= bi_is_equiv(fma, bi_register(src.value + d));
952 
953                 if (read && !bi_reads_t(add, s))
954                         return true;
955         }
956 
957         return false;
958 }
959 
960 /* Likewise for cross-tuple passthrough (reads_temps) */
961 
962 static bool
963 bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
964 {
965         bi_foreach_instr_in_tuple(succ, pins) {
966                 bi_foreach_src(pins, s) {
967                         if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
968                                         !bi_reads_temps(pins, s))
969                                 return true;
970                 }
971         }
972 
973         return false;
974 }
975 
976 /* Is a register written other than through the staging mechanism? ATEST is special,
977  * writing to both a staging register and a regular register (fixed packing).
978  * BLEND is special since it has to write r48 the normal way even if it never
979  * gets read. This depends on liveness analysis, as a register is not needed
980  * for a write that will be discarded after one tuple. */
981 
982 static unsigned
983 bi_write_count(bi_instr *instr, uint64_t live_after_temp)
984 {
985         if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
986                 return 1;
987 
988         unsigned count = 0;
989 
990         bi_foreach_dest(instr, d) {
991                 if (d == 0 && bi_opcode_props[instr->op].sr_write)
992                         continue;
993 
994                 if (bi_is_null(instr->dest[d]))
995                         continue;
996 
997                 assert(instr->dest[0].type == BI_INDEX_REGISTER);
998                 if (live_after_temp & BITFIELD64_BIT(instr->dest[0].value))
999                         count++;
1000         }
1001 
1002         return count;
1003 }
1004 
1005 /*
1006  * Test if an instruction requires flush-to-zero mode. Currently only supported
1007  * for f16<-->f32 conversions to implement fquantize16
1008  */
1009 static bool
1010 bi_needs_ftz(bi_instr *I)
1011 {
1012         return (I->op == BI_OPCODE_F16_TO_F32 ||
1013                 I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz;
1014 }
1015 
1016 /*
1017  * Test if an instruction would be numerically incompatible with the clause. At
1018  * present we only consider flush-to-zero modes.
1019  */
1020 static bool
1021 bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
1022 {
1023         return (clause->ftz != BI_FTZ_STATE_NONE) &&
1024                ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
1025 }
1026 
1027 /* Instruction placement entails two questions: what subset of instructions in
1028  * the block can legally be scheduled? and of those which is the best? That is,
1029  * we seek to minimize a cost function on a subset of the worklist satisfying a
1030  * particular predicate. The necessary predicate is determined entirely by
1031  * Bifrost's architectural limitations and is described in the accompanying
1032  * whitepaper. The cost function is a heuristic. */
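
/* Concretely, bi_instr_schedulable checks in turn: unit availability
 * (FMA/ADD), the one-message-per-clause limit, last/not-last placement
 * requirements, flush-to-zero compatibility, staging register hazards against
 * prior accesses in the clause, the FAU/constant budget, passthrough
 * restrictions, and finally the register file read/write limits. */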
1033 
1034 static bool
1035 bi_instr_schedulable(bi_instr *instr,
1036                 struct bi_clause_state *clause,
1037                 struct bi_tuple_state *tuple,
1038                 uint64_t live_after_temp,
1039                 bool fma)
1040 {
1041         /* The units must match */
1042         if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
1043                 return false;
1044 
1045         /* There can only be one message-passing instruction per clause */
1046         if (bi_must_message(instr) && clause->message)
1047                 return false;
1048 
1049         /* Some instructions have placement requirements */
1050         if (bi_opcode_props[instr->op].last && !tuple->last)
1051                 return false;
1052 
1053         if (bi_must_not_last(instr) && tuple->last)
1054                 return false;
1055 
1056         /* Numerical properties must be compatible with the clause */
1057         if (bi_numerically_incompatible(clause, instr))
1058                 return false;
1059 
1060         /* Message-passing instructions are not guaranteed to write within the
1061          * same clause (most likely they will not), so if a later instruction
1062          * in the clause accesses the destination, the message-passing
1063          * instruction can't be scheduled */
1064         if (bi_opcode_props[instr->op].sr_write) {
1065                 bi_foreach_dest(instr, d) {
1066                         if (bi_is_null(instr->dest[d]))
1067                                 continue;
1068 
1069                         unsigned nr = bi_count_write_registers(instr, d);
1070                         assert(instr->dest[d].type == BI_INDEX_REGISTER);
1071                         unsigned reg = instr->dest[d].value;
1072 
1073                         for (unsigned i = 0; i < clause->access_count; ++i) {
1074                                 bi_index idx = clause->accesses[i];
1075                                 for (unsigned d = 0; d < nr; ++d) {
1076                                         if (bi_is_equiv(bi_register(reg + d), idx))
1077                                                 return false;
1078                                 }
1079                         }
1080                 }
1081         }
1082 
1083         if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
1084                 unsigned nr = bi_count_read_registers(instr, 0);
1085                 assert(instr->src[0].type == BI_INDEX_REGISTER);
1086                 unsigned reg = instr->src[0].value;
1087 
1088                 for (unsigned i = 0; i < clause->access_count; ++i) {
1089                         bi_index idx = clause->accesses[i];
1090                         for (unsigned d = 0; d < nr; ++d) {
1091                                 if (bi_is_equiv(bi_register(reg + d), idx))
1092                                         return false;
1093                         }
1094                 }
1095         }
1096 
1097         /* If FAU is already assigned, we may not disrupt that. Do a
1098          * non-disruptive test update */
1099         if (!bi_update_fau(clause, tuple, instr, fma, false))
1100                 return false;
1101 
1102         /* If this choice of FMA would force a staging passthrough, the ADD
1103          * instruction must support such a passthrough */
1104         if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
1105                 return false;
1106 
1107         /* If this choice of destination would force a cross-tuple passthrough, the next tuple must support that */
1108         if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
1109                 return false;
1110 
1111         /* Register file writes are limited */
1112         unsigned total_writes = tuple->reg.nr_writes;
1113         total_writes += bi_write_count(instr, live_after_temp);
1114 
1115         /* Last tuple in a clause can only write a single value */
1116         if (tuple->last && total_writes > 1)
1117                 return false;
1118 
1119         /* Register file reads are limited, so count unique */
1120 
1121         unsigned unique_new_srcs = 0;
1122 
1123         bi_foreach_src(instr, s) {
1124                 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1125                         unique_new_srcs++;
1126         }
1127 
1128         unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;
1129 
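        /* Spilling to moves is treated as costing an extra tuple: require
         * constant space as if one more tuple were present (cf.
         * bi_space_for_more_constants, which uses tuple_count + 1) and a
         * stricter tuple count limit. */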
1130         bool can_spill_to_moves = (!tuple->add);
1131         can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
1132         can_spill_to_moves &= (clause->tuple_count < 7);
1133 
1134         /* However, we can get an extra 1 or 2 sources by inserting moves */
1135         if (total_srcs > (can_spill_to_moves ? 4 : 3))
1136                 return false;
1137 
1138         /* Count effective reads for the successor */
1139         unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
1140                         tuple->add ? tuple->add->dest[0] : bi_null(),
1141                         tuple->prev_reads, tuple->nr_prev_reads);
1142 
1143         /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
1144         if ((signed) total_writes > (4 - (signed) succ_reads))
1145                 return false;
1146 
1147         return true;
1148 }
1149 
1150 static signed
1151 bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
1152 {
1153         signed cost = 0;
1154 
1155         /* Instructions that can schedule to either FMA or to ADD should be
1156          * deprioritized since they're easier to reschedule elsewhere */
1157         if (bi_can_fma(instr) && bi_can_add(instr))
1158                 cost++;
1159 
1160         /* Message-passing instructions impose constraints on the registers
1161          * later in the clause, so schedule them as late within a clause as
1162          * possible (<==> prioritize them since we're backwards <==> decrease
1163          * cost) */
1164         if (bi_must_message(instr))
1165                 cost--;
1166 
1167         /* Last instructions are big constraints (XXX: no effect on shader-db) */
1168         if (bi_opcode_props[instr->op].last)
1169                 cost -= 2;
1170 
1171         return cost;
1172 }
1173 
1174 static unsigned
1175 bi_choose_index(struct bi_worklist st,
1176                 struct bi_clause_state *clause,
1177                 struct bi_tuple_state *tuple,
1178                 uint64_t live_after_temp,
1179                 bool fma)
1180 {
1181         unsigned i, best_idx = ~0;
1182         signed best_cost = INT_MAX;
1183 
1184         BITSET_FOREACH_SET(i, st.worklist, st.count) {
1185                 bi_instr *instr = st.instructions[i];
1186 
1187                 if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
1188                         continue;
1189 
1190                 signed cost = bi_instr_cost(instr, tuple);
1191 
1192                 /* Tie break in favour of later instructions, under the
1193                  * assumption this promotes temporary usage (reducing pressure
1194                  * on the register file). This is a side effect of a prepass
1195                  * that schedules for pressure. */
1196 
1197                 if (cost <= best_cost) {
1198                         best_idx = i;
1199                         best_cost = cost;
1200                 }
1201         }
1202 
1203         return best_idx;
1204 }
1205 
1206 static void
1207 bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
1208                 bi_instr *instr, uint64_t live_after_temp, bool fma)
1209 {
1210         bi_update_fau(clause, tuple, instr, fma, true);
1211 
1212         /* TODO: maybe opt a bit? or maybe doesn't matter */
1213         assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
1214         memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
1215         clause->access_count += BI_MAX_SRCS;
1216         memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
1217         clause->access_count += BI_MAX_DESTS;
1218         tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);
1219 
1220         bi_foreach_src(instr, s) {
1221                 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
1222                         tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
1223         }
1224 
1225         /* This could be optimized to allow pairing integer instructions with
1226          * special flush-to-zero instructions, but punting on this until we have
1227          * a workload that cares.
1228          */
1229         clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE :
1230                                             BI_FTZ_STATE_DISABLE;
1231 }
1232 
1233 /* Choose the best instruction and pop it off the worklist. Returns NULL if no
1234  * instruction is available. This function is destructive. */
1235 
1236 static bi_instr *
1237 bi_take_instr(bi_context *ctx, struct bi_worklist st,
1238                 struct bi_clause_state *clause,
1239                 struct bi_tuple_state *tuple,
1240                 uint64_t live_after_temp,
1241                 bool fma)
1242 {
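        /* If the ADD slot already holds a pseudo instruction that must expand
         * to an FMA/ADD pair, emit the matching FMA half here instead of
         * picking from the worklist. */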
1243         if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
1244                 return bi_lower_cubeface(ctx, clause, tuple);
1245         else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM_RETURN_I32)
1246                 return bi_lower_atom_c(ctx, clause, tuple);
1247         else if (tuple->add && tuple->add->op == BI_OPCODE_ATOM1_RETURN_I32)
1248                 return bi_lower_atom_c1(ctx, clause, tuple);
1249         else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
1250                 return bi_lower_seg_add(ctx, clause, tuple);
1251         else if (tuple->add && tuple->add->table)
1252                 return bi_lower_dtsel(ctx, clause, tuple);
1253 
1254         /* TODO: Optimize these moves */
1255         if (!fma && tuple->nr_prev_reads > 3) {
1256                 /* Only spill by one source for now */
1257                 assert(tuple->nr_prev_reads == 4);
1258 
1259                 /* Pick a source to spill */
1260                 bi_index src = tuple->prev_reads[0];
1261 
1262                 /* Schedule the spill */
1263                 bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
1264                 bi_instr *mov = bi_mov_i32_to(&b, src, src);
1265                 bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
1266                 return mov;
1267         }
1268 
1269 #ifndef NDEBUG
1270         /* Don't pair instructions if debugging */
1271         if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
1272                 return NULL;
1273 #endif
1274 
1275         unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);
1276 
1277         if (idx >= st.count)
1278                 return NULL;
1279 
1280         /* Update state to reflect taking the instruction */
1281         bi_instr *instr = st.instructions[idx];
1282 
1283         BITSET_CLEAR(st.worklist, idx);
1284         bi_update_worklist(st, idx);
1285         bi_pop_instr(clause, tuple, instr, live_after_temp, fma);
1286 
1287         /* Fixups */
1288         if (instr->op == BI_OPCODE_IADD_U32 && fma) {
1289                 assert(bi_can_iaddc(instr));
1290                 instr->op = BI_OPCODE_IADDC_I32;
1291                 instr->src[2] = bi_zero();
1292         } else if (fma && bi_can_replace_with_csel(instr)) {
1293                 bi_replace_mux_with_csel(instr, false);
1294         }
1295 
1296         return instr;
1297 }
1298 
1299 /* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
1300  * to a passthrough register. If except_sr is true, the staging sources are
1301  * skipped, so staging register reads are not accidentally encoded as
1302  * passthrough (which is impossible) */
1303 
1304 static void
1305 bi_use_passthrough(bi_instr *ins, bi_index old,
1306                 enum bifrost_packed_src new,
1307                 bool except_sr)
1308 {
1309         /* Optional for convenience */
1310         if (!ins || bi_is_null(old))
1311                 return;
1312 
1313         bi_foreach_src(ins, i) {
1314                 if ((i == 0 || i == 4) && except_sr)
1315                         continue;
1316 
1317                 if (bi_is_word_equiv(ins->src[i], old)) {
1318                         ins->src[i].type = BI_INDEX_PASS;
1319                         ins->src[i].value = new;
1320                         ins->src[i].reg = false;
1321                         ins->src[i].offset = 0;
1322                 }
1323         }
1324 }
1325 
1326 /* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
1327  * intertuple passthroughs where necessary. Passthroughs are allowed as a
1328  * post-condition of scheduling. Note we rewrite ADD first, FMA second --
1329  * opposite the order of execution. This is deliberate -- if both FMA and ADD
1330  * write to the same logical register, the next executed tuple will get the
1331  * latter result. There's no interference issue under the assumption of correct
1332  * register allocation. */
1333 
1334 static void
1335 bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
1336 {
1337         bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;
1338 
1339         if (prec.add) {
1340                 bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
1341                 bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
1342         }
1343 
1344         if (prec.fma) {
1345                 bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
1346                 bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
1347         }
1348 }
1349 
1350 static void
1351 bi_rewrite_fau_to_pass(bi_tuple *tuple)
1352 {
1353         bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1354                 if (ins->src[s].type != BI_INDEX_FAU) continue;
1355 
1356                 bi_index pass = bi_passthrough(ins->src[s].offset ?
1357                                 BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);
1358 
1359                 ins->src[s] = bi_replace_index(ins->src[s], pass);
1360         }
1361 }
1362 
1363 static void
1364 bi_rewrite_zero(bi_instr *ins, bool fma)
1365 {
1366         bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);
1367 
1368         bi_foreach_src(ins, s) {
1369                 bi_index src = ins->src[s];
1370 
1371                 if (src.type == BI_INDEX_CONSTANT && src.value == 0)
1372                         ins->src[s] = bi_replace_index(src, zero);
1373         }
1374 }
1375 
1376 /* Assumes #0 to {T, FAU} rewrite has already occurred */
1377 
1378 static void
1379 bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
1380 {
1381         bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
1382                 if (ins->src[s].type != BI_INDEX_CONSTANT) continue;
1383 
1384                 uint32_t cons = ins->src[s].value;
1385 
1386                 ASSERTED bool lo = (cons == (constant & 0xffffffff));
1387                 bool hi = (cons == (constant >> 32ull));
1388 
1389                 /* PC offsets always live in the upper half, set to zero by
1390                  * convention before pack time. (This is safe, since if you
1391                  * wanted to compare against zero, you would use a BRANCHZ
1392                  * instruction instead.) */
1393                 if (cons == 0 && ins->branch_target != NULL) {
1394                         assert(pcrel);
1395                         hi = true;
1396                         lo = false;
1397                 } else if (pcrel) {
1398                         hi = false;
1399                 }
1400 
1401                 assert(lo || hi);
1402 
1403                 ins->src[s] = bi_replace_index(ins->src[s],
1404                                 bi_passthrough(hi ?  BIFROST_SRC_FAU_HI :
1405                                         BIFROST_SRC_FAU_LO));
1406         }
1407 }
1408 
1409 /* Constructs a constant state given a tuple state. The postcondition is
1410  * that pcrel applies to the first constant; since PC-relative constants
1411  * are #0 by convention at this point, the constants are swapped if needed
1412  * to match */
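/* Example: a tuple whose ADD is a branch and whose constants are
 * { 0x100, 0x0 } reports pcrel, so the constants are swapped to
 * { 0x0, 0x100 }, leaving the PC-relative #0 in the first slot. */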
1413 
1414 static struct bi_const_state
1415 bi_get_const_state(struct bi_tuple_state *tuple)
1416 {
1417         struct bi_const_state consts = {
1418                 .constant_count = tuple->constant_count,
1419                 .constants[0] = tuple->constants[0],
1420                 .constants[1] = tuple->constants[1],
1421                 .pcrel = tuple->add && tuple->add->branch_target,
1422         };
1423 
1424         /* pcrel applies to the first constant by convention, and
1425          * PC-relative constants will be #0 by convention here, so swap
1426          * to match if needed */
1427         if (consts.pcrel && consts.constants[0]) {
1428                 assert(consts.constant_count == 2);
1429                 assert(consts.constants[1] == 0);
1430 
1431                 consts.constants[1] = consts.constants[0];
1432                 consts.constants[0] = 0;
1433         }
1434 
1435         return consts;
1436 }
1437 
1438 /* Merges constants in a clause, satisfying the following rules, assuming no
1439  * more than one tuple has pcrel:
1440  *
1441  * 1. If a tuple has two constants, they must be packed together. If one is
1442  * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
1443  * (PC << 32)]. Otherwise choose an arbitrary order.
1444  *
1445  * 2. If a tuple has one constant, it may be shared with an existing
1446  * pair that already contains that constant, or it may be combined with another
1447  * (distinct) tuple of a single constant.
1448  *
1449  * This guarantees a packing is possible. The next routine handles modification
1450  * related swapping, to satisfy format 12 and the lack of modification for
1451  * tuple count 5/8 in EC0.
1452  */
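/* Worked example (illustrative values): tuples carrying {0x10, 0x20}, {0x20},
 * {0x30} and {0x40} merge into two 64-bit words: the pair packs as
 * 0x0000002000000010 (word 0), the single 0x20 matches its high half and
 * shares word 0, and the two remaining singles combine into
 * 0x0000004000000030 (word 1). */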
1453 
1454 static uint64_t
1455 bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
1456 {
1457         /* At this point in the constant merge algorithm, pcrel constants are
1458          * treated as zero, so pcrel implies at least one constant is zero */
1459         assert(!pcrel || (c0 == 0 || c1 == 0));
1460 
1461         /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
1462         uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
1463         uint32_t lo = (c0 == hi) ? c1 : c0;
1464 
1465         /* Merge in the selected order */
1466         return lo | (((uint64_t) hi) << 32ull);
1467 }
1468 
1469 static unsigned
1470 bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
1471                 uint64_t *merged, unsigned *pcrel_pair)
1472 {
1473         unsigned merge_count = 0;
1474 
1475         for (unsigned t = 0; t < tuple_count; ++t) {
1476                 if (consts[t].constant_count != 2) continue;
1477 
1478                 unsigned idx = ~0;
1479                 uint64_t val = bi_merge_u32(consts[t].constants[0],
1480                                 consts[t].constants[1], consts[t].pcrel);
1481 
1482                 /* Skip the pcrel pair if one is already assigned: this
1483                  * pair is then not pcrel by uniqueness, so it would be a mismatch */
1484                 for (unsigned s = 0; s < merge_count; ++s) {
1485                         if (merged[s] == val && (*pcrel_pair) != s) {
1486                                 idx = s;
1487                                 break;
1488                         }
1489                 }
1490 
1491                 if (idx == ~0) {
1492                         idx = merge_count++;
1493                         merged[idx] = val;
1494 
1495                         if (consts[t].pcrel)
1496                                 (*pcrel_pair) = idx;
1497                 }
1498 
1499                 consts[t].word_idx = idx;
1500         }
1501 
1502         return merge_count;
1503 }
1504 
1505 static unsigned
1506 bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
1507                 uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
1508 {
1509         bool pending = false, pending_pcrel = false;
1510         uint32_t pending_single = 0;
1511 
1512         for (unsigned t = 0; t < tuple_count; ++t) {
1513                 if (consts[t].constant_count != 1) continue;
1514 
1515                 uint32_t val = consts[t].constants[0];
1516                 unsigned idx = ~0;
1517 
1518                 /* Try to match, but don't match pcrel with non-pcrel, even
1519                  * though we can merge a pcrel with a non-pcrel single */
1520                 for (unsigned i = 0; i < pair_count; ++i) {
1521                         bool lo = ((pairs[i] & 0xffffffff) == val);
1522                         bool hi = ((pairs[i] >> 32) == val);
1523                         bool match = (lo || hi);
1524                         match &= ((*pcrel_pair) != i);
1525                         if (match && !consts[t].pcrel) {
1526                                 idx = i;
1527                                 break;
1528                         }
1529                 }
1530 
1531                 if (idx == ~0) {
1532                         idx = pair_count;
1533 
1534                         if (pending && pending_single != val) {
1535                                 assert(!(pending_pcrel && consts[t].pcrel));
1536                                 bool pcrel = pending_pcrel || consts[t].pcrel;
1537 
1538                                 if (pcrel)
1539                                         *pcrel_pair = idx;
1540 
1541                                 pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);
1542 
1543                                 pending = pending_pcrel = false;
1544                         } else {
1545                                 pending = true;
1546                                 pending_pcrel = consts[t].pcrel;
1547                                 pending_single = val;
1548                         }
1549                 }
1550 
1551                 consts[t].word_idx = idx;
1552         }
1553 
1554         /* Shift so it works whether pending_pcrel is set or not */
1555         if (pending) {
1556                 if (pending_pcrel)
1557                         *pcrel_pair = pair_count;
1558 
1559                 pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
1560         }
1561 
1562         return pair_count;
1563 }
1564 
1565 static unsigned
1566 bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
1567 {
1568         unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
1569         return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
1570 }
1571 
1572 /* Swap two constants at word i and i+1 by swapping their actual positions and
1573  * swapping all references so the meaning of the clause is preserved */
1574 
1575 static void
1576 bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
1577 {
1578         uint64_t tmp_pair = pairs[i + 0];
1579         pairs[i + 0] = pairs[i + 1];
1580         pairs[i + 1] = tmp_pair;
1581 
1582         for (unsigned t = 0; t < 8; ++t) {
1583                 if (consts[t].word_idx == i)
1584                         consts[t].word_idx = (i + 1);
1585                 else if (consts[t].word_idx == (i + 1))
1586                         consts[t].word_idx = i;
1587         }
1588 }
1589 
1590 /* Given merged constants, one of which might be PC-relative, fix up the M
1591  * values so the PC-relative constant (if it exists) has the M1=4 modification
1592  * and other constants are used as-is (which might require swapping) */
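/* Worked example (illustrative values): for a non-PC-relative pair with top
 * nibbles A1 = 0x9 and B1 = 0x3, M1 = (16 + 9 - 3) & 0xF = 6 lies in the
 * "must swap" range 0 < M1 < 8; after swapping, A1 = 0x3 and B1 = 0x9 give
 * M1 = 10, so the constants are used unchanged. For the PC-relative constant,
 * the top nibble is instead forced to (B + 4) & 0xF so that M1 = 4. */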
1593 
1594 static unsigned
1595 bi_apply_constant_modifiers(struct bi_const_state *consts,
1596                 uint64_t *pairs, unsigned *pcrel_idx,
1597                 unsigned tuple_count, unsigned constant_count)
1598 {
1599         unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;
1600 
1601         /* Clauses with these tuple counts lack an M field for the packed EC0,
1602          * so EC0 cannot be PC-relative, which might require swapping (and
1603          * possibly adding an unused constant) to fit */
1604 
1605         if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
1606                 constant_count = MAX2(constant_count, 2);
1607                 *pcrel_idx = 1;
1608                 bi_swap_constants(consts, pairs, 0);
1609         }
1610 
1611         /* EC0 might be packed free; after that, constants are packed in pairs
1612          * (with clause format 12), with M1 values computed from the pair */
1613 
1614         for (unsigned i = start; i < constant_count; i += 2) {
1615                 bool swap = false;
1616                 bool last = (i + 1) == constant_count;
1617 
1618                 unsigned A1 = (pairs[i] >> 60);
1619                 unsigned B1 = (pairs[i + 1] >> 60);
1620 
1621                 if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
1622                         /* PC-relative constant must be E0, not E1 */
1623                         swap = (*pcrel_idx == (i + 1));
1624 
1625                         /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
1626                          * equivalent to A = (B + 4) mod 16 and that we can
1627                          * control A */
1628                         unsigned B = swap ? A1 : B1;
1629                         unsigned A = (B + 4) & 0xF;
1630                         pairs[*pcrel_idx] |= ((uint64_t) A) << 60;
1631 
1632                         /* Swapped if swap set, identity if swap not set */
1633                         *pcrel_idx = i;
1634                 } else {
1635                         /* Compute M1 value if we don't swap */
1636                         unsigned M1 = (16 + A1 - B1) & 0xF;
1637 
1638                         /* For M1 = 0 or M1 >= 8, the constants are used
1639                          * unchanged. Otherwise 0 < (A1 - B1) % 16 < 8, which
1640                          * implies (B1 - A1) % 16 >= 8, so swapping will let
1641                          * them be used unchanged */
1642                         swap = (M1 != 0) && (M1 < 8);
1643 
1644                         /* However, we can't swap the last constant, so we
1645                          * force M1 = 0 instead for this case */
1646                         if (last && swap) {
1647                                 pairs[i + 1] |= pairs[i] & (0xfull << 60);
1648                                 swap = false;
1649                         }
1650                 }
1651 
1652                 if (swap) {
1653                         assert(!last);
1654                         bi_swap_constants(consts, pairs, i);
1655                 }
1656         }
1657 
1658         return constant_count;
1659 }
1660 
1661 /* Schedule a single clause. If no instructions remain, return NULL. */
1662 
1663 static bi_clause *
1664 bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
1665 {
1666         struct bi_clause_state clause_state = { 0 };
1667         bi_clause *clause = rzalloc(ctx, bi_clause);
1668         bi_tuple *tuple = NULL;
1669 
1670         const unsigned max_tuples = ARRAY_SIZE(clause->tuples);
1671 
1672         /* TODO: Decide flow control better */
1673         clause->flow_control = BIFROST_FLOW_NBTB;
1674 
1675         /* The last clause can only write one instruction, so initialize that */
1676         struct bi_reg_state reg_state = {};
1677         bi_index prev_reads[5] = { bi_null() };
1678         unsigned nr_prev_reads = 0;
1679 
1680         /* We need to track future liveness. The main *live set tracks what is
1681          * live at the current point in the program we are scheduling, but to
1682          * determine temp eligibility, we instead want what will be live after
1683          * the next tuple in the program. If we scheduled forwards, we'd need
1684          * a crystal ball for this. Luckily we schedule backwards, so we just
1685          * delay updates to live_after_temp by an extra tuple. */
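        /* For example, while filling a tuple that will execute immediately
         * before already-scheduled tuples T1 and T2, *live holds liveness
         * just before T1, whereas live_after_temp holds liveness just before
         * T2, i.e. what is still live once T1 (the next tuple in program
         * order) has executed. */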
1686         uint64_t live_after_temp = *live;
1687         uint64_t live_next_tuple = live_after_temp;
1688 
1689         do {
1690                 struct bi_tuple_state tuple_state = {
1691                         .last = (clause->tuple_count == 0),
1692                         .reg = reg_state,
1693                         .nr_prev_reads = nr_prev_reads,
1694                         .prev = tuple,
1695                         .pcrel_idx = ~0,
1696                 };
1697 
1698                 assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
1699                 memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));
1700 
1701                 unsigned idx = max_tuples - clause->tuple_count - 1;
1702 
1703                 tuple = &clause->tuples[idx];
1704 
1705                 if (clause->message && bi_opcode_props[clause->message->op].sr_read && !bi_is_null(clause->message->src[0])) {
1706                         unsigned nr = bi_count_read_registers(clause->message, 0);
1707                         live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
1708                 }
1709 
1710                 /* Since we schedule backwards, we schedule ADD first */
1711                 tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
1712                 tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
1713                 tuple->add = tuple_state.add;
1714 
1715                 /* Update liveness from the new instructions */
1716                 if (tuple->add)
1717                         *live = bi_postra_liveness_ins(*live, tuple->add);
1718 
1719                 if (tuple->fma)
1720                         *live = bi_postra_liveness_ins(*live, tuple->fma);
1721 
1722                 /* Rotate in the new per-tuple liveness */
1723                 live_after_temp = live_next_tuple;
1724                 live_next_tuple = *live;
1725 
1726                 /* We may have a message, but only one per clause */
1727                 if (tuple->add && bi_must_message(tuple->add)) {
1728                         assert(!clause_state.message);
1729                         clause_state.message = true;
1730 
1731                         clause->message_type =
1732                                 bi_message_type_for_instr(tuple->add);
1733                         clause->message = tuple->add;
1734 
1735                         /* We don't need to set dependencies for blend shaders
1736                          * because the BLEND instruction in the fragment
1737                          * shader should have already done the wait */
1738                         if (!ctx->inputs->is_blend) {
1739                                 switch (tuple->add->op) {
1740                                 case BI_OPCODE_ATEST:
1741                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1742                                         break;
1743                                 case BI_OPCODE_LD_TILE:
1744                                 case BI_OPCODE_ST_TILE:
1745                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1746                                         break;
1747                                 case BI_OPCODE_BLEND:
1748                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
1749                                         clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
1750                                         break;
1751                                 default:
1752                                         break;
1753                                 }
1754                         }
1755                 }
1756 
1757                 clause_state.consts[idx] = bi_get_const_state(&tuple_state);
1758 
1759                 /* Before merging constants, eliminate zeroes, otherwise the
1760                  * merging will fight over the #0 that never gets read (and is
1761                  * never marked as read by update_fau) */
1762                 if (tuple->fma && bi_reads_zero(tuple->fma))
1763                         bi_rewrite_zero(tuple->fma, true);
1764 
1765                 /* Rewrite away FAU, constant write is deferred */
1766                 if (!tuple_state.constant_count) {
1767                         tuple->fau_idx = tuple_state.fau;
1768                         bi_rewrite_fau_to_pass(tuple);
1769                 }
1770 
1771                 /* Use passthrough register for cross-stage accesses. Since
1772                  * there are just FMA and ADD stages, that means we rewrite to
1773                  * passthrough the sources of the ADD that read from the
1774                  * destination of the FMA */
1775 
1776                 if (tuple->fma) {
1777                         bi_use_passthrough(tuple->add, tuple->fma->dest[0],
1778                                         BIFROST_SRC_STAGE, false);
1779                 }
1780 
1781                 /* Don't add an empty tuple, unless the worklist has nothing
1782                  * but a (pseudo)instruction failing to schedule due to a "not
1783                  * last instruction" constraint */
1784 
1785                 int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
1786                 bool not_last = (some_instruction > 0) &&
1787                         bi_must_not_last(st.instructions[some_instruction - 1]);
1788 
1789                 bool insert_empty = tuple_state.last && not_last;
1790 
1791                 if (!(tuple->fma || tuple->add || insert_empty))
1792                         break;
1793 
1794                 clause->tuple_count++;
1795 
1796                 /* Adding enough tuples might overflow constants */
1797                 if (!bi_space_for_more_constants(&clause_state))
1798                         break;
1799 
1800 #ifndef NDEBUG
1801                 /* Don't schedule more than 1 tuple if debugging */
1802                 if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
1803                         break;
1804 #endif
1805 
1806                 /* Link through the register state */
1807                 STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
1808                 memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
1809                 nr_prev_reads = tuple_state.reg.nr_reads;
1810                 clause_state.tuple_count++;
1811         } while(clause->tuple_count < 8);
1812 
1813         /* Don't schedule an empty clause */
1814         if (!clause->tuple_count)
1815                 return NULL;
1816 
1817         /* Before merging, rewrite away any tuples that read only zero */
1818         for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1819                 bi_tuple *tuple = &clause->tuples[i];
1820                 struct bi_const_state *st = &clause_state.consts[i];
1821 
1822                 if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
1823                         continue;
1824 
1825                 bi_foreach_instr_in_tuple(tuple, ins)
1826                         bi_rewrite_zero(ins, false);
1827 
1828                 /* Constant has been demoted to FAU, so don't pack it separately */
1829                 st->constant_count = 0;
1830 
1831                 /* Default */
1832                 assert(tuple->fau_idx == BIR_FAU_ZERO);
1833         }
1834 
1835         uint64_t constant_pairs[8] = { 0 };
1836         unsigned pcrel_idx = ~0;
1837         unsigned constant_words =
1838                 bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);
1839 
1840         constant_words = bi_apply_constant_modifiers(clause_state.consts,
1841                         constant_pairs, &pcrel_idx, clause->tuple_count,
1842                         constant_words);
1843 
1844         clause->pcrel_idx = pcrel_idx;
1845 
1846         for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
1847                 bi_tuple *tuple = &clause->tuples[i];
1848 
1849                 /* If no constants, leave FAU as it is, possibly defaulting to 0 */
1850                 if (clause_state.consts[i].constant_count == 0)
1851                         continue;
1852 
1853                 /* FAU is already handled */
1854                 assert(!tuple->fau_idx);
1855 
1856                 unsigned word_idx = clause_state.consts[i].word_idx;
1857                 assert(word_idx <= 8);
1858 
1859                 /* We could try to merge regardless of bottom bits as well, but
1860                  * that's probably diminishing returns */
1861                 uint64_t pair = constant_pairs[word_idx];
1862                 unsigned lo = pair & 0xF;
1863 
1864                 tuple->fau_idx = bi_constant_field(word_idx) | lo;
1865                 bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
1866         }
1867 
1868         clause->constant_count = constant_words;
1869         memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));
1870 
1871         /* Branches must be last, so this can be factored out */
1872         bi_instr *last = clause->tuples[max_tuples - 1].add;
1873         clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
1874         clause->block = block;
1875 
1876         clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE);
1877 
1878         /* We emitted in reverse, to the back of the tuples array, so move
1879          * the tuples up front for easy indexing */
1880         memmove(clause->tuples,
1881                        clause->tuples + (max_tuples - clause->tuple_count),
1882                        clause->tuple_count * sizeof(clause->tuples[0]));
1883 
1884         /* Use passthrough register for cross-tuple accesses. Note this is
1885          * after the memmove, so this is forwards. Skip the first tuple since
1886          * there is nothing before it to passthrough */
1887 
1888         for (unsigned t = 1; t < clause->tuple_count; ++t)
1889                 bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);
1890 
1891         return clause;
1892 }
1893 
1894 static void
1895 bi_schedule_block(bi_context *ctx, bi_block *block)
1896 {
1897         list_inithead(&block->clauses);
1898 
1899         /* Copy list to dynamic array */
1900         struct bi_worklist st = bi_initialize_worklist(block,
1901                         bifrost_debug & BIFROST_DBG_INORDER,
1902                         ctx->inputs->is_blend);
1903 
1904         if (!st.count) {
1905                 bi_free_worklist(st);
1906                 return;
1907         }
1908 
1909         /* We need to track liveness during scheduling in order to determine
              * whether we can use temporary (passthrough) registers */
1910         uint64_t live = block->reg_live_out;
1911 
1912         /* Schedule as many clauses as needed to fill the block */
1913         bi_clause *u = NULL;
1914         while((u = bi_schedule_clause(ctx, block, st, &live)))
1915                 list_add(&u->link, &block->clauses);
1916 
1917         /* The back-to-back bit affects only the last clause of a block;
1918          * for the rest it is implicitly true */
1919         if (!list_is_empty(&block->clauses)) {
1920                 bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
1921                 if (bi_reconverge_branches(block))
1922                         last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
1923         }
1924 
1925         /* Reorder instructions to match the new schedule. First remove
1926          * existing instructions and then recreate the list */
1927 
1928         bi_foreach_instr_in_block_safe(block, ins) {
1929                 list_del(&ins->link);
1930         }
1931 
1932         bi_foreach_clause_in_block(block, clause) {
1933                 for (unsigned i = 0; i < clause->tuple_count; ++i)  {
1934                         bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
1935                                 list_addtail(&ins->link, &block->instructions);
1936                         }
1937                 }
1938         }
1939 
1940         block->scheduled = true;
1941 
1942 #ifndef NDEBUG
1943         unsigned i;
1944         bool incomplete = false;
1945 
1946         BITSET_FOREACH_SET(i, st.worklist, st.count) {
1947                 bi_print_instr(st.instructions[i], stderr);
1948                 incomplete = true;
1949         }
1950 
1951         if (incomplete)
1952                 unreachable("The above instructions failed to schedule.");
1953 #endif
1954 
1955         bi_free_worklist(st);
1956 }
1957 
1958 static bool
1959 bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
1960 {
1961         bi_index src = ins->src[s];
1962 
1963         /* Staging registers can't have FAU accesses */
1964         if (bi_is_staging_src(ins, s))
1965                 return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);
1966 
1967         if (src.type == BI_INDEX_CONSTANT) {
1968                 /* Allow fast zero */
1969                 if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
1970                         return true;
1971 
1972                 if (!bi_is_null(*fau))
1973                         return false;
1974 
1975                 /* Else, try to inline a constant */
1976                 for (unsigned i = 0; i < *cwords; ++i) {
1977                         if (src.value == constants[i])
1978                                 return true;
1979                 }
1980 
1981                 if (*cwords >= 2)
1982                         return false;
1983 
1984                 constants[(*cwords)++] = src.value;
1985         } else if (src.type == BI_INDEX_FAU) {
1986                 if (*cwords != 0)
1987                         return false;
1988 
1989                 /* Can only read from one pair of FAU words */
1990                 if (!bi_is_null(*fau) && (src.value != fau->value))
1991                         return false;
1992 
1993                 /* If there is a target, we'll need a PC-relative constant */
1994                 if (ins->branch_target)
1995                         return false;
1996 
1997                 *fau = src;
1998         }
1999 
2000         return true;
2001 }
2002 
2003 void
2004 bi_lower_fau(bi_context *ctx)
2005 {
2006         bi_foreach_instr_global_safe(ctx, ins) {
2007                 bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
2008 
2009                 uint32_t constants[2];
2010                 unsigned cwords = 0;
2011                 bi_index fau = bi_null();
2012 
2013                 /* ATEST must have the ATEST datum encoded, not any other
2014                  * uniform. See to it this is the case. */
2015                 if (ins->op == BI_OPCODE_ATEST)
2016                         fau = ins->src[2];
2017 
2018                 /* Dual texturing requires the texture operation descriptor
2019                  * to be encoded as an immediate so we can fix it up.
2020                  */
2021                 if (ins->op == BI_OPCODE_TEXC) {
2022                         assert(ins->src[3].type == BI_INDEX_CONSTANT);
2023                         constants[cwords++] = ins->src[3].value;
2024                 }
2025 
2026                 bi_foreach_src(ins, s) {
2027                         if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;
2028 
2029                         bi_index copy = bi_mov_i32(&b, ins->src[s]);
2030                         ins->src[s] = bi_replace_index(ins->src[s], copy);
2031                 }
2032         }
2033 }
2034 
2035 /* Only v7 allows specifying a dependency on the tilebuffer for the first
2036  * clause of a shader. v6 requires adding a NOP clause with the dependency. */
2037 
2038 static void
2039 bi_add_nop_for_atest(bi_context *ctx)
2040 {
2041         /* Only needed on v6 */
2042         if (ctx->arch >= 7)
2043                 return;
2044 
2045         if (list_is_empty(&ctx->blocks))
2046                 return;
2047 
2048         /* Fetch the first clause of the shader */
2049         bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
2050         bi_clause *clause = bi_next_clause(ctx, block, NULL);
2051 
2052         if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
2053                                                  (1 << BIFROST_SLOT_ELDEST_COLOUR))))
2054                 return;
2055 
2056         /* Add a NOP so we can wait for the dependencies required by the first
2057          * clause */
2058 
2059         bi_instr *I = rzalloc(ctx, bi_instr);
2060         I->op = BI_OPCODE_NOP;
2061         I->dest[0] = bi_null();
2062 
2063         bi_clause *new_clause = ralloc(ctx, bi_clause);
2064         *new_clause = (bi_clause) {
2065                 .flow_control = BIFROST_FLOW_NBTB,
2066                 .next_clause_prefetch = true,
2067                 .block = clause->block,
2068 
2069                 .tuple_count = 1,
2070                 .tuples[0] = { .fma = I, },
2071         };
2072 
2073         list_add(&new_clause->link, &clause->block->clauses);
2074 }
2075 
2076 void
2077 bi_schedule(bi_context *ctx)
2078 {
2079         /* Fed into both scheduling and DCE */
2080         bi_postra_liveness(ctx);
2081 
2082         bi_foreach_block(ctx, block) {
2083                 bi_schedule_block(ctx, block);
2084         }
2085 
2086         bi_opt_dce_post_ra(ctx);
2087         bi_add_nop_for_atest(ctx);
2088 }
2089