1 /*
2 * Copyright (C) 2020 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "compiler.h"
25 #include "bi_quirks.h"
26
27 /* This file contains the final passes of the compiler. Running after
28 * scheduling and RA, the IR is now finalized, so we need to emit it to actual
29 * bits on the wire (as well as fixup branches) */
30
31 static uint64_t
bi_pack_header(bi_clause * clause,bi_clause * next_1,bi_clause * next_2)32 bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
33 {
34 /* next_dependencies are the union of the dependencies of successors'
35 * dependencies */
36
37 unsigned dependency_wait = next_1 ? next_1->dependencies : 0;
38 dependency_wait |= next_2 ? next_2->dependencies : 0;
39
40 /* Signal barriers (slot #7) immediately. This is not optimal but good
41 * enough. Doing better requires extending the IR and scheduler.
42 */
43 if (clause->message_type == BIFROST_MESSAGE_BARRIER)
44 dependency_wait |= BITFIELD_BIT(7);
45
46 bool staging_barrier = next_1 ? next_1->staging_barrier : false;
47 staging_barrier |= next_2 ? next_2->staging_barrier : 0;
48
49 struct bifrost_header header = {
50 .flow_control =
51 (next_1 == NULL && next_2 == NULL) ?
52 BIFROST_FLOW_END : clause->flow_control,
53 .terminate_discarded_threads = clause->td,
54 .next_clause_prefetch = clause->next_clause_prefetch && next_1,
55 .staging_barrier = staging_barrier,
56 .staging_register = clause->staging_register,
57 .dependency_wait = dependency_wait,
58 .dependency_slot = clause->scoreboard_id,
59 .message_type = clause->message_type,
60 .next_message_type = next_1 ? next_1->message_type : 0,
61 .flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE
62 };
63
64 uint64_t u = 0;
65 memcpy(&u, &header, sizeof(header));
66 return u;
67 }
68
69 /* Assigns a slot for reading, before anything is written */
70
71 static void
bi_assign_slot_read(bi_registers * regs,bi_index src)72 bi_assign_slot_read(bi_registers *regs, bi_index src)
73 {
74 /* We only assign for registers */
75 if (src.type != BI_INDEX_REGISTER)
76 return;
77
78 /* Check if we already assigned the slot */
79 for (unsigned i = 0; i <= 1; ++i) {
80 if (regs->slot[i] == src.value && regs->enabled[i])
81 return;
82 }
83
84 if (regs->slot[2] == src.value && regs->slot23.slot2 == BIFROST_OP_READ)
85 return;
86
87 /* Assign it now */
88
89 for (unsigned i = 0; i <= 1; ++i) {
90 if (!regs->enabled[i]) {
91 regs->slot[i] = src.value;
92 regs->enabled[i] = true;
93 return;
94 }
95 }
96
97 if (!regs->slot23.slot3) {
98 regs->slot[2] = src.value;
99 regs->slot23.slot2 = BIFROST_OP_READ;
100 return;
101 }
102
103 bi_print_slots(regs, stderr);
104 unreachable("Failed to find a free slot for src");
105 }
106
/* Assigns register slots for one tuple: reads come from the current tuple's
 * sources, while the writes encoded here belong to the *previous* tuple's
 * destinations (a tuple's writes are encoded by the following tuple's
 * register block). Returns the resulting slot state by value. */
static bi_registers
bi_assign_slots(bi_tuple *now, bi_tuple *prev)
{
        /* We assign slots for the main register mechanism. Special ops
         * use the data registers, which has its own mechanism entirely
         * and thus gets skipped over here. */

        /* Whether the current ADD reads, and the previous ADD writes, a
         * staging ("data") register rather than a regular slot */
        bool read_dreg = now->add && bi_opcode_props[now->add->op].sr_read;
        bool write_dreg = prev->add && bi_opcode_props[prev->add->op].sr_write;

        /* First, assign reads */

        if (now->fma)
                bi_foreach_src(now->fma, src)
                        bi_assign_slot_read(&now->regs, (now->fma)->src[src]);

        if (now->add) {
                bi_foreach_src(now->add, src) {
                        /* This is not a real source, we shouldn't assign a
                         * slot for it.
                         */
                        if (now->add->op == BI_OPCODE_BLEND && src == 4)
                                continue;

                        /* src 0 of a staging-read op goes through the data
                         * register mechanism, not a read slot */
                        if (!(src == 0 && read_dreg))
                                bi_assign_slot_read(&now->regs, (now->add)->src[src]);
                }
        }

        /* Next, assign writes. Staging writes are assigned separately, but
         * +ATEST wants its destination written to both a staging register
         * _and_ a regular write, because it may not generate a message */

        if (prev->add && (!write_dreg || prev->add->op == BI_OPCODE_ATEST)) {
                bi_index idx = prev->add->dest[0];

                if (idx.type == BI_INDEX_REGISTER) {
                        now->regs.slot[3] = idx.value;
                        now->regs.slot23.slot3 = BIFROST_OP_WRITE;
                }
        }

        if (prev->fma) {
                bi_index idx = (prev->fma)->dest[0];

                if (idx.type == BI_INDEX_REGISTER) {
                        /* If the ADD's write already took slot 3, the FMA
                         * write falls back to slot 2 */
                        if (now->regs.slot23.slot3) {
                                /* Scheduler constraint: cannot read 3 and write 2 */
                                assert(!now->regs.slot23.slot2);
                                now->regs.slot[2] = idx.value;
                                now->regs.slot23.slot2 = BIFROST_OP_WRITE;
                        } else {
                                now->regs.slot[3] = idx.value;
                                now->regs.slot23.slot3 = BIFROST_OP_WRITE;
                                now->regs.slot23.slot3_fma = true;
                        }
                }
        }

        return now->regs;
}
168
169 static enum bifrost_reg_mode
bi_pack_register_mode(bi_registers r)170 bi_pack_register_mode(bi_registers r)
171 {
172 /* Handle idle as a special case */
173 if (!(r.slot23.slot2 | r.slot23.slot3))
174 return r.first_instruction ? BIFROST_IDLE_1 : BIFROST_IDLE;
175
176 /* Otherwise, use the LUT */
177 for (unsigned i = 0; i < ARRAY_SIZE(bifrost_reg_ctrl_lut); ++i) {
178 if (memcmp(bifrost_reg_ctrl_lut + i, &r.slot23, sizeof(r.slot23)) == 0)
179 return i;
180 }
181
182 bi_print_slots(&r, stderr);
183 unreachable("Invalid slot assignment");
184 }
185
/* Packs a tuple's register-access state (mode control, the two read slots,
 * and slots 2/3) into its 64-bit wire form. `regs` is taken by value since
 * slot values are rewritten in place during encoding. */
static uint64_t
bi_pack_registers(bi_registers regs)
{
        enum bifrost_reg_mode mode = bi_pack_register_mode(regs);
        struct bifrost_regs s = { 0 };
        uint64_t packed = 0;

        /* Need to pack 5-bit mode as a 4-bit field. The decoder moves bit 3 to bit 4 for
         * first instruction and adds 16 when reg 2 == reg 3 */

        unsigned ctrl;
        bool r2_equals_r3 = false;

        if (regs.first_instruction) {
                /* Bit 3 implicitly must be clear for first instructions.
                 * The affected patterns all write both ADD/FMA, but that
                 * is forbidden for the last instruction (whose writes are
                 * encoded by the first), so this does not add additional
                 * encoding constraints */
                assert(!(mode & 0x8));

                /* Move bit 4 to bit 3, since bit 3 is clear */
                ctrl = (mode & 0x7) | ((mode & 0x10) >> 1);

                /* If we can let r2 equal r3, we have to or the hardware raises
                 * INSTR_INVALID_ENC (it's unclear why). */
                if (!(regs.slot23.slot2 && regs.slot23.slot3))
                        r2_equals_r3 = true;
        } else {
                /* We force r2=r3 or not for the upper bit */
                ctrl = (mode & 0xF);
                r2_equals_r3 = (mode & 0x10);
        }

        if (regs.enabled[1]) {
                /* Gotta save that bit!~ Required by the 63-x trick */
                assert(regs.slot[1] > regs.slot[0]);
                assert(regs.enabled[0]);

                /* Do the 63-x trick, see docs/disasm */
                if (regs.slot[0] > 31) {
                        regs.slot[0] = 63 - regs.slot[0];
                        regs.slot[1] = 63 - regs.slot[1];
                }

                /* After the trick, slot 0 fits in 5 bits and slot 1 in 6 */
                assert(regs.slot[0] <= 31);
                assert(regs.slot[1] <= 63);

                s.ctrl = ctrl;
                s.reg1 = regs.slot[1];
                s.reg0 = regs.slot[0];
        } else {
                /* slot 1 disabled, so set to zero and use slot 1 for ctrl */
                s.ctrl = 0;
                s.reg1 = ctrl << 2;

                if (regs.enabled[0]) {
                        /* Bit 0 upper bit of slot 0 */
                        s.reg1 |= (regs.slot[0] >> 5);

                        /* Rest of slot 0 in usual spot */
                        s.reg0 = (regs.slot[0] & 0b11111);
                } else {
                        /* Bit 1 set if slot 0 also disabled */
                        s.reg1 |= (1 << 1);
                }
        }

        /* Force r2 =/!= r3 as needed */
        if (r2_equals_r3) {
                assert(regs.slot[3] == regs.slot[2] || !(regs.slot23.slot2 && regs.slot23.slot3));

                /* Mirror the used slot into the unused one so both fields
                 * encode the same register number */
                if (regs.slot23.slot2)
                        regs.slot[3] = regs.slot[2];
                else
                        regs.slot[2] = regs.slot[3];
        } else if (!regs.first_instruction) {
                /* Enforced by the encoding anyway */
                assert(regs.slot[2] != regs.slot[3]);
        }

        s.reg2 = regs.slot[2];
        s.reg3 = regs.slot[3];
        s.fau_idx = regs.fau_idx;

        /* Bit-cast the packed struct into the 64-bit encoding */
        memcpy(&packed, &s, sizeof(s));
        return packed;
}
274
275 /* We must ensure slot 1 > slot 0 for the 63-x trick to function, so we fix
276 * this up at pack time. (Scheduling doesn't care.) */
277
278 static void
bi_flip_slots(bi_registers * regs)279 bi_flip_slots(bi_registers *regs)
280 {
281 if (regs->enabled[0] && regs->enabled[1] && regs->slot[1] < regs->slot[0]) {
282 unsigned temp = regs->slot[0];
283 regs->slot[0] = regs->slot[1];
284 regs->slot[1] = temp;
285 }
286
287 }
288
289 static inline enum bifrost_packed_src
bi_get_src_slot(bi_registers * regs,unsigned reg)290 bi_get_src_slot(bi_registers *regs, unsigned reg)
291 {
292 if (regs->slot[0] == reg && regs->enabled[0])
293 return BIFROST_SRC_PORT0;
294 else if (regs->slot[1] == reg && regs->enabled[1])
295 return BIFROST_SRC_PORT1;
296 else if (regs->slot[2] == reg && regs->slot23.slot2 == BIFROST_OP_READ)
297 return BIFROST_SRC_PORT2;
298 else
299 unreachable("Tried to access register with no port");
300 }
301
302 static inline enum bifrost_packed_src
bi_get_src_new(bi_instr * ins,bi_registers * regs,unsigned s)303 bi_get_src_new(bi_instr *ins, bi_registers *regs, unsigned s)
304 {
305 if (!ins)
306 return 0;
307
308 bi_index src = ins->src[s];
309
310 if (src.type == BI_INDEX_REGISTER)
311 return bi_get_src_slot(regs, src.value);
312 else if (src.type == BI_INDEX_PASS)
313 return src.value;
314 else {
315 /* TODO make safer */
316 return BIFROST_SRC_STAGE;
317 }
318 }
319
/* Packs a single tuple (FMA + ADD pair plus register state) into its wire
 * form, split across a 64-bit `lo` and a `hi` word. Also records the
 * clause's staging register when the ADD reads or writes one. The `stage`
 * parameter is currently unused here. */
static struct bi_packed_tuple
bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tuple, gl_shader_stage stage)
{
        /* Assign slots: reads for this tuple, writes for the previous one */
        bi_assign_slots(tuple, prev);
        tuple->regs.fau_idx = tuple->fau_idx;
        tuple->regs.first_instruction = first_tuple;

        /* The encoding requires slot[1] > slot[0]; reorder if needed */
        bi_flip_slots(&tuple->regs);

        bool sr_read = tuple->add &&
                       bi_opcode_props[(tuple->add)->op].sr_read;

        uint64_t reg = bi_pack_registers(tuple->regs);

        /* Pack the FMA half with its four source selectors */
        uint64_t fma = bi_pack_fma(tuple->fma,
                        bi_get_src_new(tuple->fma, &tuple->regs, 0),
                        bi_get_src_new(tuple->fma, &tuple->regs, 1),
                        bi_get_src_new(tuple->fma, &tuple->regs, 2),
                        bi_get_src_new(tuple->fma, &tuple->regs, 3));

        /* Pack the ADD half; when src 0 is a staging read, skip it so the
         * remaining sources line up with their selectors */
        uint64_t add = bi_pack_add(tuple->add,
                        bi_get_src_new(tuple->add, &tuple->regs, sr_read + 0),
                        bi_get_src_new(tuple->add, &tuple->regs, sr_read + 1),
                        bi_get_src_new(tuple->add, &tuple->regs, sr_read + 2),
                        0);

        if (tuple->add) {
                bi_instr *add = tuple->add;

                bool sr_write = bi_opcode_props[add->op].sr_write &&
                                !bi_is_null(add->dest[0]);

                /* Record the staging register on the clause. Ops that both
                 * read and write staging must use the same register for
                 * both (asserted below). */
                if (sr_read && !bi_is_null(add->src[0])) {
                        assert(add->src[0].type == BI_INDEX_REGISTER);
                        clause->staging_register = add->src[0].value;

                        if (sr_write)
                                assert(bi_is_equiv(add->src[0], add->dest[0]));
                } else if (sr_write) {
                        assert(add->dest[0].type == BI_INDEX_REGISTER);
                        clause->staging_register = add->dest[0].value;
                }
        }

        /* Layout of lo: registers in bits [0,35), FMA in [35,58), low 6
         * bits of ADD in [58,64); the rest of ADD goes in hi */
        struct bi_packed_tuple packed = {
                .lo = reg | (fma << 35) | ((add & 0b111111) << 58),
                .hi = add >> 6
        };

        return packed;
}
370
371 /* A block contains at most one PC-relative constant, from a terminal branch.
372 * Find the last instruction and if it is a relative branch, fix up the
373 * PC-relative constant to contain the absolute offset. This occurs at pack
374 * time instead of schedule time because the number of quadwords between each
375 * block is not known until after all other passes have finished.
376 */
377
378 static void
bi_assign_branch_offset(bi_context * ctx,bi_block * block)379 bi_assign_branch_offset(bi_context *ctx, bi_block *block)
380 {
381 if (list_is_empty(&block->clauses))
382 return;
383
384 bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link);
385 bi_instr *br = bi_last_instr_in_clause(clause);
386
387 if (!br->branch_target)
388 return;
389
390 /* Put it in the high place */
391 int32_t qwords = bi_block_offset(ctx, clause, br->branch_target);
392 int32_t bytes = qwords * 16;
393
394 /* Copy so we can toy with the sign without undefined behaviour */
395 uint32_t raw = 0;
396 memcpy(&raw, &bytes, sizeof(raw));
397
398 /* Clear off top bits for A1/B1 bits */
399 raw &= ~0xF0000000;
400
401 /* Put in top 32-bits */
402 assert(clause->pcrel_idx < 8);
403 clause->constants[clause->pcrel_idx] |= ((uint64_t) raw) << 32ull;
404 }
405
/* Emits one formatted constant record holding a pair of the clause's
 * constants (their low 4 bits are encoded elsewhere, so only bits [4,64)
 * are stored). `word_idx` selects which pair; `ec0_packed` means constant 0
 * was already embedded in the clause body, offsetting the pair index. */
static void
bi_pack_constants(unsigned tuple_count, uint64_t *constants,
                unsigned word_idx, unsigned constant_words, bool ec0_packed,
                struct util_dynarray *emission)
{
        /* Index of the first constant of this pair */
        unsigned index = (word_idx << 1) + ec0_packed;

        /* Do more constants follow */
        bool more = (word_idx + 1) < constant_words;

        /* Indexed first by tuple count and second by constant word number,
         * indicates the position in the clause */
        unsigned pos_lookup[8][3] = {
                { 0 },
                { 1 },
                { 3 },
                { 2, 5 },
                { 4, 8 },
                { 7, 11, 14 },
                { 6, 10, 13 },
                { 9, 12 }
        };

        /* Compute the pos, and check everything is reasonable */
        assert((tuple_count - 1) < 8);
        assert(word_idx < 3);
        unsigned pos = pos_lookup[tuple_count - 1][word_idx];
        assert(pos != 0 || (tuple_count == 1 && word_idx == 0));

        struct bifrost_fmt_constant quad = {
                .pos = pos,
                .tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL,
                .imm_1 = constants[index + 0] >> 4,
                .imm_2 = constants[index + 1] >> 4,
        };

        util_dynarray_append(emission, struct bifrost_fmt_constant, quad);
}
444
445 uint8_t
bi_pack_literal(enum bi_clause_subword literal)446 bi_pack_literal(enum bi_clause_subword literal)
447 {
448 assert(literal >= BI_CLAUSE_SUBWORD_LITERAL_0);
449 assert(literal <= BI_CLAUSE_SUBWORD_LITERAL_7);
450
451 return (literal - BI_CLAUSE_SUBWORD_LITERAL_0);
452 }
453
454 static inline uint8_t
bi_clause_upper(unsigned val,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)455 bi_clause_upper(unsigned val,
456 struct bi_packed_tuple *tuples,
457 ASSERTED unsigned tuple_count)
458 {
459 assert(val < tuple_count);
460
461 /* top 3-bits of 78-bits is tuple >> 75 == (tuple >> 64) >> 11 */
462 struct bi_packed_tuple tuple = tuples[val];
463 return (tuple.hi >> 11);
464 }
465
466 uint8_t
bi_pack_upper(enum bi_clause_subword upper,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)467 bi_pack_upper(enum bi_clause_subword upper,
468 struct bi_packed_tuple *tuples,
469 ASSERTED unsigned tuple_count)
470 {
471 assert(upper >= BI_CLAUSE_SUBWORD_UPPER_0);
472 assert(upper <= BI_CLAUSE_SUBWORD_UPPER_7);
473
474 return bi_clause_upper(upper - BI_CLAUSE_SUBWORD_UPPER_0, tuples,
475 tuple_count);
476 }
477
478 uint64_t
bi_pack_tuple_bits(enum bi_clause_subword idx,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,unsigned offset,unsigned nbits)479 bi_pack_tuple_bits(enum bi_clause_subword idx,
480 struct bi_packed_tuple *tuples,
481 ASSERTED unsigned tuple_count,
482 unsigned offset, unsigned nbits)
483 {
484 assert(idx >= BI_CLAUSE_SUBWORD_TUPLE_0);
485 assert(idx <= BI_CLAUSE_SUBWORD_TUPLE_7);
486
487 unsigned val = (idx - BI_CLAUSE_SUBWORD_TUPLE_0);
488 assert(val < tuple_count);
489
490 struct bi_packed_tuple tuple = tuples[val];
491
492 assert(offset + nbits < 78);
493 assert(nbits <= 64);
494
495 /* (X >> start) & m
496 * = (((hi << 64) | lo) >> start) & m
497 * = (((hi << 64) >> start) | (lo >> start)) & m
498 * = { ((hi << (64 - start)) | (lo >> start)) & m if start <= 64
499 * { ((hi >> (start - 64)) | (lo >> start)) & m if start >= 64
500 * = { ((hi << (64 - start)) & m) | ((lo >> start) & m) if start <= 64
501 * { ((hi >> (start - 64)) & m) | ((lo >> start) & m) if start >= 64
502 *
503 * By setting m = 2^64 - 1, we justify doing the respective shifts as
504 * 64-bit integers. Zero special cased to avoid undefined behaviour.
505 */
506
507 uint64_t lo = (tuple.lo >> offset);
508 uint64_t hi = (offset == 0) ? 0
509 : (offset > 64) ? (tuple.hi >> (offset - 64))
510 : (tuple.hi << (64 - offset));
511
512 return (lo | hi) & ((1ULL << nbits) - 1);
513 }
514
515 static inline uint16_t
bi_pack_lu(enum bi_clause_subword word,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count)516 bi_pack_lu(enum bi_clause_subword word,
517 struct bi_packed_tuple *tuples,
518 ASSERTED unsigned tuple_count)
519 {
520 return (word >= BI_CLAUSE_SUBWORD_UPPER_0) ?
521 bi_pack_upper(word, tuples, tuple_count) :
522 bi_pack_literal(word);
523 }
524
525 uint8_t
bi_pack_sync(enum bi_clause_subword t1,enum bi_clause_subword t2,enum bi_clause_subword t3,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,bool z)526 bi_pack_sync(enum bi_clause_subword t1,
527 enum bi_clause_subword t2,
528 enum bi_clause_subword t3,
529 struct bi_packed_tuple *tuples,
530 ASSERTED unsigned tuple_count,
531 bool z)
532 {
533 uint8_t sync =
534 (bi_pack_lu(t3, tuples, tuple_count) << 0) |
535 (bi_pack_lu(t2, tuples, tuple_count) << 3);
536
537 if (t1 == BI_CLAUSE_SUBWORD_Z)
538 sync |= z << 6;
539 else
540 sync |= bi_pack_literal(t1) << 6;
541
542 return sync;
543 }
544
545 static inline uint64_t
bi_pack_t_ec(enum bi_clause_subword word,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t ec0)546 bi_pack_t_ec(enum bi_clause_subword word,
547 struct bi_packed_tuple *tuples,
548 ASSERTED unsigned tuple_count,
549 uint64_t ec0)
550 {
551 if (word == BI_CLAUSE_SUBWORD_CONSTANT)
552 return ec0;
553 else
554 return bi_pack_tuple_bits(word, tuples, tuple_count, 0, 60);
555 }
556
557 static uint32_t
bi_pack_subwords_56(enum bi_clause_subword t,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t header,uint64_t ec0,unsigned tuple_subword)558 bi_pack_subwords_56(enum bi_clause_subword t,
559 struct bi_packed_tuple *tuples,
560 ASSERTED unsigned tuple_count,
561 uint64_t header, uint64_t ec0,
562 unsigned tuple_subword)
563 {
564 switch (t) {
565 case BI_CLAUSE_SUBWORD_HEADER:
566 return (header & ((1 << 30) - 1));
567 case BI_CLAUSE_SUBWORD_RESERVED:
568 return 0;
569 case BI_CLAUSE_SUBWORD_CONSTANT:
570 return (ec0 >> 15) & ((1 << 30) - 1);
571 default:
572 return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 30);
573 }
574 }
575
576 static uint16_t
bi_pack_subword(enum bi_clause_subword t,unsigned format,struct bi_packed_tuple * tuples,ASSERTED unsigned tuple_count,uint64_t header,uint64_t ec0,unsigned m0,unsigned tuple_subword)577 bi_pack_subword(enum bi_clause_subword t, unsigned format,
578 struct bi_packed_tuple *tuples,
579 ASSERTED unsigned tuple_count,
580 uint64_t header, uint64_t ec0, unsigned m0,
581 unsigned tuple_subword)
582 {
583 switch (t) {
584 case BI_CLAUSE_SUBWORD_HEADER:
585 return header >> 30;
586 case BI_CLAUSE_SUBWORD_M:
587 return m0;
588 case BI_CLAUSE_SUBWORD_CONSTANT:
589 return (format == 5 || format == 10) ?
590 (ec0 & ((1 << 15) - 1)) :
591 (ec0 >> (15 + 30));
592 case BI_CLAUSE_SUBWORD_UPPER_23:
593 return (bi_clause_upper(2, tuples, tuple_count) << 12) |
594 (bi_clause_upper(3, tuples, tuple_count) << 9);
595 case BI_CLAUSE_SUBWORD_UPPER_56:
596 return (bi_clause_upper(5, tuples, tuple_count) << 12) |
597 (bi_clause_upper(6, tuples, tuple_count) << 9);
598 case BI_CLAUSE_SUBWORD_UPPER_0 ... BI_CLAUSE_SUBWORD_UPPER_7:
599 return bi_pack_upper(t, tuples, tuple_count) << 12;
600 default:
601 return bi_pack_tuple_bits(t, tuples, tuple_count, tuple_subword * 15, 15);
602 }
603 }
604
605 /* EC0 is 60-bits (bottom 4 already shifted off) */
/* Emits one 128-bit beat of a clause: looks up the subword layout for
 * clause format `index` and packs each field (sync byte, tuple bits,
 * header halves, embedded constant EC0, M0) into two 64-bit words
 * appended to `emission`. */
void
bi_pack_format(struct util_dynarray *emission,
                unsigned index,
                struct bi_packed_tuple *tuples,
                ASSERTED unsigned tuple_count,
                uint64_t header, uint64_t ec0,
                unsigned m0, bool z)
{
        struct bi_clause_format format = bi_clause_formats[index];

        /* Sync byte in the bottom 8 bits */
        uint8_t sync = bi_pack_sync(format.tag_1, format.tag_2, format.tag_3,
                        tuples, tuple_count, z);

        /* 56-bit main field: a tuple or the embedded constant */
        uint64_t s0_s3 = bi_pack_t_ec(format.s0_s3, tuples, tuple_count, ec0);

        uint16_t s4 = bi_pack_subword(format.s4, format.format, tuples, tuple_count, header, ec0, m0, 4);

        uint32_t s5_s6 = bi_pack_subwords_56(format.s5_s6,
                        tuples, tuple_count, header, ec0,
                        (format.format == 2 || format.format == 7) ? 0 : 3);

        uint64_t s7 = bi_pack_subword(format.s7, format.format, tuples, tuple_count, header, ec0, m0, 2);

        /* Now that subwords are packed, split into 64-bit halves and emit */
        uint64_t lo = sync | ((s0_s3 & ((1ull << 56) - 1)) << 8);
        uint64_t hi = (s0_s3 >> 56) | ((uint64_t) s4 << 4) | ((uint64_t) s5_s6 << 19) | ((uint64_t) s7 << 49);

        util_dynarray_append(emission, uint64_t, lo);
        util_dynarray_append(emission, uint64_t, hi);
}
636
/* Packs one clause: first the tuples (with cross-tuple register slot
 * state), then the formatted beats, then any remaining embedded constants.
 * next_1/next_2 are the possible successor clauses, needed for the
 * header's dependency fields. */
static void
bi_pack_clause(bi_context *ctx, bi_clause *clause,
                bi_clause *next_1, bi_clause *next_2,
                struct util_dynarray *emission, gl_shader_stage stage)
{
        struct bi_packed_tuple ins[8] = { 0 };

        for (unsigned i = 0; i < clause->tuple_count; ++i) {
                /* Tuple i encodes the writes of tuple i-1; the first tuple
                 * wraps around to encode the last tuple's writes */
                unsigned prev = ((i == 0) ? clause->tuple_count : i) - 1;
                ins[i] = bi_pack_tuple(clause, &clause->tuples[i],
                                &clause->tuples[prev], i == 0, stage);

                bi_instr *add = clause->tuples[i].add;

                /* Different GPUs support different forms of the CLPER.i32
                 * instruction. Check we use the right one for the target.
                 */
                if (add && add->op == BI_OPCODE_CLPER_OLD_I32)
                        assert(ctx->quirks & BIFROST_LIMITED_CLPER);
                else if (add && add->op == BI_OPCODE_CLPER_I32)
                        assert(!(ctx->quirks & BIFROST_LIMITED_CLPER));
        }

        /* Some tuple counts embed constant 0 in the clause body itself */
        bool ec0_packed = bi_ec0_packed(clause->tuple_count);

        if (ec0_packed)
                clause->constant_count = MAX2(clause->constant_count, 1);

        /* Each trailing record packs two constants, minus the embedded one */
        unsigned constant_quads =
                DIV_ROUND_UP(clause->constant_count - (ec0_packed ? 1 : 0), 2);

        uint64_t header = bi_pack_header(clause, next_1, next_2);
        uint64_t ec0 = (clause->constants[0] >> 4);
        unsigned m0 = (clause->pcrel_idx == 0) ? 4 : 0;

        /* Number of 128-bit beats emitted for each tuple count */
        unsigned counts[8] = {
                1, 2, 3, 3, 4, 5, 5, 6
        };

        /* Indices into bi_clause_formats for each beat, per tuple count */
        unsigned indices[8][6] = {
                { 1 },
                { 0, 2 },
                { 0, 3, 4 },
                { 0, 3, 6 },
                { 0, 3, 7, 8 },
                { 0, 3, 5, 9, 10 },
                { 0, 3, 5, 9, 11 },
                { 0, 3, 5, 9, 12, 13 },
        };

        unsigned count = counts[clause->tuple_count - 1];

        for (unsigned pos = 0; pos < count; ++pos) {
                /* Sanity-check the format table agrees with our position
                 * and that only the final beat carries the Z tag */
                ASSERTED unsigned idx = indices[clause->tuple_count - 1][pos];
                assert(bi_clause_formats[idx].pos == pos);
                assert((bi_clause_formats[idx].tag_1 == BI_CLAUSE_SUBWORD_Z) ==
                                (pos == count - 1));

                /* Whether to end the clause immediately after the last tuple */
                bool z = (constant_quads == 0);

                bi_pack_format(emission, indices[clause->tuple_count - 1][pos],
                                ins, clause->tuple_count, header, ec0, m0,
                                z);
        }

        /* Pack the remaining constants */

        for (unsigned pos = 0; pos < constant_quads; ++pos) {
                bi_pack_constants(clause->tuple_count, clause->constants,
                                pos, constant_quads, ec0_packed, emission);
        }
}
710
711 static void
bi_collect_blend_ret_addr(bi_context * ctx,struct util_dynarray * emission,const bi_clause * clause)712 bi_collect_blend_ret_addr(bi_context *ctx, struct util_dynarray *emission,
713 const bi_clause *clause)
714 {
715 /* No need to collect return addresses when we're in a blend shader. */
716 if (ctx->inputs->is_blend)
717 return;
718
719 const bi_tuple *tuple = &clause->tuples[clause->tuple_count - 1];
720 const bi_instr *ins = tuple->add;
721
722 if (!ins || ins->op != BI_OPCODE_BLEND)
723 return;
724
725
726 unsigned loc = tuple->regs.fau_idx - BIR_FAU_BLEND_0;
727 assert(loc < ARRAY_SIZE(ctx->info.bifrost->blend));
728 assert(!ctx->info.bifrost->blend[loc].return_offset);
729 ctx->info.bifrost->blend[loc].return_offset =
730 util_dynarray_num_elements(emission, uint8_t);
731 assert(!(ctx->info.bifrost->blend[loc].return_offset & 0x7));
732 }
733
/* Packs every clause of every block into `emission`. Returns the byte size
 * of the *last* clause packed, since previous_size is reset before each
 * clause -- not the total emitted size. NOTE(review): presumably the
 * caller wants the final clause's size (e.g. for end-of-shader handling);
 * confirm against callers. */
unsigned
bi_pack(bi_context *ctx, struct util_dynarray *emission)
{
        unsigned previous_size = emission->size;

        bi_foreach_block(ctx, block) {
                /* Branch offsets are only known now that layout is final */
                bi_assign_branch_offset(ctx, block);

                bi_foreach_clause_in_block(block, clause) {
                        bool is_last = (clause->link.next == &block->clauses);

                        /* Get the succeeding clauses, either two successors of
                         * the block for the last clause in the block or just
                         * the next clause within the block */

                        bi_clause *next = NULL, *next_2 = NULL;

                        if (is_last) {
                                next = bi_next_clause(ctx, block->successors[0], NULL);
                                next_2 = bi_next_clause(ctx, block->successors[1], NULL);
                        } else {
                                next = bi_next_clause(ctx, block, clause);
                        }

                        /* Snapshot before packing so the final delta is the
                         * size of this clause alone */
                        previous_size = emission->size;

                        bi_pack_clause(ctx, clause, next, next_2, emission, ctx->stage);

                        if (!is_last)
                                bi_collect_blend_ret_addr(ctx, emission, clause);
                }
        }

        return emission->size - previous_size;
}
770