/*
 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Vadim Girlin
 */

#define PSC_DEBUG 0

#if PSC_DEBUG
#define PSC_DUMP(a) do { a } while (0)
#else
#define PSC_DUMP(a)
#endif
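
/* Usage sketch (this is how PSC_DUMP is used throughout this file): the
 * argument is an arbitrary statement list that is compiled in only when
 * PSC_DEBUG is non-zero, e.g.
 *
 *   PSC_DUMP( sblog << "value: " << v << "\n"; );
 *
 * The do/while (0) wrapper keeps the expansion safe as a single statement
 * inside if/else bodies.
 */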

#include "sb_bc.h"
#include "sb_shader.h"
#include "sb_pass.h"
#include "sb_sched.h"
#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1

namespace r600_sb {

rp_kcache_tracker::rp_kcache_tracker(shader &sh) : rp(), uc(),
		// FIXME: for now we'll use "two const pairs" limit for r600, same as
		// for other chips, otherwise additional check in alu_group_tracker is
		// required to make sure that all 4 consts in the group fit into 2
		// kcache sets
		sel_count(2) {}

bool rp_kcache_tracker::try_reserve(sel_chan r) {
	unsigned sel = kc_sel(r);

	for (unsigned i = 0; i < sel_count; ++i) {
		if (rp[i] == 0) {
			rp[i] = sel;
			++uc[i];
			return true;
		}
		if (rp[i] == sel) {
			++uc[i];
			return true;
		}
	}
	return false;
}

bool rp_kcache_tracker::try_reserve(node* n) {
	bool need_unreserve = false;
	vvec::iterator I(n->src.begin()), E(n->src.end());

	for (; I != E; ++I) {
		value *v = *I;
		if (v->is_kcache()) {
			if (!try_reserve(v->select))
				break;
			else
				need_unreserve = true;
		}
	}
	if (I == E)
		return true;

	if (need_unreserve && I != n->src.begin()) {
		do {
			--I;
			value *v = *I;
			if (v->is_kcache())
				unreserve(v->select);
		} while (I != n->src.begin());
	}
	return false;
}

inline
void rp_kcache_tracker::unreserve(node* n) {
	vvec::iterator I(n->src.begin()), E(n->src.end());
	for (; I != E; ++I) {
		value *v = *I;
		if (v->is_kcache())
			unreserve(v->select);
	}
}

void rp_kcache_tracker::unreserve(sel_chan r) {
	unsigned sel = kc_sel(r);

	for (unsigned i = 0; i < sel_count; ++i)
		if (rp[i] == sel) {
			if (--uc[i] == 0)
				rp[i] = 0;
			return;
		}
	assert(0);
	return;
}

bool literal_tracker::try_reserve(alu_node* n) {
	bool need_unreserve = false;

	vvec::iterator I(n->src.begin()), E(n->src.end());

	for (; I != E; ++I) {
		value *v = *I;
		if (v->is_literal()) {
			if (!try_reserve(v->literal_value))
				break;
			else
				need_unreserve = true;
		}
	}
	if (I == E)
		return true;

	if (need_unreserve && I != n->src.begin()) {
		do {
			--I;
			value *v = *I;
			if (v->is_literal())
				unreserve(v->literal_value);
		} while (I != n->src.begin());
	}
	return false;
}

void literal_tracker::unreserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count, i;

	for (i = 0; i < nsrc; ++i) {
		value *v = n->src[i];
		if (v->is_literal())
			unreserve(v->literal_value);
	}
}

bool literal_tracker::try_reserve(literal l) {

	PSC_DUMP( sblog << "literal reserve " << l.u << " " << l.f << "\n"; );

	for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {
		if (lt[i] == 0) {
			lt[i] = l;
			++uc[i];
			PSC_DUMP( sblog << " reserved new uc = " << uc[i] << "\n"; );
			return true;
		} else if (lt[i] == l) {
			++uc[i];
			PSC_DUMP( sblog << " reserved uc = " << uc[i] << "\n"; );
			return true;
		}
	}
	PSC_DUMP( sblog << " failed to reserve literal\n"; );
	return false;
}

void literal_tracker::unreserve(literal l) {

	PSC_DUMP( sblog << "literal unreserve " << l.u << " " << l.f << "\n"; );

	for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {
		if (lt[i] == l) {
			if (--uc[i] == 0)
				lt[i] = 0;
			return;
		}
	}
	assert(0);
	return;
}

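// Maps (bank_swizzle, src index) to the GPR read-port cycle used by a vector
// slot: each row is one bank-swizzle ordering of the three source operands
// across the three read cycles (the last row is VEC_210).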
static inline unsigned bs_cycle_vector(unsigned bs, unsigned src) {
	static const unsigned swz[VEC_NUM][3] = {
		{0, 1, 2}, {0, 2, 1}, {1, 2, 0}, {1, 0, 2}, {2, 0, 1}, {2, 1, 0}
	};
	assert(bs < VEC_NUM && src < 3);
	return swz[bs][src];
}

static inline unsigned bs_cycle_scalar(unsigned bs, unsigned src) {
	static const unsigned swz[SCL_NUM][3] = {
		{2, 1, 0}, {1, 2, 2}, {2, 1, 2}, {2, 2, 1}
	};

	if (bs >= SCL_NUM || src >= 3) {
		// this prevents gcc warning "array subscript is above array bounds"
		// AFAICS we should never hit this path
		abort();
	}
	return swz[bs][src];
}

static inline unsigned bs_cycle(bool trans, unsigned bs, unsigned src) {
	return trans ? bs_cycle_scalar(bs, src) : bs_cycle_vector(bs, src);
}

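// rp_gpr_tracker: rp[cycle][chan] holds the GPR sel reserved on a given read
// cycle and channel, biased by 1 so that 0 means "free"; uc[cycle][chan]
// counts how many sources share that reservation.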
inline
bool rp_gpr_tracker::try_reserve(unsigned cycle, unsigned sel, unsigned chan) {
	++sel;
	if (rp[cycle][chan] == 0) {
		rp[cycle][chan] = sel;
		++uc[cycle][chan];
		return true;
	} else if (rp[cycle][chan] == sel) {
		++uc[cycle][chan];
		return true;
	}
	return false;
}

inline
void rp_gpr_tracker::unreserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count, i;
	unsigned trans = n->bc.slot == SLOT_TRANS;
	unsigned bs = n->bc.bank_swizzle;
	unsigned opt = !trans
			&& n->bc.src[0].sel == n->bc.src[1].sel
			&& n->bc.src[0].chan == n->bc.src[1].chan;

	for (i = 0; i < nsrc; ++i) {
		value *v = n->src[i];
		if (v->is_readonly() || v->is_undef())
			continue;
		if (i == 1 && opt)
			continue;
		unsigned cycle = bs_cycle(trans, bs, i);
		unreserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan);
	}
}

inline
void rp_gpr_tracker::unreserve(unsigned cycle, unsigned sel, unsigned chan) {
	++sel;
	assert(rp[cycle][chan] == sel && uc[cycle][chan]);
	if (--uc[cycle][chan] == 0)
		rp[cycle][chan] = 0;
}

inline
bool rp_gpr_tracker::try_reserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count, i;
	unsigned trans = n->bc.slot == SLOT_TRANS;
	unsigned bs = n->bc.bank_swizzle;
	unsigned opt = !trans && nsrc >= 2 &&
			n->src[0] == n->src[1];

	bool need_unreserve = false;
	unsigned const_count = 0, min_gpr_cycle = 3;

	for (i = 0; i < nsrc; ++i) {
		value *v = n->src[i];
		if (v->is_readonly() || v->is_undef()) {
			const_count++;
			if (trans && const_count == 3)
				break;
		} else {
			if (i == 1 && opt)
				continue;

			unsigned cycle = bs_cycle(trans, bs, i);

			if (trans && cycle < min_gpr_cycle)
				min_gpr_cycle = cycle;

			if (const_count && cycle < const_count && trans)
				break;

			if (!try_reserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan))
				break;
			else
				need_unreserve = true;
		}
	}

	if ((i == nsrc) && (min_gpr_cycle + 1 > const_count))
		return true;

	if (need_unreserve && i--) {
		do {
			value *v = n->src[i];
			if (!v->is_readonly() && !v->is_undef()) {
				if (i == 1 && opt)
					continue;
				unreserve(bs_cycle(trans, bs, i), n->bc.src[i].sel,
						n->bc.src[i].chan);
			}
		} while (i--);
	}
	return false;
}

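// alu_group_tracker builds one ALU instruction group at a time: up to four
// vector slots plus, on chips with a trans unit, a fifth trans slot
// (max_slots is 4 on cayman, 5 otherwise).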
alu_group_tracker::alu_group_tracker(shader &sh)
	: sh(sh), kc(sh),
	  gpr(), lt(), slots(),
	  max_slots(sh.get_ctx().is_cayman() ? 4 : 5),
	  has_mova(), uses_ar(), has_predset(), has_kill(),
	  updates_exec_mask(), consumes_lds_oqa(), produces_lds_oqa(),
	  chan_count(), interp_param(), next_id() {

	available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;
}

inline
sel_chan alu_group_tracker::get_value_id(value* v) {
	unsigned &id = vmap[v];
	if (!id)
		id = ++next_id;
	return sel_chan(id, v->get_final_chan());
}

inline
void alu_group_tracker::assign_slot(unsigned slot, alu_node* n) {
	update_flags(n);
	slots[slot] = n;
	available_slots &= ~(1 << slot);

	unsigned param = n->interp_param();

	if (param) {
		assert(!interp_param || interp_param == param);
		interp_param = param;
	}
}


void alu_group_tracker::discard_all_slots(container_node &removed_nodes) {
	PSC_DUMP( sblog << "agt::discard_all_slots\n"; );
	discard_slots(~available_slots & ((1 << max_slots) - 1), removed_nodes);
}

void alu_group_tracker::discard_slots(unsigned slot_mask,
		container_node &removed_nodes) {

	PSC_DUMP(
		sblog << "discard_slots : packed_ops : "
			<< (unsigned)packed_ops.size() << "\n";
	);

	for (node_vec::iterator N, I = packed_ops.begin();
			I != packed_ops.end(); I = N) {
		N = I; ++N;

		alu_packed_node *n = static_cast<alu_packed_node*>(*I);
		unsigned pslots = n->get_slot_mask();

		PSC_DUMP(
			sblog << "discard_slots : packed slot_mask : " << pslots << "\n";
		);

		if (pslots & slot_mask) {

			PSC_DUMP(
				sblog << "discard_slots : discarding packed...\n";
			);

			removed_nodes.push_back(n);
			slot_mask &= ~pslots;
			N = packed_ops.erase(I);
			available_slots |= pslots;
			for (unsigned k = 0; k < max_slots; ++k) {
				if (pslots & (1 << k))
					slots[k] = NULL;
			}
		}
	}

	for (unsigned slot = 0; slot < max_slots; ++slot) {
		unsigned slot_bit = 1 << slot;

		if (slot_mask & slot_bit) {
			assert(!(available_slots & slot_bit));
			assert(slots[slot]);

			assert(!(slots[slot]->bc.slot_flags & AF_4SLOT));

			PSC_DUMP(
				sblog << "discarding slot " << slot << " : ";
				dump::dump_op(slots[slot]);
				sblog << "\n";
			);

			removed_nodes.push_back(slots[slot]);
			slots[slot] = NULL;
			available_slots |= slot_bit;
		}
	}

	alu_node *t = slots[4];
	if (t && (t->bc.slot_flags & AF_V)) {
		unsigned chan = t->bc.dst_chan;
		if (!slots[chan]) {
			PSC_DUMP(
				sblog << "moving ";
				dump::dump_op(t);
				sblog << " from trans slot to free slot " << chan << "\n";
			);

			slots[chan] = t;
			slots[4] = NULL;
			t->bc.slot = chan;
		}
	}

	reinit();
}

alu_group_node* alu_group_tracker::emit() {

	alu_group_node *g = sh.create_alu_group();

	lt.init_group_literals(g);

	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *n = slots[i];
		if (n) {
			g->push_back(n);
		}
	}
	return g;
}

bool alu_group_tracker::try_reserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count;
	unsigned slot = n->bc.slot;
	bool trans = slot == 4;

	if (slots[slot])
		return false;

	unsigned flags = n->bc.op_ptr->flags;

	unsigned param = n->interp_param();

	if (param && interp_param && interp_param != param)
		return false;

	if ((flags & AF_KILL) && has_predset)
		return false;
	if ((flags & AF_ANY_PRED) && (has_kill || has_predset))
		return false;
	if ((flags & AF_MOVA) && (has_mova || uses_ar))
		return false;

	if (n->uses_ar() && has_mova)
		return false;

	if (consumes_lds_oqa)
		return false;
	if (n->consumes_lds_oq() &&
			available_slots != (sh.get_ctx().has_trans ? 0x1F : 0x0F))
		return false;
	for (unsigned i = 0; i < nsrc; ++i) {

		unsigned last_id = next_id;

		value *v = n->src[i];
		if (!v->is_any_gpr() && !v->is_rel())
			continue;
		sel_chan vid = get_value_id(n->src[i]);

		if (vid > last_id && chan_count[vid.chan()] == 3) {
			return false;
		}

		n->bc.src[i].sel = vid.sel();
		n->bc.src[i].chan = vid.chan();
	}

	if (!lt.try_reserve(n))
		return false;

	if (!kc.try_reserve(n)) {
		lt.unreserve(n);
		return false;
	}

	unsigned fbs = n->forced_bank_swizzle();

	n->bc.bank_swizzle = 0;

	if (!trans && fbs)
		n->bc.bank_swizzle = VEC_210;

	if (gpr.try_reserve(n)) {
		assign_slot(slot, n);
		return true;
	}

	if (!fbs) {
		unsigned swz_num = trans ? SCL_NUM : VEC_NUM;
		for (unsigned bs = 0; bs < swz_num; ++bs) {
			n->bc.bank_swizzle = bs;
			if (gpr.try_reserve(n)) {
				assign_slot(slot, n);
				return true;
			}
		}
	}

	gpr.reset();

	slots[slot] = n;
	unsigned forced_swz_slots = 0;
	int first_slot = ~0, first_nf = ~0, last_slot = ~0;
	unsigned save_bs[5];

	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *a = slots[i];
		if (a) {
			if (first_slot == ~0)
				first_slot = i;
			last_slot = i;
			save_bs[i] = a->bc.bank_swizzle;
			if (a->forced_bank_swizzle()) {
				assert(i != SLOT_TRANS);
				forced_swz_slots |= (1 << i);
				a->bc.bank_swizzle = VEC_210;
				if (!gpr.try_reserve(a))
					assert(!"internal reservation error");
			} else {
				if (first_nf == ~0)
					first_nf = i;

				a->bc.bank_swizzle = 0;
			}
		}
	}

	if (first_nf == ~0) {
		assign_slot(slot, n);
		return true;
	}

	assert(first_slot != ~0 && last_slot != ~0);

	// silence "array subscript is above array bounds" with gcc 4.8
	if (last_slot >= 5)
		abort();

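	// Exhaustive search with backtracking over the bank swizzles of the
	// non-forced slots: advance while GPR read-port reservation succeeds,
	// otherwise bump the current slot's swizzle; when a slot runs out of
	// swizzles, reset it and back up to retry the previous slot.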
	int i = first_nf;
	alu_node *a = slots[i];
	bool backtrack = false;

	while (1) {

		PSC_DUMP(
			sblog << " bs: trying s" << i << " bs:" << a->bc.bank_swizzle
				<< " bt:" << backtrack << "\n";
		);

		if (!backtrack && gpr.try_reserve(a)) {
			PSC_DUMP(
				sblog << " bs: reserved s" << i << " bs:" << a->bc.bank_swizzle
					<< "\n";
			);

			while ((++i <= last_slot) && !slots[i]);
			if (i <= last_slot)
				a = slots[i];
			else
				break;
		} else {
			bool itrans = i == SLOT_TRANS;
			unsigned max_swz = itrans ? SCL_221 : VEC_210;

			if (a->bc.bank_swizzle < max_swz) {
				++a->bc.bank_swizzle;

				PSC_DUMP(
					sblog << " bs: inc s" << i << " bs:" << a->bc.bank_swizzle
						<< "\n";
				);

			} else {

				a->bc.bank_swizzle = 0;
				while ((--i >= first_nf) && !slots[i]);
				if (i < first_nf)
					break;
				a = slots[i];
				PSC_DUMP(
					sblog << " bs: unreserve s" << i << " bs:"
						<< a->bc.bank_swizzle << "\n";
				);
				gpr.unreserve(a);
				backtrack = true;

				continue;
			}
		}
		backtrack = false;
	}

	if (i == last_slot + 1) {
		assign_slot(slot, n);
		return true;
	}

	// reservation failed, restore previous state
	slots[slot] = NULL;
	gpr.reset();
	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *a = slots[i];
		if (a) {
			a->bc.bank_swizzle = save_bs[i];
			bool b = gpr.try_reserve(a);
			assert(b);
		}
	}

	kc.unreserve(n);
	lt.unreserve(n);
	return false;
}

bool alu_group_tracker::try_reserve(alu_packed_node* p) {
	bool need_unreserve = false;
	node_iterator I(p->begin()), E(p->end());

	for (; I != E; ++I) {
		alu_node *n = static_cast<alu_node*>(*I);
		if (!try_reserve(n))
			break;
		else
			need_unreserve = true;
	}

	if (I == E) {
		packed_ops.push_back(p);
		return true;
	}

	if (need_unreserve) {
		while (--I != E) {
			alu_node *n = static_cast<alu_node*>(*I);
			slots[n->bc.slot] = NULL;
		}
		reinit();
	}
	return false;
}

void alu_group_tracker::reinit() {
	alu_node *s[5];
	memcpy(s, slots, sizeof(slots));

	reset(true);

	for (int i = max_slots - 1; i >= 0; --i) {
		if (s[i] && !try_reserve(s[i])) {
			sblog << "alu_group_tracker: reinit error on slot " << i << "\n";
			for (unsigned i = 0; i < max_slots; ++i) {
				sblog << " slot " << i << " : ";
				if (s[i])
					dump::dump_op(s[i]);

				sblog << "\n";
			}
			assert(!"alu_group_tracker: reinit error");
		}
	}
}

void alu_group_tracker::reset(bool keep_packed) {
	kc.reset();
	gpr.reset();
	lt.reset();
	memset(slots, 0, sizeof(slots));
	vmap.clear();
	next_id = 0;
	produces_lds_oqa = 0;
	consumes_lds_oqa = 0;
	has_mova = false;
	uses_ar = false;
	has_predset = false;
	has_kill = false;
	updates_exec_mask = false;
	available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;
	interp_param = 0;

	chan_count[0] = 0;
	chan_count[1] = 0;
	chan_count[2] = 0;
	chan_count[3] = 0;

	if (!keep_packed)
		packed_ops.clear();
}

void alu_group_tracker::update_flags(alu_node* n) {
	unsigned flags = n->bc.op_ptr->flags;
	has_kill |= (flags & AF_KILL);
	has_mova |= (flags & AF_MOVA);
	has_predset |= (flags & AF_ANY_PRED);
	uses_ar |= n->uses_ar();
	consumes_lds_oqa |= n->consumes_lds_oq();
	produces_lds_oqa |= n->produces_lds_oq();
	if (flags & AF_ANY_PRED) {
		if (n->dst[2] != NULL)
			updates_exec_mask = true;
	}
}

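// post_scheduler works bottom-up: basic blocks are visited in reverse and
// instructions are scheduled from the end of each ALU clause backwards,
// which is why emitted groups are pushed onto the front of the clause.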
int post_scheduler::run() {
	return run_on(sh.root) ? 0 : 1;
}

bool post_scheduler::run_on(container_node* n) {
	bool r = true;
	for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
		if (I->is_container()) {
			if (I->subtype == NST_BB) {
				bb_node* bb = static_cast<bb_node*>(*I);
				r = schedule_bb(bb);
			} else {
				r = run_on(static_cast<container_node*>(*I));
			}
			if (!r)
				break;
		}
	}
	return r;
}

void post_scheduler::init_uc_val(container_node *c, value *v) {
	node *d = v->any_def();
	if (d && d->parent == c)
		++ucm[d];
}

void post_scheduler::init_uc_vec(container_node *c, vvec &vv, bool src) {
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			init_uc_val(c, v->rel);
			init_uc_vec(c, v->muse, true);
		} if (src) {
			init_uc_val(c, v);
		}
	}
}

unsigned post_scheduler::init_ucm(container_node *c, node *n) {
	init_uc_vec(c, n->src, true);
	init_uc_vec(c, n->dst, false);

	uc_map::iterator F = ucm.find(n);
	return F == ucm.end() ? 0 : F->second;
}

bool post_scheduler::schedule_bb(bb_node* bb) {
	PSC_DUMP(
		sblog << "scheduling BB " << bb->id << "\n";
		if (!pending.empty())
			dump::dump_op_list(&pending);
	);

	assert(pending.empty());
	assert(bb_pending.empty());
	assert(ready.empty());

	bb_pending.append_from(bb);
	cur_bb = bb;

	node *n;

	while ((n = bb_pending.back())) {

		PSC_DUMP(
			sblog << "post_sched_bb ";
			dump::dump_op(n);
			sblog << "\n";
		);

		// May require emitting ALU ops to load index registers
		if (n->is_fetch_clause()) {
			n->remove();
			process_fetch(static_cast<container_node *>(n));
			continue;
		}

		if (n->is_alu_clause()) {
			n->remove();
			bool r = process_alu(static_cast<container_node*>(n));
			if (r)
				continue;
			return false;
		}

		n->remove();
		bb->push_front(n);
	}

	this->cur_bb = NULL;
	return true;
}

void post_scheduler::init_regmap() {

	regmap.clear();

	PSC_DUMP(
		sblog << "init_regmap: live: ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);

	for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
		value *v = *I;
		assert(v);
		if (!v->is_sgpr() || !v->is_prealloc())
			continue;

		sel_chan r = v->gpr;

		PSC_DUMP(
			sblog << "init_regmap: " << r << " <= ";
			dump::dump_val(v);
			sblog << "\n";
		);

		assert(r);
		regmap[r] = v;
	}
}

static alu_node *create_set_idx(shader &sh, unsigned ar_idx) {
	alu_node *a = sh.create_alu();

	assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1);
	if (ar_idx == V_SQ_CF_INDEX_0)
		a->bc.set_op(ALU_OP0_SET_CF_IDX0);
	else
		a->bc.set_op(ALU_OP0_SET_CF_IDX1);
	a->bc.slot = SLOT_X;
	a->dst.resize(1); // Dummy needed for recolor

	PSC_DUMP(
		sblog << "created IDX load: ";
		dump::dump_op(a);
		sblog << "\n";
	);

	return a;
}

void post_scheduler::load_index_register(value *v, unsigned ar_idx)
{
	alu.reset();

	if (!sh.get_ctx().is_cayman()) {
		// Evergreen has to first load address register, then use CF_SET_IDX0/1
		alu_group_tracker &rt = alu.grp();
		alu_node *set_idx = create_set_idx(sh, ar_idx);
		if (!rt.try_reserve(set_idx)) {
			sblog << "can't emit SET_CF_IDX";
			dump::dump_op(set_idx);
			sblog << "\n";
		}
		process_group();

		if (!alu.check_clause_limits()) {
			// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
		}
		alu.emit_group();
	}

	alu_group_tracker &rt = alu.grp();
	alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y);

	if (!rt.try_reserve(a)) {
		sblog << "can't emit AR load : ";
		dump::dump_op(a);
		sblog << "\n";
	}

	process_group();

	if (!alu.check_clause_limits()) {
		// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
	}

	alu.emit_group();
	alu.emit_clause(cur_bb);
}

void post_scheduler::process_fetch(container_node *c) {
	if (c->empty())
		return;

	for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) {
		N = I;
		++N;

		node *n = *I;

		fetch_node *f = static_cast<fetch_node*>(n);

		PSC_DUMP(
			sblog << "process_tex ";
			dump::dump_op(n);
			sblog << " ";
		);

		// TODO: If same values used can avoid reloading index register
		if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ||
				f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
			unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ?
				f->bc.sampler_index_mode : f->bc.resource_index_mode;

			// Currently require prior opt passes to use one TEX per indexed op
			assert(f->parent->count() == 1);

			value *v = f->src.back(); // Last src is index offset
			assert(v);

			cur_bb->push_front(c);

			load_index_register(v, index_mode);
			f->src.pop_back(); // Don't need index value any more

			return;
		}
	}

	cur_bb->push_front(c);
}

bool post_scheduler::process_alu(container_node *c) {

	if (c->empty())
		return true;

	ucm.clear();
	alu.reset();

	live = c->live_after;

	init_globals(c->live_after, true);
	init_globals(c->live_before, true);

	init_regmap();

	update_local_interferences();

	for (node_riterator N, I = c->rbegin(), E = c->rend(); I != E; I = N) {
		N = I;
		++N;

		node *n = *I;
		unsigned uc = init_ucm(c, n);

		PSC_DUMP(
			sblog << "process_alu uc=" << uc << " ";
			dump::dump_op(n);
			sblog << " ";
		);

		if (uc) {
			n->remove();

			pending.push_back(n);
			PSC_DUMP( sblog << "pending\n"; );
		} else {
			release_op(n);
		}
	}

	return schedule_alu(c);
}

void post_scheduler::update_local_interferences() {

	PSC_DUMP(
		sblog << "update_local_interferences : ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);

	for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
		value *v = *I;
		if (v->is_prealloc())
			continue;

		v->interferences.add_set(live);
	}
}

void post_scheduler::update_live_src_vec(vvec &vv, val_set *born, bool src) {
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;

		if (!v)
			continue;

		if (src && v->is_any_gpr()) {
			if (live.add_val(v)) {
				if (!v->is_prealloc()) {
					if (!cleared_interf.contains(v)) {
						PSC_DUMP(
							sblog << "clearing interferences for " << *v << "\n";
						);
						v->interferences.clear();
						cleared_interf.add_val(v);
					}
				}
				if (born)
					born->add_val(v);
			}
		} else if (v->is_rel()) {
			if (!v->rel->is_any_gpr())
				live.add_val(v->rel);
			update_live_src_vec(v->muse, born, true);
		}
	}
}

void post_scheduler::update_live_dst_vec(vvec &vv) {
	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if (v->is_rel()) {
			update_live_dst_vec(v->mdef);
		} else if (v->is_any_gpr()) {
			if (!live.remove_val(v)) {
				PSC_DUMP(
					sblog << "failed to remove ";
					dump::dump_val(v);
					sblog << " from live : ";
					dump::dump_set(sh, live);
					sblog << "\n";
				);
			}
		}
	}
}

void post_scheduler::update_live(node *n, val_set *born) {
	update_live_dst_vec(n->dst);
	update_live_src_vec(n->src, born, true);
	update_live_src_vec(n->dst, born, false);
}

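// process_group: since scheduling is bottom-up, dst values of the group die
// and src values become live; the updated live set is then recorded in the
// interference information of every non-preallocated value.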
void post_scheduler::process_group() {
	alu_group_tracker &rt = alu.grp();

	val_set vals_born;

	recolor_locals();

	PSC_DUMP(
		sblog << "process_group: live_before : ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);

	for (unsigned s = 0; s < ctx.num_slots; ++s) {
		alu_node *n = rt.slot(s);
		if (!n)
			continue;

		update_live(n, &vals_born);
	}

	PSC_DUMP(
		sblog << "process_group: live_after : ";
		dump::dump_set(sh, live);
		sblog << "\n";
	);

	update_local_interferences();

	for (unsigned i = 0; i < 5; ++i) {
		node *n = rt.slot(i);
		if (n && !n->is_mova()) {
			release_src_values(n);
		}
	}
}

void post_scheduler::init_globals(val_set &s, bool prealloc) {

	PSC_DUMP(
		sblog << "init_globals: ";
		dump::dump_set(sh, s);
		sblog << "\n";
	);

	for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) {
		value *v = *I;
		if (v->is_sgpr() && !v->is_global()) {
			v->set_global();

			if (prealloc && v->is_fixed()) {
				v->set_prealloc();
			}
		}
	}
}

void post_scheduler::emit_index_registers() {
	for (unsigned i = 0; i < 2; i++) {
		if (alu.current_idx[i]) {
			regmap = prev_regmap;
			alu.discard_current_group();

			load_index_register(alu.current_idx[i], KC_INDEX_0 + i);
			alu.current_idx[i] = NULL;
		}
	}
}

void post_scheduler::emit_clause() {

	if (alu.current_ar) {
		emit_load_ar();
		process_group();
		if (!alu.check_clause_limits()) {
			// Can't happen since clause only contains MOVA/CF_SET_IDX0/1
		}
		alu.emit_group();
	}

	if (!alu.is_empty()) {
		alu.emit_clause(cur_bb);
	}

	emit_index_registers();
}

bool post_scheduler::schedule_alu(container_node *c) {

	assert(!ready.empty() || !ready_copies.empty());

	/* This number is rather arbitrary; what matters is that the scheduler
	 * gets more than one try to create an instruction group.
	 */
	int improving = 10;
	int last_pending = pending.count();
	while (improving > 0) {
		prev_regmap = regmap;
		if (!prepare_alu_group()) {

			int new_pending = pending.count();
			if ((new_pending < last_pending) || (last_pending == 0))
				improving = 10;
			else
				--improving;

			last_pending = new_pending;

			if (alu.current_idx[0] || alu.current_idx[1]) {
				regmap = prev_regmap;
				emit_clause();
				init_globals(live, false);

				continue;
			}

			if (alu.current_ar) {
				emit_load_ar();
				continue;
			} else
				break;
		}

		if (!alu.check_clause_limits()) {
			regmap = prev_regmap;
			emit_clause();
			init_globals(live, false);

			continue;
		}

		process_group();
		alu.emit_group();
	}

	if (!alu.is_empty()) {
		emit_clause();
	}

	if (!ready.empty()) {
		sblog << "##post_scheduler: unscheduled ready instructions :";
		dump::dump_op_list(&ready);
	}

	if (!pending.empty()) {
		sblog << "##post_scheduler: unscheduled pending instructions :";
		dump::dump_op_list(&pending);
	}
	return pending.empty() && ready.empty() && improving != 0;
}

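// Collect into the bitset rb all register sels in v's channel that are
// already taken by fixed, interfering values, so that recolor_local can pick
// a free one.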
void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) {
	unsigned chan = v->gpr.chan();

	for (val_set::iterator I = vs.begin(sh), E = vs.end(sh);
			I != E; ++I) {
		value *vi = *I;
		sel_chan gpr = vi->get_final_gpr();

		if (vi->is_any_gpr() && gpr && vi != v &&
				(!v->chunk || v->chunk != vi->chunk) &&
				vi->is_fixed() && gpr.chan() == chan) {

			unsigned r = gpr.sel();

			PSC_DUMP(
				sblog << "\tadd_interferences: " << *vi << "\n";
			);

			if (rb.size() <= r)
				rb.resize(r + 32);
			rb.set(r);
		}
	}
}

void post_scheduler::set_color_local_val(value *v, sel_chan color) {
	v->gpr = color;

	PSC_DUMP(
		sblog << " recolored: ";
		dump::dump_val(v);
		sblog << "\n";
	);
}

void post_scheduler::set_color_local(value *v, sel_chan color) {
	if (v->chunk) {
		vvec &vv = v->chunk->values;
		for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
			value *v2 = *I;
			set_color_local_val(v2, color);
		}
		v->chunk->fix();
	} else {
		set_color_local_val(v, color);
		v->fix();
	}
}

bool post_scheduler::recolor_local(value *v) {

	sb_bitset rb;

	assert(v->is_sgpr());
	assert(!v->is_prealloc());
	assert(v->gpr);

	unsigned chan = v->gpr.chan();

	PSC_DUMP(
		sblog << "recolor_local: ";
		dump::dump_val(v);
		sblog << " interferences: ";
		dump::dump_set(sh, v->interferences);
		sblog << "\n";
		if (v->chunk) {
			sblog << " in chunk: ";
			coalescer::dump_chunk(v->chunk);
			sblog << "\n";
		}
	);

	if (v->chunk) {
		for (vvec::iterator I = v->chunk->values.begin(),
				E = v->chunk->values.end(); I != E; ++I) {
			value *v2 = *I;

			PSC_DUMP( sblog << " add_interferences for " << *v2 << " :\n"; );

			add_interferences(v, rb, v2->interferences);
		}
	} else {
		add_interferences(v, rb, v->interferences);
	}

	PSC_DUMP(
		unsigned sz = rb.size();
		sblog << "registers bits: " << sz;
		for (unsigned r = 0; r < sz; ++r) {
			if ((r & 7) == 0)
				sblog << "\n " << r << " ";
			sblog << (rb.get(r) ? 1 : 0);
		}
	);

	bool no_temp_gprs = v->is_global();
	unsigned rs, re, pass = no_temp_gprs ? 1 : 0;

	while (pass < 2) {

		if (pass == 0) {
			rs = sh.first_temp_gpr();
			re = MAX_GPR;
		} else {
			rs = 0;
			re = sh.num_nontemp_gpr();
		}

		for (unsigned reg = rs; reg < re; ++reg) {
			if (reg >= rb.size() || !rb.get(reg)) {
				// color found
				set_color_local(v, sel_chan(reg, chan));
				return true;
			}
		}
		++pass;
	}

	assert(!"recolor_local failed");
	return true;
}

void post_scheduler::emit_load_ar() {

	regmap = prev_regmap;
	alu.discard_current_group();

	alu_group_tracker &rt = alu.grp();
	alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X);

	if (!rt.try_reserve(a)) {
		sblog << "can't emit AR load : ";
		dump::dump_op(a);
		sblog << "\n";
	}

	alu.current_ar = 0;
}

bool post_scheduler::unmap_dst_val(value *d) {

	if (d == alu.current_ar) {
		emit_load_ar();
		return false;
	}

	if (d->is_prealloc()) {
		sel_chan gpr = d->get_final_gpr();
		rv_map::iterator F = regmap.find(gpr);
		value *c = NULL;
		if (F != regmap.end())
			c = F->second;

		if (c && c != d && (!c->chunk || c->chunk != d->chunk)) {
			PSC_DUMP(
				sblog << "dst value conflict : ";
				dump::dump_val(d);
				sblog << " regmap contains ";
				dump::dump_val(c);
				sblog << "\n";
			);
			assert(!"scheduler error");
			return false;
		} else if (c) {
			regmap.erase(F);
		}
	}
	return true;
}

bool post_scheduler::unmap_dst(alu_node *n) {
	value *d = n->dst.empty() ? NULL : n->dst[0];

	if (!d)
		return true;

	if (!d->is_rel()) {
		if (d && d->is_any_reg()) {

			if (d->is_AR()) {
				if (alu.current_ar != d) {
					sblog << "loading wrong ar value\n";
					assert(0);
				} else {
					alu.current_ar = NULL;
				}

			} else if (d->is_any_gpr()) {
				if (!unmap_dst_val(d))
					return false;
			}
		}
	} else {
		for (vvec::iterator I = d->mdef.begin(), E = d->mdef.end();
				I != E; ++I) {
			d = *I;
			if (!d)
				continue;

			assert(d->is_any_gpr());

			if (!unmap_dst_val(d))
				return false;
		}
	}
	return true;
}

bool post_scheduler::map_src_val(value *v) {

	if (!v->is_prealloc())
		return true;

	sel_chan gpr = v->get_final_gpr();
	rv_map::iterator F = regmap.find(gpr);
	value *c = NULL;
	if (F != regmap.end()) {
		c = F->second;
		if (!v->v_equal(c)) {
			PSC_DUMP(
				sblog << "can't map src value ";
				dump::dump_val(v);
				sblog << ", regmap contains ";
				dump::dump_val(c);
				sblog << "\n";
			);
			return false;
		}
	} else {
		regmap.insert(std::make_pair(gpr, v));
	}
	return true;
}

bool post_scheduler::map_src_vec(vvec &vv, bool src) {
	if (src) {
		// Handle possible UBO indexing
		bool ubo_indexing[2] = { false, false };
		for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
			value *v = *I;
			if (!v)
				continue;

			if (v->is_kcache()) {
				unsigned index_mode = v->select.kcache_index_mode();
				if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) {
					ubo_indexing[index_mode - KC_INDEX_0] = true;
				}
			}
		}

		// idx values stored at end of src vec, see bc_parser::prepare_alu_group
		for (unsigned i = 2; i != 0; i--) {
			if (ubo_indexing[i-1]) {
				// TODO: skip adding value to kcache reservation somehow, causes
				// unnecessary group breaks and cache line locks
				value *v = vv.back();
				if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) {
					PSC_DUMP(
						sblog << "IDX" << i-1 << " already set to " <<
							*alu.current_idx[i-1] << ", trying to set " << *v << "\n";
					);
					return false;
				}

				alu.current_idx[i-1] = v;
				PSC_DUMP( sblog << "IDX" << i-1 << " set to " << *v << "\n"; );
			}
		}
	}

	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v)
			continue;

		if ((!v->is_any_gpr() || !v->is_fixed()) && !v->is_rel())
			continue;

		if (v->is_rel()) {
			value *rel = v->rel;
			assert(rel);

			if (!rel->is_const()) {
				if (!map_src_vec(v->muse, true))
					return false;

				if (rel != alu.current_ar) {
					if (alu.current_ar) {
						PSC_DUMP(
							sblog << " current_AR is " << *alu.current_ar
								<< " trying to use " << *rel << "\n";
						);
						return false;
					}

					alu.current_ar = rel;

					PSC_DUMP(
						sblog << " new current_AR assigned: " << *alu.current_ar
							<< "\n";
					);
				}
			}

		} else if (src) {
			if (!map_src_val(v)) {
				return false;
			}
		}
	}
	return true;
}

bool post_scheduler::map_src(alu_node *n) {
	if (!map_src_vec(n->dst, false))
		return false;

	if (!map_src_vec(n->src, true))
		return false;

	return true;
}

void post_scheduler::dump_regmap() {

	sblog << "# REGMAP :\n";

	for (rv_map::iterator I = regmap.begin(), E = regmap.end(); I != E; ++I) {
		sblog << " # " << I->first << " => " << *(I->second) << "\n";
	}

	if (alu.current_ar)
		sblog << " current_AR: " << *alu.current_ar << "\n";
	if (alu.current_pr)
		sblog << " current_PR: " << *alu.current_pr << "\n";
	if (alu.current_idx[0])
		sblog << " current IDX0: " << *alu.current_idx[0] << "\n";
	if (alu.current_idx[1])
		sblog << " current IDX1: " << *alu.current_idx[1] << "\n";
}

void post_scheduler::recolor_locals() {
	alu_group_tracker &rt = alu.grp();

	for (unsigned s = 0; s < ctx.num_slots; ++s) {
		alu_node *n = rt.slot(s);
		if (n) {
			value *d = n->dst[0];
			if (d && d->is_sgpr() && !d->is_prealloc()) {
				recolor_local(d);
			}
		}
	}
}

// returns true if there are interferences
bool post_scheduler::check_interferences() {

	alu_group_tracker &rt = alu.grp();

	unsigned interf_slots;

	bool discarded = false;

	PSC_DUMP(
		sblog << "check_interferences: before: \n";
		dump_regmap();
	);

	do {

		interf_slots = 0;

		for (unsigned s = 0; s < ctx.num_slots; ++s) {
			alu_node *n = rt.slot(s);
			if (n) {
				if (!unmap_dst(n)) {
					return true;
				}
			}
		}

		for (unsigned s = 0; s < ctx.num_slots; ++s) {
			alu_node *n = rt.slot(s);
			if (n) {
				if (!map_src(n)) {
					interf_slots |= (1 << s);
				}
			}
		}

		PSC_DUMP(
			for (unsigned i = 0; i < 5; ++i) {
				if (interf_slots & (1 << i)) {
					sblog << "!!!!!! interf slot: " << i << " : ";
					dump::dump_op(rt.slot(i));
					sblog << "\n";
				}
			}
		);

		if (!interf_slots)
			break;

		PSC_DUMP( sblog << "ci: discarding slots " << interf_slots << "\n"; );

		rt.discard_slots(interf_slots, alu.conflict_nodes);
		regmap = prev_regmap;
		discarded = true;

	} while (1);

	PSC_DUMP(
		sblog << "check_interferences: after: \n";
		dump_regmap();
	);

	return discarded;
}

// add instruction(s) (alu_node or contents of alu_packed_node) to current group
// returns the number of added instructions on success
unsigned post_scheduler::try_add_instruction(node *n) {

	alu_group_tracker &rt = alu.grp();

	unsigned avail_slots = rt.avail_slots();

	// Cannot schedule in same clause as instructions using this index value
	if (!n->dst.empty() && n->dst[0] &&
			(n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) {
		PSC_DUMP( sblog << " CF_IDX source: " << *n->dst[0] << "\n"; );
		return 0;
	}

	if (n->is_alu_packed()) {
		alu_packed_node *p = static_cast<alu_packed_node*>(n);
		unsigned slots = p->get_slot_mask();
		unsigned cnt = __builtin_popcount(slots);

		if ((slots & avail_slots) != slots) {
			PSC_DUMP( sblog << " no slots \n"; );
			return 0;
		}

		p->update_packed_items(ctx);

		if (!rt.try_reserve(p)) {
			PSC_DUMP( sblog << " reservation failed \n"; );
			return 0;
		}

		p->remove();
		return cnt;

	} else {
		alu_node *a = static_cast<alu_node*>(n);
		value *d = a->dst.empty() ? NULL : a->dst[0];

		if (d && d->is_special_reg()) {
			assert((a->bc.op_ptr->flags & AF_MOVA) || d->is_geometry_emit() ||
					d->is_lds_oq() || d->is_lds_access() || d->is_scratch());
			d = NULL;
		}

		unsigned allowed_slots = ctx.alu_slots_mask(a->bc.op_ptr);
		unsigned slot;

		allowed_slots &= avail_slots;

		if (!allowed_slots)
			return 0;

		if (d) {
			slot = d->get_final_chan();
			a->bc.dst_chan = slot;
			allowed_slots &= (1 << slot) | 0x10;
		} else {
			if (a->bc.op_ptr->flags & AF_MOVA) {
				if (a->bc.slot_flags & AF_V)
					allowed_slots &= (1 << SLOT_X);
				else
					allowed_slots &= (1 << SLOT_TRANS);
			}
		}

		// FIXME workaround for some problems with MULADD in trans slot on r700,
		// (is it really needed on r600?)
		if ((a->bc.op == ALU_OP3_MULADD || a->bc.op == ALU_OP3_MULADD_IEEE) &&
				!ctx.is_egcm()) {
			allowed_slots &= 0x0F;
		}

		if (!allowed_slots) {
			PSC_DUMP( sblog << " no suitable slots\n"; );
			return 0;
		}

		slot = __builtin_ctz(allowed_slots);
		a->bc.slot = slot;

		PSC_DUMP( sblog << "slot: " << slot << "\n"; );

		if (!rt.try_reserve(a)) {
			PSC_DUMP( sblog << " reservation failed\n"; );
			return 0;
		}

		a->remove();
		return 1;
	}
}

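// check_copy: a copy MOV whose source and destination ended up in the same
// GPR does not need to be scheduled at all; once the regmap confirms the
// destination is ready, the copy is coalesced and removed.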
bool post_scheduler::check_copy(node *n) {
	if (!n->is_copy_mov())
		return false;

	value *s = n->src[0];
	value *d = n->dst[0];

	if (!s->is_sgpr() || !d->is_sgpr())
		return false;

	if (!s->is_prealloc()) {
		recolor_local(s);

		if (!s->chunk || s->chunk != d->chunk)
			return false;
	}

	if (s->gpr == d->gpr) {

		PSC_DUMP(
			sblog << "check_copy: ";
			dump::dump_op(n);
			sblog << "\n";
		);

		rv_map::iterator F = regmap.find(d->gpr);
		bool gpr_free = (F == regmap.end());

		if (d->is_prealloc()) {
			if (gpr_free) {
				PSC_DUMP( sblog << " copy not ready...\n"; );
				return true;
			}

			value *rv = F->second;
			if (rv != d && (!rv->chunk || rv->chunk != d->chunk)) {
				PSC_DUMP( sblog << " copy not ready(2)...\n"; );
				return true;
			}

			unmap_dst(static_cast<alu_node*>(n));
		}

		if (s->is_prealloc() && !map_src_val(s))
			return true;

		update_live(n, NULL);

		release_src_values(n);
		n->remove();
		PSC_DUMP( sblog << " copy coalesced...\n"; );
		return true;
	}
	return false;
}

void post_scheduler::dump_group(alu_group_tracker &rt) {
	for (unsigned i = 0; i < 5; ++i) {
		node *n = rt.slot(i);
		if (n) {
			sblog << "slot " << i << " : ";
			dump::dump_op(n);
			sblog << "\n";
		}
	}
}

void post_scheduler::process_ready_copies() {

	node *last;

	do {
		last = ready_copies.back();

		for (node_iterator N, I = ready_copies.begin(), E = ready_copies.end();
				I != E; I = N) {
			N = I; ++N;

			node *n = *I;

			if (!check_copy(n)) {
				n->remove();
				ready.push_back(n);
			}
		}
	} while (last != ready_copies.back());

	update_local_interferences();
}


bool post_scheduler::prepare_alu_group() {

	alu_group_tracker &rt = alu.grp();

	unsigned i1 = 0;

	PSC_DUMP(
		sblog << "prepare_alu_group: starting...\n";
		dump_group(rt);
	);

	ready.append_from(&alu.conflict_nodes);

	// FIXME rework this loop

	do {

		process_ready_copies();

		++i1;

		for (node_iterator N, I = ready.begin(), E = ready.end(); I != E;
				I = N) {
			N = I; ++N;
			node *n = *I;

			PSC_DUMP(
				sblog << "p_a_g: ";
				dump::dump_op(n);
				sblog << "\n";
			);

			unsigned cnt = try_add_instruction(n);

			if (!cnt)
				continue;

			PSC_DUMP(
				sblog << "current group:\n";
				dump_group(rt);
			);

			if (rt.inst_count() == ctx.num_slots) {
				PSC_DUMP( sblog << " all slots used\n"; );
				break;
			}
		}

		if (!check_interferences())
			break;

		// don't try to add more instructions to the group with mova if this
		// can lead to breaking clause slot count limit - we don't want mova to
		// end up in the end of the new clause instead of beginning of the
		// current clause.
		if (rt.has_ar_load() && alu.total_slots() > 121)
			break;

		if (rt.inst_count() && i1 > 50)
			break;

		regmap = prev_regmap;

	} while (1);

	PSC_DUMP(
		sblog << " prepare_alu_group done, " << rt.inst_count()
			<< " slot(s) \n";

		sblog << "$$$$$$$$PAG i1=" << i1
			<< " ready " << ready.count()
			<< " pending " << pending.count()
			<< " conflicting " << alu.conflict_nodes.count()
			<< "\n";
	);

	return rt.inst_count();
}

void post_scheduler::release_src_values(node* n) {
	release_src_vec(n->src, true);
	release_src_vec(n->dst, false);
}

void post_scheduler::release_op(node *n) {
	PSC_DUMP(
		sblog << "release_op ";
		dump::dump_op(n);
		sblog << "\n";
	);

	n->remove();

	if (n->is_copy_mov()) {
		ready_copies.push_back(n);
	} else if (n->is_mova() || n->is_pred_set()) {
		ready.push_front(n);
	} else {
		ready.push_back(n);
	}
}

void post_scheduler::release_src_val(value *v) {
	node *d = v->any_def();
	if (d) {
		if (!--ucm[d])
			release_op(d);
	}
}

void post_scheduler::release_src_vec(vvec& vv, bool src) {

	for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
		value *v = *I;
		if (!v || v->is_readonly())
			continue;

		if (v->is_rel()) {
			release_src_val(v->rel);
			release_src_vec(v->muse, true);

		} else if (src) {
			release_src_val(v);
		}
	}
}

void literal_tracker::reset() {
	lt[0].u = 0;
	lt[1].u = 0;
	lt[2].u = 0;
	lt[3].u = 0;
	memset(uc, 0, sizeof(uc));
}

void rp_gpr_tracker::reset() {
	memset(rp, 0, sizeof(rp));
	memset(uc, 0, sizeof(uc));
}

void rp_kcache_tracker::reset() {
	memset(rp, 0, sizeof(rp));
	memset(uc, 0, sizeof(uc));
}

void alu_kcache_tracker::reset() {
	memset(kc, 0, sizeof(kc));
	lines.clear();
}

void alu_clause_tracker::reset() {
	group = 0;
	slot_count = 0;
	outstanding_lds_oqa_reads = 0;
	grp0.reset();
	grp1.reset();
}

alu_clause_tracker::alu_clause_tracker(shader &sh)
	: sh(sh), kt(sh.get_ctx().hw_class), slot_count(),
	  grp0(sh), grp1(sh),
	  group(), clause(),
	  push_exec_mask(), outstanding_lds_oqa_reads(),
	  current_ar(), current_pr(), current_idx() {}

void alu_clause_tracker::emit_group() {

	assert(grp().inst_count());

	alu_group_node *g = grp().emit();

	if (grp().has_update_exec_mask()) {
		assert(!push_exec_mask);
		push_exec_mask = true;
	}

	assert(g);

	if (!clause) {
		clause = sh.create_clause(NST_ALU_CLAUSE);
	}

	clause->push_front(g);

	outstanding_lds_oqa_reads += grp().get_consumes_lds_oqa();
	outstanding_lds_oqa_reads -= grp().get_produces_lds_oqa();
	slot_count += grp().slot_count();

	new_group();

	PSC_DUMP( sblog << " #### group emitted\n"; );
}

void alu_clause_tracker::emit_clause(container_node *c) {
	assert(clause);

	kt.init_clause(clause->bc);

	assert(!outstanding_lds_oqa_reads);
	assert(!current_ar);
	assert(!current_pr);

	if (push_exec_mask)
		clause->bc.set_op(CF_OP_ALU_PUSH_BEFORE);

	c->push_front(clause);

	clause = NULL;
	push_exec_mask = false;
	slot_count = 0;
	kt.reset();

	PSC_DUMP( sblog << "######### ALU clause emitted\n"; );
}

bool alu_clause_tracker::check_clause_limits() {

	alu_group_tracker &gt = grp();

	unsigned slots = gt.slot_count();

	// reserving slots to load AR and PR values
	unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0);
	// ...and index registers
	reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL);

	if (gt.get_consumes_lds_oqa() && !outstanding_lds_oqa_reads)
		reserve_slots += 60;

	if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots)
		return false;

	if (!kt.try_reserve(gt))
		return false;

	return true;
}

void alu_clause_tracker::new_group() {
	group = !group;
	grp().reset();
}

bool alu_clause_tracker::is_empty() {
	return clause == NULL;
}

void literal_tracker::init_group_literals(alu_group_node* g) {

	g->literals.clear();
	for (unsigned i = 0; i < 4; ++i) {
		if (!lt[i])
			break;

		g->literals.push_back(lt[i]);

		PSC_DUMP(
			sblog << "literal emitted: " << lt[i].f;
			sblog.print_zw_hex(lt[i].u, 8);
			sblog << " " << lt[i].i << "\n";
		);
	}
}

bool alu_kcache_tracker::try_reserve(alu_group_tracker& gt) {
	rp_kcache_tracker &kt = gt.kcache();

	if (!kt.num_sels())
		return true;

	sb_set<unsigned> group_lines;

	unsigned nl = kt.get_lines(group_lines);
	assert(nl);

	sb_set<unsigned> clause_lines(lines);
	lines.add_set(group_lines);

	if (clause_lines.size() == lines.size())
		return true;

	if (update_kc())
		return true;

	lines = clause_lines;

	return false;
}

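// The rp entries hold kc_sel() values (non-zero, with the kcache index mode
// apparently packed into the top three bits); get_lines converts each to a
// kcache line number and re-tags it with its index mode so that update_kc
// can assign lock modes per line.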
unsigned rp_kcache_tracker::get_lines(kc_lines& lines) {
	unsigned cnt = 0;

	for (unsigned i = 0; i < sel_count; ++i) {
		unsigned line = rp[i] & 0x1fffffffu;
		unsigned index_mode = rp[i] >> 29;

		if (!line)
			return cnt;

		--line;
		line = (sel_count == 2) ? line >> 5 : line >> 6;
		line |= index_mode << 29;

		if (lines.insert(line).second)
			++cnt;
	}
	return cnt;
}

bool alu_kcache_tracker::update_kc() {
	unsigned c = 0;

	bc_kcache old_kc[4];
	memcpy(old_kc, kc, sizeof(kc));

	for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) {
		unsigned index_mode = *I >> 29;
		unsigned line = *I & 0x1fffffffu;
		unsigned bank = line >> 8;

		assert(index_mode <= KC_INDEX_INVALID);
		line &= 0xFF;

		if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) &&
				kc[c-1].index_mode == index_mode)
		{
			kc[c-1].mode = KC_LOCK_2;
		} else {
			if (c == max_kcs) {
				memcpy(kc, old_kc, sizeof(kc));
				return false;
			}

			kc[c].mode = KC_LOCK_1;

			kc[c].bank = bank;
			kc[c].addr = line;
			kc[c].index_mode = index_mode;
			++c;
		}
	}
	return true;
}

alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) {
	alu_node *a = sh.create_alu();

	if (sh.get_ctx().uses_mova_gpr) {
		a->bc.set_op(ALU_OP1_MOVA_GPR_INT);
		a->bc.slot = SLOT_TRANS;
	} else {
		a->bc.set_op(ALU_OP1_MOVA_INT);
		a->bc.slot = SLOT_X;
	}
	a->bc.dst_chan = ar_channel;
	if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) {
		a->bc.dst_gpr = ar_channel == SEL_Y ?
			CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
	}

	a->dst.resize(1);
	a->src.push_back(v);

	PSC_DUMP(
		sblog << "created AR load: ";
		dump::dump_op(a);
		sblog << "\n";
	);

	return a;
}

void alu_clause_tracker::discard_current_group() {
	PSC_DUMP( sblog << "act::discard_current_group\n"; );
	grp().discard_all_slots(conflict_nodes);
}

void rp_gpr_tracker::dump() {
	sblog << "=== gpr_tracker dump:\n";
	for (int c = 0; c < 3; ++c) {
		sblog << "cycle " << c << " ";
		for (int h = 0; h < 4; ++h) {
			sblog << rp[c][h] << ":" << uc[c][h] << " ";
		}
		sblog << "\n";
	}
}

} // namespace r600_sb