1 /*
2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Vadim Girlin
25 */
26
// Compile-time switch for the scheduler's trace output in this file.
// Set to a nonzero value to enable the PSC_DUMP() statements.
#define PSC_DEBUG 0

// PSC_DUMP(a): when PSC_DEBUG is nonzero the statements passed as `a` are
// emitted wrapped in do { } while (0); otherwise the macro expands to
// nothing. The argument must not contain a top-level comma, since the macro
// takes exactly one parameter.
#if PSC_DEBUG
#define PSC_DUMP(a) do { a } while (0)
#else
#define PSC_DUMP(a)
#endif
34
35 #include "sb_bc.h"
36 #include "sb_shader.h"
37 #include "sb_pass.h"
38 #include "sb_sched.h"
39 #include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1
40
41 namespace r600_sb {
42
rp_kcache_tracker(shader & sh)43 rp_kcache_tracker::rp_kcache_tracker(shader &sh) : rp(), uc(),
44 // FIXME: for now we'll use "two const pairs" limit for r600, same as
45 // for other chips, otherwise additional check in alu_group_tracker is
46 // required to make sure that all 4 consts in the group fit into 2
47 // kcache sets
48 sel_count(2) {}
49
try_reserve(sel_chan r)50 bool rp_kcache_tracker::try_reserve(sel_chan r) {
51 unsigned sel = kc_sel(r);
52
53 for (unsigned i = 0; i < sel_count; ++i) {
54 if (rp[i] == 0) {
55 rp[i] = sel;
56 ++uc[i];
57 return true;
58 }
59 if (rp[i] == sel) {
60 ++uc[i];
61 return true;
62 }
63 }
64 return false;
65 }
66
try_reserve(node * n)67 bool rp_kcache_tracker::try_reserve(node* n) {
68 bool need_unreserve = false;
69 vvec::iterator I(n->src.begin()), E(n->src.end());
70
71 for (; I != E; ++I) {
72 value *v = *I;
73 if (v->is_kcache()) {
74 if (!try_reserve(v->select))
75 break;
76 else
77 need_unreserve = true;
78 }
79 }
80 if (I == E)
81 return true;
82
83 if (need_unreserve && I != n->src.begin()) {
84 do {
85 --I;
86 value *v =*I;
87 if (v->is_kcache())
88 unreserve(v->select);
89 } while (I != n->src.begin());
90 }
91 return false;
92 }
93
94 inline
unreserve(node * n)95 void rp_kcache_tracker::unreserve(node* n) {
96 vvec::iterator I(n->src.begin()), E(n->src.end());
97 for (; I != E; ++I) {
98 value *v = *I;
99 if (v->is_kcache())
100 unreserve(v->select);
101 }
102 }
103
unreserve(sel_chan r)104 void rp_kcache_tracker::unreserve(sel_chan r) {
105 unsigned sel = kc_sel(r);
106
107 for (unsigned i = 0; i < sel_count; ++i)
108 if (rp[i] == sel) {
109 if (--uc[i] == 0)
110 rp[i] = 0;
111 return;
112 }
113 assert(0);
114 return;
115 }
116
try_reserve(alu_node * n)117 bool literal_tracker::try_reserve(alu_node* n) {
118 bool need_unreserve = false;
119
120 vvec::iterator I(n->src.begin()), E(n->src.end());
121
122 for (; I != E; ++I) {
123 value *v = *I;
124 if (v->is_literal()) {
125 if (!try_reserve(v->literal_value))
126 break;
127 else
128 need_unreserve = true;
129 }
130 }
131 if (I == E)
132 return true;
133
134 if (need_unreserve && I != n->src.begin()) {
135 do {
136 --I;
137 value *v =*I;
138 if (v->is_literal())
139 unreserve(v->literal_value);
140 } while (I != n->src.begin());
141 }
142 return false;
143 }
144
unreserve(alu_node * n)145 void literal_tracker::unreserve(alu_node* n) {
146 unsigned nsrc = n->bc.op_ptr->src_count, i;
147
148 for (i = 0; i < nsrc; ++i) {
149 value *v = n->src[i];
150 if (v->is_literal())
151 unreserve(v->literal_value);
152 }
153 }
154
try_reserve(literal l)155 bool literal_tracker::try_reserve(literal l) {
156
157 PSC_DUMP( sblog << "literal reserve " << l.u << " " << l.f << "\n"; );
158
159 for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {
160 if (lt[i] == 0) {
161 lt[i] = l;
162 ++uc[i];
163 PSC_DUMP( sblog << " reserved new uc = " << uc[i] << "\n"; );
164 return true;
165 } else if (lt[i] == l) {
166 ++uc[i];
167 PSC_DUMP( sblog << " reserved uc = " << uc[i] << "\n"; );
168 return true;
169 }
170 }
171 PSC_DUMP( sblog << " failed to reserve literal\n"; );
172 return false;
173 }
174
unreserve(literal l)175 void literal_tracker::unreserve(literal l) {
176
177 PSC_DUMP( sblog << "literal unreserve " << l.u << " " << l.f << "\n"; );
178
179 for (unsigned i = 0; i < MAX_ALU_LITERALS; ++i) {
180 if (lt[i] == l) {
181 if (--uc[i] == 0)
182 lt[i] = 0;
183 return;
184 }
185 }
186 assert(0);
187 return;
188 }
189
bs_cycle_vector(unsigned bs,unsigned src)190 static inline unsigned bs_cycle_vector(unsigned bs, unsigned src) {
191 static const unsigned swz[VEC_NUM][3] = {
192 {0, 1, 2}, {0, 2, 1}, {1, 2, 0}, {1, 0, 2}, {2, 0, 1}, {2, 1, 0}
193 };
194 assert(bs < VEC_NUM && src < 3);
195 return swz[bs][src];
196 }
197
bs_cycle_scalar(unsigned bs,unsigned src)198 static inline unsigned bs_cycle_scalar(unsigned bs, unsigned src) {
199 static const unsigned swz[SCL_NUM][3] = {
200 {2, 1, 0}, {1, 2, 2}, {2, 1, 2}, {2, 2, 1}
201 };
202
203 if (bs >= SCL_NUM || src >= 3) {
204 // this prevents gcc warning "array subscript is above array bounds"
205 // AFAICS we should never hit this path
206 abort();
207 }
208 return swz[bs][src];
209 }
210
bs_cycle(bool trans,unsigned bs,unsigned src)211 static inline unsigned bs_cycle(bool trans, unsigned bs, unsigned src) {
212 return trans ? bs_cycle_scalar(bs, src) : bs_cycle_vector(bs, src);
213 }
214
215 inline
try_reserve(unsigned cycle,unsigned sel,unsigned chan)216 bool rp_gpr_tracker::try_reserve(unsigned cycle, unsigned sel, unsigned chan) {
217 ++sel;
218 if (rp[cycle][chan] == 0) {
219 rp[cycle][chan] = sel;
220 ++uc[cycle][chan];
221 return true;
222 } else if (rp[cycle][chan] == sel) {
223 ++uc[cycle][chan];
224 return true;
225 }
226 return false;
227 }
228
229 inline
unreserve(alu_node * n)230 void rp_gpr_tracker::unreserve(alu_node* n) {
231 unsigned nsrc = n->bc.op_ptr->src_count, i;
232 unsigned trans = n->bc.slot == SLOT_TRANS;
233 unsigned bs = n->bc.bank_swizzle;
234 unsigned opt = !trans
235 && n->bc.src[0].sel == n->bc.src[1].sel
236 && n->bc.src[0].chan == n->bc.src[1].chan;
237
238 for (i = 0; i < nsrc; ++i) {
239 value *v = n->src[i];
240 if (v->is_readonly() || v->is_undef())
241 continue;
242 if (i == 1 && opt)
243 continue;
244 unsigned cycle = bs_cycle(trans, bs, i);
245 unreserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan);
246 }
247 }
248
249 inline
unreserve(unsigned cycle,unsigned sel,unsigned chan)250 void rp_gpr_tracker::unreserve(unsigned cycle, unsigned sel, unsigned chan) {
251 ++sel;
252 assert(rp[cycle][chan] == sel && uc[cycle][chan]);
253 if (--uc[cycle][chan] == 0)
254 rp[cycle][chan] = 0;
255 }
256
// Tries to reserve GPR read ports for all source operands of `n`, using the
// bank swizzle currently stored in n->bc.bank_swizzle.
//
// Returns true if every operand could be placed. On failure all ports that
// were reserved by this call are released again and false is returned, so
// the tracker is left as it was on entry.
inline
bool rp_gpr_tracker::try_reserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count, i;
	unsigned trans = n->bc.slot == SLOT_TRANS;
	unsigned bs = n->bc.bank_swizzle;
	// Outside the trans slot, when sources 0 and 1 are the same value,
	// source 1 is skipped and only source 0 takes a port.
	unsigned opt = !trans && nsrc >= 2 &&
			n->src[0] == n->src[1];

	bool need_unreserve = false;
	// const_count: number of read-only/undefined sources seen so far.
	// min_gpr_cycle: lowest cycle used by a GPR source (trans slot only);
	// starts at 3, which is above every value in the swizzle tables.
	unsigned const_count = 0, min_gpr_cycle = 3;

	for (i = 0; i < nsrc; ++i) {
		value *v = n->src[i];
		if (v->is_readonly() || v->is_undef()) {
			const_count++;
			// three such sources in the trans slot are rejected
			if (trans && const_count == 3)
				break;
		} else {
			if (i == 1 && opt)
				continue;

			unsigned cycle = bs_cycle(trans, bs, i);

			if (trans && cycle < min_gpr_cycle)
				min_gpr_cycle = cycle;

			// trans slot: reject a GPR source whose cycle is lower than
			// the number of constant sources counted so far
			if (const_count && cycle < const_count && trans)
				break;

			if (!try_reserve(cycle, n->bc.src[i].sel, n->bc.src[i].chan))
				break;
			else
				need_unreserve = true;
		}
	}

	// Success requires that the loop ran to completion and that the final
	// constant count does not exceed min_gpr_cycle + 1.
	if ((i == nsrc) && (min_gpr_cycle + 1 > const_count))
		return true;

	// Roll back: walk from the source just below the stopping index down to
	// source 0, releasing each port taken above. `i--` in the condition both
	// tests for "nothing below" and steps onto the first source to undo.
	if (need_unreserve && i--) {
		do {
			value *v = n->src[i];
			if (!v->is_readonly() && !v->is_undef()) {
				// `continue` jumps to the while (i--) test below
				if (i == 1 && opt)
					continue;
				unreserve(bs_cycle(trans, bs, i), n->bc.src[i].sel,
						n->bc.src[i].chan);
			}
		} while (i--);
	}
	return false;
}
309
alu_group_tracker(shader & sh)310 alu_group_tracker::alu_group_tracker(shader &sh)
311 : sh(sh), kc(sh),
312 gpr(), lt(), slots(),
313 max_slots(sh.get_ctx().is_cayman() ? 4 : 5),
314 has_mova(), uses_ar(), has_predset(), has_kill(),
315 updates_exec_mask(), chan_count(), interp_param(), next_id() {
316
317 available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;
318 }
319
320 inline
get_value_id(value * v)321 sel_chan alu_group_tracker::get_value_id(value* v) {
322 unsigned &id = vmap[v];
323 if (!id)
324 id = ++next_id;
325 return sel_chan(id, v->get_final_chan());
326 }
327
328 inline
assign_slot(unsigned slot,alu_node * n)329 void alu_group_tracker::assign_slot(unsigned slot, alu_node* n) {
330 update_flags(n);
331 slots[slot] = n;
332 available_slots &= ~(1 << slot);
333
334 unsigned param = n->interp_param();
335
336 if (param) {
337 assert(!interp_param || interp_param == param);
338 interp_param = param;
339 }
340 }
341
342
discard_all_slots(container_node & removed_nodes)343 void alu_group_tracker::discard_all_slots(container_node &removed_nodes) {
344 PSC_DUMP( sblog << "agt::discard_all_slots\n"; );
345 discard_slots(~available_slots & ((1 << max_slots) - 1), removed_nodes);
346 }
347
// Removes the instructions occupying the slots selected by `slot_mask` from
// the group and appends them to `removed_nodes`.
//
// Packed operations are handled first and as a unit: if any of a packed
// op's slots is selected, the whole packed node is removed. Remaining
// selected slots are then removed one by one. Finally the tracker state is
// rebuilt from the surviving slots via reinit().
void alu_group_tracker::discard_slots(unsigned slot_mask,
		container_node &removed_nodes) {

	PSC_DUMP(
		sblog << "discard_slots : packed_ops : "
			<< (unsigned)packed_ops.size() << "\n";
	);

	// N is the iterator to continue with; it is computed before the body
	// and replaced by erase()'s return value when the element is removed.
	for (node_vec::iterator N, I = packed_ops.begin();
			I != packed_ops.end(); I = N) {
		N = I; ++N;

		alu_packed_node *n = static_cast<alu_packed_node*>(*I);
		unsigned pslots = n->get_slot_mask();

		PSC_DUMP(
			sblog << "discard_slots : packed slot_mask : " << pslots << "\n";
		);

		if (pslots & slot_mask) {

			PSC_DUMP(
				sblog << "discard_slots : discarding packed...\n";
			);

			removed_nodes.push_back(n);
			// the packed node's slots are dealt with here, so drop them
			// from the mask processed by the per-slot loop below
			slot_mask &= ~pslots;
			N = packed_ops.erase(I);
			available_slots |= pslots;
			for (unsigned k = 0; k < max_slots; ++k) {
				if (pslots & (1 << k))
					slots[k] = NULL;
			}
		}
	}

	// Individual (non-packed) instructions in the remaining selected slots.
	for (unsigned slot = 0; slot < max_slots; ++slot) {
		unsigned slot_bit = 1 << slot;

		if (slot_mask & slot_bit) {
			assert(!(available_slots & slot_bit));
			assert(slots[slot]);

			assert(!(slots[slot]->bc.slot_flags & AF_4SLOT));

			PSC_DUMP(
				sblog << "discarding slot " << slot << " : ";
				dump::dump_op(slots[slot]);
				sblog << "\n";
			);

			removed_nodes.push_back(slots[slot]);
			slots[slot] = NULL;
			available_slots |= slot_bit;
		}
	}

	// If an instruction flagged AF_V is sitting in slot 4 (the trans slot,
	// per the dump message below) and the slot matching its destination
	// channel is now empty, move it there.
	alu_node *t = slots[4];
	if (t && (t->bc.slot_flags & AF_V)) {
		unsigned chan = t->bc.dst_chan;
		if (!slots[chan]) {
			PSC_DUMP(
				sblog << "moving ";
				dump::dump_op(t);
				sblog << " from trans slot to free slot " << chan << "\n";
			);

			slots[chan] = t;
			slots[4] = NULL;
			t->bc.slot = chan;
		}
	}

	// Rebuild all reservations from the slots that are still occupied.
	reinit();
}
423
emit()424 alu_group_node* alu_group_tracker::emit() {
425
426 alu_group_node *g = sh.create_alu_group();
427
428 lt.init_group_literals(g);
429
430 for (unsigned i = 0; i < max_slots; ++i) {
431 alu_node *n = slots[i];
432 if (n) {
433 g->push_back(n);
434 }
435 }
436 return g;
437 }
438
// Tries to add ALU instruction `n` to the group, in the slot given by
// n->bc.slot.
//
// The instruction is accepted only if its slot is free, it does not clash
// with the group's flag state (kill/predicate/MOVA/AR use, interpolation
// parameter), and literal, kcache and GPR read-port reservations all
// succeed. For the GPR ports a bank swizzle is searched: first for `n`
// alone against the existing reservations, then, if that fails, by
// re-assigning the swizzles of all instructions in the group with a
// backtracking search.
//
// Returns true and records `n` in its slot on success. Returns false on
// failure; on the final failure path the previous swizzles and GPR
// reservations of the other instructions are restored and the literal and
// kcache reservations made for `n` are released.
bool alu_group_tracker::try_reserve(alu_node* n) {
	unsigned nsrc = n->bc.op_ptr->src_count;
	unsigned slot = n->bc.slot;
	bool trans = slot == 4;

	// slot already occupied
	if (slots[slot])
		return false;

	unsigned flags = n->bc.op_ptr->flags;

	unsigned param = n->interp_param();

	// a different nonzero interpolation parameter is already recorded
	if (param && interp_param && interp_param != param)
		return false;

	// flag combinations that are rejected within one group
	if ((flags & AF_KILL) && has_predset)
		return false;
	if ((flags & AF_ANY_PRED) && (has_kill || has_predset))
		return false;
	if ((flags & AF_MOVA) && (has_mova || uses_ar))
		return false;

	if (n->uses_ar() && has_mova)
		return false;

	// Give every GPR/relative source a group-local id and store it in
	// n->bc.src[i] as sel/chan; the read-port tracker works on these.
	// NOTE(review): ids created here via get_value_id() are not rolled back
	// by the early returns below.
	for (unsigned i = 0; i < nsrc; ++i) {

		unsigned last_id = next_id;

		value *v = n->src[i];
		if (!v->is_any_gpr() && !v->is_rel())
			continue;
		sel_chan vid = get_value_id(n->src[i]);

		if (vid > last_id && chan_count[vid.chan()] == 3) {
			return false;
		}

		n->bc.src[i].sel = vid.sel();
		n->bc.src[i].chan = vid.chan();
	}

	if (!lt.try_reserve(n))
		return false;

	if (!kc.try_reserve(n)) {
		lt.unreserve(n);
		return false;
	}

	unsigned fbs = n->forced_bank_swizzle();

	n->bc.bank_swizzle = 0;

	// an instruction with a forced swizzle outside the trans slot gets
	// VEC_210
	if (!trans && fbs)
		n->bc.bank_swizzle = VEC_210;

	// Fast path: the initial swizzle fits with the existing reservations.
	if (gpr.try_reserve(n)) {
		assign_slot(slot, n);
		return true;
	}

	// Next: try every swizzle for `n` alone, leaving the others untouched.
	if (!fbs) {
		unsigned swz_num = trans ? SCL_NUM : VEC_NUM;
		for (unsigned bs = 0; bs < swz_num; ++bs) {
			n->bc.bank_swizzle = bs;
			if (gpr.try_reserve(n)) {
				assign_slot(slot, n);
				return true;
			}
		}
	}

	// Slow path: drop all GPR reservations and search swizzles for the
	// whole group, including `n`, which is placed in its slot tentatively.
	gpr.reset();

	slots[slot] = n;
	unsigned forced_swz_slots = 0;
	// ~0 (i.e. -1) marks "not set" for these indices
	int first_slot = ~0, first_nf = ~0, last_slot = ~0;
	unsigned save_bs[5];

	// Remember each instruction's swizzle, reserve the forced-swizzle ones
	// right away, and reset the swizzle of all others to 0. first_nf is the
	// first occupied slot without a forced swizzle.
	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *a = slots[i];
		if (a) {
			if (first_slot == ~0)
				first_slot = i;
			last_slot = i;
			save_bs[i] = a->bc.bank_swizzle;
			if (a->forced_bank_swizzle()) {
				assert(i != SLOT_TRANS);
				forced_swz_slots |= (1 << i);
				a->bc.bank_swizzle = VEC_210;
				if (!gpr.try_reserve(a))
					assert(!"internal reservation error");
			} else {
				if (first_nf == ~0)
					first_nf = i;

				a->bc.bank_swizzle = 0;
			}
		}
	}

	// every instruction has a forced swizzle and was reserved above
	if (first_nf == ~0) {
		assign_slot(slot, n);
		return true;
	}

	assert(first_slot != ~0 && last_slot != ~0);

	// silence "array subscript is above array bounds" with gcc 4.8
	if (last_slot >= 5)
		abort();

	// Backtracking search over the occupied slots first_nf..last_slot.
	// Going forward, the current slot is reserved with its current swizzle;
	// if that fails the swizzle is incremented and retried. When a slot runs
	// out of swizzles it is reset to 0 and the search steps back to the
	// previous occupied slot, releases it, and (via `backtrack`) moves on to
	// that slot's next swizzle instead of retrying the same one.
	// The loop ends with i == last_slot + 1 on success, or i < first_nf when
	// all combinations are exhausted.
	// NOTE(review): occupied slots with a forced swizzle that lie between
	// first_nf and last_slot are visited by this walk too - confirm that is
	// intended.
	int i = first_nf;
	alu_node *a = slots[i];
	bool backtrack = false;

	while (1) {

		PSC_DUMP(
			sblog << " bs: trying s" << i << " bs:" << a->bc.bank_swizzle
				<< " bt:" << backtrack << "\n";
		);

		if (!backtrack && gpr.try_reserve(a)) {
			PSC_DUMP(
				sblog << " bs: reserved s" << i << " bs:" << a->bc.bank_swizzle
					<< "\n";
			);

			// advance to the next occupied slot, if any
			while ((++i <= last_slot) && !slots[i]);
			if (i <= last_slot)
				a = slots[i];
			else
				break;
		} else {
			bool itrans = i == SLOT_TRANS;
			unsigned max_swz = itrans ? SCL_221 : VEC_210;

			if (a->bc.bank_swizzle < max_swz) {
				++a->bc.bank_swizzle;

				PSC_DUMP(
					sblog << " bs: inc s" << i << " bs:" << a->bc.bank_swizzle
						<< "\n";
				);

			} else {

				// out of swizzles for this slot: step back
				a->bc.bank_swizzle = 0;
				while ((--i >= first_nf) && !slots[i]);
				if (i < first_nf)
					break;
				a = slots[i];
				PSC_DUMP(
					sblog << " bs: unreserve s" << i << " bs:" << a->bc.bank_swizzle
						<< "\n";
				);
				gpr.unreserve(a);
				backtrack = true;

				continue;
			}
		}
		backtrack = false;
	}

	if (i == last_slot + 1) {
		assign_slot(slot, n);
		return true;
	}

	// reservation failed, restore previous state
	slots[slot] = NULL;
	gpr.reset();
	for (unsigned i = 0; i < max_slots; ++i) {
		alu_node *a = slots[i];
		if (a) {
			a->bc.bank_swizzle = save_bs[i];
			bool b = gpr.try_reserve(a);
			assert(b);
		}
	}

	kc.unreserve(n);
	lt.unreserve(n);
	return false;
}
627
try_reserve(alu_packed_node * p)628 bool alu_group_tracker::try_reserve(alu_packed_node* p) {
629 bool need_unreserve = false;
630 node_iterator I(p->begin()), E(p->end());
631
632 for (; I != E; ++I) {
633 alu_node *n = static_cast<alu_node*>(*I);
634 if (!try_reserve(n))
635 break;
636 else
637 need_unreserve = true;
638 }
639
640 if (I == E) {
641 packed_ops.push_back(p);
642 return true;
643 }
644
645 if (need_unreserve) {
646 while (--I != E) {
647 alu_node *n = static_cast<alu_node*>(*I);
648 slots[n->bc.slot] = NULL;
649 }
650 reinit();
651 }
652 return false;
653 }
654
reinit()655 void alu_group_tracker::reinit() {
656 alu_node * s[5];
657 memcpy(s, slots, sizeof(slots));
658
659 reset(true);
660
661 for (int i = max_slots - 1; i >= 0; --i) {
662 if (s[i] && !try_reserve(s[i])) {
663 sblog << "alu_group_tracker: reinit error on slot " << i << "\n";
664 for (unsigned i = 0; i < max_slots; ++i) {
665 sblog << " slot " << i << " : ";
666 if (s[i])
667 dump::dump_op(s[i]);
668
669 sblog << "\n";
670 }
671 assert(!"alu_group_tracker: reinit error");
672 }
673 }
674 }
675
reset(bool keep_packed)676 void alu_group_tracker::reset(bool keep_packed) {
677 kc.reset();
678 gpr.reset();
679 lt.reset();
680 memset(slots, 0, sizeof(slots));
681 vmap.clear();
682 next_id = 0;
683 has_mova = false;
684 uses_ar = false;
685 has_predset = false;
686 has_kill = false;
687 updates_exec_mask = false;
688 available_slots = sh.get_ctx().has_trans ? 0x1F : 0x0F;
689 interp_param = 0;
690
691 chan_count[0] = 0;
692 chan_count[1] = 0;
693 chan_count[2] = 0;
694 chan_count[3] = 0;
695
696 if (!keep_packed)
697 packed_ops.clear();
698 }
699
update_flags(alu_node * n)700 void alu_group_tracker::update_flags(alu_node* n) {
701 unsigned flags = n->bc.op_ptr->flags;
702 has_kill |= (flags & AF_KILL);
703 has_mova |= (flags & AF_MOVA);
704 has_predset |= (flags & AF_ANY_PRED);
705 uses_ar |= n->uses_ar();
706
707 if (flags & AF_ANY_PRED) {
708 if (n->dst[2] != NULL)
709 updates_exec_mask = true;
710 }
711 }
712
run()713 int post_scheduler::run() {
714 run_on(sh.root);
715 return 0;
716 }
717
run_on(container_node * n)718 void post_scheduler::run_on(container_node* n) {
719
720 for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
721 if (I->is_container()) {
722 if (I->subtype == NST_BB) {
723 bb_node* bb = static_cast<bb_node*>(*I);
724 schedule_bb(bb);
725 } else {
726 run_on(static_cast<container_node*>(*I));
727 }
728 }
729 }
730 }
731
init_uc_val(container_node * c,value * v)732 void post_scheduler::init_uc_val(container_node *c, value *v) {
733 node *d = v->any_def();
734 if (d && d->parent == c)
735 ++ucm[d];
736 }
737
init_uc_vec(container_node * c,vvec & vv,bool src)738 void post_scheduler::init_uc_vec(container_node *c, vvec &vv, bool src) {
739 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
740 value *v = *I;
741 if (!v || v->is_readonly())
742 continue;
743
744 if (v->is_rel()) {
745 init_uc_val(c, v->rel);
746 init_uc_vec(c, v->muse, true);
747 } if (src) {
748 init_uc_val(c, v);
749 }
750 }
751 }
752
init_ucm(container_node * c,node * n)753 unsigned post_scheduler::init_ucm(container_node *c, node *n) {
754 init_uc_vec(c, n->src, true);
755 init_uc_vec(c, n->dst, false);
756
757 uc_map::iterator F = ucm.find(n);
758 return F == ucm.end() ? 0 : F->second;
759 }
760
schedule_bb(bb_node * bb)761 void post_scheduler::schedule_bb(bb_node* bb) {
762 PSC_DUMP(
763 sblog << "scheduling BB " << bb->id << "\n";
764 if (!pending.empty())
765 dump::dump_op_list(&pending);
766 );
767
768 assert(pending.empty());
769 assert(bb_pending.empty());
770 assert(ready.empty());
771
772 bb_pending.append_from(bb);
773 cur_bb = bb;
774
775 node *n;
776
777 while ((n = bb_pending.back())) {
778
779 PSC_DUMP(
780 sblog << "post_sched_bb ";
781 dump::dump_op(n);
782 sblog << "\n";
783 );
784
785 // May require emitting ALU ops to load index registers
786 if (n->is_fetch_clause()) {
787 n->remove();
788 process_fetch(static_cast<container_node *>(n));
789 continue;
790 }
791
792 if (n->is_alu_clause()) {
793 n->remove();
794 process_alu(static_cast<container_node*>(n));
795 continue;
796 }
797
798 n->remove();
799 bb->push_front(n);
800 }
801
802 this->cur_bb = NULL;
803 }
804
init_regmap()805 void post_scheduler::init_regmap() {
806
807 regmap.clear();
808
809 PSC_DUMP(
810 sblog << "init_regmap: live: ";
811 dump::dump_set(sh, live);
812 sblog << "\n";
813 );
814
815 for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
816 value *v = *I;
817 assert(v);
818 if (!v->is_sgpr() || !v->is_prealloc())
819 continue;
820
821 sel_chan r = v->gpr;
822
823 PSC_DUMP(
824 sblog << "init_regmap: " << r << " <= ";
825 dump::dump_val(v);
826 sblog << "\n";
827 );
828
829 assert(r);
830 regmap[r] = v;
831 }
832 }
833
create_set_idx(shader & sh,unsigned ar_idx)834 static alu_node *create_set_idx(shader &sh, unsigned ar_idx) {
835 alu_node *a = sh.create_alu();
836
837 assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1);
838 if (ar_idx == V_SQ_CF_INDEX_0)
839 a->bc.set_op(ALU_OP0_SET_CF_IDX0);
840 else
841 a->bc.set_op(ALU_OP0_SET_CF_IDX1);
842 a->bc.slot = SLOT_X;
843 a->dst.resize(1); // Dummy needed for recolor
844
845 PSC_DUMP(
846 sblog << "created IDX load: ";
847 dump::dump_op(a);
848 sblog << "\n";
849 );
850
851 return a;
852 }
853
load_index_register(value * v,unsigned ar_idx)854 void post_scheduler::load_index_register(value *v, unsigned ar_idx)
855 {
856 alu.reset();
857
858 if (!sh.get_ctx().is_cayman()) {
859 // Evergreen has to first load address register, then use CF_SET_IDX0/1
860 alu_group_tracker &rt = alu.grp();
861 alu_node *set_idx = create_set_idx(sh, ar_idx);
862 if (!rt.try_reserve(set_idx)) {
863 sblog << "can't emit SET_CF_IDX";
864 dump::dump_op(set_idx);
865 sblog << "\n";
866 }
867 process_group();
868
869 if (!alu.check_clause_limits()) {
870 // Can't happen since clause only contains MOVA/CF_SET_IDX0/1
871 }
872 alu.emit_group();
873 }
874
875 alu_group_tracker &rt = alu.grp();
876 alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y);
877
878 if (!rt.try_reserve(a)) {
879 sblog << "can't emit AR load : ";
880 dump::dump_op(a);
881 sblog << "\n";
882 }
883
884 process_group();
885
886 if (!alu.check_clause_limits()) {
887 // Can't happen since clause only contains MOVA/CF_SET_IDX0/1
888 }
889
890 alu.emit_group();
891 alu.emit_clause(cur_bb);
892 }
893
process_fetch(container_node * c)894 void post_scheduler::process_fetch(container_node *c) {
895 if (c->empty())
896 return;
897
898 for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) {
899 N = I;
900 ++N;
901
902 node *n = *I;
903
904 fetch_node *f = static_cast<fetch_node*>(n);
905
906 PSC_DUMP(
907 sblog << "process_tex ";
908 dump::dump_op(n);
909 sblog << " ";
910 );
911
912 // TODO: If same values used can avoid reloading index register
913 if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ||
914 f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
915 unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ?
916 f->bc.sampler_index_mode : f->bc.resource_index_mode;
917
918 // Currently require prior opt passes to use one TEX per indexed op
919 assert(f->parent->count() == 1);
920
921 value *v = f->src.back(); // Last src is index offset
922 assert(v);
923
924 cur_bb->push_front(c);
925
926 load_index_register(v, index_mode);
927 f->src.pop_back(); // Don't need index value any more
928
929 return;
930 }
931 }
932
933 cur_bb->push_front(c);
934 }
935
process_alu(container_node * c)936 void post_scheduler::process_alu(container_node *c) {
937
938 if (c->empty())
939 return;
940
941 ucm.clear();
942 alu.reset();
943
944 live = c->live_after;
945
946 init_globals(c->live_after, true);
947 init_globals(c->live_before, true);
948
949 init_regmap();
950
951 update_local_interferences();
952
953 for (node_riterator N, I = c->rbegin(), E = c->rend(); I != E; I = N) {
954 N = I;
955 ++N;
956
957 node *n = *I;
958 unsigned uc = init_ucm(c, n);
959
960 PSC_DUMP(
961 sblog << "process_alu uc=" << uc << " ";
962 dump::dump_op(n);
963 sblog << " ";
964 );
965
966 if (uc) {
967 n->remove();
968
969 pending.push_back(n);
970 PSC_DUMP( sblog << "pending\n"; );
971 } else {
972 release_op(n);
973 }
974 }
975
976 schedule_alu(c);
977 }
978
update_local_interferences()979 void post_scheduler::update_local_interferences() {
980
981 PSC_DUMP(
982 sblog << "update_local_interferences : ";
983 dump::dump_set(sh, live);
984 sblog << "\n";
985 );
986
987
988 for (val_set::iterator I = live.begin(sh), E = live.end(sh); I != E; ++I) {
989 value *v = *I;
990 if (v->is_prealloc())
991 continue;
992
993 v->interferences.add_set(live);
994 }
995 }
996
update_live_src_vec(vvec & vv,val_set * born,bool src)997 void post_scheduler::update_live_src_vec(vvec &vv, val_set *born, bool src) {
998 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
999 value *v = *I;
1000
1001 if (!v)
1002 continue;
1003
1004 if (src && v->is_any_gpr()) {
1005 if (live.add_val(v)) {
1006 if (!v->is_prealloc()) {
1007 if (!cleared_interf.contains(v)) {
1008 PSC_DUMP(
1009 sblog << "clearing interferences for " << *v << "\n";
1010 );
1011 v->interferences.clear();
1012 cleared_interf.add_val(v);
1013 }
1014 }
1015 if (born)
1016 born->add_val(v);
1017 }
1018 } else if (v->is_rel()) {
1019 if (!v->rel->is_any_gpr())
1020 live.add_val(v->rel);
1021 update_live_src_vec(v->muse, born, true);
1022 }
1023 }
1024 }
1025
update_live_dst_vec(vvec & vv)1026 void post_scheduler::update_live_dst_vec(vvec &vv) {
1027 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
1028 value *v = *I;
1029 if (!v)
1030 continue;
1031
1032 if (v->is_rel()) {
1033 update_live_dst_vec(v->mdef);
1034 } else if (v->is_any_gpr()) {
1035 if (!live.remove_val(v)) {
1036 PSC_DUMP(
1037 sblog << "failed to remove ";
1038 dump::dump_val(v);
1039 sblog << " from live : ";
1040 dump::dump_set(sh, live);
1041 sblog << "\n";
1042 );
1043 }
1044 }
1045 }
1046 }
1047
update_live(node * n,val_set * born)1048 void post_scheduler::update_live(node *n, val_set *born) {
1049 update_live_dst_vec(n->dst);
1050 update_live_src_vec(n->src, born, true);
1051 update_live_src_vec(n->dst, born, false);
1052 }
1053
process_group()1054 void post_scheduler::process_group() {
1055 alu_group_tracker &rt = alu.grp();
1056
1057 val_set vals_born;
1058
1059 recolor_locals();
1060
1061 PSC_DUMP(
1062 sblog << "process_group: live_before : ";
1063 dump::dump_set(sh, live);
1064 sblog << "\n";
1065 );
1066
1067 for (unsigned s = 0; s < ctx.num_slots; ++s) {
1068 alu_node *n = rt.slot(s);
1069 if (!n)
1070 continue;
1071
1072 update_live(n, &vals_born);
1073 }
1074
1075 PSC_DUMP(
1076 sblog << "process_group: live_after : ";
1077 dump::dump_set(sh, live);
1078 sblog << "\n";
1079 );
1080
1081 update_local_interferences();
1082
1083 for (unsigned i = 0; i < 5; ++i) {
1084 node *n = rt.slot(i);
1085 if (n && !n->is_mova()) {
1086 release_src_values(n);
1087 }
1088 }
1089 }
1090
init_globals(val_set & s,bool prealloc)1091 void post_scheduler::init_globals(val_set &s, bool prealloc) {
1092
1093 PSC_DUMP(
1094 sblog << "init_globals: ";
1095 dump::dump_set(sh, s);
1096 sblog << "\n";
1097 );
1098
1099 for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) {
1100 value *v = *I;
1101 if (v->is_sgpr() && !v->is_global()) {
1102 v->set_global();
1103
1104 if (prealloc && v->is_fixed()) {
1105 v->set_prealloc();
1106 }
1107 }
1108 }
1109 }
1110
emit_index_registers()1111 void post_scheduler::emit_index_registers() {
1112 for (unsigned i = 0; i < 2; i++) {
1113 if (alu.current_idx[i]) {
1114 regmap = prev_regmap;
1115 alu.discard_current_group();
1116
1117 load_index_register(alu.current_idx[i], KC_INDEX_0 + i);
1118 alu.current_idx[i] = NULL;
1119 }
1120 }
1121 }
1122
emit_clause()1123 void post_scheduler::emit_clause() {
1124
1125 if (alu.current_ar) {
1126 emit_load_ar();
1127 process_group();
1128 alu.emit_group();
1129 }
1130
1131 if (!alu.is_empty()) {
1132 alu.emit_clause(cur_bb);
1133 }
1134
1135 emit_index_registers();
1136 }
1137
schedule_alu(container_node * c)1138 void post_scheduler::schedule_alu(container_node *c) {
1139
1140 assert(!ready.empty() || !ready_copies.empty());
1141
1142 while (1) {
1143
1144 prev_regmap = regmap;
1145
1146 if (!prepare_alu_group()) {
1147 if (alu.current_idx[0] || alu.current_idx[1]) {
1148 regmap = prev_regmap;
1149 emit_clause();
1150 init_globals(live, false);
1151
1152 continue;
1153 }
1154
1155 if (alu.current_ar) {
1156 emit_load_ar();
1157 continue;
1158 } else
1159 break;
1160 }
1161
1162 if (!alu.check_clause_limits()) {
1163 regmap = prev_regmap;
1164 emit_clause();
1165 init_globals(live, false);
1166
1167 continue;
1168 }
1169
1170 process_group();
1171 alu.emit_group();
1172 };
1173
1174 if (!alu.is_empty()) {
1175 emit_clause();
1176 }
1177
1178 if (!ready.empty()) {
1179 sblog << "##post_scheduler: unscheduled ready instructions :";
1180 dump::dump_op_list(&ready);
1181 assert(!"unscheduled ready instructions");
1182 }
1183
1184 if (!pending.empty()) {
1185 sblog << "##post_scheduler: unscheduled pending instructions :";
1186 dump::dump_op_list(&pending);
1187 assert(!"unscheduled pending instructions");
1188 }
1189 }
1190
add_interferences(value * v,sb_bitset & rb,val_set & vs)1191 void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) {
1192 unsigned chan = v->gpr.chan();
1193
1194 for (val_set::iterator I = vs.begin(sh), E = vs.end(sh);
1195 I != E; ++I) {
1196 value *vi = *I;
1197 sel_chan gpr = vi->get_final_gpr();
1198
1199 if (vi->is_any_gpr() && gpr && vi != v &&
1200 (!v->chunk || v->chunk != vi->chunk) &&
1201 vi->is_fixed() && gpr.chan() == chan) {
1202
1203 unsigned r = gpr.sel();
1204
1205 PSC_DUMP(
1206 sblog << "\tadd_interferences: " << *vi << "\n";
1207 );
1208
1209 if (rb.size() <= r)
1210 rb.resize(r + 32);
1211 rb.set(r);
1212 }
1213 }
1214 }
1215
set_color_local_val(value * v,sel_chan color)1216 void post_scheduler::set_color_local_val(value *v, sel_chan color) {
1217 v->gpr = color;
1218
1219 PSC_DUMP(
1220 sblog << " recolored: ";
1221 dump::dump_val(v);
1222 sblog << "\n";
1223 );
1224 }
1225
set_color_local(value * v,sel_chan color)1226 void post_scheduler::set_color_local(value *v, sel_chan color) {
1227 if (v->chunk) {
1228 vvec &vv = v->chunk->values;
1229 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
1230 value *v2 =*I;
1231 set_color_local_val(v2, color);
1232 }
1233 v->chunk->fix();
1234 } else {
1235 set_color_local_val(v, color);
1236 v->fix();
1237 }
1238 }
1239
recolor_local(value * v)1240 bool post_scheduler::recolor_local(value *v) {
1241
1242 sb_bitset rb;
1243
1244 assert(v->is_sgpr());
1245 assert(!v->is_prealloc());
1246 assert(v->gpr);
1247
1248 unsigned chan = v->gpr.chan();
1249
1250 PSC_DUMP(
1251 sblog << "recolor_local: ";
1252 dump::dump_val(v);
1253 sblog << " interferences: ";
1254 dump::dump_set(sh, v->interferences);
1255 sblog << "\n";
1256 if (v->chunk) {
1257 sblog << " in chunk: ";
1258 coalescer::dump_chunk(v->chunk);
1259 sblog << "\n";
1260 }
1261 );
1262
1263 if (v->chunk) {
1264 for (vvec::iterator I = v->chunk->values.begin(),
1265 E = v->chunk->values.end(); I != E; ++I) {
1266 value *v2 = *I;
1267
1268 PSC_DUMP( sblog << " add_interferences for " << *v2 << " :\n"; );
1269
1270 add_interferences(v, rb, v2->interferences);
1271 }
1272 } else {
1273 add_interferences(v, rb, v->interferences);
1274 }
1275
1276 PSC_DUMP(
1277 unsigned sz = rb.size();
1278 sblog << "registers bits: " << sz;
1279 for (unsigned r = 0; r < sz; ++r) {
1280 if ((r & 7) == 0)
1281 sblog << "\n " << r << " ";
1282 sblog << (rb.get(r) ? 1 : 0);
1283 }
1284 );
1285
1286 bool no_temp_gprs = v->is_global();
1287 unsigned rs, re, pass = no_temp_gprs ? 1 : 0;
1288
1289 while (pass < 2) {
1290
1291 if (pass == 0) {
1292 rs = sh.first_temp_gpr();
1293 re = MAX_GPR;
1294 } else {
1295 rs = 0;
1296 re = sh.num_nontemp_gpr();
1297 }
1298
1299 for (unsigned reg = rs; reg < re; ++reg) {
1300 if (reg >= rb.size() || !rb.get(reg)) {
1301 // color found
1302 set_color_local(v, sel_chan(reg, chan));
1303 return true;
1304 }
1305 }
1306 ++pass;
1307 }
1308
1309 assert(!"recolor_local failed");
1310 return true;
1311 }
1312
emit_load_ar()1313 void post_scheduler::emit_load_ar() {
1314
1315 regmap = prev_regmap;
1316 alu.discard_current_group();
1317
1318 alu_group_tracker &rt = alu.grp();
1319 alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X);
1320
1321 if (!rt.try_reserve(a)) {
1322 sblog << "can't emit AR load : ";
1323 dump::dump_op(a);
1324 sblog << "\n";
1325 }
1326
1327 alu.current_ar = 0;
1328 }
1329
unmap_dst_val(value * d)1330 bool post_scheduler::unmap_dst_val(value *d) {
1331
1332 if (d == alu.current_ar) {
1333 emit_load_ar();
1334 return false;
1335 }
1336
1337 if (d->is_prealloc()) {
1338 sel_chan gpr = d->get_final_gpr();
1339 rv_map::iterator F = regmap.find(gpr);
1340 value *c = NULL;
1341 if (F != regmap.end())
1342 c = F->second;
1343
1344 if (c && c!=d && (!c->chunk || c->chunk != d->chunk)) {
1345 PSC_DUMP(
1346 sblog << "dst value conflict : ";
1347 dump::dump_val(d);
1348 sblog << " regmap contains ";
1349 dump::dump_val(c);
1350 sblog << "\n";
1351 );
1352 assert(!"scheduler error");
1353 return false;
1354 } else if (c) {
1355 regmap.erase(F);
1356 }
1357 }
1358 return true;
1359 }
1360
unmap_dst(alu_node * n)1361 bool post_scheduler::unmap_dst(alu_node *n) {
1362 value *d = n->dst.empty() ? NULL : n->dst[0];
1363
1364 if (!d)
1365 return true;
1366
1367 if (!d->is_rel()) {
1368 if (d && d->is_any_reg()) {
1369
1370 if (d->is_AR()) {
1371 if (alu.current_ar != d) {
1372 sblog << "loading wrong ar value\n";
1373 assert(0);
1374 } else {
1375 alu.current_ar = NULL;
1376 }
1377
1378 } else if (d->is_any_gpr()) {
1379 if (!unmap_dst_val(d))
1380 return false;
1381 }
1382 }
1383 } else {
1384 for (vvec::iterator I = d->mdef.begin(), E = d->mdef.end();
1385 I != E; ++I) {
1386 d = *I;
1387 if (!d)
1388 continue;
1389
1390 assert(d->is_any_gpr());
1391
1392 if (!unmap_dst_val(d))
1393 return false;
1394 }
1395 }
1396 return true;
1397 }
1398
map_src_val(value * v)1399 bool post_scheduler::map_src_val(value *v) {
1400
1401 if (!v->is_prealloc())
1402 return true;
1403
1404 sel_chan gpr = v->get_final_gpr();
1405 rv_map::iterator F = regmap.find(gpr);
1406 value *c = NULL;
1407 if (F != regmap.end()) {
1408 c = F->second;
1409 if (!v->v_equal(c)) {
1410 PSC_DUMP(
1411 sblog << "can't map src value ";
1412 dump::dump_val(v);
1413 sblog << ", regmap contains ";
1414 dump::dump_val(c);
1415 sblog << "\n";
1416 );
1417 return false;
1418 }
1419 } else {
1420 regmap.insert(std::make_pair(gpr, v));
1421 }
1422 return true;
1423 }
1424
map_src_vec(vvec & vv,bool src)1425 bool post_scheduler::map_src_vec(vvec &vv, bool src) {
1426 if (src) {
1427 // Handle possible UBO indexing
1428 bool ubo_indexing[2] = { false, false };
1429 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
1430 value *v = *I;
1431 if (!v)
1432 continue;
1433
1434 if (v->is_kcache()) {
1435 unsigned index_mode = v->select.kcache_index_mode();
1436 if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) {
1437 ubo_indexing[index_mode - KC_INDEX_0] = true;
1438 }
1439 }
1440 }
1441
1442 // idx values stored at end of src vec, see bc_parser::prepare_alu_group
1443 for (unsigned i = 2; i != 0; i--) {
1444 if (ubo_indexing[i-1]) {
1445 // TODO: skip adding value to kcache reservation somehow, causes
1446 // unnecessary group breaks and cache line locks
1447 value *v = vv.back();
1448 if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) {
1449 PSC_DUMP(
1450 sblog << "IDX" << i-1 << " already set to " <<
1451 *alu.current_idx[i-1] << ", trying to set " << *v << "\n";
1452 );
1453 return false;
1454 }
1455
1456 alu.current_idx[i-1] = v;
1457 PSC_DUMP(sblog << "IDX" << i-1 << " set to " << *v << "\n";);
1458 }
1459 }
1460 }
1461
1462 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
1463 value *v = *I;
1464 if (!v)
1465 continue;
1466
1467 if ((!v->is_any_gpr() || !v->is_fixed()) && !v->is_rel())
1468 continue;
1469
1470 if (v->is_rel()) {
1471 value *rel = v->rel;
1472 assert(rel);
1473
1474 if (!rel->is_const()) {
1475 if (!map_src_vec(v->muse, true))
1476 return false;
1477
1478 if (rel != alu.current_ar) {
1479 if (alu.current_ar) {
1480 PSC_DUMP(
1481 sblog << " current_AR is " << *alu.current_ar
1482 << " trying to use " << *rel << "\n";
1483 );
1484 return false;
1485 }
1486
1487 alu.current_ar = rel;
1488
1489 PSC_DUMP(
1490 sblog << " new current_AR assigned: " << *alu.current_ar
1491 << "\n";
1492 );
1493 }
1494 }
1495
1496 } else if (src) {
1497 if (!map_src_val(v)) {
1498 return false;
1499 }
1500 }
1501 }
1502 return true;
1503 }
1504
map_src(alu_node * n)1505 bool post_scheduler::map_src(alu_node *n) {
1506 if (!map_src_vec(n->dst, false))
1507 return false;
1508
1509 if (!map_src_vec(n->src, true))
1510 return false;
1511
1512 return true;
1513 }
1514
dump_regmap()1515 void post_scheduler::dump_regmap() {
1516
1517 sblog << "# REGMAP :\n";
1518
1519 for(rv_map::iterator I = regmap.begin(), E = regmap.end(); I != E; ++I) {
1520 sblog << " # " << I->first << " => " << *(I->second) << "\n";
1521 }
1522
1523 if (alu.current_ar)
1524 sblog << " current_AR: " << *alu.current_ar << "\n";
1525 if (alu.current_pr)
1526 sblog << " current_PR: " << *alu.current_pr << "\n";
1527 if (alu.current_idx[0])
1528 sblog << " current IDX0: " << *alu.current_idx[0] << "\n";
1529 if (alu.current_idx[1])
1530 sblog << " current IDX1: " << *alu.current_idx[1] << "\n";
1531 }
1532
recolor_locals()1533 void post_scheduler::recolor_locals() {
1534 alu_group_tracker &rt = alu.grp();
1535
1536 for (unsigned s = 0; s < ctx.num_slots; ++s) {
1537 alu_node *n = rt.slot(s);
1538 if (n) {
1539 value *d = n->dst[0];
1540 if (d && d->is_sgpr() && !d->is_prealloc()) {
1541 recolor_local(d);
1542 }
1543 }
1544 }
1545 }
1546
1547 // returns true if there are interferences
check_interferences()1548 bool post_scheduler::check_interferences() {
1549
1550 alu_group_tracker &rt = alu.grp();
1551
1552 unsigned interf_slots;
1553
1554 bool discarded = false;
1555
1556 PSC_DUMP(
1557 sblog << "check_interferences: before: \n";
1558 dump_regmap();
1559 );
1560
1561 do {
1562
1563 interf_slots = 0;
1564
1565 for (unsigned s = 0; s < ctx.num_slots; ++s) {
1566 alu_node *n = rt.slot(s);
1567 if (n) {
1568 if (!unmap_dst(n)) {
1569 return true;
1570 }
1571 }
1572 }
1573
1574 for (unsigned s = 0; s < ctx.num_slots; ++s) {
1575 alu_node *n = rt.slot(s);
1576 if (n) {
1577 if (!map_src(n)) {
1578 interf_slots |= (1 << s);
1579 }
1580 }
1581 }
1582
1583 PSC_DUMP(
1584 for (unsigned i = 0; i < 5; ++i) {
1585 if (interf_slots & (1 << i)) {
1586 sblog << "!!!!!! interf slot: " << i << " : ";
1587 dump::dump_op(rt.slot(i));
1588 sblog << "\n";
1589 }
1590 }
1591 );
1592
1593 if (!interf_slots)
1594 break;
1595
1596 PSC_DUMP( sblog << "ci: discarding slots " << interf_slots << "\n"; );
1597
1598 rt.discard_slots(interf_slots, alu.conflict_nodes);
1599 regmap = prev_regmap;
1600 discarded = true;
1601
1602 } while(1);
1603
1604 PSC_DUMP(
1605 sblog << "check_interferences: after: \n";
1606 dump_regmap();
1607 );
1608
1609 return discarded;
1610 }
1611
1612 // add instruction(s) (alu_node or contents of alu_packed_node) to current group
1613 // returns the number of added instructions on success
try_add_instruction(node * n)1614 unsigned post_scheduler::try_add_instruction(node *n) {
1615
1616 alu_group_tracker &rt = alu.grp();
1617
1618 unsigned avail_slots = rt.avail_slots();
1619
1620 // Cannot schedule in same clause as instructions using this index value
1621 if (!n->dst.empty() && n->dst[0] &&
1622 (n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) {
1623 PSC_DUMP(sblog << " CF_IDX source: " << *n->dst[0] << "\n";);
1624 return 0;
1625 }
1626
1627 if (n->is_alu_packed()) {
1628 alu_packed_node *p = static_cast<alu_packed_node*>(n);
1629 unsigned slots = p->get_slot_mask();
1630 unsigned cnt = __builtin_popcount(slots);
1631
1632 if ((slots & avail_slots) != slots) {
1633 PSC_DUMP( sblog << " no slots \n"; );
1634 return 0;
1635 }
1636
1637 p->update_packed_items(ctx);
1638
1639 if (!rt.try_reserve(p)) {
1640 PSC_DUMP( sblog << " reservation failed \n"; );
1641 return 0;
1642 }
1643
1644 p->remove();
1645 return cnt;
1646
1647 } else {
1648 alu_node *a = static_cast<alu_node*>(n);
1649 value *d = a->dst.empty() ? NULL : a->dst[0];
1650
1651 if (d && d->is_special_reg()) {
1652 assert((a->bc.op_ptr->flags & AF_MOVA) || d->is_geometry_emit());
1653 d = NULL;
1654 }
1655
1656 unsigned allowed_slots = ctx.alu_slots_mask(a->bc.op_ptr);
1657 unsigned slot;
1658
1659 allowed_slots &= avail_slots;
1660
1661 if (!allowed_slots)
1662 return 0;
1663
1664 if (d) {
1665 slot = d->get_final_chan();
1666 a->bc.dst_chan = slot;
1667 allowed_slots &= (1 << slot) | 0x10;
1668 } else {
1669 if (a->bc.op_ptr->flags & AF_MOVA) {
1670 if (a->bc.slot_flags & AF_V)
1671 allowed_slots &= (1 << SLOT_X);
1672 else
1673 allowed_slots &= (1 << SLOT_TRANS);
1674 }
1675 }
1676
1677 // FIXME workaround for some problems with MULADD in trans slot on r700,
1678 // (is it really needed on r600?)
1679 if ((a->bc.op == ALU_OP3_MULADD || a->bc.op == ALU_OP3_MULADD_IEEE) &&
1680 !ctx.is_egcm()) {
1681 allowed_slots &= 0x0F;
1682 }
1683
1684 if (!allowed_slots) {
1685 PSC_DUMP( sblog << " no suitable slots\n"; );
1686 return 0;
1687 }
1688
1689 slot = __builtin_ctz(allowed_slots);
1690 a->bc.slot = slot;
1691
1692 PSC_DUMP( sblog << "slot: " << slot << "\n"; );
1693
1694 if (!rt.try_reserve(a)) {
1695 PSC_DUMP( sblog << " reservation failed\n"; );
1696 return 0;
1697 }
1698
1699 a->remove();
1700 return 1;
1701 }
1702 }
1703
check_copy(node * n)1704 bool post_scheduler::check_copy(node *n) {
1705 if (!n->is_copy_mov())
1706 return false;
1707
1708 value *s = n->src[0];
1709 value *d = n->dst[0];
1710
1711 if (!s->is_sgpr() || !d->is_sgpr())
1712 return false;
1713
1714 if (!s->is_prealloc()) {
1715 recolor_local(s);
1716
1717 if (!s->chunk || s->chunk != d->chunk)
1718 return false;
1719 }
1720
1721 if (s->gpr == d->gpr) {
1722
1723 PSC_DUMP(
1724 sblog << "check_copy: ";
1725 dump::dump_op(n);
1726 sblog << "\n";
1727 );
1728
1729 rv_map::iterator F = regmap.find(d->gpr);
1730 bool gpr_free = (F == regmap.end());
1731
1732 if (d->is_prealloc()) {
1733 if (gpr_free) {
1734 PSC_DUMP( sblog << " copy not ready...\n";);
1735 return true;
1736 }
1737
1738 value *rv = F->second;
1739 if (rv != d && (!rv->chunk || rv->chunk != d->chunk)) {
1740 PSC_DUMP( sblog << " copy not ready(2)...\n";);
1741 return true;
1742 }
1743
1744 unmap_dst(static_cast<alu_node*>(n));
1745 }
1746
1747 if (s->is_prealloc() && !map_src_val(s))
1748 return true;
1749
1750 update_live(n, NULL);
1751
1752 release_src_values(n);
1753 n->remove();
1754 PSC_DUMP( sblog << " copy coalesced...\n";);
1755 return true;
1756 }
1757 return false;
1758 }
1759
dump_group(alu_group_tracker & rt)1760 void post_scheduler::dump_group(alu_group_tracker &rt) {
1761 for (unsigned i = 0; i < 5; ++i) {
1762 node *n = rt.slot(i);
1763 if (n) {
1764 sblog << "slot " << i << " : ";
1765 dump::dump_op(n);
1766 sblog << "\n";
1767 }
1768 }
1769 }
1770
process_ready_copies()1771 void post_scheduler::process_ready_copies() {
1772
1773 node *last;
1774
1775 do {
1776 last = ready_copies.back();
1777
1778 for (node_iterator N, I = ready_copies.begin(), E = ready_copies.end();
1779 I != E; I = N) {
1780 N = I; ++N;
1781
1782 node *n = *I;
1783
1784 if (!check_copy(n)) {
1785 n->remove();
1786 ready.push_back(n);
1787 }
1788 }
1789 } while (last != ready_copies.back());
1790
1791 update_local_interferences();
1792 }
1793
1794
prepare_alu_group()1795 bool post_scheduler::prepare_alu_group() {
1796
1797 alu_group_tracker &rt = alu.grp();
1798
1799 unsigned i1 = 0;
1800
1801 PSC_DUMP(
1802 sblog << "prepare_alu_group: starting...\n";
1803 dump_group(rt);
1804 );
1805
1806 ready.append_from(&alu.conflict_nodes);
1807
1808 // FIXME rework this loop
1809
1810 do {
1811
1812 process_ready_copies();
1813
1814 ++i1;
1815
1816 for (node_iterator N, I = ready.begin(), E = ready.end(); I != E;
1817 I = N) {
1818 N = I; ++N;
1819 node *n = *I;
1820
1821 PSC_DUMP(
1822 sblog << "p_a_g: ";
1823 dump::dump_op(n);
1824 sblog << "\n";
1825 );
1826
1827
1828 unsigned cnt = try_add_instruction(n);
1829
1830 if (!cnt)
1831 continue;
1832
1833 PSC_DUMP(
1834 sblog << "current group:\n";
1835 dump_group(rt);
1836 );
1837
1838 if (rt.inst_count() == ctx.num_slots) {
1839 PSC_DUMP( sblog << " all slots used\n"; );
1840 break;
1841 }
1842 }
1843
1844 if (!check_interferences())
1845 break;
1846
1847 // don't try to add more instructions to the group with mova if this
1848 // can lead to breaking clause slot count limit - we don't want mova to
1849 // end up in the end of the new clause instead of beginning of the
1850 // current clause.
1851 if (rt.has_ar_load() && alu.total_slots() > 121)
1852 break;
1853
1854 if (rt.inst_count() && i1 > 50)
1855 break;
1856
1857 regmap = prev_regmap;
1858
1859 } while (1);
1860
1861 PSC_DUMP(
1862 sblog << " prepare_alu_group done, " << rt.inst_count()
1863 << " slot(s) \n";
1864
1865 sblog << "$$$$$$$$PAG i1=" << i1
1866 << " ready " << ready.count()
1867 << " pending " << pending.count()
1868 << " conflicting " << alu.conflict_nodes.count()
1869 <<"\n";
1870
1871 );
1872
1873 return rt.inst_count();
1874 }
1875
release_src_values(node * n)1876 void post_scheduler::release_src_values(node* n) {
1877 release_src_vec(n->src, true);
1878 release_src_vec(n->dst, false);
1879 }
1880
release_op(node * n)1881 void post_scheduler::release_op(node *n) {
1882 PSC_DUMP(
1883 sblog << "release_op ";
1884 dump::dump_op(n);
1885 sblog << "\n";
1886 );
1887
1888 n->remove();
1889
1890 if (n->is_copy_mov()) {
1891 ready_copies.push_back(n);
1892 } else if (n->is_mova() || n->is_pred_set()) {
1893 ready.push_front(n);
1894 } else {
1895 ready.push_back(n);
1896 }
1897 }
1898
release_src_val(value * v)1899 void post_scheduler::release_src_val(value *v) {
1900 node *d = v->any_def();
1901 if (d) {
1902 if (!--ucm[d])
1903 release_op(d);
1904 }
1905 }
1906
release_src_vec(vvec & vv,bool src)1907 void post_scheduler::release_src_vec(vvec& vv, bool src) {
1908
1909 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
1910 value *v = *I;
1911 if (!v || v->is_readonly())
1912 continue;
1913
1914 if (v->is_rel()) {
1915 release_src_val(v->rel);
1916 release_src_vec(v->muse, true);
1917
1918 } else if (src) {
1919 release_src_val(v);
1920 }
1921 }
1922 }
1923
reset()1924 void literal_tracker::reset() {
1925 memset(lt, 0, sizeof(lt));
1926 memset(uc, 0, sizeof(uc));
1927 }
1928
reset()1929 void rp_gpr_tracker::reset() {
1930 memset(rp, 0, sizeof(rp));
1931 memset(uc, 0, sizeof(uc));
1932 }
1933
reset()1934 void rp_kcache_tracker::reset() {
1935 memset(rp, 0, sizeof(rp));
1936 memset(uc, 0, sizeof(uc));
1937 }
1938
reset()1939 void alu_kcache_tracker::reset() {
1940 memset(kc, 0, sizeof(kc));
1941 lines.clear();
1942 }
1943
reset()1944 void alu_clause_tracker::reset() {
1945 group = 0;
1946 slot_count = 0;
1947 grp0.reset();
1948 grp1.reset();
1949 }
1950
alu_clause_tracker(shader & sh)1951 alu_clause_tracker::alu_clause_tracker(shader &sh)
1952 : sh(sh), kt(sh.get_ctx().hw_class), slot_count(),
1953 grp0(sh), grp1(sh),
1954 group(), clause(),
1955 push_exec_mask(),
1956 current_ar(), current_pr(), current_idx() {}
1957
emit_group()1958 void alu_clause_tracker::emit_group() {
1959
1960 assert(grp().inst_count());
1961
1962 alu_group_node *g = grp().emit();
1963
1964 if (grp().has_update_exec_mask()) {
1965 assert(!push_exec_mask);
1966 push_exec_mask = true;
1967 }
1968
1969 assert(g);
1970
1971 if (!clause) {
1972 clause = sh.create_clause(NST_ALU_CLAUSE);
1973 }
1974
1975 clause->push_front(g);
1976
1977 slot_count += grp().slot_count();
1978
1979 new_group();
1980
1981 PSC_DUMP( sblog << " #### group emitted\n"; );
1982 }
1983
emit_clause(container_node * c)1984 void alu_clause_tracker::emit_clause(container_node *c) {
1985 assert(clause);
1986
1987 kt.init_clause(clause->bc);
1988
1989 assert(!current_ar);
1990 assert(!current_pr);
1991
1992 if (push_exec_mask)
1993 clause->bc.set_op(CF_OP_ALU_PUSH_BEFORE);
1994
1995 c->push_front(clause);
1996
1997 clause = NULL;
1998 push_exec_mask = false;
1999 slot_count = 0;
2000 kt.reset();
2001
2002 PSC_DUMP( sblog << "######### ALU clause emitted\n"; );
2003 }
2004
check_clause_limits()2005 bool alu_clause_tracker::check_clause_limits() {
2006
2007 alu_group_tracker > = grp();
2008
2009 unsigned slots = gt.slot_count();
2010
2011 // reserving slots to load AR and PR values
2012 unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0);
2013 // ...and index registers
2014 reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL);
2015
2016 if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots)
2017 return false;
2018
2019 if (!kt.try_reserve(gt))
2020 return false;
2021
2022 return true;
2023 }
2024
new_group()2025 void alu_clause_tracker::new_group() {
2026 group = !group;
2027 grp().reset();
2028 }
2029
is_empty()2030 bool alu_clause_tracker::is_empty() {
2031 return clause == NULL;
2032 }
2033
init_group_literals(alu_group_node * g)2034 void literal_tracker::init_group_literals(alu_group_node* g) {
2035
2036 g->literals.clear();
2037 for (unsigned i = 0; i < 4; ++i) {
2038 if (!lt[i])
2039 break;
2040
2041 g->literals.push_back(lt[i]);
2042
2043 PSC_DUMP(
2044 sblog << "literal emitted: " << lt[i].f;
2045 sblog.print_zw_hex(lt[i].u, 8);
2046 sblog << " " << lt[i].i << "\n";
2047 );
2048 }
2049 }
2050
try_reserve(alu_group_tracker & gt)2051 bool alu_kcache_tracker::try_reserve(alu_group_tracker& gt) {
2052 rp_kcache_tracker &kt = gt.kcache();
2053
2054 if (!kt.num_sels())
2055 return true;
2056
2057 sb_set<unsigned> group_lines;
2058
2059 unsigned nl = kt.get_lines(group_lines);
2060 assert(nl);
2061
2062 sb_set<unsigned> clause_lines(lines);
2063 lines.add_set(group_lines);
2064
2065 if (clause_lines.size() == lines.size())
2066 return true;
2067
2068 if (update_kc())
2069 return true;
2070
2071 lines = clause_lines;
2072
2073 return false;
2074 }
2075
get_lines(kc_lines & lines)2076 unsigned rp_kcache_tracker::get_lines(kc_lines& lines) {
2077 unsigned cnt = 0;
2078
2079 for (unsigned i = 0; i < sel_count; ++i) {
2080 unsigned line = rp[i] & 0x1fffffffu;
2081 unsigned index_mode = rp[i] >> 29;
2082
2083 if (!line)
2084 return cnt;
2085
2086 --line;
2087 line = (sel_count == 2) ? line >> 5 : line >> 6;
2088 line |= index_mode << 29;
2089
2090 if (lines.insert(line).second)
2091 ++cnt;
2092 }
2093 return cnt;
2094 }
2095
update_kc()2096 bool alu_kcache_tracker::update_kc() {
2097 unsigned c = 0;
2098
2099 bc_kcache old_kc[4];
2100 memcpy(old_kc, kc, sizeof(kc));
2101
2102 for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) {
2103 unsigned index_mode = *I >> 29;
2104 unsigned line = *I & 0x1fffffffu;
2105 unsigned bank = line >> 8;
2106
2107 assert(index_mode <= KC_INDEX_INVALID);
2108 line &= 0xFF;
2109
2110 if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) &&
2111 kc[c-1].index_mode == index_mode)
2112 {
2113 kc[c-1].mode = KC_LOCK_2;
2114 } else {
2115 if (c == max_kcs) {
2116 memcpy(kc, old_kc, sizeof(kc));
2117 return false;
2118 }
2119
2120 kc[c].mode = KC_LOCK_1;
2121
2122 kc[c].bank = bank;
2123 kc[c].addr = line;
2124 kc[c].index_mode = index_mode;
2125 ++c;
2126 }
2127 }
2128 return true;
2129 }
2130
create_ar_load(value * v,chan_select ar_channel)2131 alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) {
2132 alu_node *a = sh.create_alu();
2133
2134 if (sh.get_ctx().uses_mova_gpr) {
2135 a->bc.set_op(ALU_OP1_MOVA_GPR_INT);
2136 a->bc.slot = SLOT_TRANS;
2137 } else {
2138 a->bc.set_op(ALU_OP1_MOVA_INT);
2139 a->bc.slot = SLOT_X;
2140 }
2141 a->bc.dst_chan = ar_channel;
2142 if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) {
2143 a->bc.dst_gpr = ar_channel == SEL_Y ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
2144 }
2145
2146 a->dst.resize(1);
2147 a->src.push_back(v);
2148
2149 PSC_DUMP(
2150 sblog << "created AR load: ";
2151 dump::dump_op(a);
2152 sblog << "\n";
2153 );
2154
2155 return a;
2156 }
2157
discard_current_group()2158 void alu_clause_tracker::discard_current_group() {
2159 PSC_DUMP( sblog << "act::discard_current_group\n"; );
2160 grp().discard_all_slots(conflict_nodes);
2161 }
2162
dump()2163 void rp_gpr_tracker::dump() {
2164 sblog << "=== gpr_tracker dump:\n";
2165 for (int c = 0; c < 3; ++c) {
2166 sblog << "cycle " << c << " ";
2167 for (int h = 0; h < 4; ++h) {
2168 sblog << rp[c][h] << ":" << uc[c][h] << " ";
2169 }
2170 sblog << "\n";
2171 }
2172 }
2173
2174 } // namespace r600_sb
2175