1 /* 2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * on the rights to use, copy, modify, merge, publish, distribute, sub 8 * license, and/or sell copies of the Software, and to permit persons to whom 9 * the Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 21 * USE OR OTHER DEALINGS IN THE SOFTWARE. 22 * 23 * Authors: 24 * Vadim Girlin 25 */ 26 27 #ifndef SB_PASS_H_ 28 #define SB_PASS_H_ 29 30 #include <stack> 31 32 namespace r600_sb { 33 34 class pass { 35 protected: 36 sb_context &ctx; 37 shader &sh; 38 39 public: 40 pass(shader &s); 41 42 virtual int run(); 43 ~pass()44 virtual ~pass() {} 45 }; 46 47 class vpass : public pass { 48 49 public: 50 vpass(shader & s)51 vpass(shader &s) : pass(s) {} 52 53 virtual int init(); 54 virtual int done(); 55 56 virtual int run(); 57 virtual void run_on(container_node &n); 58 59 virtual bool visit(node &n, bool enter); 60 virtual bool visit(container_node &n, bool enter); 61 virtual bool visit(alu_group_node &n, bool enter); 62 virtual bool visit(cf_node &n, bool enter); 63 virtual bool visit(alu_node &n, bool enter); 64 virtual bool visit(alu_packed_node &n, bool enter); 65 virtual bool visit(fetch_node &n, bool enter); 66 virtual bool visit(region_node &n, bool enter); 67 virtual bool visit(repeat_node &n, bool enter); 68 virtual bool visit(depart_node &n, bool enter); 69 virtual bool visit(if_node &n, bool enter); 70 virtual bool visit(bb_node &n, bool enter); 71 72 }; 73 74 class rev_vpass : public vpass { 75 76 public: rev_vpass(shader & s)77 rev_vpass(shader &s) : vpass(s) {} 78 79 virtual void run_on(container_node &n); 80 }; 81 82 83 // =================== PASSES 84 85 class bytecode; 86 87 class bc_dump : public vpass { 88 using vpass::visit; 89 90 uint32_t *bc_data; 91 unsigned ndw; 92 93 unsigned id; 94 95 unsigned new_group, group_index; 96 97 public: 98 99 bc_dump(shader &s, bytecode *bc = NULL); 100 bc_dump(shader & s,uint32_t * bc_ptr,unsigned ndw)101 bc_dump(shader &s, uint32_t *bc_ptr, unsigned ndw) : 102 vpass(s), bc_data(bc_ptr), ndw(ndw), id(), new_group(), group_index() {} 103 104 virtual int init(); 105 virtual int done(); 106 107 virtual bool visit(cf_node &n, bool enter); 108 virtual bool visit(alu_node &n, bool enter); 109 virtual bool visit(fetch_node &n, bool enter); 110 111 void dump_dw(unsigned dw_id, unsigned count = 2); 112 113 void dump(cf_node& n); 114 void dump(alu_node& n); 115 void dump(fetch_node& n); 116 }; 117 118 119 class dce_cleanup : public vpass { 120 using vpass::visit; 121 122 bool remove_unused; 123 124 public: 125 dce_cleanup(shader & s)126 dce_cleanup(shader &s) : vpass(s), 127 remove_unused(s.dce_flags & DF_REMOVE_UNUSED), nodes_changed(false) {} 128 129 virtual int run(); 130 131 virtual bool visit(node &n, bool enter); 132 virtual bool visit(alu_group_node &n, bool enter); 133 virtual bool visit(cf_node &n, bool enter); 134 virtual bool visit(alu_node &n, bool enter); 135 virtual bool visit(alu_packed_node &n, bool enter); 136 virtual bool visit(fetch_node &n, bool enter); 137 virtual bool visit(region_node &n, bool enter); 138 virtual bool visit(container_node &n, bool enter); 139 140 private: 141 142 void cleanup_dst(node &n); 143 bool cleanup_dst_vec(vvec &vv); 144 145 // Did we alter/remove nodes during a single pass? 146 bool nodes_changed; 147 }; 148 149 150 class def_use : public pass { 151 152 public: 153 def_use(shader & sh)154 def_use(shader &sh) : pass(sh) {} 155 156 virtual int run(); 157 void run_on(node *n, bool defs); 158 159 private: 160 161 void process_uses(node *n); 162 void process_defs(node *n, vvec &vv, bool arr_def); 163 void process_phi(container_node *c, bool defs, bool uses); 164 }; 165 166 167 168 class dump : public vpass { 169 using vpass::visit; 170 171 int level; 172 173 public: 174 dump(shader & s)175 dump(shader &s) : vpass(s), level(0) {} 176 177 virtual bool visit(node &n, bool enter); 178 virtual bool visit(container_node &n, bool enter); 179 virtual bool visit(alu_group_node &n, bool enter); 180 virtual bool visit(cf_node &n, bool enter); 181 virtual bool visit(alu_node &n, bool enter); 182 virtual bool visit(alu_packed_node &n, bool enter); 183 virtual bool visit(fetch_node &n, bool enter); 184 virtual bool visit(region_node &n, bool enter); 185 virtual bool visit(repeat_node &n, bool enter); 186 virtual bool visit(depart_node &n, bool enter); 187 virtual bool visit(if_node &n, bool enter); 188 virtual bool visit(bb_node &n, bool enter); 189 190 191 static void dump_op(node &n, const char *name); 192 static void dump_vec(const vvec & vv); 193 static void dump_set(shader &sh, val_set & v); 194 195 static void dump_rels(vvec & vv); 196 197 static void dump_val(value *v); 198 static void dump_op(node *n); 199 200 static void dump_op_list(container_node *c); 201 static void dump_queue(sched_queue &q); 202 203 static void dump_alu(alu_node *n); 204 205 private: 206 207 void indent(); 208 209 void dump_common(node &n); 210 void dump_flags(node &n); 211 212 void dump_live_values(container_node &n, bool before); 213 }; 214 215 216 // Global Code Motion 217 218 class gcm : public pass { 219 220 sched_queue bu_ready[SQ_NUM]; 221 sched_queue bu_ready_next[SQ_NUM]; 222 sched_queue bu_ready_early[SQ_NUM]; 223 sched_queue ready; 224 sched_queue ready_above; 225 226 container_node pending; 227 228 struct op_info { 229 bb_node* top_bb; 230 bb_node* bottom_bb; op_infoop_info231 op_info() : top_bb(), bottom_bb() {} 232 }; 233 234 typedef std::map<node*, op_info> op_info_map; 235 236 typedef std::map<node*, unsigned> nuc_map; 237 238 op_info_map op_map; 239 nuc_map uses; 240 241 typedef std::vector<nuc_map> nuc_stack; 242 243 nuc_stack nuc_stk; 244 unsigned ucs_level; 245 246 bb_node * bu_bb; 247 248 vvec pending_defs; 249 250 node_list pending_nodes; 251 252 unsigned cur_sq; 253 254 // for register pressure tracking in bottom-up pass 255 val_set live; 256 int live_count; 257 258 static const int rp_threshold = 100; 259 260 bool pending_exec_mask_update; 261 262 public: 263 gcm(shader & sh)264 gcm(shader &sh) : pass(sh), 265 bu_ready(), bu_ready_next(), bu_ready_early(), 266 ready(), op_map(), uses(), nuc_stk(1), ucs_level(), 267 bu_bb(), pending_defs(), pending_nodes(), cur_sq(), 268 live(), live_count(), pending_exec_mask_update() {} 269 270 virtual int run(); 271 272 private: 273 274 void collect_instructions(container_node *c, bool early_pass); 275 276 void sched_early(container_node *n); 277 void td_sched_bb(bb_node *bb); 278 bool td_is_ready(node *n); 279 void td_release_uses(vvec &v); 280 void td_release_val(value *v); 281 void td_schedule(bb_node *bb, node *n); 282 283 void sched_late(container_node *n); 284 void bu_sched_bb(bb_node *bb); 285 void bu_release_defs(vvec &v, bool src); 286 void bu_release_phi_defs(container_node *p, unsigned op); 287 bool bu_is_ready(node *n); 288 void bu_release_val(value *v); 289 void bu_release_op(node * n); 290 void bu_find_best_bb(node *n, op_info &oi); 291 void bu_schedule(container_node *bb, node *n); 292 293 void push_uc_stack(); 294 void pop_uc_stack(); 295 296 void init_def_count(nuc_map &m, container_node &s); 297 void init_use_count(nuc_map &m, container_node &s); 298 unsigned get_uc_vec(vvec &vv); 299 unsigned get_dc_vec(vvec &vv, bool src); 300 301 void add_ready(node *n); 302 303 void dump_uc_stack(); 304 305 unsigned real_alu_count(sched_queue &q, unsigned max); 306 307 // check if we have not less than threshold ready alu instructions 308 bool check_alu_ready_count(unsigned threshold); 309 }; 310 311 312 class gvn : public vpass { 313 using vpass::visit; 314 315 public: 316 gvn(shader & sh)317 gvn(shader &sh) : vpass(sh) {} 318 319 virtual bool visit(node &n, bool enter); 320 virtual bool visit(cf_node &n, bool enter); 321 virtual bool visit(alu_node &n, bool enter); 322 virtual bool visit(alu_packed_node &n, bool enter); 323 virtual bool visit(fetch_node &n, bool enter); 324 virtual bool visit(region_node &n, bool enter); 325 326 private: 327 328 void process_op(node &n, bool rewrite = true); 329 330 // returns true if the value was rewritten 331 bool process_src(value* &v, bool rewrite); 332 333 334 void process_alu_src_constants(node &n, value* &v); 335 }; 336 337 338 class if_conversion : public pass { 339 340 public: 341 if_conversion(shader & sh)342 if_conversion(shader &sh) : pass(sh) {} 343 344 virtual int run(); 345 346 bool run_on(region_node *r); 347 348 void convert_kill_instructions(region_node *r, value *em, bool branch, 349 container_node *c); 350 351 bool check_and_convert(region_node *r); 352 353 alu_node* convert_phi(value *select, node *phi); 354 355 }; 356 357 358 class liveness : public rev_vpass { 359 using vpass::visit; 360 361 val_set live; 362 bool live_changed; 363 364 public: 365 liveness(shader & s)366 liveness(shader &s) : rev_vpass(s), live_changed(false) {} 367 368 virtual int init(); 369 370 virtual bool visit(node &n, bool enter); 371 virtual bool visit(bb_node &n, bool enter); 372 virtual bool visit(container_node &n, bool enter); 373 virtual bool visit(alu_group_node &n, bool enter); 374 virtual bool visit(cf_node &n, bool enter); 375 virtual bool visit(alu_node &n, bool enter); 376 virtual bool visit(alu_packed_node &n, bool enter); 377 virtual bool visit(fetch_node &n, bool enter); 378 virtual bool visit(region_node &n, bool enter); 379 virtual bool visit(repeat_node &n, bool enter); 380 virtual bool visit(depart_node &n, bool enter); 381 virtual bool visit(if_node &n, bool enter); 382 383 private: 384 385 void update_interferences(); 386 void process_op(node &n); 387 388 bool remove_val(value *v); 389 bool remove_vec(vvec &v); 390 bool process_outs(node& n); 391 void process_ins(node& n); 392 393 void process_phi_outs(container_node *phi); 394 void process_phi_branch(container_node *phi, unsigned id); 395 396 bool process_maydef(value *v); 397 398 bool add_vec(vvec &vv, bool src); 399 400 void update_src_vec(vvec &vv, bool src); 401 }; 402 403 404 struct bool_op_info { 405 bool invert; 406 unsigned int_cvt; 407 408 alu_node *n; 409 }; 410 411 class peephole : public pass { 412 413 public: 414 peephole(shader & sh)415 peephole(shader &sh) : pass(sh) {} 416 417 virtual int run(); 418 419 void run_on(container_node *c); 420 421 void optimize_cc_op(alu_node *a); 422 423 void optimize_cc_op2(alu_node *a); 424 void optimize_CNDcc_op(alu_node *a); 425 426 bool get_bool_op_info(value *b, bool_op_info& bop); 427 bool get_bool_flt_to_int_source(alu_node* &a); 428 void convert_float_setcc(alu_node *f2i, alu_node *s); 429 }; 430 431 432 class psi_ops : public rev_vpass { 433 using rev_vpass::visit; 434 435 public: 436 psi_ops(shader & s)437 psi_ops(shader &s) : rev_vpass(s) {} 438 439 virtual bool visit(node &n, bool enter); 440 virtual bool visit(alu_node &n, bool enter); 441 442 bool try_inline(node &n); 443 bool try_reduce(node &n); 444 bool eliminate(node &n); 445 446 void unpredicate(node *n); 447 }; 448 449 450 // check correctness of the generated code, e.g.: 451 // - expected source operand value is the last value written to its gpr, 452 // - all arguments of phi node should be allocated to the same gpr, 453 // TODO other tests 454 class ra_checker : public pass { 455 456 typedef std::map<sel_chan, value *> reg_value_map; 457 458 typedef std::vector<reg_value_map> regmap_stack; 459 460 regmap_stack rm_stack; 461 unsigned rm_stk_level; 462 463 value* prev_dst[5]; 464 465 public: 466 ra_checker(shader & sh)467 ra_checker(shader &sh) : pass(sh), rm_stk_level(0), prev_dst() {} 468 469 virtual int run(); 470 471 void run_on(container_node *c); 472 473 void dump_error(const error_info &e); 474 void dump_all_errors(); 475 476 private: 477 rmap()478 reg_value_map& rmap() { return rm_stack[rm_stk_level]; } 479 480 void push_stack(); 481 void pop_stack(); 482 483 // when going out of the alu clause, values in the clause temporary gprs, 484 // AR, predicate values, PS/PV are destroyed 485 void kill_alu_only_regs(); 486 void error(node *n, unsigned id, std::string msg); 487 488 void check_phi_src(container_node *p, unsigned id); 489 void process_phi_dst(container_node *p); 490 void check_alu_group(alu_group_node *g); 491 void process_op_dst(node *n); 492 void check_op_src(node *n); 493 void check_src_vec(node *n, unsigned id, vvec &vv, bool src); 494 void check_value_gpr(node *n, unsigned id, value *v); 495 }; 496 497 // ======================================= 498 499 500 class ra_coalesce : public pass { 501 502 public: 503 ra_coalesce(shader & sh)504 ra_coalesce(shader &sh) : pass(sh) {} 505 506 virtual int run(); 507 }; 508 509 510 511 // ======================================= 512 513 class ra_init : public pass { 514 515 public: 516 ra_init(shader & sh)517 ra_init(shader &sh) : pass(sh), prev_chans() { 518 519 // The parameter below affects register channels distribution. 520 // For cayman (VLIW-4) we're trying to distribute the channels 521 // uniformly, this means significantly better alu slots utilization 522 // at the expense of higher gpr usage. Hopefully this will improve 523 // performance, though it has to be proven with real benchmarks yet. 524 // For VLIW-5 this method could also slightly improve slots 525 // utilization, but increased register pressure seems more significant 526 // and overall performance effect is negative according to some 527 // benchmarks, so it's not used currently. Basically, VLIW-5 doesn't 528 // really need it because trans slot (unrestricted by register write 529 // channel) allows to consume most deviations from uniform channel 530 // distribution. 531 // Value 3 means that for new allocation we'll use channel that differs 532 // from 3 last used channels. 0 for VLIW-5 effectively turns this off. 533 534 ra_tune = sh.get_ctx().is_cayman() ? 3 : 0; 535 } 536 537 virtual int run(); 538 539 private: 540 541 unsigned prev_chans; 542 unsigned ra_tune; 543 544 void add_prev_chan(unsigned chan); 545 unsigned get_preferable_chan_mask(); 546 547 void ra_node(container_node *c); 548 void process_op(node *n); 549 550 void color(value *v); 551 552 void color_bs_constraint(ra_constraint *c); 553 554 void assign_color(value *v, sel_chan c); 555 void alloc_arrays(); 556 }; 557 558 // ======================================= 559 560 class ra_split : public pass { 561 562 public: 563 ra_split(shader & sh)564 ra_split(shader &sh) : pass(sh) {} 565 566 virtual int run(); 567 568 void split(container_node *n); 569 void split_op(node *n); 570 void split_alu_packed(alu_packed_node *n); 571 void split_vector_inst(node *n); 572 573 void split_packed_ins(alu_packed_node *n); 574 575 #if 0 576 void split_pinned_outs(node *n); 577 #endif 578 579 void split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz); 580 581 void split_phi_src(container_node *loc, container_node *c, unsigned id, 582 bool loop); 583 void split_phi_dst(node *loc, container_node *c, bool loop); 584 void init_phi_constraints(container_node *c); 585 }; 586 587 588 589 class ssa_prepare : public vpass { 590 using vpass::visit; 591 592 typedef std::vector<val_set> vd_stk; 593 vd_stk stk; 594 595 unsigned level; 596 597 public: ssa_prepare(shader & s)598 ssa_prepare(shader &s) : vpass(s), level(0) {} 599 600 virtual bool visit(cf_node &n, bool enter); 601 virtual bool visit(alu_node &n, bool enter); 602 virtual bool visit(fetch_node &n, bool enter); 603 virtual bool visit(region_node &n, bool enter); 604 virtual bool visit(repeat_node &n, bool enter); 605 virtual bool visit(depart_node &n, bool enter); 606 607 private: 608 push_stk()609 void push_stk() { 610 ++level; 611 if (level + 1 > stk.size()) 612 stk.resize(level+1); 613 else 614 stk[level].clear(); 615 } pop_stk()616 void pop_stk() { 617 assert(level); 618 --level; 619 stk[level].add_set(stk[level + 1]); 620 } 621 622 void add_defs(node &n); 623 cur_set()624 val_set & cur_set() { return stk[level]; } 625 626 container_node* create_phi_nodes(int count); 627 }; 628 629 class ssa_rename : public vpass { 630 using vpass::visit; 631 632 typedef sb_map<value*, unsigned> def_map; 633 634 def_map def_count; 635 std::stack<def_map> rename_stack; 636 637 typedef std::map<uint32_t, value*> val_map; 638 val_map values; 639 640 public: 641 ssa_rename(shader & s)642 ssa_rename(shader &s) : vpass(s) {} 643 644 virtual int init(); 645 646 virtual bool visit(container_node &n, bool enter); 647 virtual bool visit(node &n, bool enter); 648 virtual bool visit(alu_group_node &n, bool enter); 649 virtual bool visit(cf_node &n, bool enter); 650 virtual bool visit(alu_node &n, bool enter); 651 virtual bool visit(alu_packed_node &n, bool enter); 652 virtual bool visit(fetch_node &n, bool enter); 653 virtual bool visit(region_node &n, bool enter); 654 virtual bool visit(repeat_node &n, bool enter); 655 virtual bool visit(depart_node &n, bool enter); 656 virtual bool visit(if_node &n, bool enter); 657 658 private: 659 660 void push(node *phi); 661 void pop(); 662 663 unsigned get_index(def_map& m, value* v); 664 void set_index(def_map& m, value* v, unsigned index); 665 unsigned new_index(def_map& m, value* v); 666 667 value* rename_use(node *n, value* v); 668 value* rename_def(node *def, value* v); 669 670 void rename_src_vec(node *n, vvec &vv, bool src); 671 void rename_dst_vec(node *def, vvec &vv, bool set_def); 672 673 void rename_src(node *n); 674 void rename_dst(node *n); 675 676 void rename_phi_args(container_node *phi, unsigned op, bool def); 677 678 void rename_virt(node *n); 679 void rename_virt_val(node *n, value *v); 680 }; 681 682 class bc_finalizer : public pass { 683 684 cf_node *last_export[EXP_TYPE_COUNT]; 685 cf_node *last_cf; 686 687 unsigned ngpr; 688 unsigned nstack; 689 690 public: 691 bc_finalizer(shader & sh)692 bc_finalizer(shader &sh) : pass(sh), last_export(), last_cf(), ngpr(), 693 nstack() {} 694 695 virtual int run(); 696 697 void finalize_loop(region_node *r); 698 void finalize_if(region_node *r); 699 700 void run_on(container_node *c); 701 702 void insert_rv6xx_load_ar_workaround(alu_group_node *b4); 703 void finalize_alu_group(alu_group_node *g, node *prev_node); 704 bool finalize_alu_src(alu_group_node *g, alu_node *a, alu_group_node *prev_node); 705 706 void emit_set_grad(fetch_node* f); 707 void finalize_fetch(fetch_node *f); 708 709 void finalize_cf(cf_node *c); 710 711 sel_chan translate_kcache(cf_node *alu, value *v); 712 713 void update_ngpr(unsigned gpr); 714 void update_nstack(region_node *r, unsigned add = 0); 715 716 unsigned get_stack_depth(node *n, unsigned &loops, unsigned &ifs, 717 unsigned add = 0); 718 719 void cf_peephole(); 720 721 private: 722 void copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg_start); 723 void emit_set_texture_offsets(fetch_node &f); 724 }; 725 726 727 } // namespace r600_sb 728 729 #endif /* SB_PASS_H_ */ 730