1 /* -*- c++ -*- */ 2 /* 3 * Copyright © 2010-2015 Intel Corporation 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25 #ifndef BRW_FS_BUILDER_H 26 #define BRW_FS_BUILDER_H 27 28 #include "brw_ir_fs.h" 29 #include "brw_shader.h" 30 31 namespace brw { 32 /** 33 * Toolbox to assemble an FS IR program out of individual instructions. 34 * 35 * This object is meant to have an interface consistent with 36 * brw::vec4_builder. They cannot be fully interchangeable because 37 * brw::fs_builder generates scalar code while brw::vec4_builder generates 38 * vector code. 39 */ 40 class fs_builder { 41 public: 42 /** Type used in this IR to represent a source of an instruction. */ 43 typedef fs_reg src_reg; 44 45 /** Type used in this IR to represent the destination of an instruction. */ 46 typedef fs_reg dst_reg; 47 48 /** Type used in this IR to represent an instruction. */ 49 typedef fs_inst instruction; 50 51 /** 52 * Construct an fs_builder that inserts instructions into \p shader. 53 * \p dispatch_width gives the native execution width of the program. 54 */ fs_builder(backend_shader * shader,unsigned dispatch_width)55 fs_builder(backend_shader *shader, 56 unsigned dispatch_width) : 57 shader(shader), block(NULL), cursor(NULL), 58 _dispatch_width(dispatch_width), 59 _group(0), 60 force_writemask_all(false), 61 annotation() 62 { 63 } 64 65 /** 66 * Construct an fs_builder that inserts instructions into \p shader 67 * before instruction \p inst in basic block \p block. The default 68 * execution controls and debug annotation are initialized from the 69 * instruction passed as argument. 70 */ fs_builder(backend_shader * shader,bblock_t * block,fs_inst * inst)71 fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) : 72 shader(shader), block(block), cursor(inst), 73 _dispatch_width(inst->exec_size), 74 _group(inst->group), 75 force_writemask_all(inst->force_writemask_all) 76 { 77 annotation.str = inst->annotation; 78 annotation.ir = inst->ir; 79 } 80 81 /** 82 * Construct an fs_builder that inserts instructions before \p cursor in 83 * basic block \p block, inheriting other code generation parameters 84 * from this. 85 */ 86 fs_builder at(bblock_t * block,exec_node * cursor)87 at(bblock_t *block, exec_node *cursor) const 88 { 89 fs_builder bld = *this; 90 bld.block = block; 91 bld.cursor = cursor; 92 return bld; 93 } 94 95 /** 96 * Construct an fs_builder appending instructions at the end of the 97 * instruction list of the shader, inheriting other code generation 98 * parameters from this. 99 */ 100 fs_builder at_end()101 at_end() const 102 { 103 return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); 104 } 105 106 /** 107 * Construct a builder specifying the default SIMD width and group of 108 * channel enable signals, inheriting other code generation parameters 109 * from this. 110 * 111 * \p n gives the default SIMD width, \p i gives the slot group used for 112 * predication and control flow masking in multiples of \p n channels. 113 */ 114 fs_builder group(unsigned n,unsigned i)115 group(unsigned n, unsigned i) const 116 { 117 fs_builder bld = *this; 118 119 if (n <= dispatch_width() && i < dispatch_width() / n) { 120 bld._group += i * n; 121 } else { 122 /* The requested channel group isn't a subset of the channel group 123 * of this builder, which means that the resulting instructions 124 * would use (potentially undefined) channel enable signals not 125 * specified by the parent builder. That's only valid if the 126 * instruction doesn't have per-channel semantics, in which case 127 * we should clear off the default group index in order to prevent 128 * emitting instructions with channel group not aligned to their 129 * own execution size. 130 */ 131 assert(force_writemask_all); 132 bld._group = 0; 133 } 134 135 bld._dispatch_width = n; 136 return bld; 137 } 138 139 /** 140 * Alias for group() with width equal to eight. 141 */ 142 fs_builder quarter(unsigned i)143 quarter(unsigned i) const 144 { 145 return group(8, i); 146 } 147 148 /** 149 * Construct a builder with per-channel control flow execution masking 150 * disabled if \p b is true. If control flow execution masking is 151 * already disabled this has no effect. 152 */ 153 fs_builder 154 exec_all(bool b = true) const 155 { 156 fs_builder bld = *this; 157 if (b) 158 bld.force_writemask_all = true; 159 return bld; 160 } 161 162 /** 163 * Construct a builder with the given debug annotation info. 164 */ 165 fs_builder 166 annotate(const char *str, const void *ir = NULL) const 167 { 168 fs_builder bld = *this; 169 bld.annotation.str = str; 170 bld.annotation.ir = ir; 171 return bld; 172 } 173 174 /** 175 * Get the SIMD width in use. 176 */ 177 unsigned dispatch_width()178 dispatch_width() const 179 { 180 return _dispatch_width; 181 } 182 183 /** 184 * Get the channel group in use. 185 */ 186 unsigned group()187 group() const 188 { 189 return _group; 190 } 191 192 /** 193 * Allocate a virtual register of natural vector size (one for this IR) 194 * and SIMD width. \p n gives the amount of space to allocate in 195 * dispatch_width units (which is just enough space for one logical 196 * component in this IR). 197 */ 198 dst_reg 199 vgrf(enum brw_reg_type type, unsigned n = 1) const 200 { 201 assert(dispatch_width() <= 32); 202 203 if (n > 0) 204 return dst_reg(VGRF, shader->alloc.allocate( 205 DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), 206 REG_SIZE)), 207 type); 208 else 209 return retype(null_reg_ud(), type); 210 } 211 212 /** 213 * Create a null register of floating type. 214 */ 215 dst_reg null_reg_f()216 null_reg_f() const 217 { 218 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F)); 219 } 220 221 dst_reg null_reg_df()222 null_reg_df() const 223 { 224 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF)); 225 } 226 227 /** 228 * Create a null register of signed integer type. 229 */ 230 dst_reg null_reg_d()231 null_reg_d() const 232 { 233 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 234 } 235 236 /** 237 * Create a null register of unsigned integer type. 238 */ 239 dst_reg null_reg_ud()240 null_reg_ud() const 241 { 242 return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); 243 } 244 245 /** 246 * Insert an instruction into the program. 247 */ 248 instruction * emit(const instruction & inst)249 emit(const instruction &inst) const 250 { 251 return emit(new(shader->mem_ctx) instruction(inst)); 252 } 253 254 /** 255 * Create and insert a nullary control instruction into the program. 256 */ 257 instruction * emit(enum opcode opcode)258 emit(enum opcode opcode) const 259 { 260 return emit(instruction(opcode, dispatch_width())); 261 } 262 263 /** 264 * Create and insert a nullary instruction into the program. 265 */ 266 instruction * emit(enum opcode opcode,const dst_reg & dst)267 emit(enum opcode opcode, const dst_reg &dst) const 268 { 269 return emit(instruction(opcode, dispatch_width(), dst)); 270 } 271 272 /** 273 * Create and insert a unary instruction into the program. 274 */ 275 instruction * emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)276 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const 277 { 278 switch (opcode) { 279 case SHADER_OPCODE_RCP: 280 case SHADER_OPCODE_RSQ: 281 case SHADER_OPCODE_SQRT: 282 case SHADER_OPCODE_EXP2: 283 case SHADER_OPCODE_LOG2: 284 case SHADER_OPCODE_SIN: 285 case SHADER_OPCODE_COS: 286 return emit(instruction(opcode, dispatch_width(), dst, 287 fix_math_operand(src0))); 288 289 default: 290 return emit(instruction(opcode, dispatch_width(), dst, src0)); 291 } 292 } 293 294 /** 295 * Create and insert a binary instruction into the program. 296 */ 297 instruction * emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)298 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 299 const src_reg &src1) const 300 { 301 switch (opcode) { 302 case SHADER_OPCODE_POW: 303 case SHADER_OPCODE_INT_QUOTIENT: 304 case SHADER_OPCODE_INT_REMAINDER: 305 return emit(instruction(opcode, dispatch_width(), dst, 306 fix_math_operand(src0), 307 fix_math_operand(src1))); 308 309 default: 310 return emit(instruction(opcode, dispatch_width(), dst, 311 src0, src1)); 312 313 } 314 } 315 316 /** 317 * Create and insert a ternary instruction into the program. 318 */ 319 instruction * emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)320 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 321 const src_reg &src1, const src_reg &src2) const 322 { 323 switch (opcode) { 324 case BRW_OPCODE_BFE: 325 case BRW_OPCODE_BFI2: 326 case BRW_OPCODE_MAD: 327 case BRW_OPCODE_LRP: 328 return emit(instruction(opcode, dispatch_width(), dst, 329 fix_3src_operand(src0), 330 fix_3src_operand(src1), 331 fix_3src_operand(src2))); 332 333 default: 334 return emit(instruction(opcode, dispatch_width(), dst, 335 src0, src1, src2)); 336 } 337 } 338 339 /** 340 * Create and insert an instruction with a variable number of sources 341 * into the program. 342 */ 343 instruction * emit(enum opcode opcode,const dst_reg & dst,const src_reg srcs[],unsigned n)344 emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[], 345 unsigned n) const 346 { 347 /* Use the emit() methods for specific operand counts to ensure that 348 * opcode-specific operand fixups occur. 349 */ 350 if (n == 2) { 351 return emit(opcode, dst, srcs[0], srcs[1]); 352 } else if (n == 3) { 353 return emit(opcode, dst, srcs[0], srcs[1], srcs[2]); 354 } else { 355 return emit(instruction(opcode, dispatch_width(), dst, srcs, n)); 356 } 357 } 358 359 /** 360 * Insert a preallocated instruction into the program. 361 */ 362 instruction * emit(instruction * inst)363 emit(instruction *inst) const 364 { 365 assert(inst->exec_size <= 32); 366 assert(inst->exec_size == dispatch_width() || 367 force_writemask_all); 368 369 inst->group = _group; 370 inst->force_writemask_all = force_writemask_all; 371 inst->annotation = annotation.str; 372 inst->ir = annotation.ir; 373 374 if (block) 375 static_cast<instruction *>(cursor)->insert_before(block, inst); 376 else 377 cursor->insert_before(inst); 378 379 return inst; 380 } 381 382 /** 383 * Select \p src0 if the comparison of both sources with the given 384 * conditional mod evaluates to true, otherwise select \p src1. 385 * 386 * Generally useful to get the minimum or maximum of two values. 387 */ 388 instruction * emit_minmax(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod mod)389 emit_minmax(const dst_reg &dst, const src_reg &src0, 390 const src_reg &src1, brw_conditional_mod mod) const 391 { 392 assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); 393 394 /* In some cases we can't have bytes as operand for src1, so use the 395 * same type for both operand. 396 */ 397 return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), 398 fix_unsigned_negate(src1))); 399 } 400 401 /** 402 * Copy any live channel from \p src to the first channel of the result. 403 */ 404 src_reg emit_uniformize(const src_reg & src)405 emit_uniformize(const src_reg &src) const 406 { 407 /* FIXME: We use a vector chan_index and dst to allow constant and 408 * copy propagration to move result all the way into the consuming 409 * instruction (typically a surface index or sampler index for a 410 * send). This uses 1 or 3 extra hw registers in 16 or 32 wide 411 * dispatch. Once we teach const/copy propagation about scalars we 412 * should go back to scalar destinations here. 413 */ 414 const fs_builder ubld = exec_all(); 415 const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD); 416 const dst_reg dst = vgrf(src.type); 417 418 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); 419 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0)); 420 421 return src_reg(component(dst, 0)); 422 } 423 424 src_reg move_to_vgrf(const src_reg & src,unsigned num_components)425 move_to_vgrf(const src_reg &src, unsigned num_components) const 426 { 427 src_reg *const src_comps = new src_reg[num_components]; 428 for (unsigned i = 0; i < num_components; i++) 429 src_comps[i] = offset(src, dispatch_width(), i); 430 431 const dst_reg dst = vgrf(src.type, num_components); 432 LOAD_PAYLOAD(dst, src_comps, num_components, 0); 433 434 delete[] src_comps; 435 436 return src_reg(dst); 437 } 438 439 void emit_scan_step(enum opcode opcode,brw_conditional_mod mod,const dst_reg & tmp,unsigned left_offset,unsigned left_stride,unsigned right_offset,unsigned right_stride)440 emit_scan_step(enum opcode opcode, brw_conditional_mod mod, 441 const dst_reg &tmp, 442 unsigned left_offset, unsigned left_stride, 443 unsigned right_offset, unsigned right_stride) const 444 { 445 dst_reg left, right; 446 left = horiz_stride(horiz_offset(tmp, left_offset), left_stride); 447 right = horiz_stride(horiz_offset(tmp, right_offset), right_stride); 448 if ((tmp.type == BRW_REGISTER_TYPE_Q || 449 tmp.type == BRW_REGISTER_TYPE_UQ) && 450 !shader->devinfo->has_64bit_int) { 451 switch (opcode) { 452 case BRW_OPCODE_MUL: 453 /* This will get lowered by integer MUL lowering */ 454 set_condmod(mod, emit(opcode, right, left, right)); 455 break; 456 457 case BRW_OPCODE_SEL: { 458 /* In order for the comparisons to work out right, we need our 459 * comparisons to be strict. 460 */ 461 assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE); 462 if (mod == BRW_CONDITIONAL_GE) 463 mod = BRW_CONDITIONAL_G; 464 465 /* We treat the bottom 32 bits as unsigned regardless of 466 * whether or not the integer as a whole is signed. 467 */ 468 dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0); 469 dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0); 470 471 /* The upper bits get the same sign as the 64-bit type */ 472 brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type); 473 dst_reg right_high = subscript(right, type32, 1); 474 dst_reg left_high = subscript(left, type32, 1); 475 476 /* Build up our comparison: 477 * 478 * l_hi < r_hi || (l_hi == r_hi && l_low < r_low) 479 */ 480 CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD), 481 retype(right_low, BRW_REGISTER_TYPE_UD), mod); 482 set_predicate(BRW_PREDICATE_NORMAL, 483 CMP(null_reg_ud(), left_high, right_high, 484 BRW_CONDITIONAL_EQ)); 485 set_predicate_inv(BRW_PREDICATE_NORMAL, true, 486 CMP(null_reg_ud(), left_high, right_high, mod)); 487 488 /* We could use selects here or we could use predicated MOVs 489 * because the destination and second source (if it were a SEL) 490 * are the same. 491 */ 492 set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low)); 493 set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high)); 494 break; 495 } 496 497 default: 498 unreachable("Unsupported 64-bit scan op"); 499 } 500 } else { 501 set_condmod(mod, emit(opcode, right, left, right)); 502 } 503 } 504 505 void emit_scan(enum opcode opcode,const dst_reg & tmp,unsigned cluster_size,brw_conditional_mod mod)506 emit_scan(enum opcode opcode, const dst_reg &tmp, 507 unsigned cluster_size, brw_conditional_mod mod) const 508 { 509 assert(dispatch_width() >= 8); 510 511 /* The instruction splitting code isn't advanced enough to split 512 * these so we need to handle that ourselves. 513 */ 514 if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) { 515 const unsigned half_width = dispatch_width() / 2; 516 const fs_builder ubld = exec_all().group(half_width, 0); 517 dst_reg left = tmp; 518 dst_reg right = horiz_offset(tmp, half_width); 519 ubld.emit_scan(opcode, left, cluster_size, mod); 520 ubld.emit_scan(opcode, right, cluster_size, mod); 521 if (cluster_size > half_width) { 522 ubld.emit_scan_step(opcode, mod, tmp, 523 half_width - 1, 0, half_width, 1); 524 } 525 return; 526 } 527 528 if (cluster_size > 1) { 529 const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0); 530 ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2); 531 } 532 533 if (cluster_size > 2) { 534 if (type_sz(tmp.type) <= 4) { 535 const fs_builder ubld = 536 exec_all().group(dispatch_width() / 4, 0); 537 ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4); 538 ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4); 539 } else { 540 /* For 64-bit types, we have to do things differently because 541 * the code above would land us with destination strides that 542 * the hardware can't handle. Fortunately, we'll only be 543 * 8-wide in that case and it's the same number of 544 * instructions. 545 */ 546 const fs_builder ubld = exec_all().group(2, 0); 547 for (unsigned i = 0; i < dispatch_width(); i += 4) 548 ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1); 549 } 550 } 551 552 for (unsigned i = 4; 553 i < MIN2(cluster_size, dispatch_width()); 554 i *= 2) { 555 const fs_builder ubld = exec_all().group(i, 0); 556 ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1); 557 558 if (dispatch_width() > i * 2) 559 ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1); 560 561 if (dispatch_width() > i * 4) { 562 ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1); 563 ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1); 564 } 565 } 566 } 567 568 /** 569 * Assorted arithmetic ops. 570 * @{ 571 */ 572 #define ALU1(op) \ 573 instruction * \ 574 op(const dst_reg &dst, const src_reg &src0) const \ 575 { \ 576 return emit(BRW_OPCODE_##op, dst, src0); \ 577 } 578 579 #define ALU2(op) \ 580 instruction * \ 581 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 582 { \ 583 return emit(BRW_OPCODE_##op, dst, src0, src1); \ 584 } 585 586 #define ALU2_ACC(op) \ 587 instruction * \ 588 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 589 { \ 590 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ 591 inst->writes_accumulator = true; \ 592 return inst; \ 593 } 594 595 #define ALU3(op) \ 596 instruction * \ 597 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ 598 const src_reg &src2) const \ 599 { \ 600 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ 601 } 602 603 ALU2(ADD) ALU3(ADD3)604 ALU3(ADD3) 605 ALU2_ACC(ADDC) 606 ALU2(AND) 607 ALU2(ASR) 608 ALU2(AVG) 609 ALU3(BFE) 610 ALU2(BFI1) 611 ALU3(BFI2) 612 ALU1(BFREV) 613 ALU1(CBIT) 614 ALU1(DIM) 615 ALU2(DP2) 616 ALU2(DP3) 617 ALU2(DP4) 618 ALU2(DPH) 619 ALU1(F16TO32) 620 ALU1(F32TO16) 621 ALU1(FBH) 622 ALU1(FBL) 623 ALU1(FRC) 624 ALU3(DP4A) 625 ALU2(LINE) 626 ALU1(LZD) 627 ALU2(MAC) 628 ALU2_ACC(MACH) 629 ALU3(MAD) 630 ALU1(MOV) 631 ALU2(MUL) 632 ALU1(NOT) 633 ALU2(OR) 634 ALU2(PLN) 635 ALU1(RNDD) 636 ALU1(RNDE) 637 ALU1(RNDU) 638 ALU1(RNDZ) 639 ALU2(ROL) 640 ALU2(ROR) 641 ALU2(SAD2) 642 ALU2_ACC(SADA2) 643 ALU2(SEL) 644 ALU2(SHL) 645 ALU2(SHR) 646 ALU2_ACC(SUBB) 647 ALU2(XOR) 648 649 #undef ALU3 650 #undef ALU2_ACC 651 #undef ALU2 652 #undef ALU1 653 /** @} */ 654 655 /** 656 * CMP: Sets the low bit of the destination channels with the result 657 * of the comparison, while the upper bits are undefined, and updates 658 * the flag register with the packed 16 bits of the result. 659 */ 660 instruction * 661 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 662 brw_conditional_mod condition) const 663 { 664 /* Take the instruction: 665 * 666 * CMP null<d> src0<f> src1<f> 667 * 668 * Original gfx4 does type conversion to the destination type 669 * before comparison, producing garbage results for floating 670 * point comparisons. 671 * 672 * The destination type doesn't matter on newer generations, 673 * so we set the type to match src0 so we can compact the 674 * instruction. 675 */ 676 return set_condmod(condition, 677 emit(BRW_OPCODE_CMP, retype(dst, src0.type), 678 fix_unsigned_negate(src0), 679 fix_unsigned_negate(src1))); 680 } 681 682 /** 683 * CMPN: Behaves like CMP, but produces true if src1 is NaN. 684 */ 685 instruction * CMPN(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod condition)686 CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 687 brw_conditional_mod condition) const 688 { 689 /* Take the instruction: 690 * 691 * CMP null<d> src0<f> src1<f> 692 * 693 * Original gfx4 does type conversion to the destination type 694 * before comparison, producing garbage results for floating 695 * point comparisons. 696 * 697 * The destination type doesn't matter on newer generations, 698 * so we set the type to match src0 so we can compact the 699 * instruction. 700 */ 701 return set_condmod(condition, 702 emit(BRW_OPCODE_CMPN, retype(dst, src0.type), 703 fix_unsigned_negate(src0), 704 fix_unsigned_negate(src1))); 705 } 706 707 /** 708 * Gfx4 predicated IF. 709 */ 710 instruction * IF(brw_predicate predicate)711 IF(brw_predicate predicate) const 712 { 713 return set_predicate(predicate, emit(BRW_OPCODE_IF)); 714 } 715 716 /** 717 * CSEL: dst = src2 <op> 0.0f ? src0 : src1 718 */ 719 instruction * CSEL(const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2,brw_conditional_mod condition)720 CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 721 const src_reg &src2, brw_conditional_mod condition) const 722 { 723 /* CSEL only operates on floats, so we can't do integer </<=/>=/> 724 * comparisons. Zero/non-zero (== and !=) comparisons almost work. 725 * 0x80000000 fails because it is -0.0, and -0.0 == 0.0. 726 */ 727 assert(src2.type == BRW_REGISTER_TYPE_F); 728 729 return set_condmod(condition, 730 emit(BRW_OPCODE_CSEL, 731 retype(dst, BRW_REGISTER_TYPE_F), 732 retype(src0, BRW_REGISTER_TYPE_F), 733 retype(src1, BRW_REGISTER_TYPE_F), 734 src2)); 735 } 736 737 /** 738 * Emit a linear interpolation instruction. 739 */ 740 instruction * LRP(const dst_reg & dst,const src_reg & x,const src_reg & y,const src_reg & a)741 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, 742 const src_reg &a) const 743 { 744 if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) { 745 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so 746 * we need to reorder the operands. 747 */ 748 return emit(BRW_OPCODE_LRP, dst, a, y, x); 749 750 } else { 751 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */ 752 const dst_reg y_times_a = vgrf(dst.type); 753 const dst_reg one_minus_a = vgrf(dst.type); 754 const dst_reg x_times_one_minus_a = vgrf(dst.type); 755 756 MUL(y_times_a, y, a); 757 ADD(one_minus_a, negate(a), brw_imm_f(1.0f)); 758 MUL(x_times_one_minus_a, x, src_reg(one_minus_a)); 759 return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)); 760 } 761 } 762 763 /** 764 * Collect a number of registers in a contiguous range of registers. 765 */ 766 instruction * LOAD_PAYLOAD(const dst_reg & dst,const src_reg * src,unsigned sources,unsigned header_size)767 LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src, 768 unsigned sources, unsigned header_size) const 769 { 770 instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); 771 inst->header_size = header_size; 772 inst->size_written = header_size * REG_SIZE; 773 for (unsigned i = header_size; i < sources; i++) { 774 inst->size_written += dispatch_width() * type_sz(src[i].type) * 775 dst.stride; 776 } 777 778 return inst; 779 } 780 781 instruction * UNDEF(const dst_reg & dst)782 UNDEF(const dst_reg &dst) const 783 { 784 assert(dst.file == VGRF); 785 instruction *inst = emit(SHADER_OPCODE_UNDEF, 786 retype(dst, BRW_REGISTER_TYPE_UD)); 787 inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE; 788 789 return inst; 790 } 791 792 backend_shader *shader; 793 794 private: 795 /** 796 * Workaround for negation of UD registers. See comment in 797 * fs_generator::generate_code() for more details. 798 */ 799 src_reg fix_unsigned_negate(const src_reg & src)800 fix_unsigned_negate(const src_reg &src) const 801 { 802 if (src.type == BRW_REGISTER_TYPE_UD && 803 src.negate) { 804 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD); 805 MOV(temp, src); 806 return src_reg(temp); 807 } else { 808 return src; 809 } 810 } 811 812 /** 813 * Workaround for source register modes not supported by the ternary 814 * instruction encoding. 815 */ 816 src_reg fix_3src_operand(const src_reg & src)817 fix_3src_operand(const src_reg &src) const 818 { 819 switch (src.file) { 820 case FIXED_GRF: 821 /* FINISHME: Could handle scalar region, other stride=1 regions */ 822 if (src.vstride != BRW_VERTICAL_STRIDE_8 || 823 src.width != BRW_WIDTH_8 || 824 src.hstride != BRW_HORIZONTAL_STRIDE_1) 825 break; 826 FALLTHROUGH; 827 case ATTR: 828 case VGRF: 829 case UNIFORM: 830 case IMM: 831 return src; 832 default: 833 break; 834 } 835 836 dst_reg expanded = vgrf(src.type); 837 MOV(expanded, src); 838 return expanded; 839 } 840 841 /** 842 * Workaround for source register modes not supported by the math 843 * instruction. 844 */ 845 src_reg fix_math_operand(const src_reg & src)846 fix_math_operand(const src_reg &src) const 847 { 848 /* Can't do hstride == 0 args on gfx6 math, so expand it out. We 849 * might be able to do better by doing execsize = 1 math and then 850 * expanding that result out, but we would need to be careful with 851 * masking. 852 * 853 * Gfx6 hardware ignores source modifiers (negate and abs) on math 854 * instructions, so we also move to a temp to set those up. 855 * 856 * Gfx7 relaxes most of the above restrictions, but still can't use IMM 857 * operands to math 858 */ 859 if ((shader->devinfo->ver == 6 && 860 (src.file == IMM || src.file == UNIFORM || 861 src.abs || src.negate)) || 862 (shader->devinfo->ver == 7 && src.file == IMM)) { 863 const dst_reg tmp = vgrf(src.type); 864 MOV(tmp, src); 865 return tmp; 866 } else { 867 return src; 868 } 869 } 870 871 bblock_t *block; 872 exec_node *cursor; 873 874 unsigned _dispatch_width; 875 unsigned _group; 876 bool force_writemask_all; 877 878 /** Debug annotation info. */ 879 struct { 880 const char *str; 881 const void *ir; 882 } annotation; 883 }; 884 } 885 886 #endif 887