/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }
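      /* Illustrative usage sketch, not part of the builder itself: "v" is
       * assumed to stand for some backend_shader with a SIMD8 native width,
       * and the at_end(), vgrf() and MOV() helpers used here are defined
       * further below in this class.
       *
       *    const fs_builder bld = fs_builder(v, 8).at_end();
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, brw_imm_f(1.0f));
       *
       * Builders are cheap value types; the derivation helpers below return
       * modified copies rather than mutating the builder in place.
       */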
      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
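      /* Sketch of builder-state derivation (the builder names are
       * illustrative): given a SIMD16 builder "bld", this derives an
       * unmasked SIMD8 builder acting on the second half of the channels,
       * e.g. to emit one half of a manually split instruction:
       *
       *    const fs_builder hbld = bld.group(8, 1).exec_all();
       *
       * while this derives a SIMD1 builder for scalar bookkeeping values,
       * where per-channel control flow masking is meaningless:
       *
       *    const fs_builder ubld = bld.exec_all().group(1, 0);
       */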
      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of double floating type.
       */
      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }
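      /* Illustrative sketch (dst, x and y are assumed registers): the emit()
       * overloads dispatch on source count, so a reciprocal and a power can
       * be emitted as
       *
       *    bld.emit(SHADER_OPCODE_RCP, dst, x);
       *    bld.emit(SHADER_OPCODE_POW, dst, x, y);
       *
       * with the math-instruction source restrictions handled transparently
       * by the fix_math_operand() workaround defined later in this class.
       */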
      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
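      /* Usage sketch (dst, a and b are assumed registers): with the SEL
       * semantics described above, the two supported conditional mods give
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);    // dst = min(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // dst = max(a, b)
       */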
      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }

      /**
       * Copy \p num_components components of \p src into a newly allocated
       * contiguous range of VGRFs.
       */
      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }
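      /* Sketch of a typical emit_uniformize() use ("index" and "surface" are
       * assumed names for illustration): send-like messages want a scalar
       * surface or sampler index, so a potentially divergent index is first
       * reduced to a single live channel:
       *
       *    const src_reg surface = bld.emit_uniformize(index);
       *
       * The result only reads component 0, so downstream const/copy
       * propagation can fold it straight into the consuming send.
       */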
      /**
       * Emit an in-place scan of \p opcode over the channels of \p tmp,
       * combining channels in independent clusters of \p cluster_size.
       */
      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these, so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            src_reg left = component(tmp, i - 1);
            dst_reg right = horiz_offset(tmp, i);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > i * 2) {
               left = component(tmp, i * 3 - 1);
               right = horiz_offset(tmp, i * 3);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }

            if (dispatch_width() > i * 4) {
               left = component(tmp, i * 5 - 1);
               right = horiz_offset(tmp, i * 5);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               left = component(tmp, i * 7 - 1);
               right = horiz_offset(tmp, i * 7);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
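      /* Sketch of how the generated helpers compose (dst, a, b and c are
       * assumed registers): each macro above expands to a thin wrapper
       * around emit(), so a multiply-add can be written as
       *
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MUL(tmp, a, b);
       *    bld.ADD(dst, src_reg(tmp), c);
       *
       * or, where the ternary-instruction operand restrictions are
       * acceptable, as a single
       *
       *    bld.MAD(dst, c, a, b);   // dst = c + a * b
       */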
      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0),
             * so we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      /**
       * Mark the whole VGRF backing \p dst as written, with undefined
       * contents.
       */
      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }
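      /* Illustrative flag-based select built from the helpers above (x, a, b
       * and dst are assumed registers; set_predicate() and
       * BRW_PREDICATE_NORMAL come from the IR headers, not from this class):
       * CMP writes the flag register and a predicated SEL then picks per
       * channel:
       *
       *    bld.CMP(bld.null_reg_f(), x, brw_imm_f(0.0f), BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
       *
       * i.e. dst = (x >= 0.0f) ? a : b on a per-channel basis.
       */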
      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            /* fallthrough */
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif