1 /* -*- c++ -*- */ 2 /* 3 * Copyright © 2010-2015 Intel Corporation 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 */ 24 25 #ifndef BRW_VEC4_BUILDER_H 26 #define BRW_VEC4_BUILDER_H 27 28 #include "brw_ir_vec4.h" 29 #include "brw_ir_allocator.h" 30 31 namespace brw { 32 /** 33 * Toolbox to assemble a VEC4 IR program out of individual instructions. 34 * 35 * This object is meant to have an interface consistent with 36 * brw::fs_builder. They cannot be fully interchangeable because 37 * brw::fs_builder generates scalar code while brw::vec4_builder generates 38 * vector code. 39 */ 40 class vec4_builder { 41 public: 42 /** Type used in this IR to represent a source of an instruction. */ 43 typedef brw::src_reg src_reg; 44 45 /** Type used in this IR to represent the destination of an instruction. */ 46 typedef brw::dst_reg dst_reg; 47 48 /** Type used in this IR to represent an instruction. */ 49 typedef vec4_instruction instruction; 50 51 /** 52 * Construct a vec4_builder that inserts instructions into \p shader. 53 */ 54 vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) : shader(shader)55 shader(shader), block(NULL), cursor(NULL), 56 _dispatch_width(dispatch_width), _group(0), 57 force_writemask_all(false), 58 annotation() 59 { 60 } 61 62 /** 63 * Construct a vec4_builder that inserts instructions into \p shader 64 * before instruction \p inst in basic block \p block. The default 65 * execution controls and debug annotation are initialized from the 66 * instruction passed as argument. 67 */ vec4_builder(backend_shader * shader,bblock_t * block,instruction * inst)68 vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) : 69 shader(shader), block(block), cursor(inst), 70 _dispatch_width(inst->exec_size), _group(inst->group), 71 force_writemask_all(inst->force_writemask_all) 72 { 73 annotation.str = inst->annotation; 74 annotation.ir = inst->ir; 75 } 76 77 /** 78 * Construct a vec4_builder that inserts instructions before \p cursor 79 * in basic block \p block, inheriting other code generation parameters 80 * from this. 81 */ 82 vec4_builder at(bblock_t * block,exec_node * cursor)83 at(bblock_t *block, exec_node *cursor) const 84 { 85 vec4_builder bld = *this; 86 bld.block = block; 87 bld.cursor = cursor; 88 return bld; 89 } 90 91 /** 92 * Construct a vec4_builder appending instructions at the end of the 93 * instruction list of the shader, inheriting other code generation 94 * parameters from this. 95 */ 96 vec4_builder at_end()97 at_end() const 98 { 99 return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); 100 } 101 102 /** 103 * Construct a builder specifying the default SIMD width and group of 104 * channel enable signals, inheriting other code generation parameters 105 * from this. 106 * 107 * \p n gives the default SIMD width, \p i gives the slot group used for 108 * predication and control flow masking in multiples of \p n channels. 109 */ 110 vec4_builder group(unsigned n,unsigned i)111 group(unsigned n, unsigned i) const 112 { 113 assert(force_writemask_all || 114 (n <= dispatch_width() && i < dispatch_width() / n)); 115 vec4_builder bld = *this; 116 bld._dispatch_width = n; 117 bld._group += i * n; 118 return bld; 119 } 120 121 /** 122 * Construct a builder with per-channel control flow execution masking 123 * disabled if \p b is true. If control flow execution masking is 124 * already disabled this has no effect. 125 */ 126 vec4_builder 127 exec_all(bool b = true) const 128 { 129 vec4_builder bld = *this; 130 if (b) 131 bld.force_writemask_all = true; 132 return bld; 133 } 134 135 /** 136 * Construct a builder with the given debug annotation info. 137 */ 138 vec4_builder 139 annotate(const char *str, const void *ir = NULL) const 140 { 141 vec4_builder bld = *this; 142 bld.annotation.str = str; 143 bld.annotation.ir = ir; 144 return bld; 145 } 146 147 /** 148 * Get the SIMD width in use. 149 */ 150 unsigned dispatch_width()151 dispatch_width() const 152 { 153 return _dispatch_width; 154 } 155 156 /** 157 * Get the channel group in use. 158 */ 159 unsigned group()160 group() const 161 { 162 return _group; 163 } 164 165 /** 166 * Allocate a virtual register of natural vector size (four for this IR) 167 * and SIMD width. \p n gives the amount of space to allocate in 168 * dispatch_width units (which is just enough space for four logical 169 * components in this IR). 170 */ 171 dst_reg 172 vgrf(enum brw_reg_type type, unsigned n = 1) const 173 { 174 assert(dispatch_width() <= 32); 175 176 if (n > 0) 177 return retype(dst_reg(VGRF, shader->alloc.allocate( 178 n * DIV_ROUND_UP(type_sz(type), 4))), 179 type); 180 else 181 return retype(null_reg_ud(), type); 182 } 183 184 /** 185 * Create a null register of floating type. 186 */ 187 dst_reg null_reg_f()188 null_reg_f() const 189 { 190 return dst_reg(retype(brw_null_vec(dispatch_width()), 191 BRW_REGISTER_TYPE_F)); 192 } 193 194 /** 195 * Create a null register of signed integer type. 196 */ 197 dst_reg null_reg_d()198 null_reg_d() const 199 { 200 return dst_reg(retype(brw_null_vec(dispatch_width()), 201 BRW_REGISTER_TYPE_D)); 202 } 203 204 /** 205 * Create a null register of unsigned integer type. 206 */ 207 dst_reg null_reg_ud()208 null_reg_ud() const 209 { 210 return dst_reg(retype(brw_null_vec(dispatch_width()), 211 BRW_REGISTER_TYPE_UD)); 212 } 213 214 /** 215 * Insert an instruction into the program. 216 */ 217 instruction * emit(const instruction & inst)218 emit(const instruction &inst) const 219 { 220 return emit(new(shader->mem_ctx) instruction(inst)); 221 } 222 223 /** 224 * Create and insert a nullary control instruction into the program. 225 */ 226 instruction * emit(enum opcode opcode)227 emit(enum opcode opcode) const 228 { 229 return emit(instruction(opcode)); 230 } 231 232 /** 233 * Create and insert a nullary instruction into the program. 234 */ 235 instruction * emit(enum opcode opcode,const dst_reg & dst)236 emit(enum opcode opcode, const dst_reg &dst) const 237 { 238 return emit(instruction(opcode, dst)); 239 } 240 241 /** 242 * Create and insert a unary instruction into the program. 243 */ 244 instruction * emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)245 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const 246 { 247 switch (opcode) { 248 case SHADER_OPCODE_RCP: 249 case SHADER_OPCODE_RSQ: 250 case SHADER_OPCODE_SQRT: 251 case SHADER_OPCODE_EXP2: 252 case SHADER_OPCODE_LOG2: 253 case SHADER_OPCODE_SIN: 254 case SHADER_OPCODE_COS: 255 return fix_math_instruction( 256 emit(instruction(opcode, dst, 257 fix_math_operand(src0)))); 258 259 default: 260 return emit(instruction(opcode, dst, src0)); 261 } 262 } 263 264 /** 265 * Create and insert a binary instruction into the program. 266 */ 267 instruction * emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)268 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 269 const src_reg &src1) const 270 { 271 switch (opcode) { 272 case SHADER_OPCODE_POW: 273 case SHADER_OPCODE_INT_QUOTIENT: 274 case SHADER_OPCODE_INT_REMAINDER: 275 return fix_math_instruction( 276 emit(instruction(opcode, dst, 277 fix_math_operand(src0), 278 fix_math_operand(src1)))); 279 280 default: 281 return emit(instruction(opcode, dst, src0, src1)); 282 } 283 } 284 285 /** 286 * Create and insert a ternary instruction into the program. 287 */ 288 instruction * emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)289 emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 290 const src_reg &src1, const src_reg &src2) const 291 { 292 switch (opcode) { 293 case BRW_OPCODE_BFE: 294 case BRW_OPCODE_BFI2: 295 case BRW_OPCODE_MAD: 296 case BRW_OPCODE_LRP: 297 return emit(instruction(opcode, dst, 298 fix_3src_operand(src0), 299 fix_3src_operand(src1), 300 fix_3src_operand(src2))); 301 302 default: 303 return emit(instruction(opcode, dst, src0, src1, src2)); 304 } 305 } 306 307 /** 308 * Insert a preallocated instruction into the program. 309 */ 310 instruction * emit(instruction * inst)311 emit(instruction *inst) const 312 { 313 inst->exec_size = dispatch_width(); 314 inst->group = group(); 315 inst->force_writemask_all = force_writemask_all; 316 inst->size_written = inst->exec_size * type_sz(inst->dst.type); 317 inst->annotation = annotation.str; 318 inst->ir = annotation.ir; 319 320 if (block) 321 static_cast<instruction *>(cursor)->insert_before(block, inst); 322 else 323 cursor->insert_before(inst); 324 325 return inst; 326 } 327 328 /** 329 * Select \p src0 if the comparison of both sources with the given 330 * conditional mod evaluates to true, otherwise select \p src1. 331 * 332 * Generally useful to get the minimum or maximum of two values. 333 */ 334 instruction * emit_minmax(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod mod)335 emit_minmax(const dst_reg &dst, const src_reg &src0, 336 const src_reg &src1, brw_conditional_mod mod) const 337 { 338 assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); 339 340 return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), 341 fix_unsigned_negate(src1))); 342 } 343 344 /** 345 * Copy any live channel from \p src to the first channel of the result. 346 */ 347 src_reg emit_uniformize(const src_reg & src)348 emit_uniformize(const src_reg &src) const 349 { 350 const vec4_builder ubld = exec_all(); 351 const dst_reg chan_index = 352 writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X); 353 const dst_reg dst = vgrf(src.type); 354 355 ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); 356 ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index)); 357 358 return src_reg(dst); 359 } 360 361 /** 362 * Assorted arithmetic ops. 363 * @{ 364 */ 365 #define ALU1(op) \ 366 instruction * \ 367 op(const dst_reg &dst, const src_reg &src0) const \ 368 { \ 369 return emit(BRW_OPCODE_##op, dst, src0); \ 370 } 371 372 #define ALU2(op) \ 373 instruction * \ 374 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 375 { \ 376 return emit(BRW_OPCODE_##op, dst, src0, src1); \ 377 } 378 379 #define ALU2_ACC(op) \ 380 instruction * \ 381 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 382 { \ 383 instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ 384 inst->writes_accumulator = true; \ 385 return inst; \ 386 } 387 388 #define ALU3(op) \ 389 instruction * \ 390 op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ 391 const src_reg &src2) const \ 392 { \ 393 return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ 394 } 395 396 ALU2(ADD) ALU2_ACC(ADDC)397 ALU2_ACC(ADDC) 398 ALU2(AND) 399 ALU2(ASR) 400 ALU2(AVG) 401 ALU3(BFE) 402 ALU2(BFI1) 403 ALU3(BFI2) 404 ALU1(BFREV) 405 ALU1(CBIT) 406 ALU3(CSEL) 407 ALU1(DIM) 408 ALU2(DP2) 409 ALU2(DP3) 410 ALU2(DP4) 411 ALU2(DPH) 412 ALU1(F16TO32) 413 ALU1(F32TO16) 414 ALU1(FBH) 415 ALU1(FBL) 416 ALU1(FRC) 417 ALU2(LINE) 418 ALU1(LZD) 419 ALU2(MAC) 420 ALU2_ACC(MACH) 421 ALU3(MAD) 422 ALU1(MOV) 423 ALU2(MUL) 424 ALU1(NOT) 425 ALU2(OR) 426 ALU2(PLN) 427 ALU1(RNDD) 428 ALU1(RNDE) 429 ALU1(RNDU) 430 ALU1(RNDZ) 431 ALU2(SAD2) 432 ALU2_ACC(SADA2) 433 ALU2(SEL) 434 ALU2(SHL) 435 ALU2(SHR) 436 ALU2_ACC(SUBB) 437 ALU2(XOR) 438 439 #undef ALU3 440 #undef ALU2_ACC 441 #undef ALU2 442 #undef ALU1 443 /** @} */ 444 445 /** 446 * CMP: Sets the low bit of the destination channels with the result 447 * of the comparison, while the upper bits are undefined, and updates 448 * the flag register with the packed 16 bits of the result. 449 */ 450 instruction * 451 CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 452 brw_conditional_mod condition) const 453 { 454 /* Take the instruction: 455 * 456 * CMP null<d> src0<f> src1<f> 457 * 458 * Original gfx4 does type conversion to the destination type 459 * before comparison, producing garbage results for floating 460 * point comparisons. 461 * 462 * The destination type doesn't matter on newer generations, 463 * so we set the type to match src0 so we can compact the 464 * instruction. 465 */ 466 return set_condmod(condition, 467 emit(BRW_OPCODE_CMP, retype(dst, src0.type), 468 fix_unsigned_negate(src0), 469 fix_unsigned_negate(src1))); 470 } 471 472 /** 473 * CMPN: Behaves like CMP, but produces true if src1 is NaN. 474 */ 475 instruction * CMPN(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod condition)476 CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 477 brw_conditional_mod condition) const 478 { 479 /* Take the instruction: 480 * 481 * CMPN null<d> src0<f> src1<f> 482 * 483 * Original gfx4 does type conversion to the destination type 484 * before comparison, producing garbage results for floating 485 * point comparisons. 486 * 487 * The destination type doesn't matter on newer generations, 488 * so we set the type to match src0 so we can compact the 489 * instruction. 490 */ 491 return set_condmod(condition, 492 emit(BRW_OPCODE_CMPN, retype(dst, src0.type), 493 fix_unsigned_negate(src0), 494 fix_unsigned_negate(src1))); 495 } 496 497 /** 498 * Gfx4 predicated IF. 499 */ 500 instruction * IF(brw_predicate predicate)501 IF(brw_predicate predicate) const 502 { 503 return set_predicate(predicate, emit(BRW_OPCODE_IF)); 504 } 505 506 /** 507 * Gfx6 IF with embedded comparison. 508 */ 509 instruction * IF(const src_reg & src0,const src_reg & src1,brw_conditional_mod condition)510 IF(const src_reg &src0, const src_reg &src1, 511 brw_conditional_mod condition) const 512 { 513 assert(shader->devinfo->ver == 6); 514 return set_condmod(condition, 515 emit(BRW_OPCODE_IF, 516 null_reg_d(), 517 fix_unsigned_negate(src0), 518 fix_unsigned_negate(src1))); 519 } 520 521 /** 522 * Emit a linear interpolation instruction. 523 */ 524 instruction * LRP(const dst_reg & dst,const src_reg & x,const src_reg & y,const src_reg & a)525 LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, 526 const src_reg &a) const 527 { 528 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so 529 * we need to reorder the operands. 530 */ 531 assert(shader->devinfo->ver >= 6 && shader->devinfo->ver <= 9); 532 return emit(BRW_OPCODE_LRP, dst, a, y, x); 533 } 534 535 backend_shader *shader; 536 537 protected: 538 /** 539 * Workaround for negation of UD registers. See comment in 540 * fs_generator::generate_code() for the details. 541 */ 542 src_reg fix_unsigned_negate(const src_reg & src)543 fix_unsigned_negate(const src_reg &src) const 544 { 545 if (src.type == BRW_REGISTER_TYPE_UD && src.negate) { 546 dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD); 547 MOV(temp, src); 548 return src_reg(temp); 549 } else { 550 return src; 551 } 552 } 553 554 /** 555 * Workaround for register access modes not supported by the ternary 556 * instruction encoding. 557 */ 558 src_reg fix_3src_operand(const src_reg & src)559 fix_3src_operand(const src_reg &src) const 560 { 561 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be 562 * able to use vertical stride of zero to replicate the vec4 uniform, like 563 * 564 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] 565 * 566 * But you can't, since vertical stride is always four in three-source 567 * instructions. Instead, insert a MOV instruction to do the replication so 568 * that the three-source instruction can consume it. 569 */ 570 571 /* The MOV is only needed if the source is a uniform or immediate. */ 572 if (src.file != UNIFORM && src.file != IMM) 573 return src; 574 575 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) 576 return src; 577 578 const dst_reg expanded = vgrf(src.type); 579 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); 580 return src_reg(expanded); 581 } 582 583 /** 584 * Workaround for register access modes not supported by the math 585 * instruction. 586 */ 587 src_reg fix_math_operand(const src_reg & src)588 fix_math_operand(const src_reg &src) const 589 { 590 /* The gfx6 math instruction ignores the source modifiers -- 591 * swizzle, abs, negate, and at least some parts of the register 592 * region description. 593 * 594 * Rather than trying to enumerate all these cases, *always* expand the 595 * operand to a temp GRF for gfx6. 596 * 597 * For gfx7, keep the operand as-is, except if immediate, which gfx7 still 598 * can't use. 599 */ 600 if (shader->devinfo->ver == 6 || 601 (shader->devinfo->ver == 7 && src.file == IMM)) { 602 const dst_reg tmp = vgrf(src.type); 603 MOV(tmp, src); 604 return src_reg(tmp); 605 } else { 606 return src; 607 } 608 } 609 610 /** 611 * Workaround other weirdness of the math instruction. 612 */ 613 instruction * fix_math_instruction(instruction * inst)614 fix_math_instruction(instruction *inst) const 615 { 616 if (shader->devinfo->ver == 6 && 617 inst->dst.writemask != WRITEMASK_XYZW) { 618 const dst_reg tmp = vgrf(inst->dst.type); 619 MOV(inst->dst, src_reg(tmp)); 620 inst->dst = tmp; 621 622 } else if (shader->devinfo->ver < 6) { 623 const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2); 624 inst->base_mrf = 1; 625 inst->mlen = sources; 626 } 627 628 return inst; 629 } 630 631 bblock_t *block; 632 exec_node *cursor; 633 634 unsigned _dispatch_width; 635 unsigned _group; 636 bool force_writemask_all; 637 638 /** Debug annotation info. */ 639 struct { 640 const char *str; 641 const void *ir; 642 } annotation; 643 }; 644 } 645 646 #endif 647