/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_VEC4_BUILDER_H
#define BRW_VEC4_BUILDER_H

#include "brw_ir_vec4.h"
#include "brw_ir_allocator.h"

namespace brw {
   /**
    * Toolbox to assemble a VEC4 IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::fs_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class vec4_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef brw::src_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef brw::dst_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef vec4_instruction instruction;

      /**
       * Construct a vec4_builder that inserts instructions into \p shader.
       */
      vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width), _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct a vec4_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size), _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }
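      /* A minimal usage sketch (illustrative only; "s", "a", "b" and "dst"
       * stand for a backend_shader pointer and previously created registers,
       * not names defined in this file):
       *
       *    const vec4_builder bld = vec4_builder(s).at_end()
       *                                            .annotate("lower-foo");
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.ADD(tmp, a, b);
       *    bld.MOV(dst, src_reg(tmp));
       */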
      /**
       * Construct a vec4_builder that inserts instructions before \p cursor
       * in basic block \p block, inheriting other code generation parameters
       * from this.
       */
      vec4_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         vec4_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct a vec4_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      vec4_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      vec4_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         vec4_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      vec4_builder
      exec_all(bool b = true) const
      {
         vec4_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      vec4_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         vec4_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (four for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for four logical
       * components in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return retype(dst_reg(VGRF, shader->alloc.allocate(
                                     n * DIV_ROUND_UP(type_sz(type), 4))),
                          type);
         else
            return retype(null_reg_ud(), type);
      }
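      /* A worked sizing example (illustrative): for a 4-byte type such as
       * BRW_REGISTER_TYPE_F, DIV_ROUND_UP(type_sz(type), 4) == 1, so
       * vgrf(BRW_REGISTER_TYPE_F) allocates one dispatch_width unit, while
       * an 8-byte type such as BRW_REGISTER_TYPE_DF allocates two units per
       * \p n to hold the wider components.
       */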
      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dst, src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         inst->exec_size = dispatch_width();
         inst->group = group();
         inst->force_writemask_all = force_writemask_all;
         inst->size_written = inst->exec_size * type_sz(inst->dst.type);
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
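      /* For example (illustrative; operands assumed to exist): SEL with GE
       * picks src0 whenever src0 >= src1, so
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);  // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);   // dst = min(a, b)
       */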
      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         const vec4_builder ubld = exec_all();
         const dst_reg chan_index =
            writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

         return src_reg(dst);
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Gen6 IF with embedded comparison.
       */
      instruction *
      IF(const src_reg &src0, const src_reg &src1,
         brw_conditional_mod condition) const
      {
         assert(shader->devinfo->gen == 6);
         return set_condmod(condition,
                            emit(BRW_OPCODE_IF,
                                 null_reg_d(),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
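      /* Illustrative control-flow sketch using the embedded comparison
       * (operand x is assumed; BRW_OPCODE_ELSE/ENDIF close the block):
       *
       *    bld.IF(x, brw_imm_f(0.0f), BRW_CONDITIONAL_G);
       *    ...then-branch instructions...
       *    bld.emit(BRW_OPCODE_ELSE);
       *    ...else-branch instructions...
       *    bld.emit(BRW_OPCODE_ENDIF);
       */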
      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      backend_shader *shader;

   protected:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for the details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for register access modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
          * able to use vertical stride of zero to replicate the vec4 uniform, like
          *
          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
          *
          * But you can't, since vertical stride is always four in three-source
          * instructions. Instead, insert a MOV instruction to do the replication so
          * that the three-source instruction can consume it.
          */

         /* The MOV is only needed if the source is a uniform or immediate. */
         if (src.file != UNIFORM && src.file != IMM)
            return src;

         if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
            return src;

         const dst_reg expanded = vgrf(src.type);
         emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
         return src_reg(expanded);
      }

      /**
       * Workaround for register access modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* The gen6 math instruction ignores the source modifiers --
          * swizzle, abs, negate, and at least some parts of the register
          * region description.
          *
          * Rather than trying to enumerate all these cases, *always* expand the
          * operand to a temp GRF for gen6.
          *
          * For gen7, keep the operand as-is, except if immediate, which gen7 still
          * can't use.
          */
         if (shader->devinfo->gen == 6 ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return src_reg(tmp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for other quirks of the math instruction: gen6 math
       * requires the full XYZW writemask, and gen4-5 math is sent as a
       * message that needs its MRF parameters set up.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen == 6 &&
             inst->dst.writemask != WRITEMASK_XYZW) {
            const dst_reg tmp = vgrf(inst->dst.type);
            MOV(inst->dst, src_reg(tmp));
            inst->dst = tmp;

         } else if (shader->devinfo->gen < 6) {
            const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
            inst->base_mrf = 1;
            inst->mlen = sources;
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif