/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_VEC4_BUILDER_H
#define BRW_VEC4_BUILDER_H

#include "brw_ir_vec4.h"
#include "brw_ir_allocator.h"

namespace brw {
   /**
    * Toolbox to assemble a VEC4 IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::fs_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class vec4_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef brw::src_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef brw::dst_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef vec4_instruction instruction;

      /**
       * Construct a vec4_builder that inserts instructions into \p shader.
       */
      vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width), _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct a vec4_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size), _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }
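      /* A minimal usage sketch (illustrative only; `v` is an assumed
       * backend_shader pointer, not part of this header): append at the
       * end of the program and emit a MOV into a freshly allocated VGRF.
       *
       *    const vec4_builder bld = vec4_builder(v).at_end();
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, src_reg(brw_imm_f(1.0f)));
       */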
      /**
       * Construct a vec4_builder that inserts instructions before \p cursor
       * in basic block \p block, inheriting other code generation parameters
       * from this.
       */
      vec4_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         vec4_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct a vec4_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      vec4_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      vec4_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         vec4_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      vec4_builder
      exec_all(bool b = true) const
      {
         vec4_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      vec4_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         vec4_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (four for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for four logical
       * components in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return retype(dst_reg(VGRF, shader->alloc.allocate(
                                     n * DIV_ROUND_UP(type_sz(type), 4))),
                          type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }
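      /* Hedged sketch of chaining the execution controls above, assuming an
       * 8-wide builder `bld` and a source `some_src` already in scope:
       * restrict subsequent instructions to the second group of four
       * channels and disable control flow masking for the copy.
       *
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
       *    bld.group(4, 1).exec_all().MOV(tmp, some_src);
       */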
      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dst, src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         inst->exec_size = dispatch_width();
         inst->group = group();
         inst->force_writemask_all = force_writemask_all;
         inst->size_written = inst->exec_size * type_sz(inst->dst.type);
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }
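      /* Illustrative only: routing a math opcode through the unary emit()
       * overload above lets fix_math_operand() and fix_math_instruction()
       * apply the gen-specific workarounds transparently (`bld` and `src`
       * assumed in scope).
       *
       *    const dst_reg rcp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.emit(SHADER_OPCODE_RCP, rcp, src);
       */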
      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         const vec4_builder ubld = exec_all();
         const dst_reg chan_index =
            writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

         return src_reg(dst);
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
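      /* Example of the comparison helpers above, under the assumption of a
       * builder `bld` and float sources `a` and `b`: take the component-wise
       * maximum, then compare it against zero, updating only the flags.
       *
       *    const dst_reg max = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.emit_minmax(max, a, b, BRW_CONDITIONAL_GE);
       *    bld.CMP(bld.null_reg_f(), src_reg(max), src_reg(brw_imm_f(0.0f)),
       *            BRW_CONDITIONAL_NZ);
       */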
      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Gen6 IF with embedded comparison.
       */
      instruction *
      IF(const src_reg &src0, const src_reg &src1,
         brw_conditional_mod condition) const
      {
         assert(shader->devinfo->gen == 6);
         return set_condmod(condition,
                            emit(BRW_OPCODE_IF,
                                 null_reg_d(),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
          * we need to reorder the operands.
          */
         assert(shader->devinfo->gen >= 6 && shader->devinfo->gen <= 9);
         return emit(BRW_OPCODE_LRP, dst, a, y, x);
      }

      backend_shader *shader;

   protected:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for the details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for register access modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         /* Using vec4 uniforms in SIMD4x2 programs is difficult.  You'd like
          * to be able to use a vertical stride of zero to replicate the vec4
          * uniform, like
          *
          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
          *
          * But you can't, since the vertical stride is always four in
          * three-source instructions.  Instead, insert a MOV instruction to
          * do the replication so that the three-source instruction can
          * consume it.
          */

         /* The MOV is only needed if the source is a uniform or immediate. */
         if (src.file != UNIFORM && src.file != IMM)
            return src;

         if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
            return src;

         const dst_reg expanded = vgrf(src.type);
         emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
         return src_reg(expanded);
      }

      /**
       * Workaround for register access modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* The gen6 math instruction ignores the source modifiers --
          * swizzle, abs, negate, and at least some parts of the register
          * region description.
          *
          * Rather than trying to enumerate all these cases, *always* expand
          * the operand to a temp GRF for gen6.
          *
          * For gen7, keep the operand as-is, except if immediate, which gen7
          * still can't use.
          */
         if (shader->devinfo->gen == 6 ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return src_reg(tmp);
         } else {
            return src;
         }
      }
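      /* Sketch of the ternary operand fixup above (hypothetical names: `u`
       * is a UNIFORM-file source; `dst`, `x` and `y` are assumed in scope):
       * a MAD emitted through the ternary emit() overload has the uniform
       * expanded into a temporary VGRF by fix_3src_operand() first, since
       * three-source instructions cannot replicate a vec4 uniform.
       *
       *    bld.MAD(dst, src_reg(u), x, y);
       */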
      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->gen == 6 &&
             inst->dst.writemask != WRITEMASK_XYZW) {
            const dst_reg tmp = vgrf(inst->dst.type);
            MOV(inst->dst, src_reg(tmp));
            inst->dst = tmp;

         } else if (shader->devinfo->gen < 6) {
            const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
            inst->base_mrf = 1;
            inst->mlen = sources;
         }

         return inst;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif