/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_eu.h"
#include "brw_fs.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  The two builders cannot be fully interchangeable
    * because brw::fs_builder generates scalar code while brw::vec4_builder
    * generates vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }
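
      /* Illustrative sketch (not part of this header): a visitor would
       * typically construct a builder for its own dispatch width and use it
       * to emit instructions into a fresh temporary, e.g.
       *
       *    const fs_builder bld(this);
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, some_src);
       *
       * where "this" is assumed to be an fs_visitor and some_src a
       * previously initialized fs_reg.
       */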

      explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
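
      /* Illustrative sketch (not part of this header): splitting a SIMD16
       * operation into two SIMD8 halves could be written as
       *
       *    bld.group(8, 0).MOV(dst, src);
       *    bld.group(8, 1).MOV(horiz_offset(dst, 8), horiz_offset(src, 8));
       *
       * where bld is assumed to be a SIMD16 builder and dst/src are
       * hypothetical packed SIMD16 registers.
       */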

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           unit * REG_SIZE) * unit),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
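
      /* Illustrative sketch (not part of this header): vgrf() allocates
       * dispatch_width() channels per logical component, so a two-component
       * temporary and its second component might be obtained as
       *
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
       *    const fs_reg second = offset(tmp, bld, 1);
       *
       * where bld is an existing builder and offset() is the helper defined
       * at the end of this file.
       */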

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, src0));
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         return emit(instruction(opcode, dispatch_width(), dst,
                                 src0, src1));
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
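
      /* Illustrative sketch (not part of this header): taking the minimum of
       * two hypothetical values a and b could be written as
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);
       *
       * while BRW_CONDITIONAL_GE would give the maximum instead.
       */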

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
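
      /* Illustrative sketch (not part of this header): a divergent surface
       * index would typically be made uniform before being consumed by a
       * send, e.g.
       *
       *    const fs_reg surface = bld.emit_uniformize(divergent_index);
       *
       * where divergent_index is a hypothetical per-channel value.
       */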

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == BRW_REGISTER_TYPE_Q ||
              tmp.type == BRW_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case BRW_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case BRW_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
               if (mod == BRW_CONDITIONAL_GE)
                  mod = BRW_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                * l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
                   retype(right_low, BRW_REGISTER_TYPE_UD), mod);
               set_predicate(BRW_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 BRW_CONDITIONAL_EQ));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }

      instruction *
      emit_undef_for_dst(const instruction *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU3(ADD3)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU3(DP4A)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
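
      /* Illustrative sketch (not part of this header): CMP is commonly
       * paired with a predicated instruction, mirroring the pattern used by
       * emit_scan_step() above, e.g.
       *
       *    bld.CMP(bld.null_reg_d(), a, b, BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(dst, a));
       *
       * where a, b and dst are hypothetical registers and set_predicate() is
       * the same IR helper used by emit_scan_step().
       */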

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0),
             * so we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * type_sz(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }
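
      /* Illustrative sketch (not part of this header): gathering two
       * hypothetical per-channel values into a contiguous payload, as
       * move_to_vgrf() does above, could be written as
       *
       *    const fs_reg srcs[] = { comp0, comp1 };
       *    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
       *    bld.LOAD_PAYLOAD(payload, srcs, 2, 0);
       *
       * with a header_size of zero indicating there is no
       * execution-mask-independent header at the start of the payload.
       */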

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      instruction *
      DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, unsigned sdepth, unsigned rcount) const
      {
         assert(_dispatch_width == 8);
         assert(sdepth == 8);
         assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

         instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
         inst->sdepth = sdepth;
         inst->rcount = rcount;

         if (dst.type == BRW_REGISTER_TYPE_HF) {
            inst->size_written = rcount * REG_SIZE / 2;
         } else {
            inst->size_written = rcount * REG_SIZE;
         }

         return inst;
      }

      fs_visitor *shader;

      fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
      fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
      fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
      fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
      fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
      fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

static inline fs_reg
offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}

#endif