• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* -*- c++ -*- */
2 /*
3  * Copyright © 2010-2015 Intel Corporation
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  */
24 
25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27 
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 
31 namespace brw {
32    /**
33     * Toolbox to assemble an FS IR program out of individual instructions.
34     *
35     * This object is meant to have an interface consistent with
36     * brw::vec4_builder.  They cannot be fully interchangeable because
37     * brw::fs_builder generates scalar code while brw::vec4_builder generates
38     * vector code.
39     */
40    class fs_builder {
41    public:
42       /** Type used in this IR to represent a source of an instruction. */
43       typedef fs_reg src_reg;
44 
45       /** Type used in this IR to represent the destination of an instruction. */
46       typedef fs_reg dst_reg;
47 
48       /** Type used in this IR to represent an instruction. */
49       typedef fs_inst instruction;
50 
51       /**
52        * Construct an fs_builder that inserts instructions into \p shader.
53        * \p dispatch_width gives the native execution width of the program.
54        */
fs_builder(backend_shader * shader,unsigned dispatch_width)55       fs_builder(backend_shader *shader,
56                  unsigned dispatch_width) :
57          shader(shader), block(NULL), cursor(NULL),
58          _dispatch_width(dispatch_width),
59          _group(0),
60          force_writemask_all(false),
61          annotation()
62       {
63       }
64 
65       /**
66        * Construct an fs_builder that inserts instructions into \p shader
67        * before instruction \p inst in basic block \p block.  The default
68        * execution controls and debug annotation are initialized from the
69        * instruction passed as argument.
70        */
fs_builder(backend_shader * shader,bblock_t * block,fs_inst * inst)71       fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
72          shader(shader), block(block), cursor(inst),
73          _dispatch_width(inst->exec_size),
74          _group(inst->group),
75          force_writemask_all(inst->force_writemask_all)
76       {
77          annotation.str = inst->annotation;
78          annotation.ir = inst->ir;
79       }
80 
81       /**
82        * Construct an fs_builder that inserts instructions before \p cursor in
83        * basic block \p block, inheriting other code generation parameters
84        * from this.
85        */
86       fs_builder
at(bblock_t * block,exec_node * cursor)87       at(bblock_t *block, exec_node *cursor) const
88       {
89          fs_builder bld = *this;
90          bld.block = block;
91          bld.cursor = cursor;
92          return bld;
93       }
94 
95       /**
96        * Construct an fs_builder appending instructions at the end of the
97        * instruction list of the shader, inheriting other code generation
98        * parameters from this.
99        */
100       fs_builder
at_end()101       at_end() const
102       {
103          return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
104       }
105 
106       /**
107        * Construct a builder specifying the default SIMD width and group of
108        * channel enable signals, inheriting other code generation parameters
109        * from this.
110        *
111        * \p n gives the default SIMD width, \p i gives the slot group used for
112        * predication and control flow masking in multiples of \p n channels.
113        */
114       fs_builder
group(unsigned n,unsigned i)115       group(unsigned n, unsigned i) const
116       {
117          fs_builder bld = *this;
118 
119          if (n <= dispatch_width() && i < dispatch_width() / n) {
120             bld._group += i * n;
121          } else {
122             /* The requested channel group isn't a subset of the channel group
123              * of this builder, which means that the resulting instructions
124              * would use (potentially undefined) channel enable signals not
125              * specified by the parent builder.  That's only valid if the
126              * instruction doesn't have per-channel semantics, in which case
127              * we should clear off the default group index in order to prevent
128              * emitting instructions with channel group not aligned to their
129              * own execution size.
130              */
131             assert(force_writemask_all);
132             bld._group = 0;
133          }
134 
135          bld._dispatch_width = n;
136          return bld;
137       }
138 
139       /**
140        * Alias for group() with width equal to eight.
141        */
142       fs_builder
quarter(unsigned i)143       quarter(unsigned i) const
144       {
145          return group(8, i);
146       }
147 
148       /**
149        * Construct a builder with per-channel control flow execution masking
150        * disabled if \p b is true.  If control flow execution masking is
151        * already disabled this has no effect.
152        */
153       fs_builder
154       exec_all(bool b = true) const
155       {
156          fs_builder bld = *this;
157          if (b)
158             bld.force_writemask_all = true;
159          return bld;
160       }
161 
162       /**
163        * Construct a builder with the given debug annotation info.
164        */
165       fs_builder
166       annotate(const char *str, const void *ir = NULL) const
167       {
168          fs_builder bld = *this;
169          bld.annotation.str = str;
170          bld.annotation.ir = ir;
171          return bld;
172       }
173 
174       /**
175        * Get the SIMD width in use.
176        */
177       unsigned
dispatch_width()178       dispatch_width() const
179       {
180          return _dispatch_width;
181       }
182 
183       /**
184        * Get the channel group in use.
185        */
186       unsigned
group()187       group() const
188       {
189          return _group;
190       }
191 
192       /**
193        * Allocate a virtual register of natural vector size (one for this IR)
194        * and SIMD width.  \p n gives the amount of space to allocate in
195        * dispatch_width units (which is just enough space for one logical
196        * component in this IR).
197        */
198       dst_reg
199       vgrf(enum brw_reg_type type, unsigned n = 1) const
200       {
201          assert(dispatch_width() <= 32);
202 
203          if (n > 0)
204             return dst_reg(VGRF, shader->alloc.allocate(
205                               DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
206                                            REG_SIZE)),
207                            type);
208          else
209             return retype(null_reg_ud(), type);
210       }
211 
212       /**
213        * Create a null register of floating type.
214        */
215       dst_reg
null_reg_f()216       null_reg_f() const
217       {
218          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
219       }
220 
221       dst_reg
null_reg_df()222       null_reg_df() const
223       {
224          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
225       }
226 
227       /**
228        * Create a null register of signed integer type.
229        */
230       dst_reg
null_reg_d()231       null_reg_d() const
232       {
233          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
234       }
235 
236       /**
237        * Create a null register of unsigned integer type.
238        */
239       dst_reg
null_reg_ud()240       null_reg_ud() const
241       {
242          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
243       }
244 
245       /**
246        * Insert an instruction into the program.
247        */
248       instruction *
emit(const instruction & inst)249       emit(const instruction &inst) const
250       {
251          return emit(new(shader->mem_ctx) instruction(inst));
252       }
253 
254       /**
255        * Create and insert a nullary control instruction into the program.
256        */
257       instruction *
emit(enum opcode opcode)258       emit(enum opcode opcode) const
259       {
260          return emit(instruction(opcode, dispatch_width()));
261       }
262 
263       /**
264        * Create and insert a nullary instruction into the program.
265        */
266       instruction *
emit(enum opcode opcode,const dst_reg & dst)267       emit(enum opcode opcode, const dst_reg &dst) const
268       {
269          return emit(instruction(opcode, dispatch_width(), dst));
270       }
271 
272       /**
273        * Create and insert a unary instruction into the program.
274        */
275       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)276       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
277       {
278          switch (opcode) {
279          case SHADER_OPCODE_RCP:
280          case SHADER_OPCODE_RSQ:
281          case SHADER_OPCODE_SQRT:
282          case SHADER_OPCODE_EXP2:
283          case SHADER_OPCODE_LOG2:
284          case SHADER_OPCODE_SIN:
285          case SHADER_OPCODE_COS:
286             return emit(instruction(opcode, dispatch_width(), dst,
287                                     fix_math_operand(src0)));
288 
289          default:
290             return emit(instruction(opcode, dispatch_width(), dst, src0));
291          }
292       }
293 
294       /**
295        * Create and insert a binary instruction into the program.
296        */
297       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)298       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
299            const src_reg &src1) const
300       {
301          switch (opcode) {
302          case SHADER_OPCODE_POW:
303          case SHADER_OPCODE_INT_QUOTIENT:
304          case SHADER_OPCODE_INT_REMAINDER:
305             return emit(instruction(opcode, dispatch_width(), dst,
306                                     fix_math_operand(src0),
307                                     fix_math_operand(src1)));
308 
309          default:
310             return emit(instruction(opcode, dispatch_width(), dst,
311                                     src0, src1));
312 
313          }
314       }
315 
316       /**
317        * Create and insert a ternary instruction into the program.
318        */
319       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)320       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
321            const src_reg &src1, const src_reg &src2) const
322       {
323          switch (opcode) {
324          case BRW_OPCODE_BFE:
325          case BRW_OPCODE_BFI2:
326          case BRW_OPCODE_MAD:
327          case BRW_OPCODE_LRP:
328             return emit(instruction(opcode, dispatch_width(), dst,
329                                     fix_3src_operand(src0),
330                                     fix_3src_operand(src1),
331                                     fix_3src_operand(src2)));
332 
333          default:
334             return emit(instruction(opcode, dispatch_width(), dst,
335                                     src0, src1, src2));
336          }
337       }
338 
339       /**
340        * Create and insert an instruction with a variable number of sources
341        * into the program.
342        */
343       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg srcs[],unsigned n)344       emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
345            unsigned n) const
346       {
347          /* Use the emit() methods for specific operand counts to ensure that
348           * opcode-specific operand fixups occur.
349           */
350          if (n == 2) {
351             return emit(opcode, dst, srcs[0], srcs[1]);
352          } else if (n == 3) {
353             return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
354          } else {
355             return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
356          }
357       }
358 
359       /**
360        * Insert a preallocated instruction into the program.
361        */
362       instruction *
emit(instruction * inst)363       emit(instruction *inst) const
364       {
365          assert(inst->exec_size <= 32);
366          assert(inst->exec_size == dispatch_width() ||
367                 force_writemask_all);
368 
369          inst->group = _group;
370          inst->force_writemask_all = force_writemask_all;
371          inst->annotation = annotation.str;
372          inst->ir = annotation.ir;
373 
374          if (block)
375             static_cast<instruction *>(cursor)->insert_before(block, inst);
376          else
377             cursor->insert_before(inst);
378 
379          return inst;
380       }
381 
382       /**
383        * Select \p src0 if the comparison of both sources with the given
384        * conditional mod evaluates to true, otherwise select \p src1.
385        *
386        * Generally useful to get the minimum or maximum of two values.
387        */
388       instruction *
emit_minmax(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod mod)389       emit_minmax(const dst_reg &dst, const src_reg &src0,
390                   const src_reg &src1, brw_conditional_mod mod) const
391       {
392          assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
393 
394          /* In some cases we can't have bytes as operand for src1, so use the
395           * same type for both operand.
396           */
397          return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
398                                      fix_unsigned_negate(src1)));
399       }
400 
401       /**
402        * Copy any live channel from \p src to the first channel of the result.
403        */
404       src_reg
emit_uniformize(const src_reg & src)405       emit_uniformize(const src_reg &src) const
406       {
407          /* FIXME: We use a vector chan_index and dst to allow constant and
408           * copy propagration to move result all the way into the consuming
409           * instruction (typically a surface index or sampler index for a
410           * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
411           * dispatch. Once we teach const/copy propagation about scalars we
412           * should go back to scalar destinations here.
413           */
414          const fs_builder ubld = exec_all();
415          const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
416          const dst_reg dst = vgrf(src.type);
417 
418          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
419          ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
420 
421          return src_reg(component(dst, 0));
422       }
423 
424       src_reg
move_to_vgrf(const src_reg & src,unsigned num_components)425       move_to_vgrf(const src_reg &src, unsigned num_components) const
426       {
427          src_reg *const src_comps = new src_reg[num_components];
428          for (unsigned i = 0; i < num_components; i++)
429             src_comps[i] = offset(src, dispatch_width(), i);
430 
431          const dst_reg dst = vgrf(src.type, num_components);
432          LOAD_PAYLOAD(dst, src_comps, num_components, 0);
433 
434          delete[] src_comps;
435 
436          return src_reg(dst);
437       }
438 
439       void
emit_scan_step(enum opcode opcode,brw_conditional_mod mod,const dst_reg & tmp,unsigned left_offset,unsigned left_stride,unsigned right_offset,unsigned right_stride)440       emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
441                      const dst_reg &tmp,
442                      unsigned left_offset, unsigned left_stride,
443                      unsigned right_offset, unsigned right_stride) const
444       {
445          dst_reg left, right;
446          left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
447          right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
448          if ((tmp.type == BRW_REGISTER_TYPE_Q ||
449               tmp.type == BRW_REGISTER_TYPE_UQ) &&
450              !shader->devinfo->has_64bit_int) {
451             switch (opcode) {
452             case BRW_OPCODE_MUL:
453                /* This will get lowered by integer MUL lowering */
454                set_condmod(mod, emit(opcode, right, left, right));
455                break;
456 
457             case BRW_OPCODE_SEL: {
458                /* In order for the comparisons to work out right, we need our
459                 * comparisons to be strict.
460                 */
461                assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
462                if (mod == BRW_CONDITIONAL_GE)
463                   mod = BRW_CONDITIONAL_G;
464 
465                /* We treat the bottom 32 bits as unsigned regardless of
466                 * whether or not the integer as a whole is signed.
467                 */
468                dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
469                dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);
470 
471                /* The upper bits get the same sign as the 64-bit type */
472                brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
473                dst_reg right_high = subscript(right, type32, 1);
474                dst_reg left_high = subscript(left, type32, 1);
475 
476                /* Build up our comparison:
477                 *
478                 *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
479                 */
480                CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
481                                   retype(right_low, BRW_REGISTER_TYPE_UD), mod);
482                set_predicate(BRW_PREDICATE_NORMAL,
483                              CMP(null_reg_ud(), left_high, right_high,
484                                  BRW_CONDITIONAL_EQ));
485                set_predicate_inv(BRW_PREDICATE_NORMAL, true,
486                                  CMP(null_reg_ud(), left_high, right_high, mod));
487 
488                /* We could use selects here or we could use predicated MOVs
489                 * because the destination and second source (if it were a SEL)
490                 * are the same.
491                 */
492                set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
493                set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
494                break;
495             }
496 
497             default:
498                unreachable("Unsupported 64-bit scan op");
499             }
500          } else {
501             set_condmod(mod, emit(opcode, right, left, right));
502          }
503       }
504 
505       void
emit_scan(enum opcode opcode,const dst_reg & tmp,unsigned cluster_size,brw_conditional_mod mod)506       emit_scan(enum opcode opcode, const dst_reg &tmp,
507                 unsigned cluster_size, brw_conditional_mod mod) const
508       {
509          assert(dispatch_width() >= 8);
510 
511          /* The instruction splitting code isn't advanced enough to split
512           * these so we need to handle that ourselves.
513           */
514          if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
515             const unsigned half_width = dispatch_width() / 2;
516             const fs_builder ubld = exec_all().group(half_width, 0);
517             dst_reg left = tmp;
518             dst_reg right = horiz_offset(tmp, half_width);
519             ubld.emit_scan(opcode, left, cluster_size, mod);
520             ubld.emit_scan(opcode, right, cluster_size, mod);
521             if (cluster_size > half_width) {
522                ubld.emit_scan_step(opcode, mod, tmp,
523                                    half_width - 1, 0, half_width, 1);
524             }
525             return;
526          }
527 
528          if (cluster_size > 1) {
529             const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
530             ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
531          }
532 
533          if (cluster_size > 2) {
534             if (type_sz(tmp.type) <= 4) {
535                const fs_builder ubld =
536                   exec_all().group(dispatch_width() / 4, 0);
537                ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
538                ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
539             } else {
540                /* For 64-bit types, we have to do things differently because
541                 * the code above would land us with destination strides that
542                 * the hardware can't handle.  Fortunately, we'll only be
543                 * 8-wide in that case and it's the same number of
544                 * instructions.
545                 */
546                const fs_builder ubld = exec_all().group(2, 0);
547                for (unsigned i = 0; i < dispatch_width(); i += 4)
548                   ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
549             }
550          }
551 
552          for (unsigned i = 4;
553               i < MIN2(cluster_size, dispatch_width());
554               i *= 2) {
555             const fs_builder ubld = exec_all().group(i, 0);
556             ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);
557 
558             if (dispatch_width() > i * 2)
559                ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);
560 
561             if (dispatch_width() > i * 4) {
562                ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
563                ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
564             }
565          }
566       }
567 
568       /**
569        * Assorted arithmetic ops.
570        * @{
571        */
572 #define ALU1(op)                                        \
573       instruction *                                     \
574       op(const dst_reg &dst, const src_reg &src0) const \
575       {                                                 \
576          return emit(BRW_OPCODE_##op, dst, src0);       \
577       }
578 
579 #define ALU2(op)                                                        \
580       instruction *                                                     \
581       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
582       {                                                                 \
583          return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
584       }
585 
586 #define ALU2_ACC(op)                                                    \
587       instruction *                                                     \
588       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
589       {                                                                 \
590          instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
591          inst->writes_accumulator = true;                               \
592          return inst;                                                   \
593       }
594 
595 #define ALU3(op)                                                        \
596       instruction *                                                     \
597       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
598          const src_reg &src2) const                                     \
599       {                                                                 \
600          return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
601       }
602 
603       ALU2(ADD)
ALU3(ADD3)604       ALU3(ADD3)
605       ALU2_ACC(ADDC)
606       ALU2(AND)
607       ALU2(ASR)
608       ALU2(AVG)
609       ALU3(BFE)
610       ALU2(BFI1)
611       ALU3(BFI2)
612       ALU1(BFREV)
613       ALU1(CBIT)
614       ALU1(DIM)
615       ALU2(DP2)
616       ALU2(DP3)
617       ALU2(DP4)
618       ALU2(DPH)
619       ALU1(F16TO32)
620       ALU1(F32TO16)
621       ALU1(FBH)
622       ALU1(FBL)
623       ALU1(FRC)
624       ALU3(DP4A)
625       ALU2(LINE)
626       ALU1(LZD)
627       ALU2(MAC)
628       ALU2_ACC(MACH)
629       ALU3(MAD)
630       ALU1(MOV)
631       ALU2(MUL)
632       ALU1(NOT)
633       ALU2(OR)
634       ALU2(PLN)
635       ALU1(RNDD)
636       ALU1(RNDE)
637       ALU1(RNDU)
638       ALU1(RNDZ)
639       ALU2(ROL)
640       ALU2(ROR)
641       ALU2(SAD2)
642       ALU2_ACC(SADA2)
643       ALU2(SEL)
644       ALU2(SHL)
645       ALU2(SHR)
646       ALU2_ACC(SUBB)
647       ALU2(XOR)
648 
649 #undef ALU3
650 #undef ALU2_ACC
651 #undef ALU2
652 #undef ALU1
653       /** @} */
654 
655       /**
656        * CMP: Sets the low bit of the destination channels with the result
657        * of the comparison, while the upper bits are undefined, and updates
658        * the flag register with the packed 16 bits of the result.
659        */
660       instruction *
661       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
662           brw_conditional_mod condition) const
663       {
664          /* Take the instruction:
665           *
666           * CMP null<d> src0<f> src1<f>
667           *
668           * Original gfx4 does type conversion to the destination type
669           * before comparison, producing garbage results for floating
670           * point comparisons.
671           *
672           * The destination type doesn't matter on newer generations,
673           * so we set the type to match src0 so we can compact the
674           * instruction.
675           */
676          return set_condmod(condition,
677                             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
678                                  fix_unsigned_negate(src0),
679                                  fix_unsigned_negate(src1)));
680       }
681 
682       /**
683        * CMPN: Behaves like CMP, but produces true if src1 is NaN.
684        */
685       instruction *
CMPN(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod condition)686       CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
687            brw_conditional_mod condition) const
688       {
689          /* Take the instruction:
690           *
691           * CMP null<d> src0<f> src1<f>
692           *
693           * Original gfx4 does type conversion to the destination type
694           * before comparison, producing garbage results for floating
695           * point comparisons.
696           *
697           * The destination type doesn't matter on newer generations,
698           * so we set the type to match src0 so we can compact the
699           * instruction.
700           */
701          return set_condmod(condition,
702                             emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
703                                  fix_unsigned_negate(src0),
704                                  fix_unsigned_negate(src1)));
705       }
706 
707       /**
708        * Gfx4 predicated IF.
709        */
710       instruction *
IF(brw_predicate predicate)711       IF(brw_predicate predicate) const
712       {
713          return set_predicate(predicate, emit(BRW_OPCODE_IF));
714       }
715 
716       /**
717        * CSEL: dst = src2 <op> 0.0f ? src0 : src1
718        */
719       instruction *
CSEL(const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2,brw_conditional_mod condition)720       CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
721            const src_reg &src2, brw_conditional_mod condition) const
722       {
723          /* CSEL only operates on floats, so we can't do integer </<=/>=/>
724           * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
725           * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
726           */
727          assert(src2.type == BRW_REGISTER_TYPE_F);
728 
729          return set_condmod(condition,
730                             emit(BRW_OPCODE_CSEL,
731                                  retype(dst, BRW_REGISTER_TYPE_F),
732                                  retype(src0, BRW_REGISTER_TYPE_F),
733                                  retype(src1, BRW_REGISTER_TYPE_F),
734                                  src2));
735       }
736 
737       /**
738        * Emit a linear interpolation instruction.
739        */
740       instruction *
LRP(const dst_reg & dst,const src_reg & x,const src_reg & y,const src_reg & a)741       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
742           const src_reg &a) const
743       {
744          if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
745             /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
746              * we need to reorder the operands.
747              */
748             return emit(BRW_OPCODE_LRP, dst, a, y, x);
749 
750          } else {
751             /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
752             const dst_reg y_times_a = vgrf(dst.type);
753             const dst_reg one_minus_a = vgrf(dst.type);
754             const dst_reg x_times_one_minus_a = vgrf(dst.type);
755 
756             MUL(y_times_a, y, a);
757             ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
758             MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
759             return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
760          }
761       }
762 
763       /**
764        * Collect a number of registers in a contiguous range of registers.
765        */
766       instruction *
LOAD_PAYLOAD(const dst_reg & dst,const src_reg * src,unsigned sources,unsigned header_size)767       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
768                    unsigned sources, unsigned header_size) const
769       {
770          instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
771          inst->header_size = header_size;
772          inst->size_written = header_size * REG_SIZE;
773          for (unsigned i = header_size; i < sources; i++) {
774             inst->size_written +=
775                ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
776                      REG_SIZE);
777          }
778 
779          return inst;
780       }
781 
782       instruction *
UNDEF(const dst_reg & dst)783       UNDEF(const dst_reg &dst) const
784       {
785          assert(dst.file == VGRF);
786          instruction *inst = emit(SHADER_OPCODE_UNDEF,
787                                   retype(dst, BRW_REGISTER_TYPE_UD));
788          inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;
789 
790          return inst;
791       }
792 
793       backend_shader *shader;
794 
795    private:
796       /**
797        * Workaround for negation of UD registers.  See comment in
798        * fs_generator::generate_code() for more details.
799        */
800       src_reg
fix_unsigned_negate(const src_reg & src)801       fix_unsigned_negate(const src_reg &src) const
802       {
803          if (src.type == BRW_REGISTER_TYPE_UD &&
804              src.negate) {
805             dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
806             MOV(temp, src);
807             return src_reg(temp);
808          } else {
809             return src;
810          }
811       }
812 
813       /**
814        * Workaround for source register modes not supported by the ternary
815        * instruction encoding.
816        */
817       src_reg
fix_3src_operand(const src_reg & src)818       fix_3src_operand(const src_reg &src) const
819       {
820          switch (src.file) {
821          case FIXED_GRF:
822             /* FINISHME: Could handle scalar region, other stride=1 regions */
823             if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
824                 src.width != BRW_WIDTH_8 ||
825                 src.hstride != BRW_HORIZONTAL_STRIDE_1)
826                break;
827             FALLTHROUGH;
828          case ATTR:
829          case VGRF:
830          case UNIFORM:
831          case IMM:
832             return src;
833          default:
834             break;
835          }
836 
837          dst_reg expanded = vgrf(src.type);
838          MOV(expanded, src);
839          return expanded;
840       }
841 
842       /**
843        * Workaround for source register modes not supported by the math
844        * instruction.
845        */
846       src_reg
fix_math_operand(const src_reg & src)847       fix_math_operand(const src_reg &src) const
848       {
849          /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
850           * might be able to do better by doing execsize = 1 math and then
851           * expanding that result out, but we would need to be careful with
852           * masking.
853           *
854           * Gfx6 hardware ignores source modifiers (negate and abs) on math
855           * instructions, so we also move to a temp to set those up.
856           *
857           * Gfx7 relaxes most of the above restrictions, but still can't use IMM
858           * operands to math
859           */
860          if ((shader->devinfo->ver == 6 &&
861               (src.file == IMM || src.file == UNIFORM ||
862                src.abs || src.negate)) ||
863              (shader->devinfo->ver == 7 && src.file == IMM)) {
864             const dst_reg tmp = vgrf(src.type);
865             MOV(tmp, src);
866             return tmp;
867          } else {
868             return src;
869          }
870       }
871 
872       bblock_t *block;
873       exec_node *cursor;
874 
875       unsigned _dispatch_width;
876       unsigned _group;
877       bool force_writemask_all;
878 
879       /** Debug annotation info. */
880       struct {
881          const char *str;
882          const void *ir;
883       } annotation;
884    };
885 }
886 
887 #endif
888