/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

25 #ifndef BRW_FS_BUILDER_H
26 #define BRW_FS_BUILDER_H
27 
28 #include "brw_ir_fs.h"
29 #include "brw_shader.h"
30 #include "brw_eu.h"
31 #include "brw_fs.h"
32 
33 namespace brw {
34    /**
35     * Toolbox to assemble an FS IR program out of individual instructions.
36     *
37     * This object is meant to have an interface consistent with
38     * brw::vec4_builder.  They cannot be fully interchangeable because
39     * brw::fs_builder generates scalar code while brw::vec4_builder generates
40     * vector code.
41     */
42    class fs_builder {
43    public:
44       /** Type used in this IR to represent a source of an instruction. */
45       typedef fs_reg src_reg;
46 
47       /** Type used in this IR to represent the destination of an instruction. */
48       typedef fs_reg dst_reg;
49 
50       /** Type used in this IR to represent an instruction. */
51       typedef fs_inst instruction;
52 
53       /**
54        * Construct an fs_builder that inserts instructions into \p shader.
55        * \p dispatch_width gives the native execution width of the program.
56        */
fs_builder(fs_visitor * shader,unsigned dispatch_width)57       fs_builder(fs_visitor *shader,
58                  unsigned dispatch_width) :
59          shader(shader), block(NULL), cursor(NULL),
60          _dispatch_width(dispatch_width),
61          _group(0),
62          force_writemask_all(false),
63          annotation()
64       {
65       }
66 
fs_builder(fs_visitor * s)67       explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}
68 
69       /**
70        * Construct an fs_builder that inserts instructions into \p shader
71        * before instruction \p inst in basic block \p block.  The default
72        * execution controls and debug annotation are initialized from the
73        * instruction passed as argument.
74        */
fs_builder(fs_visitor * shader,bblock_t * block,fs_inst * inst)75       fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
76          shader(shader), block(block), cursor(inst),
77          _dispatch_width(inst->exec_size),
78          _group(inst->group),
79          force_writemask_all(inst->force_writemask_all)
80       {
81          annotation.str = inst->annotation;
82          annotation.ir = inst->ir;
83       }
84 
85       /**
86        * Construct an fs_builder that inserts instructions before \p cursor in
87        * basic block \p block, inheriting other code generation parameters
88        * from this.
89        */
90       fs_builder
at(bblock_t * block,exec_node * cursor)91       at(bblock_t *block, exec_node *cursor) const
92       {
93          fs_builder bld = *this;
94          bld.block = block;
95          bld.cursor = cursor;
96          return bld;
97       }
98 
99       /**
100        * Construct an fs_builder appending instructions at the end of the
101        * instruction list of the shader, inheriting other code generation
102        * parameters from this.
103        */
104       fs_builder
at_end()105       at_end() const
106       {
107          return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
108       }
109 
110       /**
111        * Construct a builder specifying the default SIMD width and group of
112        * channel enable signals, inheriting other code generation parameters
113        * from this.
114        *
115        * \p n gives the default SIMD width, \p i gives the slot group used for
116        * predication and control flow masking in multiples of \p n channels.
117        */
118       fs_builder
group(unsigned n,unsigned i)119       group(unsigned n, unsigned i) const
120       {
121          fs_builder bld = *this;
122 
123          if (n <= dispatch_width() && i < dispatch_width() / n) {
124             bld._group += i * n;
125          } else {
126             /* The requested channel group isn't a subset of the channel group
127              * of this builder, which means that the resulting instructions
128              * would use (potentially undefined) channel enable signals not
129              * specified by the parent builder.  That's only valid if the
130              * instruction doesn't have per-channel semantics, in which case
131              * we should clear off the default group index in order to prevent
132              * emitting instructions with channel group not aligned to their
133              * own execution size.
134              */
135             assert(force_writemask_all);
136             bld._group = 0;
137          }
138 
139          bld._dispatch_width = n;
140          return bld;
141       }
142 
143       /**
144        * Alias for group() with width equal to eight.
145        */
146       fs_builder
quarter(unsigned i)147       quarter(unsigned i) const
148       {
149          return group(8, i);
150       }
151 
152       /**
153        * Construct a builder with per-channel control flow execution masking
154        * disabled if \p b is true.  If control flow execution masking is
155        * already disabled this has no effect.
156        */
157       fs_builder
158       exec_all(bool b = true) const
159       {
160          fs_builder bld = *this;
161          if (b)
162             bld.force_writemask_all = true;
163          return bld;
164       }
165 
166       /**
167        * Construct a builder with the given debug annotation info.
168        */
169       fs_builder
170       annotate(const char *str, const void *ir = NULL) const
171       {
172          fs_builder bld = *this;
173          bld.annotation.str = str;
174          bld.annotation.ir = ir;
175          return bld;
176       }
177 
178       /**
179        * Get the SIMD width in use.
180        */
181       unsigned
dispatch_width()182       dispatch_width() const
183       {
184          return _dispatch_width;
185       }
186 
187       /**
188        * Get the channel group in use.
189        */
190       unsigned
group()191       group() const
192       {
193          return _group;
194       }
195 
196       /**
197        * Allocate a virtual register of natural vector size (one for this IR)
198        * and SIMD width.  \p n gives the amount of space to allocate in
199        * dispatch_width units (which is just enough space for one logical
200        * component in this IR).
201        */
202       dst_reg
203       vgrf(enum brw_reg_type type, unsigned n = 1) const
204       {
205          const unsigned unit = reg_unit(shader->devinfo);
206          assert(dispatch_width() <= 32);
207 
208          if (n > 0)
209             return dst_reg(VGRF, shader->alloc.allocate(
210                               DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
211                                            unit * REG_SIZE) * unit),
212                            type);
213          else
214             return retype(null_reg_ud(), type);
215       }
216 
217       /**
218        * Create a null register of floating type.
219        */
220       dst_reg
null_reg_f()221       null_reg_f() const
222       {
223          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
224       }
225 
226       dst_reg
null_reg_df()227       null_reg_df() const
228       {
229          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
230       }
231 
232       /**
233        * Create a null register of signed integer type.
234        */
235       dst_reg
null_reg_d()236       null_reg_d() const
237       {
238          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
239       }
240 
241       /**
242        * Create a null register of unsigned integer type.
243        */
244       dst_reg
null_reg_ud()245       null_reg_ud() const
246       {
247          return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
248       }
249 
250       /**
251        * Insert an instruction into the program.
252        */
253       instruction *
emit(const instruction & inst)254       emit(const instruction &inst) const
255       {
256          return emit(new(shader->mem_ctx) instruction(inst));
257       }
258 
259       /**
260        * Create and insert a nullary control instruction into the program.
261        */
262       instruction *
emit(enum opcode opcode)263       emit(enum opcode opcode) const
264       {
265          return emit(instruction(opcode, dispatch_width()));
266       }
267 
268       /**
269        * Create and insert a nullary instruction into the program.
270        */
271       instruction *
emit(enum opcode opcode,const dst_reg & dst)272       emit(enum opcode opcode, const dst_reg &dst) const
273       {
274          return emit(instruction(opcode, dispatch_width(), dst));
275       }
276 
277       /**
278        * Create and insert a unary instruction into the program.
279        */
280       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0)281       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
282       {
283          return emit(instruction(opcode, dispatch_width(), dst, src0));
284       }
285 
286       /**
287        * Create and insert a binary instruction into the program.
288        */
289       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1)290       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
291            const src_reg &src1) const
292       {
293          return emit(instruction(opcode, dispatch_width(), dst,
294                                  src0, src1));
295       }
296 
297       /**
298        * Create and insert a ternary instruction into the program.
299        */
300       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2)301       emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
302            const src_reg &src1, const src_reg &src2) const
303       {
304          switch (opcode) {
305          case BRW_OPCODE_BFE:
306          case BRW_OPCODE_BFI2:
307          case BRW_OPCODE_MAD:
308          case BRW_OPCODE_LRP:
309             return emit(instruction(opcode, dispatch_width(), dst,
310                                     fix_3src_operand(src0),
311                                     fix_3src_operand(src1),
312                                     fix_3src_operand(src2)));
313 
314          default:
315             return emit(instruction(opcode, dispatch_width(), dst,
316                                     src0, src1, src2));
317          }
318       }
319 
320       /**
321        * Create and insert an instruction with a variable number of sources
322        * into the program.
323        */
324       instruction *
emit(enum opcode opcode,const dst_reg & dst,const src_reg srcs[],unsigned n)325       emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
326            unsigned n) const
327       {
328          /* Use the emit() methods for specific operand counts to ensure that
329           * opcode-specific operand fixups occur.
330           */
331          if (n == 2) {
332             return emit(opcode, dst, srcs[0], srcs[1]);
333          } else if (n == 3) {
334             return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
335          } else {
336             return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
337          }
338       }
339 
340       /**
341        * Insert a preallocated instruction into the program.
342        */
343       instruction *
emit(instruction * inst)344       emit(instruction *inst) const
345       {
346          assert(inst->exec_size <= 32);
347          assert(inst->exec_size == dispatch_width() ||
348                 force_writemask_all);
349 
350          inst->group = _group;
351          inst->force_writemask_all = force_writemask_all;
352          inst->annotation = annotation.str;
353          inst->ir = annotation.ir;
354 
355          if (block)
356             static_cast<instruction *>(cursor)->insert_before(block, inst);
357          else
358             cursor->insert_before(inst);
359 
360          return inst;
361       }
362 
363       /**
364        * Select \p src0 if the comparison of both sources with the given
365        * conditional mod evaluates to true, otherwise select \p src1.
366        *
367        * Generally useful to get the minimum or maximum of two values.
368        */
369       instruction *
emit_minmax(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod mod)370       emit_minmax(const dst_reg &dst, const src_reg &src0,
371                   const src_reg &src1, brw_conditional_mod mod) const
372       {
373          assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
374 
375          /* In some cases we can't have bytes as operand for src1, so use the
376           * same type for both operand.
377           */
378          return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
379                                      fix_unsigned_negate(src1)));
380       }
381 
382       /**
383        * Copy any live channel from \p src to the first channel of the result.
384        */
385       src_reg
emit_uniformize(const src_reg & src)386       emit_uniformize(const src_reg &src) const
387       {
388          /* FIXME: We use a vector chan_index and dst to allow constant and
389           * copy propagration to move result all the way into the consuming
390           * instruction (typically a surface index or sampler index for a
391           * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
392           * dispatch. Once we teach const/copy propagation about scalars we
393           * should go back to scalar destinations here.
394           */
395          const fs_builder ubld = exec_all();
396          const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
397          const dst_reg dst = vgrf(src.type);
398 
399          ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
400          ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
401 
402          return src_reg(component(dst, 0));
403       }
404 
405       src_reg
move_to_vgrf(const src_reg & src,unsigned num_components)406       move_to_vgrf(const src_reg &src, unsigned num_components) const
407       {
408          src_reg *const src_comps = new src_reg[num_components];
409          for (unsigned i = 0; i < num_components; i++)
410             src_comps[i] = offset(src, dispatch_width(), i);
411 
412          const dst_reg dst = vgrf(src.type, num_components);
413          LOAD_PAYLOAD(dst, src_comps, num_components, 0);
414 
415          delete[] src_comps;
416 
417          return src_reg(dst);
418       }
419 
420       void
emit_scan_step(enum opcode opcode,brw_conditional_mod mod,const dst_reg & tmp,unsigned left_offset,unsigned left_stride,unsigned right_offset,unsigned right_stride)421       emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
422                      const dst_reg &tmp,
423                      unsigned left_offset, unsigned left_stride,
424                      unsigned right_offset, unsigned right_stride) const
425       {
426          dst_reg left, right;
427          left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
428          right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
429          if ((tmp.type == BRW_REGISTER_TYPE_Q ||
430               tmp.type == BRW_REGISTER_TYPE_UQ) &&
431              !shader->devinfo->has_64bit_int) {
432             switch (opcode) {
433             case BRW_OPCODE_MUL:
434                /* This will get lowered by integer MUL lowering */
435                set_condmod(mod, emit(opcode, right, left, right));
436                break;
437 
438             case BRW_OPCODE_SEL: {
439                /* In order for the comparisons to work out right, we need our
440                 * comparisons to be strict.
441                 */
442                assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
443                if (mod == BRW_CONDITIONAL_GE)
444                   mod = BRW_CONDITIONAL_G;
445 
446                /* We treat the bottom 32 bits as unsigned regardless of
447                 * whether or not the integer as a whole is signed.
448                 */
449                dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
450                dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);
451 
452                /* The upper bits get the same sign as the 64-bit type */
453                brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
454                dst_reg right_high = subscript(right, type32, 1);
455                dst_reg left_high = subscript(left, type32, 1);
456 
457                /* Build up our comparison:
458                 *
459                 *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
460                 */
461                CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
462                                   retype(right_low, BRW_REGISTER_TYPE_UD), mod);
463                set_predicate(BRW_PREDICATE_NORMAL,
464                              CMP(null_reg_ud(), left_high, right_high,
465                                  BRW_CONDITIONAL_EQ));
466                set_predicate_inv(BRW_PREDICATE_NORMAL, true,
467                                  CMP(null_reg_ud(), left_high, right_high, mod));
468 
469                /* We could use selects here or we could use predicated MOVs
470                 * because the destination and second source (if it were a SEL)
471                 * are the same.
472                 */
473                set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
474                set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
475                break;
476             }
477 
478             default:
479                unreachable("Unsupported 64-bit scan op");
480             }
481          } else {
482             set_condmod(mod, emit(opcode, right, left, right));
483          }
484       }
485 
486       void
emit_scan(enum opcode opcode,const dst_reg & tmp,unsigned cluster_size,brw_conditional_mod mod)487       emit_scan(enum opcode opcode, const dst_reg &tmp,
488                 unsigned cluster_size, brw_conditional_mod mod) const
489       {
490          assert(dispatch_width() >= 8);
491 
492          /* The instruction splitting code isn't advanced enough to split
493           * these so we need to handle that ourselves.
494           */
495          if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
496             const unsigned half_width = dispatch_width() / 2;
497             const fs_builder ubld = exec_all().group(half_width, 0);
498             dst_reg left = tmp;
499             dst_reg right = horiz_offset(tmp, half_width);
500             ubld.emit_scan(opcode, left, cluster_size, mod);
501             ubld.emit_scan(opcode, right, cluster_size, mod);
502             if (cluster_size > half_width) {
503                ubld.emit_scan_step(opcode, mod, tmp,
504                                    half_width - 1, 0, half_width, 1);
505             }
506             return;
507          }
508 
509          if (cluster_size > 1) {
510             const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
511             ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
512          }
513 
514          if (cluster_size > 2) {
515             if (type_sz(tmp.type) <= 4) {
516                const fs_builder ubld =
517                   exec_all().group(dispatch_width() / 4, 0);
518                ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
519                ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
520             } else {
521                /* For 64-bit types, we have to do things differently because
522                 * the code above would land us with destination strides that
523                 * the hardware can't handle.  Fortunately, we'll only be
524                 * 8-wide in that case and it's the same number of
525                 * instructions.
526                 */
527                const fs_builder ubld = exec_all().group(2, 0);
528                for (unsigned i = 0; i < dispatch_width(); i += 4)
529                   ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
530             }
531          }
532 
533          for (unsigned i = 4;
534               i < MIN2(cluster_size, dispatch_width());
535               i *= 2) {
536             const fs_builder ubld = exec_all().group(i, 0);
537             ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);
538 
539             if (dispatch_width() > i * 2)
540                ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);
541 
542             if (dispatch_width() > i * 4) {
543                ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
544                ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
545             }
546          }
547       }
548 
549       instruction *
emit_undef_for_dst(const instruction * old_inst)550       emit_undef_for_dst(const instruction *old_inst) const
551       {
552          assert(old_inst->dst.file == VGRF);
553          instruction *inst = emit(SHADER_OPCODE_UNDEF,
554                                   retype(old_inst->dst, BRW_REGISTER_TYPE_UD));
555          inst->size_written = old_inst->size_written;
556 
557          return inst;
558       }
559 
560       /**
561        * Assorted arithmetic ops.
562        * @{
563        */
564 #define ALU1(op)                                        \
565       instruction *                                     \
566       op(const dst_reg &dst, const src_reg &src0) const \
567       {                                                 \
568          return emit(BRW_OPCODE_##op, dst, src0);       \
569       }
570 
571 #define ALU2(op)                                                        \
572       instruction *                                                     \
573       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
574       {                                                                 \
575          return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
576       }
577 
578 #define ALU2_ACC(op)                                                    \
579       instruction *                                                     \
580       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
581       {                                                                 \
582          instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
583          inst->writes_accumulator = true;                               \
584          return inst;                                                   \
585       }
586 
587 #define ALU3(op)                                                        \
588       instruction *                                                     \
589       op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
590          const src_reg &src2) const                                     \
591       {                                                                 \
592          return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
593       }
594 
595       ALU2(ADD)
ALU3(ADD3)596       ALU3(ADD3)
597       ALU2_ACC(ADDC)
598       ALU2(AND)
599       ALU2(ASR)
600       ALU2(AVG)
601       ALU3(BFE)
602       ALU2(BFI1)
603       ALU3(BFI2)
604       ALU1(BFREV)
605       ALU1(CBIT)
606       ALU2(DP2)
607       ALU2(DP3)
608       ALU2(DP4)
609       ALU2(DPH)
610       ALU1(FBH)
611       ALU1(FBL)
612       ALU1(FRC)
613       ALU3(DP4A)
614       ALU2(LINE)
615       ALU1(LZD)
616       ALU2(MAC)
617       ALU2_ACC(MACH)
618       ALU3(MAD)
619       ALU1(MOV)
620       ALU2(MUL)
621       ALU1(NOT)
622       ALU2(OR)
623       ALU2(PLN)
624       ALU1(RNDD)
625       ALU1(RNDE)
626       ALU1(RNDU)
627       ALU1(RNDZ)
628       ALU2(ROL)
629       ALU2(ROR)
630       ALU2(SAD2)
631       ALU2_ACC(SADA2)
632       ALU2(SEL)
633       ALU2(SHL)
634       ALU2(SHR)
635       ALU2_ACC(SUBB)
636       ALU2(XOR)
637 
638 #undef ALU3
639 #undef ALU2_ACC
640 #undef ALU2
641 #undef ALU1
642       /** @} */
643 
644       /**
645        * CMP: Sets the low bit of the destination channels with the result
646        * of the comparison, while the upper bits are undefined, and updates
647        * the flag register with the packed 16 bits of the result.
648        */
649       instruction *
650       CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
651           brw_conditional_mod condition) const
652       {
653          /* Take the instruction:
654           *
655           * CMP null<d> src0<f> src1<f>
656           *
657           * Original gfx4 does type conversion to the destination type
658           * before comparison, producing garbage results for floating
659           * point comparisons.
660           *
661           * The destination type doesn't matter on newer generations,
662           * so we set the type to match src0 so we can compact the
663           * instruction.
664           */
665          return set_condmod(condition,
666                             emit(BRW_OPCODE_CMP, retype(dst, src0.type),
667                                  fix_unsigned_negate(src0),
668                                  fix_unsigned_negate(src1)));
669       }
670 
671       /**
672        * CMPN: Behaves like CMP, but produces true if src1 is NaN.
673        */
674       instruction *
CMPN(const dst_reg & dst,const src_reg & src0,const src_reg & src1,brw_conditional_mod condition)675       CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
676            brw_conditional_mod condition) const
677       {
678          /* Take the instruction:
679           *
680           * CMP null<d> src0<f> src1<f>
681           *
682           * Original gfx4 does type conversion to the destination type
683           * before comparison, producing garbage results for floating
684           * point comparisons.
685           *
686           * The destination type doesn't matter on newer generations,
687           * so we set the type to match src0 so we can compact the
688           * instruction.
689           */
690          return set_condmod(condition,
691                             emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
692                                  fix_unsigned_negate(src0),
693                                  fix_unsigned_negate(src1)));
694       }
695 
696       /**
697        * Gfx4 predicated IF.
698        */
699       instruction *
IF(brw_predicate predicate)700       IF(brw_predicate predicate) const
701       {
702          return set_predicate(predicate, emit(BRW_OPCODE_IF));
703       }
704 
705       /**
706        * CSEL: dst = src2 <op> 0.0f ? src0 : src1
707        */
708       instruction *
CSEL(const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2,brw_conditional_mod condition)709       CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
710            const src_reg &src2, brw_conditional_mod condition) const
711       {
712          /* CSEL only operates on floats, so we can't do integer </<=/>=/>
713           * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
714           * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
715           */
716          assert(src2.type == BRW_REGISTER_TYPE_F);
717 
718          return set_condmod(condition,
719                             emit(BRW_OPCODE_CSEL,
720                                  retype(dst, BRW_REGISTER_TYPE_F),
721                                  retype(src0, BRW_REGISTER_TYPE_F),
722                                  retype(src1, BRW_REGISTER_TYPE_F),
723                                  src2));
724       }
725 
726       /**
727        * Emit a linear interpolation instruction.
728        */
729       instruction *
LRP(const dst_reg & dst,const src_reg & x,const src_reg & y,const src_reg & a)730       LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
731           const src_reg &a) const
732       {
733          if (shader->devinfo->ver <= 10) {
734             /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
735              * we need to reorder the operands.
736              */
737             return emit(BRW_OPCODE_LRP, dst, a, y, x);
738 
739          } else {
740             /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
741             const dst_reg y_times_a = vgrf(dst.type);
742             const dst_reg one_minus_a = vgrf(dst.type);
743             const dst_reg x_times_one_minus_a = vgrf(dst.type);
744 
745             MUL(y_times_a, y, a);
746             ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
747             MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
748             return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
749          }
750       }
751 
752       /**
753        * Collect a number of registers in a contiguous range of registers.
754        */
755       instruction *
LOAD_PAYLOAD(const dst_reg & dst,const src_reg * src,unsigned sources,unsigned header_size)756       LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
757                    unsigned sources, unsigned header_size) const
758       {
759          instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
760          inst->header_size = header_size;
761          inst->size_written = header_size * REG_SIZE;
762          for (unsigned i = header_size; i < sources; i++) {
763             inst->size_written += dispatch_width() * type_sz(src[i].type) *
764                                   dst.stride;
765          }
766 
767          return inst;
768       }
769 
770       instruction *
UNDEF(const dst_reg & dst)771       UNDEF(const dst_reg &dst) const
772       {
773          assert(dst.file == VGRF);
774          assert(dst.offset % REG_SIZE == 0);
775          instruction *inst = emit(SHADER_OPCODE_UNDEF,
776                                   retype(dst, BRW_REGISTER_TYPE_UD));
777          inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;
778 
779          return inst;
780       }
781 
782       instruction *
DPAS(const dst_reg & dst,const src_reg & src0,const src_reg & src1,const src_reg & src2,unsigned sdepth,unsigned rcount)783       DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2,
784            unsigned sdepth, unsigned rcount) const
785       {
786          assert(_dispatch_width == 8);
787          assert(sdepth == 8);
788          assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);
789 
790          instruction *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
791          inst->sdepth = sdepth;
792          inst->rcount = rcount;
793 
794          if (dst.type == BRW_REGISTER_TYPE_HF) {
795             inst->size_written = rcount * REG_SIZE / 2;
796          } else {
797             inst->size_written = rcount * REG_SIZE;
798          }
799 
800          return inst;
801       }
802 
803       fs_visitor *shader;
804 
BREAK()805       fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
DO()806       fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
ENDIF()807       fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
NOP()808       fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
WHILE()809       fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
CONTINUE()810       fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }
811 
812    private:
813       /**
814        * Workaround for negation of UD registers.  See comment in
815        * fs_generator::generate_code() for more details.
816        */
817       src_reg
fix_unsigned_negate(const src_reg & src)818       fix_unsigned_negate(const src_reg &src) const
819       {
820          if (src.type == BRW_REGISTER_TYPE_UD &&
821              src.negate) {
822             dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
823             MOV(temp, src);
824             return src_reg(temp);
825          } else {
826             return src;
827          }
828       }
829 
830       /**
831        * Workaround for source register modes not supported by the ternary
832        * instruction encoding.
833        */
834       src_reg
fix_3src_operand(const src_reg & src)835       fix_3src_operand(const src_reg &src) const
836       {
837          switch (src.file) {
838          case FIXED_GRF:
839             /* FINISHME: Could handle scalar region, other stride=1 regions */
840             if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
841                 src.width != BRW_WIDTH_8 ||
842                 src.hstride != BRW_HORIZONTAL_STRIDE_1)
843                break;
844             FALLTHROUGH;
845          case ATTR:
846          case VGRF:
847          case UNIFORM:
848          case IMM:
849             return src;
850          default:
851             break;
852          }
853 
854          dst_reg expanded = vgrf(src.type);
855          MOV(expanded, src);
856          return expanded;
857       }
858 
859       bblock_t *block;
860       exec_node *cursor;
861 
862       unsigned _dispatch_width;
863       unsigned _group;
864       bool force_writemask_all;
865 
866       /** Debug annotation info. */
867       struct {
868          const char *str;
869          const void *ir;
870       } annotation;
871    };
872 }
873 
874 static inline fs_reg
offset(const fs_reg & reg,const brw::fs_builder & bld,unsigned delta)875 offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
876 {
877    return offset(reg, bld.dispatch_width(), delta);
878 }
879 
880 #endif
881