/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"
#include "brw_context.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }
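
      /* Illustrative usage sketch (not from the original header): assuming
       * `v` is a backend_shader subclass such as an fs_visitor and `inst` is
       * an fs_inst located in basic block `blk`, a builder can be constructed
       * either for the whole program or at a specific instruction:
       *
       *    const fs_builder bld(&v, 16);           // SIMD16 program builder
       *    const fs_builder ibld(&v, blk, inst);   // inherits inst's controls
       */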

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
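
      /* Illustrative sketch only: given an existing builder `bld` and
       * caller-provided `blk`, `inst`, `dst` and `src`, instructions can be
       * appended at the end of the shader or inserted before a known
       * instruction:
       *
       *    bld.at_end().MOV(dst, src);
       *    bld.at(blk, inst).MOV(dst, src);
       */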

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }
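
      /* Illustrative sketch only: the modifiers above return new builders by
       * value, so they compose by chaining.  E.g. a SIMD8 builder for the
       * second half of a SIMD16 program, with execution masking disabled and
       * a debug label, could be obtained as:
       *
       *    const fs_builder ubld = bld.half(1).exec_all().annotate("clear");
       */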

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
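
      /* Illustrative sketch only: a typical pattern is to allocate a
       * temporary with vgrf() and then write it, e.g. zeroing a float
       * temporary with the MOV() helper defined further below:
       *
       *    const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.MOV(tmp, brw_imm_f(0.0f));
       */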

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         assert(shader->stage != MESA_SHADER_FRAGMENT ||
                group() + dispatch_width() <= 16);
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
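
      /* Illustrative sketch only: with caller-provided float registers `a`,
       * `b` and `dst`, the two supported conditional mods give the maximum
       * and minimum respectively:
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);    // dst = min(a, b)
       */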

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
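
      /* Illustrative sketch only: emit_uniformize() is typically used to turn
       * a potentially divergent value, e.g. a dynamically indexed surface or
       * sampler index provided by the caller as `index`, into one that is
       * uniform across the thread before it is consumed by a send:
       *
       *    const src_reg surface = bld.emit_uniformize(index);
       */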

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
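
      /* Illustrative sketch only: the helpers generated above map one-to-one
       * onto the corresponding hardware opcodes, e.g. with caller-provided
       * registers:
       *
       *    bld.ADD(dst, a, b);       // dst = a + b
       *    bld.MUL(dst, a, b);       // dst = a * b
       *    bld.MAD(dst, a, b, c);    // dst = a + b * c
       *
       * Each helper returns the emitted instruction so further fields can be
       * set on it by the caller.
       */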

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
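
      /* Illustrative sketch only: a common compare-and-select sequence first
       * writes the flag register with CMP() and then predicates a SEL on it,
       * using set_predicate() and BRW_PREDICATE_NORMAL from the surrounding
       * IR headers and caller-provided registers:
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_GE);
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
       */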

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
      /**
       * Collect a number of registers in a contiguous range.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
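
      /* Illustrative sketch only: assembling a two-register message payload
       * with no header from caller-provided registers `r0` and `r1`:
       *
       *    const src_reg srcs[] = { r0, r1 };
       *    const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
       *    bld.LOAD_PAYLOAD(payload, srcs, 2, 0);
       */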

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif