/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#include "elk_ir_fs.h"
#include "elk_shader.h"
#include "elk_eu.h"
#include "elk_fs.h"

namespace elk {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * elk::vec4_builder.  They cannot be fully interchangeable because
    * elk::fs_builder generates scalar code while elk::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef elk_fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef elk_fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef elk_fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(elk_fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(elk_fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(elk_fs_visitor *shader, elk_bblock_t *block, elk_fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(elk_bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
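
      /* Illustrative only: a typical way to obtain a builder that appends at
       * the end of a shader's instruction list (assumes an elk_fs_visitor
       * pointer `v`):
       *
       *    const fs_builder bld = fs_builder(v).at_end();
       */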

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with a channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
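
      /* Illustrative only: on a SIMD16 builder `bld`, a hypothetical caller
       * could emit a SIMD8 MOV acting on channels 8..15 (`dst` and `src` are
       * assumed to be suitably sized elk_fs_regs):
       *
       *    bld.group(8, 1).MOV(dst, src);
       */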

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum elk_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           unit * REG_SIZE) * unit),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
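
      /* Illustrative only: allocating a SIMD-wide unsigned temporary and
       * zero-initializing it (assumes elk_imm_ud() builds an immediate, in
       * the same way elk_imm_f() is used for floats further below):
       *
       *    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_UD);
       *    bld.MOV(tmp, elk_imm_ud(0));
       */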

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_RCP:
         case ELK_SHADER_OPCODE_RSQ:
         case ELK_SHADER_OPCODE_SQRT:
         case ELK_SHADER_OPCODE_EXP2:
         case ELK_SHADER_OPCODE_LOG2:
         case ELK_SHADER_OPCODE_SIN:
         case ELK_SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_POW:
         case ELK_SHADER_OPCODE_INT_QUOTIENT:
         case ELK_SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case ELK_OPCODE_BFE:
         case ELK_OPCODE_BFI2:
         case ELK_OPCODE_MAD:
         case ELK_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, elk_conditional_mod mod) const
      {
         assert(mod == ELK_CONDITIONAL_GE || mod == ELK_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
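
      /* Illustrative only: min and max fall out of the two permitted
       * conditional mods:
       *
       *    bld.emit_minmax(dst, a, b, ELK_CONDITIONAL_GE);   // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, ELK_CONDITIONAL_L);    // dst = min(a, b)
       */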

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(ELK_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
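
      /* Illustrative only: a possibly divergent surface index can be made
       * uniform before feeding a send, e.g.:
       *
       *    const src_reg surface = bld.emit_uniformize(index);
       */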

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum elk_opcode opcode, elk_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == ELK_REGISTER_TYPE_Q ||
              tmp.type == ELK_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case ELK_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case ELK_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == ELK_CONDITIONAL_L || mod == ELK_CONDITIONAL_GE);
               if (mod == ELK_CONDITIONAL_GE)
                  mod = ELK_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, ELK_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, ELK_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               elk_reg_type type32 = elk_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, ELK_REGISTER_TYPE_UD),
                                  retype(right_low, ELK_REGISTER_TYPE_UD), mod);
               set_predicate(ELK_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 ELK_CONDITIONAL_EQ));
               set_predicate_inv(ELK_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum elk_opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, elk_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
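
      /* Illustrative only: an inclusive add scan (prefix sum) over the whole
       * dispatch, assuming ELK_CONDITIONAL_NONE denotes the absence of a
       * conditional mod and the input values already live in `tmp`:
       *
       *    bld.emit_scan(ELK_OPCODE_ADD, tmp, bld.dispatch_width(),
       *                  ELK_CONDITIONAL_NONE);
       */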

      instruction *
      emit_undef_for_dst(const instruction *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(old_inst->dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(ELK_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(ELK_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1

      instruction *
      F32TO16(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_HF);
         assert(src.type == ELK_REGISTER_TYPE_F);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F32TO16,
                        retype(dst, ELK_REGISTER_TYPE_W), src);
         }
      }

      instruction *
      F16TO32(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_F);
         assert(src.type == ELK_REGISTER_TYPE_HF);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F16TO32,
                        dst, retype(src, ELK_REGISTER_TYPE_W));
         }
      }
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(elk_predicate predicate) const
      {
         return set_predicate(predicate, emit(ELK_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, elk_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == ELK_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(ELK_OPCODE_CSEL,
                                 retype(dst, ELK_REGISTER_TYPE_F),
                                 retype(src0, ELK_REGISTER_TYPE_F),
                                 retype(src1, ELK_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(ELK_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), elk_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
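
      /* Illustrative only: either path above computes the same blend, so a
       * caller can write generation-independent code such as:
       *
       *    bld.LRP(dst, x, y, a);   // dst = x * (1 - a) + y * a
       */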

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(ELK_SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * type_sz(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }
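
      /* Illustrative only: gathering two SIMD-wide components `a` and `b`
       * (assumed src_regs of the same type) into one contiguous payload with
       * no header:
       *
       *    const src_reg comps[] = { a, b };
       *    const dst_reg payload = bld.vgrf(a.type, 2);
       *    bld.LOAD_PAYLOAD(payload, comps, 2, 0);
       */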

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      elk_fs_visitor *shader;

      elk_fs_inst *BREAK()    { return emit(ELK_OPCODE_BREAK); }
      elk_fs_inst *DO()       { return emit(ELK_OPCODE_DO); }
      elk_fs_inst *ENDIF()    { return emit(ELK_OPCODE_ENDIF); }
      elk_fs_inst *NOP()      { return emit(ELK_OPCODE_NOP); }
      elk_fs_inst *WHILE()    { return emit(ELK_OPCODE_WHILE); }
      elk_fs_inst *CONTINUE() { return emit(ELK_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * elk_fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == ELK_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(ELK_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != ELK_VERTICAL_STRIDE_8 ||
                src.width != ELK_WIDTH_8 ||
                src.hstride != ELK_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math instructions.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      elk_bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

static inline elk_fs_reg
offset(const elk_fs_reg &reg, const elk::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}