/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#include "brw_ir_fs.h"
#include "brw_eu.h"
#include "brw_fs.h"

static inline brw_reg offset(const brw_reg &, const brw::fs_builder &,
                             unsigned);

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    */
   class fs_builder {
   public:
      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
#ifndef NDEBUG
         annotation.str = inst->annotation;
#else
         annotation.str = NULL;
#endif
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
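
      /* A minimal usage sketch (illustrative, not part of the original
       * header): optimization passes typically rebuild a builder around an
       * existing instruction and emit replacement code before it, e.g.
       *
       *    const fs_builder ibld(v, block, inst);   // inherits inst's controls
       *    ibld.MOV(inst->dst, some_src);           // emitted before inst
       *
       * where `v`, `block`, `inst` and `some_src` are placeholders for a
       * visitor, its basic block, an instruction, and a source register.
       */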

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
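
      /* Illustrative example (placeholder registers): on a SIMD16 builder
       * with group 0, group(8, 1) yields a SIMD8 builder whose instructions
       * act on channels 8..15 only:
       *
       *    const fs_builder ubld = bld.group(8, 1);
       *    ubld.MOV(dst, src);   // _group == 8, exec_size == 8
       */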

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
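
      /* For instance (illustrative), to initialize a register regardless of
       * which channels are currently enabled by control flow:
       *
       *    bld.exec_all().MOV(tmp, brw_imm_ud(0));
       *
       * where `tmp` is a placeholder destination.
       */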

      /**
       * Construct a builder for SIMD8-as-scalar.
       */
      fs_builder
      scalar_group() const
      {
         return exec_all().group(8 * reg_unit(shader->devinfo), 0);
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      brw_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return brw_vgrf(shader->alloc.allocate(
                               DIV_ROUND_UP(n * brw_type_size_bytes(type) * dispatch_width(),
                                            unit * REG_SIZE) * unit),
                            type);
         else
            return retype(null_reg_ud(), type);
      }
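
      /* Worked example of the size computation above (assuming REG_SIZE == 32
       * bytes and reg_unit == 1): one 32-bit component at SIMD16 needs
       * 1 * 4 * 16 = 64 bytes, i.e. DIV_ROUND_UP(64, 32) = 2 GRFs.
       */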

      brw_reg
      vaddr(enum brw_reg_type type, unsigned subnr) const
      {
         brw_reg addr = brw_address_reg(subnr);
         addr.nr = shader->next_address_register_nr++;
         return retype(addr, type);
      }

      /**
       * Create a null register of floating type.
       */
      brw_reg
      null_reg_f() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_F));
      }

      brw_reg
      null_reg_df() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      brw_reg
      null_reg_d() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      brw_reg
      null_reg_ud() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      fs_inst *
      emit(const fs_inst &inst) const
      {
         return emit(new(shader->mem_ctx) fs_inst(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode) const
      {
         return emit(fs_inst(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst, src0));
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst,
                             src0, src1));
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1, const brw_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                fix_3src_operand(src0),
                                fix_3src_operand(src1),
                                fix_3src_operand(src2)));

         default:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(fs_inst(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      fs_inst *
      emit(fs_inst *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
#ifndef NDEBUG
         inst->annotation = annotation.str;
#endif

         if (block)
            static_cast<fs_inst *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      fs_inst *
      emit_minmax(const brw_reg &dst, const brw_reg &src0,
                  const brw_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
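
      /* E.g. (illustrative): emit_minmax(dst, a, b, BRW_CONDITIONAL_GE)
       * computes dst = max(a, b), while BRW_CONDITIONAL_L computes the
       * minimum, by applying the conditional mod to a SEL instruction.
       */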

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      brw_reg
      emit_uniformize(const brw_reg &src) const
      {
         /* Trivial: skip unnecessary work and retain IMM */
         if (src.file == IMM)
            return src;

         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder xbld = scalar_group();
         const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD);

         /* FIND_LIVE_CHANNEL will only write a single component after
          * lowering. Munge size_written here to match the allocated size of
          * chan_index.
          */
         exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
            ->size_written = chan_index.component_size(xbld.dispatch_width());

         return BROADCAST(src, component(chan_index, 0));
      }
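
      /* Typical use (illustrative): a divergent surface handle must be made
       * uniform before it can feed a scalar message descriptor, e.g.
       *
       *    const brw_reg surface = bld.emit_uniformize(divergent_surface);
       *
       * where `divergent_surface` is a placeholder per-channel value.
       */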

      brw_reg
      move_to_vgrf(const brw_reg &src, unsigned num_components) const
      {
         brw_reg *const src_comps = new brw_reg[num_components];

         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, *this, i);

         const brw_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return brw_reg(dst);
      }

      fs_inst *
      emit_undef_for_dst(const fs_inst *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                              retype(old_inst->dst, BRW_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define _ALU1(prefix, op)                                \
      fs_inst *                                          \
      op(const brw_reg &dst, const brw_reg &src0) const  \
      {                                                  \
         assert(_dispatch_width == 1 ||                  \
                (dst.file >= VGRF && dst.stride != 0) || \
                (dst.file < VGRF && dst.hstride != 0));  \
         return emit(prefix##op, dst, src0);             \
      }                                                  \
      brw_reg                                            \
      op(const brw_reg &src0, fs_inst **out = NULL) const \
      {                                                  \
         fs_inst *inst = op(vgrf(src0.type), src0);      \
         if (out) *out = inst;                           \
         return inst->dst;                               \
      }
#define ALU1(op) _ALU1(BRW_OPCODE_, op)
#define VIRT1(op) _ALU1(SHADER_OPCODE_, op)

      fs_inst *
      alu2(opcode op, const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return emit(op, dst, src0, src1);
      }
      brw_reg
      alu2(opcode op, const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         enum brw_reg_type inferred_dst_type =
            brw_type_larger_of(src0.type, src1.type);
         fs_inst *inst = alu2(op, vgrf(inferred_dst_type), src0, src1);
         if (out) *out = inst;
         return inst->dst;
      }

#define _ALU2(prefix, op)                                                    \
      fs_inst *                                                              \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const \
      {                                                                      \
         return alu2(prefix##op, dst, src0, src1);                           \
      }                                                                      \
      brw_reg                                                                \
      op(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const \
      {                                                                      \
         return alu2(prefix##op, src0, src1, out);                           \
      }
#define ALU2(op) _ALU2(BRW_OPCODE_, op)
#define VIRT2(op) _ALU2(SHADER_OPCODE_, op)

#define ALU2_ACC(op)                                                         \
      fs_inst *                                                              \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const \
      {                                                                      \
         fs_inst *inst = emit(BRW_OPCODE_##op, dst, src0, src1);             \
         inst->writes_accumulator = true;                                    \
         return inst;                                                        \
      }

#define ALU3(op)                                                        \
      fs_inst *                                                         \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,  \
         const brw_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }                                                                 \
      brw_reg                                                           \
      op(const brw_reg &src0, const brw_reg &src1, const brw_reg &src2, \
         fs_inst **out = NULL) const                                    \
      {                                                                 \
         enum brw_reg_type inferred_dst_type =                          \
            brw_type_larger_of(brw_type_larger_of(src0.type, src1.type),\
                               src2.type);                              \
         fs_inst *inst = op(vgrf(inferred_dst_type), src0, src1, src2); \
         if (out) *out = inst;                                          \
         return inst->dst;                                              \
      }

      ALU3(ADD3)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU3(DP4A)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

      VIRT1(RCP)
      VIRT1(RSQ)
      VIRT1(SQRT)
      VIRT1(EXP2)
      VIRT1(LOG2)
      VIRT2(POW)
      VIRT2(INT_QUOTIENT)
      VIRT2(INT_REMAINDER)
      VIRT1(SIN)
      VIRT1(COS)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef VIRT2
#undef _ALU2
#undef ALU1
#undef VIRT1
#undef _ALU1
      /** @} */

      fs_inst *
      ADD(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return alu2(BRW_OPCODE_ADD, dst, src0, src1);
      }

      brw_reg
      ADD(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         if (src1.file == IMM && src1.ud == 0 && !out)
            return src0;

         return alu2(BRW_OPCODE_ADD, src0, src1, out);
      }
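
      /* The value-returning overloads above allocate the destination
       * implicitly, so straight-line code can be written in an expression
       * style (illustrative, with placeholder sources):
       *
       *    brw_reg sum = bld.ADD(a, b);          // dst vgrf inferred
       *    brw_reg scaled = bld.MUL(sum, scale); // chains naturally
       *
       * Note the ADD overload folds an immediate zero into a no-op unless
       * the caller asked for the emitted instruction via \p out.
       */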

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      fs_inst *
      CMP(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      fs_inst *
      CMPN(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      fs_inst *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      fs_inst *
      CSEL(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           const brw_reg &src2, brw_conditional_mod condition) const
      {
         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, src2.type),
                                 retype(src0, src2.type),
                                 retype(src1, src2.type),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      fs_inst *
      LRP(const brw_reg &dst, const brw_reg &x, const brw_reg &y,
          const brw_reg &a) const
      {
         if (shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const brw_reg y_times_a = vgrf(dst.type);
            const brw_reg one_minus_a = vgrf(dst.type);
            const brw_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, brw_reg(one_minus_a));
            return ADD(dst, brw_reg(x_times_one_minus_a), brw_reg(y_times_a));
         }
      }
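
      /* Sanity check of the expansion above: lrp(x, y, a) is defined as
       * x * (1 - a) + y * a, so a == 0 selects x and a == 1 selects y.  The
       * hardware LRP form op1 * op0 + op2 * (1 - op0) matches it with
       * op0 = a, op1 = y, op2 = x, which is why the operands are reordered
       * on ver <= 10.
       */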

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      fs_inst *
      LOAD_PAYLOAD(const brw_reg &dst, const brw_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         fs_inst *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * brw_type_size_bytes(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }

      fs_inst *
      VEC(const brw_reg &dst, const brw_reg *src, unsigned sources) const
      {
         return sources == 1 ? MOV(dst, src[0])
                             : LOAD_PAYLOAD(dst, src, sources, 0);
      }

      fs_inst *
      SYNC(enum tgl_sync_function sync) const
      {
         return emit(BRW_OPCODE_SYNC, null_reg_ud(), brw_imm_ud(sync));
      }

      fs_inst *
      UNDEF(const brw_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                              retype(dst, BRW_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      fs_inst *
      DPAS(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,
           unsigned sdepth, unsigned rcount) const
      {
         assert(_dispatch_width == 8 * reg_unit(shader->devinfo));
         assert(sdepth == 8);
         assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

         fs_inst *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
         inst->sdepth = sdepth;
         inst->rcount = rcount;

         if (dst.type == BRW_TYPE_HF) {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE / 2;
         } else {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE;
         }

         return inst;
      }

      void
      VARYING_PULL_CONSTANT_LOAD(const brw_reg &dst,
                                 const brw_reg &surface,
                                 const brw_reg &surface_handle,
                                 const brw_reg &varying_offset,
                                 uint32_t const_offset,
                                 uint8_t alignment,
                                 unsigned components) const
      {
         assert(components <= 4);

         /* We have our constant surface use a pitch of 4 bytes, so our index can
          * be any component of a vector, and then we load 4 contiguous
          * components starting from that.  TODO: Support loading fewer than 4.
          */
         brw_reg total_offset = ADD(varying_offset, brw_imm_ud(const_offset));

         /* The pull load message will load a vec4 (16 bytes). If we are loading
          * a double this means we are only loading 2 elements worth of data.
          * We also want to use a 32-bit data type for the dst of the load operation
          * so other parts of the driver don't get confused about the size of the
          * result.
          */
         brw_reg vec4_result = vgrf(BRW_TYPE_F, 4);

         brw_reg srcs[PULL_VARYING_CONSTANT_SRCS];
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
         srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
         srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = brw_imm_ud(alignment);

         fs_inst *inst = emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                              vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
         inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

         shuffle_from_32bit_read(*this, dst, vec4_result, 0, components);
      }

      brw_reg
      LOAD_SUBGROUP_INVOCATION() const
      {
         brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
         exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
         return reg;
      }

      brw_reg
      BROADCAST(brw_reg value, brw_reg index) const
      {
         const fs_builder xbld = scalar_group();
         const brw_reg dst = xbld.vgrf(value.type);

         assert(is_uniform(index));

         /* A broadcast will always be at the full dispatch width even if the
          * use of the broadcast result is smaller. If the source is_scalar,
          * it may be allocated at less than the full dispatch width (e.g.,
          * allocated at SIMD8 with SIMD32 dispatch). The input may or may
          * not be stride=0. If it is not, the generated broadcast
          *
          *    broadcast(32) dst, value<1>, index<0>
          *
          * is invalid because it may read out of bounds from value.
          *
          * To account for this, modify the stride of an is_scalar input to be
          * zero.
          */
         if (value.is_scalar)
            value = component(value, 0);

         /* Ensure that the source of a broadcast is always register aligned.
          * See brw_broadcast() non-scalar case for more details.
          */
         if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
            value = MOV(value);

         /* BROADCAST will only write a single component after lowering. Munge
          * size_written here to match the allocated size of dst.
          */
         exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index)
            ->size_written = dst.component_size(xbld.dispatch_width());

         return component(dst, 0);
      }
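
      /* Illustrative use: pull the value of `vec` (a placeholder per-channel
       * register) from the channel selected by the uniform `idx`:
       *
       *    brw_reg lane_value = bld.BROADCAST(vec, idx);
       *
       * The result reads as component 0, suitable for scalar consumers such
       * as message descriptors.
       */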

      fs_visitor *shader;

      fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
      fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
      fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
      fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
      fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
      fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

      bool has_writemask_all() const {
         return force_writemask_all;
      }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * brw_generator::generate_code() for more details.
       */
      brw_reg
      fix_unsigned_negate(const brw_reg &src) const
      {
         if (src.type == BRW_TYPE_UD &&
             src.negate) {
            brw_reg temp = vgrf(BRW_TYPE_UD);
            MOV(temp, src);
            return brw_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      brw_reg
      fix_3src_operand(const brw_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         brw_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
      } annotation;
   };
}

/**
 * Offset by a number of components into a VGRF
 *
 * It is assumed that the VGRF represents a vector (e.g., returned by
 * load_uniform or a texture operation). Convergent and divergent values are
 * stored differently, so care must be taken to offset properly.
 */
static inline brw_reg
offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
   /* If the value is convergent (stored as one or more SIMD8), offset using
    * SIMD8 and select component 0.
    */
   if (reg.is_scalar) {
      const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo);

      brw_reg offset_reg = offset(reg, allocation_width, delta);

      /* If the dispatch width is larger than the allocation width, that
       * implies that the register can only be used as a source. Otherwise the
       * instruction would write past the allocation size of the register.
       */
      if (bld.dispatch_width() > allocation_width)
         return component(offset_reg, 0);
      else
         return offset_reg;
   }

   /* Offset to the component assuming the value was allocated in
    * dispatch_width units.
    */
   return offset(reg, bld.dispatch_width(), delta);
}
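
/* For example (illustrative, with placeholder names): selecting the third
 * component of a divergent vec4 returned by a texture operation at SIMD16
 * advances by two components of 16 channels each:
 *
 *    brw_reg z = offset(tex_result, bld, 2);
 *
 * while a convergent (is_scalar) value is stepped in SIMD8-sized units and
 * read as component 0 when the dispatch width is wider.
 */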