/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#include "brw_ir_fs.h"
#include "brw_eu.h"
#include "brw_fs.h"

static inline brw_reg offset(const brw_reg &, const brw_builder &,
                             unsigned);

/**
 * Toolbox to assemble a BRW IR program out of individual instructions.
 */
class brw_builder {
public:
   /**
    * Construct a brw_builder that inserts instructions into \p shader.
    * \p dispatch_width gives the native execution width of the program.
    */
   brw_builder(fs_visitor *shader,
               unsigned dispatch_width) :
      shader(shader), block(NULL), cursor(NULL),
      _dispatch_width(dispatch_width),
      _group(0),
      force_writemask_all(false),
      annotation()
   {
   }

   explicit brw_builder(fs_visitor *s) : brw_builder(s, s->dispatch_width) {}

   /**
    * Construct a brw_builder that inserts instructions into \p shader
    * before instruction \p inst in basic block \p block.  The default
    * execution controls and debug annotation are initialized from the
    * instruction passed as argument.
    */
   brw_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
      shader(shader), block(block), cursor(inst),
      _dispatch_width(inst->exec_size),
      _group(inst->group),
      force_writemask_all(inst->force_writemask_all)
   {
#ifndef NDEBUG
      annotation.str = inst->annotation;
#else
      annotation.str = NULL;
#endif
   }

   /**
    * Construct a brw_builder that inserts instructions before \p cursor in
    * basic block \p block, inheriting other code generation parameters
    * from this.
    */
   brw_builder
   at(bblock_t *block, exec_node *cursor) const
   {
      brw_builder bld = *this;
      bld.block = block;
      bld.cursor = cursor;
      return bld;
   }

   /**
    * Construct a brw_builder appending instructions at the end of the
    * instruction list of the shader, inheriting other code generation
    * parameters from this.
    */
   brw_builder
   at_end() const
   {
      return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
   }
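
   /* Usage sketch (illustrative; "v" is assumed to be an fs_visitor and
    * "inst" an existing instruction in basic block "blk").  Builders are
    * cheap value types, so deriving a new insertion point does not disturb
    * the original:
    *
    *    const brw_builder bld = brw_builder(v).at_end();
    *    const brw_builder ibld = bld.at(blk, inst);   // insert before inst
    */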

   /**
    * Construct a builder specifying the default SIMD width and group of
    * channel enable signals, inheriting other code generation parameters
    * from this.
    *
    * \p n gives the default SIMD width, \p i gives the slot group used for
    * predication and control flow masking in multiples of \p n channels.
    */
   brw_builder
   group(unsigned n, unsigned i) const
   {
      brw_builder bld = *this;

      if (n <= dispatch_width() && i < dispatch_width() / n) {
         bld._group += i * n;
      } else {
         /* The requested channel group isn't a subset of the channel group
          * of this builder, which means that the resulting instructions
          * would use (potentially undefined) channel enable signals not
          * specified by the parent builder.  That's only valid if the
          * instruction doesn't have per-channel semantics, in which case
          * we should clear off the default group index in order to prevent
          * emitting instructions with channel group not aligned to their
          * own execution size.
          */
         assert(force_writemask_all);
         bld._group = 0;
      }

      bld._dispatch_width = n;
      return bld;
   }
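
   /* Usage sketch (illustrative): on a SIMD16 builder "bld", group(8, i)
    * yields a SIMD8 builder covering the i-th half of the channels:
    *
    *    bld.group(8, 0).MOV(dst, src);   // channels 0-7
    *    bld.group(8, 1).MOV(dst, src);   // channels 8-15
    */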

   /**
    * Alias for group() with width equal to eight.
    */
   brw_builder
   quarter(unsigned i) const
   {
      return group(8, i);
   }

   /**
    * Construct a builder with per-channel control flow execution masking
    * disabled if \p b is true.  If control flow execution masking is
    * already disabled this has no effect.
    */
   brw_builder
   exec_all(bool b = true) const
   {
      brw_builder bld = *this;
      if (b)
         bld.force_writemask_all = true;
      return bld;
   }
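
   /* Usage sketch (illustrative; "header" is a hypothetical register):
    * exec_all() is chained when an instruction must write all channels
    * regardless of control flow, e.g. when initializing a message header:
    *
    *    bld.exec_all().group(8, 0).MOV(header, brw_imm_ud(0));
    */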

   /**
    * Construct a builder for SIMD8-as-scalar.
    */
   brw_builder
   scalar_group() const
   {
      return exec_all().group(8 * reg_unit(shader->devinfo), 0);
   }

   /**
    * Construct a builder with the given debug annotation info.
    */
   brw_builder
   annotate(const char *str) const
   {
      brw_builder bld = *this;
      bld.annotation.str = str;
      return bld;
   }

   /**
    * Get the SIMD width in use.
    */
   unsigned
   dispatch_width() const
   {
      return _dispatch_width;
   }

   /**
    * Get the channel group in use.
    */
   unsigned
   group() const
   {
      return _group;
   }

   /**
    * Allocate a virtual register of natural vector size (one for this IR)
    * and SIMD width.  \p n gives the amount of space to allocate in
    * dispatch_width units (which is just enough space for one logical
    * component in this IR).
    */
   brw_reg
   vgrf(enum brw_reg_type type, unsigned n = 1) const
   {
      const unsigned unit = reg_unit(shader->devinfo);
      assert(dispatch_width() <= 32);

      if (n > 0)
         return brw_vgrf(shader->alloc.allocate(
                            DIV_ROUND_UP(n * brw_type_size_bytes(type) * dispatch_width(),
                                         unit * REG_SIZE) * unit),
                         type);
      else
         return retype(null_reg_ud(), type);
   }
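
   /* Usage sketch (illustrative): allocate one 32-bit float per channel,
    * or a two-component unsigned integer vector:
    *
    *    const brw_reg tmp = bld.vgrf(BRW_TYPE_F);
    *    const brw_reg uv = bld.vgrf(BRW_TYPE_UD, 2);
    */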

   brw_reg
   vaddr(enum brw_reg_type type, unsigned subnr) const
   {
      brw_reg addr = brw_address_reg(subnr);
      addr.nr = shader->next_address_register_nr++;
      return retype(addr, type);
   }

   /**
    * Create a null register of floating type.
    */
   brw_reg
   null_reg_f() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_F));
   }

   brw_reg
   null_reg_df() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_DF));
   }

   /**
    * Create a null register of signed integer type.
    */
   brw_reg
   null_reg_d() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_D));
   }

   /**
    * Create a null register of unsigned integer type.
    */
   brw_reg
   null_reg_ud() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_UD));
   }

   /**
    * Insert an instruction into the program.
    */
   fs_inst *
   emit(const fs_inst &inst) const
   {
      return emit(new(shader->mem_ctx) fs_inst(inst));
   }

   /**
    * Create and insert a nullary control instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode) const
   {
      return emit(fs_inst(opcode, dispatch_width()));
   }

   /**
    * Create and insert a nullary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst) const
   {
      return emit(fs_inst(opcode, dispatch_width(), dst));
   }

   /**
    * Create and insert a unary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0) const
   {
      return emit(fs_inst(opcode, dispatch_width(), dst, src0));
   }

   /**
    * Create and insert a binary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
        const brw_reg &src1) const
   {
      return emit(fs_inst(opcode, dispatch_width(), dst,
                          src0, src1));
   }

   /**
    * Create and insert a ternary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
        const brw_reg &src1, const brw_reg &src2) const
   {
      switch (opcode) {
      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_MAD:
      case BRW_OPCODE_LRP:
         return emit(fs_inst(opcode, dispatch_width(), dst,
                             fix_3src_operand(src0),
                             fix_3src_operand(src1),
                             fix_3src_operand(src2)));

      default:
         return emit(fs_inst(opcode, dispatch_width(), dst,
                             src0, src1, src2));
      }
   }

   /**
    * Create and insert an instruction with a variable number of sources
    * into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg srcs[],
        unsigned n) const
   {
      /* Use the emit() methods for specific operand counts to ensure that
       * opcode-specific operand fixups occur.
       */
      if (n == 3) {
         return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
      } else {
         return emit(fs_inst(opcode, dispatch_width(), dst, srcs, n));
      }
   }

   /**
    * Insert a preallocated instruction into the program.
    */
   fs_inst *
   emit(fs_inst *inst) const
   {
      assert(inst->exec_size <= 32);
      assert(inst->exec_size == dispatch_width() ||
             force_writemask_all);

      inst->group = _group;
      inst->force_writemask_all = force_writemask_all;
#ifndef NDEBUG
      inst->annotation = annotation.str;
#endif

      if (block)
         static_cast<fs_inst *>(cursor)->insert_before(block, inst);
      else
         cursor->insert_before(inst);

      return inst;
   }

   /**
    * Select \p src0 if the comparison of both sources with the given
    * conditional mod evaluates to true, otherwise select \p src1.
    *
    * Generally useful to get the minimum or maximum of two values.
    */
   fs_inst *
   emit_minmax(const brw_reg &dst, const brw_reg &src0,
               const brw_reg &src1, brw_conditional_mod mod) const
   {
      assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

      /* In some cases we can't have bytes as operand for src1, so use the
       * same type for both operands.
       */
      return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                  fix_unsigned_negate(src1)));
   }
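
   /* Usage sketch (illustrative): BRW_CONDITIONAL_GE selects the larger
    * source and BRW_CONDITIONAL_L the smaller one, so max(a, b) is:
    *
    *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);
    */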

   /**
    * Copy any live channel from \p src to the first channel of the result.
    */
   brw_reg
   emit_uniformize(const brw_reg &src) const
   {
      /* Trivial: skip unnecessary work and retain IMM */
      if (src.file == IMM)
         return src;

      /* FIXME: We use a vector chan_index and dst to allow constant and
       * copy propagation to move the result all the way into the consuming
       * instruction (typically a surface index or sampler index for a
       * send). Once we teach const/copy propagation about scalars we
       * should go back to scalar destinations here.
       */
      const brw_builder xbld = scalar_group();
      const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD);

      /* FIND_LIVE_CHANNEL will only write a single component after
       * lowering. Munge size_written here to match the allocated size of
       * chan_index.
       */
      exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
         ->size_written = chan_index.component_size(xbld.dispatch_width());

      return BROADCAST(src, component(chan_index, 0));
   }
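
   /* Usage sketch (illustrative; "surf" is a hypothetical, possibly
    * divergent surface index that must be uniform in a SEND descriptor):
    *
    *    const brw_reg uniform_surf = bld.emit_uniformize(surf);
    */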

   brw_reg
   move_to_vgrf(const brw_reg &src, unsigned num_components) const
   {
      brw_reg *const src_comps = new brw_reg[num_components];

      for (unsigned i = 0; i < num_components; i++)
         src_comps[i] = offset(src, *this, i);

      const brw_reg dst = vgrf(src.type, num_components);
      LOAD_PAYLOAD(dst, src_comps, num_components, 0);

      delete[] src_comps;

      return brw_reg(dst);
   }

   fs_inst *
   emit_undef_for_dst(const fs_inst *old_inst) const
   {
      assert(old_inst->dst.file == VGRF);
      fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                           retype(old_inst->dst, BRW_TYPE_UD));
      inst->size_written = old_inst->size_written;

      return inst;
   }

   /**
    * Assorted arithmetic ops.
    * @{
    */
#define _ALU1(prefix, op)                                \
   fs_inst *                                          \
   op(const brw_reg &dst, const brw_reg &src0) const    \
   {                                                  \
      assert(_dispatch_width == 1 ||                  \
             (dst.file >= VGRF && dst.stride != 0) || \
             (dst.file < VGRF && dst.hstride != 0));  \
      return emit(prefix##op, dst, src0);             \
   }                                                  \
   brw_reg                                             \
   op(const brw_reg &src0, fs_inst **out = NULL) const \
   {                                                  \
      fs_inst *inst = op(vgrf(src0.type), src0);      \
      if (out) *out = inst;                           \
      return inst->dst;                               \
   }
#define ALU1(op) _ALU1(BRW_OPCODE_, op)
#define VIRT1(op) _ALU1(SHADER_OPCODE_, op)

   fs_inst *
   alu2(opcode op, const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
   {
      return emit(op, dst, src0, src1);
   }
   brw_reg
   alu2(opcode op, const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
   {
      enum brw_reg_type inferred_dst_type =
         brw_type_larger_of(src0.type, src1.type);
      fs_inst *inst = alu2(op, vgrf(inferred_dst_type), src0, src1);
      if (out) *out = inst;
      return inst->dst;
   }

#define _ALU2(prefix, op)                                                    \
   fs_inst *                                                              \
   op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const    \
   {                                                                      \
      return alu2(prefix##op, dst, src0, src1);                           \
   }                                                                      \
   brw_reg                                                                 \
   op(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const \
   {                                                                      \
      return alu2(prefix##op, src0, src1, out);                           \
   }
#define ALU2(op) _ALU2(BRW_OPCODE_, op)
#define VIRT2(op) _ALU2(SHADER_OPCODE_, op)

#define ALU2_ACC(op)                                                    \
   fs_inst *                                                     \
   op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const \
   {                                                                 \
      fs_inst *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
      inst->writes_accumulator = true;                               \
      return inst;                                                   \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                     \
   op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,  \
      const brw_reg &src2) const                                     \
   {                                                                 \
      return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
   }                                                                 \
   brw_reg                                                           \
   op(const brw_reg &src0, const brw_reg &src1, const brw_reg &src2, \
      fs_inst **out = NULL) const                                    \
   {                                                                 \
      enum brw_reg_type inferred_dst_type =                          \
         brw_type_larger_of(brw_type_larger_of(src0.type, src1.type),\
                            src2.type);                              \
      fs_inst *inst = op(vgrf(inferred_dst_type), src0, src1, src2); \
      if (out) *out = inst;                                          \
      return inst->dst;                                              \
   }

   ALU3(ADD3)
   ALU2_ACC(ADDC)
   ALU2(AND)
   ALU2(ASR)
   ALU2(AVG)
   ALU3(BFE)
   ALU2(BFI1)
   ALU3(BFI2)
   ALU1(BFREV)
   ALU1(CBIT)
   ALU2(DP2)
   ALU2(DP3)
   ALU2(DP4)
   ALU2(DPH)
   ALU1(FBH)
   ALU1(FBL)
   ALU1(FRC)
   ALU3(DP4A)
   ALU2(LINE)
   ALU1(LZD)
   ALU2(MAC)
   ALU2_ACC(MACH)
   ALU3(MAD)
   ALU1(MOV)
   ALU2(MUL)
   ALU1(NOT)
   ALU2(OR)
   ALU2(PLN)
   ALU1(RNDD)
   ALU1(RNDE)
   ALU1(RNDU)
   ALU1(RNDZ)
   ALU2(ROL)
   ALU2(ROR)
   ALU2(SEL)
   ALU2(SHL)
   ALU2(SHR)
   ALU2_ACC(SUBB)
   ALU2(XOR)

   VIRT1(RCP)
   VIRT1(RSQ)
   VIRT1(SQRT)
   VIRT1(EXP2)
   VIRT1(LOG2)
   VIRT2(POW)
   VIRT2(INT_QUOTIENT)
   VIRT2(INT_REMAINDER)
   VIRT1(SIN)
   VIRT1(COS)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef VIRT2
#undef _ALU2
#undef ALU1
#undef VIRT1
#undef _ALU1
   /** @} */

   fs_inst *
   ADD(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
   {
      return alu2(BRW_OPCODE_ADD, dst, src0, src1);
   }

   brw_reg
   ADD(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
   {
      if (src1.file == IMM && src1.ud == 0 && !out)
         return src0;

      return alu2(BRW_OPCODE_ADD, src0, src1, out);
   }
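
   /* Usage sketch (illustrative): the overloads returning a brw_reg
    * allocate their own destination, which allows a fluent style; note
    * that adding an immediate zero simply returns src0 unchanged:
    *
    *    const brw_reg sum = bld.ADD(a, bld.MUL(b, c));
    */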

   /**
    * CMP: Sets the low bit of the destination channels with the result
    * of the comparison, while the upper bits are undefined, and updates
    * the flag register with the packed 16 bits of the result.
    */
   fs_inst *
   CMP(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
       brw_conditional_mod condition) const
   {
      /* Take the instruction:
       *
       * CMP null<d> src0<f> src1<f>
       *
       * Original gfx4 does type conversion to the destination type
       * before comparison, producing garbage results for floating
       * point comparisons.
       */
      const enum brw_reg_type type =
         dst.is_null() ?
         src0.type :
         brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

      return set_condmod(condition,
                         emit(BRW_OPCODE_CMP, retype(dst, type),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }
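
   /* Usage sketch (illustrative): compare into a null destination when
    * only the flag result is needed, then predicate a later instruction:
    *
    *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_L);
    *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
    */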

   /**
    * CMPN: Behaves like CMP, but produces true if src1 is NaN.
    */
   fs_inst *
   CMPN(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
        brw_conditional_mod condition) const
   {
      /* Take the instruction:
       *
       * CMP null<d> src0<f> src1<f>
       *
       * Original gfx4 does type conversion to the destination type
       * before comparison, producing garbage results for floating
       * point comparisons.
       */
      const enum brw_reg_type type =
         dst.is_null() ?
         src0.type :
         brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

      return set_condmod(condition,
                         emit(BRW_OPCODE_CMPN, retype(dst, type),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }

   /**
    * Gfx4 predicated IF.
    */
   fs_inst *
   IF(brw_predicate predicate) const
   {
      return set_predicate(predicate, emit(BRW_OPCODE_IF));
   }

   /**
    * CSEL: dst = src2 <op> 0.0f ? src0 : src1
    */
   fs_inst *
   CSEL(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
        const brw_reg &src2, brw_conditional_mod condition) const
   {
      return set_condmod(condition,
                         emit(BRW_OPCODE_CSEL,
                              retype(dst, src2.type),
                              retype(src0, src2.type),
                              retype(src1, src2.type),
                              src2));
   }
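
   /* Usage sketch (illustrative): select "a" where "x" is non-negative
    * and "b" elsewhere, without touching the flag register:
    *
    *    bld.CSEL(dst, a, b, x, BRW_CONDITIONAL_GE);
    */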

   /**
    * Emit a linear interpolation instruction.
    */
   fs_inst *
   LRP(const brw_reg &dst, const brw_reg &x, const brw_reg &y,
       const brw_reg &a) const
   {
      if (shader->devinfo->ver <= 10) {
         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
          * we need to reorder the operands.
          */
         return emit(BRW_OPCODE_LRP, dst, a, y, x);

      } else {
         /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
         const brw_reg y_times_a = vgrf(dst.type);
         const brw_reg one_minus_a = vgrf(dst.type);
         const brw_reg x_times_one_minus_a = vgrf(dst.type);

         MUL(y_times_a, y, a);
         ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
         MUL(x_times_one_minus_a, x, brw_reg(one_minus_a));
         return ADD(dst, brw_reg(x_times_one_minus_a), brw_reg(y_times_a));
      }
   }

   /**
    * Collect a number of registers in a contiguous range of registers.
    */
   fs_inst *
   LOAD_PAYLOAD(const brw_reg &dst, const brw_reg *src,
                unsigned sources, unsigned header_size) const
   {
      fs_inst *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
      inst->header_size = header_size;
      inst->size_written = header_size * REG_SIZE;
      for (unsigned i = header_size; i < sources; i++) {
         inst->size_written += dispatch_width() * brw_type_size_bytes(src[i].type) *
                               dst.stride;
      }

      return inst;
   }
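
   /* Usage sketch (illustrative; "u" and "v" are hypothetical per-channel
    * values): gather two sources into one contiguous payload without any
    * header registers:
    *
    *    const brw_reg srcs[] = { u, v };
    *    const brw_reg payload = bld.vgrf(BRW_TYPE_F, 2);
    *    bld.LOAD_PAYLOAD(payload, srcs, 2, 0);
    */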

   fs_inst *
   VEC(const brw_reg &dst, const brw_reg *src, unsigned sources) const
   {
      return sources == 1 ? MOV(dst, src[0])
                          : LOAD_PAYLOAD(dst, src, sources, 0);
   }

   fs_inst *
   SYNC(enum tgl_sync_function sync) const
   {
      return emit(BRW_OPCODE_SYNC, null_reg_ud(), brw_imm_ud(sync));
   }

   fs_inst *
   UNDEF(const brw_reg &dst) const
   {
      assert(dst.file == VGRF);
      assert(dst.offset % REG_SIZE == 0);
      fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                           retype(dst, BRW_TYPE_UD));
      inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

      return inst;
   }

   fs_inst *
   DPAS(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,
        unsigned sdepth, unsigned rcount) const
   {
      assert(_dispatch_width == 8 * reg_unit(shader->devinfo));
      assert(sdepth == 8);
      assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

      fs_inst *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
      inst->sdepth = sdepth;
      inst->rcount = rcount;

      if (dst.type == BRW_TYPE_HF) {
         inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE / 2;
      } else {
         inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE;
      }

      return inst;
   }

   void
   VARYING_PULL_CONSTANT_LOAD(const brw_reg &dst,
                              const brw_reg &surface,
                              const brw_reg &surface_handle,
                              const brw_reg &varying_offset,
                              uint32_t const_offset,
                              uint8_t alignment,
                              unsigned components) const
   {
      assert(components <= 4);

      /* We have our constant surface use a pitch of 4 bytes, so our index can
       * be any component of a vector, and then we load 4 contiguous
       * components starting from that.  TODO: Support loading fewer than 4.
       */
      brw_reg total_offset = ADD(varying_offset, brw_imm_ud(const_offset));

      /* The pull load message will load a vec4 (16 bytes). If we are loading
       * a double this means we are only loading 2 elements worth of data.
       * We also want to use a 32-bit data type for the dst of the load operation
       * so other parts of the driver don't get confused about the size of the
       * result.
       */
      brw_reg vec4_result = vgrf(BRW_TYPE_F, 4);

      brw_reg srcs[PULL_VARYING_CONSTANT_SRCS];
      srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
      srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
      srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
      srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = brw_imm_ud(alignment);

      fs_inst *inst = emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                           vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
      inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

      shuffle_from_32bit_read(dst, vec4_result, 0, components);
   }

   brw_reg
   LOAD_SUBGROUP_INVOCATION() const
   {
      brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
      exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
      return reg;
   }

   brw_reg
   BROADCAST(brw_reg value, brw_reg index) const
   {
      const brw_builder xbld = scalar_group();
      const brw_reg dst = xbld.vgrf(value.type);

      assert(is_uniform(index));

      /* A broadcast will always be at the full dispatch width even if the
       * use of the broadcast result is smaller. If the source is_scalar,
       * it may be allocated at less than the full dispatch width (e.g.,
       * allocated at SIMD8 with SIMD32 dispatch). The input may or may
       * not be stride=0. If it is not, the generated broadcast
       *
       *    broadcast(32) dst, value<1>, index<0>
       *
       * is invalid because it may read out of bounds from value.
       *
       * To account for this, modify the stride of an is_scalar input to be
       * zero.
       */
      if (value.is_scalar)
         value = component(value, 0);

      /* Ensure that the source of a broadcast is always register aligned.
       * See brw_broadcast() non-scalar case for more details.
       */
      if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
         value = MOV(value);

      /* BROADCAST will only write a single component after lowering. Munge
       * size_written here to match the allocated size of dst.
       */
      exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index)
         ->size_written = dst.component_size(xbld.dispatch_width());

      return component(dst, 0);
   }
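
   /* Usage sketch (illustrative): read the value of channel 3 of a
    * divergent register into a uniform scalar:
    *
    *    const brw_reg lane3 = bld.BROADCAST(value, brw_imm_ud(3));
    */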

   fs_visitor *shader;

   fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
   fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
   fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
   fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
   fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
   fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

   bool has_writemask_all() const {
      return force_writemask_all;
   }

private:
   /**
    * Workaround for negation of UD registers.  See comment in
    * brw_generator::generate_code() for more details.
    */
   brw_reg
   fix_unsigned_negate(const brw_reg &src) const
   {
      if (src.type == BRW_TYPE_UD &&
          src.negate) {
         brw_reg temp = vgrf(BRW_TYPE_UD);
         MOV(temp, src);
         return brw_reg(temp);
      } else {
         return src;
      }
   }

   /**
    * Workaround for source register modes not supported by the ternary
    * instruction encoding.
    */
   brw_reg
   fix_3src_operand(const brw_reg &src) const
   {
      switch (src.file) {
      case FIXED_GRF:
         /* FINISHME: Could handle scalar region, other stride=1 regions */
         if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
             src.width != BRW_WIDTH_8 ||
             src.hstride != BRW_HORIZONTAL_STRIDE_1)
            break;
         FALLTHROUGH;
      case ATTR:
      case VGRF:
      case UNIFORM:
      case IMM:
         return src;
      default:
         break;
      }

      brw_reg expanded = vgrf(src.type);
      MOV(expanded, src);
      return expanded;
   }

   void shuffle_from_32bit_read(const brw_reg &dst,
                                const brw_reg &src,
                                uint32_t first_component,
                                uint32_t components) const;

   bblock_t *block;
   exec_node *cursor;

   unsigned _dispatch_width;
   unsigned _group;
   bool force_writemask_all;

   /** Debug annotation info. */
   struct {
      const char *str;
   } annotation;
};

/**
 * Offset by a number of components into a VGRF
 *
 * It is assumed that the VGRF represents a vector (e.g., returned by
 * load_uniform or a texture operation). Convergent and divergent values are
 * stored differently, so care must be taken to offset properly.
 */
static inline brw_reg
offset(const brw_reg &reg, const brw_builder &bld, unsigned delta)
{
   /* If the value is convergent (stored as one or more SIMD8), offset using
    * SIMD8 and select component 0.
    */
   if (reg.is_scalar) {
      const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo);

      brw_reg offset_reg = offset(reg, allocation_width, delta);

      /* If the dispatch width is larger than the allocation width, that
       * implies that the register can only be used as a source. Otherwise the
       * instruction would write past the allocation size of the register.
       */
      if (bld.dispatch_width() > allocation_width)
         return component(offset_reg, 0);
      else
         return offset_reg;
   }

   /* Offset to the component assuming the value was allocated in
    * dispatch_width units.
    */
   return offset(reg, bld.dispatch_width(), delta);
}
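
/* Usage sketch (illustrative): step through the components of a multi-
 * component value in a way that is correct for both convergent and
 * divergent storage:
 *
 *    for (unsigned i = 0; i < 4; i++)
 *       bld.MOV(offset(dst, bld, i), offset(src, bld, i));
 */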

brw_reg brw_sample_mask_reg(const brw_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw_builder &bld, fs_inst *inst);

brw_reg
brw_fetch_payload_reg(const brw_builder &bld, uint8_t regs[2],
                      brw_reg_type type = BRW_TYPE_F,
                      unsigned n = 1);

brw_reg
brw_fetch_barycentric_reg(const brw_builder &bld, uint8_t regs[2]);

void
brw_check_dynamic_msaa_flag(const brw_builder &bld,
                            const struct brw_wm_prog_data *wm_prog_data,
                            enum intel_msaa_flags flag);
978