/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#include "brw_ir_fs.h"
#include "brw_eu.h"
#include "brw_fs.h"

static inline brw_reg offset(const brw_reg &, const brw_builder &,
                             unsigned);

/**
 * Toolbox to assemble a BRW IR program out of individual instructions.
 */
class brw_builder {
public:
   /**
    * Construct a brw_builder that inserts instructions into \p shader.
    * \p dispatch_width gives the native execution width of the program.
    */
   brw_builder(fs_visitor *shader,
               unsigned dispatch_width) :
      shader(shader), block(NULL), cursor(NULL),
      _dispatch_width(dispatch_width),
      _group(0),
      force_writemask_all(false),
      annotation()
   {
   }

   explicit brw_builder(fs_visitor *s) : brw_builder(s, s->dispatch_width) {}

   /**
    * Construct a brw_builder that inserts instructions into \p shader
    * before instruction \p inst in basic block \p block. The default
    * execution controls and debug annotation are initialized from the
    * instruction passed as argument.
    */
   brw_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
      shader(shader), block(block), cursor(inst),
      _dispatch_width(inst->exec_size),
      _group(inst->group),
      force_writemask_all(inst->force_writemask_all)
   {
#ifndef NDEBUG
      annotation.str = inst->annotation;
#else
      annotation.str = NULL;
#endif
   }

   /**
    * Construct a brw_builder that inserts instructions before \p cursor in
    * basic block \p block, inheriting other code generation parameters
    * from this.
    */
   brw_builder
   at(bblock_t *block, exec_node *cursor) const
   {
      brw_builder bld = *this;
      bld.block = block;
      bld.cursor = cursor;
      return bld;
   }

   /**
    * Construct a brw_builder appending instructions at the end of the
    * instruction list of the shader, inheriting other code generation
    * parameters from this.
    */
   brw_builder
   at_end() const
   {
      return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
   }

   /**
    * Construct a builder specifying the default SIMD width and group of
    * channel enable signals, inheriting other code generation parameters
    * from this.
    *
    * \p n gives the default SIMD width, \p i gives the slot group used for
    * predication and control flow masking in multiples of \p n channels.
    */
   brw_builder
   group(unsigned n, unsigned i) const
   {
      brw_builder bld = *this;

      if (n <= dispatch_width() && i < dispatch_width() / n) {
         bld._group += i * n;
      } else {
         /* The requested channel group isn't a subset of the channel group
          * of this builder, which means that the resulting instructions
          * would use (potentially undefined) channel enable signals not
          * specified by the parent builder. That's only valid if the
          * instruction doesn't have per-channel semantics, in which case
          * we should clear off the default group index in order to prevent
          * emitting instructions with channel group not aligned to their
          * own execution size.
          */
         assert(force_writemask_all);
         bld._group = 0;
      }

      bld._dispatch_width = n;
      return bld;
   }
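
   /* Illustrative sketch (register names are hypothetical, not part of the
    * surrounding code): a SIMD16 builder can be narrowed to its second
    * group of eight channels, so the emitted instruction is controlled by
    * channel enables 8..15:
    *
    *    const brw_builder half = bld16.group(8, 1);
    *    half.MOV(dst_hi, src_hi);
    */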

   /**
    * Alias for group() with width equal to eight.
    */
   brw_builder
   quarter(unsigned i) const
   {
      return group(8, i);
   }

   /**
    * Construct a builder with per-channel control flow execution masking
    * disabled if \p b is true. If control flow execution masking is
    * already disabled this has no effect.
    */
   brw_builder
   exec_all(bool b = true) const
   {
      brw_builder bld = *this;
      if (b)
         bld.force_writemask_all = true;
      return bld;
   }

   /**
    * Construct a builder for SIMD8-as-scalar.
    */
   brw_builder
   scalar_group() const
   {
      return exec_all().group(8 * reg_unit(shader->devinfo), 0);
   }

   /**
    * Construct a builder with the given debug annotation info.
    */
   brw_builder
   annotate(const char *str) const
   {
      brw_builder bld = *this;
      bld.annotation.str = str;
      return bld;
   }

   /**
    * Get the SIMD width in use.
    */
   unsigned
   dispatch_width() const
   {
      return _dispatch_width;
   }

   /**
    * Get the channel group in use.
    */
   unsigned
   group() const
   {
      return _group;
   }

   /**
    * Allocate a virtual register of natural vector size (one for this IR)
    * and SIMD width. \p n gives the amount of space to allocate in
    * dispatch_width units (which is just enough space for one logical
    * component in this IR).
    */
   brw_reg
   vgrf(enum brw_reg_type type, unsigned n = 1) const
   {
      const unsigned unit = reg_unit(shader->devinfo);
      assert(dispatch_width() <= 32);

      if (n > 0)
         return brw_vgrf(shader->alloc.allocate(
                            DIV_ROUND_UP(n * brw_type_size_bytes(type) * dispatch_width(),
                                         unit * REG_SIZE) * unit),
                         type);
      else
         return retype(null_reg_ud(), type);
   }
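
   /* Usage sketch (names are hypothetical): vgrf(BRW_TYPE_F) allocates a
    * single dispatch_width-wide float temporary, while vgrf(BRW_TYPE_F, 4)
    * reserves space for four logical components:
    *
    *    const brw_reg tmp = bld.vgrf(BRW_TYPE_F);
    *    bld.MOV(tmp, brw_imm_f(1.0f));
    */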

   brw_reg
   vaddr(enum brw_reg_type type, unsigned subnr) const
   {
      brw_reg addr = brw_address_reg(subnr);
      addr.nr = shader->next_address_register_nr++;
      return retype(addr, type);
   }

   /**
    * Create a null register of floating type.
    */
   brw_reg
   null_reg_f() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_F));
   }

   brw_reg
   null_reg_df() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_DF));
   }

   /**
    * Create a null register of signed integer type.
    */
   brw_reg
   null_reg_d() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_D));
   }

   /**
    * Create a null register of unsigned integer type.
    */
   brw_reg
   null_reg_ud() const
   {
      return brw_reg(retype(brw_null_reg(), BRW_TYPE_UD));
   }

   /**
    * Insert an instruction into the program.
    */
   fs_inst *
   emit(const fs_inst &inst) const
   {
      return emit(new(shader->mem_ctx) fs_inst(inst));
   }

   /**
    * Create and insert a nullary control instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode) const
   {
      return emit(fs_inst(opcode, dispatch_width()));
   }

   /**
    * Create and insert a nullary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst) const
   {
      return emit(fs_inst(opcode, dispatch_width(), dst));
   }

   /**
    * Create and insert a unary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0) const
   {
      return emit(fs_inst(opcode, dispatch_width(), dst, src0));
   }

   /**
    * Create and insert a binary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
        const brw_reg &src1) const
   {
      return emit(fs_inst(opcode, dispatch_width(), dst,
                          src0, src1));
   }

   /**
    * Create and insert a ternary instruction into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
        const brw_reg &src1, const brw_reg &src2) const
   {
      switch (opcode) {
      case BRW_OPCODE_BFE:
      case BRW_OPCODE_BFI2:
      case BRW_OPCODE_MAD:
      case BRW_OPCODE_LRP:
         return emit(fs_inst(opcode, dispatch_width(), dst,
                             fix_3src_operand(src0),
                             fix_3src_operand(src1),
                             fix_3src_operand(src2)));

      default:
         return emit(fs_inst(opcode, dispatch_width(), dst,
                             src0, src1, src2));
      }
   }

   /**
    * Create and insert an instruction with a variable number of sources
    * into the program.
    */
   fs_inst *
   emit(enum opcode opcode, const brw_reg &dst, const brw_reg srcs[],
        unsigned n) const
   {
      /* Use the emit() methods for specific operand counts to ensure that
       * opcode-specific operand fixups occur.
       */
      if (n == 3) {
         return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
      } else {
         return emit(fs_inst(opcode, dispatch_width(), dst, srcs, n));
      }
   }

   /**
    * Insert a preallocated instruction into the program.
    */
   fs_inst *
   emit(fs_inst *inst) const
   {
      assert(inst->exec_size <= 32);
      assert(inst->exec_size == dispatch_width() ||
             force_writemask_all);

      inst->group = _group;
      inst->force_writemask_all = force_writemask_all;
#ifndef NDEBUG
      inst->annotation = annotation.str;
#endif

      if (block)
         static_cast<fs_inst *>(cursor)->insert_before(block, inst);
      else
         cursor->insert_before(inst);

      return inst;
   }

   /**
    * Select \p src0 if the comparison of both sources with the given
    * conditional mod evaluates to true, otherwise select \p src1.
    *
    * Generally useful to get the minimum or maximum of two values.
    */
   fs_inst *
   emit_minmax(const brw_reg &dst, const brw_reg &src0,
               const brw_reg &src1, brw_conditional_mod mod) const
   {
      assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

      /* In some cases we can't have bytes as operand for src1, so use the
       * same type for both operands.
       */
      return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                  fix_unsigned_negate(src1)));
   }
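
   /* Usage sketch (registers are hypothetical): the L conditional mod
    * yields the minimum of the two sources and GE yields the maximum:
    *
    *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);   // dst = min(a, b)
    *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);  // dst = max(a, b)
    */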

   /**
    * Copy any live channel from \p src to the first channel of the result.
    */
   brw_reg
   emit_uniformize(const brw_reg &src) const
   {
      /* Trivial: skip unnecessary work and retain IMM */
      if (src.file == IMM)
         return src;

      /* FIXME: We use a vector chan_index and dst to allow constant and
       * copy propagation to move the result all the way into the consuming
       * instruction (typically a surface index or sampler index for a
       * send). Once we teach const/copy propagation about scalars we
       * should go back to scalar destinations here.
       */
      const brw_builder xbld = scalar_group();
      const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD);

      /* FIND_LIVE_CHANNEL will only write a single component after
       * lowering. Munge size_written here to match the allocated size of
       * chan_index.
       */
      exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
         ->size_written = chan_index.component_size(xbld.dispatch_width());

      return BROADCAST(src, component(chan_index, 0));
   }
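
   /* Usage sketch (registers are hypothetical): a possibly divergent
    * surface or sampler index is typically uniformized before it is
    * consumed by a send:
    *
    *    const brw_reg surface = bld.emit_uniformize(divergent_index);
    */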

   brw_reg
   move_to_vgrf(const brw_reg &src, unsigned num_components) const
   {
      brw_reg *const src_comps = new brw_reg[num_components];

      for (unsigned i = 0; i < num_components; i++)
         src_comps[i] = offset(src, *this, i);

      const brw_reg dst = vgrf(src.type, num_components);
      LOAD_PAYLOAD(dst, src_comps, num_components, 0);

      delete[] src_comps;

      return brw_reg(dst);
   }

   fs_inst *
   emit_undef_for_dst(const fs_inst *old_inst) const
   {
      assert(old_inst->dst.file == VGRF);
      fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                           retype(old_inst->dst, BRW_TYPE_UD));
      inst->size_written = old_inst->size_written;

      return inst;
   }

   /**
    * Assorted arithmetic ops.
    * @{
    */
#define _ALU1(prefix, op)                                       \
   fs_inst *                                                    \
   op(const brw_reg &dst, const brw_reg &src0) const            \
   {                                                            \
      assert(_dispatch_width == 1 ||                            \
             (dst.file >= VGRF && dst.stride != 0) ||           \
             (dst.file < VGRF && dst.hstride != 0));            \
      return emit(prefix##op, dst, src0);                       \
   }                                                            \
   brw_reg                                                      \
   op(const brw_reg &src0, fs_inst **out = NULL) const          \
   {                                                            \
      fs_inst *inst = op(vgrf(src0.type), src0);                \
      if (out) *out = inst;                                     \
      return inst->dst;                                         \
   }
#define ALU1(op) _ALU1(BRW_OPCODE_, op)
#define VIRT1(op) _ALU1(SHADER_OPCODE_, op)

   fs_inst *
   alu2(opcode op, const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
   {
      return emit(op, dst, src0, src1);
   }
   brw_reg
   alu2(opcode op, const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
   {
      enum brw_reg_type inferred_dst_type =
         brw_type_larger_of(src0.type, src1.type);
      fs_inst *inst = alu2(op, vgrf(inferred_dst_type), src0, src1);
      if (out) *out = inst;
      return inst->dst;
   }

#define _ALU2(prefix, op)                                                  \
   fs_inst *                                                               \
   op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const \
   {                                                                       \
      return alu2(prefix##op, dst, src0, src1);                            \
   }                                                                       \
   brw_reg                                                                 \
   op(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const \
   {                                                                       \
      return alu2(prefix##op, src0, src1, out);                            \
   }
#define ALU2(op) _ALU2(BRW_OPCODE_, op)
#define VIRT2(op) _ALU2(SHADER_OPCODE_, op)

#define ALU2_ACC(op)                                                       \
   fs_inst *                                                               \
   op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const \
   {                                                                       \
      fs_inst *inst = emit(BRW_OPCODE_##op, dst, src0, src1);              \
      inst->writes_accumulator = true;                                     \
      return inst;                                                         \
   }

#define ALU3(op)                                                           \
   fs_inst *                                                               \
   op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,        \
      const brw_reg &src2) const                                           \
   {                                                                       \
      return emit(BRW_OPCODE_##op, dst, src0, src1, src2);                 \
   }                                                                       \
   brw_reg                                                                 \
   op(const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,       \
      fs_inst **out = NULL) const                                          \
   {                                                                       \
      enum brw_reg_type inferred_dst_type =                                \
         brw_type_larger_of(brw_type_larger_of(src0.type, src1.type),      \
                            src2.type);                                    \
      fs_inst *inst = op(vgrf(inferred_dst_type), src0, src1, src2);       \
      if (out) *out = inst;                                                \
      return inst->dst;                                                    \
   }

   ALU3(ADD3)
   ALU2_ACC(ADDC)
   ALU2(AND)
   ALU2(ASR)
   ALU2(AVG)
   ALU3(BFE)
   ALU2(BFI1)
   ALU3(BFI2)
   ALU1(BFREV)
   ALU1(CBIT)
   ALU2(DP2)
   ALU2(DP3)
   ALU2(DP4)
   ALU2(DPH)
   ALU1(FBH)
   ALU1(FBL)
   ALU1(FRC)
   ALU3(DP4A)
   ALU2(LINE)
   ALU1(LZD)
   ALU2(MAC)
   ALU2_ACC(MACH)
   ALU3(MAD)
   ALU1(MOV)
   ALU2(MUL)
   ALU1(NOT)
   ALU2(OR)
   ALU2(PLN)
   ALU1(RNDD)
   ALU1(RNDE)
   ALU1(RNDU)
   ALU1(RNDZ)
   ALU2(ROL)
   ALU2(ROR)
   ALU2(SEL)
   ALU2(SHL)
   ALU2(SHR)
   ALU2_ACC(SUBB)
   ALU2(XOR)

   VIRT1(RCP)
   VIRT1(RSQ)
   VIRT1(SQRT)
   VIRT1(EXP2)
   VIRT1(LOG2)
   VIRT2(POW)
   VIRT2(INT_QUOTIENT)
   VIRT2(INT_REMAINDER)
   VIRT1(SIN)
   VIRT1(COS)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef VIRT2
#undef _ALU2
#undef ALU1
#undef VIRT1
#undef _ALU1
   /** @} */

   fs_inst *
   ADD(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
   {
      return alu2(BRW_OPCODE_ADD, dst, src0, src1);
   }

   brw_reg
   ADD(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
   {
      if (src1.file == IMM && src1.ud == 0 && !out)
         return src0;

      return alu2(BRW_OPCODE_ADD, src0, src1, out);
   }
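
   /* Usage sketch (registers are hypothetical): most ALU helpers come in
    * both forms; the destination-less overload allocates a temporary of
    * the inferred type and returns it:
    *
    *    bld.ADD(dst, a, b);                  // write into an existing dst
    *    const brw_reg sum = bld.ADD(a, b);   // allocate and return a VGRF
    */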

   /**
    * CMP: Sets the low bit of the destination channels with the result
    * of the comparison, while the upper bits are undefined, and updates
    * the flag register with the packed 16 bits of the result.
    */
   fs_inst *
   CMP(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
       brw_conditional_mod condition) const
   {
      /* Take the instruction:
       *
       * CMP null<d> src0<f> src1<f>
       *
       * Original gfx4 does type conversion to the destination type
       * before comparison, producing garbage results for floating
       * point comparisons.
       */
      const enum brw_reg_type type =
         dst.is_null() ?
         src0.type :
         brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

      return set_condmod(condition,
                         emit(BRW_OPCODE_CMP, retype(dst, type),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }
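
   /* Usage sketch (registers are hypothetical): a CMP with a null
    * destination is commonly used only for its flag-register update,
    * which can then predicate a following instruction:
    *
    *    bld.CMP(bld.null_reg_d(), x, brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
    *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, a, b));
    */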

   /**
    * CMPN: Behaves like CMP, but produces true if src1 is NaN.
    */
   fs_inst *
   CMPN(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
        brw_conditional_mod condition) const
   {
      /* Take the instruction:
       *
       * CMP null<d> src0<f> src1<f>
       *
       * Original gfx4 does type conversion to the destination type
       * before comparison, producing garbage results for floating
       * point comparisons.
       */
      const enum brw_reg_type type =
         dst.is_null() ?
         src0.type :
         brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

      return set_condmod(condition,
                         emit(BRW_OPCODE_CMPN, retype(dst, type),
                              fix_unsigned_negate(src0),
                              fix_unsigned_negate(src1)));
   }

   /**
    * Gfx4 predicated IF.
    */
   fs_inst *
   IF(brw_predicate predicate) const
   {
      return set_predicate(predicate, emit(BRW_OPCODE_IF));
   }

   /**
    * CSEL: dst = src2 <op> 0.0f ? src0 : src1
    */
   fs_inst *
   CSEL(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
        const brw_reg &src2, brw_conditional_mod condition) const
   {
      return set_condmod(condition,
                         emit(BRW_OPCODE_CSEL,
                              retype(dst, src2.type),
                              retype(src0, src2.type),
                              retype(src1, src2.type),
                              src2));
   }

   /**
    * Emit a linear interpolation instruction.
    */
   fs_inst *
   LRP(const brw_reg &dst, const brw_reg &x, const brw_reg &y,
       const brw_reg &a) const
   {
      if (shader->devinfo->ver <= 10) {
         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
          * we need to reorder the operands.
          */
         return emit(BRW_OPCODE_LRP, dst, a, y, x);

      } else {
         /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
         const brw_reg y_times_a = vgrf(dst.type);
         const brw_reg one_minus_a = vgrf(dst.type);
         const brw_reg x_times_one_minus_a = vgrf(dst.type);

         MUL(y_times_a, y, a);
         ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
         MUL(x_times_one_minus_a, x, brw_reg(one_minus_a));
         return ADD(dst, brw_reg(x_times_one_minus_a), brw_reg(y_times_a));
      }
   }

   /**
    * Collect a number of registers in a contiguous range of registers.
    */
   fs_inst *
   LOAD_PAYLOAD(const brw_reg &dst, const brw_reg *src,
                unsigned sources, unsigned header_size) const
   {
      fs_inst *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
      inst->header_size = header_size;
      inst->size_written = header_size * REG_SIZE;
      for (unsigned i = header_size; i < sources; i++) {
         inst->size_written += dispatch_width() * brw_type_size_bytes(src[i].type) *
                               dst.stride;
      }

      return inst;
   }

   fs_inst *
   VEC(const brw_reg &dst, const brw_reg *src, unsigned sources) const
   {
      return sources == 1 ? MOV(dst, src[0])
                          : LOAD_PAYLOAD(dst, src, sources, 0);
   }

   fs_inst *
   SYNC(enum tgl_sync_function sync) const
   {
      return emit(BRW_OPCODE_SYNC, null_reg_ud(), brw_imm_ud(sync));
   }

   fs_inst *
   UNDEF(const brw_reg &dst) const
   {
      assert(dst.file == VGRF);
      assert(dst.offset % REG_SIZE == 0);
      fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                           retype(dst, BRW_TYPE_UD));
      inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

      return inst;
   }

   fs_inst *
   DPAS(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,
        unsigned sdepth, unsigned rcount) const
   {
      assert(_dispatch_width == 8 * reg_unit(shader->devinfo));
      assert(sdepth == 8);
      assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

      fs_inst *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
      inst->sdepth = sdepth;
      inst->rcount = rcount;

      if (dst.type == BRW_TYPE_HF) {
         inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE / 2;
      } else {
         inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE;
      }

      return inst;
   }

   void
   VARYING_PULL_CONSTANT_LOAD(const brw_reg &dst,
                              const brw_reg &surface,
                              const brw_reg &surface_handle,
                              const brw_reg &varying_offset,
                              uint32_t const_offset,
                              uint8_t alignment,
                              unsigned components) const
   {
      assert(components <= 4);

      /* We have our constant surface use a pitch of 4 bytes, so our index can
       * be any component of a vector, and then we load 4 contiguous
       * components starting from that. TODO: Support loading fewer than 4.
       */
      brw_reg total_offset = ADD(varying_offset, brw_imm_ud(const_offset));

      /* The pull load message will load a vec4 (16 bytes). If we are loading
       * a double this means we are only loading 2 elements worth of data.
       * We also want to use a 32-bit data type for the dst of the load operation
       * so other parts of the driver don't get confused about the size of the
       * result.
       */
      brw_reg vec4_result = vgrf(BRW_TYPE_F, 4);

      brw_reg srcs[PULL_VARYING_CONSTANT_SRCS];
      srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
      srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
      srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = total_offset;
      srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment);

      fs_inst *inst = emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                           vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
      inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

      shuffle_from_32bit_read(dst, vec4_result, 0, components);
   }

   brw_reg
   LOAD_SUBGROUP_INVOCATION() const
   {
      brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
      exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
      return reg;
   }

   brw_reg
   BROADCAST(brw_reg value, brw_reg index) const
   {
      const brw_builder xbld = scalar_group();
      const brw_reg dst = xbld.vgrf(value.type);

      assert(is_uniform(index));

      /* A broadcast will always be at the full dispatch width even if the
       * use of the broadcast result is smaller. If the source is_scalar,
       * it may be allocated at less than the full dispatch width (e.g.,
       * allocated at SIMD8 with SIMD32 dispatch). The input may or may
       * not be stride=0. If it is not, the generated broadcast
       *
       *    broadcast(32) dst, value<1>, index<0>
       *
       * is invalid because it may read out of bounds from value.
       *
       * To account for this, modify the stride of an is_scalar input to be
       * zero.
       */
      if (value.is_scalar)
         value = component(value, 0);

      /* Ensure that the source of a broadcast is always register aligned.
       * See brw_broadcast() non-scalar case for more details.
       */
      if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
         value = MOV(value);

      /* BROADCAST will only write a single component after lowering. Munge
       * size_written here to match the allocated size of dst.
       */
      exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index)
         ->size_written = dst.component_size(xbld.dispatch_width());

      return component(dst, 0);
   }

   fs_visitor *shader;

   fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
   fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
   fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
   fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
   fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
   fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

   bool has_writemask_all() const {
      return force_writemask_all;
   }

private:
   /**
    * Workaround for negation of UD registers. See comment in
    * brw_generator::generate_code() for more details.
    */
   brw_reg
   fix_unsigned_negate(const brw_reg &src) const
   {
      if (src.type == BRW_TYPE_UD &&
          src.negate) {
         brw_reg temp = vgrf(BRW_TYPE_UD);
         MOV(temp, src);
         return brw_reg(temp);
      } else {
         return src;
      }
   }

   /**
    * Workaround for source register modes not supported by the ternary
    * instruction encoding.
    */
   brw_reg
   fix_3src_operand(const brw_reg &src) const
   {
      switch (src.file) {
      case FIXED_GRF:
         /* FINISHME: Could handle scalar region, other stride=1 regions */
         if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
             src.width != BRW_WIDTH_8 ||
             src.hstride != BRW_HORIZONTAL_STRIDE_1)
            break;
         FALLTHROUGH;
      case ATTR:
      case VGRF:
      case UNIFORM:
      case IMM:
         return src;
      default:
         break;
      }

      brw_reg expanded = vgrf(src.type);
      MOV(expanded, src);
      return expanded;
   }

   void shuffle_from_32bit_read(const brw_reg &dst,
                                const brw_reg &src,
                                uint32_t first_component,
                                uint32_t components) const;

   bblock_t *block;
   exec_node *cursor;

   unsigned _dispatch_width;
   unsigned _group;
   bool force_writemask_all;

   /** Debug annotation info. */
   struct {
      const char *str;
   } annotation;
};

/**
 * Offset by a number of components into a VGRF
 *
 * It is assumed that the VGRF represents a vector (e.g., returned by
 * load_uniform or a texture operation). Convergent and divergent values are
 * stored differently, so care must be taken to offset properly.
 */
static inline brw_reg
offset(const brw_reg &reg, const brw_builder &bld, unsigned delta)
{
   /* If the value is convergent (stored as one or more SIMD8), offset using
    * SIMD8 and select component 0.
    */
   if (reg.is_scalar) {
      const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo);

      brw_reg offset_reg = offset(reg, allocation_width, delta);

      /* If the dispatch width is larger than the allocation width, that
       * implies that the register can only be used as a source. Otherwise the
       * instruction would write past the allocation size of the register.
       */
      if (bld.dispatch_width() > allocation_width)
         return component(offset_reg, 0);
      else
         return offset_reg;
   }

   /* Offset to the component assuming the value was allocated in
    * dispatch_width units.
    */
   return offset(reg, bld.dispatch_width(), delta);
}
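
/* Usage sketch (registers are hypothetical): walking the components of a
 * multi-component value, e.g. a texture result, with the builder-aware
 * offset() helper:
 *
 *    for (unsigned i = 0; i < num_components; i++)
 *       bld.MOV(offset(dst, bld, i), offset(src, bld, i));
 */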

brw_reg brw_sample_mask_reg(const brw_builder &bld);
void brw_emit_predicate_on_sample_mask(const brw_builder &bld, fs_inst *inst);

brw_reg
brw_fetch_payload_reg(const brw_builder &bld, uint8_t regs[2],
                      brw_reg_type type = BRW_TYPE_F,
                      unsigned n = 1);

brw_reg
brw_fetch_barycentric_reg(const brw_builder &bld, uint8_t regs[2]);

void
brw_check_dynamic_msaa_flag(const brw_builder &bld,
                            const struct brw_wm_prog_data *wm_prog_data,
                            enum intel_msaa_flags flag);
