/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#include "brw_ir_fs.h"
#include "brw_eu.h"
#include "brw_fs.h"

static inline brw_reg offset(const brw_reg &, const brw::fs_builder &,
                             unsigned);

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    */
   class fs_builder {
   public:
      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(fs_visitor *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
#ifndef NDEBUG
         annotation.str = inst->annotation;
#else
         annotation.str = NULL;
#endif
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
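
      /* Illustrative usage sketch (not part of the original header): assuming
       * `v` points to a valid fs_visitor, a builder that appends at the end
       * of the program can be created and used like this. The variable names
       * are placeholders chosen for the example.
       *
       *    const fs_builder bld = fs_builder(v).at_end();
       *    brw_reg tmp = bld.vgrf(BRW_TYPE_F);   // one dispatch_width-wide temporary
       *    bld.MOV(tmp, brw_imm_f(1.0f));        // tmp = 1.0f in every channel
       */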

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder. That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
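
      /* Illustrative sketch (not from the original header): with a SIMD16
       * builder, group(8, 1) yields a SIMD8 builder covering channels 8..15,
       * e.g. to emit the second half of an operation separately. `bld16`,
       * `dst` and `src` are placeholder names.
       *
       *    const fs_builder half1 = bld16.group(8, 1);   // channels 8..15
       *    half1.MOV(dst, src);                          // SIMD8, second slot group
       */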

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder for SIMD8-as-scalar
       */
      fs_builder
      scalar_group() const
      {
         return exec_all().group(8 * reg_unit(shader->devinfo), 0);
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      brw_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return brw_vgrf(shader->alloc.allocate(
                               DIV_ROUND_UP(n * brw_type_size_bytes(type) * dispatch_width(),
                                            unit * REG_SIZE) * unit),
                            type);
         else
            return retype(null_reg_ud(), type);
      }
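
      /* Illustrative sketch (not from the original header): allocating space
       * for a two-component value and writing its second component. `bld` is
       * a placeholder builder name; offset() is the helper defined at the
       * bottom of this file.
       *
       *    brw_reg vec2 = bld.vgrf(BRW_TYPE_F, 2);           // 2 * dispatch_width floats
       *    bld.MOV(offset(vec2, bld, 1), brw_imm_f(0.0f));   // second component = 0.0f
       */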

      brw_reg
      vaddr(enum brw_reg_type type, unsigned subnr) const
      {
         brw_reg addr = brw_address_reg(subnr);
         addr.nr = shader->next_address_register_nr++;
         return retype(addr, type);
      }

      /**
       * Create a null register of floating type.
       */
      brw_reg
      null_reg_f() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_F));
      }

      brw_reg
      null_reg_df() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      brw_reg
      null_reg_d() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      brw_reg
      null_reg_ud() const
      {
         return brw_reg(retype(brw_null_reg(), BRW_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      fs_inst *
      emit(const fs_inst &inst) const
      {
         return emit(new(shader->mem_ctx) fs_inst(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode) const
      {
         return emit(fs_inst(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst, src0));
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1) const
      {
         return emit(fs_inst(opcode, dispatch_width(), dst,
                             src0, src1));
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg &src0,
           const brw_reg &src1, const brw_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                fix_3src_operand(src0),
                                fix_3src_operand(src1),
                                fix_3src_operand(src2)));

         default:
            return emit(fs_inst(opcode, dispatch_width(), dst,
                                src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      fs_inst *
      emit(enum opcode opcode, const brw_reg &dst, const brw_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(fs_inst(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      fs_inst *
      emit(fs_inst *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
#ifndef NDEBUG
         inst->annotation = annotation.str;
#endif

         if (block)
            static_cast<fs_inst *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      fs_inst *
      emit_minmax(const brw_reg &dst, const brw_reg &src0,
                  const brw_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
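
      /* Illustrative sketch (not from the original header): BRW_CONDITIONAL_GE
       * selects the larger source, BRW_CONDITIONAL_L the smaller one. `dst`,
       * `a` and `b` are placeholder registers.
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_L);    // dst = min(a, b)
       */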

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      brw_reg
      emit_uniformize(const brw_reg &src) const
      {
         /* Trivial: skip unnecessary work and retain IMM */
         if (src.file == IMM)
            return src;

         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder xbld = scalar_group();
         const brw_reg chan_index = xbld.vgrf(BRW_TYPE_UD);

         /* FIND_LIVE_CHANNEL will only write a single component after
          * lowering. Munge size_written here to match the allocated size of
          * chan_index.
          */
         exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)
            ->size_written = chan_index.component_size(xbld.dispatch_width());

         return BROADCAST(src, component(chan_index, 0));
      }
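
      /* Illustrative sketch (not from the original header): a typical use is
       * making a possibly divergent surface index uniform before a send,
       * where `surf_index` is a placeholder register name.
       *
       *    const brw_reg uniform_surf = bld.emit_uniformize(surf_index);
       *    // uniform_surf reads the same value in every channel and can be
       *    // used where a scalar descriptor source is required.
       */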

      brw_reg
      move_to_vgrf(const brw_reg &src, unsigned num_components) const
      {
         brw_reg *const src_comps = new brw_reg[num_components];

         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, *this, i);

         const brw_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return brw_reg(dst);
      }

      fs_inst *
      emit_undef_for_dst(const fs_inst *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                              retype(old_inst->dst, BRW_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define _ALU1(prefix, op)                                        \
      fs_inst *                                                  \
      op(const brw_reg &dst, const brw_reg &src0) const          \
      {                                                          \
         assert(_dispatch_width == 1 ||                          \
                (dst.file >= VGRF && dst.stride != 0) ||         \
                (dst.file < VGRF && dst.hstride != 0));          \
         return emit(prefix##op, dst, src0);                     \
      }                                                          \
      brw_reg                                                    \
      op(const brw_reg &src0, fs_inst **out = NULL) const        \
      {                                                          \
         fs_inst *inst = op(vgrf(src0.type), src0);              \
         if (out) *out = inst;                                   \
         return inst->dst;                                       \
      }
#define ALU1(op) _ALU1(BRW_OPCODE_, op)
#define VIRT1(op) _ALU1(SHADER_OPCODE_, op)

      fs_inst *
      alu2(opcode op, const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return emit(op, dst, src0, src1);
      }
      brw_reg
      alu2(opcode op, const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         enum brw_reg_type inferred_dst_type =
            brw_type_larger_of(src0.type, src1.type);
         fs_inst *inst = alu2(op, vgrf(inferred_dst_type), src0, src1);
         if (out) *out = inst;
         return inst->dst;
      }

#define _ALU2(prefix, op)                                                      \
      fs_inst *                                                                \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const   \
      {                                                                        \
         return alu2(prefix##op, dst, src0, src1);                             \
      }                                                                        \
      brw_reg                                                                  \
      op(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const \
      {                                                                        \
         return alu2(prefix##op, src0, src1, out);                             \
      }
#define ALU2(op) _ALU2(BRW_OPCODE_, op)
#define VIRT2(op) _ALU2(SHADER_OPCODE_, op)

#define ALU2_ACC(op)                                                           \
      fs_inst *                                                                \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const   \
      {                                                                        \
         fs_inst *inst = emit(BRW_OPCODE_##op, dst, src0, src1);               \
         inst->writes_accumulator = true;                                      \
         return inst;                                                          \
      }

#define ALU3(op)                                                               \
      fs_inst *                                                                \
      op(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,         \
         const brw_reg &src2) const                                            \
      {                                                                        \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);                  \
      }                                                                        \
      brw_reg                                                                  \
      op(const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,        \
         fs_inst **out = NULL) const                                           \
      {                                                                        \
         enum brw_reg_type inferred_dst_type =                                 \
            brw_type_larger_of(brw_type_larger_of(src0.type, src1.type),       \
                               src2.type);                                     \
         fs_inst *inst = op(vgrf(inferred_dst_type), src0, src1, src2);        \
         if (out) *out = inst;                                                 \
         return inst->dst;                                                     \
      }

      ALU3(ADD3)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU3(DP4A)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

      VIRT1(RCP)
      VIRT1(RSQ)
      VIRT1(SQRT)
      VIRT1(EXP2)
      VIRT1(LOG2)
      VIRT2(POW)
      VIRT2(INT_QUOTIENT)
      VIRT2(INT_REMAINDER)
      VIRT1(SIN)
      VIRT1(COS)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef VIRT2
#undef _ALU2
#undef ALU1
#undef VIRT1
#undef _ALU1
      /** @} */

      fs_inst *
      ADD(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1) const
      {
         return alu2(BRW_OPCODE_ADD, dst, src0, src1);
      }

      brw_reg
      ADD(const brw_reg &src0, const brw_reg &src1, fs_inst **out = NULL) const
      {
         if (src1.file == IMM && src1.ud == 0 && !out)
            return src0;

         return alu2(BRW_OPCODE_ADD, src0, src1, out);
      }

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      fs_inst *
      CMP(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
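
      /* Illustrative sketch (not from the original header): comparing two
       * floats and predicating a following SEL on the result. `a`, `b`, `x`,
       * `y` and `dst` are placeholder registers; the flag register is
       * updated implicitly by CMP.
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_L);       // flag = (a < b)
       *    set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dst, x, y));  // dst = flag ? x : y
       */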

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      fs_inst *
      CMPN(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          */
         const enum brw_reg_type type =
            dst.is_null() ?
            src0.type :
            brw_type_with_size(src0.type, brw_type_size_bits(dst.type));

         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      fs_inst *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      fs_inst *
      CSEL(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1,
           const brw_reg &src2, brw_conditional_mod condition) const
      {
         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, src2.type),
                                 retype(src0, src2.type),
                                 retype(src1, src2.type),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      fs_inst *
      LRP(const brw_reg &dst, const brw_reg &x, const brw_reg &y,
          const brw_reg &a) const
      {
         if (shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const brw_reg y_times_a = vgrf(dst.type);
            const brw_reg one_minus_a = vgrf(dst.type);
            const brw_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, brw_reg(one_minus_a));
            return ADD(dst, brw_reg(x_times_one_minus_a), brw_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      fs_inst *
      LOAD_PAYLOAD(const brw_reg &dst, const brw_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         fs_inst *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * brw_type_size_bytes(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }

      fs_inst *
      VEC(const brw_reg &dst, const brw_reg *src, unsigned sources) const
      {
         return sources == 1 ? MOV(dst, src[0])
                             : LOAD_PAYLOAD(dst, src, sources, 0);
      }

      fs_inst *
      SYNC(enum tgl_sync_function sync) const
      {
         return emit(BRW_OPCODE_SYNC, null_reg_ud(), brw_imm_ud(sync));
      }

      fs_inst *
      UNDEF(const brw_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         fs_inst *inst = emit(SHADER_OPCODE_UNDEF,
                              retype(dst, BRW_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      fs_inst *
      DPAS(const brw_reg &dst, const brw_reg &src0, const brw_reg &src1, const brw_reg &src2,
           unsigned sdepth, unsigned rcount) const
      {
         assert(_dispatch_width == 8 * reg_unit(shader->devinfo));
         assert(sdepth == 8);
         assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

         fs_inst *inst = emit(BRW_OPCODE_DPAS, dst, src0, src1, src2);
         inst->sdepth = sdepth;
         inst->rcount = rcount;

         if (dst.type == BRW_TYPE_HF) {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE / 2;
         } else {
            inst->size_written = reg_unit(shader->devinfo) * rcount * REG_SIZE;
         }

         return inst;
      }

      void
      VARYING_PULL_CONSTANT_LOAD(const brw_reg &dst,
                                 const brw_reg &surface,
                                 const brw_reg &surface_handle,
                                 const brw_reg &varying_offset,
                                 uint32_t const_offset,
                                 uint8_t alignment,
                                 unsigned components) const
      {
         assert(components <= 4);

         /* We have our constant surface use a pitch of 4 bytes, so our index can
          * be any component of a vector, and then we load 4 contiguous
          * components starting from that. TODO: Support loading fewer than 4.
          */
         brw_reg total_offset = ADD(varying_offset, brw_imm_ud(const_offset));

         /* The pull load message will load a vec4 (16 bytes). If we are loading
          * a double this means we are only loading 2 elements worth of data.
          * We also want to use a 32-bit data type for the dst of the load operation
          * so other parts of the driver don't get confused about the size of the
          * result.
          */
         brw_reg vec4_result = vgrf(BRW_TYPE_F, 4);

         brw_reg srcs[PULL_VARYING_CONSTANT_SRCS];
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE]        = surface;
         srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
         srcs[PULL_VARYING_CONSTANT_SRC_OFFSET]         = total_offset;
         srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT]      = brw_imm_ud(alignment);

         fs_inst *inst = emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                              vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
         inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

         shuffle_from_32bit_read(*this, dst, vec4_result, 0, components);
      }

      brw_reg
      LOAD_SUBGROUP_INVOCATION() const
      {
         brw_reg reg = vgrf(shader->dispatch_width < 16 ? BRW_TYPE_UD : BRW_TYPE_UW);
         exec_all().emit(SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION, reg);
         return reg;
      }

      brw_reg
      BROADCAST(brw_reg value, brw_reg index) const
      {
         const fs_builder xbld = scalar_group();
         const brw_reg dst = xbld.vgrf(value.type);

         assert(is_uniform(index));

         /* A broadcast will always be at the full dispatch width even if the
          * use of the broadcast result is smaller. If the source is_scalar,
          * it may be allocated at less than the full dispatch width (e.g.,
          * allocated at SIMD8 with SIMD32 dispatch). The input may or may
          * not be stride=0. If it is not, the generated broadcast
          *
          *    broadcast(32) dst, value<1>, index<0>
          *
          * is invalid because it may read out of bounds from value.
          *
          * To account for this, modify the stride of an is_scalar input to be
          * zero.
          */
         if (value.is_scalar)
            value = component(value, 0);

         /* Ensure that the source of a broadcast is always register aligned.
          * See brw_broadcast() non-scalar case for more details.
          */
         if (reg_offset(value) % (REG_SIZE * reg_unit(shader->devinfo)) != 0)
            value = MOV(value);

         /* BROADCAST will only write a single component after lowering. Munge
          * size_written here to match the allocated size of dst.
          */
         exec_all().emit(SHADER_OPCODE_BROADCAST, dst, value, index)
            ->size_written = dst.component_size(xbld.dispatch_width());

         return component(dst, 0);
      }
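
      /* Illustrative sketch (not from the original header): picking a single
       * channel of a vector value; the index must be uniform, e.g. an
       * immediate or an emit_uniformize() result. `value` is a placeholder.
       *
       *    brw_reg lane3 = bld.BROADCAST(value, brw_imm_ud(3));
       *    // lane3 holds channel 3 of value in every enabled channel.
       */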

      fs_visitor *shader;

      fs_inst *BREAK()    { return emit(BRW_OPCODE_BREAK); }
      fs_inst *DO()       { return emit(BRW_OPCODE_DO); }
      fs_inst *ENDIF()    { return emit(BRW_OPCODE_ENDIF); }
      fs_inst *NOP()      { return emit(BRW_OPCODE_NOP); }
      fs_inst *WHILE()    { return emit(BRW_OPCODE_WHILE); }
      fs_inst *CONTINUE() { return emit(BRW_OPCODE_CONTINUE); }

      bool has_writemask_all() const {
         return force_writemask_all;
      }

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * brw_generator::generate_code() for more details.
       */
      brw_reg
      fix_unsigned_negate(const brw_reg &src) const
      {
         if (src.type == BRW_TYPE_UD &&
             src.negate) {
            brw_reg temp = vgrf(BRW_TYPE_UD);
            MOV(temp, src);
            return brw_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      brw_reg
      fix_3src_operand(const brw_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         brw_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
      } annotation;
   };
}

/**
 * Offset by a number of components into a VGRF
 *
 * It is assumed that the VGRF represents a vector (e.g., returned by
 * load_uniform or a texture operation). Convergent and divergent values are
 * stored differently, so care must be taken to offset properly.
 */
static inline brw_reg
offset(const brw_reg &reg, const brw::fs_builder &bld, unsigned delta)
{
   /* If the value is convergent (stored as one or more SIMD8), offset using
    * SIMD8 and select component 0.
    */
   if (reg.is_scalar) {
      const unsigned allocation_width = 8 * reg_unit(bld.shader->devinfo);

      brw_reg offset_reg = offset(reg, allocation_width, delta);

      /* If the dispatch width is larger than the allocation width, that
       * implies that the register can only be used as a source. Otherwise the
       * instruction would write past the allocation size of the register.
       */
      if (bld.dispatch_width() > allocation_width)
         return component(offset_reg, 0);
      else
         return offset_reg;
   }

   /* Offset to the component assuming the value was allocated in
    * dispatch_width units.
    */
   return offset(reg, bld.dispatch_width(), delta);
}
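
/* Illustrative sketch (not part of the original header): copying the
 * components of a value returned by a texture or load operation, where
 * `tex_dst`, `dst` and `bld` are placeholder names.
 *
 *    for (unsigned c = 0; c < 4; c++)
 *       bld.MOV(offset(dst, bld, c), offset(tex_dst, bld, c));
 */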
959