/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef ELK_FS_BUILDER_H
#define ELK_FS_BUILDER_H

#include "elk_ir_fs.h"
#include "elk_shader.h"
#include "elk_eu.h"
#include "elk_fs.h"

namespace elk {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * elk::vec4_builder. They cannot be fully interchangeable because
    * elk::fs_builder generates scalar code while elk::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef elk_fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef elk_fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef elk_fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(elk_fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      explicit fs_builder(elk_fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(elk_fs_visitor *shader, elk_bblock_t *block, elk_fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }
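
      /* Illustrative usage sketch, not part of the original documentation:
       * a pass that rewrites an existing instruction typically constructs a
       * builder at that instruction and emits new code before it, e.g.
       *
       *    const fs_builder ibld(v, block, inst);
       *    ibld.MOV(inst->dst, elk_imm_f(0.0f));
       *
       * where `v`, `block` and `inst` are placeholder names for the visitor,
       * basic block and instruction being rewritten.
       */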

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(elk_bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder. That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
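
      /* Illustrative sketch, not part of the original comments: assuming a
       * SIMD16 parent builder named `bld`, a caller could restrict emission
       * to the second group of eight channels with
       *
       *    const fs_builder ubld = bld.group(8, 1);
       *    ubld.MOV(dst, src);
       *
       * which is equivalent to bld.quarter(1) below. `bld`, `dst` and `src`
       * are placeholder names.
       */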

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum elk_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           unit * REG_SIZE) * unit),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
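
      /* Illustrative sketch, not part of the original comments: a pass might
       * allocate one float per channel, or four contiguous components, as
       *
       *    const dst_reg tmp  = bld.vgrf(ELK_REGISTER_TYPE_F);
       *    const dst_reg vec4 = bld.vgrf(ELK_REGISTER_TYPE_F, 4);
       *
       * where `bld` is a placeholder builder name.
       */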

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_RCP:
         case ELK_SHADER_OPCODE_RSQ:
         case ELK_SHADER_OPCODE_SQRT:
         case ELK_SHADER_OPCODE_EXP2:
         case ELK_SHADER_OPCODE_LOG2:
         case ELK_SHADER_OPCODE_SIN:
         case ELK_SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_POW:
         case ELK_SHADER_OPCODE_INT_QUOTIENT:
         case ELK_SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case ELK_OPCODE_BFE:
         case ELK_OPCODE_BFI2:
         case ELK_OPCODE_MAD:
         case ELK_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, elk_conditional_mod mod) const
      {
         assert(mod == ELK_CONDITIONAL_GE || mod == ELK_CONDITIONAL_L);

         /* In some cases we can't have bytes as an operand for src1, so use
          * the same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
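
      /* Illustrative sketch, not part of the original comments:
       *
       *    bld.emit_minmax(dst, a, b, ELK_CONDITIONAL_GE);  // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, ELK_CONDITIONAL_L);   // dst = min(a, b)
       *
       * with `bld`, `dst`, `a` and `b` as placeholder names.
       */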

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(ELK_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
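
      /* Illustrative sketch, not part of the original comments: a typical use
       * is making a divergent surface index uniform before a send, e.g.
       *
       *    const src_reg surf = bld.emit_uniformize(divergent_surf);
       *
       * where `bld` and `divergent_surf` are placeholder names.
       */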

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum elk_opcode opcode, elk_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == ELK_REGISTER_TYPE_Q ||
              tmp.type == ELK_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case ELK_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case ELK_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == ELK_CONDITIONAL_L || mod == ELK_CONDITIONAL_GE);
               if (mod == ELK_CONDITIONAL_GE)
                  mod = ELK_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, ELK_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, ELK_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               elk_reg_type type32 = elk_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                * l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, ELK_REGISTER_TYPE_UD),
                   retype(right_low, ELK_REGISTER_TYPE_UD), mod);
               set_predicate(ELK_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 ELK_CONDITIONAL_EQ));
               set_predicate_inv(ELK_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum elk_opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, elk_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle. Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }

      instruction *
      emit_undef_for_dst(const instruction *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(old_inst->dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op) \
      instruction * \
      op(const dst_reg &dst, const src_reg &src0) const \
      { \
         return emit(ELK_OPCODE_##op, dst, src0); \
      }

#define ALU2(op) \
      instruction * \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      { \
         return emit(ELK_OPCODE_##op, dst, src0, src1); \
      }

#define ALU2_ACC(op) \
      instruction * \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      { \
         instruction *inst = emit(ELK_OPCODE_##op, dst, src0, src1); \
         inst->writes_accumulator = true; \
         return inst; \
      }

#define ALU3(op) \
      instruction * \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
         const src_reg &src2) const \
      { \
         return emit(ELK_OPCODE_##op, dst, src0, src1, src2); \
      }

      ALU2(ADD)
      ALU3(ADD3)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU3(DP4A)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1

      instruction *
      F32TO16(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_HF);
         assert(src.type == ELK_REGISTER_TYPE_F);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F32TO16,
                        retype(dst, ELK_REGISTER_TYPE_W), src);
         }
      }

      instruction *
      F16TO32(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_F);
         assert(src.type == ELK_REGISTER_TYPE_HF);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F16TO32,
                        dst, retype(src, ELK_REGISTER_TYPE_W));
         }
      }
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(elk_predicate predicate) const
      {
         return set_predicate(predicate, emit(ELK_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, elk_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons. Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == ELK_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(ELK_OPCODE_CSEL,
                                 retype(dst, ELK_REGISTER_TYPE_F),
                                 retype(src0, ELK_REGISTER_TYPE_F),
                                 retype(src1, ELK_REGISTER_TYPE_F),
                                 src2));
      }
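
      /* Illustrative sketch, not part of the original comments:
       *
       *    bld.CSEL(dst, a, b, cond, ELK_CONDITIONAL_NZ);  // dst = cond != 0.0f ? a : b
       *
       * with `bld`, `dst`, `a`, `b` and `cond` as placeholder names; `cond`
       * must be of float type, as asserted above.
       */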

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(ELK_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), elk_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(ELK_SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * type_sz(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      instruction *
      DPAS(const dst_reg &dst, const src_reg &src0, const src_reg &src1, const src_reg &src2,
           unsigned sdepth, unsigned rcount) const
      {
         assert(_dispatch_width == 8);
         assert(sdepth == 8);
         assert(rcount == 1 || rcount == 2 || rcount == 4 || rcount == 8);

         instruction *inst = emit(ELK_OPCODE_DPAS, dst, src0, src1, src2);
         inst->sdepth = sdepth;
         inst->rcount = rcount;

         if (dst.type == ELK_REGISTER_TYPE_HF) {
            inst->size_written = rcount * REG_SIZE / 2;
         } else {
            inst->size_written = rcount * REG_SIZE;
         }

         return inst;
      }

      elk_fs_visitor *shader;

      elk_fs_inst *BREAK()    { return emit(ELK_OPCODE_BREAK); }
      elk_fs_inst *DO()       { return emit(ELK_OPCODE_DO); }
      elk_fs_inst *ENDIF()    { return emit(ELK_OPCODE_ENDIF); }
      elk_fs_inst *NOP()      { return emit(ELK_OPCODE_NOP); }
      elk_fs_inst *WHILE()    { return emit(ELK_OPCODE_WHILE); }
      elk_fs_inst *CONTINUE() { return emit(ELK_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * elk_fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == ELK_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(ELK_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != ELK_VERTICAL_STRIDE_8 ||
                src.width != ELK_WIDTH_8 ||
                src.hstride != ELK_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      elk_bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

static inline elk_fs_reg
offset(const elk_fs_reg &reg, const elk::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}

#endif