/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#include "elk_ir_fs.h"
#include "elk_shader.h"
#include "elk_eu.h"
#include "elk_fs.h"

namespace elk {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * elk::vec4_builder. They cannot be fully interchangeable because
    * elk::fs_builder generates scalar code while elk::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef elk_fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef elk_fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef elk_fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(elk_fs_visitor *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }
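
      /* Usage sketch (hypothetical: assumes 'v' is a fully initialized
       * elk_fs_visitor). A freshly constructed builder has no insertion
       * point, so position it with at_end() or at() before emitting:
       *
       *    const fs_builder bld = fs_builder(v).at_end();
       *    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F);
       *    bld.MOV(tmp, elk_imm_f(1.0f));
       */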

      explicit fs_builder(elk_fs_visitor *s) : fs_builder(s, s->dispatch_width) {}

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block. The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(elk_fs_visitor *shader, elk_bblock_t *block, elk_fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(elk_bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder. That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with a channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
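
      /* Sketch: emit an operation on the second SIMD8 half of a SIMD16
       * computation ('bld' is assumed to be a SIMD16 builder with a valid
       * insertion point, 'dst' and 'src' SIMD16-sized registers):
       *
       *    const fs_builder hbld = bld.group(8, 1);
       *    hbld.MOV(horiz_offset(dst, 8), horiz_offset(src, 8));
       */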

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true. If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }
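
      /* Sketch: computations that must run even in inactive channels
       * (e.g. address or header setup) go through an exec_all() builder;
       * group(1, 0) additionally narrows it to a single channel:
       *
       *    const fs_builder ubld = bld.exec_all().group(1, 0);
       *    ubld.MOV(addr, elk_imm_ud(0));   // elk_imm_ud assumed available
       */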

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width. \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum elk_reg_type type, unsigned n = 1) const
      {
         const unsigned unit = reg_unit(shader->devinfo);
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           unit * REG_SIZE) * unit),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
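
      /* Sketch: each logical component occupies dispatch_width() channels,
       * so in a SIMD16 builder a two-component float temporary spans
       * 2 * 4 bytes * 16 channels = 128 bytes (four registers, assuming the
       * usual 32-byte REG_SIZE):
       *
       *    const dst_reg tmp = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
       *    const src_reg second = offset(src_reg(tmp), bld, 1);
       */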

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(elk_null_reg(), ELK_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_RCP:
         case ELK_SHADER_OPCODE_RSQ:
         case ELK_SHADER_OPCODE_SQRT:
         case ELK_SHADER_OPCODE_EXP2:
         case ELK_SHADER_OPCODE_LOG2:
         case ELK_SHADER_OPCODE_SIN:
         case ELK_SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case ELK_SHADER_OPCODE_POW:
         case ELK_SHADER_OPCODE_INT_QUOTIENT:
         case ELK_SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case ELK_OPCODE_BFE:
         case ELK_OPCODE_BFI2:
         case ELK_OPCODE_MAD:
         case ELK_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum elk_opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, elk_conditional_mod mod) const
      {
         assert(mod == ELK_CONDITIONAL_GE || mod == ELK_CONDITIONAL_L);

         /* In some cases we can't have bytes as an operand for src1, so use
          * the same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
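
      /* Sketch: per the doc comment above, src0 is selected when the
       * comparison holds, so:
       *
       *    bld.emit_minmax(dst, a, b, ELK_CONDITIONAL_GE);   // dst = max(a, b)
       *    bld.emit_minmax(dst, a, b, ELK_CONDITIONAL_L);    // dst = min(a, b)
       */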

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(ELK_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(ELK_SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
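
      /* Sketch: making a possibly divergent surface index uniform before
       * it feeds a send ('surf' is a hypothetical per-channel index):
       *
       *    const src_reg uniform_surf = bld.emit_uniformize(surf);
       */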

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum elk_opcode opcode, elk_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == ELK_REGISTER_TYPE_Q ||
              tmp.type == ELK_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case ELK_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case ELK_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == ELK_CONDITIONAL_L || mod == ELK_CONDITIONAL_GE);
               if (mod == ELK_CONDITIONAL_GE)
                  mod = ELK_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, ELK_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, ELK_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               elk_reg_type type32 = elk_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *    l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, ELK_REGISTER_TYPE_UD),
                   retype(right_low, ELK_REGISTER_TYPE_UD), mod);
               set_predicate(ELK_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 ELK_CONDITIONAL_EQ));
               set_predicate_inv(ELK_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(ELK_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }

      void
      emit_scan(enum elk_opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, elk_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle. Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
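
      /* Sketch: an inclusive add-scan across a SIMD8 cluster. For
       * dispatch width 8 this expands to strided self-adds on tmp:
       * pairs first, then groups of four, then a final add of element 3
       * into elements 4..7:
       *
       *    bld.MOV(tmp, src);
       *    bld.emit_scan(ELK_OPCODE_ADD, tmp, 8, ELK_CONDITIONAL_NONE);
       *
       * ELK_CONDITIONAL_NONE is assumed here as the "no condition" enum
       * value for ops that don't need a conditional mod.
       */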

      instruction *
      emit_undef_for_dst(const instruction *old_inst) const
      {
         assert(old_inst->dst.file == VGRF);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(old_inst->dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = old_inst->size_written;

         return inst;
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(ELK_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(ELK_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(ELK_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1

      instruction *
      F32TO16(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_HF);
         assert(src.type == ELK_REGISTER_TYPE_F);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F32TO16,
                        retype(dst, ELK_REGISTER_TYPE_W), src);
         }
      }

      instruction *
      F16TO32(const dst_reg &dst, const src_reg &src) const
      {
         assert(dst.type == ELK_REGISTER_TYPE_F);
         assert(src.type == ELK_REGISTER_TYPE_HF);

         if (shader->devinfo->ver >= 8) {
            return MOV(dst, src);
         } else {
            assert(shader->devinfo->ver == 7);
            return emit(ELK_OPCODE_F16TO32,
                        dst, retype(src, ELK_REGISTER_TYPE_W));
         }
      }
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *    CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
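
      /* Sketch: compare-then-predicate, the usual pattern for a
       * conditional move driven by the flag register (the conditional
       * mod's result typically lands in f0.0):
       *
       *    bld.CMP(bld.null_reg_f(), a, b, ELK_CONDITIONAL_GE);
       *    set_predicate(ELK_PREDICATE_NORMAL, bld.MOV(dst, a));
       */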

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           elk_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          *    CMPN null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(ELK_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(elk_predicate predicate) const
      {
         return set_predicate(predicate, emit(ELK_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, elk_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons. Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == ELK_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(ELK_OPCODE_CSEL,
                                 retype(dst, ELK_REGISTER_TYPE_F),
                                 retype(src0, ELK_REGISTER_TYPE_F),
                                 retype(src1, ELK_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0),
             * so we need to reorder the operands.
             */
            return emit(ELK_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), elk_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(ELK_SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written += dispatch_width() * type_sz(src[i].type) *
                                  dst.stride;
         }

         return inst;
      }
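
      /* Sketch: gathering two per-channel sources into one contiguous
       * payload, e.g. for a send message with no header ('coord_x' and
       * 'coord_y' are hypothetical):
       *
       *    const src_reg srcs[] = { coord_x, coord_y };
       *    const dst_reg payload = bld.vgrf(ELK_REGISTER_TYPE_F, 2);
       *    bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
       */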

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         assert(dst.offset % REG_SIZE == 0);
         instruction *inst = emit(ELK_SHADER_OPCODE_UNDEF,
                                  retype(dst, ELK_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE - dst.offset;

         return inst;
      }

      elk_fs_visitor *shader;

      elk_fs_inst *BREAK()    { return emit(ELK_OPCODE_BREAK); }
      elk_fs_inst *DO()       { return emit(ELK_OPCODE_DO); }
      elk_fs_inst *ENDIF()    { return emit(ELK_OPCODE_ENDIF); }
      elk_fs_inst *NOP()      { return emit(ELK_OPCODE_NOP); }
      elk_fs_inst *WHILE()    { return emit(ELK_OPCODE_WHILE); }
      elk_fs_inst *CONTINUE() { return emit(ELK_OPCODE_CONTINUE); }

   private:
      /**
       * Workaround for negation of UD registers. See comment in
       * elk_fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == ELK_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(ELK_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != ELK_VERTICAL_STRIDE_8 ||
                src.width != ELK_WIDTH_8 ||
                src.hstride != ELK_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      elk_bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

static inline elk_fs_reg
offset(const elk_fs_reg &reg, const elk::fs_builder &bld, unsigned delta)
{
   return offset(reg, bld.dispatch_width(), delta);
}
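
/* Usage sketch: step to the next logical component of a builder-sized
 * register, as in the move_to_vgrf() loop above:
 *
 *    const elk_fs_reg second_comp = offset(reg, bld, 1);
 */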