1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "elk_vec4.h"
25 #include "elk_fs.h"
26 #include "elk_eu.h"
27 #include "elk_cfg.h"
28 #include "elk_nir.h"
29 #include "elk_vec4_builder.h"
30 #include "elk_vec4_vs.h"
31 #include "elk_dead_control_flow.h"
32 #include "elk_private.h"
33 #include "dev/intel_debug.h"
34 #include "util/u_math.h"
35
36 #define MAX_INSTRUCTION (1 << 30)
37
38 using namespace elk;
39
40 namespace elk {
41
42 void
43 src_reg::init()
44 {
45 memset((void*)this, 0, sizeof(*this));
46 this->file = BAD_FILE;
47 this->type = ELK_REGISTER_TYPE_UD;
48 }
49
50 src_reg::src_reg(enum elk_reg_file file, int nr, const glsl_type *type)
51 {
52 init();
53
54 this->file = file;
55 this->nr = nr;
56 if (type && (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)))
57 this->swizzle = elk_swizzle_for_size(type->vector_elements);
58 else
59 this->swizzle = ELK_SWIZZLE_XYZW;
60 if (type)
61 this->type = elk_type_for_base_type(type);
62 }
63
64 /** Generic unset register constructor. */
65 src_reg::src_reg()
66 {
67 init();
68 }
69
70 src_reg::src_reg(struct ::elk_reg reg) :
71 elk_backend_reg(reg)
72 {
73 this->offset = 0;
74 this->reladdr = NULL;
75 }
76
77 src_reg::src_reg(const dst_reg &reg) :
78 elk_backend_reg(reg)
79 {
80 this->reladdr = reg.reladdr;
81 this->swizzle = elk_swizzle_for_mask(reg.writemask);
82 }
83
84 void
85 dst_reg::init()
86 {
87 memset((void*)this, 0, sizeof(*this));
88 this->file = BAD_FILE;
89 this->type = ELK_REGISTER_TYPE_UD;
90 this->writemask = WRITEMASK_XYZW;
91 }
92
93 dst_reg::dst_reg()
94 {
95 init();
96 }
97
98 dst_reg::dst_reg(enum elk_reg_file file, int nr)
99 {
100 init();
101
102 this->file = file;
103 this->nr = nr;
104 }
105
106 dst_reg::dst_reg(enum elk_reg_file file, int nr, const glsl_type *type,
107 unsigned writemask)
108 {
109 init();
110
111 this->file = file;
112 this->nr = nr;
113 this->type = elk_type_for_base_type(type);
114 this->writemask = writemask;
115 }
116
117 dst_reg::dst_reg(enum elk_reg_file file, int nr, elk_reg_type type,
118 unsigned writemask)
119 {
120 init();
121
122 this->file = file;
123 this->nr = nr;
124 this->type = type;
125 this->writemask = writemask;
126 }
127
128 dst_reg::dst_reg(struct ::elk_reg reg) :
129 elk_backend_reg(reg)
130 {
131 this->offset = 0;
132 this->reladdr = NULL;
133 }
134
135 dst_reg::dst_reg(const src_reg &reg) :
136 elk_backend_reg(reg)
137 {
138 this->writemask = elk_mask_for_swizzle(reg.swizzle);
139 this->reladdr = reg.reladdr;
140 }
141
142 bool
143 dst_reg::equals(const dst_reg &r) const
144 {
145 return (this->elk_backend_reg::equals(r) &&
146 (reladdr == r.reladdr ||
147 (reladdr && r.reladdr && reladdr->equals(*r.reladdr))));
148 }
149
150 bool
151 vec4_instruction::is_send_from_grf() const
152 {
153 switch (opcode) {
154 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
155 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
156 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
157 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
158 case ELK_VEC4_OPCODE_URB_READ:
159 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
160 case ELK_TCS_OPCODE_RELEASE_INPUT:
161 case ELK_SHADER_OPCODE_BARRIER:
162 return true;
163 default:
164 return false;
165 }
166 }
167
168 /**
169 * Returns true if this instruction's sources and destinations cannot
170 * safely be the same register.
171 *
172 * In most cases, a register can be written over safely by the same
173 * instruction that is its last use. For a single instruction, the
174 * sources are dereferenced before writing of the destination starts
175 * (naturally).
176 *
177 * However, there are a few cases where this can be problematic:
178 *
179 * - Virtual opcodes that translate to multiple instructions in the
180 * code generator: if src == dst and one instruction writes the
181 * destination before a later instruction reads the source, then
182 * src will have been clobbered.
183 *
184 * The register allocator uses this information to set up conflicts between
185 * GRF sources and the destination.
186 */
187 bool
188 vec4_instruction::has_source_and_destination_hazard() const
189 {
190 switch (opcode) {
191 case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
192 case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
193 case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
194 return true;
195 default:
196 /* 8-wide compressed DF operations are executed as two 4-wide operations,
197 * so we have a src/dst hazard if the first half of the instruction
198 * overwrites the source of the second half. Prevent this by marking
199 * compressed instructions as having src/dst hazards, so the register
200 * allocator assigns safe register regions for dst and srcs.
201 */
202 return size_written > REG_SIZE;
203 }
204 }
205
206 unsigned
207 vec4_instruction::size_read(unsigned arg) const
208 {
209 switch (opcode) {
210 case ELK_VEC4_OPCODE_UNTYPED_ATOMIC:
211 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_READ:
212 case ELK_VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
213 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
214 if (arg == 0)
215 return mlen * REG_SIZE;
216 break;
217 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
218 if (arg == 1)
219 return mlen * REG_SIZE;
220 break;
221 default:
222 break;
223 }
224
225 switch (src[arg].file) {
226 case BAD_FILE:
227 return 0;
228 case IMM:
229 case UNIFORM:
230 return 4 * type_sz(src[arg].type);
231 default:
232 /* XXX - Represent actual vertical stride. */
233 return exec_size * type_sz(src[arg].type);
234 }
235 }
236
237 bool
238 vec4_instruction::can_do_source_mods(const struct intel_device_info *devinfo)
239 {
240 if (devinfo->ver == 6 && is_math())
241 return false;
242
243 if (is_send_from_grf())
244 return false;
245
246 if (!elk_backend_instruction::can_do_source_mods())
247 return false;
248
249 return true;
250 }
251
252 bool
253 vec4_instruction::can_do_cmod()
254 {
255 if (!elk_backend_instruction::can_do_cmod())
256 return false;
257
258 /* The accumulator result appears to get used for the conditional modifier
259 * generation. When negating a UD value, there is a 33rd bit generated for
260 * the sign in the accumulator value, so now you can't check, for example,
261 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
262 */
263 for (unsigned i = 0; i < 3; i++) {
264 if (src[i].file != BAD_FILE &&
265 elk_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
266 return false;
267 }
268
269 return true;
270 }
271
272 bool
273 vec4_instruction::can_do_writemask(const struct intel_device_info *devinfo)
274 {
275 switch (opcode) {
276 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
277 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
278 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
279 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
280 case ELK_VEC4_OPCODE_TO_DOUBLE:
281 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
282 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
283 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
284 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
285 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
286 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD_GFX7:
287 case ELK_VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
288 case ELK_VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
289 case ELK_TES_OPCODE_CREATE_INPUT_READ_HEADER:
290 case ELK_TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
291 case ELK_VEC4_OPCODE_URB_READ:
292 case ELK_SHADER_OPCODE_MOV_INDIRECT:
293 case ELK_SHADER_OPCODE_TEX:
294 case ELK_FS_OPCODE_TXB:
295 case ELK_SHADER_OPCODE_TXD:
296 case ELK_SHADER_OPCODE_TXF:
297 case ELK_SHADER_OPCODE_TXF_LZ:
298 case ELK_SHADER_OPCODE_TXF_CMS:
299 case ELK_SHADER_OPCODE_TXF_CMS_W:
300 case ELK_SHADER_OPCODE_TXF_UMS:
301 case ELK_SHADER_OPCODE_TXF_MCS:
302 case ELK_SHADER_OPCODE_TXL:
303 case ELK_SHADER_OPCODE_TXL_LZ:
304 case ELK_SHADER_OPCODE_TXS:
305 case ELK_SHADER_OPCODE_LOD:
306 case ELK_SHADER_OPCODE_TG4:
307 case ELK_SHADER_OPCODE_TG4_OFFSET:
308 case ELK_SHADER_OPCODE_SAMPLEINFO:
309 return false;
310 default:
311 /* The MATH instruction on Gfx6 only executes in align1 mode, which does
312 * not support writemasking.
313 */
314 if (devinfo->ver == 6 && is_math())
315 return false;
316
317 return true;
318 }
319 }
320
321 bool
322 vec4_instruction::can_change_types() const
323 {
324 return dst.type == src[0].type &&
325 !src[0].abs && !src[0].negate && !saturate &&
326 (opcode == ELK_OPCODE_MOV ||
327 (opcode == ELK_OPCODE_SEL &&
328 dst.type == src[1].type &&
329 predicate != ELK_PREDICATE_NONE &&
330 !src[1].abs && !src[1].negate));
331 }
332
333 /**
334 * Returns how many MRFs an opcode will write over.
335 *
336 * Note that this is not the 0 or 1 implied writes in an actual gen
337 * instruction -- the generate_* functions generate additional MOVs
338 * for setup.
339 */
340 unsigned
341 vec4_instruction::implied_mrf_writes() const
342 {
343 if (mlen == 0 || is_send_from_grf())
344 return 0;
345
346 switch (opcode) {
347 case ELK_SHADER_OPCODE_RCP:
348 case ELK_SHADER_OPCODE_RSQ:
349 case ELK_SHADER_OPCODE_SQRT:
350 case ELK_SHADER_OPCODE_EXP2:
351 case ELK_SHADER_OPCODE_LOG2:
352 case ELK_SHADER_OPCODE_SIN:
353 case ELK_SHADER_OPCODE_COS:
354 return 1;
355 case ELK_SHADER_OPCODE_INT_QUOTIENT:
356 case ELK_SHADER_OPCODE_INT_REMAINDER:
357 case ELK_SHADER_OPCODE_POW:
358 case ELK_TCS_OPCODE_THREAD_END:
359 return 2;
360 case ELK_VEC4_VS_OPCODE_URB_WRITE:
361 return 1;
362 case ELK_VS_OPCODE_PULL_CONSTANT_LOAD:
363 return 2;
364 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
365 return 2;
366 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
367 return 3;
368 case ELK_VEC4_GS_OPCODE_URB_WRITE:
369 case ELK_VEC4_GS_OPCODE_URB_WRITE_ALLOCATE:
370 case ELK_GS_OPCODE_THREAD_END:
371 return 0;
372 case ELK_GS_OPCODE_FF_SYNC:
373 return 1;
374 case ELK_VEC4_TCS_OPCODE_URB_WRITE:
375 return 0;
376 case ELK_SHADER_OPCODE_TEX:
377 case ELK_SHADER_OPCODE_TXL:
378 case ELK_SHADER_OPCODE_TXD:
379 case ELK_SHADER_OPCODE_TXF:
380 case ELK_SHADER_OPCODE_TXF_CMS:
381 case ELK_SHADER_OPCODE_TXF_CMS_W:
382 case ELK_SHADER_OPCODE_TXF_MCS:
383 case ELK_SHADER_OPCODE_TXS:
384 case ELK_SHADER_OPCODE_TG4:
385 case ELK_SHADER_OPCODE_TG4_OFFSET:
386 case ELK_SHADER_OPCODE_SAMPLEINFO:
387 case ELK_SHADER_OPCODE_GET_BUFFER_SIZE:
388 return header_size;
389 default:
390 unreachable("not reached");
391 }
392 }
393
394 bool
395 src_reg::equals(const src_reg &r) const
396 {
397 return (this->elk_backend_reg::equals(r) &&
398 !reladdr && !r.reladdr);
399 }
400
401 bool
402 src_reg::negative_equals(const src_reg &r) const
403 {
404 return this->elk_backend_reg::negative_equals(r) &&
405 !reladdr && !r.reladdr;
406 }
407
408 bool
409 vec4_visitor::opt_vector_float()
410 {
411 bool progress = false;
412
413 foreach_block(block, cfg) {
414 unsigned last_reg = ~0u, last_offset = ~0u;
415 enum elk_reg_file last_reg_file = BAD_FILE;
416
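/* State for the current run of scalar immediate MOVs to one destination:
 * the VF encoding accumulated per channel, the MOVs that produced it, and
 * the combined writemask, flushed whenever the sequence is broken.
 */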
417 uint8_t imm[4] = { 0 };
418 int inst_count = 0;
419 vec4_instruction *imm_inst[4];
420 unsigned writemask = 0;
421 enum elk_reg_type dest_type = ELK_REGISTER_TYPE_F;
422
423 foreach_inst_in_block_safe(vec4_instruction, inst, block) {
424 int vf = -1;
425 enum elk_reg_type need_type = ELK_REGISTER_TYPE_LAST;
426
427 /* Look for unconditional MOVs from an immediate with a partial
428 * writemask. Skip type-conversion MOVs other than integer 0,
429 * where the type doesn't matter. See if the immediate can be
430 * represented as a VF.
431 */
432 if (inst->opcode == ELK_OPCODE_MOV &&
433 inst->src[0].file == IMM &&
434 inst->predicate == ELK_PREDICATE_NONE &&
435 inst->dst.writemask != WRITEMASK_XYZW &&
436 type_sz(inst->src[0].type) < 8 &&
437 (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
438
439 vf = elk_float_to_vf(inst->src[0].d);
440 need_type = ELK_REGISTER_TYPE_D;
441
442 if (vf == -1) {
443 vf = elk_float_to_vf(inst->src[0].f);
444 need_type = ELK_REGISTER_TYPE_F;
445 }
446 } else {
447 last_reg = ~0u;
448 }
449
450 /* If this wasn't a MOV, or the destination register doesn't match,
451 * or we have to switch destination types, then this breaks our
452 * sequence. Combine anything we've accumulated so far.
453 */
454 if (last_reg != inst->dst.nr ||
455 last_offset != inst->dst.offset ||
456 last_reg_file != inst->dst.file ||
457 (vf > 0 && dest_type != need_type)) {
458
459 if (inst_count > 1) {
460 unsigned vf;
461 memcpy(&vf, imm, sizeof(vf));
462 vec4_instruction *mov = MOV(imm_inst[0]->dst, elk_imm_vf(vf));
463 mov->dst.type = dest_type;
464 mov->dst.writemask = writemask;
465 inst->insert_before(block, mov);
466
467 for (int i = 0; i < inst_count; i++) {
468 imm_inst[i]->remove(block);
469 }
470
471 progress = true;
472 }
473
474 inst_count = 0;
475 last_reg = ~0u;
476 writemask = 0;
477 dest_type = ELK_REGISTER_TYPE_F;
478
479 for (int i = 0; i < 4; i++) {
480 imm[i] = 0;
481 }
482 }
483
484 /* Record this instruction's value (if it was representable). */
485 if (vf != -1) {
486 if ((inst->dst.writemask & WRITEMASK_X) != 0)
487 imm[0] = vf;
488 if ((inst->dst.writemask & WRITEMASK_Y) != 0)
489 imm[1] = vf;
490 if ((inst->dst.writemask & WRITEMASK_Z) != 0)
491 imm[2] = vf;
492 if ((inst->dst.writemask & WRITEMASK_W) != 0)
493 imm[3] = vf;
494
495 writemask |= inst->dst.writemask;
496 imm_inst[inst_count++] = inst;
497
498 last_reg = inst->dst.nr;
499 last_offset = inst->dst.offset;
500 last_reg_file = inst->dst.file;
501 if (vf > 0)
502 dest_type = need_type;
503 }
504 }
505 }
506
507 if (progress)
508 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
509
510 return progress;
511 }
512
513 /* Replaces unused channels of a swizzle with channels that are used.
514 *
515 * For instance, this pass transforms
516 *
517 * mov vgrf4.yz, vgrf5.wxzy
518 *
519 * into
520 *
521 * mov vgrf4.yz, vgrf5.xxzx
522 *
523 * This eliminates false uses of some channels, letting dead code elimination
524 * remove the instructions that wrote them.
525 */
526 bool
527 vec4_visitor::opt_reduce_swizzle()
528 {
529 bool progress = false;
530
531 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
532 if (inst->dst.file == BAD_FILE ||
533 inst->dst.file == ARF ||
534 inst->dst.file == FIXED_GRF ||
535 inst->is_send_from_grf())
536 continue;
537
538 unsigned swizzle;
539
540 /* Determine which channels of the sources are read. */
541 switch (inst->opcode) {
542 case ELK_VEC4_OPCODE_PACK_BYTES:
543 case ELK_OPCODE_DP4:
544 case ELK_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
545 * but all four of src1.
546 */
547 swizzle = elk_swizzle_for_size(4);
548 break;
549 case ELK_OPCODE_DP3:
550 swizzle = elk_swizzle_for_size(3);
551 break;
552 case ELK_OPCODE_DP2:
553 swizzle = elk_swizzle_for_size(2);
554 break;
555
556 case ELK_VEC4_OPCODE_TO_DOUBLE:
557 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
558 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
559 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
560 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
561 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
562 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
563 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
564 swizzle = elk_swizzle_for_size(4);
565 break;
566
567 default:
568 swizzle = elk_swizzle_for_mask(inst->dst.writemask);
569 break;
570 }
571
572 /* Update sources' swizzles. */
573 for (int i = 0; i < 3; i++) {
574 if (inst->src[i].file != VGRF &&
575 inst->src[i].file != ATTR &&
576 inst->src[i].file != UNIFORM)
577 continue;
578
579 const unsigned new_swizzle =
580 elk_compose_swizzle(swizzle, inst->src[i].swizzle);
581 if (inst->src[i].swizzle != new_swizzle) {
582 inst->src[i].swizzle = new_swizzle;
583 progress = true;
584 }
585 }
586 }
587
588 if (progress)
589 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
590
591 return progress;
592 }
593
594 void
595 vec4_visitor::split_uniform_registers()
596 {
597 /* Prior to this, uniforms have been in an array sized according to
598 * the number of vector uniforms present, sparsely filled (so an
599 * aggregate results in reg indices being skipped over). Now we're
600 * going to cut those aggregates up so each .nr index is one
601 * vector. The goal is to make elimination of unused uniform
602 * components easier later.
603 */
604 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
605 for (int i = 0 ; i < 3; i++) {
606 if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START)
607 continue;
608
609 assert(!inst->src[i].reladdr);
610
611 inst->src[i].nr += inst->src[i].offset / 16;
612 inst->src[i].offset %= 16;
613 }
614 }
615 }
616
617 /**
618 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
619 *
620 * While GLSL IR also performs this optimization, we end up with it in
621 * our instruction stream for a couple of reasons. One is that we
622 * sometimes generate silly instructions, for example in array access
623 * where we'll generate "ADD offset, index, base" even if base is 0.
624 * The other is that GLSL IR's constant propagation doesn't track the
625 * components of aggregates, so some VS patterns (initialize matrix to
626 * 0, accumulate in vertex blending factors) end up breaking down to
627 * instructions involving 0.
628 */
629 bool
630 vec4_visitor::opt_algebraic()
631 {
632 bool progress = false;
633
634 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
635 switch (inst->opcode) {
636 case ELK_OPCODE_MOV:
637 if (inst->src[0].file != IMM)
638 break;
639
640 if (inst->saturate) {
641 /* Full mixed-type saturates don't happen. However, we can end up
642 * with things like:
643 *
644 * mov.sat(8) g21<1>DF -1F
645 *
646 * Other mixed-size-but-same-base-type cases may also be possible.
647 */
648 if (inst->dst.type != inst->src[0].type &&
649 inst->dst.type != ELK_REGISTER_TYPE_DF &&
650 inst->src[0].type != ELK_REGISTER_TYPE_F)
651 assert(!"unimplemented: saturate mixed types");
652
653 if (elk_saturate_immediate(inst->src[0].type,
654 &inst->src[0].as_elk_reg())) {
655 inst->saturate = false;
656 progress = true;
657 }
658 }
659 break;
660
661 case ELK_OPCODE_OR:
662 if (inst->src[1].is_zero()) {
663 inst->opcode = ELK_OPCODE_MOV;
664 inst->src[1] = src_reg();
665 progress = true;
666 }
667 break;
668
669 case ELK_VEC4_OPCODE_UNPACK_UNIFORM:
670 if (inst->src[0].file != UNIFORM) {
671 inst->opcode = ELK_OPCODE_MOV;
672 progress = true;
673 }
674 break;
675
676 case ELK_OPCODE_ADD:
677 if (inst->src[1].is_zero()) {
678 inst->opcode = ELK_OPCODE_MOV;
679 inst->src[1] = src_reg();
680 progress = true;
681 }
682 break;
683
684 case ELK_OPCODE_MUL:
685 if (inst->src[1].file != IMM)
686 continue;
687
688 if (elk_reg_type_is_floating_point(inst->src[1].type))
689 break;
690
691 if (inst->src[1].is_zero()) {
692 inst->opcode = ELK_OPCODE_MOV;
693 switch (inst->src[0].type) {
694 case ELK_REGISTER_TYPE_F:
695 inst->src[0] = elk_imm_f(0.0f);
696 break;
697 case ELK_REGISTER_TYPE_D:
698 inst->src[0] = elk_imm_d(0);
699 break;
700 case ELK_REGISTER_TYPE_UD:
701 inst->src[0] = elk_imm_ud(0u);
702 break;
703 default:
704 unreachable("not reached");
705 }
706 inst->src[1] = src_reg();
707 progress = true;
708 } else if (inst->src[1].is_one()) {
709 inst->opcode = ELK_OPCODE_MOV;
710 inst->src[1] = src_reg();
711 progress = true;
712 } else if (inst->src[1].is_negative_one()) {
713 inst->opcode = ELK_OPCODE_MOV;
714 inst->src[0].negate = !inst->src[0].negate;
715 inst->src[1] = src_reg();
716 progress = true;
717 }
718 break;
719 case ELK_SHADER_OPCODE_BROADCAST:
720 if (is_uniform(inst->src[0]) ||
721 inst->src[1].is_zero()) {
722 inst->opcode = ELK_OPCODE_MOV;
723 inst->src[1] = src_reg();
724 inst->force_writemask_all = true;
725 progress = true;
726 }
727 break;
728
729 default:
730 break;
731 }
732 }
733
734 if (progress)
735 invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
736 DEPENDENCY_INSTRUCTION_DETAIL);
737
738 return progress;
739 }
740
741 /* Conditions for which we want to avoid setting the dependency control bits */
742 bool
743 vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
744 {
745 #define IS_DWORD(reg) \
746 (reg.type == ELK_REGISTER_TYPE_UD || \
747 reg.type == ELK_REGISTER_TYPE_D)
748
749 #define IS_64BIT(reg) (reg.file != BAD_FILE && type_sz(reg.type) == 8)
750
751 if (devinfo->ver >= 7) {
752 if (IS_64BIT(inst->dst) || IS_64BIT(inst->src[0]) ||
753 IS_64BIT(inst->src[1]) || IS_64BIT(inst->src[2]))
754 return true;
755 }
756
757 #undef IS_64BIT
758 #undef IS_DWORD
759
760 /*
761 * mlen:
762 * In the presence of send messages, totally interrupt dependency
763 * control. They're long enough that the chance of dependency
764 * control around them just doesn't matter.
765 *
766 * predicate:
767 * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
768 * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
769 * completes the scoreboard clear must have a non-zero execution mask. This
770 * means, if any kind of predication can change the execution mask or channel
771 * enable of the last instruction, the optimization must be avoided. This is
772 * to avoid instructions being shot down the pipeline when no writes are
773 * required.
774 *
775 * math:
776 * Dependency control does not work well over math instructions.
777 * NB: Discovered empirically
778 */
779 return (inst->mlen || inst->predicate || inst->is_math());
780 }
781
782 /**
783 * Sets the dependency control fields on instructions after register
784 * allocation and before the generator is run.
785 *
786 * When you have a sequence of instructions like:
787 *
788 * DP4 temp.x vertex uniform[0]
789 * DP4 temp.y vertex uniform[0]
790 * DP4 temp.z vertex uniform[0]
791 * DP4 temp.w vertex uniform[0]
792 *
793 * The hardware doesn't know that it can actually run the later instructions
794 * while the previous ones are in flight, producing stalls. However, we have
795 * manual fields we can set in the instructions that let it do so.
796 */
797 void
798 vec4_visitor::opt_set_dependency_control()
799 {
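/* For each GRF and MRF, track the most recent instruction that wrote it and
 * the channels that write covered. The tracking is reset at block
 * boundaries, when a tracked register is read, and around instructions that
 * are unsafe for dependency control.
 */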
800 vec4_instruction *last_grf_write[ELK_MAX_GRF];
801 uint8_t grf_channels_written[ELK_MAX_GRF];
802 vec4_instruction *last_mrf_write[ELK_MAX_GRF];
803 uint8_t mrf_channels_written[ELK_MAX_GRF];
804
805 assert(prog_data->total_grf ||
806 !"Must be called after register allocation");
807
808 foreach_block (block, cfg) {
809 memset(last_grf_write, 0, sizeof(last_grf_write));
810 memset(last_mrf_write, 0, sizeof(last_mrf_write));
811
812 foreach_inst_in_block (vec4_instruction, inst, block) {
813 /* If we read from a register that we were doing dependency control
814 * on, don't do dependency control across the read.
815 */
816 for (int i = 0; i < 3; i++) {
817 int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE;
818 if (inst->src[i].file == VGRF) {
819 last_grf_write[reg] = NULL;
820 } else if (inst->src[i].file == FIXED_GRF) {
821 memset(last_grf_write, 0, sizeof(last_grf_write));
822 break;
823 }
824 assert(inst->src[i].file != MRF);
825 }
826
827 if (is_dep_ctrl_unsafe(inst)) {
828 memset(last_grf_write, 0, sizeof(last_grf_write));
829 memset(last_mrf_write, 0, sizeof(last_mrf_write));
830 continue;
831 }
832
833 /* Now, see if we can do dependency control for this instruction
834 * against a previous one writing to its destination.
835 */
836 int reg = inst->dst.nr + inst->dst.offset / REG_SIZE;
837 if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
838 if (last_grf_write[reg] &&
839 last_grf_write[reg]->dst.offset == inst->dst.offset &&
840 !(inst->dst.writemask & grf_channels_written[reg])) {
841 last_grf_write[reg]->no_dd_clear = true;
842 inst->no_dd_check = true;
843 } else {
844 grf_channels_written[reg] = 0;
845 }
846
847 last_grf_write[reg] = inst;
848 grf_channels_written[reg] |= inst->dst.writemask;
849 } else if (inst->dst.file == MRF) {
850 if (last_mrf_write[reg] &&
851 last_mrf_write[reg]->dst.offset == inst->dst.offset &&
852 !(inst->dst.writemask & mrf_channels_written[reg])) {
853 last_mrf_write[reg]->no_dd_clear = true;
854 inst->no_dd_check = true;
855 } else {
856 mrf_channels_written[reg] = 0;
857 }
858
859 last_mrf_write[reg] = inst;
860 mrf_channels_written[reg] |= inst->dst.writemask;
861 }
862 }
863 }
864 }
865
866 bool
867 vec4_instruction::can_reswizzle(const struct intel_device_info *devinfo,
868 int dst_writemask,
869 int swizzle,
870 int swizzle_mask)
871 {
872 /* Gfx6 MATH instructions can not execute in align16 mode, so swizzles
873 * are not allowed.
874 */
875 if (devinfo->ver == 6 && is_math() && swizzle != ELK_SWIZZLE_XYZW)
876 return false;
877
878 /* If we write to the flag register changing the swizzle would change
879 * what channels are written to the flag register.
880 */
881 if (writes_flag(devinfo))
882 return false;
883
884 /* We can't swizzle implicit accumulator access. We'd have to
885 * reswizzle the producer of the accumulator value in addition
886 * to the consumer (i.e. both MUL and MACH). Just skip this.
887 */
888 if (reads_accumulator_implicitly())
889 return false;
890
891 if (!can_do_writemask(devinfo) && dst_writemask != WRITEMASK_XYZW)
892 return false;
893
894 /* If this instruction sets anything not referenced by swizzle, then we'd
895 * totally break it when we reswizzle.
896 */
897 if (dst.writemask & ~swizzle_mask)
898 return false;
899
900 if (mlen > 0)
901 return false;
902
903 for (int i = 0; i < 3; i++) {
904 if (src[i].is_accumulator())
905 return false;
906 }
907
908 return true;
909 }
910
911 /**
912 * For any channels in the swizzle's source that were populated by this
913 * instruction, rewrite the instruction to put the appropriate result directly
914 * in those channels.
915 *
916 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x
917 */
918 void
919 vec4_instruction::reswizzle(int dst_writemask, int swizzle)
920 {
921 /* Destination write mask doesn't correspond to source swizzle for the dot
922 * product and pack_bytes instructions.
923 */
924 if (opcode != ELK_OPCODE_DP4 && opcode != ELK_OPCODE_DPH &&
925 opcode != ELK_OPCODE_DP3 && opcode != ELK_OPCODE_DP2 &&
926 opcode != ELK_VEC4_OPCODE_PACK_BYTES) {
927 for (int i = 0; i < 3; i++) {
928 if (src[i].file == BAD_FILE)
929 continue;
930
931 if (src[i].file == IMM) {
932 assert(src[i].type != ELK_REGISTER_TYPE_V &&
933 src[i].type != ELK_REGISTER_TYPE_UV);
934
935 /* Vector immediate types need to be reswizzled. */
936 if (src[i].type == ELK_REGISTER_TYPE_VF) {
937 const unsigned imm[] = {
938 (src[i].ud >> 0) & 0x0ff,
939 (src[i].ud >> 8) & 0x0ff,
940 (src[i].ud >> 16) & 0x0ff,
941 (src[i].ud >> 24) & 0x0ff,
942 };
943
944 src[i] = elk_imm_vf4(imm[ELK_GET_SWZ(swizzle, 0)],
945 imm[ELK_GET_SWZ(swizzle, 1)],
946 imm[ELK_GET_SWZ(swizzle, 2)],
947 imm[ELK_GET_SWZ(swizzle, 3)]);
948 }
949
950 continue;
951 }
952
953 src[i].swizzle = elk_compose_swizzle(swizzle, src[i].swizzle);
954 }
955 }
956
957 /* Apply the specified swizzle and writemask to the original mask of
958 * written components.
959 */
960 dst.writemask = dst_writemask &
961 elk_apply_swizzle_to_mask(swizzle, dst.writemask);
962 }
963
964 /*
965 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
966 * just written and then MOVed into another reg and making the original write
967 * of the GRF write directly to the final destination instead.
968 */
969 bool
970 vec4_visitor::opt_register_coalesce()
971 {
972 bool progress = false;
973 int next_ip = 0;
974 const vec4_live_variables &live = live_analysis.require();
975
976 foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
977 int ip = next_ip;
978 next_ip++;
979
980 if (inst->opcode != ELK_OPCODE_MOV ||
981 (inst->dst.file != VGRF && inst->dst.file != MRF) ||
982 inst->predicate ||
983 inst->src[0].file != VGRF ||
984 inst->dst.type != inst->src[0].type ||
985 inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
986 continue;
987
988 /* Remove no-op MOVs */
989 if (inst->dst.file == inst->src[0].file &&
990 inst->dst.nr == inst->src[0].nr &&
991 inst->dst.offset == inst->src[0].offset) {
992 bool is_nop_mov = true;
993
994 for (unsigned c = 0; c < 4; c++) {
995 if ((inst->dst.writemask & (1 << c)) == 0)
996 continue;
997
998 if (ELK_GET_SWZ(inst->src[0].swizzle, c) != c) {
999 is_nop_mov = false;
1000 break;
1001 }
1002 }
1003
1004 if (is_nop_mov) {
1005 inst->remove(block);
1006 progress = true;
1007 continue;
1008 }
1009 }
1010
1011 bool to_mrf = (inst->dst.file == MRF);
1012
1013 /* Can't coalesce this GRF if someone else was going to
1014 * read it later.
1015 */
1016 if (live.var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
1017 continue;
1018
1019 /* We need to check interference with the final destination between this
1020 * instruction and the earliest instruction involved in writing the GRF
1021 * we're eliminating. To do that, keep track of which of our source
1022 * channels we've seen initialized.
1023 */
1024 const unsigned chans_needed =
1025 elk_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
1026 inst->dst.writemask);
1027 unsigned chans_remaining = chans_needed;
1028
1029 /* Now walk up the instruction stream trying to see if we can rewrite
1030 * everything writing to the temporary to write into the destination
1031 * instead.
1032 */
1033 vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
1034 foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
1035 inst) {
1036 _scan_inst = scan_inst;
1037
1038 if (regions_overlap(inst->src[0], inst->size_read(0),
1039 scan_inst->dst, scan_inst->size_written)) {
1040 /* Found something writing to the reg we want to coalesce away. */
1041 if (to_mrf) {
1042 /* SEND instructions can't have MRF as a destination. */
1043 if (scan_inst->mlen)
1044 break;
1045
1046 if (devinfo->ver == 6) {
1047 /* gfx6 math instructions must have the destination be
1048 * VGRF, so no compute-to-MRF for them.
1049 */
1050 if (scan_inst->is_math()) {
1051 break;
1052 }
1053 }
1054 }
1055
1056 /* ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2 generates a bunch of mov(1)
1057 * instructions, and this optimization pass is not capable of
1058 * handling that. Bail on these instructions and hope that some
1059 * later optimization pass can do the right thing after they are
1060 * expanded.
1061 */
1062 if (scan_inst->opcode == ELK_VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
1063 break;
1064
1065 /* This doesn't handle saturation on the instruction we
1066 * want to coalesce away if the register types do not match.
1067 * But if scan_inst is a non type-converting 'mov', we can fix
1068 * the types later.
1069 */
1070 if (inst->saturate &&
1071 inst->dst.type != scan_inst->dst.type &&
1072 !(scan_inst->opcode == ELK_OPCODE_MOV &&
1073 scan_inst->dst.type == scan_inst->src[0].type))
1074 break;
1075
1076 /* Only allow coalescing between registers of the same type size.
1077 * Otherwise we would need to make the pass aware of the fact that
1078 * channel sizes are different for single and double precision.
1079 */
1080 if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
1081 break;
1082
1083 /* Check that scan_inst writes the same amount of data as the
1084 * instruction, otherwise coalescing would lead to writing a
1085 * different (larger or smaller) region of the destination
1086 */
1087 if (scan_inst->size_written != inst->size_written)
1088 break;
1089
1090 /* If we can't handle the swizzle, bail. */
1091 if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
1092 inst->src[0].swizzle,
1093 chans_needed)) {
1094 break;
1095 }
1096
1097 /* This only handles coalescing writes of 8 channels (1 register
1098 * for single-precision and 2 registers for double-precision)
1099 * starting at the source offset of the copy instruction.
1100 */
1101 if (DIV_ROUND_UP(scan_inst->size_written,
1102 type_sz(scan_inst->dst.type)) > 8 ||
1103 scan_inst->dst.offset != inst->src[0].offset)
1104 break;
1105
1106 /* Mark which channels we found unconditional writes for. */
1107 if (!scan_inst->predicate)
1108 chans_remaining &= ~scan_inst->dst.writemask;
1109
1110 if (chans_remaining == 0)
1111 break;
1112 }
1113
1114 /* You can't read from an MRF, so if someone else reads our MRF's
1115 * source GRF that we wanted to rewrite, that stops us. If it's a
1116 * GRF we're trying to coalesce to, we don't actually handle
1117 * rewriting sources so bail in that case as well.
1118 */
1119 bool interfered = false;
1120 for (int i = 0; i < 3; i++) {
1121 if (regions_overlap(inst->src[0], inst->size_read(0),
1122 scan_inst->src[i], scan_inst->size_read(i)))
1123 interfered = true;
1124 }
1125 if (interfered)
1126 break;
1127
1128 /* If somebody else writes the same channels of our destination here,
1129 * we can't coalesce before that.
1130 */
1131 if (regions_overlap(inst->dst, inst->size_written,
1132 scan_inst->dst, scan_inst->size_written) &&
1133 (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
1134 break;
1135 }
1136
1137 /* Check for reads of the register we're trying to coalesce into. We
1138 * can't go rewriting instructions above that to put some other value
1139 * in the register instead.
1140 */
1141 if (to_mrf && scan_inst->mlen > 0) {
1142 unsigned start = scan_inst->base_mrf;
1143 unsigned end = scan_inst->base_mrf + scan_inst->mlen;
1144
1145 if (inst->dst.nr >= start && inst->dst.nr < end) {
1146 break;
1147 }
1148 } else {
1149 for (int i = 0; i < 3; i++) {
1150 if (regions_overlap(inst->dst, inst->size_written,
1151 scan_inst->src[i], scan_inst->size_read(i)))
1152 interfered = true;
1153 }
1154 if (interfered)
1155 break;
1156 }
1157 }
1158
1159 if (chans_remaining == 0) {
1160 /* If we've made it here, we have an MOV we want to coalesce out, and
1161 * a scan_inst pointing to the earliest instruction involved in
1162 * computing the value. Now go rewrite the instruction stream
1163 * between the two.
1164 */
1165 vec4_instruction *scan_inst = _scan_inst;
1166 while (scan_inst != inst) {
1167 if (scan_inst->dst.file == VGRF &&
1168 scan_inst->dst.nr == inst->src[0].nr &&
1169 scan_inst->dst.offset == inst->src[0].offset) {
1170 scan_inst->reswizzle(inst->dst.writemask,
1171 inst->src[0].swizzle);
1172 scan_inst->dst.file = inst->dst.file;
1173 scan_inst->dst.nr = inst->dst.nr;
1174 scan_inst->dst.offset = inst->dst.offset;
1175 if (inst->saturate &&
1176 inst->dst.type != scan_inst->dst.type) {
1177 /* If we have reached this point, scan_inst is a non
1178 * type-converting 'mov' and we can modify its register types
1179 * to match the ones in inst. Otherwise, we could have an
1180 * incorrect saturation result.
1181 */
1182 scan_inst->dst.type = inst->dst.type;
1183 scan_inst->src[0].type = inst->src[0].type;
1184 }
1185 scan_inst->saturate |= inst->saturate;
1186 }
1187 scan_inst = (vec4_instruction *)scan_inst->next;
1188 }
1189 inst->remove(block);
1190 progress = true;
1191 }
1192 }
1193
1194 if (progress)
1195 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1196
1197 return progress;
1198 }
1199
1200 /**
1201 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
1202 * flow. We could probably do better here with some form of divergence
1203 * analysis.
1204 */
1205 bool
1206 vec4_visitor::eliminate_find_live_channel()
1207 {
1208 bool progress = false;
1209 unsigned depth = 0;
1210
1211 if (!elk_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
1212 /* The optimization below assumes that channel zero is live on thread
1213 * dispatch, which may not be the case if the fixed function dispatches
1214 * threads sparsely.
1215 */
1216 return false;
1217 }
1218
1219 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1220 switch (inst->opcode) {
1221 case ELK_OPCODE_IF:
1222 case ELK_OPCODE_DO:
1223 depth++;
1224 break;
1225
1226 case ELK_OPCODE_ENDIF:
1227 case ELK_OPCODE_WHILE:
1228 depth--;
1229 break;
1230
1231 case ELK_SHADER_OPCODE_FIND_LIVE_CHANNEL:
1232 if (depth == 0) {
1233 inst->opcode = ELK_OPCODE_MOV;
1234 inst->src[0] = elk_imm_d(0);
1235 inst->force_writemask_all = true;
1236 progress = true;
1237 }
1238 break;
1239
1240 default:
1241 break;
1242 }
1243 }
1244
1245 if (progress)
1246 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
1247
1248 return progress;
1249 }
1250
1251 /**
1252 * Splits virtual GRFs requesting more than one contiguous physical register.
1253 *
1254 * We initially create large virtual GRFs for temporary structures, arrays,
1255 * and matrices, so that the visitor functions can add offsets to work their
1256 * way down to the actual member being accessed. But when it comes to
1257 * optimization, we'd like to treat each register as individual storage if
1258 * possible.
1259 *
1260 * So far, the only thing that might prevent splitting is a send message from
1261 * a GRF on IVB.
1262 */
1263 void
1264 vec4_visitor::split_virtual_grfs()
1265 {
1266 int num_vars = this->alloc.count;
1267 int *new_virtual_grf = rzalloc_array(NULL, int, num_vars);
1268 bool *split_grf = ralloc_array(NULL, bool, num_vars);
1269
1270 /* Try to split anything > 0 sized. */
1271 for (int i = 0; i < num_vars; i++) {
1272 split_grf[i] = this->alloc.sizes[i] != 1;
1273 }
1274
1275 /* Check that the instructions are compatible with the registers we're trying
1276 * to split.
1277 */
1278 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1279 if (inst->dst.file == VGRF && regs_written(inst) > 1)
1280 split_grf[inst->dst.nr] = false;
1281
1282 for (int i = 0; i < 3; i++) {
1283 if (inst->src[i].file == VGRF && regs_read(inst, i) > 1)
1284 split_grf[inst->src[i].nr] = false;
1285 }
1286 }
1287
1288 /* Allocate new space for split regs. Note that the virtual
1289 * numbers will be contiguous.
1290 */
1291 for (int i = 0; i < num_vars; i++) {
1292 if (!split_grf[i])
1293 continue;
1294
1295 new_virtual_grf[i] = alloc.allocate(1);
1296 for (unsigned j = 2; j < this->alloc.sizes[i]; j++) {
1297 unsigned reg = alloc.allocate(1);
1298 assert(reg == new_virtual_grf[i] + j - 1);
1299 (void) reg;
1300 }
1301 this->alloc.sizes[i] = 1;
1302 }
1303
1304 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1305 if (inst->dst.file == VGRF && split_grf[inst->dst.nr] &&
1306 inst->dst.offset / REG_SIZE != 0) {
1307 inst->dst.nr = (new_virtual_grf[inst->dst.nr] +
1308 inst->dst.offset / REG_SIZE - 1);
1309 inst->dst.offset %= REG_SIZE;
1310 }
1311 for (int i = 0; i < 3; i++) {
1312 if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] &&
1313 inst->src[i].offset / REG_SIZE != 0) {
1314 inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] +
1315 inst->src[i].offset / REG_SIZE - 1);
1316 inst->src[i].offset %= REG_SIZE;
1317 }
1318 }
1319 }
1320
1321 ralloc_free(new_virtual_grf);
1322 ralloc_free(split_grf);
1323
1324 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
1325 }
1326
1327 void
1328 vec4_visitor::dump_instruction_to_file(const elk_backend_instruction *be_inst, FILE *file) const
1329 {
1330 const vec4_instruction *inst = (const vec4_instruction *)be_inst;
1331
1332 if (inst->predicate) {
1333 fprintf(file, "(%cf%d.%d%s) ",
1334 inst->predicate_inverse ? '-' : '+',
1335 inst->flag_subreg / 2,
1336 inst->flag_subreg % 2,
1337 elk_pred_ctrl_align16[inst->predicate]);
1338 }
1339
1340 fprintf(file, "%s(%d)", elk_instruction_name(&compiler->isa, inst->opcode),
1341 inst->exec_size);
1342 if (inst->saturate)
1343 fprintf(file, ".sat");
1344 if (inst->conditional_mod) {
1345 fprintf(file, "%s", elk_conditional_modifier[inst->conditional_mod]);
1346 if (!inst->predicate &&
1347 (devinfo->ver < 5 || (inst->opcode != ELK_OPCODE_SEL &&
1348 inst->opcode != ELK_OPCODE_CSEL &&
1349 inst->opcode != ELK_OPCODE_IF &&
1350 inst->opcode != ELK_OPCODE_WHILE))) {
1351 fprintf(file, ".f%d.%d", inst->flag_subreg / 2, inst->flag_subreg % 2);
1352 }
1353 }
1354 fprintf(file, " ");
1355
1356 switch (inst->dst.file) {
1357 case VGRF:
1358 fprintf(file, "vgrf%d", inst->dst.nr);
1359 break;
1360 case FIXED_GRF:
1361 fprintf(file, "g%d", inst->dst.nr);
1362 break;
1363 case MRF:
1364 fprintf(file, "m%d", inst->dst.nr);
1365 break;
1366 case ARF:
1367 switch (inst->dst.nr) {
1368 case ELK_ARF_NULL:
1369 fprintf(file, "null");
1370 break;
1371 case ELK_ARF_ADDRESS:
1372 fprintf(file, "a0.%d", inst->dst.subnr);
1373 break;
1374 case ELK_ARF_ACCUMULATOR:
1375 fprintf(file, "acc%d", inst->dst.subnr);
1376 break;
1377 case ELK_ARF_FLAG:
1378 fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1379 break;
1380 default:
1381 fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
1382 break;
1383 }
1384 break;
1385 case BAD_FILE:
1386 fprintf(file, "(null)");
1387 break;
1388 case IMM:
1389 case ATTR:
1390 case UNIFORM:
1391 unreachable("not reached");
1392 }
1393 if (inst->dst.offset ||
1394 (inst->dst.file == VGRF &&
1395 alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
1396 const unsigned reg_size = (inst->dst.file == UNIFORM ? 16 : REG_SIZE);
1397 fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
1398 inst->dst.offset % reg_size);
1399 }
1400 if (inst->dst.writemask != WRITEMASK_XYZW) {
1401 fprintf(file, ".");
1402 if (inst->dst.writemask & 1)
1403 fprintf(file, "x");
1404 if (inst->dst.writemask & 2)
1405 fprintf(file, "y");
1406 if (inst->dst.writemask & 4)
1407 fprintf(file, "z");
1408 if (inst->dst.writemask & 8)
1409 fprintf(file, "w");
1410 }
1411 fprintf(file, ":%s", elk_reg_type_to_letters(inst->dst.type));
1412
1413 if (inst->src[0].file != BAD_FILE)
1414 fprintf(file, ", ");
1415
1416 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
1417 if (inst->src[i].negate)
1418 fprintf(file, "-");
1419 if (inst->src[i].abs)
1420 fprintf(file, "|");
1421 switch (inst->src[i].file) {
1422 case VGRF:
1423 fprintf(file, "vgrf%d", inst->src[i].nr);
1424 break;
1425 case FIXED_GRF:
1426 fprintf(file, "g%d.%d", inst->src[i].nr, inst->src[i].subnr);
1427 break;
1428 case ATTR:
1429 fprintf(file, "attr%d", inst->src[i].nr);
1430 break;
1431 case UNIFORM:
1432 fprintf(file, "u%d", inst->src[i].nr);
1433 break;
1434 case IMM:
1435 switch (inst->src[i].type) {
1436 case ELK_REGISTER_TYPE_F:
1437 fprintf(file, "%fF", inst->src[i].f);
1438 break;
1439 case ELK_REGISTER_TYPE_DF:
1440 fprintf(file, "%fDF", inst->src[i].df);
1441 break;
1442 case ELK_REGISTER_TYPE_D:
1443 fprintf(file, "%dD", inst->src[i].d);
1444 break;
1445 case ELK_REGISTER_TYPE_UD:
1446 fprintf(file, "%uU", inst->src[i].ud);
1447 break;
1448 case ELK_REGISTER_TYPE_VF:
1449 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
1450 elk_vf_to_float((inst->src[i].ud >> 0) & 0xff),
1451 elk_vf_to_float((inst->src[i].ud >> 8) & 0xff),
1452 elk_vf_to_float((inst->src[i].ud >> 16) & 0xff),
1453 elk_vf_to_float((inst->src[i].ud >> 24) & 0xff));
1454 break;
1455 default:
1456 fprintf(file, "???");
1457 break;
1458 }
1459 break;
1460 case ARF:
1461 switch (inst->src[i].nr) {
1462 case ELK_ARF_NULL:
1463 fprintf(file, "null");
1464 break;
1465 case ELK_ARF_ADDRESS:
1466 fprintf(file, "a0.%d", inst->src[i].subnr);
1467 break;
1468 case ELK_ARF_ACCUMULATOR:
1469 fprintf(file, "acc%d", inst->src[i].subnr);
1470 break;
1471 case ELK_ARF_FLAG:
1472 fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1473 break;
1474 default:
1475 fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
1476 break;
1477 }
1478 break;
1479 case BAD_FILE:
1480 fprintf(file, "(null)");
1481 break;
1482 case MRF:
1483 unreachable("not reached");
1484 }
1485
1486 if (inst->src[i].offset ||
1487 (inst->src[i].file == VGRF &&
1488 alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
1489 const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE);
1490 fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
1491 inst->src[i].offset % reg_size);
1492 }
1493
1494 if (inst->src[i].file != IMM) {
1495 static const char *chans[4] = {"x", "y", "z", "w"};
1496 fprintf(file, ".");
1497 for (int c = 0; c < 4; c++) {
1498 fprintf(file, "%s", chans[ELK_GET_SWZ(inst->src[i].swizzle, c)]);
1499 }
1500 }
1501
1502 if (inst->src[i].abs)
1503 fprintf(file, "|");
1504
1505 if (inst->src[i].file != IMM) {
1506 fprintf(file, ":%s", elk_reg_type_to_letters(inst->src[i].type));
1507 }
1508
1509 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
1510 fprintf(file, ", ");
1511 }
1512
1513 if (inst->force_writemask_all)
1514 fprintf(file, " NoMask");
1515
1516 if (inst->exec_size != 8)
1517 fprintf(file, " group%d", inst->group);
1518
1519 fprintf(file, "\n");
1520 }
1521
1522
1523 int
1524 vec4_vs_visitor::setup_attributes(int payload_reg)
1525 {
1526 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1527 for (int i = 0; i < 3; i++) {
1528 if (inst->src[i].file == ATTR) {
1529 assert(inst->src[i].offset % REG_SIZE == 0);
1530 int grf = payload_reg + inst->src[i].nr +
1531 inst->src[i].offset / REG_SIZE;
1532
1533 struct elk_reg reg = elk_vec8_grf(grf, 0);
1534 reg.swizzle = inst->src[i].swizzle;
1535 reg.type = inst->src[i].type;
1536 reg.abs = inst->src[i].abs;
1537 reg.negate = inst->src[i].negate;
1538 inst->src[i] = reg;
1539 }
1540 }
1541 }
1542
1543 return payload_reg + vs_prog_data->nr_attribute_slots;
1544 }
1545
1546 void
1547 vec4_visitor::setup_push_ranges()
1548 {
1549 /* Only allow 32 registers (256 uniform components) as push constants,
1550 * which is the limit on gfx6.
1551 *
1552 * If changing this value, note the limitation about total_regs in
1553 * elk_curbe.c.
1554 */
1555 const unsigned max_push_length = 32;
1556
1557 push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
1558 push_length = MIN2(push_length, max_push_length);
1559
1560 /* Shrink UBO push ranges so it all fits in max_push_length */
1561 for (unsigned i = 0; i < 4; i++) {
1562 struct elk_ubo_range *range = &prog_data->base.ubo_ranges[i];
1563
1564 if (push_length + range->length > max_push_length)
1565 range->length = max_push_length - push_length;
1566
1567 push_length += range->length;
1568 }
1569 assert(push_length <= max_push_length);
1570 }
1571
1572 int
1573 vec4_visitor::setup_uniforms(int reg)
1574 {
1575 /* It's possible that uniform compaction will shrink further than expected
1576 * so we re-compute the layout and set up our UBO push starts.
1577 */
1578 ASSERTED const unsigned old_push_length = push_length;
1579 push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8);
1580 for (unsigned i = 0; i < 4; i++) {
1581 ubo_push_start[i] = push_length;
1582 push_length += stage_prog_data->ubo_ranges[i].length;
1583 }
1584 assert(push_length == old_push_length);
1585
1586 /* The pre-gfx6 VS requires that some push constants get loaded no
1587 * matter what, or the GPU would hang.
1588 */
1589 if (devinfo->ver < 6 && push_length == 0) {
1590 elk_stage_prog_data_add_params(stage_prog_data, 4);
1591 for (unsigned int i = 0; i < 4; i++) {
1592 unsigned int slot = this->uniforms * 4 + i;
1593 stage_prog_data->param[slot] = ELK_PARAM_BUILTIN_ZERO;
1594 }
1595 push_length = 1;
1596 }
1597
1598 prog_data->base.dispatch_grf_start_reg = reg;
1599 prog_data->base.curb_read_length = push_length;
1600
1601 return reg + push_length;
1602 }
1603
1604 void
1605 vec4_vs_visitor::setup_payload(void)
1606 {
1607 int reg = 0;
1608
1609 /* The payload always contains important data in g0, which contains
1610 * the URB handles that are passed on to the URB write at the end
1611 * of the thread. So, we always start push constants at g1.
1612 */
1613 reg++;
1614
1615 reg = setup_uniforms(reg);
1616
1617 reg = setup_attributes(reg);
1618
1619 this->first_non_payload_grf = reg;
1620 }
1621
1622 bool
1623 vec4_visitor::lower_minmax()
1624 {
1625 assert(devinfo->ver < 6);
1626
1627 bool progress = false;
1628
1629 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1630 const vec4_builder ibld(this, block, inst);
1631
1632 if (inst->opcode == ELK_OPCODE_SEL &&
1633 inst->predicate == ELK_PREDICATE_NONE) {
1634 /* If src1 is an immediate value that is not NaN, then it can't be
1635 * NaN. In that case, emit CMP because it is much better for cmod
1636 * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
1637 * support HF or DF, so it is not necessary to check for those.
1638 */
1639 if (inst->src[1].type != ELK_REGISTER_TYPE_F ||
1640 (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
1641 ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
1642 inst->conditional_mod);
1643 } else {
1644 ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
1645 inst->conditional_mod);
1646 }
1647 inst->predicate = ELK_PREDICATE_NORMAL;
1648 inst->conditional_mod = ELK_CONDITIONAL_NONE;
1649
1650 progress = true;
1651 }
1652 }
1653
1654 if (progress)
1655 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
1656
1657 return progress;
1658 }
1659
1660 src_reg
1661 vec4_visitor::get_timestamp()
1662 {
1663 assert(devinfo->ver == 7);
1664
1665 src_reg ts = src_reg(elk_reg(ELK_ARCHITECTURE_REGISTER_FILE,
1666 ELK_ARF_TIMESTAMP,
1667 0,
1668 0,
1669 0,
1670 ELK_REGISTER_TYPE_UD,
1671 ELK_VERTICAL_STRIDE_0,
1672 ELK_WIDTH_4,
1673 ELK_HORIZONTAL_STRIDE_4,
1674 ELK_SWIZZLE_XYZW,
1675 WRITEMASK_XYZW));
1676
1677 dst_reg dst = dst_reg(this, glsl_uvec4_type());
1678
1679 vec4_instruction *mov = emit(MOV(dst, ts));
1680 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
1681 * even if it's not enabled in the dispatch.
1682 */
1683 mov->force_writemask_all = true;
1684
1685 return src_reg(dst);
1686 }
1687
1688 static bool
1689 is_align1_df(vec4_instruction *inst)
1690 {
1691 switch (inst->opcode) {
1692 case ELK_VEC4_OPCODE_DOUBLE_TO_F32:
1693 case ELK_VEC4_OPCODE_DOUBLE_TO_D32:
1694 case ELK_VEC4_OPCODE_DOUBLE_TO_U32:
1695 case ELK_VEC4_OPCODE_TO_DOUBLE:
1696 case ELK_VEC4_OPCODE_PICK_LOW_32BIT:
1697 case ELK_VEC4_OPCODE_PICK_HIGH_32BIT:
1698 case ELK_VEC4_OPCODE_SET_LOW_32BIT:
1699 case ELK_VEC4_OPCODE_SET_HIGH_32BIT:
1700 return true;
1701 default:
1702 return false;
1703 }
1704 }
1705
1706 /**
1707 * Three source instruction must have a GRF/MRF destination register.
1708 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
1709 */
1710 void
1711 vec4_visitor::fixup_3src_null_dest()
1712 {
1713 bool progress = false;
1714
1715 foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
1716 if (inst->elk_is_3src(compiler) && inst->dst.is_null()) {
1717 const unsigned size_written = type_sz(inst->dst.type);
1718 const unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
1719
1720 inst->dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
1721 inst->dst.type);
1722 progress = true;
1723 }
1724 }
1725
1726 if (progress)
1727 invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
1728 DEPENDENCY_VARIABLES);
1729 }
1730
1731 void
1732 vec4_visitor::convert_to_hw_regs()
1733 {
1734 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1735 for (int i = 0; i < 3; i++) {
1736 class src_reg &src = inst->src[i];
1737 struct elk_reg reg;
1738 switch (src.file) {
1739 case VGRF: {
1740 reg = byte_offset(elk_vecn_grf(4, src.nr, 0), src.offset);
1741 reg.type = src.type;
1742 reg.abs = src.abs;
1743 reg.negate = src.negate;
1744 break;
1745 }
1746
1747 case UNIFORM: {
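/* Push constants are packed two vec4 uniforms per GRF; UBO push ranges
 * start at the offsets recorded in ubo_push_start[].
 */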
1748 if (src.nr >= UBO_START) {
1749 reg = byte_offset(elk_vec4_grf(
1750 prog_data->base.dispatch_grf_start_reg +
1751 ubo_push_start[src.nr - UBO_START] +
1752 src.offset / 32, 0),
1753 src.offset % 32);
1754 } else {
1755 reg = byte_offset(elk_vec4_grf(
1756 prog_data->base.dispatch_grf_start_reg +
1757 src.nr / 2, src.nr % 2 * 4),
1758 src.offset);
1759 }
1760 reg = stride(reg, 0, 4, 1);
1761 reg.type = src.type;
1762 reg.abs = src.abs;
1763 reg.negate = src.negate;
1764
1765 /* This should have been moved to pull constants. */
1766 assert(!src.reladdr);
1767 break;
1768 }
1769
1770 case FIXED_GRF:
1771 if (type_sz(src.type) == 8) {
1772 reg = src.as_elk_reg();
1773 break;
1774 }
1775 FALLTHROUGH;
1776 case ARF:
1777 case IMM:
1778 continue;
1779
1780 case BAD_FILE:
1781 /* Probably unused. */
1782 reg = elk_null_reg();
1783 reg = retype(reg, src.type);
1784 break;
1785
1786 case MRF:
1787 case ATTR:
1788 unreachable("not reached");
1789 }
1790
1791 apply_logical_swizzle(&reg, inst, i);
1792 src = reg;
1793
1794 /* From IVB PRM, vol4, part3, "General Restrictions on Regioning
1795 * Parameters":
1796 *
1797 * "If ExecSize = Width and HorzStride ≠ 0, VertStride must be set
1798 * to Width * HorzStride."
1799 *
1800 * We can break this rule with DF sources on DF align1
1801 * instructions, because the exec_size would be 4 and width is 4.
1802 * As we know we are not accessing to next GRF, it is safe to
1803 * set vstride to the formula given by the rule itself.
1804 */
1805 if (is_align1_df(inst) && (cvt(inst->exec_size) - 1) == src.width)
1806 src.vstride = src.width + src.hstride;
1807 }
1808
1809 if (inst->elk_is_3src(compiler)) {
1810 /* 3-src instructions with scalar sources support arbitrary subnr,
1811 * but don't actually use swizzles. Convert swizzle into subnr.
1812 * Skip this for double-precision instructions: RepCtrl=1 is not
1813 * allowed for them and needs special handling.
1814 */
1815 for (int i = 0; i < 3; i++) {
1816 if (inst->src[i].vstride == ELK_VERTICAL_STRIDE_0 &&
1817 type_sz(inst->src[i].type) < 8) {
1818 assert(elk_is_single_value_swizzle(inst->src[i].swizzle));
1819 inst->src[i].subnr += 4 * ELK_GET_SWZ(inst->src[i].swizzle, 0);
1820 }
1821 }
1822 }
1823
1824 dst_reg &dst = inst->dst;
1825 struct elk_reg reg;
1826
1827 switch (inst->dst.file) {
1828 case VGRF:
1829 reg = byte_offset(elk_vec8_grf(dst.nr, 0), dst.offset);
1830 reg.type = dst.type;
1831 reg.writemask = dst.writemask;
1832 break;
1833
1834 case MRF:
1835 reg = byte_offset(elk_message_reg(dst.nr), dst.offset);
1836 assert((reg.nr & ~ELK_MRF_COMPR4) < ELK_MAX_MRF(devinfo->ver));
1837 reg.type = dst.type;
1838 reg.writemask = dst.writemask;
1839 break;
1840
1841 case ARF:
1842 case FIXED_GRF:
1843 reg = dst.as_elk_reg();
1844 break;
1845
1846 case BAD_FILE:
1847 reg = elk_null_reg();
1848 reg = retype(reg, dst.type);
1849 break;
1850
1851 case IMM:
1852 case ATTR:
1853 case UNIFORM:
1854 unreachable("not reached");
1855 }
1856
1857 dst = reg;
1858 }
1859 }
1860
1861 static bool
1862 stage_uses_interleaved_attributes(unsigned stage,
1863 enum intel_shader_dispatch_mode dispatch_mode)
1864 {
1865 switch (stage) {
1866 case MESA_SHADER_TESS_EVAL:
1867 return true;
1868 case MESA_SHADER_GEOMETRY:
1869 return dispatch_mode != INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
1870 default:
1871 return false;
1872 }
1873 }
1874
1875 /**
1876 * Get the closest native SIMD width supported by the hardware for instruction
1877 * \p inst. The instruction will be left untouched by
1878 * vec4_visitor::lower_simd_width() if the returned value matches the
1879 * instruction's original execution size.
1880 */
1881 static unsigned
1882 get_lowered_simd_width(const struct intel_device_info *devinfo,
1883 enum intel_shader_dispatch_mode dispatch_mode,
1884 unsigned stage, const vec4_instruction *inst)
1885 {
1886 /* Do not split some instructions that require special handling */
1887 switch (inst->opcode) {
1888 case ELK_SHADER_OPCODE_GFX4_SCRATCH_READ:
1889 case ELK_SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1890 return inst->exec_size;
1891 default:
1892 break;
1893 }
1894
1895 unsigned lowered_width = MIN2(16, inst->exec_size);
1896
1897 /* We need to split some cases of double-precision instructions that write
1898 * 2 registers. We only need to care about this in gfx7 because that is the
1899 * only hardware that implements fp64 in Align16.
1900 */
1901 if (devinfo->ver == 7 && inst->size_written > REG_SIZE) {
1902 /* Align16 8-wide double-precision SEL does not work well. Verified
1903 * empirically.
1904 */
1905 if (inst->opcode == ELK_OPCODE_SEL && type_sz(inst->dst.type) == 8)
1906 lowered_width = MIN2(lowered_width, 4);
1907
1908 /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
1909 * Register Addressing:
1910 *
1911 * "When destination spans two registers, the source MUST span two
1912 * registers."
1913 */
1914 for (unsigned i = 0; i < 3; i++) {
1915 if (inst->src[i].file == BAD_FILE)
1916 continue;
1917 if (inst->size_read(i) <= REG_SIZE)
1918 lowered_width = MIN2(lowered_width, 4);
1919
1920 /* Interleaved attribute setups use a vertical stride of 0, which
1921 * makes them hit the associated instruction decompression bug in gfx7.
1922 * Split them to prevent this.
1923 */
1924 if (inst->src[i].file == ATTR &&
1925 stage_uses_interleaved_attributes(stage, dispatch_mode))
1926 lowered_width = MIN2(lowered_width, 4);
1927 }
1928 }
1929
1930 /* IvyBridge can manage a maximum of 4 DFs per SIMD4x2 instruction, since
1931     * it doesn't support compression in Align16 mode, regardless of whether
1932     * force_writemask_all is enabled or disabled (the latter is affected by the
1933 * compressed instruction bug in gfx7, which is another reason to enforce
1934 * this limit).
1935 */
1936 if (devinfo->verx10 == 70 &&
1937 (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8))
1938 lowered_width = MIN2(lowered_width, 4);
1939
1940 return lowered_width;
1941 }
1942
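/* Check whether the region written by \p inst overlaps the region read by
 * any of its sources.  Regions can only overlap if they live in the same
 * register file and register number, in which case we compare byte ranges.
 */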
1943 static bool
1944 dst_src_regions_overlap(vec4_instruction *inst)
1945 {
1946 if (inst->size_written == 0)
1947 return false;
1948
1949 unsigned dst_start = inst->dst.offset;
1950 unsigned dst_end = dst_start + inst->size_written - 1;
1951 for (int i = 0; i < 3; i++) {
1952 if (inst->src[i].file == BAD_FILE)
1953 continue;
1954
1955 if (inst->dst.file != inst->src[i].file ||
1956 inst->dst.nr != inst->src[i].nr)
1957 continue;
1958
1959 unsigned src_start = inst->src[i].offset;
1960 unsigned src_end = src_start + inst->size_read(i) - 1;
1961
1962 if ((dst_start >= src_start && dst_start <= src_end) ||
1963 (dst_end >= src_start && dst_end <= src_end) ||
1964 (dst_start <= src_start && dst_end >= src_end)) {
1965 return true;
1966 }
1967 }
1968
1969 return false;
1970 }
1971
1972 bool
1973 vec4_visitor::lower_simd_width()
1974 {
1975 bool progress = false;
1976
1977 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1978 const unsigned lowered_width =
1979 get_lowered_simd_width(devinfo, prog_data->dispatch_mode, stage, inst);
1980 assert(lowered_width <= inst->exec_size);
1981 if (lowered_width == inst->exec_size)
1982 continue;
1983
1984 /* We need to deal with source / destination overlaps when splitting.
1985 * The hardware supports reading from and writing to the same register
1986 * in the same instruction, but we need to be careful that each split
1987 * instruction we produce does not corrupt the source of the next.
1988 *
1989 * The easiest way to handle this is to make the split instructions write
1990        * to temporaries if there is a src/dst overlap and then move from the
1991 * temporaries to the original destination. We also need to consider
1992 * instructions that do partial writes via align1 opcodes, in which case
1993        * we need to make sure that we initialize the temporary with the
1994 * value of the instruction's dst.
1995 */
1996 bool needs_temp = dst_src_regions_overlap(inst);
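      /* Emit one copy of the instruction for each group of lowered_width
       * channels.
       */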
1997 for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) {
1998 unsigned channel_offset = lowered_width * n;
1999
2000 unsigned size_written = lowered_width * type_sz(inst->dst.type);
2001
2002 /* Create the split instruction from the original so that we copy all
2003 * relevant instruction fields, then set the width and calculate the
2004 * new dst/src regions.
2005 */
2006 vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
2007 linst->exec_size = lowered_width;
2008 linst->group = channel_offset;
2009 linst->size_written = size_written;
2010
2011 /* Compute split dst region */
2012 dst_reg dst;
2013 if (needs_temp) {
2014 unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
2015 dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
2016 inst->dst.type);
2017 if (inst->is_align1_partial_write()) {
2018 vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
2019 copy->exec_size = lowered_width;
2020 copy->group = channel_offset;
2021 copy->size_written = size_written;
2022 inst->insert_before(block, copy);
2023 }
2024 } else {
2025 dst = horiz_offset(inst->dst, channel_offset);
2026 }
2027 linst->dst = dst;
2028
2029 /* Compute split source regions */
2030 for (int i = 0; i < 3; i++) {
2031 if (linst->src[i].file == BAD_FILE)
2032 continue;
2033
2034 bool is_interleaved_attr =
2035 linst->src[i].file == ATTR &&
2036 stage_uses_interleaved_attributes(stage,
2037 prog_data->dispatch_mode);
2038
2039 if (!is_uniform(linst->src[i]) && !is_interleaved_attr)
2040 linst->src[i] = horiz_offset(linst->src[i], channel_offset);
2041 }
2042
2043 inst->insert_before(block, linst);
2044
2045 /* If we used a temporary to store the result of the split
2046 * instruction, copy the result to the original destination
2047 */
2048 if (needs_temp) {
2049 vec4_instruction *mov =
2050 MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
2051 mov->exec_size = lowered_width;
2052 mov->group = channel_offset;
2053 mov->size_written = size_written;
2054 mov->predicate = inst->predicate;
2055 inst->insert_before(block, mov);
2056 }
2057 }
2058
2059 inst->remove(block);
2060 progress = true;
2061 }
2062
2063 if (progress)
2064 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2065
2066 return progress;
2067 }
2068
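/* When a predicated instruction is scalarized down to a single writemask
 * channel, translate a normal predicate into the Align16 replicate predicate
 * for that channel, so the flag bit corresponding to the written component
 * controls the whole scalarized instruction.
 */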
2069 static elk_predicate
2070 scalarize_predicate(elk_predicate predicate, unsigned writemask)
2071 {
2072 if (predicate != ELK_PREDICATE_NORMAL)
2073 return predicate;
2074
2075 switch (writemask) {
2076 case WRITEMASK_X:
2077 return ELK_PREDICATE_ALIGN16_REPLICATE_X;
2078 case WRITEMASK_Y:
2079 return ELK_PREDICATE_ALIGN16_REPLICATE_Y;
2080 case WRITEMASK_Z:
2081 return ELK_PREDICATE_ALIGN16_REPLICATE_Z;
2082 case WRITEMASK_W:
2083 return ELK_PREDICATE_ALIGN16_REPLICATE_W;
2084 default:
2085 unreachable("invalid writemask");
2086 }
2087 }
2088
2089 /* Gfx7 has a hardware decompression bug that we can exploit to represent
2090  * a handful of additional swizzles natively.
2091 */
2092 static bool
2093 is_gfx7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg)
2094 {
2095 switch (inst->src[arg].swizzle) {
2096 case ELK_SWIZZLE_XXXX:
2097 case ELK_SWIZZLE_YYYY:
2098 case ELK_SWIZZLE_ZZZZ:
2099 case ELK_SWIZZLE_WWWW:
2100 case ELK_SWIZZLE_XYXY:
2101 case ELK_SWIZZLE_YXYX:
2102 case ELK_SWIZZLE_ZWZW:
2103 case ELK_SWIZZLE_WZWZ:
2104 return true;
2105 default:
2106 return false;
2107 }
2108 }
2109
2110 /* 64-bit sources use regions with a width of 2. These 2 elements in each row
2111 * can be addressed using 32-bit swizzles (which is what the hardware supports)
2112 * but it also means that the swizzle we apply on the first two components of a
2113 * dvec4 is coupled with the swizzle we use for the last 2. In other words,
2114 * only some specific swizzle combinations can be natively supported.
2115 *
2116  * FIXME: we can go a step further and implement even more swizzle
2117 * variations using only partial scalarization.
2118 *
2119 * For more details see:
2120 * https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82
2121 */
2122 bool
2123 vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg)
2124 {
2125 const src_reg &src = inst->src[arg];
2126 assert(type_sz(src.type) == 8);
2127
2128     * Uniform regions have a vstride=0. Because we use 2-wide rows for
2129     * 64-bit regions, this means we cannot access components Z/W, so
2130 * return false for any such case. Interleaved attributes will also be
2131 * mapped to GRF registers with a vstride of 0, so apply the same
2132 * treatment.
2133 */
2134 if ((is_uniform(src) ||
2135 (stage_uses_interleaved_attributes(stage, prog_data->dispatch_mode) &&
2136 src.file == ATTR)) &&
2137 (elk_mask_for_swizzle(src.swizzle) & 12))
2138 return false;
2139
2140 switch (src.swizzle) {
2141 case ELK_SWIZZLE_XYZW:
2142 case ELK_SWIZZLE_XXZZ:
2143 case ELK_SWIZZLE_YYWW:
2144 case ELK_SWIZZLE_YXWZ:
2145 return true;
2146 default:
2147 return devinfo->ver == 7 && is_gfx7_supported_64bit_swizzle(inst, arg);
2148 }
2149 }
2150
2151 bool
2152 vec4_visitor::scalarize_df()
2153 {
2154 bool progress = false;
2155
2156 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2157 /* Skip DF instructions that operate in Align1 mode */
2158 if (is_align1_df(inst))
2159 continue;
2160
2161 /* Check if this is a double-precision instruction */
2162 bool is_double = type_sz(inst->dst.type) == 8;
2163 for (int arg = 0; !is_double && arg < 3; arg++) {
2164 is_double = inst->src[arg].file != BAD_FILE &&
2165 type_sz(inst->src[arg].type) == 8;
2166 }
2167
2168 if (!is_double)
2169 continue;
2170
2171 /* Skip the lowering for specific regioning scenarios that we can
2172 * support natively.
2173 */
2174 bool skip_lowering = true;
2175
2176 /* XY and ZW writemasks operate in 32-bit, which means that they don't
2177 * have a native 64-bit representation and they should always be split.
2178 */
2179 if (inst->dst.writemask == WRITEMASK_XY ||
2180 inst->dst.writemask == WRITEMASK_ZW) {
2181 skip_lowering = false;
2182 } else {
2183 for (unsigned i = 0; i < 3; i++) {
2184 if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8)
2185 continue;
2186 skip_lowering = skip_lowering && is_supported_64bit_region(inst, i);
2187 }
2188 }
2189
2190 if (skip_lowering)
2191 continue;
2192
2193 /* Generate scalar instructions for each enabled channel */
2194 for (unsigned chan = 0; chan < 4; chan++) {
2195 unsigned chan_mask = 1 << chan;
2196 if (!(inst->dst.writemask & chan_mask))
2197 continue;
2198
2199 vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
2200
2201 for (unsigned i = 0; i < 3; i++) {
2202 unsigned swz = ELK_GET_SWZ(inst->src[i].swizzle, chan);
2203 scalar_inst->src[i].swizzle = ELK_SWIZZLE4(swz, swz, swz, swz);
2204 }
2205
2206 scalar_inst->dst.writemask = chan_mask;
2207
2208 if (inst->predicate != ELK_PREDICATE_NONE) {
2209 scalar_inst->predicate =
2210 scalarize_predicate(inst->predicate, chan_mask);
2211 }
2212
2213 inst->insert_before(block, scalar_inst);
2214 }
2215
2216 inst->remove(block);
2217 progress = true;
2218 }
2219
2220 if (progress)
2221 invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2222
2223 return progress;
2224 }
2225
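/* Split a double-precision MAD into a MUL into a temporary dvec4 followed by
 * an ADD of that temporary with the remaining source, copying all other
 * instruction fields from the original MAD.
 */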
2226 bool
2227 vec4_visitor::lower_64bit_mad_to_mul_add()
2228 {
2229 bool progress = false;
2230
2231 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
2232 if (inst->opcode != ELK_OPCODE_MAD)
2233 continue;
2234
2235 if (type_sz(inst->dst.type) != 8)
2236 continue;
2237
2238 dst_reg mul_dst = dst_reg(this, glsl_dvec4_type());
2239
2240 /* Use the copy constructor so we copy all relevant instruction fields
2241 * from the original mad into the add and mul instructions
2242 */
2243 vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
2244 mul->opcode = ELK_OPCODE_MUL;
2245 mul->dst = mul_dst;
2246 mul->src[0] = inst->src[1];
2247 mul->src[1] = inst->src[2];
2248 mul->src[2].file = BAD_FILE;
2249
2250 vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
2251 add->opcode = ELK_OPCODE_ADD;
2252 add->src[0] = src_reg(mul_dst);
2253 add->src[1] = inst->src[0];
2254 add->src[2].file = BAD_FILE;
2255
2256 inst->insert_before(block, mul);
2257 inst->insert_before(block, add);
2258 inst->remove(block);
2259
2260 progress = true;
2261 }
2262
2263 if (progress)
2264 invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
2265
2266 return progress;
2267 }
2268
2269 /* The align16 hardware can only do 32-bit swizzle channels, so we need to
2270 * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
2271 * to 32-bit swizzle channels in hardware registers.
2272 *
2273 * @inst and @arg identify the original vec4 IR source operand we need to
2274  * translate the swizzle for, and @hw_reg is the hardware register where we
2275 * will write the hardware swizzle to use.
2276 *
2277 * This pass assumes that Align16/DF instructions have been fully scalarized
2278 * previously so there is just one 64-bit swizzle channel to deal with for any
2279 * given Vec4 IR source.
2280 */
2281 void
2282 vec4_visitor::apply_logical_swizzle(struct elk_reg *hw_reg,
2283 vec4_instruction *inst, int arg)
2284 {
2285 src_reg reg = inst->src[arg];
2286
2287 if (reg.file == BAD_FILE || reg.file == ELK_IMMEDIATE_VALUE)
2288 return;
2289
2290 /* If this is not a 64-bit operand or this is a scalar instruction we don't
2291 * need to do anything about the swizzles.
2292 */
2293    if (type_sz(reg.type) < 8 || is_align1_df(inst)) {
2294 hw_reg->swizzle = reg.swizzle;
2295 return;
2296 }
2297
2298 /* Take the 64-bit logical swizzle channel and translate it to 32-bit */
2299 assert(elk_is_single_value_swizzle(reg.swizzle) ||
2300 is_supported_64bit_region(inst, arg));
2301
2302 /* Apply the region <2, 2, 1> for GRF or <0, 2, 1> for uniforms, as align16
2303 * HW can only do 32-bit swizzle channels.
2304 */
2305 hw_reg->width = ELK_WIDTH_2;
2306
2307 if (is_supported_64bit_region(inst, arg) &&
2308 !is_gfx7_supported_64bit_swizzle(inst, arg)) {
2309 /* Supported 64-bit swizzles are those such that their first two
2310 * components, when expanded to 32-bit swizzles, match the semantics
2311 * of the original 64-bit swizzle with 2-wide row regioning.
2312 */
2313 unsigned swizzle0 = ELK_GET_SWZ(reg.swizzle, 0);
2314 unsigned swizzle1 = ELK_GET_SWZ(reg.swizzle, 1);
2315 hw_reg->swizzle = ELK_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
2316 swizzle1 * 2, swizzle1 * 2 + 1);
2317 } else {
2318 /* If we got here then we have one of the following:
2319 *
2320 * 1. An unsupported swizzle, which should be single-value thanks to the
2321 * scalarization pass.
2322 *
2323 * 2. A gfx7 supported swizzle. These can be single-value or double-value
2324 * swizzles. If the latter, they are never cross-dvec2 channels. For
2325 * these we always need to activate the gfx7 vstride=0 exploit.
2326 */
2327 unsigned swizzle0 = ELK_GET_SWZ(reg.swizzle, 0);
2328 unsigned swizzle1 = ELK_GET_SWZ(reg.swizzle, 1);
2329 assert((swizzle0 < 2) == (swizzle1 < 2));
2330
2331 /* To gain access to Z/W components we need to select the second half
2332 * of the register and then use a X/Y swizzle to select Z/W respectively.
2333 */
2334 if (swizzle0 >= 2) {
2335 *hw_reg = suboffset(*hw_reg, 2);
2336 swizzle0 -= 2;
2337 swizzle1 -= 2;
2338 }
2339
2340 /* All gfx7-specific supported swizzles require the vstride=0 exploit */
2341 if (devinfo->ver == 7 && is_gfx7_supported_64bit_swizzle(inst, arg))
2342 hw_reg->vstride = ELK_VERTICAL_STRIDE_0;
2343
2344 /* Any 64-bit source with an offset at 16B is intended to address the
2345 * second half of a register and needs a vertical stride of 0 so we:
2346 *
2347 * 1. Don't violate register region restrictions.
2348 * 2. Activate the gfx7 instruction decompression bug exploit when
2349 * execsize > 4
2350 */
2351 if (hw_reg->subnr % REG_SIZE == 16) {
2352 assert(devinfo->ver == 7);
2353 hw_reg->vstride = ELK_VERTICAL_STRIDE_0;
2354 }
2355
2356 hw_reg->swizzle = ELK_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
2357 swizzle1 * 2, swizzle1 * 2 + 1);
2358 }
2359 }
2360
2361 void
2362 vec4_visitor::invalidate_analysis(elk::analysis_dependency_class c)
2363 {
2364 elk_backend_shader::invalidate_analysis(c);
2365 live_analysis.invalidate(c);
2366 }
2367
2368 bool
2369 vec4_visitor::run()
2370 {
2371 setup_push_ranges();
2372
2373 if (prog_data->base.zero_push_reg) {
2374 /* push_reg_mask_param is in uint32 params and UNIFORM is in vec4s */
2375 const unsigned mask_param = stage_prog_data->push_reg_mask_param;
2376 src_reg mask = src_reg(dst_reg(UNIFORM, mask_param / 4));
2377 assert(mask_param % 2 == 0); /* Should be 64-bit-aligned */
2378 mask.swizzle = ELK_SWIZZLE4((mask_param + 0) % 4,
2379 (mask_param + 1) % 4,
2380 (mask_param + 0) % 4,
2381 (mask_param + 1) % 4);
2382
2383 emit(ELK_VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
2384 dst_reg(VGRF, alloc.allocate(3)), mask);
2385 }
2386
2387 emit_prolog();
2388
2389 emit_nir_code();
2390 if (failed)
2391 return false;
2392 base_ir = NULL;
2393
2394 emit_thread_end();
2395
2396 calculate_cfg();
2397 cfg->validate(_mesa_shader_stage_to_abbrev(stage));
2398
2399 /* Before any optimization, push array accesses out to scratch
2400 * space where we need them to be. This pass may allocate new
2401 * virtual GRFs, so we want to do it early. It also makes sure
2402 * that we have reladdr computations available for CSE, since we'll
2403 * often do repeated subexpressions for those.
2404 */
2405 move_grf_array_access_to_scratch();
2406 split_uniform_registers();
2407
2408 split_virtual_grfs();
2409
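/* Run an optimization pass and accumulate whether it made progress.  When the
 * optimizer debug flag is set, dump the instruction list after every pass
 * that reported progress so its effect can be inspected.
 */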
2410 #define OPT(pass, args...) ({ \
2411 pass_num++; \
2412 bool this_progress = pass(args); \
2413 \
2414 if (INTEL_DEBUG(DEBUG_OPTIMIZER) && this_progress) { \
2415 char filename[64]; \
2416 snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \
2417 _mesa_shader_stage_to_abbrev(stage), \
2418 nir->info.name, iteration, pass_num); \
2419 \
2420 elk_backend_shader::dump_instructions(filename); \
2421 } \
2422 \
2423 cfg->validate(_mesa_shader_stage_to_abbrev(stage)); \
2424 progress = progress || this_progress; \
2425 this_progress; \
2426 })
2427
2428
2429 if (INTEL_DEBUG(DEBUG_OPTIMIZER)) {
2430 char filename[64];
2431 snprintf(filename, 64, "%s-%s-00-00-start",
2432 _mesa_shader_stage_to_abbrev(stage), nir->info.name);
2433
2434 elk_backend_shader::dump_instructions(filename);
2435 }
2436
2437 bool progress;
2438 int iteration = 0;
2439 int pass_num = 0;
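   /* Run the main optimization loop to a fixed point: keep iterating the
    * passes until none of them reports further progress.
    */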
2440 do {
2441 progress = false;
2442 pass_num = 0;
2443 iteration++;
2444
2445 OPT(elk_opt_predicated_break, this);
2446 OPT(opt_reduce_swizzle);
2447 OPT(dead_code_eliminate);
2448 OPT(elk_dead_control_flow_eliminate, this);
2449 OPT(opt_copy_propagation);
2450 OPT(opt_cmod_propagation);
2451 OPT(opt_cse);
2452 OPT(opt_algebraic);
2453 OPT(opt_register_coalesce);
2454 OPT(eliminate_find_live_channel);
2455 } while (progress);
2456
2457 pass_num = 0;
2458
2459 if (OPT(opt_vector_float)) {
2460 OPT(opt_cse);
2461 OPT(opt_copy_propagation, false);
2462 OPT(opt_copy_propagation, true);
2463 OPT(dead_code_eliminate);
2464 }
2465
2466 if (devinfo->ver <= 5 && OPT(lower_minmax)) {
2467 OPT(opt_cmod_propagation);
2468 OPT(opt_cse);
2469 OPT(opt_copy_propagation);
2470 OPT(dead_code_eliminate);
2471 }
2472
2473 if (OPT(lower_simd_width)) {
2474 OPT(opt_copy_propagation);
2475 OPT(dead_code_eliminate);
2476 }
2477
2478 if (failed)
2479 return false;
2480
2481 OPT(lower_64bit_mad_to_mul_add);
2482
2483 /* Run this before payload setup because tessellation shaders
2484 * rely on it to prevent cross dvec2 regioning on DF attributes
2485 * that are setup so that XY are on the second half of register and
2486 * ZW are in the first half of the next.
2487 */
2488 OPT(scalarize_df);
2489
2490 setup_payload();
2491
2492 if (INTEL_DEBUG(DEBUG_SPILL_VEC4)) {
2493 /* Debug of register spilling: Go spill everything. */
2494 const int grf_count = alloc.count;
2495 float *spill_costs = ralloc_array(NULL, float, alloc.count);
2496 bool *no_spill = ralloc_array(NULL, bool, alloc.count);
2497 evaluate_spill_costs(spill_costs, no_spill);
2498 for (int i = 0; i < grf_count; i++) {
2499 if (no_spill[i])
2500 continue;
2501 spill_reg(i);
2502 }
2503 ralloc_free(spill_costs);
2504 ralloc_free(no_spill);
2505
2506 /* We want to run this after spilling because 64-bit (un)spills need to
2507 * emit code to shuffle 64-bit data for the 32-bit scratch read/write
2508 * messages that can produce unsupported 64-bit swizzle regions.
2509 */
2510 OPT(scalarize_df);
2511 }
2512
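   /* 3-src instructions cannot have a null destination, so give any such
    * instruction a real destination register before register allocation.
    */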
2513 fixup_3src_null_dest();
2514
2515 bool allocated_without_spills = reg_allocate();
2516
2517 if (!allocated_without_spills) {
2518 elk_shader_perf_log(compiler, log_data,
2519 "%s shader triggered register spilling. "
2520 "Try reducing the number of live vec4 values "
2521 "to improve performance.\n",
2522 _mesa_shader_stage_to_string(stage));
2523
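      /* Keep retrying register allocation until it succeeds; a failed attempt
       * either spills more registers or marks the compile as failed, which we
       * check for below.
       */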
2524 while (!reg_allocate()) {
2525 if (failed)
2526 return false;
2527 }
2528
2529 /* We want to run this after spilling because 64-bit (un)spills need to
2530 * emit code to shuffle 64-bit data for the 32-bit scratch read/write
2531 * messages that can produce unsupported 64-bit swizzle regions.
2532 */
2533 OPT(scalarize_df);
2534 }
2535
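   /* After register allocation: schedule instructions, set dependency control
    * hints, and lower the remaining IR registers to hardware registers.
    */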
2536 opt_schedule_instructions();
2537
2538 opt_set_dependency_control();
2539
2540 convert_to_hw_regs();
2541
2542 if (last_scratch > 0) {
2543 prog_data->base.total_scratch =
2544 elk_get_scratch_size(last_scratch * REG_SIZE);
2545 }
2546
2547 return !failed;
2548 }
2549
2550 } /* namespace elk */
2551
2552 extern "C" {
2553
2554 const unsigned *
2555 elk_compile_vs(const struct elk_compiler *compiler,
2556 struct elk_compile_vs_params *params)
2557 {
2558 struct nir_shader *nir = params->base.nir;
2559 const struct elk_vs_prog_key *key = params->key;
2560 struct elk_vs_prog_data *prog_data = params->prog_data;
2561 const bool debug_enabled =
2562 elk_should_print_shader(nir, params->base.debug_flag ?
2563 params->base.debug_flag : DEBUG_VS);
2564
2565 prog_data->base.base.stage = MESA_SHADER_VERTEX;
2566 prog_data->base.base.total_scratch = 0;
2567
2568 const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
2569 elk_nir_apply_key(nir, compiler, &key->base, 8);
2570
2571 const unsigned *assembly = NULL;
2572
2573 prog_data->inputs_read = nir->info.inputs_read;
2574 prog_data->double_inputs_read = nir->info.vs.double_inputs;
2575
2576 elk_nir_lower_vs_inputs(nir, params->edgeflag_is_last, key->gl_attrib_wa_flags);
2577 elk_nir_lower_vue_outputs(nir);
2578 elk_postprocess_nir(nir, compiler, debug_enabled,
2579 key->base.robust_flags);
2580
2581 prog_data->base.clip_distance_mask =
2582 ((1 << nir->info.clip_distance_array_size) - 1);
2583 prog_data->base.cull_distance_mask =
2584 ((1 << nir->info.cull_distance_array_size) - 1) <<
2585 nir->info.clip_distance_array_size;
2586
2587 unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
2588
2589 /* gl_VertexID and gl_InstanceID are system values, but arrive via an
2590 * incoming vertex attribute. So, add an extra slot.
2591 */
2592 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
2593 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
2594 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
2595 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID)) {
2596 nr_attribute_slots++;
2597 }
2598
2599    /* gl_DrawID and IsIndexedDraw share their own vec4 slot */
2600 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID) ||
2601 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
2602 nr_attribute_slots++;
2603 }
2604
2605 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_IS_INDEXED_DRAW))
2606 prog_data->uses_is_indexed_draw = true;
2607
2608 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX))
2609 prog_data->uses_firstvertex = true;
2610
2611 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE))
2612 prog_data->uses_baseinstance = true;
2613
2614 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE))
2615 prog_data->uses_vertexid = true;
2616
2617 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID))
2618 prog_data->uses_instanceid = true;
2619
2620 if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
2621 prog_data->uses_drawid = true;
2622
2623 /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
2624 * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode. Empirically, in
2625 * vec4 mode, the hardware appears to wedge unless we read something.
2626 */
2627 if (is_scalar)
2628 prog_data->base.urb_read_length =
2629 DIV_ROUND_UP(nr_attribute_slots, 2);
2630 else
2631 prog_data->base.urb_read_length =
2632 DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
2633
2634 prog_data->nr_attribute_slots = nr_attribute_slots;
2635
2636 /* Since vertex shaders reuse the same VUE entry for inputs and outputs
2637 * (overwriting the original contents), we need to make sure the size is
2638 * the larger of the two.
2639 */
2640 const unsigned vue_entries =
2641 MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
2642
2643 if (compiler->devinfo->ver == 6) {
2644 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
2645 } else {
2646 prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
2647 }
2648
2649 if (unlikely(debug_enabled)) {
2650 fprintf(stderr, "VS Output ");
2651 elk_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX);
2652 }
2653
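   /* Compile with the scalar (SIMD8) backend when the vertex stage is scalar
    * on this platform; otherwise fall through to the vec4 backend below.
    */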
2654 if (is_scalar) {
2655 const unsigned dispatch_width = 8;
2656 prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8;
2657
2658       elk_fs_visitor v(compiler, &params->base, &key->base,
2659 &prog_data->base.base, nir, dispatch_width,
2660 params->base.stats != NULL, debug_enabled);
2661 if (!v.run_vs()) {
2662 params->base.error_str =
2663 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
2664 return NULL;
2665 }
2666
2667 assert(v.payload().num_regs % reg_unit(compiler->devinfo) == 0);
2668 prog_data->base.base.dispatch_grf_start_reg =
2669 v.payload().num_regs / reg_unit(compiler->devinfo);
2670
2671       elk_fs_generator g(compiler, &params->base,
2672 &prog_data->base.base, v.runtime_check_aads_emit,
2673 MESA_SHADER_VERTEX);
2674 if (unlikely(debug_enabled)) {
2675 const char *debug_name =
2676 ralloc_asprintf(params->base.mem_ctx, "%s vertex shader %s",
2677 nir->info.label ? nir->info.label :
2678 "unnamed",
2679 nir->info.name);
2680
2681 g.enable_debug(debug_name);
2682 }
2683 g.generate_code(v.cfg, dispatch_width, v.shader_stats,
2684 v.performance_analysis.require(), params->base.stats);
2685 g.add_const_data(nir->constant_data, nir->constant_data_size);
2686 assembly = g.get_assembly();
2687 }
2688
2689 if (!assembly) {
2690 prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
2691
2692       vec4_vs_visitor v(compiler, &params->base, key, prog_data,
2693 nir, debug_enabled);
2694 if (!v.run()) {
2695 params->base.error_str =
2696 ralloc_strdup(params->base.mem_ctx, v.fail_msg);
2697 return NULL;
2698 }
2699
2700       assembly = elk_vec4_generate_assembly(compiler, &params->base,
2701 nir, &prog_data->base,
2702 v.cfg,
2703 v.performance_analysis.require(),
2704 debug_enabled);
2705 }
2706
2707 return assembly;
2708 }
2709
2710 } /* extern "C" */
2711