/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_builder.h"

using namespace brw;

/**
 * Align16 3-source instructions cannot have scalar stride w/64-bit types.
 *
 * The Bspec says:
 *
 *    Replicate Control. This field is only present in three-source
 *    instructions, for each of the three source operands. It controls
 *    replication of the starting channel to all channels in the execution
 *    size. ChanSel does not apply when Replicate Control is set. This is
 *    applicable to 32b datatypes and 16b datatype. 64b datatypes cannot use
 *    the replicate control.
 *
 * In practice, this can only happen on Gfx9 with DF sources to MAD. Since
 * the source is_scalar, this can be fixed by just making the stride=1. Also
 * clear is_scalar "just in case."
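 *
 * Roughly (the regioning syntax here is only illustrative):
 *
 *    mad(8) dst:DF src0<0>:DF src1:DF src2:DF   before (scalar, stride 0)
 *    mad(8) dst:DF src0<1>:DF src1:DF src2:DF   after (stride forced to 1)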
 */
bool
brw_lower_scalar_fp64_MAD(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   if (devinfo->ver != 9)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_MAD &&
          inst->dst.type == BRW_TYPE_DF) {
         for (unsigned i = 0; i < 3; i++) {
            if (inst->src[i].is_scalar) {
               inst->src[i].is_scalar = false;
               inst->src[i].stride = 1;
               progress = true;
            }
         }
      }
   }

   return progress;
}

bool
brw_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      brw_reg dst = inst->dst;

      const brw_builder ibld(&s, block, inst);
      const brw_builder ubld = ibld.exec_all();

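      /* The header portion of the payload is copied with force_writemask_all
       * MOVs of one or two whole GRFs at a time; the remaining per-channel
       * sources are then copied at the instruction's full execution size.
       */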
      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_TYPE_UD),
                                     retype(inst->src[i], BRW_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Lower CSEL with unsupported types to CMP+SEL.
 *
 * Or, for unsigned ==/!= comparisons, simply change the types.
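 *
 * As a rough sketch (assembly syntax here is only illustrative), an
 * unsupported
 *
 *    csel.ge   dst   src0  src1  src2
 *
 * becomes
 *
 *    cmp.ge    null  src2  0
 *    (+f0) sel dst   src0  src1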
 */
bool
brw_lower_csel(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != BRW_OPCODE_CSEL)
         continue;

      bool supported = false;
      enum brw_reg_type orig_type = inst->src[2].type;
      enum brw_reg_type new_type = orig_type;

      switch (orig_type) {
      case BRW_TYPE_F:
         /* Gfx9 CSEL can only do F */
         supported = true;
         break;
      case BRW_TYPE_HF:
      case BRW_TYPE_W:
      case BRW_TYPE_D:
         /* Gfx11+ CSEL can do HF, W, and D. Note that we can't simply
          * retype integer ==/!= comparisons as float on earlier hardware
          * because it breaks for 0x80000000 and 0 (-0.0 == 0.0).
          */
         supported = devinfo->ver >= 11;
         break;
      case BRW_TYPE_UW:
      case BRW_TYPE_UD:
         /* CSEL doesn't support UW/UD but we can simply retype to use the
          * signed types when comparing with == or !=.
          */
         supported = devinfo->ver >= 11 &&
                     (inst->conditional_mod == BRW_CONDITIONAL_EQ ||
                      inst->conditional_mod == BRW_CONDITIONAL_NEQ);

         /* Bspec 47408: Gfx125+ CSEL does support both signed and unsigned
          * integer types.
          */
         if (devinfo->verx10 < 125) {
            new_type = inst->src[2].type == BRW_TYPE_UD ?
                       BRW_TYPE_D : BRW_TYPE_W;
         }
         break;
      default:
         break;
      }

      if (!supported) {
         const brw_builder ibld(&s, block, inst);

         /* CSEL: dst = src2 <op> 0 ? src0 : src1 */
         brw_reg zero = brw_imm_reg(orig_type);
         ibld.CMP(retype(brw_null_reg(), orig_type),
                  inst->src[2], zero, inst->conditional_mod);

         inst->opcode = BRW_OPCODE_SEL;
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;
         inst->resize_sources(2);
         progress = true;
      } else if (new_type != orig_type) {
         inst->src[0].type = new_type;
         inst->src[1].type = new_type;
         inst->src[2].type = new_type;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const brw_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source. If the source is 0x80000000D, the
          * negation is 0x80000000D. As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source. The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x1800000000, the negation of which is 0x080000000. This
          *    doesn't help for 64-bit integers (which are already bigger than
          *    33 bits). There are also only 8 accumulators, so SIMD16 or
          *    SIMD32 instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math. For any n-bit value x, we know (x
          *    >> 1) != -(x >> 1). We can use this fact to only do
          *    subtractions involving (x >> 1). subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction. This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above. It is further limited by only
          * allowing UD sources.
          */
         if (inst->exec_size == 8 && inst->src[0].type != BRW_TYPE_Q &&
             inst->src[0].type != BRW_TYPE_UQ) {
            brw_reg acc = retype(brw_acc_reg(inst->exec_size),
                                 inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_inst *add;

            brw_reg tmp = ibld.vgrf(inst->src[0].type);
            ibld.SHR(tmp, inst->src[1], brw_imm_d(1));

            brw_reg s1_sub_t = ibld.ADD(inst->src[1], negate(tmp));
            brw_reg sat_s0_sub_t = ibld.ADD(inst->src[0], negate(tmp), &add);
            add->saturate = true;

            add = ibld.ADD(inst->dst, sat_s0_sub_t, negate(s1_sub_t));
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
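 *
 * For comparison, the standard (non-interleaved) component layout that the
 * rest of the compiler assumes for the same SIMD16 vector would roughly be:
 *
 *    rN+0: X[0-7]
 *    rN+1: X[8-15]
 *    rN+2: Y[0-7]
 *    rN+3: Y[8-15]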
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;

   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const brw_builder ibld(&s, block, inst);
      const brw_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case BRW_OPCODE_PLN: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->src[1].type, 2);
         brw_reg srcs[4];

         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[1], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[1] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         const brw_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
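 *
 * For instance, FS_OPCODE_DDX_FINE below becomes (roughly):
 *
 *    quad_swizzle  tmp0, src0, XXZZ
 *    quad_swizzle  tmp1, src0, YYWW
 *    add           dst, -tmp0, tmp1
 *
 * i.e. the difference between horizontally adjacent channels of each quad.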
 */
static bool
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const brw_builder ubld = brw_builder(&s, block, inst).exec_all();
   const brw_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const brw_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
brw_lower_derivatives(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->verx10 < 125)
      return false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.prog_data);
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask. The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */

      const brw_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const brw_builder ubld = brw_builder(&s, block, inst).exec_all().group(1, 0);

      brw_reg exec_mask = ubld.vgrf(BRW_TYPE_UD);
      ubld.UNDEF(exec_mask);
      ubld.emit(SHADER_OPCODE_READ_ARCH_REG, exec_mask,
                retype(brw_mask_reg(0),
                       BRW_TYPE_UD));

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
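       *
       * (The dispatch mask lives in sr0.2 and the vector mask in sr0.3,
       * hence the brw_sr0_reg(vmask ? 3 : 2) below.)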
       */
      if (!(first && packed_dispatch)) {
         brw_reg mask = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(mask);
         ubld.emit(SHADER_OPCODE_READ_ARCH_REG, mask,
                   retype(brw_sr0_reg(vmask ? 3 : 2),
                          BRW_TYPE_UD));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
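         /* Last live channel = 31 - LZD(mask): LZD counts leading zeros, so
          * subtracting it from 31 yields the index of the topmost set bit.
          */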
         brw_reg tmp = ubld.vgrf(BRW_TYPE_UD);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         unreachable("Impossible.");
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0. This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         const unsigned arg = inst->mlen < inst->ex_mlen ? 2 : 3;
         const unsigned len = MIN2(inst->mlen, inst->ex_mlen);

         brw_reg tmp = brw_vgrf(s.alloc.allocate(len),
                                BRW_TYPE_UD);

         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point. Just WE_all it.
          */
         const brw_builder ibld = brw_builder(&s, block, inst).exec_all().group(16, 0);
         brw_reg copy_src = retype(inst->src[arg], BRW_TYPE_UD);
         brw_reg copy_dst = tmp;
         for (unsigned i = 0; i < len; i += 2) {
            if (len == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[arg] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three-source instructions must have a GRF destination register.
 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
 */
bool
brw_lower_3src_null_dest(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
         inst->dst = brw_vgrf(s.alloc.allocate(s.dispatch_width / 8),
                              inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_VARIABLES);

   return progress;
}

static bool
unsupported_64bit_type(const intel_device_info *devinfo,
                       enum brw_reg_type type)
{
   return (!devinfo->has_64bit_float && type == BRW_TYPE_DF) ||
          (!devinfo->has_64bit_int && (type == BRW_TYPE_UQ ||
                                       type == BRW_TYPE_Q));
}

/**
 * Perform lowering to legalize the IR for various ALU restrictions.
 *
 * For example:
 * - Splitting 64-bit MOV/SEL into 2x32-bit where needed
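 *
 *   As a rough illustration (the .lo/.hi notation here is only shorthand
 *   for the two dword halves selected with subscript()), a 64-bit
 *
 *      mov(8)  dst:Q  src:Q
 *
 *   becomes
 *
 *      mov(8)  dst.hi:UD  src.hi:UD
 *      mov(8)  dst.lo:UD  src.lo:UD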
 */
bool
brw_lower_alu_restrictions(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            ibld.MOV(subscript(inst->dst, type, 1),
                     subscript(inst->src[0], type, 1));
            ibld.MOV(subscript(inst->dst, type, 0),
                     subscript(inst->src[0], type, 0));

            inst->remove(block);
            progress = true;
         }
         break;

      case BRW_OPCODE_SEL:
         if (unsupported_64bit_type(devinfo, inst->dst.type)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
            const brw_builder ibld(&s, block, inst);

            enum brw_reg_type type = brw_type_with_size(inst->dst.type, 32);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 0),
                                   subscript(inst->src[0], type, 0),
                                   subscript(inst->src[1], type, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, type, 1),
                                   subscript(inst->src[0], type, 1),
                                   subscript(inst->src[1], type, 1)));

            inst->remove(block);
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_INSTRUCTION_DETAIL);
   }

   return progress;
}

static void
brw_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
                            brw_reg *reg, bool compressed)
{
   if (reg->file != VGRF)
      return;

   struct brw_reg new_reg;

   if (reg->stride == 0) {
      new_reg = brw_vec1_grf(reg->nr, 0);
   } else if (reg->stride > 4) {
      assert(reg != &inst->dst);
      assert(reg->stride * brw_type_size_bytes(reg->type) <= REG_SIZE);
      new_reg = brw_vecn_grf(1, reg->nr, 0);
      new_reg = stride(new_reg, reg->stride, 1, 0);
   } else {
      /* From the Haswell PRM:
       *
       *    "VertStride must be used to cross GRF register boundaries. This
       *    rule implies that elements within a 'Width' cannot cross GRF
       *    boundaries."
       *
       * The maximum width value that could satisfy this restriction is:
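       * REG_SIZE / (stride * type size); e.g. a stride-2 dword region gives
       * 32 / (2 * 4) = 4.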
       */
      const unsigned reg_width =
         REG_SIZE / (reg->stride * brw_type_size_bytes(reg->type));

      /* Because the hardware can only split source regions at a whole
       * multiple of width during decompression (i.e. vertically), clamp
       * the value obtained above to the physical execution size of a
       * single decompressed chunk of the instruction:
       */
      const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                  inst->exec_size;

      /* XXX - The equation above is strictly speaking not correct on
       *       hardware that supports unbalanced GRF writes -- On Gfx9+
       *       each decompressed chunk of the instruction may have a
       *       different execution size when the number of components
       *       written to each destination GRF is not the same.
       */

      const unsigned max_hw_width = 16;

      const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
      new_reg = brw_vecn_grf(width, reg->nr, 0);
      new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
   }

   new_reg = retype(new_reg, reg->type);
   new_reg = byte_offset(new_reg, reg->offset);
   new_reg.abs = reg->abs;
   new_reg.negate = reg->negate;
   new_reg.is_scalar = reg->is_scalar;

   *reg = new_reg;
}

void
brw_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
{
   assert(s.grf_used || !"Must be called after register allocation");

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed. For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */

      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;

      brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
      for (int i = 0; i < inst->sources; i++) {
         brw_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
      }
   }

   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                         DEPENDENCY_VARIABLES);
}

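/* Build a scalar region pointing at element `subnr` (in units of the given
 * type) of the ARF scalar register s0, which brw_lower_send_gather_inst()
 * below uses as the SEND_GATHER register-list payload.
 */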
static brw_reg
brw_s0(enum brw_reg_type type, unsigned subnr)
{
   return brw_make_reg(ARF,
                       BRW_ARF_SCALAR,
                       subnr,
                       0,
                       0,
                       type,
                       BRW_VERTICAL_STRIDE_0,
                       BRW_WIDTH_1,
                       BRW_HORIZONTAL_STRIDE_0,
                       BRW_SWIZZLE_XYZW,
                       WRITEMASK_XYZW);
}

static bool
brw_lower_send_gather_inst(fs_visitor &s, bblock_t *block, fs_inst *inst)
{
   const intel_device_info *devinfo = s.devinfo;
   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   assert(inst->opcode == SHADER_OPCODE_SEND_GATHER);
   assert(inst->sources > 2);
   assert(inst->src[2].file == BAD_FILE);

   unsigned count = 0;
   uint8_t regs[16] = {};

   const unsigned num_payload_sources = inst->sources - 3;
   assert(num_payload_sources > 0);

   /* Limited by Src0.Length in the SEND instruction. */
   assert(num_payload_sources < 16);

   for (unsigned i = 3; i < inst->sources; i++) {
      assert(inst->src[i].file == FIXED_GRF);
      assert(inst->src[i].nr % reg_unit(devinfo) == 0);

      unsigned nr = phys_nr(devinfo, inst->src[i]);
      assert(nr <= UINT8_MAX);
      regs[count++] = nr;
   }

   /* Fill out ARF scalar register with the physical register numbers
    * and use SEND_GATHER.
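    *
    * One byte per register, eight registers per qword of s0 -- e.g. if the
    * gathered sources lived at physical GRFs 4, 6 and 8 (made-up numbers),
    * s0.0:uq would end up as 0x0000000000080604.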
    */
   brw_builder ubld = brw_builder(&s, block, inst).group(1, 0).exec_all();
   for (unsigned q = 0; q < DIV_ROUND_UP(count, 8); q++) {
      uint64_t v = 0;
      for (unsigned i = 0; i < 8; i++) {
         const uint64_t reg = regs[(q * 8) + i];
         v |= reg << (8 * i);
      }
      ubld.MOV(brw_s0(BRW_TYPE_UQ, q), brw_imm_uq(v));
   }

   inst->src[2] = brw_s0(BRW_TYPE_UD, 0);
   inst->mlen = count * unit;

   return true;
}

bool
brw_lower_send_gather(fs_visitor &s)
{
   assert(s.devinfo->ver >= 30);
   assert(s.grf_used || !"Must be called after register allocation");

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND_GATHER)
         progress |= brw_lower_send_gather_inst(s, block, inst);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                            DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_lower_load_subgroup_invocation(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_SUBGROUP_INVOCATION)
         continue;

      const brw_builder abld =
         brw_builder(&s, block, inst).annotate("SubgroupInvocation");
      const brw_builder ubld8 = abld.group(8, 0).exec_all();
      ubld8.UNDEF(inst->dst);

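      /* brw_imm_v(0x76543210) below is a packed 4-bit vector immediate: each
       * channel of the first SIMD8 group reads one nibble, giving the values
       * 0, 1, ... 7.
       */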
      if (inst->exec_size == 8) {
         assert(inst->dst.type == BRW_TYPE_UD);
         brw_reg uw = retype(inst->dst, BRW_TYPE_UW);
         ubld8.MOV(uw, brw_imm_v(0x76543210));
         ubld8.MOV(inst->dst, uw);
      } else {
         assert(inst->dst.type == BRW_TYPE_UW);
         ubld8.MOV(inst->dst, brw_imm_v(0x76543210));
         ubld8.ADD(byte_offset(inst->dst, 16), inst->dst, brw_imm_uw(8u));
         if (inst->exec_size > 16) {
            const brw_builder ubld16 = abld.group(16, 0).exec_all();
            ubld16.ADD(byte_offset(inst->dst, 32), inst->dst, brw_imm_uw(16u));
         }
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_lower_indirect_mov(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->ver < 20)
      return progress;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT) {
         if (brw_type_size_bytes(inst->src[0].type) > 1 &&
             brw_type_size_bytes(inst->dst.type) > 1) {
            continue;
         }

         assert(brw_type_size_bytes(inst->src[0].type) ==
                brw_type_size_bytes(inst->dst.type));

         const brw_builder ibld(&s, block, inst);

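         /* Illustrative example: an indirect byte read at offset 5 becomes a
          * word-aligned UW read at offset 4 followed by selecting the high
          * byte, because 5 is odd.
          */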
         /* Extract unaligned part */
         uint16_t extra_offset = inst->src[0].offset & 0x1;
         brw_reg offset = ibld.ADD(inst->src[1], brw_imm_uw(extra_offset));

         /* Check if offset is odd or even so that we can choose either high or
          * low byte from the result.
          */
         brw_reg is_odd = ibld.AND(offset, brw_imm_ud(1));

         /* Make sure offset is word (2-byte) aligned */
         offset = ibld.AND(offset, brw_imm_uw(~1));

         /* Indirect addressing (Vx1 and VxH) is not supported with UB/B
          * datatypes for Src0, so change the data type for src0 and dst to UW.
          */
         brw_reg dst = ibld.vgrf(BRW_TYPE_UW);

         /* Subtract the unaligned offset from the src0 offset since we already
          * accounted for the unaligned part in the indirect byte offset.
          */
         brw_reg start = retype(inst->src[0], BRW_TYPE_UW);
         start.offset &= ~extra_offset;

         /* Adjust the length to account for the extra offset. */
         assert(inst->src[2].file == IMM);
         brw_reg length = brw_imm_ud(inst->src[2].ud + extra_offset);

         ibld.emit(SHADER_OPCODE_MOV_INDIRECT, dst, start, offset, length);

         /* Select the high byte if the offset is odd, otherwise the low byte. */
         brw_reg lo = ibld.AND(dst, brw_imm_uw(0xff));
         brw_reg hi = ibld.SHR(dst, brw_imm_uw(8));
         brw_reg result = ibld.vgrf(BRW_TYPE_UW);
         ibld.CSEL(result, hi, lo, is_odd, BRW_CONDITIONAL_NZ);

         /* Extra MOV needed here to convert back to the corresponding B type */
         ibld.MOV(inst->dst, result);

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}