/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
bool
brw_fs_lower_constant_loads(fs_visitor &s)
{
   unsigned index, pull_index;
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      /* Set up the annotation tracking for newly generated instructions. */
      const fs_builder ibld(&s, block, inst);

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* We'll handle this case later */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!s.get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         const unsigned base = pull_index * 4;

         fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
         srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
         srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
         srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
                   srcs, PULL_UNIFORM_CONSTANT_SRCS);

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;
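         /* Worked example with made-up numbers: pull_index 17 puts the
          * uniform at byte 68 of the pull constant buffer, so the message
          * above fetches the cacheline starting at byte 64 and the source is
          * rewritten to read at byte offset 4 within the loaded VGRF.
          */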

         progress = true;
      }

      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {

         if (!s.get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

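         /* src[1] holds a per-channel offset, so this has to be a varying
          * pull constant load; it writes straight into the original
          * destination and the MOV_INDIRECT itself is removed.
          */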
         s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
                                      brw_imm_ud(index),
                                      fs_reg() /* surface_handle */,
                                      inst->src[1],
                                      pull_index * 4, 4, 1);
         inst->remove(block);

         progress = true;
      }
   }
   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_load_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == VGRF);
      assert(inst->saturate == false);
      fs_reg dst = inst->dst;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all();

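      /* Header GRFs are copied with force_writemask_all MOVs below; the
       * remaining sources are then copied one logical component at a time at
       * the instruction's original execution size.
       */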
      for (uint8_t i = 0; i < inst->header_size;) {
         /* Number of header GRFs to initialize at once with a single MOV
          * instruction.
          */
         const unsigned n =
            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
            2 : 1;

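         /* Two consecutive header sources that are also contiguous in the
          * register file can be initialized with a single SIMD16 MOV.
          */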
         if (inst->src[i].file != BAD_FILE)
            ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
                                     retype(inst->src[i], BRW_REGISTER_TYPE_UD));

         dst = byte_offset(dst, n * REG_SIZE);
         i += n;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         dst.type = inst->src[i].type;
         if (inst->src[i].file != BAD_FILE) {
            ibld.MOV(dst, inst->src[i]);
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_minmax(fs_visitor &s)
{
   assert(s.devinfo->ver < 6);

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == BRW_OPCODE_SEL &&
          inst->predicate == BRW_PREDICATE_NONE) {
         /* If src1 is an immediate value, we know at compile time whether it
          * is NaN.  If it is not NaN, emit CMP because it is much better for
          * cmod propagation.  Likewise if src1 is not float.  Gfx4 and Gfx5
          * don't support HF or DF, so it is not necessary to check for those.
          */
         if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
             (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     inst->conditional_mod);
         } else {
            ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
                      inst->conditional_mod);
         }
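         /* The SEL now just picks between the original sources based on the
          * flag written by the comparison above, so predicate it and drop
          * its conditional mod.
          */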
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_fs_lower_sub_sat(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      const fs_builder ibld(&s, block, inst);

      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
         /* The fundamental problem is the hardware performs source negation
          * at the bit width of the source.  If the source is 0x80000000D, the
          * negation is 0x80000000D.  As a result, subtractSaturate(0,
          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
          * are at least three ways to resolve this:
          *
          * 1. Use the accumulator for the negated source.  The accumulator is
          *    33 bits, so our source 0x80000000 is sign-extended to
          *    0x1800000000.  The negation of which is 0x080000000.  This
          *    doesn't help for 64-bit integers (which are already bigger than
          *    33 bits).  There are also only 8 accumulators, so SIMD16 or
          *    SIMD32 instructions would have to be split into multiple SIMD8
          *    instructions.
          *
          * 2. Use slightly different math.  For any n-bit value x, we know (x
          *    >> 1) != -(x >> 1).  We can use this fact to only do
          *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
          *
          * 3. For unsigned sources, it is sufficient to replace the
          *    subtractSaturate with (a > b) ? a - b : 0.
          *
          * It may also be possible to use the SUBB instruction.  This
          * implicitly writes the accumulator, so it could only be used in the
          * same situations as #1 above.  It is further limited by only
          * allowing UD sources.
          */
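         /* The code below uses #1 for SIMD8 instructions on 32-bit and
          * smaller types, #2 for the remaining signed cases and #3 for the
          * remaining unsigned cases.
          */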
         if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
             inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
            fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);

            ibld.MOV(acc, inst->src[1]);
            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
            add->saturate = true;
            add->src[0].negate = true;
         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
            /* tmp = src1 >> 1;
             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
             */
            fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
            fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
            fs_inst *add;

            ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));

            add = ibld.ADD(tmp2, inst->src[1], tmp1);
            add->src[1].negate = true;

            add = ibld.ADD(tmp3, inst->src[0], tmp1);
            add->src[1].negate = true;
            add->saturate = true;

            add = ibld.ADD(inst->dst, tmp3, tmp2);
            add->src[1].negate = true;
            add->saturate = true;
         } else {
            /* a > b ? a - b : 0 */
            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                     BRW_CONDITIONAL_G);

            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
            add->src[1].negate = !add->src[1].negate;

            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
               ->predicate = BRW_PREDICATE_NORMAL;
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *    rN+0: X[0-7]
 *    rN+1: Y[0-7]
 *    rN+2: X[8-15]
 *    rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- this is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
brw_fs_lower_barycentrics(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;

   if (s.stage != MESA_SHADER_FRAGMENT || devinfo->ver >= 20)
      return false;

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case FS_OPCODE_LINTERP : {
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
         fs_reg srcs[4];

         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
                                   8 * (i / 2));

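         /* srcs[] now holds { X[0-7], Y[0-7], X[8-15], Y[8-15] }, which
          * LOAD_PAYLOAD copies into consecutive GRFs of tmp, giving the
          * interleaved layout described above.
          */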
         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         inst->src[0] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);

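         /* Let the message write its interleaved result into tmp, then emit
          * MOVs after the instruction to deinterleave it into the standard
          * vector layout of the original destination, preserving the
          * original predication.
          */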
         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Lower a derivative instruction as the floating-point difference of two
 * swizzles of the source, specified as \p swz0 and \p swz1.
 */
static bool
lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst,
                 unsigned swz0, unsigned swz1)
{
   const fs_builder ubld = fs_builder(&s, block, inst).exec_all();
   const fs_reg tmp0 = ubld.vgrf(inst->src[0].type);
   const fs_reg tmp1 = ubld.vgrf(inst->src[0].type);

   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
   ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));

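   /* The derivative becomes dst = swz1(src0) - swz0(src0), implemented as an
    * ADD with the first source negated.
    */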
   inst->resize_sources(2);
   inst->src[0] = negate(tmp0);
   inst->src[1] = tmp1;
   inst->opcode = BRW_OPCODE_ADD;

   return true;
}

/**
 * Lower derivative instructions on platforms where codegen cannot implement
 * them efficiently (i.e. XeHP).
 */
bool
brw_fs_lower_derivatives(fs_visitor &s)
{
   bool progress = false;

   if (s.devinfo->verx10 < 125)
      return false;

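   /* Each case below picks a pair of quad swizzles so that the ADD emitted by
    * lower_derivative() forms the right difference.  Assuming the usual 2x2
    * subspan numbering (0 top-left, 1 top-right, 2 bottom-left,
    * 3 bottom-right), DDX takes horizontal differences and DDY vertical ones,
    * once per quad for the coarse variants and per row/column for the fine
    * variants.
    */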
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == FS_OPCODE_DDX_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);

      else if (inst->opcode == FS_OPCODE_DDX_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);

      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);

      else if (inst->opcode == FS_OPCODE_DDY_FINE)
         progress |= lower_derivative(s, block, inst,
                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

bool
brw_fs_lower_find_live_channel(fs_visitor &s)
{
   bool progress = false;

   bool packed_dispatch =
      brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                    s.stage_prog_data);
   bool vmask =
      s.stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(s.stage_prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_LOAD_LIVE_CHANNELS)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

      const fs_builder ibld(&s, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0);

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.UNDEF(mask);
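         /* sr0.2 holds the dispatch mask (DMask) and sr0.3 the vector mask
          * (VMask); read whichever one applies to this shader.
          */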
         ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      switch (inst->opcode) {
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         ubld.FBL(inst->dst, exec_mask);
         break;

      case SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL: {
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         ubld.UNDEF(tmp);
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
         break;
      }

      case SHADER_OPCODE_LOAD_LIVE_CHANNELS:
         ubld.MOV(inst->dst, exec_mask);
         break;

      default:
         unreachable("Impossible.");
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
brw_fs_lower_sends_overlapping_payload(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
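         /* Copy the second payload two GRFs at a time with SIMD16 UD MOVs,
          * falling back to a SIMD8 MOV if a single register is left over.
          */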
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Three-source instructions must have a GRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
bool
brw_fs_lower_3src_null_dest(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (inst->is_3src(s.compiler) && inst->dst.is_null()) {
         inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8),
                            inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_VARIABLES);

   return progress;
}