/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_builder.h"

#include "dev/intel_debug.h"

using namespace brw;

void
brw_optimize(fs_visitor &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_validate(s);

   /* Track how much of the shader is non-SSA at this point. */
   {
      const brw::def_analysis &defs = s.def_analysis.require();
      s.shader_stats.non_ssa_registers_after_nir =
         defs.count() - defs.ssa_count();
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

#define OPT(pass, ...) ({                                               \
      pass_num++;                                                       \
      bool this_progress = pass(s, ##__VA_ARGS__);                      \
                                                                        \
      if (this_progress)                                                \
         s.debug_optimizer(nir, #pass, iteration, pass_num);            \
                                                                        \
      brw_validate(s);                                                  \
                                                                        \
      progress = progress || this_progress;                            \
      this_progress;                                                    \
   })
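
   /* Note that OPT() expands to a statement expression whose value is the
    * pass's progress flag, so it can be used directly in conditionals, as is
    * done below, e.g.:
    *
    *    if (OPT(brw_lower_pack))
    *       OPT(brw_opt_dead_code_eliminate);
    */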

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code. The results of some NIR
    * instructions may effectively be calculated twice. Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered. Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_opt_dead_code_eliminate);

   OPT(brw_opt_remove_extra_rounding_modes);

   OPT(brw_opt_eliminate_find_live_channel);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_opt_algebraic);
      OPT(brw_opt_cse_defs);
      if (!OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_copy_propagation);
      OPT(brw_opt_cmod_propagation);
      OPT(brw_opt_dead_code_eliminate);
      OPT(brw_opt_saturate_propagation);
      OPT(brw_opt_register_coalesce);

      OPT(brw_opt_compact_virtual_grfs);
   } while (progress);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);

   progress = false;
   pass_num = 0;

   if (OPT(brw_opt_combine_convergent_txf))
      OPT(brw_opt_copy_propagation_defs);

   if (OPT(brw_lower_pack)) {
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_subgroup_ops);
   OPT(brw_lower_csel);
   OPT(brw_lower_simd_width);
   OPT(brw_lower_scalar_fp64_MAD);
   OPT(brw_lower_barycentrics);
   OPT(brw_lower_logical_sends);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);

   /* After logical SEND lowering. */

   if (!OPT(brw_opt_copy_propagation_defs))
      OPT(brw_opt_copy_propagation);

   /* Identify trailing zeros in the LOAD_PAYLOADs of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_opt_zero_samples)) {
      if (!OPT(brw_opt_copy_propagation_defs)) {
         OPT(brw_opt_copy_propagation);
      }
   }

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_to_send_gather);

   OPT(brw_opt_split_sends);
   OPT(brw_workaround_nomask_control_flow);

   if (progress) {
      /* Do both forms of copy propagation because it is important to
       * eliminate as many cases of load_payload-of-load_payload as possible.
       */
      OPT(brw_opt_copy_propagation_defs);
      OPT(brw_opt_copy_propagation);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_opt_cse_defs);
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_opt_remove_redundant_halts);

   if (OPT(brw_lower_load_payload)) {
      OPT(brw_opt_split_virtual_grfs);

      OPT(brw_opt_register_coalesce);
      OPT(brw_lower_simd_width);
      OPT(brw_opt_dead_code_eliminate);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);

   OPT(brw_lower_alu_restrictions);

   OPT(brw_opt_combine_constants);
   if (OPT(brw_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs. Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_lower_integer_multiplication);
   }
   OPT(brw_lower_sub_sat);

   progress = false;
   OPT(brw_lower_derivatives);
   OPT(brw_lower_regioning);

   /* Try both copy propagation passes. The defs one will likely not be
    * able to handle everything at this point.
    */
   const bool cp1 = OPT(brw_opt_copy_propagation_defs);
   const bool cp2 = OPT(brw_opt_copy_propagation);
   if (cp1 || cp2)
      OPT(brw_opt_combine_constants);

   OPT(brw_opt_dead_code_eliminate);
   OPT(brw_opt_register_coalesce);

   if (progress)
      OPT(brw_lower_simd_width);

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_gather_to_send);

   OPT(brw_lower_uniform_pull_constant_loads);

   if (OPT(brw_lower_send_descriptors)) {
      /* No need for standard copy_propagation since
       * brw_opt_address_reg_load will only optimize defs.
       */
      if (OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_algebraic);
      OPT(brw_opt_address_reg_load);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_sends_overlapping_payload);

   OPT(brw_lower_indirect_mov);

   OPT(brw_lower_find_live_channel);

   OPT(brw_lower_load_subgroup_invocation);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
}

static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);
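   /* Worked example (the numbers are hypothetical, assuming a 32-byte
    * REG_SIZE): with header_size == 1, exec_size == 8 and two 32-bit payload
    * sources, a size_read of 96 bytes (one header register plus 2 * 8 * 4
    * bytes of parameters) makes the loop below stop at i == 3.
    */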

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* The size read must exactly cover a whole number of leading sources. */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these instructions
 * instead of reserving registers for them. Trailing parameters that aren't
 * sent default to zero anyway. Shortening the message also lets the dead
 * code eliminator remove the MOV instructions that would otherwise be
 * emitted to set up the zero values.
 */
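/* An illustrative sketch (SIMD width, register numbers and parameter count
 * are made up): a headerless SIMD8 sampler SEND whose payload is
 *
 *    load_payload(8) vgrf10:F, vgrf4:F, vgrf5:F, 0.0f
 *    send(8)         vgrf12:F, ..., vgrf10:F   mlen 3
 *
 * only needs mlen 2, since the trailing parameter is zero.  After the
 * message is shortened, dead code elimination can drop the copy of the zero
 * into the payload.
 */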

bool
brw_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *    "Parameter 0 is required except for the sampleinfo message, which
       *     has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down to ensure we only consider full registers. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
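/* A hypothetical example (register numbers are made up): given
 *
 *    load_payload(8) vgrf20, vgrf10+0, vgrf10+1, vgrf10+2, vgrf15
 *    send(8)         ..., vgrf20   mlen 4
 *
 * the payload is split between vgrf10+2 and vgrf15, yielding two smaller
 * LOAD_PAYLOADs and a split SEND with mlen 3 and ex_mlen 1.  Since the first
 * three sources are consecutive offsets of the same VGRF, register
 * coalescing may then remove the first LOAD_PAYLOAD entirely.
 */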
bool
brw_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split SENDs that reuse a previously emitted payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* The SEND's mlen might be smaller than what the LOAD_PAYLOAD provides,
       * so find out how many sources from the payload it really needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const brw_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}

/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 *    halt        (redundant with the next halt)
 *    halt        (useless; jumps to the next instruction)
 *    halt-target
 */
bool
brw_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
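/* Illustrative sketch (register numbers are made up): with packed dispatch
 * and no enclosing control flow,
 *
 *    find_live_channel(8) vgrf4:UD
 *    broadcast(8)         vgrf5:UD, vgrf3:UD, vgrf4:UD
 *
 * becomes a pair of WE_all MOVs: vgrf4 is loaded with the immediate 0 and
 * vgrf5 with component 0 of vgrf3, since channel 0 is known to be live on
 * thread dispatch.
 */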
bool
brw_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->force_writemask_all = true;

            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
             * size_written set by hand to a smaller value.  In this case,
             * munge the exec_size to match.
             */
            if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
               inst->exec_size = 8 * reg_unit(s.devinfo);

            inst->resize_sources(1);
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST.  Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            fs_inst *bcast = (fs_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);

               bcast->force_writemask_all = true;
               bcast->exec_size = 8 * reg_unit(s.devinfo);
               assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
               bcast->resize_sources(1);
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}

/**
 * The rounding mode is emitted with each conversion instruction, but on the
 * hardware it is a piece of state.  Once it has been set, we don't need to
 * set it again for subsequent conversions that use the same mode.
 *
 * This is useful for vector/matrix conversions, as setting the mode once is
 * enough for the full vector/matrix.
 */
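/* A minimal sketch (the instruction sequence is hypothetical): within a
 * single block,
 *
 *    rnd_mode RTNE
 *    mov(8)   vgrf1:HF, vgrf0:F
 *    rnd_mode RTNE
 *    mov(8)   vgrf3:HF, vgrf2:F
 *
 * the second rnd_mode matches the mode already in effect and is removed.
 */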
bool
brw_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

bool
brw_opt_send_to_send_gather(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   unsigned count = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      /* For 1-2 registers, send-gather offers no benefits over split-send. */
      if (inst->mlen + inst->ex_mlen <= 2 * unit)
         continue;

      assert(inst->mlen % unit == 0);
      assert(inst->ex_mlen % unit == 0);

      struct {
         brw_reg src;
         unsigned phys_len;
      } payload[2] = {
         { inst->src[2], inst->mlen / unit },
         { inst->src[3], inst->ex_mlen / unit },
      };

      const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len;

      /* Limited by Src0.Length in the SEND instruction. */
      if (num_payload_sources > 15)
         continue;

      if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
         count++;
         continue;
      }

      inst->resize_sources(3 + num_payload_sources);
      /* Sources 0 and 1 remain the same.  Source 2 will be filled
       * after register allocation.
       */
      inst->src[2] = {};

      int idx = 3;
      for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) {
         for (unsigned i = 0; i < payload[p].phys_len; i++) {
            inst->src[idx++] = byte_offset(payload[p].src,
                                           i * reg_unit(devinfo) * REG_SIZE);
         }
      }
      assert(idx == inst->sources);

      inst->opcode = SHADER_OPCODE_SEND_GATHER;
      inst->mlen = 0;
      inst->ex_mlen = 0;

      progress = true;
   }

   if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
      fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
              count, _mesa_shader_stage_to_string(s.stage));
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_INSTRUCTION_DATA_FLOW);

   return progress;
}

/* If, after optimizations, the sources are *still* contiguous in a
 * SEND_GATHER, prefer to use the regular SEND, which saves having to write
 * the ARF scalar register.
 */
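/* Rough sketch, with made-up register numbers: a SEND_GATHER whose payload
 * sources turn out to be vgrf8+0, vgrf8+2 and vgrf8+4 (one contiguous span
 * on a reg_unit == 2 device) is converted back to a plain SEND with
 * src[2] == vgrf8 and mlen == 6, so no ARF scalar register write is needed.
 */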
bool
brw_opt_send_gather_to_send(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
         continue;

      assert(inst->sources > 2);
      assert(inst->src[2].file == BAD_FILE);

      const int num_payload_sources = inst->sources - 3;
      assert(num_payload_sources > 0);

      /* Limited by Src0.Length in the SEND instruction. */
      assert(num_payload_sources < 16);

      /* Determine whether the sources are still spread across either one or
       * two contiguous spans.  In those cases the regular SEND instruction
       * can be used and there's no need for SEND_GATHER (which would set the
       * ARF scalar register, adding an extra instruction).
       */
      const brw_reg *payload = &inst->src[3];
      brw_reg payload1 = payload[0];
      brw_reg payload2 = {};
      int payload1_len = 0;
      int payload2_len = 0;

      for (int i = 0; i < num_payload_sources; i++) {
         if (payload[i].file == VGRF &&
             payload[i].nr == payload1.nr &&
             payload[i].offset == payload1_len * REG_SIZE * unit)
            payload1_len++;
         else {
            payload2 = payload[i];
            break;
         }
      }

      if (payload2.file == VGRF) {
         for (int i = payload1_len; i < num_payload_sources; i++) {
            if (payload[i].file == VGRF &&
                payload[i].nr == payload2.nr &&
                payload[i].offset == payload2_len * REG_SIZE * unit)
               payload2_len++;
            else
               break;
         }
      } else {
         payload2 = brw_null_reg();
      }

      if (payload1_len + payload2_len != num_payload_sources)
         continue;

      /* Bspec 57058 (r64705) says
       *
       *    When a source data payload is used in dataport message, that payload
       *    must be specified as Source 1 portion of a Split Send message.
       *
       * But at this point the split point is not guaranteed to respect that.
       *
       * TODO: Pass LSC address length or infer it so valid splits can work.
       */
      if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
                           inst->sfid == GFX12_SFID_TGM ||
                           inst->sfid == GFX12_SFID_SLM ||
                           inst->sfid == BRW_SFID_URB)) {
         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
         if (lsc_op_num_data_values(lsc_op) > 0)
            continue;
      }

      inst->resize_sources(4);
      inst->opcode = SHADER_OPCODE_SEND;
      inst->src[2] = payload1;
      inst->src[3] = payload2;
      inst->mlen = payload1_len * unit;
      inst->ex_mlen = payload2_len * unit;

      progress = true;
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_INSTRUCTION_DATA_FLOW);
   }

   return progress;
}