/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_builder.h"

#include "dev/intel_debug.h"

using namespace brw;

void
brw_optimize(fs_visitor &s)
{
   const nir_shader *nir = s.nir;

   s.debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   brw_validate(s);

   /* Track how much of the shader is still non-SSA at this point. */
   {
      const brw::def_analysis &defs = s.def_analysis.require();
      s.shader_stats.non_ssa_registers_after_nir =
         defs.count() - defs.ssa_count();
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

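   /* Run a single optimization or lowering pass.  OPT() bumps the pass
    * counter, dumps the IR for the optimizer debug output when the pass
    * made progress, and re-validates the shader afterwards.  It evaluates
    * to whether this particular pass made progress, so it can be used
    * directly in conditions, while `progress` accumulates across passes.
    */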
#define OPT(pass, ...) ({                                               \
      pass_num++;                                                       \
      bool this_progress = pass(s, ##__VA_ARGS__);                      \
                                                                        \
      if (this_progress)                                                \
         s.debug_optimizer(nir, #pass, iteration, pass_num);            \
                                                                        \
      brw_validate(s);                                                  \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })

   if (s.compiler->lower_dpas)
      OPT(brw_lower_dpas);

   OPT(brw_opt_split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice: once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(brw_opt_dead_code_eliminate);

   OPT(brw_opt_remove_extra_rounding_modes);

   OPT(brw_opt_eliminate_find_live_channel);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(brw_opt_algebraic);
      OPT(brw_opt_cse_defs);
      if (!OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_copy_propagation);
      OPT(brw_opt_cmod_propagation);
      OPT(brw_opt_dead_code_eliminate);
      OPT(brw_opt_saturate_propagation);
      OPT(brw_opt_register_coalesce);

      OPT(brw_opt_compact_virtual_grfs);
   } while (progress);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_OPT_LOOP);

   progress = false;
   pass_num = 0;

   if (OPT(brw_opt_combine_convergent_txf))
      OPT(brw_opt_copy_propagation_defs);

   if (OPT(brw_lower_pack)) {
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_subgroup_ops);
   OPT(brw_lower_csel);
   OPT(brw_lower_simd_width);
   OPT(brw_lower_scalar_fp64_MAD);
   OPT(brw_lower_barycentrics);
   OPT(brw_lower_logical_sends);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);

   /* After logical SEND lowering. */

   if (!OPT(brw_opt_copy_propagation_defs))
      OPT(brw_opt_copy_propagation);

   /* Identify trailing zeros in the LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (OPT(brw_opt_zero_samples)) {
      if (!OPT(brw_opt_copy_propagation_defs)) {
         OPT(brw_opt_copy_propagation);
      }
   }

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_to_send_gather);

   OPT(brw_opt_split_sends);
   OPT(brw_workaround_nomask_control_flow);

   if (progress) {
      /* Do both forms of copy propagation because it is important to
       * eliminate as many cases of load_payload-of-load_payload as possible.
       */
      OPT(brw_opt_copy_propagation_defs);
      OPT(brw_opt_copy_propagation);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(brw_opt_cse_defs);
      OPT(brw_opt_register_coalesce);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_opt_remove_redundant_halts);

   if (OPT(brw_lower_load_payload)) {
      OPT(brw_opt_split_virtual_grfs);

      OPT(brw_opt_register_coalesce);
      OPT(brw_lower_simd_width);
      OPT(brw_opt_dead_code_eliminate);
   }

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_MIDDLE_LOWERING);

   OPT(brw_lower_alu_restrictions);

   OPT(brw_opt_combine_constants);
   if (OPT(brw_lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(brw_lower_integer_multiplication);
   }
   OPT(brw_lower_sub_sat);

   progress = false;
   OPT(brw_lower_derivatives);
   OPT(brw_lower_regioning);

   /* Try both copy propagation passes.  The defs one will likely not be
    * able to handle everything at this point.
    */
   const bool cp1 = OPT(brw_opt_copy_propagation_defs);
   const bool cp2 = OPT(brw_opt_copy_propagation);
   if (cp1 || cp2)
      OPT(brw_opt_combine_constants);

   OPT(brw_opt_dead_code_eliminate);
   OPT(brw_opt_register_coalesce);

   if (progress)
      OPT(brw_lower_simd_width);

   if (s.devinfo->ver >= 30)
      OPT(brw_opt_send_gather_to_send);

   OPT(brw_lower_uniform_pull_constant_loads);

   if (OPT(brw_lower_send_descriptors)) {
      /* No need for standard copy_propagation since
       * brw_opt_address_reg_load will only optimize defs.
       */
      if (OPT(brw_opt_copy_propagation_defs))
         OPT(brw_opt_algebraic);
      OPT(brw_opt_address_reg_load);
      OPT(brw_opt_dead_code_eliminate);
   }

   OPT(brw_lower_sends_overlapping_payload);

   OPT(brw_lower_indirect_mov);

   OPT(brw_lower_find_live_channel);

   OPT(brw_lower_load_subgroup_invocation);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_LATE_LOWERING);
}

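/* Returns how many leading LOAD_PAYLOAD sources (header sources included)
 * make up exactly size_read bytes of the payload.  The header contributes
 * header_size full registers; each remaining source contributes
 * exec_size * type_size bytes.
 */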
static unsigned
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
{
   assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
   assert(size_read >= lp->header_size * REG_SIZE);

   unsigned i;
   unsigned size = lp->header_size * REG_SIZE;
   for (i = lp->header_size; size < size_read && i < lp->sources; i++)
      size += lp->exec_size * brw_type_size_bytes(lp->src[i].type);

   /* The size read must match the cumulative size of a leading subset of
    * the sources exactly.
    */
   assert(size == size_read);
   return i;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving registers for them. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instructions that would otherwise be emitted
 * to set up the zero values.
 */
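/* A sketch of the transformation (register numbers made up; assumes SIMD8,
 * one register per parameter, no header):
 *
 *    load_payload vgrf10, vgrf4, vgrf5, 0f, 0f
 *    send ..., vgrf10, mlen(4)
 *
 * becomes
 *
 *    load_payload vgrf10, vgrf4, vgrf5, 0f, 0f
 *    send ..., vgrf10, mlen(2)
 *
 * after which DCE can drop the MOVs that fed the trailing zeros.
 */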

bool
brw_opt_zero_samples(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload is actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
      }

      /* Round down to consider only full registers. */
      const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
      if (zero_len > 0) {
         /* Note mlen is in REG_SIZE units. */
         send->mlen -= zero_len;
         progress = true;
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}


/**
 * Opportunistically split SEND message payloads.
 *
 * Gfx9+ supports "split" SEND messages, which take two payloads that are
 * implicitly concatenated.  If we find a SEND message with a single payload,
 * we can split that payload in two.  This results in smaller contiguous
 * register blocks for us to allocate.  But it can help beyond that, too.
 *
 * We try to split a LOAD_PAYLOAD between sources which change registers.
 * For example, a sampler message often contains an x/y/z coordinate that may
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
 * or array index, which comes from elsewhere.  In this case, the first few
 * sources will be different offsets of the same VGRF, then a later source
 * will be a different VGRF.  So we split there, possibly eliminating the
 * payload concatenation altogether.
 */
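/* A sketch of the split (register numbers made up):
 *
 *    load_payload vgrf20, vgrf8+0, vgrf8+1, vgrf3
 *    send ..., vgrf20, mlen(3), ex_mlen(0)
 *
 * becomes
 *
 *    load_payload vgrf21, vgrf8+0, vgrf8+1
 *    load_payload vgrf22, vgrf3
 *    send ..., vgrf21, vgrf22, mlen(2), ex_mlen(1)
 *
 * and copy propagation / register coalescing can often then eliminate the
 * new LOAD_PAYLOADs entirely.
 */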
bool
brw_opt_split_sends(fs_visitor &s)
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, s.cfg) {
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->mlen <= reg_unit(s.devinfo) || send->ex_mlen > 0 ||
          send->src[2].file != VGRF)
         continue;

      /* Currently don't split sends that reuse a previously used payload. */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
         continue;

      /* Split either after the header (if present), or when consecutive
       * sources switch from one VGRF to a different one.
       */
      unsigned mid = lp->header_size;
      if (mid == 0) {
         for (mid = 1; mid < lp->sources; mid++) {
            if (lp->src[mid].file == BAD_FILE)
               continue;

            if (lp->src[0].file != lp->src[mid].file ||
                lp->src[0].nr != lp->src[mid].nr)
               break;
         }
      }

      /* The SEND's mlen might be smaller than what the LOAD_PAYLOAD
       * provides, so find out how many sources from the payload it really
       * needs.
       */
      const unsigned end =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* Nothing to split. */
      if (end <= mid)
         continue;

      const brw_builder ibld(&s, block, lp);
      fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
      fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);

      assert(lp1->size_written % REG_SIZE == 0);
      assert(lp2->size_written % REG_SIZE == 0);
      assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);

      lp1->dst = brw_vgrf(s.alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
      lp2->dst = brw_vgrf(s.alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);

      send->resize_sources(4);
      send->src[2] = lp1->dst;
      send->src[3] = lp2->dst;
      send->ex_mlen = lp2->size_written / REG_SIZE;
      send->mlen -= send->ex_mlen;

      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}


/**
 * Remove redundant or useless halts.
 *
 * For example, we can eliminate halts in the following sequence:
 *
 * halt        (redundant with the next halt)
 * halt        (useless; jumps to the next instruction)
 * halt-target
 */
bool
brw_opt_remove_redundant_halts(fs_visitor &s)
{
   bool progress = false;

   unsigned halt_count = 0;
   fs_inst *halt_target = NULL;
   bblock_t *halt_target_block = NULL;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      if (inst->opcode == BRW_OPCODE_HALT)
         halt_count++;

      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
         halt_target = inst;
         halt_target_block = block;
         break;
      }
   }

   if (!halt_target) {
      assert(halt_count == 0);
      return false;
   }

   /* Delete any HALTs immediately before the halt target. */
   for (fs_inst *prev = (fs_inst *) halt_target->prev;
        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
        prev = (fs_inst *) halt_target->prev) {
      prev->remove(halt_target_block);
      halt_count--;
      progress = true;
   }

   if (halt_count == 0) {
      halt_target->remove(halt_target_block);
      progress = true;
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}


/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
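/* A sketch of the rewrite outside control flow (register numbers made up):
 *
 *    find_live_channel vgrf7:UD          ->  mov vgrf7:UD, 0u
 *    broadcast vgrf9:UD, vgrf2, vgrf7    ->  mov vgrf9:UD, component(vgrf2, 0)
 *
 * With packed dispatch, channel 0 is known to be live, so the first live
 * channel is always 0 and a BROADCAST of channel 0 degenerates to a MOV of
 * component 0.
 */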
bool
brw_opt_eliminate_find_live_channel(fs_visitor &s)
{
   bool progress = false;
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons,
                                      s.prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case BRW_OPCODE_HALT:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         goto out;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->force_writemask_all = true;

            /* FIND_LIVE_CHANNEL emitted by emit_uniformize will have
             * size_written set by hand to a smaller value. In this case,
             * munge the exec_size to match.
             */
            if (inst->size_written == inst->dst.component_size(8 * reg_unit(s.devinfo)))
               inst->exec_size = 8 * reg_unit(s.devinfo);

            inst->resize_sources(1);
            progress = true;

            /* emit_uniformize() frequently emits FIND_LIVE_CHANNEL paired
             * with a BROADCAST.  Save some work for opt_copy_propagation
             * and opt_algebraic by trivially cleaning up both together.
             */
            assert(!inst->next->is_tail_sentinel());
            fs_inst *bcast = (fs_inst *) inst->next;

            /* Ignore stride when comparing */
            if (bcast->opcode == SHADER_OPCODE_BROADCAST &&
                inst->dst.file == VGRF &&
                inst->dst.file == bcast->src[1].file &&
                inst->dst.nr == bcast->src[1].nr &&
                inst->dst.offset == bcast->src[1].offset) {
               bcast->opcode = BRW_OPCODE_MOV;
               if (!is_uniform(bcast->src[0]))
                  bcast->src[0] = component(bcast->src[0], 0);

               bcast->force_writemask_all = true;
               bcast->exec_size = 8 * reg_unit(s.devinfo);
               assert(bcast->size_written == bcast->dst.component_size(bcast->exec_size));
               bcast->resize_sources(1);
            }
         }
         break;

      default:
         break;
      }
   }

out:
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}


/**
 * Rounding modes are encoded per conversion instruction, but on the
 * hardware the rounding mode is a piece of state.  Once it has been set,
 * subsequent conversions using the same mode don't need to set it again.
 *
 * This is useful for vector/matrix conversions, as setting the mode once
 * is enough for the full vector/matrix.
 */
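/* A sketch of the cleanup (illustrative): within a block,
 *
 *    rnd_mode RTNE
 *    mov f16dst1, f32src1
 *    rnd_mode RTNE        <- removed, the mode is already RTNE
 *    mov f16dst2, f32src2
 *
 * Only RND_MODE instructions that actually change the current mode are
 * kept; the tracked mode resets to the base mode at each block boundary.
 */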
bool
brw_opt_remove_extra_rounding_modes(fs_visitor &s)
{
   bool progress = false;
   unsigned execution_mode = s.nir->info.float_controls_execution_mode;

   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTNE;
   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
       execution_mode)
      base_mode = BRW_RND_MODE_RTZ;

   foreach_block (block, s.cfg) {
      brw_rnd_mode prev_mode = base_mode;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == IMM);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}

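/* Convert SENDs with large payloads into SEND_GATHER, which takes its
 * payload as a list of individual (reg_unit-sized) registers instead of
 * one or two contiguous blocks, so large payloads no longer need to be
 * allocated contiguously.  Sources 0 and 1 keep their usual meaning;
 * source 2 is reserved to be filled after register allocation, and the
 * payload registers follow from source 3 onwards.
 */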
bool
brw_opt_send_to_send_gather(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   unsigned count = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND)
         continue;

      /* For 1-2 registers, send-gather offers no benefits over split-send. */
      if (inst->mlen + inst->ex_mlen <= 2 * unit)
         continue;

      assert(inst->mlen % unit == 0);
      assert(inst->ex_mlen % unit == 0);

      struct {
         brw_reg src;
         unsigned phys_len;
      } payload[2] = {
         { inst->src[2], inst->mlen / unit },
         { inst->src[3], inst->ex_mlen / unit },
      };

      const unsigned num_payload_sources = payload[0].phys_len + payload[1].phys_len;

      /* Limited by Src0.Length in the SEND instruction. */
      if (num_payload_sources > 15)
         continue;

      if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
         count++;
         continue;
      }

      inst->resize_sources(3 + num_payload_sources);
      /* Sources 0 and 1 remain the same.  Source 2 will be filled
       * after register allocation.
       */
      inst->src[2] = {};

      int idx = 3;
      for (unsigned p = 0; p < ARRAY_SIZE(payload); p++) {
         for (unsigned i = 0; i < payload[p].phys_len; i++) {
            inst->src[idx++] = byte_offset(payload[p].src,
                                           i * reg_unit(devinfo) * REG_SIZE);
         }
      }
      assert(idx == inst->sources);

      inst->opcode = SHADER_OPCODE_SEND_GATHER;
      inst->mlen = 0;
      inst->ex_mlen = 0;

      progress = true;
   }

   if (INTEL_DEBUG(DEBUG_NO_SEND_GATHER)) {
      fprintf(stderr, "Ignored %u opportunities to try SEND_GATHER in %s shader.\n",
              count, _mesa_shader_stage_to_string(s.stage));
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_INSTRUCTION_DATA_FLOW);

   return progress;
}


/* If, after optimizations, the sources of a SEND_GATHER are *still*
 * contiguous, prefer the regular SEND, which saves having to write the
 * ARF scalar register.
 */
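/* A sketch of the reverse conversion (register numbers made up, assuming
 * reg_unit == 2 so each payload source covers two physical registers): a
 *
 *    send_gather ..., vgrf12+0, vgrf12+2, vgrf12+4, vgrf7+0
 *
 * whose payload sources form two contiguous spans turns back into
 *
 *    send ..., vgrf12, vgrf7, mlen(6), ex_mlen(2)
 */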
bool
brw_opt_send_gather_to_send(fs_visitor &s)
{
   const intel_device_info *devinfo = s.devinfo;
   bool progress = false;

   assert(devinfo->ver >= 30);

   const unsigned unit = reg_unit(devinfo);
   assert(unit == 2);

   foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
      if (inst->opcode != SHADER_OPCODE_SEND_GATHER)
         continue;

      assert(inst->sources > 2);
      assert(inst->src[2].file == BAD_FILE);

      const int num_payload_sources = inst->sources - 3;
      assert(num_payload_sources > 0);

      /* Limited by Src0.Length in the SEND instruction. */
      assert(num_payload_sources < 16);

      /* Determine whether the sources still form one or two contiguous
       * spans.  In those cases the regular SEND instruction can be used,
       * and there's no need for SEND_GATHER (which would require setting
       * the ARF scalar register, adding an extra instruction).
       */
      const brw_reg *payload = &inst->src[3];
      brw_reg payload1       = payload[0];
      brw_reg payload2       = {};
      int payload1_len       = 0;
      int payload2_len       = 0;

      for (int i = 0; i < num_payload_sources; i++) {
         if (payload[i].file == VGRF &&
             payload[i].nr == payload1.nr &&
             payload[i].offset == payload1_len * REG_SIZE * unit)
            payload1_len++;
         else {
            payload2 = payload[i];
            break;
         }
      }

      if (payload2.file == VGRF) {
         for (int i = payload1_len; i < num_payload_sources; i++) {
            if (payload[i].file == VGRF &&
                payload[i].nr == payload2.nr &&
                payload[i].offset == payload2_len * REG_SIZE * unit)
               payload2_len++;
            else
               break;
         }
      } else {
         payload2 = brw_null_reg();
      }

      if (payload1_len + payload2_len != num_payload_sources)
         continue;

      /* Bspec 57058 (r64705) says:
       *
       *    When a source data payload is used in dataport message, that payload
       *    must be specified as Source 1 portion of a Split Send message.
       *
       * But at this point the split point is not guaranteed to respect that.
       *
       * TODO: Pass LSC address length or infer it so valid splits can work.
       */
      if (payload2_len && (inst->sfid == GFX12_SFID_UGM ||
                           inst->sfid == GFX12_SFID_TGM ||
                           inst->sfid == GFX12_SFID_SLM ||
                           inst->sfid == BRW_SFID_URB)) {
         enum lsc_opcode lsc_op = lsc_msg_desc_opcode(devinfo, inst->desc);
         if (lsc_op_num_data_values(lsc_op) > 0)
            continue;
      }

      inst->resize_sources(4);
      inst->opcode  = SHADER_OPCODE_SEND;
      inst->src[2]  = payload1;
      inst->src[3]  = payload2;
      inst->mlen    = payload1_len * unit;
      inst->ex_mlen = payload2_len * unit;

      progress = true;
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
                            DEPENDENCY_INSTRUCTION_DATA_FLOW);
   }

   return progress;
}