/*
 * Copyright © 2024 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

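/* Compute how many destination components a txf actually writes: its total
 * response size (size_written) divided by the register footprint of a
 * single component at this execution size.  Returns 0 for a null
 * instruction.
 */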
static unsigned
dest_comps_for_txf(const fs_visitor &s, const fs_inst *txf)
{
   if (!txf)
      return 0;

   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
   const unsigned per_component_regs =
      DIV_ROUND_UP(brw_type_size_bytes(txf->dst.type) *
                   txf->exec_size, grf_size);
   const unsigned dest_regs = txf->size_written / grf_size;
   const unsigned dest_comps = dest_regs / per_component_regs;
   return dest_comps;
}

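/* Immediates and unset sources trivially count as defs; anything else must
 * have a single definition tracked by the def analysis.
 */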
static bool
is_def(const def_analysis &defs, const brw_reg &r)
{
   return r.file == IMM || r.file == BAD_FILE || defs.get(r) != NULL;
}

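/* Check for a def whose region is also uniform, so that every channel
 * reads the same value.
 */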
static bool
is_uniform_def(const def_analysis &defs, const brw_reg &r)
{
   return is_def(defs, r) && is_uniform(r);
}

/**
 * Check if two texture instructions have a matching source (either the same
 * immediate value, or both references to the same immutable SSA def and
 * with matching source modifiers and regions).
 */
static bool
sources_match(ASSERTED const def_analysis &defs,
              const fs_inst *a, const fs_inst *b, enum tex_logical_srcs src)
{
   assert(is_def(defs, a->src[src]));
   assert(is_def(defs, b->src[src]));
   return brw_regs_equal(&a->src[src], &b->src[src]);
}

/**
 * Look for a series of convergent texture buffer fetches within a basic
 * block and combine them into a single divergent load with one lane for
 * each original fetch.  For example, this series of convergent fetches:
 *
 *   txf(16) %12:UD, coord = 12d, lod = 0u, handle = %1<0>:D
 *   txf(16) %13:UD, coord = 13d, lod = 0u, handle = %1<0>:D
 *   txf(16) %14:UD, coord = 14d, lod = 0u, handle = %1<0>:D
 *   txf(16) %15:UD, coord = 15d, lod = 0u, handle = %1<0>:D
 *   txf(16) %16:UD, coord = 16d, lod = 0u, handle = %1<0>:D
 *   txf(16) %17:UD, coord = 17d, lod = 0u, handle = %1<0>:D
 *   txf(16) %18:UD, coord = 18d, lod = 0u, handle = %1<0>:D
 *   txf(16) %19:UD, coord = 19d, lod = 0u, handle = %1<0>:D
 *
 * can be combined into a single divergent load and scalar-expansion moves
 * (which can easily be copy propagated away):
 *
 *   load_payload(1) %2:D 12d, 13d, 14d, 15d, 16d, 17d, 18d, 19d
 *   txf(8) %3:UD, coord = %2, lod = 0u, handle = %1<0>:D
 *   mov(16) %12:UD, %3+0.0<0>:UD
 *   ...
 *   mov(16) %19:UD, %3+0.28<0>:UD
 *
 * Our sampler hardware doesn't have any special support for convergent
 * loads (like LSC transpose/block loads), and always performs SIMD8/16/32
 * per-channel loads.  But with this trick, we can still combine multiple
 * convergent loads into a single message with fewer round-trips, and much
 * lower register pressure.
 */
bool
brw_opt_combine_convergent_txf(fs_visitor &s)
{
   const def_analysis &defs = s.def_analysis.require();

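   /* Lane limits for each combined load: sampler messages operate at a
    * full SIMD8..SIMD16 width (scaled by the register unit), so smaller
    * batches get padded and larger ones split.
    */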
   const unsigned min_simd = 8 * reg_unit(s.devinfo);
   const unsigned max_simd = 16 * reg_unit(s.devinfo);
   const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);

   bool progress = false;

   foreach_block(block, s.cfg) {
      /* Gather a list of convergent TXFs to the same surface in this block */
      fs_inst *txfs[32] = {};
      unsigned count = 0;

      foreach_inst_in_block(fs_inst, inst, block) {
         if (inst->opcode != SHADER_OPCODE_TXF_LOGICAL)
            continue;

         /* Only handle buffers or single miplevel 1D images for now */
         if (inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud > 1)
            continue;

         if (inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0)
            continue;

         if (inst->predicate || inst->force_writemask_all)
            continue;

         if (!is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_LOD]) ||
             !is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE]) ||
             !is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE]))
            continue;

         /* Only handle immediates for now: we could check is_uniform(),
          * but we'd need to ensure the coordinate's definition reaches
          * txfs[0], which is where we'll insert the combined coordinate.
          */
         if (inst->src[TEX_LOGICAL_SRC_COORDINATE].file != IMM)
            continue;

         /* texelFetch from 1D buffers shouldn't have any of these */
         assert(inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_LOD2].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_MIN_LOD].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_MCS].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_TG4_OFFSET].file == BAD_FILE);
         assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
                inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud == 0);

         if (count > 0 &&
             (!sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_LOD) ||
              !sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_SURFACE) ||
              !sources_match(defs, inst, txfs[0],
                             TEX_LOGICAL_SRC_SURFACE_HANDLE)))
            continue;

         txfs[count++] = inst;

         if (count == ARRAY_SIZE(txfs))
            break;
      }

      /* Need at least two things to combine. */
      if (count < 2)
         continue;

      /* Emit divergent TXFs and replace the original ones with MOVs */
      for (unsigned curr = 0; curr < count; curr += max_simd) {
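         /* Round up to the smallest power-of-two SIMD width that covers the
          * remaining fetches; any padding lanes repeat the last coordinate.
          */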
         const unsigned lanes = CLAMP(count - curr, min_simd, max_simd);
         const unsigned width = util_next_power_of_two(lanes);
         const fs_builder ubld =
            fs_builder(&s).at(block, txfs[curr]).exec_all().group(width, 0);
         const fs_builder ubld1 = ubld.group(1, 0);

         enum brw_reg_type coord_type =
            txfs[curr]->src[TEX_LOGICAL_SRC_COORDINATE].type;
         brw_reg coord = ubld.vgrf(coord_type);
         brw_reg coord_comps[32];

         for (unsigned i = 0; i < width; i++) {
            /* Our block size might be larger than the number of convergent
             * loads we're combining.  If so, repeat the last component.
             */
            if (txfs[curr+i])
               coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_COORDINATE];
            else
               coord_comps[i] = coord_comps[i-1];
         }
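         /* Pack the per-lane coordinates into a single payload vector. */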
         ubld1.VEC(coord, coord_comps, width);

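         /* Build the sources for the combined divergent fetch, reusing the
          * uniform LOD, surface, and sampler sources from the first txf.
          */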
         brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
         srcs[TEX_LOGICAL_SRC_COORDINATE] = coord;
         srcs[TEX_LOGICAL_SRC_LOD] = txfs[0]->src[TEX_LOGICAL_SRC_LOD];
         srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE];
         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] =
            txfs[0]->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
         srcs[TEX_LOGICAL_SRC_SAMPLER] = txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER];
         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] =
            txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
         srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(1);
         srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);
         srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0);

         /* Each of our txfs may have a reduced response length if some
          * components are never read.  Use the maximum of the sizes.
          */
         unsigned new_dest_comps = 0;
         for (unsigned i = 0; i < width; i++) {
            const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]);
            new_dest_comps = MAX2(new_dest_comps, this_comps);
         }

         /* Emit the new divergent TXF */
         brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps);
         fs_inst *div_txf =
            ubld.emit(SHADER_OPCODE_TXF_LOGICAL, div, srcs,
                      TEX_LOGICAL_NUM_SRCS);

         /* Update it to also use response length reduction */
         const unsigned per_component_regs =
            DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size,
                         grf_size);
         div_txf->size_written = new_dest_comps * per_component_regs * grf_size;

         for (unsigned i = 0; i < width; i++) {
            fs_inst *txf = txfs[curr+i];
            if (!txf)
               break;

            const fs_builder ibld = fs_builder(&s, block, txf);

            /* Replace each of the original TXFs with MOVs from our new one */
            const unsigned dest_comps = dest_comps_for_txf(s, txf);
            assert(dest_comps <= 4);

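            /* Splat lane i of each component of the divergent result across
             * the original convergent execution size.
             */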
            brw_reg v[4];
            for (unsigned c = 0; c < dest_comps; c++)
               v[c] = component(offset(div, ubld, c), i);
            ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps);

            txf->remove(block);
         }

         progress = true;
      }
   }

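   /* We added and removed instructions, so any analyses that depend on the
    * instruction list need to be recomputed.
    */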
   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}