/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "dev/intel_device_info.h"
#include "intel_nir.h"
#include "isl/isl.h"
#include "nir_builder.h"

static bool
rebase_const_offset_ubo_loads_instr(nir_builder *b,
                                    nir_instr *instr,
                                    void *cb_data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   if (intrin->intrinsic != nir_intrinsic_load_ubo_uniform_block_intel)
      return false;

   if (!nir_src_is_const(intrin->src[1]))
      return false;

   const unsigned type_bytes = intrin->def.bit_size / 8;
   const unsigned cacheline_bytes = 64;
   const unsigned block_components =
      MIN2(cacheline_bytes / type_bytes, NIR_MAX_VEC_COMPONENTS);

   const unsigned orig_offset = nir_src_as_uint(intrin->src[1]);
   const unsigned new_offset = ROUND_DOWN_TO(orig_offset, cacheline_bytes);

   const unsigned orig_def_components = intrin->def.num_components;
   const unsigned orig_read_components =
      nir_def_last_component_read(&intrin->def) + 1;
   const unsigned pad_components = (orig_offset - new_offset) / type_bytes;
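   /* Worked example (illustrative): a 32-bit vec2 load at constant offset 72
    * rebases to new_offset 64; block_components is MIN2(64 / 4, 16) = 16 (a
    * full cacheline of dwords) and pad_components is (72 - 64) / 4 = 2, so
    * the original data ends up in components 2..3 of the block load.
    */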

   /* Don't round down if we'd have to split a single load into two loads */
   if (orig_read_components + pad_components > block_components)
      return false;

   /* Always read a full block so we can CSE reads of different sizes.
    * The backend will skip reading unused trailing components anyway.
    */
   intrin->def.num_components = block_components;
   intrin->num_components = block_components;
   nir_intrinsic_set_range_base(intrin, new_offset);
   nir_intrinsic_set_range(intrin, block_components * type_bytes);
   nir_intrinsic_set_align_offset(intrin, 0);

   if (pad_components) {
      /* Change the base of the load to the new lower offset, and emit
       * moves to read from the now higher vector component locations.
       */
      b->cursor = nir_before_instr(instr);
      nir_src_rewrite(&intrin->src[1], nir_imm_int(b, new_offset));
   }

   b->cursor = nir_after_instr(instr);

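   /* Rebuild a vector of the original size: the components that were read
    * come from their new, shifted locations in the block load, and the
    * remaining components are filled with undef.
    */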
   nir_scalar components[NIR_MAX_VEC_COMPONENTS];
   nir_scalar undef = nir_get_scalar(nir_undef(b, 1, type_bytes * 8), 0);
   unsigned i = 0;
   for (; i < orig_read_components; i++)
      components[i] = nir_get_scalar(&intrin->def, pad_components + i);
   for (; i < orig_def_components; i++)
      components[i] = undef;

   nir_def *rebase = nir_vec_scalars(b, components, orig_def_components);
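   /* The rebased block load is uniform, so the reswizzled vector is too. */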
   rebase->divergent = false;

   nir_def_rewrite_uses_after(&intrin->def, rebase, rebase->parent_instr);

   return true;
}

/**
 * Shaders commonly contain small UBO loads with a constant offset scattered
 * throughout the program.  Ideally, we want to vectorize those into larger
 * block loads so we can load whole cachelines at a time, or at least fill
 * whole 32B registers rather than having empty space.
 *
 * nir_opt_load_store_vectorize() is terrific for combining small loads into
 * nice large block loads.  Unfortunately, it only vectorizes within a single
 * basic block, and there's a lot of opportunity for optimizing globally.
 *
 * In the past, our backend loaded whole 64B cachelines at a time (on pre-Xe2,
 * two registers) and rounded down constant UBO load offsets to the nearest
 * multiple of 64B.  This meant multiple loads within the same 64B would be
 * CSE'd into the same load, and we could even take advantage of global CSE.
 * However, we didn't have a method for shrinking loads from 64B back to 32B
 * again, and also didn't have a lot of flexibility in how this interacted
 * with the NIR load/store vectorization.
 *
 * This pass takes a similar approach, but in NIR.  The idea is to:
 *
 * 1. Run load/store vectorization to combine accesses within a basic block.
 *
 * 2. Find load_ubo_uniform_block_intel intrinsics with constant offsets.
 *    Round their base down to the nearest multiple of 64B, and also increase
 *    their returned vector to be a vec16 (64B for 32-bit values).  However,
 *    only do this if a single vec16 load would cover this additional "pad"
 *    space at the front, and all used components of the existing load.  That
 *    way, we don't blindly turn a single load into two loads.
 *
 *    If we made any progress, then...
 *
 * 3. Run global CSE.  This will coalesce any accesses to the same 64B
 *    region across subtrees of the CFG.
 *
 * 4. Run the load/store vectorizer again for UBOs.  This will clean up
 *    any overlapping memory access within a block.
 *
 * 5. Have the backend only issue loads for components of the vec16 which
 *    are actually read.  We could also shrink this in NIR, but doing it in
 *    the backend is pretty straightforward.
 *
 * We could probably do better with a fancier sliding-window style pass
 * that looks across blocks to produce optimal loads.  However, this
 * simple hack using existing passes does a fairly good job for now.
 * (An illustrative sketch of this ordering follows the function below.)
 */
bool
brw_nir_rebase_const_offset_ubo_loads(nir_shader *shader)
{
   return nir_shader_instructions_pass(shader,
                                       rebase_const_offset_ubo_loads_instr,
                                       nir_metadata_control_flow |
                                       nir_metadata_live_defs,
                                       NULL);
}
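
/* Illustrative sketch only, not compiled: one way the pass ordering described
 * above could be sequenced, assuming the caller supplies suitable
 * nir_load_store_vectorize_options for UBO access.  The helper name below is
 * hypothetical and not part of this file; the real driver sequences these
 * passes in its own compilation pipeline.
 */
#if 0
static void
example_rebase_pipeline(nir_shader *shader,
                        const nir_load_store_vectorize_options *opts)
{
   /* 1. Combine small loads within each basic block. */
   nir_opt_load_store_vectorize(shader, opts);

   /* 2. Rebase constant-offset block loads down to 64B cacheline starts. */
   if (brw_nir_rebase_const_offset_ubo_loads(shader)) {
      /* 3. Global CSE coalesces loads of the same 64B region across the CFG. */
      nir_opt_cse(shader);

      /* 4. Re-vectorize to clean up overlapping access within a block. */
      nir_opt_load_store_vectorize(shader, opts);
   }
}
#endif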

static bool
intel_nir_blockify_uniform_loads_instr(nir_builder *b,
                                       nir_instr *instr,
                                       void *cb_data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   const struct intel_device_info *devinfo = cb_data;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ssbo:
      /* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
       *
       *    "The surface base address must be OWord-aligned."
       *
       * We can't make that guarantee with SSBOs, where the alignment is
       * 4 bytes.
       */
      if (devinfo->ver < 9)
         return false;

      if (nir_src_is_divergent(&intrin->src[1]))
         return false;

      if (intrin->def.bit_size != 32)
         return false;

      /* Without the LSC, we can only do block loads of at least 4 dwords (1
       * oword).
       */
      if (!devinfo->has_lsc && intrin->def.num_components < 4)
         return false;

      intrin->intrinsic =
         intrin->intrinsic == nir_intrinsic_load_ubo ?
         nir_intrinsic_load_ubo_uniform_block_intel :
         nir_intrinsic_load_ssbo_uniform_block_intel;
      return true;

   case nir_intrinsic_load_shared:
      /* Block loads on shared memory are not supported before Icelake. */
      if (devinfo->ver < 11)
         return false;

      if (nir_src_is_divergent(&intrin->src[0]))
         return false;

      if (intrin->def.bit_size != 32)
         return false;

      /* Without the LSC, we have to use OWord Block Load messages (the ones
       * that require OWord-aligned offsets, too).
       */
      if (!devinfo->has_lsc &&
          (intrin->def.num_components < 4 ||
           nir_intrinsic_align(intrin) < 16))
         return false;

      intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
      return true;

   case nir_intrinsic_load_global_constant:
      if (nir_src_is_divergent(&intrin->src[0]))
         return false;

      if (intrin->def.bit_size != 32)
         return false;

      /* Without the LSC, we can only do block loads of at least 4 dwords (1
       * oword).
       */
      if (!devinfo->has_lsc && intrin->def.num_components < 4)
         return false;

      intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
      return true;

   default:
      return false;
   }
}

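/**
 * Convert uniform (non-divergent) UBO, SSBO, shared, and global-constant
 * loads into the corresponding *_uniform_block_intel intrinsics, so the
 * backend can use block load messages instead of per-channel scattered reads.
 */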
bool
intel_nir_blockify_uniform_loads(nir_shader *shader,
                                 const struct intel_device_info *devinfo)
{
   return nir_shader_instructions_pass(shader,
                                       intel_nir_blockify_uniform_loads_instr,
                                       nir_metadata_control_flow |
                                       nir_metadata_live_defs,
                                       (void *) devinfo);
}