/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "dev/intel_device_info.h"
#include "intel_nir.h"
#include "isl/isl.h"
#include "nir_builder.h"

static bool
rebase_const_offset_ubo_loads_instr(nir_builder *b,
                                    nir_instr *instr,
                                    void *cb_data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   if (intrin->intrinsic != nir_intrinsic_load_ubo_uniform_block_intel)
      return false;

   if (!nir_src_is_const(intrin->src[1]))
      return false;

   const unsigned type_bytes = intrin->def.bit_size / 8;
   const unsigned cacheline_bytes = 64;
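   /* e.g. for 32-bit values this is 64 / 4 = 16 components (a full vec16);
    * for 64-bit values it is 8 components.
    */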
   const unsigned block_components =
      MIN2(cacheline_bytes / type_bytes, NIR_MAX_VEC_COMPONENTS);

   const unsigned orig_offset = nir_src_as_uint(intrin->src[1]);
   const unsigned new_offset = ROUND_DOWN_TO(orig_offset, cacheline_bytes);

   const unsigned orig_def_components = intrin->def.num_components;
   const unsigned orig_read_components =
      nir_def_last_component_read(&intrin->def) + 1;
   const unsigned pad_components = (orig_offset - new_offset) / type_bytes;
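   /* For example, a 32-bit load at offset 0x48 is rebased to offset 0x40,
    * giving pad_components = 2: its data now starts at component 2 of the
    * wider block load.
    */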

   /* Don't round down if we'd have to split a single load into two loads */
   if (orig_read_components + pad_components > block_components)
      return false;

   /* Always read a full block so we can CSE reads of different sizes.
    * The backend will skip reading unused trailing components anyway.
    */
   intrin->def.num_components = block_components;
   intrin->num_components = block_components;
   nir_intrinsic_set_range_base(intrin, new_offset);
   nir_intrinsic_set_range(intrin, block_components * type_bytes);
   nir_intrinsic_set_align_offset(intrin, 0);

   if (pad_components) {
      /* Change the base of the load to the new lower offset, and emit
       * moves to read from the now higher vector component locations.
       */
      b->cursor = nir_before_instr(instr);
      nir_src_rewrite(&intrin->src[1], nir_imm_int(b, new_offset));
   }

   b->cursor = nir_after_instr(instr);

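   /* Rebuild a vector of the original width: the components that were
    * actually read now live pad_components higher in the block load, and
    * any trailing components that were never read are filled with undef.
    */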
   nir_scalar components[NIR_MAX_VEC_COMPONENTS];
   nir_scalar undef = nir_get_scalar(nir_undef(b, 1, type_bytes * 8), 0);
   unsigned i = 0;
   for (; i < orig_read_components; i++)
      components[i] = nir_get_scalar(&intrin->def, pad_components + i);
   for (; i < orig_def_components; i++)
      components[i] = undef;

   nir_def *rebase = nir_vec_scalars(b, components, orig_def_components);
   rebase->divergent = false;

   nir_def_rewrite_uses_after(&intrin->def, rebase, rebase->parent_instr);

   return true;
}

/**
 * Shaders commonly contain small UBO loads with a constant offset scattered
 * throughout the program. Ideally, we want to vectorize those into larger
 * block loads so we can load whole cachelines at a time, or at least fill
 * whole 32B registers rather than leaving empty space.
 *
 * nir_opt_load_store_vectorize() is terrific for combining small loads into
 * nice large block loads. Unfortunately, it only vectorizes within a single
 * basic block, and there's a lot of opportunity for optimizing globally.
 *
 * In the past, our backend loaded whole 64B cachelines at a time (on pre-Xe2,
 * two registers) and rounded down constant UBO load offsets to the nearest
 * multiple of 64B. This meant multiple loads within the same 64B would be
 * CSE'd into the same load, and we could even take advantage of global CSE.
 * However, we didn't have a method for shrinking loads from 64B back to 32B
 * again, and also didn't have a lot of flexibility in how this interacted
 * with the NIR load/store vectorization.
 *
 * This pass takes a similar approach, but in NIR. The idea is to:
 *
 * 1. Run load/store vectorization to combine accesses within a basic block.
 *
 * 2. Find load_ubo_uniform_block_intel intrinsics with constant offsets.
 *    Round their base down to the nearest multiple of 64B, and also widen
 *    their returned vector to a vec16 (64B for 32-bit values). However,
 *    only do this if a single vec16 load would cover both this additional
 *    "pad" space at the front and all used components of the existing load.
 *    That way, we don't blindly turn a single load into two loads.
 *
 * If we made any progress, then...
 *
 * 3. Run global CSE. This will coalesce any accesses to the same 64B
 *    region across subtrees of the CFG.
 *
 * 4. Run the load/store vectorizer again for UBOs. This will clean up
 *    any overlapping memory accesses within a block.
 *
 * 5. Have the backend only issue loads for the components of the vec16
 *    which are actually read. We could also shrink this in NIR, but doing
 *    it in the backend is pretty straightforward.
 *
 * We could probably do better with a fancier sliding-window style pass
 * which looked across blocks to produce optimal loads. However, this
 * simple hack using existing passes does a fairly good job for now.
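 *
 * A rough sketch of how a driver might sequence these passes (the
 * vectorizer options and the mem_vectorize_cb callback are illustrative
 * assumptions here, not something this file defines):
 *
 *    nir_load_store_vectorize_options opts = {
 *       .modes = nir_var_mem_ubo,
 *       .callback = mem_vectorize_cb,  // hypothetical driver callback
 *    };
 *    bool progress = false;
 *    NIR_PASS(progress, nir, nir_opt_load_store_vectorize, &opts);    // step 1
 *    if (brw_nir_rebase_const_offset_ubo_loads(nir)) {                // step 2
 *       NIR_PASS(progress, nir, nir_opt_cse);                         // step 3
 *       NIR_PASS(progress, nir, nir_opt_load_store_vectorize, &opts); // step 4
 *    }
 *    // step 5 happens when the backend emits the block loads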
 */
bool
brw_nir_rebase_const_offset_ubo_loads(nir_shader *shader)
{
   return nir_shader_instructions_pass(shader,
                                       rebase_const_offset_ubo_loads_instr,
                                       nir_metadata_control_flow |
                                       nir_metadata_live_defs,
                                       NULL);
}

static bool
intel_nir_blockify_uniform_loads_instr(nir_builder *b,
                                       nir_instr *instr,
                                       void *cb_data)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   const struct intel_device_info *devinfo = cb_data;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_ssbo:
      /* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
       *
       *    "The surface base address must be OWord-aligned."
       *
       * We can't make that guarantee with SSBOs, where the alignment is
       * only 4 bytes.
       */
      if (devinfo->ver < 9)
         return false;

      if (nir_src_is_divergent(&intrin->src[1]))
         return false;

      if (intrin->def.bit_size != 32)
         return false;

      /* Without the LSC, we can only do block loads of at least 4 dwords
       * (1 oword).
       */
      if (!devinfo->has_lsc && intrin->def.num_components < 4)
         return false;

      intrin->intrinsic =
         intrin->intrinsic == nir_intrinsic_load_ubo ?
         nir_intrinsic_load_ubo_uniform_block_intel :
         nir_intrinsic_load_ssbo_uniform_block_intel;
      return true;

   case nir_intrinsic_load_shared:
      /* Block loads on shared memory are not supported before Icelake. */
      if (devinfo->ver < 11)
         return false;

      if (nir_src_is_divergent(&intrin->src[0]))
         return false;

      if (intrin->def.bit_size != 32)
         return false;

      /* Without the LSC, we have to use OWord Block Load messages (the
       * ones that also require OWord-aligned offsets).
       */
      if (!devinfo->has_lsc &&
          (intrin->def.num_components < 4 ||
           nir_intrinsic_align(intrin) < 16))
         return false;

      intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
      return true;

   case nir_intrinsic_load_global_constant:
      if (nir_src_is_divergent(&intrin->src[0]))
         return false;

      if (intrin->def.bit_size != 32)
         return false;

      /* Without the LSC, we can only do block loads of at least 4 dwords
       * (1 oword).
       */
      if (!devinfo->has_lsc && intrin->def.num_components < 4)
         return false;

      intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
      return true;

   default:
      return false;
   }
}

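/**
 * Rewrite loads whose offset is dynamically uniform (non-divergent) into
 * the Intel *_uniform_block_intel intrinsics, which the backend can turn
 * into a single block load shared by the whole subgroup rather than a
 * per-channel gather.
 */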
bool
intel_nir_blockify_uniform_loads(nir_shader *shader,
                                 const struct intel_device_info *devinfo)
{
   return nir_shader_instructions_pass(shader,
                                       intel_nir_blockify_uniform_loads_instr,
                                       nir_metadata_control_flow |
                                       nir_metadata_live_defs,
                                       (void *) devinfo);
}