/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

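/* Helpers for deciding which SIMD dispatch widths (SIMD8/16/32) are worth
 * compiling, and which compiled variant to dispatch, for compute-like stages
 * (compute, task, mesh and bindless ray-tracing shaders).
 */
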
#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"

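/* Returns the dispatch width that the shader is required to use, based on
 * the subgroup size it declares, or 0 if any width may be chosen.
 */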
unsigned
brw_required_dispatch_width(const struct shader_info *info)
{
   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
      assert(gl_shader_stage_uses_workgroup(info->stage));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      return (unsigned)info->subgroup_size;
   } else {
      return 0;
   }
}

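/* Returns true if bit `bit` is set in `mask`. */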
static inline bool
test_bit(unsigned mask, unsigned bit) {
   return mask & (1u << bit);
}

namespace {

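/* The selection state holds either CS or BS (bindless shader) prog_data;
 * these helpers extract the concrete pointer from the variant, returning
 * nullptr when the state holds a different type.
 */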
struct brw_cs_prog_data *
get_cs_prog_data(brw_simd_selection_state &state)
{
   if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
      return std::get<struct brw_cs_prog_data *>(state.prog_data);
   else
      return nullptr;
}

struct brw_stage_prog_data *
get_prog_data(brw_simd_selection_state &state)
{
   if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
      return &std::get<struct brw_cs_prog_data *>(state.prog_data)->base;
   else if (std::holds_alternative<struct brw_bs_prog_data *>(state.prog_data))
      return &std::get<struct brw_bs_prog_data *>(state.prog_data)->base;
   else
      return nullptr;
}

}

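/* Returns whether the variant at SIMD index `simd` (SIMD8/16/32 for indices
 * 0/1/2) is worth compiling.  When it is not, a human-readable reason is
 * stored in state.error[simd].
 */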
bool
brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
{
   assert(simd < SIMD_COUNT);
   assert(!state.compiled[simd]);

   const auto cs_prog_data = get_cs_prog_data(state);
   const auto prog_data = get_prog_data(state);
   const unsigned width = 8u << simd;

   /* For shaders with a variable workgroup size we can usually compile all
    * the variants (bindless dispatch and ray queries being the exceptions),
    * since the choice only happens at dispatch time.
    */
   const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;

   if (!workgroup_size_variable) {
      if (state.spilled[simd]) {
         state.error[simd] = "Would spill";
         return false;
      }

      if (state.required_width && state.required_width != width) {
         state.error[simd] = "Different than required dispatch width";
         return false;
      }

      if (cs_prog_data) {
         const unsigned workgroup_size = cs_prog_data->local_size[0] *
                                         cs_prog_data->local_size[1] *
                                         cs_prog_data->local_size[2];

         unsigned max_threads = state.devinfo->max_cs_workgroup_threads;

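         /* Skip this width when a narrower, already-compiled variant can
          * already hold the whole workgroup in a single hardware thread.
          * On Xe2+ (ver >= 20) SIMD8 is not available, so SIMD16 is the
          * narrowest candidate there.
          */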
         const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
         if (simd > min_simd && state.compiled[simd - 1] &&
             workgroup_size <= (width / 2)) {
            state.error[simd] = "Workgroup size already fits in smaller SIMD";
            return false;
         }

         if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
            state.error[simd] = "Would need more than max_threads to fit all invocations";
            return false;
         }
      }

      /* SIMD32 is only enabled when it is actually needed, unless forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32 && state.devinfo->ver < 20) {
         if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
            state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
            return false;
         }
      }
   }

   if (width == 8 && state.devinfo->ver >= 20) {
      state.error[simd] = "SIMD8 not supported on Xe2+";
      return false;
   }

   if (width == 32 && cs_prog_data && cs_prog_data->base.ray_queries > 0) {
      state.error[simd] = "Ray queries not supported";
      return false;
   }

   if (width == 32 && cs_prog_data && cs_prog_data->uses_btd_stack_ids) {
      state.error[simd] = "Bindless shader calls not supported";
      return false;
   }

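   /* The intel_simd debug mask reserves three consecutive bits per stage
    * group (SIMD8, SIMD16, SIMD32), so shifting the stage's SIMD8 bit by
    * the SIMD index selects the bit for this width.
    */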
   uint64_t start;
   switch (prog_data->stage) {
   case MESA_SHADER_COMPUTE:
      start = DEBUG_CS_SIMD8;
      break;
   case MESA_SHADER_TASK:
      start = DEBUG_TS_SIMD8;
      break;
   case MESA_SHADER_MESH:
      start = DEBUG_MS_SIMD8;
      break;
   case MESA_SHADER_RAYGEN:
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_MISS:
   case MESA_SHADER_INTERSECTION:
   case MESA_SHADER_CALLABLE:
      start = DEBUG_RT_SIMD8;
      break;
   default:
      unreachable("unknown shader stage in brw_simd_should_compile");
   }

   const bool env_skip[] = {
      (intel_simd & (start << 0)) == 0,
      (intel_simd & (start << 1)) == 0,
      (intel_simd & (start << 2)) == 0,
   };

   static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);

   if (unlikely(env_skip[simd])) {
      state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
      return false;
   }

   return true;
}

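/* Records that the variant at SIMD index `simd` was compiled and whether it
 * spilled, updating the CS prog_data masks when present.
 */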
void
brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled)
{
   assert(simd < SIMD_COUNT);
   assert(!state.compiled[simd]);

   auto cs_prog_data = get_cs_prog_data(state);

   state.compiled[simd] = true;
   if (cs_prog_data)
      cs_prog_data->prog_mask |= 1u << simd;

   /* If this SIMD width spilled, all the larger widths would spill too. */
   if (spilled) {
      for (unsigned i = simd; i < SIMD_COUNT; i++) {
         state.spilled[i] = true;
         if (cs_prog_data)
            cs_prog_data->prog_spilled |= 1u << i;
      }
   }
}

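/* Returns the SIMD index of the variant to dispatch: the widest compiled
 * variant that did not spill, falling back to the widest compiled variant,
 * or -1 if nothing was compiled.
 */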
int
brw_simd_select(const struct brw_simd_selection_state &state)
{
   for (int i = SIMD_COUNT - 1; i >= 0; i--) {
      if (state.compiled[i] && !state.spilled[i])
         return i;
   }
   for (int i = SIMD_COUNT - 1; i >= 0; i--) {
      if (state.compiled[i])
         return i;
   }
   return -1;
}

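/* A minimal illustration (not taken from any particular driver) of how the
 * helpers above are typically chained together when compiling a compute-like
 * shader; compile_variant() and info are placeholders for the caller's
 * per-width compile step and shader_info:
 *
 *    brw_simd_selection_state state{};
 *    state.devinfo = devinfo;
 *    state.prog_data = cs_prog_data;
 *    state.required_width = brw_required_dispatch_width(info);
 *
 *    for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
 *       if (brw_simd_should_compile(state, simd))
 *          brw_simd_mark_compiled(state, simd, compile_variant(simd));
 *    }
 *
 *    int selected = brw_simd_select(state);   // -1 means nothing compiled
 */

/* Variant of brw_simd_select() for when the real workgroup size is only
 * known at dispatch time: re-evaluates the per-width restrictions for
 * `sizes` against the variants recorded in prog_data.
 */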
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct brw_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   if (!sizes || (prog_data->local_size[0] == sizes[0] &&
                  prog_data->local_size[1] == sizes[1] &&
                  prog_data->local_size[2] == sizes[2])) {
      brw_simd_selection_state simd_state{
         .prog_data = const_cast<struct brw_cs_prog_data *>(prog_data),
      };

      /* Propagate the prog_data information back to the simd_state,
       * so we can use select() directly.
       */
      for (int i = 0; i < SIMD_COUNT; i++) {
         simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
         simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
      }

      return brw_simd_select(simd_state);
   }

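   /* The requested size differs from the one prog_data was compiled with,
    * so redo the per-width filtering on a local copy using that size.
    */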
   struct brw_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];

   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   brw_simd_selection_state simd_state{
      .devinfo = devinfo,
      .prog_data = &cloned,
   };

   for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
      /* We are not recompiling anything, so reuse the original prog_mask and
       * prog_spilled results, which already cover every variant that could
       * have been compiled.
       */
      if (brw_simd_should_compile(simd_state, simd) &&
          test_bit(prog_data->prog_mask, simd)) {
         brw_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
      }
   }

   return brw_simd_select(simd_state);
}