/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"

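/* Dispatch width required by the shader's subgroup-size declaration, or 0
 * if the shader can be compiled at any width.
 */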
unsigned
brw_required_dispatch_width(const struct shader_info *info)
{
   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      return (unsigned)info->subgroup_size;
   } else {
      return 0;
   }
}

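/* Dispatch width used for geometry pipeline stages: SIMD16 on Xe2+
 * (ver >= 20), SIMD8 on earlier platforms.
 */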
unsigned
brw_geometry_stage_dispatch_width(const struct intel_device_info *devinfo)
{
   if (devinfo->ver >= 20)
      return 16;
   return 8;
}

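/* Returns true when the given bit is set in the mask. */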
static inline bool
test_bit(unsigned mask, unsigned bit) {
   return mask & (1u << bit);
}

namespace {

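/* Helpers to extract prog_data pointers from the variant stored in the
 * selection state: get_cs_prog_data() yields the compute prog_data (or
 * NULL), while get_prog_data() yields the common base for either compute
 * or bindless shaders.
 */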
struct brw_cs_prog_data *
get_cs_prog_data(brw_simd_selection_state &state)
{
   if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
      return std::get<struct brw_cs_prog_data *>(state.prog_data);
   else
      return nullptr;
}

struct brw_stage_prog_data *
get_prog_data(brw_simd_selection_state &state)
{
   if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
      return &std::get<struct brw_cs_prog_data *>(state.prog_data)->base;
   else if (std::holds_alternative<struct brw_bs_prog_data *>(state.prog_data))
      return &std::get<struct brw_bs_prog_data *>(state.prog_data)->base;
   else
      return nullptr;
}

}

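/* Decide whether the SIMD variant `simd` (0 = SIMD8, 1 = SIMD16,
 * 2 = SIMD32) is worth compiling for this shader.  When it is not, a
 * human-readable reason is recorded in state.error[simd].
 */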
bool
brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
{
   assert(simd < SIMD_COUNT);
   assert(!state.compiled[simd]);

   const auto cs_prog_data = get_cs_prog_data(state);
   const auto prog_data = get_prog_data(state);
   const unsigned width = 8u << simd;

   if (state.required_width && state.required_width != width) {
      state.error[simd] = "Different than required dispatch width";
      return false;
   }

   /* For shaders with a variable workgroup size, in most cases we can
    * compile all the variants (the exceptions are bindless dispatch and ray
    * queries), since the choice will only happen at dispatch time.
    */
   const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;

   if (!workgroup_size_variable && !state.required_width) {
      if (state.spilled[simd]) {
         state.error[simd] = "Would spill";
         return false;
      }

      if (cs_prog_data) {
         const unsigned workgroup_size = cs_prog_data->local_size[0] *
                                         cs_prog_data->local_size[1] *
                                         cs_prog_data->local_size[2];

         unsigned max_threads = state.devinfo->max_cs_workgroup_threads;

         const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
         if (simd > min_simd && workgroup_size <= (width / 2)) {
            state.error[simd] = "Workgroup size already fits in smaller SIMD";
            return false;
         }

         if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
            state.error[simd] = "Would need more than max_threads to fit all invocations";
            return false;
         }
      }

      /* SIMD32 is only enabled when it is needed, unless forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32 && state.devinfo->ver < 20) {
         if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
            state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
            return false;
         }
      }
   }

   if (width == 8 && state.devinfo->ver >= 20) {
      state.error[simd] = "SIMD8 not supported on Xe2+";
      return false;
   }

   if (width == 32 && cs_prog_data && cs_prog_data->base.ray_queries > 0) {
      state.error[simd] = "Ray queries not supported";
      return false;
   }

   if (width == 32 && cs_prog_data && cs_prog_data->uses_btd_stack_ids) {
      state.error[simd] = "Bindless shader calls not supported";
      return false;
   }

   uint64_t start;
   switch (prog_data->stage) {
   case MESA_SHADER_COMPUTE:
      start = DEBUG_CS_SIMD8;
      break;
   case MESA_SHADER_TASK:
      start = DEBUG_TS_SIMD8;
      break;
   case MESA_SHADER_MESH:
      start = DEBUG_MS_SIMD8;
      break;
   case MESA_SHADER_RAYGEN:
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_MISS:
   case MESA_SHADER_INTERSECTION:
   case MESA_SHADER_CALLABLE:
      start = DEBUG_RT_SIMD8;
      break;
   default:
      unreachable("unknown shader stage in brw_simd_should_compile");
   }

   const bool env_skip[] = {
      (intel_simd & (start << 0)) == 0,
      (intel_simd & (start << 1)) == 0,
      (intel_simd & (start << 2)) == 0,
   };

   static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);

   if (unlikely(env_skip[simd])) {
      state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
      return false;
   }

   return true;
}

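/* Record that the given SIMD variant was compiled and whether it spilled.
 * Spilling is propagated to all wider variants, since they would spill too.
 */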
void
brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled)
{
   assert(simd < SIMD_COUNT);
   assert(!state.compiled[simd]);

   auto cs_prog_data = get_cs_prog_data(state);

   state.compiled[simd] = true;
   if (cs_prog_data)
      cs_prog_data->prog_mask |= 1u << simd;

   /* If a SIMD spilled, all the larger ones would spill too. */
   if (spilled) {
      for (unsigned i = simd; i < SIMD_COUNT; i++) {
         state.spilled[i] = true;
         if (cs_prog_data)
            cs_prog_data->prog_spilled |= 1u << i;
      }
   }
}

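/* Pick the widest compiled variant that did not spill, falling back to the
 * widest compiled variant if they all spilled.  Returns -1 when nothing was
 * compiled.
 */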
int
brw_simd_select(const struct brw_simd_selection_state &state)
{
   for (int i = SIMD_COUNT - 1; i >= 0; i--) {
      if (state.compiled[i] && !state.spilled[i])
         return i;
   }
   for (int i = SIMD_COUNT - 1; i >= 0; i--) {
      if (state.compiled[i])
         return i;
   }
   return -1;
}

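/* Select a SIMD variant for a specific workgroup size.  When `sizes` is
 * NULL or matches the size the program was compiled with, the compiled
 * results are used directly; otherwise the selection logic is re-run
 * against a clone of prog_data with the new sizes, reusing the existing
 * prog_mask/prog_spilled bits.
 */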
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct brw_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   if (!sizes || (prog_data->local_size[0] == sizes[0] &&
                  prog_data->local_size[1] == sizes[1] &&
                  prog_data->local_size[2] == sizes[2])) {
      brw_simd_selection_state simd_state{
         .prog_data = const_cast<struct brw_cs_prog_data *>(prog_data),
      };

      /* Propagate the prog_data information back to the simd_state,
       * so we can use select() directly.
       */
      for (int i = 0; i < SIMD_COUNT; i++) {
         simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
         simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
      }

      return brw_simd_select(simd_state);
   }

   struct brw_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];

   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   brw_simd_selection_state simd_state{
      .devinfo = devinfo,
      .prog_data = &cloned,
   };

   for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
      /* We are not recompiling, so use the original results of prog_mask and
       * prog_spilled, as they already contain all possible compilations.
       */
      if (brw_simd_should_compile(simd_state, simd) &&
          test_bit(prog_data->prog_mask, simd)) {
         brw_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
      }
   }

   return brw_simd_select(simd_state);
}