• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2021 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "brw_private.h"
25 #include "compiler/shader_info.h"
26 #include "intel/dev/intel_debug.h"
27 #include "intel/dev/intel_device_info.h"
28 #include "util/ralloc.h"
29 
30 unsigned
brw_required_dispatch_width(const struct shader_info * info)31 brw_required_dispatch_width(const struct shader_info *info)
32 {
33    if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
34       /* These enum values are expressly chosen to be equal to the subgroup
35        * size that they require.
36        */
37       return (unsigned)info->subgroup_size;
38    } else {
39       return 0;
40    }
41 }
42 
43 unsigned
brw_geometry_stage_dispatch_width(const struct intel_device_info * devinfo)44 brw_geometry_stage_dispatch_width(const struct intel_device_info *devinfo)
45 {
46    if (devinfo->ver >= 20)
47       return 16;
48    return 8;
49 }
50 
/* Returns true when bit `bit` is set in `mask`. */
static inline bool
test_bit(unsigned mask, unsigned bit) {
   return ((mask >> bit) & 1u) != 0;
}
55 
56 namespace {
57 
58 struct brw_cs_prog_data *
get_cs_prog_data(brw_simd_selection_state & state)59 get_cs_prog_data(brw_simd_selection_state &state)
60 {
61    if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
62       return std::get<struct brw_cs_prog_data *>(state.prog_data);
63    else
64       return nullptr;
65 }
66 
67 struct brw_stage_prog_data *
get_prog_data(brw_simd_selection_state & state)68 get_prog_data(brw_simd_selection_state &state)
69 {
70    if (std::holds_alternative<struct brw_cs_prog_data *>(state.prog_data))
71       return &std::get<struct brw_cs_prog_data *>(state.prog_data)->base;
72    else if (std::holds_alternative<struct brw_bs_prog_data *>(state.prog_data))
73       return &std::get<struct brw_bs_prog_data *>(state.prog_data)->base;
74    else
75       return nullptr;
76 }
77 
78 }
79 
/* Decides whether the SIMD variant `simd` (0 = SIMD8, 1 = SIMD16,
 * 2 = SIMD32) should be compiled for the shader described by `state`.
 * On rejection, a human-readable reason is recorded in state.error[simd].
 */
bool
brw_simd_should_compile(brw_simd_selection_state &state, unsigned simd)
{
   assert(simd < SIMD_COUNT);
   assert(!state.compiled[simd]);

   /* cs_prog_data is non-NULL only when the state holds compute prog_data;
    * prog_data is the stage-agnostic base used below for the stage switch.
    */
   const auto cs_prog_data = get_cs_prog_data(state);
   const auto prog_data = get_prog_data(state);
   const unsigned width = 8u << simd;

   if (state.required_width && state.required_width != width) {
      state.error[simd] = "Different than required dispatch width";
      return false;
   }

   /* For shaders with variable size workgroup, in most cases we can compile
    * all the variants (exceptions are bindless dispatch & ray queries), since
    * the choice will happen only at dispatch time.
    */
   const bool workgroup_size_variable = cs_prog_data && cs_prog_data->local_size[0] == 0;

   /* The heuristics below only apply when the workgroup size is known and
    * no specific width is required by the shader.
    */
   if (!workgroup_size_variable && !state.required_width) {
      if (state.spilled[simd]) {
         state.error[simd] = "Would spill";
         return false;
      }

      if (cs_prog_data) {
         const unsigned workgroup_size = cs_prog_data->local_size[0] *
                                         cs_prog_data->local_size[1] *
                                         cs_prog_data->local_size[2];

         unsigned max_threads = state.devinfo->max_cs_workgroup_threads;

         /* Gfx ver 20+ has no SIMD8 (rejected below), so its smallest
          * usable variant index is 1.
          */
         const unsigned min_simd = state.devinfo->ver >= 20 ? 1 : 0;
         if (simd > min_simd && workgroup_size <= (width / 2)) {
            state.error[simd] = "Workgroup size already fits in smaller SIMD";
            return false;
         }

         if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
            state.error[simd] = "Would need more than max_threads to fit all invocations";
            return false;
         }
      }

      /* The SIMD32 is only enabled for cases it is needed unless forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32 && state.devinfo->ver < 20) {
         if (!INTEL_DEBUG(DEBUG_DO32) && (state.compiled[0] || state.compiled[1])) {
            state.error[simd] = "SIMD32 not required (use INTEL_DEBUG=do32 to force)";
            return false;
         }
      }
   }

   if (width == 8 && state.devinfo->ver >= 20) {
      state.error[simd] = "SIMD8 not supported on Xe2+";
      return false;
   }

   if (width == 32 && cs_prog_data && cs_prog_data->base.ray_queries > 0) {
      state.error[simd] = "Ray queries not supported";
      return false;
   }

   if (width == 32 && cs_prog_data && cs_prog_data->uses_btd_stack_ids) {
      state.error[simd] = "Bindless shader calls not supported";
      return false;
   }

   /* Map the stage to its block of per-width debug-override bits: `start`
    * is the SIMD8 bit, and the next two bits (start << 1, start << 2)
    * select SIMD16 and SIMD32 respectively.
    */
   uint64_t start;
   switch (prog_data->stage) {
   case MESA_SHADER_COMPUTE:
      start = DEBUG_CS_SIMD8;
      break;
   case MESA_SHADER_TASK:
      start = DEBUG_TS_SIMD8;
      break;
   case MESA_SHADER_MESH:
      start = DEBUG_MS_SIMD8;
      break;
   case MESA_SHADER_RAYGEN:
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_MISS:
   case MESA_SHADER_INTERSECTION:
   case MESA_SHADER_CALLABLE:
      start = DEBUG_RT_SIMD8;
      break;
   default:
      unreachable("unknown shader stage in brw_simd_should_compile");
   }

   /* Let the environment (intel_simd, driven by INTEL_DEBUG) veto
    * individual widths.
    */
   const bool env_skip[] = {
      (intel_simd & (start << 0)) == 0,
      (intel_simd & (start << 1)) == 0,
      (intel_simd & (start << 2)) == 0,
   };

   static_assert(ARRAY_SIZE(env_skip) == SIMD_COUNT);

   if (unlikely(env_skip[simd])) {
      state.error[simd] = "Disabled by INTEL_DEBUG environment variable";
      return false;
   }

   return true;
}
191 
192 void
brw_simd_mark_compiled(brw_simd_selection_state & state,unsigned simd,bool spilled)193 brw_simd_mark_compiled(brw_simd_selection_state &state, unsigned simd, bool spilled)
194 {
195    assert(simd < SIMD_COUNT);
196    assert(!state.compiled[simd]);
197 
198    auto cs_prog_data = get_cs_prog_data(state);
199 
200    state.compiled[simd] = true;
201    if (cs_prog_data)
202       cs_prog_data->prog_mask |= 1u << simd;
203 
204    /* If a SIMD spilled, all the larger ones would spill too. */
205    if (spilled) {
206       for (unsigned i = simd; i < SIMD_COUNT; i++) {
207          state.spilled[i] = true;
208          if (cs_prog_data)
209             cs_prog_data->prog_spilled |= 1u << i;
210       }
211    }
212 }
213 
214 int
brw_simd_select(const struct brw_simd_selection_state & state)215 brw_simd_select(const struct brw_simd_selection_state &state)
216 {
217    for (int i = SIMD_COUNT - 1; i >= 0; i--) {
218       if (state.compiled[i] && !state.spilled[i])
219          return i;
220    }
221    for (int i = SIMD_COUNT - 1; i >= 0; i--) {
222       if (state.compiled[i])
223          return i;
224    }
225    return -1;
226 }
227 
/* Picks (via brw_simd_select()) the SIMD variant to dispatch for workgroup
 * dimensions `sizes`, based on the compilation results already recorded in
 * `prog_data`.  `sizes` may be NULL, meaning "the sizes the shader was
 * compiled with".  Returns the variant index or -1 when none is usable.
 */
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct brw_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   /* Fast path: the requested size matches the compiled size, so the
    * recorded prog_mask/prog_spilled bits can be used directly.
    */
   if (!sizes || (prog_data->local_size[0] == sizes[0] &&
                  prog_data->local_size[1] == sizes[1] &&
                  prog_data->local_size[2] == sizes[2])) {
      brw_simd_selection_state simd_state{
         .prog_data = const_cast<struct brw_cs_prog_data *>(prog_data),
      };

      /* Propagate the prog_data information back to the simd_state,
       * so we can use select() directly.
       */
      for (int i = 0; i < SIMD_COUNT; i++) {
         simd_state.compiled[i] = test_bit(prog_data->prog_mask, i);
         simd_state.spilled[i] = test_bit(prog_data->prog_spilled, i);
      }

      return brw_simd_select(simd_state);
   }

   /* Different workgroup size: re-run the selection filters against a
    * clone of prog_data carrying the requested sizes, with the result
    * bitmasks cleared so brw_simd_mark_compiled() can rebuild them.
    */
   struct brw_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];

   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   brw_simd_selection_state simd_state{
      .devinfo = devinfo,
      .prog_data = &cloned,
   };

   for (unsigned simd = 0; simd < SIMD_COUNT; simd++) {
      /* We are not recompiling, so use original results of prog_mask and
       * prog_spilled as they will already contain all possible compilations.
       */
      if (brw_simd_should_compile(simd_state, simd) &&
          test_bit(prog_data->prog_mask, simd)) {
         brw_simd_mark_compiled(simd_state, simd, test_bit(prog_data->prog_spilled, simd));
      }
   }

   return brw_simd_select(simd_state);
}
275