/*
 * Copyright © 2021 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_private.h"
#include "compiler/shader_info.h"
#include "intel/dev/intel_debug.h"
#include "intel/dev/intel_device_info.h"
#include "util/ralloc.h"

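/* Returns the dispatch width required by the shader's subgroup size, or 0 if
 * any width may be used.  The SUBGROUP_SIZE_REQUIRE_* enum values are chosen
 * to be equal to the widths they require, so the value can be returned
 * directly.
 */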
unsigned
brw_required_dispatch_width(const struct shader_info *info)
{
   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
      assert(gl_shader_stage_uses_workgroup(info->stage));
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      return (unsigned)info->subgroup_size;
   } else {
      return 0;
   }
}

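/* Returns true if the given bit is set in mask. */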
static inline bool
test_bit(unsigned mask, unsigned bit) {
   return mask & (1u << bit);
}

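/* Decides whether the SIMD variant with index `simd` (0, 1 and 2 map to
 * SIMD8, SIMD16 and SIMD32) is worth compiling for this compute shader.
 * When a variant is skipped, *error is set to a human-readable reason
 * allocated out of mem_ctx.
 */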
bool
brw_simd_should_compile(void *mem_ctx,
                        unsigned simd,
                        const struct intel_device_info *devinfo,
                        struct brw_cs_prog_data *prog_data,
                        unsigned required,
                        const char **error)
{
   assert(!test_bit(prog_data->prog_mask, simd));
   assert(error);

   const unsigned width = 8u << simd;

   /* For shaders with a variable workgroup size, we will always compile all
    * the variants, since the choice happens only at dispatch time.
    */
   const bool workgroup_size_variable = prog_data->local_size[0] == 0;

   if (!workgroup_size_variable) {
      if (test_bit(prog_data->prog_spilled, simd)) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because it would spill", width);
         return false;
      }

      const unsigned workgroup_size = prog_data->local_size[0] *
                                      prog_data->local_size[1] *
                                      prog_data->local_size[2];

      unsigned max_threads = devinfo->max_cs_workgroup_threads;

      if (required && required != width) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because required dispatch width is %u",
            width, required);
         return false;
      }

      if (simd > 0 && test_bit(prog_data->prog_mask, simd - 1) &&
          workgroup_size <= (width / 2)) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u skipped because workgroup size %u already fits in SIMD%u",
            width, workgroup_size, width / 2);
         return false;
      }

      if (DIV_ROUND_UP(workgroup_size, width) > max_threads) {
         *error = ralloc_asprintf(
            mem_ctx, "SIMD%u can't fit all %u invocations in %u threads",
            width, workgroup_size, max_threads);
         return false;
      }

      /* SIMD32 is only enabled when it is needed, unless forced.
       *
       * TODO: Use performance_analysis and drop this rule.
       */
      if (width == 32) {
         if (!INTEL_DEBUG(DEBUG_DO32) && prog_data->prog_mask) {
            *error = ralloc_strdup(
               mem_ctx, "SIMD32 skipped because not required");
            return false;
         }
      }
   }

   const bool env_skip[3] = {
      INTEL_DEBUG(DEBUG_NO8),
      INTEL_DEBUG(DEBUG_NO16),
      INTEL_DEBUG(DEBUG_NO32),
   };

   if (unlikely(env_skip[simd])) {
      *error = ralloc_asprintf(
         mem_ctx, "SIMD%u skipped because INTEL_DEBUG=no%u",
         width, width);
      return false;
   }

   return true;
}

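/* Records that the SIMD variant with index `simd` was compiled and whether it
 * spilled.  A spill is propagated to all larger widths, since they would
 * spill as well.
 */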
void
brw_simd_mark_compiled(unsigned simd, struct brw_cs_prog_data *prog_data, bool spilled)
{
   assert(!test_bit(prog_data->prog_mask, simd));

   prog_data->prog_mask |= 1u << simd;

   /* If a SIMD spilled, all the larger ones would spill too. */
   if (spilled) {
      for (unsigned i = simd; i < 3; i++)
         prog_data->prog_spilled |= 1u << i;
   }
}

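/* Picks which compiled SIMD variant to dispatch: the widest one that did not
 * spill, otherwise the narrowest one that was compiled, or -1 if nothing was
 * compiled at all.
 */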
int
brw_simd_select(const struct brw_cs_prog_data *prog_data)
{
   assert((prog_data->prog_mask & ~0x7u) == 0);
   const unsigned not_spilled_mask =
      prog_data->prog_mask & ~prog_data->prog_spilled;

   /* Util functions index bits from 1, not 0, so adjust before returning. */

   if (not_spilled_mask)
      return util_last_bit(not_spilled_mask) - 1;
   else if (prog_data->prog_mask)
      return ffs(prog_data->prog_mask) - 1;
   else
      return -1;
}

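/* Like brw_simd_select(), but for a workgroup size that is only known at
 * dispatch time (passed in `sizes`).  The selection is re-run on a clone of
 * prog_data with that size filled in, reusing the original compilation
 * results rather than recompiling.
 */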
int
brw_simd_select_for_workgroup_size(const struct intel_device_info *devinfo,
                                   const struct brw_cs_prog_data *prog_data,
                                   const unsigned *sizes)
{
   assert(sizes);

   if (prog_data->local_size[0] == sizes[0] &&
       prog_data->local_size[1] == sizes[1] &&
       prog_data->local_size[2] == sizes[2])
      return brw_simd_select(prog_data);

   void *mem_ctx = ralloc_context(NULL);

   struct brw_cs_prog_data cloned = *prog_data;
   for (unsigned i = 0; i < 3; i++)
      cloned.local_size[i] = sizes[i];

   cloned.prog_mask = 0;
   cloned.prog_spilled = 0;

   const char *error[3] = {0};

   for (unsigned simd = 0; simd < 3; simd++) {
      /* We are not recompiling, so use the original prog_mask and
       * prog_spilled results, as they already contain all possible
       * compilations.
       */
      if (brw_simd_should_compile(mem_ctx, simd, devinfo, &cloned,
                                  0 /* required_dispatch_width */, &error[simd]) &&
          test_bit(prog_data->prog_mask, simd)) {
         brw_simd_mark_compiled(simd, &cloned, test_bit(prog_data->prog_spilled, simd));
      }
   }

   ralloc_free(mem_ctx);

   return brw_simd_select(&cloned);
}