// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "ComputeProgram.hpp"
#include "Constants.hpp"

#include "System/Debug.hpp"
#include "Vulkan/VkPipelineLayout.hpp"

#include "marl/defer.h"
#include "marl/trace.h"
#include "marl/waitgroup.h"

#include <queue>

namespace {

// Component indices used to address the X/Y/Z dimensions of workgroup
// and invocation ID vectors throughout this file.
enum
{
	X = 0,
	Y = 1,
	Z = 2
};

}  // anonymous namespace

namespace sw {

ComputeProgram(SpirvShader const * shader,vk::PipelineLayout const * pipelineLayout,const vk::DescriptorSet::Bindings & descriptorSets)40 ComputeProgram::ComputeProgram(SpirvShader const *shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
41 : shader(shader)
42 , pipelineLayout(pipelineLayout)
43 , descriptorSets(descriptorSets)
44 {
45 }
46
~ComputeProgram()47 ComputeProgram::~ComputeProgram()
48 {
49 }
50
generate()51 void ComputeProgram::generate()
52 {
53 MARL_SCOPED_EVENT("ComputeProgram::generate");
54
55 SpirvRoutine routine(pipelineLayout);
56 shader->emitProlog(&routine);
57 emit(&routine);
58 shader->emitEpilog(&routine);
59 }
60
setWorkgroupBuiltins(Pointer<Byte> data,SpirvRoutine * routine,Int workgroupID[3])61 void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3])
62 {
63 // TODO(b/146486064): Consider only assigning these to the SpirvRoutine iff
64 // they are ever going to be read.
65 routine->numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
66 routine->workgroupID = Insert(Insert(Insert(Int4(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
67 routine->workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
68 routine->subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
69 routine->invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));
70
71 routine->setInputBuiltin(shader, spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
72 for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
73 {
74 value[builtin.FirstComponent + component] =
75 As<SIMD::Float>(SIMD::Int(Extract(routine->numWorkgroups, component)));
76 }
77 });
78
79 routine->setInputBuiltin(shader, spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
80 for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
81 {
82 value[builtin.FirstComponent + component] =
83 As<SIMD::Float>(SIMD::Int(workgroupID[component]));
84 }
85 });
86
87 routine->setInputBuiltin(shader, spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
88 for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
89 {
90 value[builtin.FirstComponent + component] =
91 As<SIMD::Float>(SIMD::Int(Extract(routine->workgroupSize, component)));
92 }
93 });
94
95 routine->setInputBuiltin(shader, spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
96 ASSERT(builtin.SizeInComponents == 1);
97 value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->subgroupsPerWorkgroup));
98 });
99
100 routine->setInputBuiltin(shader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
101 ASSERT(builtin.SizeInComponents == 1);
102 value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->invocationsPerSubgroup));
103 });
104
105 routine->setImmutableInputBuiltins(shader);
106 }
107
setSubgroupBuiltins(Pointer<Byte> data,SpirvRoutine * routine,Int workgroupID[3],SIMD::Int localInvocationIndex,Int subgroupIndex)108 void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
109 {
110 Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
111 Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
112
113 // TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
114 Int workgroupSizeX = Extract(workgroupSize, X);
115 Int workgroupSizeY = Extract(workgroupSize, Y);
116
117 SIMD::Int localInvocationID[3];
118 {
119 SIMD::Int idx = localInvocationIndex;
120 localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
121 idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY); // modulo
122 localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
123 idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX); // modulo
124 localInvocationID[X] = idx;
125 }
126
127 Int4 wgID = Insert(Insert(Insert(SIMD::Int(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
128 auto localBase = workgroupSize * wgID;
129 SIMD::Int globalInvocationID[3];
130 globalInvocationID[X] = SIMD::Int(Extract(localBase, X)) + localInvocationID[X];
131 globalInvocationID[Y] = SIMD::Int(Extract(localBase, Y)) + localInvocationID[Y];
132 globalInvocationID[Z] = SIMD::Int(Extract(localBase, Z)) + localInvocationID[Z];
133
134 routine->localInvocationIndex = localInvocationIndex;
135 routine->subgroupIndex = subgroupIndex;
136 routine->localInvocationID[X] = localInvocationID[X];
137 routine->localInvocationID[Y] = localInvocationID[Y];
138 routine->localInvocationID[Z] = localInvocationID[Z];
139 routine->globalInvocationID[X] = globalInvocationID[X];
140 routine->globalInvocationID[Y] = globalInvocationID[Y];
141 routine->globalInvocationID[Z] = globalInvocationID[Z];
142
143 routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
144 ASSERT(builtin.SizeInComponents == 1);
145 value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
146 });
147
148 routine->setInputBuiltin(shader, spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
149 ASSERT(builtin.SizeInComponents == 1);
150 value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
151 });
152
153 routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
154 for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
155 {
156 value[builtin.FirstComponent + component] =
157 As<SIMD::Float>(localInvocationID[component]);
158 }
159 });
160
161 routine->setInputBuiltin(shader, spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
162 for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
163 {
164 value[builtin.FirstComponent + component] =
165 As<SIMD::Float>(globalInvocationID[component]);
166 }
167 });
168 }
169
emit(SpirvRoutine * routine)170 void ComputeProgram::emit(SpirvRoutine *routine)
171 {
172 Pointer<Byte> data = Arg<0>();
173 Int workgroupX = Arg<1>();
174 Int workgroupY = Arg<2>();
175 Int workgroupZ = Arg<3>();
176 Pointer<Byte> workgroupMemory = Arg<4>();
177 Int firstSubgroup = Arg<5>();
178 Int subgroupCount = Arg<6>();
179
180 routine->descriptorSets = data + OFFSET(Data, descriptorSets);
181 routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
182 routine->pushConstants = data + OFFSET(Data, pushConstants);
183 routine->constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
184 routine->workgroupMemory = workgroupMemory;
185
186 Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
187
188 Int workgroupID[3] = { workgroupX, workgroupY, workgroupZ };
189 setWorkgroupBuiltins(data, routine, workgroupID);
190
191 For(Int i = 0, i < subgroupCount, i++)
192 {
193 auto subgroupIndex = firstSubgroup + i;
194
195 // TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
196 auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);
197
198 // Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
199 auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));
200
201 setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);
202
203 shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
204 }
205 }
206
run(vk::DescriptorSet::Bindings const & descriptorSets,vk::DescriptorSet::DynamicOffsets const & descriptorDynamicOffsets,PushConstantStorage const & pushConstants,uint32_t baseGroupX,uint32_t baseGroupY,uint32_t baseGroupZ,uint32_t groupCountX,uint32_t groupCountY,uint32_t groupCountZ)207 void ComputeProgram::run(
208 vk::DescriptorSet::Bindings const &descriptorSets,
209 vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
210 PushConstantStorage const &pushConstants,
211 uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
212 uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
213 {
214 auto &modes = shader->getModes();
215
216 auto invocationsPerSubgroup = SIMD::Width;
217 auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
218 auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
219
220 Data data;
221 data.descriptorSets = descriptorSets;
222 data.descriptorDynamicOffsets = descriptorDynamicOffsets;
223 data.numWorkgroups[X] = groupCountX;
224 data.numWorkgroups[Y] = groupCountY;
225 data.numWorkgroups[Z] = groupCountZ;
226 data.numWorkgroups[3] = 0;
227 data.workgroupSize[X] = modes.WorkgroupSizeX;
228 data.workgroupSize[Y] = modes.WorkgroupSizeY;
229 data.workgroupSize[Z] = modes.WorkgroupSizeZ;
230 data.workgroupSize[3] = 0;
231 data.invocationsPerSubgroup = invocationsPerSubgroup;
232 data.invocationsPerWorkgroup = invocationsPerWorkgroup;
233 data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
234 data.pushConstants = pushConstants;
235 data.constants = &sw::constants;
236
237 marl::WaitGroup wg;
238 const uint32_t batchCount = 16;
239
240 auto groupCount = groupCountX * groupCountY * groupCountZ;
241
242 for(uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
243 {
244 wg.add(1);
245 marl::schedule([=, &data] {
246 defer(wg.done());
247 std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
248
249 for(uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
250 {
251 auto modulo = groupIndex;
252 auto groupOffsetZ = modulo / (groupCountX * groupCountY);
253 modulo -= groupOffsetZ * (groupCountX * groupCountY);
254 auto groupOffsetY = modulo / groupCountX;
255 modulo -= groupOffsetY * groupCountX;
256 auto groupOffsetX = modulo;
257
258 auto groupZ = baseGroupZ + groupOffsetZ;
259 auto groupY = baseGroupY + groupOffsetY;
260 auto groupX = baseGroupX + groupOffsetX;
261 MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
262
263 using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
264 std::queue<Coroutine> coroutines;
265
266 if(modes.ContainsControlBarriers)
267 {
268 // Make a function call per subgroup so each subgroup
269 // can yield, bringing all subgroups to the barrier
270 // together.
271 for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
272 {
273 auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
274 coroutines.push(std::move(coroutine));
275 }
276 }
277 else
278 {
279 auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
280 coroutines.push(std::move(coroutine));
281 }
282
283 while(coroutines.size() > 0)
284 {
285 auto coroutine = std::move(coroutines.front());
286 coroutines.pop();
287
288 SpirvShader::YieldResult result;
289 if(coroutine->await(result))
290 {
291 // TODO: Consider result (when the enum is more than 1 entry).
292 coroutines.push(std::move(coroutine));
293 }
294 }
295 }
296 });
297 }
298
299 wg.wait();
300 }
301
}  // namespace sw