• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "ComputeProgram.hpp"
16 #include "Constants.hpp"
17 
18 #include "System/Debug.hpp"
19 #include "Vulkan/VkPipelineLayout.hpp"
20 
21 #include "marl/defer.h"
22 #include "marl/trace.h"
23 #include "marl/waitgroup.h"
24 
25 #include <queue>
26 
namespace {

// Named indices for the X, Y and Z components of the 3/4-component vectors
// used below (workgroup IDs, workgroup sizes, invocation IDs).
enum
{
	X,
	Y,
	Z
};

}  // anonymous namespace
37 
38 namespace sw {
39 
// Captures the SPIR-V shader, pipeline layout and descriptor set bindings
// this compute program is built from. The referenced objects are not owned
// here and must outlive the ComputeProgram.
ComputeProgram::ComputeProgram(SpirvShader const *shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
    : shader(shader)
    , pipelineLayout(pipelineLayout)
    , descriptorSets(descriptorSets)
{
}
46 
~ComputeProgram()47 ComputeProgram::~ComputeProgram()
48 {
49 }
50 
generate()51 void ComputeProgram::generate()
52 {
53 	MARL_SCOPED_EVENT("ComputeProgram::generate");
54 
55 	SpirvRoutine routine(pipelineLayout);
56 	shader->emitProlog(&routine);
57 	emit(&routine);
58 	shader->emitEpilog(&routine);
59 }
60 
setWorkgroupBuiltins(Pointer<Byte> data,SpirvRoutine * routine,Int workgroupID[3])61 void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3])
62 {
63 	// TODO(b/146486064): Consider only assigning these to the SpirvRoutine iff
64 	// they are ever going to be read.
65 	routine->numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
66 	routine->workgroupID = Insert(Insert(Insert(Int4(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
67 	routine->workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
68 	routine->subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
69 	routine->invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));
70 
71 	routine->setInputBuiltin(shader, spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
72 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
73 		{
74 			value[builtin.FirstComponent + component] =
75 			    As<SIMD::Float>(SIMD::Int(Extract(routine->numWorkgroups, component)));
76 		}
77 	});
78 
79 	routine->setInputBuiltin(shader, spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
80 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
81 		{
82 			value[builtin.FirstComponent + component] =
83 			    As<SIMD::Float>(SIMD::Int(workgroupID[component]));
84 		}
85 	});
86 
87 	routine->setInputBuiltin(shader, spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
88 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
89 		{
90 			value[builtin.FirstComponent + component] =
91 			    As<SIMD::Float>(SIMD::Int(Extract(routine->workgroupSize, component)));
92 		}
93 	});
94 
95 	routine->setInputBuiltin(shader, spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
96 		ASSERT(builtin.SizeInComponents == 1);
97 		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->subgroupsPerWorkgroup));
98 	});
99 
100 	routine->setInputBuiltin(shader, spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
101 		ASSERT(builtin.SizeInComponents == 1);
102 		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->invocationsPerSubgroup));
103 	});
104 
105 	routine->setImmutableInputBuiltins(shader);
106 }
107 
setSubgroupBuiltins(Pointer<Byte> data,SpirvRoutine * routine,Int workgroupID[3],SIMD::Int localInvocationIndex,Int subgroupIndex)108 void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
109 {
110 	Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
111 	Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
112 
113 	// TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
114 	Int workgroupSizeX = Extract(workgroupSize, X);
115 	Int workgroupSizeY = Extract(workgroupSize, Y);
116 
117 	SIMD::Int localInvocationID[3];
118 	{
119 		SIMD::Int idx = localInvocationIndex;
120 		localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
121 		idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY);  // modulo
122 		localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
123 		idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX);  // modulo
124 		localInvocationID[X] = idx;
125 	}
126 
127 	Int4 wgID = Insert(Insert(Insert(SIMD::Int(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
128 	auto localBase = workgroupSize * wgID;
129 	SIMD::Int globalInvocationID[3];
130 	globalInvocationID[X] = SIMD::Int(Extract(localBase, X)) + localInvocationID[X];
131 	globalInvocationID[Y] = SIMD::Int(Extract(localBase, Y)) + localInvocationID[Y];
132 	globalInvocationID[Z] = SIMD::Int(Extract(localBase, Z)) + localInvocationID[Z];
133 
134 	routine->localInvocationIndex = localInvocationIndex;
135 	routine->subgroupIndex = subgroupIndex;
136 	routine->localInvocationID[X] = localInvocationID[X];
137 	routine->localInvocationID[Y] = localInvocationID[Y];
138 	routine->localInvocationID[Z] = localInvocationID[Z];
139 	routine->globalInvocationID[X] = globalInvocationID[X];
140 	routine->globalInvocationID[Y] = globalInvocationID[Y];
141 	routine->globalInvocationID[Z] = globalInvocationID[Z];
142 
143 	routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
144 		ASSERT(builtin.SizeInComponents == 1);
145 		value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
146 	});
147 
148 	routine->setInputBuiltin(shader, spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
149 		ASSERT(builtin.SizeInComponents == 1);
150 		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
151 	});
152 
153 	routine->setInputBuiltin(shader, spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
154 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
155 		{
156 			value[builtin.FirstComponent + component] =
157 			    As<SIMD::Float>(localInvocationID[component]);
158 		}
159 	});
160 
161 	routine->setInputBuiltin(shader, spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
162 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
163 		{
164 			value[builtin.FirstComponent + component] =
165 			    As<SIMD::Float>(globalInvocationID[component]);
166 		}
167 	});
168 }
169 
// Emits the Reactor code for the compute coroutine. The Arg<N> indices must
// match the argument order used when the routine is invoked via
// (*this)(&data, groupX, groupY, groupZ, workgroupMemory, firstSubgroup,
// subgroupCount) in ComputeProgram::run().
void ComputeProgram::emit(SpirvRoutine *routine)
{
	Pointer<Byte> data = Arg<0>();            // Per-dispatch Data block built in run().
	Int workgroupX = Arg<1>();
	Int workgroupY = Arg<2>();
	Int workgroupZ = Arg<3>();
	Pointer<Byte> workgroupMemory = Arg<4>();  // Workgroup-shared memory buffer.
	Int firstSubgroup = Arg<5>();              // First subgroup this call executes.
	Int subgroupCount = Arg<6>();              // Number of subgroups to execute here.

	// Resolve pointers into the Data block for descriptor-set, dynamic-offset
	// and push-constant access during shader emission.
	routine->descriptorSets = data + OFFSET(Data, descriptorSets);
	routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
	routine->pushConstants = data + OFFSET(Data, pushConstants);
	routine->constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
	routine->workgroupMemory = workgroupMemory;

	Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));

	Int workgroupID[3] = { workgroupX, workgroupY, workgroupZ };
	setWorkgroupBuiltins(data, routine, workgroupID);

	// Execute each subgroup assigned to this invocation in turn.
	For(Int i = 0, i < subgroupCount, i++)
	{
		auto subgroupIndex = firstSubgroup + i;

		// Per-lane linear invocation index within the workgroup.
		// TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
		auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);

		// Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
		auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));

		setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);

		shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
	}
}
206 
run(vk::DescriptorSet::Bindings const & descriptorSets,vk::DescriptorSet::DynamicOffsets const & descriptorDynamicOffsets,PushConstantStorage const & pushConstants,uint32_t baseGroupX,uint32_t baseGroupY,uint32_t baseGroupZ,uint32_t groupCountX,uint32_t groupCountY,uint32_t groupCountZ)207 void ComputeProgram::run(
208     vk::DescriptorSet::Bindings const &descriptorSets,
209     vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
210     PushConstantStorage const &pushConstants,
211     uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
212     uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
213 {
214 	auto &modes = shader->getModes();
215 
216 	auto invocationsPerSubgroup = SIMD::Width;
217 	auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
218 	auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
219 
220 	Data data;
221 	data.descriptorSets = descriptorSets;
222 	data.descriptorDynamicOffsets = descriptorDynamicOffsets;
223 	data.numWorkgroups[X] = groupCountX;
224 	data.numWorkgroups[Y] = groupCountY;
225 	data.numWorkgroups[Z] = groupCountZ;
226 	data.numWorkgroups[3] = 0;
227 	data.workgroupSize[X] = modes.WorkgroupSizeX;
228 	data.workgroupSize[Y] = modes.WorkgroupSizeY;
229 	data.workgroupSize[Z] = modes.WorkgroupSizeZ;
230 	data.workgroupSize[3] = 0;
231 	data.invocationsPerSubgroup = invocationsPerSubgroup;
232 	data.invocationsPerWorkgroup = invocationsPerWorkgroup;
233 	data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
234 	data.pushConstants = pushConstants;
235 	data.constants = &sw::constants;
236 
237 	marl::WaitGroup wg;
238 	const uint32_t batchCount = 16;
239 
240 	auto groupCount = groupCountX * groupCountY * groupCountZ;
241 
242 	for(uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
243 	{
244 		wg.add(1);
245 		marl::schedule([=, &data] {
246 			defer(wg.done());
247 			std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
248 
249 			for(uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
250 			{
251 				auto modulo = groupIndex;
252 				auto groupOffsetZ = modulo / (groupCountX * groupCountY);
253 				modulo -= groupOffsetZ * (groupCountX * groupCountY);
254 				auto groupOffsetY = modulo / groupCountX;
255 				modulo -= groupOffsetY * groupCountX;
256 				auto groupOffsetX = modulo;
257 
258 				auto groupZ = baseGroupZ + groupOffsetZ;
259 				auto groupY = baseGroupY + groupOffsetY;
260 				auto groupX = baseGroupX + groupOffsetX;
261 				MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
262 
263 				using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
264 				std::queue<Coroutine> coroutines;
265 
266 				if(modes.ContainsControlBarriers)
267 				{
268 					// Make a function call per subgroup so each subgroup
269 					// can yield, bringing all subgroups to the barrier
270 					// together.
271 					for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
272 					{
273 						auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
274 						coroutines.push(std::move(coroutine));
275 					}
276 				}
277 				else
278 				{
279 					auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
280 					coroutines.push(std::move(coroutine));
281 				}
282 
283 				while(coroutines.size() > 0)
284 				{
285 					auto coroutine = std::move(coroutines.front());
286 					coroutines.pop();
287 
288 					SpirvShader::YieldResult result;
289 					if(coroutine->await(result))
290 					{
291 						// TODO: Consider result (when the enum is more than 1 entry).
292 						coroutines.push(std::move(coroutine));
293 					}
294 				}
295 			}
296 		});
297 	}
298 
299 	wg.wait();
300 }
301 
302 }  // namespace sw
303