• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "ComputeProgram.hpp"
16 
17 #include "Constants.hpp"
18 #include "System/Debug.hpp"
19 #include "Vulkan/VkDevice.hpp"
20 #include "Vulkan/VkPipelineLayout.hpp"
21 
22 #include "marl/defer.h"
23 #include "marl/trace.h"
24 #include "marl/waitgroup.h"
25 
26 #include <queue>
27 
namespace {

// Lane / array indices for the three dimensions of workgroup and invocation
// vectors used throughout this file (e.g. workgroupID[X], Extract(v, Y)).
enum
{
	X = 0,
	Y = 1,
	Z = 2
};

}  // anonymous namespace
38 
39 namespace sw {
40 
ComputeProgram(vk::Device * device,std::shared_ptr<SpirvShader> shader,vk::PipelineLayout const * pipelineLayout,const vk::DescriptorSet::Bindings & descriptorSets)41 ComputeProgram::ComputeProgram(vk::Device *device, std::shared_ptr<SpirvShader> shader, vk::PipelineLayout const *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
42     : device(device)
43     , shader(shader)
44     , pipelineLayout(pipelineLayout)
45     , descriptorSets(descriptorSets)
46 {
47 }
48 
~ComputeProgram()49 ComputeProgram::~ComputeProgram()
50 {
51 }
52 
generate()53 void ComputeProgram::generate()
54 {
55 	MARL_SCOPED_EVENT("ComputeProgram::generate");
56 
57 	SpirvRoutine routine(pipelineLayout);
58 	shader->emitProlog(&routine);
59 	emit(&routine);
60 	shader->emitEpilog(&routine);
61 	shader->clearPhis(&routine);
62 }
63 
setWorkgroupBuiltins(Pointer<Byte> data,SpirvRoutine * routine,Int workgroupID[3])64 void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3])
65 {
66 	// TODO(b/146486064): Consider only assigning these to the SpirvRoutine iff
67 	// they are ever going to be read.
68 	routine->numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
69 	routine->workgroupID = Insert(Insert(Insert(Int4(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
70 	routine->workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
71 	routine->subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
72 	routine->invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));
73 
74 	routine->setInputBuiltin(shader.get(), spv::BuiltInNumWorkgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
75 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
76 		{
77 			value[builtin.FirstComponent + component] =
78 			    As<SIMD::Float>(SIMD::Int(Extract(routine->numWorkgroups, component)));
79 		}
80 	});
81 
82 	routine->setInputBuiltin(shader.get(), spv::BuiltInWorkgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
83 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
84 		{
85 			value[builtin.FirstComponent + component] =
86 			    As<SIMD::Float>(SIMD::Int(workgroupID[component]));
87 		}
88 	});
89 
90 	routine->setInputBuiltin(shader.get(), spv::BuiltInWorkgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
91 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
92 		{
93 			value[builtin.FirstComponent + component] =
94 			    As<SIMD::Float>(SIMD::Int(Extract(routine->workgroupSize, component)));
95 		}
96 	});
97 
98 	routine->setInputBuiltin(shader.get(), spv::BuiltInNumSubgroups, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
99 		ASSERT(builtin.SizeInComponents == 1);
100 		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->subgroupsPerWorkgroup));
101 	});
102 
103 	routine->setInputBuiltin(shader.get(), spv::BuiltInSubgroupSize, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
104 		ASSERT(builtin.SizeInComponents == 1);
105 		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->invocationsPerSubgroup));
106 	});
107 
108 	routine->setImmutableInputBuiltins(shader.get());
109 }
110 
setSubgroupBuiltins(Pointer<Byte> data,SpirvRoutine * routine,Int workgroupID[3],SIMD::Int localInvocationIndex,Int subgroupIndex)111 void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
112 {
113 	Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
114 	Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
115 
116 	// TODO: Fix Int4 swizzles so we can just use workgroupSize.x, workgroupSize.y.
117 	Int workgroupSizeX = Extract(workgroupSize, X);
118 	Int workgroupSizeY = Extract(workgroupSize, Y);
119 
120 	SIMD::Int localInvocationID[3];
121 	{
122 		SIMD::Int idx = localInvocationIndex;
123 		localInvocationID[Z] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
124 		idx -= localInvocationID[Z] * SIMD::Int(workgroupSizeX * workgroupSizeY);  // modulo
125 		localInvocationID[Y] = idx / SIMD::Int(workgroupSizeX);
126 		idx -= localInvocationID[Y] * SIMD::Int(workgroupSizeX);  // modulo
127 		localInvocationID[X] = idx;
128 	}
129 
130 	Int4 wgID = Insert(Insert(Insert(SIMD::Int(0), workgroupID[X], X), workgroupID[Y], Y), workgroupID[Z], Z);
131 	auto localBase = workgroupSize * wgID;
132 	SIMD::Int globalInvocationID[3];
133 	globalInvocationID[X] = SIMD::Int(Extract(localBase, X)) + localInvocationID[X];
134 	globalInvocationID[Y] = SIMD::Int(Extract(localBase, Y)) + localInvocationID[Y];
135 	globalInvocationID[Z] = SIMD::Int(Extract(localBase, Z)) + localInvocationID[Z];
136 
137 	routine->localInvocationIndex = localInvocationIndex;
138 	routine->subgroupIndex = subgroupIndex;
139 	routine->localInvocationID[X] = localInvocationID[X];
140 	routine->localInvocationID[Y] = localInvocationID[Y];
141 	routine->localInvocationID[Z] = localInvocationID[Z];
142 	routine->globalInvocationID[X] = globalInvocationID[X];
143 	routine->globalInvocationID[Y] = globalInvocationID[Y];
144 	routine->globalInvocationID[Z] = globalInvocationID[Z];
145 
146 	routine->setInputBuiltin(shader.get(), spv::BuiltInLocalInvocationIndex, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
147 		ASSERT(builtin.SizeInComponents == 1);
148 		value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
149 	});
150 
151 	routine->setInputBuiltin(shader.get(), spv::BuiltInSubgroupId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
152 		ASSERT(builtin.SizeInComponents == 1);
153 		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
154 	});
155 
156 	routine->setInputBuiltin(shader.get(), spv::BuiltInLocalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
157 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
158 		{
159 			value[builtin.FirstComponent + component] =
160 			    As<SIMD::Float>(localInvocationID[component]);
161 		}
162 	});
163 
164 	routine->setInputBuiltin(shader.get(), spv::BuiltInGlobalInvocationId, [&](const SpirvShader::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
165 		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
166 		{
167 			value[builtin.FirstComponent + component] =
168 			    As<SIMD::Float>(globalInvocationID[component]);
169 		}
170 	});
171 }
172 
emit(SpirvRoutine * routine)173 void ComputeProgram::emit(SpirvRoutine *routine)
174 {
175 	Pointer<Byte> device = Arg<0>();
176 	Pointer<Byte> data = Arg<1>();
177 	Int workgroupX = Arg<2>();
178 	Int workgroupY = Arg<3>();
179 	Int workgroupZ = Arg<4>();
180 	Pointer<Byte> workgroupMemory = Arg<5>();
181 	Int firstSubgroup = Arg<6>();
182 	Int subgroupCount = Arg<7>();
183 
184 	routine->device = device;
185 	routine->descriptorSets = data + OFFSET(Data, descriptorSets);
186 	routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
187 	routine->pushConstants = data + OFFSET(Data, pushConstants);
188 	routine->constants = device + OFFSET(vk::Device, constants);
189 	routine->workgroupMemory = workgroupMemory;
190 
191 	Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
192 
193 	Int workgroupID[3] = { workgroupX, workgroupY, workgroupZ };
194 	setWorkgroupBuiltins(data, routine, workgroupID);
195 
196 	For(Int i = 0, i < subgroupCount, i++)
197 	{
198 		auto subgroupIndex = firstSubgroup + i;
199 
200 		// TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
201 		auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);
202 
203 		// Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
204 		auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));
205 
206 		setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);
207 
208 		shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
209 	}
210 }
211 
run(vk::DescriptorSet::Array const & descriptorSetObjects,vk::DescriptorSet::Bindings const & descriptorSets,vk::DescriptorSet::DynamicOffsets const & descriptorDynamicOffsets,vk::Pipeline::PushConstantStorage const & pushConstants,uint32_t baseGroupX,uint32_t baseGroupY,uint32_t baseGroupZ,uint32_t groupCountX,uint32_t groupCountY,uint32_t groupCountZ)212 void ComputeProgram::run(
213     vk::DescriptorSet::Array const &descriptorSetObjects,
214     vk::DescriptorSet::Bindings const &descriptorSets,
215     vk::DescriptorSet::DynamicOffsets const &descriptorDynamicOffsets,
216     vk::Pipeline::PushConstantStorage const &pushConstants,
217     uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
218     uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
219 {
220 	uint32_t workgroupSizeX = shader->getWorkgroupSizeX();
221 	uint32_t workgroupSizeY = shader->getWorkgroupSizeY();
222 	uint32_t workgroupSizeZ = shader->getWorkgroupSizeZ();
223 
224 	auto invocationsPerSubgroup = SIMD::Width;
225 	auto invocationsPerWorkgroup = workgroupSizeX * workgroupSizeY * workgroupSizeZ;
226 	auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
227 
228 	Data data;
229 	data.descriptorSets = descriptorSets;
230 	data.descriptorDynamicOffsets = descriptorDynamicOffsets;
231 	data.numWorkgroups[X] = groupCountX;
232 	data.numWorkgroups[Y] = groupCountY;
233 	data.numWorkgroups[Z] = groupCountZ;
234 	data.numWorkgroups[3] = 0;
235 	data.workgroupSize[X] = workgroupSizeX;
236 	data.workgroupSize[Y] = workgroupSizeY;
237 	data.workgroupSize[Z] = workgroupSizeZ;
238 	data.workgroupSize[3] = 0;
239 	data.invocationsPerSubgroup = invocationsPerSubgroup;
240 	data.invocationsPerWorkgroup = invocationsPerWorkgroup;
241 	data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
242 	data.pushConstants = pushConstants;
243 
244 	marl::WaitGroup wg;
245 	const uint32_t batchCount = 16;
246 
247 	auto groupCount = groupCountX * groupCountY * groupCountZ;
248 
249 	for(uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
250 	{
251 		wg.add(1);
252 		marl::schedule([=, &data] {
253 			defer(wg.done());
254 			std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
255 
256 			for(uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
257 			{
258 				auto modulo = groupIndex;
259 				auto groupOffsetZ = modulo / (groupCountX * groupCountY);
260 				modulo -= groupOffsetZ * (groupCountX * groupCountY);
261 				auto groupOffsetY = modulo / groupCountX;
262 				modulo -= groupOffsetY * groupCountX;
263 				auto groupOffsetX = modulo;
264 
265 				auto groupZ = baseGroupZ + groupOffsetZ;
266 				auto groupY = baseGroupY + groupOffsetY;
267 				auto groupX = baseGroupX + groupOffsetX;
268 				MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
269 
270 				using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
271 				std::queue<Coroutine> coroutines;
272 
273 				if(shader->getAnalysis().ContainsControlBarriers)
274 				{
275 					// Make a function call per subgroup so each subgroup
276 					// can yield, bringing all subgroups to the barrier
277 					// together.
278 					for(uint32_t subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
279 					{
280 						auto coroutine = (*this)(device, &data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
281 						coroutines.push(std::move(coroutine));
282 					}
283 				}
284 				else
285 				{
286 					auto coroutine = (*this)(device, &data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
287 					coroutines.push(std::move(coroutine));
288 				}
289 
290 				while(coroutines.size() > 0)
291 				{
292 					auto coroutine = std::move(coroutines.front());
293 					coroutines.pop();
294 
295 					SpirvShader::YieldResult result;
296 					if(coroutine->await(result))
297 					{
298 						// TODO: Consider result (when the enum is more than 1 entry).
299 						coroutines.push(std::move(coroutine));
300 					}
301 				}
302 			}
303 		});
304 	}
305 
306 	wg.wait();
307 
308 	if(shader->containsImageWrite())
309 	{
310 		vk::DescriptorSet::ContentsChanged(descriptorSetObjects, pipelineLayout, device);
311 	}
312 }
313 
314 }  // namespace sw
315