// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "ComputeProgram.hpp"

#include "Constants.hpp"
#include "System/Debug.hpp"
#include "Vulkan/VkDevice.hpp"
#include "Vulkan/VkPipelineLayout.hpp"

#include "marl/defer.h"
#include "marl/trace.h"
#include "marl/waitgroup.h"

#include <queue>

namespace sw {

ComputeProgram::ComputeProgram(vk::Device *device, std::shared_ptr<SpirvShader> shader, const vk::PipelineLayout *pipelineLayout, const vk::DescriptorSet::Bindings &descriptorSets)
    : device(device)
    , shader(shader)
    , pipelineLayout(pipelineLayout)
    , descriptorSets(descriptorSets)
{
}

ComputeProgram::~ComputeProgram()
{
}

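// Generates the specialized routine for this compute program: the shader
// prolog, the per-subgroup body emitted by emit(), and the epilog.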
void ComputeProgram::generate()
{
	MARL_SCOPED_EVENT("ComputeProgram::generate");

	SpirvRoutine routine(pipelineLayout);
	shader->emitProlog(&routine);
	emit(&routine);
	shader->emitEpilog(&routine);
}

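// Writes the per-workgroup built-in inputs (NumWorkgroups, WorkgroupId,
// WorkgroupSize, NumSubgroups, SubgroupSize) into the routine. These values
// are uniform across the workgroup, so each is broadcast to all SIMD lanes.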
void ComputeProgram::setWorkgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3])
{
	// TODO(b/146486064): Consider only assigning these to the SpirvRoutine iff they are ever going to be read.
	routine->numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
	routine->workgroupID = Insert(Insert(Insert(Int4(0), workgroupID[0], 0), workgroupID[1], 1), workgroupID[2], 2);
	routine->workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));
	routine->subgroupsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, subgroupsPerWorkgroup));
	routine->invocationsPerSubgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerSubgroup));

	routine->setInputBuiltin(shader.get(), spv::BuiltInNumWorkgroups, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(routine->numWorkgroups.x));
		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(routine->numWorkgroups.y));
		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(routine->numWorkgroups.z));
	});

	routine->setInputBuiltin(shader.get(), spv::BuiltInWorkgroupId, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(workgroupID[0]));
		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(workgroupID[1]));
		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(workgroupID[2]));
	});

	routine->setInputBuiltin(shader.get(), spv::BuiltInWorkgroupSize, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		value[builtin.FirstComponent + 0] = As<SIMD::Float>(SIMD::Int(routine->workgroupSize.x));
		value[builtin.FirstComponent + 1] = As<SIMD::Float>(SIMD::Int(routine->workgroupSize.y));
		value[builtin.FirstComponent + 2] = As<SIMD::Float>(SIMD::Int(routine->workgroupSize.z));
	});

	routine->setInputBuiltin(shader.get(), spv::BuiltInNumSubgroups, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->subgroupsPerWorkgroup));
	});

	routine->setInputBuiltin(shader.get(), spv::BuiltInSubgroupSize, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(routine->invocationsPerSubgroup));
	});

	routine->setImmutableInputBuiltins(shader.get());
}

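// Writes the per-subgroup built-in inputs (LocalInvocationIndex, SubgroupId,
// LocalInvocationId, GlobalInvocationId) into the routine for the subgroup
// identified by subgroupIndex.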
void ComputeProgram::setSubgroupBuiltins(Pointer<Byte> data, SpirvRoutine *routine, Int workgroupID[3], SIMD::Int localInvocationIndex, Int subgroupIndex)
{
	Int4 numWorkgroups = *Pointer<Int4>(data + OFFSET(Data, numWorkgroups));
	Int4 workgroupSize = *Pointer<Int4>(data + OFFSET(Data, workgroupSize));

	Int workgroupSizeX = workgroupSize.x;
	Int workgroupSizeY = workgroupSize.y;

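	// Decompose the linear local invocation index into 3D coordinates:
	// index == x + y * sizeX + z * sizeX * sizeY, so peel off z, then y, then x.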
	SIMD::Int localInvocationID[3];
	{
		SIMD::Int idx = localInvocationIndex;
		localInvocationID[2] = idx / SIMD::Int(workgroupSizeX * workgroupSizeY);
		idx -= localInvocationID[2] * SIMD::Int(workgroupSizeX * workgroupSizeY);  // modulo
		localInvocationID[1] = idx / SIMD::Int(workgroupSizeX);
		idx -= localInvocationID[1] * SIMD::Int(workgroupSizeX);  // modulo
		localInvocationID[0] = idx;
	}

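	// The workgroup's first invocation has global ID workgroupSize * workgroupID;
	// each lane adds its local invocation ID to that base.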
	Int4 wgID = Insert(Insert(Insert(Int4(0), workgroupID[0], 0), workgroupID[1], 1), workgroupID[2], 2);
	auto localBase = workgroupSize * wgID;
	SIMD::Int globalInvocationID[3];
	globalInvocationID[0] = SIMD::Int(Extract(localBase, 0)) + localInvocationID[0];
	globalInvocationID[1] = SIMD::Int(Extract(localBase, 1)) + localInvocationID[1];
	globalInvocationID[2] = SIMD::Int(Extract(localBase, 2)) + localInvocationID[2];

	routine->localInvocationIndex = localInvocationIndex;
	routine->subgroupIndex = subgroupIndex;
	routine->localInvocationID[0] = localInvocationID[0];
	routine->localInvocationID[1] = localInvocationID[1];
	routine->localInvocationID[2] = localInvocationID[2];
	routine->globalInvocationID[0] = globalInvocationID[0];
	routine->globalInvocationID[1] = globalInvocationID[1];
	routine->globalInvocationID[2] = globalInvocationID[2];

	routine->setInputBuiltin(shader.get(), spv::BuiltInLocalInvocationIndex, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		ASSERT(builtin.SizeInComponents == 1);
		value[builtin.FirstComponent] = As<SIMD::Float>(localInvocationIndex);
	});

	routine->setInputBuiltin(shader.get(), spv::BuiltInSubgroupId, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		ASSERT(builtin.SizeInComponents == 1);
		value[builtin.FirstComponent] = As<SIMD::Float>(SIMD::Int(subgroupIndex));
	});

	routine->setInputBuiltin(shader.get(), spv::BuiltInLocalInvocationId, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
		{
			value[builtin.FirstComponent + component] =
			    As<SIMD::Float>(localInvocationID[component]);
		}
	});

	routine->setInputBuiltin(shader.get(), spv::BuiltInGlobalInvocationId, [&](const Spirv::BuiltinMapping &builtin, Array<SIMD::Float> &value) {
		for(uint32_t component = 0; component < builtin.SizeInComponents; component++)
		{
			value[builtin.FirstComponent + component] =
			    As<SIMD::Float>(globalInvocationID[component]);
		}
	});
}

void ComputeProgram::emit(SpirvRoutine *routine)
{
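	// Coroutine arguments, in the order they are passed by the operator()
	// invocations in ComputeProgram::run() below.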
	Pointer<Byte> device = Arg<0>();
	Pointer<Byte> data = Arg<1>();
	Int workgroupX = Arg<2>();
	Int workgroupY = Arg<3>();
	Int workgroupZ = Arg<4>();
	Pointer<Byte> workgroupMemory = Arg<5>();
	Int firstSubgroup = Arg<6>();
	Int subgroupCount = Arg<7>();

	routine->device = device;
	routine->descriptorSets = data + OFFSET(Data, descriptorSets);
	routine->descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
	routine->pushConstants = data + OFFSET(Data, pushConstants);
	routine->constants = device + OFFSET(vk::Device, constants);
	routine->workgroupMemory = workgroupMemory;

	Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));

	Int workgroupID[3] = { workgroupX, workgroupY, workgroupZ };
	setWorkgroupBuiltins(data, routine, workgroupID);

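	// Loop over the subgroups assigned to this coroutine invocation. run()
	// passes subgroupCount == 1 when the shader contains control barriers,
	// and the workgroup's full subgroup range otherwise.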
	For(Int i = 0, i < subgroupCount, i++)
	{
		auto subgroupIndex = firstSubgroup + i;

		// TODO: Replace SIMD::Int(0, 1, 2, 3) with SIMD-width equivalent
		auto localInvocationIndex = SIMD::Int(subgroupIndex * SIMD::Width) + SIMD::Int(0, 1, 2, 3);

		// Disable lanes where (invocationIDs >= invocationsPerWorkgroup)
		auto activeLaneMask = CmpLT(localInvocationIndex, SIMD::Int(invocationsPerWorkgroup));

		setSubgroupBuiltins(data, routine, workgroupID, localInvocationIndex, subgroupIndex);

		shader->emit(routine, activeLaneMask, activeLaneMask, descriptorSets);
	}
}

void ComputeProgram::run(
    const vk::DescriptorSet::Array &descriptorSetObjects,
    const vk::DescriptorSet::Bindings &descriptorSets,
    const vk::DescriptorSet::DynamicOffsets &descriptorDynamicOffsets,
    const vk::Pipeline::PushConstantStorage &pushConstants,
    uint32_t baseGroupX, uint32_t baseGroupY, uint32_t baseGroupZ,
    uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
{
	uint32_t workgroupSizeX = shader->getWorkgroupSizeX();
	uint32_t workgroupSizeY = shader->getWorkgroupSizeY();
	uint32_t workgroupSizeZ = shader->getWorkgroupSizeZ();

	auto invocationsPerSubgroup = SIMD::Width;
	auto invocationsPerWorkgroup = workgroupSizeX * workgroupSizeY * workgroupSizeZ;
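	// Round up: the last subgroup of a workgroup may be partially filled;
	// emit() disables the excess lanes via the active lane mask.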
	auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;

	Data data;
	data.descriptorSets = descriptorSets;
	data.descriptorDynamicOffsets = descriptorDynamicOffsets;
	data.numWorkgroups[0] = groupCountX;
	data.numWorkgroups[1] = groupCountY;
	data.numWorkgroups[2] = groupCountZ;
	data.workgroupSize[0] = workgroupSizeX;
	data.workgroupSize[1] = workgroupSizeY;
	data.workgroupSize[2] = workgroupSizeZ;
	data.invocationsPerSubgroup = invocationsPerSubgroup;
	data.invocationsPerWorkgroup = invocationsPerWorkgroup;
	data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
	data.pushConstants = pushConstants;

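	// Fan the workgroups out over up to batchCount scheduler tasks. Each task
	// owns its own workgroup-local memory and processes every batchCount'th
	// workgroup, so at most batchCount workgroups execute concurrently.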
	marl::WaitGroup wg;
	const uint32_t batchCount = 16;

	auto groupCount = groupCountX * groupCountY * groupCountZ;

	for(uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
	{
		wg.add(1);
		marl::schedule([=, &data] {
			defer(wg.done());
			std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());

			for(uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
			{
				auto modulo = groupIndex;
				auto groupOffsetZ = modulo / (groupCountX * groupCountY);
				modulo -= groupOffsetZ * (groupCountX * groupCountY);
				auto groupOffsetY = modulo / groupCountX;
				modulo -= groupOffsetY * groupCountX;
				auto groupOffsetX = modulo;

				auto groupZ = baseGroupZ + groupOffsetZ;
				auto groupY = baseGroupY + groupOffsetY;
				auto groupX = baseGroupX + groupOffsetX;
				MARL_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);

				using Coroutine = std::unique_ptr<rr::Stream<SpirvEmitter::YieldResult>>;
				std::queue<Coroutine> coroutines;

				if(shader->getAnalysis().ContainsControlBarriers)
				{
					// Make a function call per subgroup so each subgroup
					// can yield, bringing all subgroups to the barrier
					// together.
					for(uint32_t subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
					{
						auto coroutine = (*this)(device, &data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
						coroutines.push(std::move(coroutine));
					}
				}
				else
				{
					auto coroutine = (*this)(device, &data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
					coroutines.push(std::move(coroutine));
				}

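				// Drain the coroutines round-robin. A coroutine which yields
				// at a control barrier is re-queued, so every subgroup reaches
				// the barrier before any of them is resumed past it.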
				while(coroutines.size() > 0)
				{
					auto coroutine = std::move(coroutines.front());
					coroutines.pop();

					SpirvEmitter::YieldResult result;
					if(coroutine->await(result))
					{
						// TODO: Consider result (when the enum is more than 1 entry).
						coroutines.push(std::move(coroutine));
					}
				}
			}
		});
	}

	wg.wait();

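	// The shader may have written to storage images; notify the bound
	// descriptor sets that their contents changed.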
	if(shader->containsImageWrite())
	{
		vk::DescriptorSet::ContentsChanged(descriptorSetObjects, pipelineLayout, device);
	}
}

}  // namespace sw