/*
 * Copyright 2022 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "tests/Test.h"

#include "include/gpu/graphite/Context.h"
#include "include/gpu/graphite/Recorder.h"
#include "include/gpu/graphite/Recording.h"
#include "src/gpu/graphite/Buffer.h"
#include "src/gpu/graphite/Caps.h"
#include "src/gpu/graphite/ComputePassTask.h"
#include "src/gpu/graphite/ComputePipelineDesc.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/RecorderPriv.h"
#include "src/gpu/graphite/ResourceProvider.h"
#include "src/gpu/graphite/SynchronizeToCpuTask.h"
#include "src/gpu/graphite/compute/ComputeStep.h"

using namespace skgpu::graphite;

// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(ComputeTaskTest, reporter, context) {
    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep("TestArrayMultiply", {}, {}) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
            return R"(
                layout(set=0, binding=0) readonly buffer inputBlock
                {
                    float factor;
                    float in_data[];
                };
                layout(set=0, binding=1) buffer outputBlock
                {
                    float out_data[];
                };
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }
    };
    TestComputeStep step;
    ComputePipelineDesc pipelineDesc(&step);

    ResourceProvider* provider = recorder->priv().resourceProvider();
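    // The input buffer holds the multiplication factor followed by the kProblemSize data
    // elements (hence kProblemSize + 1 floats); both buffer sizes are rounded up to the
    // backend's required storage buffer alignment.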
    size_t inputSize = SkAlignTo(sizeof(float) * (kProblemSize + 1),
                                 recorder->priv().caps()->requiredStorageBufferAlignment());
    sk_sp<Buffer> inputBuffer = provider->findOrCreateBuffer(
            inputSize, BufferType::kStorage, PrioritizeGpuReads::kNo);
    size_t outputSize = SkAlignTo(sizeof(float) * kProblemSize,
                                  recorder->priv().caps()->requiredStorageBufferAlignment());
    sk_sp<Buffer> outputBuffer = provider->findOrCreateBuffer(
            outputSize, BufferType::kStorage, PrioritizeGpuReads::kNo);

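    // Attach the buffers at the binding indices declared in the kernel: the input block at
    // binding 0 and the output block at binding 1.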
    std::vector<ResourceBinding> bindings;
    bindings.push_back({/*index=*/0, {inputBuffer.get(), /*offset=*/0}});
    bindings.push_back({/*index=*/1, {outputBuffer.get(), /*offset=*/0}});

    // Initialize "in_data" to an ascending sequence of integers starting at 1 and initialize
    // "out_data" to all -1s.
    {
        float* inData = static_cast<float*>(inputBuffer->map());
        float* outData = static_cast<float*>(outputBuffer->map());
        SkASSERT(inputBuffer->isMapped() && inData != nullptr);
        SkASSERT(outputBuffer->isMapped() && outData != nullptr);

        inData[0] = kFactor;  // "factor"
        for (unsigned int i = 0; i < kProblemSize; ++i) {
            inData[i + 1] = i + 1;
            outData[i] = -1;
        }
        inputBuffer->unmap();
        outputBuffer->unmap();
    }

    ComputePassDesc desc;
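    // Use a workgroup size equal to kProblemSize so that each invocation maps to a single
    // array element.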
    desc.fLocalDispatchSize = WorkgroupSize(kProblemSize, 1, 1);

    // Record the compute pass task.
    recorder->priv().add(ComputePassTask::Make(std::move(bindings), pipelineDesc, desc));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    recorder->priv().add(SynchronizeToCpuTask::Make(outputBuffer));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    context->submit(SyncToCpu::kYes);

    // Verify the contents of the output buffer.
    {
        float* inData = static_cast<float*>(inputBuffer->map());
        float* outData = static_cast<float*>(outputBuffer->map());
        SkASSERT(inputBuffer->isMapped() && inData != nullptr);
        SkASSERT(outputBuffer->isMapped() && outData != nullptr);
        for (unsigned int i = 0; i < kProblemSize; ++i) {
            const float expected = inData[i + 1] * kFactor;
            const float found = outData[i];
            REPORTER_ASSERT(
                    reporter, expected == found, "expected '%f', found '%f'", expected, found);
        }
        inputBuffer->unmap();
        outputBuffer->unmap();
    }
}

// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
// features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(ComputeShaderAtomicOperationsTest, reporter, context) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep("TestAtomicOperations", {}, {}) {}
        ~TestComputeStep() override = default;

        // A kernel that increments a global (device memory) counter across multiple workgroups.
        // Each workgroup maintains its own independent tally in a workgroup-shared counter which
        // is then added to the global count.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
            return R"(
                layout(metal, binding = 0) buffer ssbo {
                    atomicUint globalCounter;
                };

                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }
    };
    TestComputeStep step;
    ComputePipelineDesc pipelineDesc(&step);

    ResourceProvider* provider = recorder->priv().resourceProvider();
    size_t minSize = SkAlignTo(sizeof(uint32_t),
                               recorder->priv().caps()->requiredStorageBufferAlignment());
    sk_sp<Buffer> ssbo = provider->findOrCreateBuffer(
            minSize, BufferType::kStorage, PrioritizeGpuReads::kNo);

    std::vector<ResourceBinding> bindings;
    bindings.push_back({/*index=*/0, {ssbo.get(), /*offset=*/0}});

    // Initialize the global counter to 0.
    {
        uint32_t* ssboData = static_cast<uint32_t*>(ssbo->map());
        ssboData[0] = 0;
        ssbo->unmap();
    }

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 1024;
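    // All kWorkgroupCount * kWorkgroupSize invocations add 1 to the global counter, so the
    // verification below expects exactly that total.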

    ComputePassDesc desc;
    desc.fGlobalDispatchSize = WorkgroupSize(kWorkgroupCount, 1, 1);
    desc.fLocalDispatchSize = WorkgroupSize(kWorkgroupSize, 1, 1);

    // Record the compute pass task.
    recorder->priv().add(ComputePassTask::Make(std::move(bindings), pipelineDesc, desc));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    recorder->priv().add(SynchronizeToCpuTask::Make(ssbo));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    context->submit(SyncToCpu::kYes);

    // Verify the contents of the output buffer.
    {
        constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
        const uint32_t result = static_cast<const uint32_t*>(ssbo->map())[0];
        REPORTER_ASSERT(reporter,
                        result == kExpectedCount,
                        "expected '%u', found '%u'",
                        kExpectedCount, result);
        ssbo->unmap();
    }
}

// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
// to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(ComputeShaderAtomicOperationsOverArrayAndStructTest,
                                    reporter,
                                    context) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep("TestAtomicOperationsOverArrayAndStruct", {}, {}) {}
        ~TestComputeStep() override = default;

        // A kernel that increments two global (device memory) counters across multiple
        // workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
        // counters which are then added to the global counts.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL(const ResourceBindingRequirements&, int) const override {
            return R"(
                const uint WORKGROUP_SIZE = 1024;

                struct GlobalCounts {
                    atomicUint firstHalfCount;
                    atomicUint secondHalfCount;
                };
                layout(metal, binding = 0) buffer ssbo {
                    GlobalCounts globalCounts;
                };

                workgroup atomicUint localCounts[2];

                void main() {
                    // Initialize the local counts.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounts[0], 0);
                        atomicStore(localCounts[1], 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // Each thread increments one of the local counters based on its invocation
                    // index.
                    uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
                    atomicAdd(localCounts[idx], 1);

                    // Synchronize the threads again to ensure they have all executed the increments
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounts.firstHalfCount, atomicLoad(localCounts[0]));
                        atomicAdd(globalCounts.secondHalfCount, atomicLoad(localCounts[1]));
                    }
                }
            )";
        }
    };
    TestComputeStep step;
    ComputePipelineDesc pipelineDesc(&step);

    ResourceProvider* provider = recorder->priv().resourceProvider();
    size_t minSize = SkAlignTo(2 * sizeof(uint32_t),
                               recorder->priv().caps()->requiredStorageBufferAlignment());
    sk_sp<Buffer> ssbo = provider->findOrCreateBuffer(
            minSize, BufferType::kStorage, PrioritizeGpuReads::kNo);

    std::vector<ResourceBinding> bindings;
    bindings.push_back({/*index=*/0, {ssbo.get(), /*offset=*/0}});

    // Initialize both global counters to 0.
    {
        uint32_t* ssboData = static_cast<uint32_t*>(ssbo->map());
        ssboData[0] = 0;
        ssboData[1] = 0;
        ssbo->unmap();
    }

    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 1024;
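    // Each invocation increments exactly one of the two counters, split evenly by invocation
    // index, so each counter should end up at kWorkgroupCount * kWorkgroupSize / 2.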

    ComputePassDesc desc;
    desc.fGlobalDispatchSize = WorkgroupSize(kWorkgroupCount, 1, 1);
    desc.fLocalDispatchSize = WorkgroupSize(kWorkgroupSize, 1, 1);

    // Record the compute pass task.
    recorder->priv().add(ComputePassTask::Make(std::move(bindings), pipelineDesc, desc));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    recorder->priv().add(SynchronizeToCpuTask::Make(ssbo));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    context->submit(SyncToCpu::kYes);

    // Verify the contents of the output buffer.
    {
        constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;

        const uint32_t* ssboData = static_cast<const uint32_t*>(ssbo->map());
        const uint32_t firstHalfCount = ssboData[0];
        const uint32_t secondHalfCount = ssboData[1];
        REPORTER_ASSERT(reporter,
                        firstHalfCount == kExpectedCount,
                        "expected '%u', found '%u'",
                        kExpectedCount, firstHalfCount);
        REPORTER_ASSERT(reporter,
                        secondHalfCount == kExpectedCount,
                        "expected '%u', found '%u'",
                        kExpectedCount, secondHalfCount);
        ssbo->unmap();
    }
}