/*
 * Copyright 2022 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "tests/Test.h"

#include "include/core/SkBitmap.h"
#include "include/gpu/graphite/Context.h"
#include "include/gpu/graphite/Recorder.h"
#include "include/gpu/graphite/Recording.h"
#include "src/gpu/graphite/Buffer.h"
#include "src/gpu/graphite/Caps.h"
#include "src/gpu/graphite/ComputePipelineDesc.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/ContextPriv.h"
#include "src/gpu/graphite/RecorderPriv.h"
#include "src/gpu/graphite/ResourceProvider.h"
#include "src/gpu/graphite/UniformManager.h"
#include "src/gpu/graphite/compute/ComputeStep.h"
#include "src/gpu/graphite/compute/DispatchGroup.h"
#include "src/gpu/graphite/task/ComputeTask.h"
#include "src/gpu/graphite/task/CopyTask.h"
#include "src/gpu/graphite/task/SynchronizeToCpuTask.h"
#include "src/gpu/graphite/task/UploadTask.h"

#include "tools/graphite/GraphiteTestContext.h"

using namespace skgpu::graphite;
using namespace skiatest::graphite;

namespace {

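// Maps `buffer` for CPU access and returns a pointer offset by `offset` bytes. On backends where
// buffer maps are asynchronous, this spins on GraphiteTestContext::tick() until the map completes.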
void* map_buffer(Context* context,
                 skiatest::graphite::GraphiteTestContext* testContext,
                 Buffer* buffer,
                 size_t offset) {
    SkASSERT(buffer);
    if (context->priv().caps()->bufferMapsAreAsync()) {
        buffer->asyncMap();
        while (!buffer->isMapped()) {
            testContext->tick();
        }
    }
    std::byte* ptr = static_cast<std::byte*>(buffer->map());
    SkASSERT(ptr);

    return ptr + offset;
}

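// Returns a buffer that will hold `buffer`'s contents after the recording is submitted: either
// `buffer` itself (when the backend allows mapping it for read-back) or a transfer buffer that a
// GPU-to-GPU copy task fills from `buffer`.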
sk_sp<Buffer> sync_buffer_to_cpu(Recorder* recorder, const Buffer* buffer) {
    if (recorder->priv().caps()->drawBufferCanBeMappedForReadback()) {
        // `buffer` can be mapped directly; however, it may still require a synchronization step
        // by the underlying API (e.g. a managed buffer in Metal). SynchronizeToCpuTask
        // automatically handles this for us.
        recorder->priv().add(SynchronizeToCpuTask::Make(sk_ref_sp(buffer)));
        return sk_ref_sp(buffer);
    }

    // The backend requires a transfer buffer for CPU read-back.
    auto xferBuffer =
            recorder->priv().resourceProvider()->findOrCreateBuffer(buffer->size(),
                                                                    BufferType::kXferGpuToCpu,
                                                                    AccessPattern::kHostVisible,
                                                                    "ComputeTest_TransferToCpu");
    SkASSERT(xferBuffer);

    recorder->priv().add(CopyBufferToBufferTask::Make(buffer,
                                                      /*srcOffset=*/0,
                                                      xferBuffer,
                                                      /*dstOffset=*/0,
                                                      buffer->size()));
    return xferBuffer;
}

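// Snaps a Recording from `recorder`, schedules it on `context`, and blocks until the submitted
// GPU work completes.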
std::unique_ptr<Recording> submit_recording(Context* context,
                                            GraphiteTestContext* testContext,
                                            Recorder* recorder) {
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        return nullptr;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    return recording;
}

bool is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType) {
    return skiatest::IsDawnContextType(ctxType) || skiatest::IsMetalContextType(ctxType);
}

}  // namespace

#define DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(           \
        name, reporter, graphite_context, test_context)          \
    DEF_GRAPHITE_TEST_FOR_CONTEXTS(name,                         \
                                   is_dawn_or_metal_context_type,\
                                   reporter,                     \
                                   graphite_context,             \
                                   test_context,                 \
                                   CtsEnforcement::kNever)

// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SingleDispatchTest,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
        // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
        TestComputeStep() : ComputeStep(
                /*name=*/"TestArrayMultiply",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock {\n"
                                 "    float factor;\n"
                                 "    layout(offset=16) float4 in_data[];\n"
                                 "}",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

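        // Resource 0's block is a single `factor` float padded out to 16 bytes (per the
        // layout(offset=16) qualifier above), followed by kProblemSize floats packed as float4
        // vectors, hence kProblemSize + 4 floats in total.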
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * (kProblemSize + 4);
            }
            SkASSERT(index == 1);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only initialize the input buffer.
            if (resourceIndex != 0) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);

            size_t dataCount = kProblemSize + 4;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            inData[0] = kFactor;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }

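        // A single workgroup suffices: the local size is kWorkgroupSize, so each of its threads
        // handles exactly one float4 of the packed input.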
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}

// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_DispatchGroupTest,
                                              reporter,
                                              context,
                                              testContext) {
    // TODO(b/315834710): This fails on Dawn D3D11
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor1 = 4.f;
    constexpr float kFactor2 = 3.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Define two steps that perform two multiplication passes over the same input.

    class TestComputeStep1 : public ComputeStep {
    public:
        // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
        // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
        TestComputeStep1() : ComputeStep(
                /*name=*/"TestArrayMultiplyFirstPass",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        // TODO(b/299979165): Declare this binding as read-only.
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*sksl=*/"inputBlock {\n"
                                 "    float factor;\n"
                                 "    layout(offset=16) float4 in_data[];\n"
                                 "}",
                    },
                    // Output buffers:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,  // GPU-only, read by second step
                        /*slot=*/0,
                        /*sksl=*/"outputBlock1 { float4 forward_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/1,
                        /*sksl=*/"outputBlock2 { float2 extra_data; }",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint idx = sk_GlobalInvocationID.x;
                    forward_data[idx] = in_data[idx] * factor;
                    if (idx == 0) {
                        extra_data.x = factor;
                        extra_data.y = 2 * factor;
                    }
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            if (index == 0) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * (kProblemSize + 4);
            }
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kShared);
                SkASSERT(r.fSlot == 0);
                return sizeof(float) * kProblemSize;
            }

            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 1);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return 2 * sizeof(float);
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            if (resourceIndex != 0) {
                return;
            }

            size_t dataCount = kProblemSize + 4;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            inData[0] = kFactor1;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"TestArrayMultiplySecondPass",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,  // GPU-only
                        /*slot=*/0,  // this is the output from the first step
                        /*sksl=*/"inputBlock { float4 in_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"factorBlock { float factor; }"
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/2,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index != 0);
            if (index == 1) {
                SkASSERT(r.fFlow == DataFlow::kPrivate);
                return sizeof(float) * 4;
            }
            SkASSERT(index == 2);
            SkASSERT(r.fSlot == 2);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(float) * kProblemSize;
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            if (resourceIndex != 1) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            *static_cast<float*>(buffer) = kFactor2;
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

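    // Append the steps in order; because step2 declares its input at shared slot 0, the builder
    // aliases it to the forward_data buffer that step1 wrote to that same slot.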
    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step1);
    builder.appendStep(&step2);

    // Slots 0, 1, and 2 should all contain shared buffers. Slot 1 contains the extra output buffer
    // from step 1 while slot 2 contains the result of the second multiplication pass from step 2.
    // Slot 0 is not mappable.
    REPORTER_ASSERT(reporter,
                    std::holds_alternative<BufferView>(builder.outputTable().fSharedSlots[0]),
                    "shared resource at slot 0 is missing");
    BindBufferInfo outputInfo = builder.getSharedBufferResource(2);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 2");
        return;
    }

    // Extra output buffer from step 1 (corresponding to 'outputBlock2')
    BindBufferInfo extraOutputInfo = builder.getSharedBufferResource(1);
    if (!extraOutputInfo) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffers get synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
    auto extraOutputBuffer = sync_buffer_to_cpu(recorder.get(), extraOutputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer from step 2
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor1 * kFactor2;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }

    // Verify the contents of the extra output buffer from step 1
    float* extraOutData = static_cast<float*>(
            map_buffer(context, testContext, extraOutputBuffer.get(), extraOutputInfo.fOffset));
    SkASSERT(extraOutputBuffer->isMapped() && extraOutData != nullptr);
    REPORTER_ASSERT(reporter,
                    kFactor1 == extraOutData[0],
                    "expected '%f', found '%f'",
                    kFactor1,
                    extraOutData[0]);
    REPORTER_ASSERT(reporter,
                    2 * kFactor1 == extraOutData[1],
                    "expected '%f', found '%f'",
                    2 * kFactor1,
                    extraOutData[1]);
}
493
494 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
495 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,reporter,context,testContext)496 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,
497 reporter,
498 context,
499 testContext) {
500 // TODO(b/315834710): This fails on Dawn D3D11
501 if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
502 return;
503 }
504
505 constexpr uint32_t kProblemSize = 512;
506 constexpr float kFactor = 4.f;
507
508 // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
509 // processes 1 vector at a time.
510 constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
511
512 std::unique_ptr<Recorder> recorder = context->makeRecorder();
513
514 class TestComputeStep : public ComputeStep {
515 public:
516 TestComputeStep() : ComputeStep(
517 /*name=*/"TestArrayMultiply",
518 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
519 /*resources=*/{
520 // Uniform buffer:
521 {
522 /*type=*/ResourceType::kUniformBuffer,
523 /*flow=*/DataFlow::kPrivate,
524 /*policy=*/ResourcePolicy::kMapped,
525 /*sksl=*/"uniformBlock { float factor; }"
526 },
527 // Input buffer:
528 {
529 /*type=*/ResourceType::kStorageBuffer,
530 /*flow=*/DataFlow::kPrivate,
531 /*policy=*/ResourcePolicy::kMapped,
532 /*sksl=*/"inputBlock { float4 in_data[]; }",
533 },
534 // Output buffer:
535 {
536 /*type=*/ResourceType::kStorageBuffer,
537 /*flow=*/DataFlow::kShared, // shared to allow us to access it from the
538 // Builder
539 /*policy=*/ResourcePolicy::kMapped, // mappable for read-back
540 /*slot=*/0,
541 /*sksl=*/"outputBlock { float4 out_data[]; }",
542 }
543 }) {}
544 ~TestComputeStep() override = default;
545
546 // A kernel that multiplies a large array of floats by a supplied factor.
547 std::string computeSkSL() const override {
548 return R"(
549 void main() {
550 out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
551 }
552 )";
553 }
554
555 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
556 if (index == 0) {
557 SkASSERT(r.fFlow == DataFlow::kPrivate);
558 return sizeof(float);
559 }
560 if (index == 1) {
561 SkASSERT(r.fFlow == DataFlow::kPrivate);
562 return sizeof(float) * kProblemSize;
563 }
564 SkASSERT(index == 2);
565 SkASSERT(r.fSlot == 0);
566 SkASSERT(r.fFlow == DataFlow::kShared);
567 return sizeof(float) * kProblemSize;
568 }
569
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            // Only initialize the input storage buffer.
            if (resourceIndex != 1) {
                return;
            }
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            size_t dataCount = kProblemSize;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i] = i + 1;
            }
        }

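        // In debug builds, declare the expected uniform layout up front so the UniformManager can
        // validate that the write below matches the block declared in the SkSL.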
        void prepareUniformBuffer(int resourceIndex,
                                  const ResourceDesc&,
                                  UniformManager* mgr) const override {
            SkASSERT(resourceIndex == 0);
            SkDEBUGCODE(
                const Uniform uniforms[] = {{"factor", SkSLType::kFloat}};
                mgr->setExpectedUniforms(uniforms);
            )
            mgr->write(kFactor);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // The output buffer should have been placed in the right output slot.
    BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
    if (!outputInfo) {
        ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}

// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ExternallyAssignedBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    constexpr uint32_t kProblemSize = 512;
    constexpr float kFactor = 4.f;

    // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
    // processes 1 vector at a time.
    constexpr uint32_t kWorkgroupSize = kProblemSize / 4;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"ExternallyAssignedBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Input buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kPrivate,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*sksl=*/"inputBlock {\n"
                                 "    float factor;\n"
                                 "    layout(offset = 16) float4 in_data[];\n"
                                 "}\n",
                    },
                    // Output buffer:
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
                                                     // Builder
                        /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { float4 out_data[]; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that multiplies a large array of floats by a supplied factor.
        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
                }
            )";
        }

        size_t calculateBufferSize(int resourceIndex, const ResourceDesc& r) const override {
            SkASSERT(resourceIndex == 0);
            SkASSERT(r.fFlow == DataFlow::kPrivate);
            return sizeof(float) * (kProblemSize + 4);
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            SkASSERT(r.fFlow == DataFlow::kPrivate);

            size_t dataCount = kProblemSize + 4;
            SkASSERT(bufferSize == dataCount * sizeof(float));
            SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
            inData[0] = kFactor;
            for (unsigned int i = 0; i < kProblemSize; ++i) {
                inData[i + 4] = i + 1;
            }
        }
    } step;

    // We allocate a buffer and directly assign it to the DispatchGroup::Builder. The ComputeStep
    // will not participate in the creation of this buffer.
    auto [_, outputInfo] =
            recorder->priv().drawBufferManager()->getStoragePointer(sizeof(float) * kProblemSize);
    REPORTER_ASSERT(reporter, outputInfo, "Failed to allocate output buffer");

    DispatchGroup::Builder builder(recorder.get());
    builder.assignSharedBuffer({outputInfo, sizeof(float) * kProblemSize}, 0);

    // Initialize the step with a pre-determined global size
    if (!builder.appendStep(&step, {WorkgroupSize(1, 1, 1)})) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    float* outData = static_cast<float*>(
            map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kProblemSize; ++i) {
        const float expected = (i + 1) * kFactor;
        const float found = outData[i];
        REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
    }
}

// Tests the storage texture binding for a compute dispatch that writes the same color to every
// pixel of a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTexture,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 16;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTexture",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> texture = builder.getSharedTextureResource(0);
    if (!texture) {
        ERRORF(reporter, "Shared resource at slot 0 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, texture.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}

// Tests the readonly texture binding for a compute dispatch that random-access reads from a
// CPU-populated texture and copies it to a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureReadAndWrite,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 16;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTextureReadAndWrite",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    half4 color = textureRead(src, sk_LocalInvocationID.xy);
                    textureWrite(dst, sk_LocalInvocationID.xy, color);
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    // Create and populate an input texture.
    SkBitmap srcBitmap;
    SkImageInfo srcInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    srcBitmap.allocPixels(srcInfo);
    SkPixmap srcPixels;
    bool srcPeekPixelsSuccess = srcBitmap.peekPixels(&srcPixels);
    REPORTER_ASSERT(reporter, srcPeekPixelsSuccess);
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            *srcPixels.writable_addr32(x, y) =
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0);
        }
    }

    auto texInfo = context->priv().caps()->getDefaultSampledTextureInfo(kRGBA_8888_SkColorType,
                                                                        skgpu::Mipmapped::kNo,
                                                                        skgpu::Protected::kNo,
                                                                        skgpu::Renderable::kNo);
    sk_sp<TextureProxy> srcProxy = TextureProxy::Make(context->priv().caps(),
                                                      recorder->priv().resourceProvider(),
                                                      {kDim, kDim},
                                                      texInfo,
                                                      "ComputeTestSrcProxy",
                                                      skgpu::Budgeted::kNo);
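    // Stage the CPU-populated pixels and record an upload so `srcProxy` holds the gradient before
    // the compute task runs.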
    MipLevel mipLevel;
    mipLevel.fPixels = srcPixels.addr();
    mipLevel.fRowBytes = srcPixels.rowBytes();
    UploadInstance upload = UploadInstance::Make(recorder.get(),
                                                 srcProxy,
                                                 srcPixels.info().colorInfo(),
                                                 srcPixels.info().colorInfo(),
                                                 {mipLevel},
                                                 SkIRect::MakeWH(kDim, kDim),
                                                 std::make_unique<ImageUploadContext>());
    if (!upload.isValid()) {
        ERRORF(reporter, "Could not create UploadInstance");
        return;
    }
    recorder->priv().add(UploadTask::Make(std::move(upload)));

    DispatchGroup::Builder builder(recorder.get());

    // Assign the input texture to slot 0. This corresponds to the ComputeStep's "src" texture
    // binding.
    builder.assignSharedTexture(std::move(srcProxy), 0);

    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromBytes_RGBA(
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}

DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ReadOnlyStorageBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 16;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestReadOnlyStorageBuffer",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"src { uint in_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint ix = sk_LocalInvocationID.y * 16 + sk_LocalInvocationID.x;
                    uint value = in_data[ix];
                    half4 splat = half4(
                        half(value & 0xFF),
                        half((value >> 8) & 0xFF),
                        half((value >> 16) & 0xFF),
                        half((value >> 24) & 0xFF)
                    );
                    textureWrite(dst, sk_LocalInvocationID.xy, splat / 255.0);
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return kDim * kDim * sizeof(uint32_t);
        }

        void prepareStorageBuffer(int index,
                                  const ResourceDesc&,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(index == 0);
            SkASSERT(bufferSize == kDim * kDim * sizeof(uint32_t));

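            // Pack an x-gradient into the low byte (red), a y-gradient into the next byte (green),
            // and 255 into the top byte (alpha); the kernel above unpacks these into an RGBA color.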
            uint32_t* inputs = reinterpret_cast<uint32_t*>(buffer);
            for (uint32_t y = 0; y < kDim; ++y) {
                for (uint32_t x = 0; x < kDim; ++x) {
                    uint32_t value =
                            ((x * 256 / kDim) & 0xFF) | ((y * 256 / kDim) & 0xFF) << 8 | 255 << 24;
                    *(inputs++) = value;
                }
            }
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected =
                    SkColor4f::FromColor(SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            bool pass = true;
            for (int i = 0; i < 4; i++) {
                pass &= color[i] == expected[i];
            }
            REPORTER_ASSERT(reporter, pass,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}

// Tests that a texture written by one compute step can be sampled by a subsequent step.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 16;

    // Writes to a texture in slot 0.
    class TestComputeStep1 : public ComputeStep {
    public:
        TestComputeStep1() : ComputeStep(
                /*name=*/"TestStorageTexturesFirstPass",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    // Reads from the texture in slot 0 and writes it to another texture in slot 1.
    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"TestStorageTexturesSecondPass",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    half4 color = textureRead(src, sk_LocalInvocationID.xy);
                    textureWrite(dst, sk_LocalInvocationID.xy, color);
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step1);
    builder.appendStep(&step2);

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}

// Tests that a texture can be sampled by a compute step using a sampler.
// TODO(armansito): Once the previous TODO is done, add additional tests that exercise mixed use of
// texture, buffer, and sampler bindings.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SampledTexture,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // The first ComputeStep initializes a 16x16 texture with a checkerboard pattern of alternating
    // red and black pixels. The second ComputeStep downsamples this texture into a 4x4 texture
    // using bilinear filtering at pixel borders, intentionally averaging the values of each 4x4
    // tile in the source texture, and writes the result to the destination texture.
    constexpr uint32_t kSrcDim = 16;
    constexpr uint32_t kDstDim = 4;

    class TestComputeStep1 : public ComputeStep {
    public:
        TestComputeStep1() : ComputeStep(
                /*name=*/"Test_SampledTexture_Init",
                /*localDispatchSize=*/{kSrcDim, kSrcDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep1() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint2 c = sk_LocalInvocationID.xy;
                    uint checkerBoardColor = (c.x + (c.y % 2)) % 2;
                    textureWrite(dst, c, half4(checkerBoardColor, 0, 0, 1));
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return {{kSrcDim, kSrcDim}, kRGBA_8888_SkColorType};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step1;

    class TestComputeStep2 : public ComputeStep {
    public:
        TestComputeStep2() : ComputeStep(
                /*name=*/"Test_SampledTexture_Sample",
                /*localDispatchSize=*/{kDstDim, kDstDim, 1},
                /*resources=*/{
                    // Declare the storage texture before the sampled texture. This tests that
                    // binding index assignment works consistently across all backends when a
                    // sampler-less texture and a texture+sampler pair are intermixed and sampler
                    // bindings aren't necessarily contiguous when the ranges are distinct.
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    },
                    {
                        /*type=*/ResourceType::kSampledTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    }
                }) {}
        ~TestComputeStep2() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    // Normalize the 4x4 invocation indices and sample the source texture using
                    // that.
                    uint2 dstCoord = sk_LocalInvocationID.xy;
                    const float2 dstSizeInv = float2(0.25, 0.25);
                    float2 unormCoord = float2(dstCoord) * dstSizeInv;

                    // Use explicit LOD, as quad derivatives are not available to a compute shader.
                    half4 color = sampleLod(src, unormCoord, 0);
                    textureWrite(dst, dstCoord, color);
                }
            )";
        }

        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0 || index == 1);
            return {{kDstDim, kDstDim}, kRGBA_8888_SkColorType};
        }

        SamplerDesc calculateSamplerParameters(int index, const ResourceDesc&) const override {
            SkASSERT(index == 1);
            // Use the repeat tile mode to sample an infinite checkerboard.
            constexpr SkTileMode kTileModes[2] = {SkTileMode::kRepeat, SkTileMode::kRepeat};
            return {SkFilterMode::kLinear, kTileModes};
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step2;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step1);
    builder.appendStep(&step2);

    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDstDim, kDstDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    for (uint32_t x = 0; x < kDstDim; ++x) {
        for (uint32_t y = 0; y < kDstDim; ++y) {
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, color.fR > 0.49 && color.fR < 0.51,
                            "At position {%u, %u}, "
                            "expected red channel in range [0.49, 0.51], "
                            "found {%.3f}",
                            x, y, color.fR);
        }
    }
}

// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
// features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

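    // 32 workgroups of 256 invocations each; every invocation adds 1, so the final global count
    // should be 32 * 256 = 8192.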
    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 256;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperations",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo { atomicUint globalCounter; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that increments a global (device memory) counter across multiple workgroups.
        // Each workgroup maintains its own independent tally in a workgroup-shared counter which
        // is then added to the global count.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}

// TODO(b/260622403): The shader tested here is identical to
// `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
// to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
// TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
// compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsOverArrayAndStructTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

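    // As above, but each workgroup's 256 invocations are split evenly between two counters, so
    // each half should end up at 32 * 256 / 2 = 4096.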
    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 256;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsOverArrayAndStruct",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo {\n"
                                 "    atomicUint globalCountsFirstHalf;\n"
                                 "    atomicUint globalCountsSecondHalf;\n"
                                 "}\n"
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that increments two global (device memory) counters across multiple
        // workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
        // counters which are then added to the global counts.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
1655 std::string computeSkSL() const override {
1656 return R"(
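// This needs to match `kWorkgroupSize` declared above.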
1657 const uint WORKGROUP_SIZE = 256;
1658
1659 workgroup atomicUint localCounts[2];
1660
1661 void main() {
1662 // Initialize the local counts.
1663 if (sk_LocalInvocationID.x == 0) {
1664 atomicStore(localCounts[0], 0);
1665 atomicStore(localCounts[1], 0);
1666 }
1667
1668 // Synchronize the threads in the workgroup so they all see the initial value.
1669 workgroupBarrier();
1670
1671 // Each thread increments one of the local counters based on its invocation
1672 // index.
1673 uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
1674 atomicAdd(localCounts[idx], 1);
1675
1676 // Synchronize the threads again to ensure they have all executed the increments
1677 // and the following load reads the same value across all threads in the
1678 // workgroup.
1679 workgroupBarrier();
1680
1681 // Add the workgroup-only tally to the global counter.
1682 if (sk_LocalInvocationID.x == 0) {
1683 atomicAdd(globalCountsFirstHalf, atomicLoad(localCounts[0]));
1684 atomicAdd(globalCountsSecondHalf, atomicLoad(localCounts[1]));
1685 }
1686 }
1687 )";
1688 }
1689
1690 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
1691 SkASSERT(index == 0);
1692 SkASSERT(r.fSlot == 0);
1693 SkASSERT(r.fFlow == DataFlow::kShared);
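// Two uint counters: one per half of the workgroup.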
1694 return 2 * sizeof(uint32_t);
1695 }
1696
1697 WorkgroupSize calculateGlobalDispatchSize() const override {
1698 return WorkgroupSize(kWorkgroupCount, 1, 1);
1699 }
1700
1701 void prepareStorageBuffer(int resourceIndex,
1702 const ResourceDesc& r,
1703 void* buffer,
1704 size_t bufferSize) const override {
1705 SkASSERT(resourceIndex == 0);
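// Zero both global counters before the dispatch.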
1706 uint32_t* data = static_cast<uint32_t*>(buffer);
1707 data[0] = 0;
1708 data[1] = 0;
1709 }
1710 } step;
1711
1712 DispatchGroup::Builder builder(recorder.get());
1713 builder.appendStep(&step);
1714
1715 BindBufferInfo info = builder.getSharedBufferResource(0);
1716 if (!info) {
1717 ERRORF(reporter, "shared resource at slot 0 is missing");
1718 return;
1719 }
1720
1721 // Record the compute pass task.
1722 ComputeTask::DispatchGroupList groups;
1723 groups.push_back(builder.finalize());
1724 recorder->priv().add(ComputeTask::Make(std::move(groups)));
1725
1726 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
1727 auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
1728
1729 // Submit the work and wait for it to complete.
1730 std::unique_ptr<Recording> recording = recorder->snap();
1731 if (!recording) {
1732 ERRORF(reporter, "Failed to make recording");
1733 return;
1734 }
1735
1736 InsertRecordingInfo insertInfo;
1737 insertInfo.fRecording = recording.get();
1738 context->insertRecording(insertInfo);
1739 testContext->syncedSubmit(context);
1740
1741 // Verify the contents of the output buffer.
1742 constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;
1743
1744 const uint32_t* ssboData = static_cast<const uint32_t*>(
1745 map_buffer(context, testContext, buffer.get(), info.fOffset));
1746 const uint32_t firstHalfCount = ssboData[0];
1747 const uint32_t secondHalfCount = ssboData[1];
1748 REPORTER_ASSERT(reporter,
1749 firstHalfCount == kExpectedCount,
1750 "expected '%u', found '%u'",
1751 kExpectedCount,
1752 firstHalfCount);
1753 REPORTER_ASSERT(reporter,
1754 secondHalfCount == kExpectedCount,
1755 "expected '%u', found '%u'",
1756 kExpectedCount,
1757 secondHalfCount);
1758 }
1759
1760 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearedBuffer,
1761 reporter,
1762 context,
1763 testContext) {
1764 constexpr uint32_t kProblemSize = 512;
1765
1766 // The ComputeStep packs kProblemSize uints into kProblemSize / 4 uint4 vectors and each thread
1767 // processes 1 vector at a time.
1768 constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
1769
1770 std::unique_ptr<Recorder> recorder = context->makeRecorder();
1771
1772 // The ComputeStep requests an unmapped buffer that is zero-initialized. It writes the output to
1773 // a mapped buffer, which the test then verifies.
1774 class TestComputeStep : public ComputeStep {
1775 public:
1776 TestComputeStep() : ComputeStep(
1777 /*name=*/"TestClearedBuffer",
1778 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1779 /*resources=*/{
1780 // Zero initialized input buffer
1781 {
1782 // TODO(b/299979165): Declare this binding as read-only.
1783 /*type=*/ResourceType::kStorageBuffer,
1784 /*flow=*/DataFlow::kPrivate,
1785 /*policy=*/ResourcePolicy::kClear,
1786 /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
1787 },
1788 // Output buffer:
1789 {
1790 /*type=*/ResourceType::kStorageBuffer,
1791 /*flow=*/DataFlow::kShared, // shared to allow us to access it from the
1792 // Builder
1793 /*policy=*/ResourcePolicy::kMapped, // mappable for read-back
1794 /*slot=*/0,
1795 /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1796 }
1797 }) {}
1798 ~TestComputeStep() override = default;
1799
1800 std::string computeSkSL() const override {
1801 return R"(
1802 void main() {
1803 out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
1804 }
1805 )";
1806 }
1807
1808 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
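// Both the cleared input buffer and the mapped output buffer have the same size.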
1809 return sizeof(uint32_t) * kProblemSize;
1810 }
1811
1812 void prepareStorageBuffer(int resourceIndex,
1813 const ResourceDesc& r,
1814 void* buffer,
1815 size_t bufferSize) const override {
1816 // Should receive this call only for the mapped buffer.
1817 SkASSERT(resourceIndex == 1);
1818 }
1819
1820 WorkgroupSize calculateGlobalDispatchSize() const override {
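// A single workgroup of kProblemSize / 4 threads covers the whole buffer, one uint4 per thread.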
1821 return WorkgroupSize(1, 1, 1);
1822 }
1823 } step;
1824
1825 DispatchGroup::Builder builder(recorder.get());
1826 if (!builder.appendStep(&step)) {
1827 ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
1828 return;
1829 }
1830
1831 // The output buffer should have been placed in the right output slot.
1832 BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
1833 if (!outputInfo) {
1834 ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
1835 return;
1836 }
1837
1838 // Record the compute task
1839 ComputeTask::DispatchGroupList groups;
1840 groups.push_back(builder.finalize());
1841 recorder->priv().add(ComputeTask::Make(std::move(groups)));
1842
1843 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
1844 auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
1845
1846 // Submit the work and wait for it to complete.
1847 std::unique_ptr<Recording> recording = recorder->snap();
1848 if (!recording) {
1849 ERRORF(reporter, "Failed to make recording");
1850 return;
1851 }
1852
1853 InsertRecordingInfo insertInfo;
1854 insertInfo.fRecording = recording.get();
1855 context->insertRecording(insertInfo);
1856 testContext->syncedSubmit(context);
1857
1858 // Verify the contents of the output buffer.
1859 uint32_t* outData = static_cast<uint32_t*>(
1860 map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
1861 SkASSERT(outputBuffer->isMapped() && outData != nullptr);
1862 for (unsigned int i = 0; i < kProblemSize; ++i) {
1863 const uint32_t found = outData[i];
1864 REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
1865 }
1866 }
1867
1868 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,
1869 reporter,
1870 context,
1871 testContext) {
1872 // Initiate two independent DispatchGroups operating on the same buffer. The first group
1873 // fills it with garbage; the second clears it and copies the contents to an output buffer.
1874 // This test validates that the writes, the clear, and the reads occur in the expected order.
1875 constexpr uint32_t kWorkgroupSize = 64;
1876
1877 // Initialize buffer with non-zero data.
1878 class FillWithGarbage : public ComputeStep {
1879 public:
1880 FillWithGarbage() : ComputeStep(
1881 /*name=*/"FillWithGarbage",
1882 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1883 /*resources=*/{
1884 {
1885 /*type=*/ResourceType::kStorageBuffer,
1886 /*flow=*/DataFlow::kShared,
1887 /*policy=*/ResourcePolicy::kNone,
1888 /*slot=*/0,
1889 /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1890 }
1891 }) {}
1892 ~FillWithGarbage() override = default;
1893
1894 std::string computeSkSL() const override {
1895 return R"(
1896 void main() {
1897 out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
1898 }
1899 )";
1900 }
1901 } garbageStep;
1902
1903 // Second stage just copies the data to a destination buffer. This is only to verify that this
1904 // stage, issued in a separate DispatchGroup, observes the clear.
1905 class CopyBuffer : public ComputeStep {
1906 public:
1907 CopyBuffer() : ComputeStep(
1908 /*name=*/"CopyBuffer",
1909 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1910 /*resources=*/{
1911 {
1912 /*type=*/ResourceType::kStorageBuffer,
1913 /*flow=*/DataFlow::kShared,
1914 /*policy=*/ResourcePolicy::kNone,
1915 /*slot=*/0,
1916 /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
1917 },
1918 {
1919 /*type=*/ResourceType::kStorageBuffer,
1920 /*flow=*/DataFlow::kShared,
1921 /*policy=*/ResourcePolicy::kNone,
1922 /*slot=*/1,
1923 /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1924 }
1925 }) {}
1926 ~CopyBuffer() override = default;
1927
1928 std::string computeSkSL() const override {
1929 return R"(
1930 void main() {
1931 out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
1932 }
1933 )";
1934 }
1935 } copyStep;
1936
1937 std::unique_ptr<Recorder> recorder = context->makeRecorder();
1938 DispatchGroup::Builder builder(recorder.get());
1939
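// Each thread copies one uint4, so a single 64-thread workgroup covers all kElementCount uints.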
1940 constexpr size_t kElementCount = 4 * kWorkgroupSize;
1941 constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
1942 auto input = recorder->priv().drawBufferManager()->getStorage(kBufferSize);
1943 auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);
1944
1945 ComputeTask::DispatchGroupList groups;
1946
1947 // First group.
1948 builder.assignSharedBuffer({input, kBufferSize}, 0);
1949 builder.appendStep(&garbageStep, {{1, 1, 1}});
1950 groups.push_back(builder.finalize());
1951
1952 // Second group.
1953 builder.reset();
1954 builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
1955 builder.assignSharedBuffer({output, kBufferSize}, 1);
1956 builder.appendStep(&copyStep, {{1, 1, 1}});
1957 groups.push_back(builder.finalize());
1958
1959 recorder->priv().add(ComputeTask::Make(std::move(groups)));
1960 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
1961 auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);
1962
1963 // Submit the work and wait for it to complete.
1964 std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
1965 if (!recording) {
1966 ERRORF(reporter, "Failed to make recording");
1967 return;
1968 }
1969
1970 // Verify the contents of the output buffer.
1971 uint32_t* outData = static_cast<uint32_t*>(
1972 map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
1973 SkASSERT(outputBuffer->isMapped() && outData != nullptr);
1974 for (unsigned int i = 0; i < kElementCount; ++i) {
1975 const uint32_t found = outData[i];
1976 REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
1977 }
1978 }
1979
1980 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrderingScratchBuffers,
1981 reporter,
1982 context,
1983 testContext) {
1984 // This test is the same as the ClearOrdering test but the two stages operate on a recycled
1985 // ScratchBuffer. This is primarily to test ScratchBuffer reuse.
1986 constexpr uint32_t kWorkgroupSize = 64;
1987
1988 // Initialize buffer with non-zero data.
1989 class FillWithGarbage : public ComputeStep {
1990 public:
1991 FillWithGarbage() : ComputeStep(
1992 /*name=*/"FillWithGarbage",
1993 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1994 /*resources=*/{
1995 {
1996 /*type=*/ResourceType::kStorageBuffer,
1997 /*flow=*/DataFlow::kShared,
1998 /*policy=*/ResourcePolicy::kNone,
1999 /*slot=*/0,
2000 /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
2001 }
2002 }) {}
2003 ~FillWithGarbage() override = default;
2004
2005 std::string computeSkSL() const override {
2006 return R"(
2007 void main() {
2008 out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
2009 }
2010 )";
2011 }
2012 } garbageStep;
2013
2014 // Second stage just copies the data to a destination buffer. This is only to verify that this
2015 // stage (issued in a separate DispatchGroup) observes the clear.
2016 class CopyBuffer : public ComputeStep {
2017 public:
2018 CopyBuffer() : ComputeStep(
2019 /*name=*/"CopyBuffer",
2020 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2021 /*resources=*/{
2022 {
2023 /*type=*/ResourceType::kStorageBuffer,
2024 /*flow=*/DataFlow::kShared,
2025 /*policy=*/ResourcePolicy::kNone,
2026 /*slot=*/0,
2027 /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
2028 },
2029 {
2030 /*type=*/ResourceType::kStorageBuffer,
2031 /*flow=*/DataFlow::kShared,
2032 /*policy=*/ResourcePolicy::kNone,
2033 /*slot=*/1,
2034 /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
2035 }
2036 }) {}
2037 ~CopyBuffer() override = default;
2038
2039 std::string computeSkSL() const override {
2040 return R"(
2041 void main() {
2042 out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
2043 }
2044 )";
2045 }
2046 } copyStep;
2047
2048 std::unique_ptr<Recorder> recorder = context->makeRecorder();
2049 DispatchGroup::Builder builder(recorder.get());
2050
2051 constexpr size_t kElementCount = 4 * kWorkgroupSize;
2052 constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
2053 auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);
2054
2055 ComputeTask::DispatchGroupList groups;
2056
2057 // First group.
2058 {
2059 auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
2060 auto input = scratch.suballocate(kBufferSize);
2061 builder.assignSharedBuffer({input, kBufferSize}, 0);
2062
2063 // `scratch` returns to the scratch buffer pool when it goes out of scope
2064 }
2065 builder.appendStep(&garbageStep, {{1, 1, 1}});
2066 groups.push_back(builder.finalize());
2067
2068 // Second group.
2069 builder.reset();
2070 {
2071 auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
2072 auto input = scratch.suballocate(kBufferSize);
2073 builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
2074 }
2075 builder.assignSharedBuffer({output, kBufferSize}, 1);
2076 builder.appendStep(&copyStep, {{1, 1, 1}});
2077 groups.push_back(builder.finalize());
2078
2079 recorder->priv().add(ComputeTask::Make(std::move(groups)));
2080 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2081 auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);
2082
2083 // Submit the work and wait for it to complete.
2084 std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
2085 if (!recording) {
2086 ERRORF(reporter, "Failed to make recording");
2087 return;
2088 }
2089
2090 // Verify the contents of the output buffer.
2091 uint32_t* outData = static_cast<uint32_t*>(
2092 map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
2093 SkASSERT(outputBuffer->isMapped() && outData != nullptr);
2094 for (unsigned int i = 0; i < kElementCount; ++i) {
2095 const uint32_t found = outData[i];
2096 REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
2097 }
2098 }
2099
2100 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_IndirectDispatch,
2101 reporter,
2102 context,
2103 testContext) {
2104 // This fails on Dawn D3D11, b/315834710
2105 if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
2106 return;
2107 }
2108
2109 std::unique_ptr<Recorder> recorder = context->makeRecorder();
2110
2111 constexpr uint32_t kWorkgroupCount = 32;
2112 constexpr uint32_t kWorkgroupSize = 64;
2113
2114 // `IndirectStep` populates a buffer with the global workgroup count for `CountStep`.
2115 // `CountStep` is recorded using `DispatchGroup::appendStepIndirect()` and its workgroups get
2116 // dispatched according to the values computed by `IndirectStep` on the GPU.
2117 class IndirectStep : public ComputeStep {
2118 public:
2119 IndirectStep()
2120 : ComputeStep(
2121 /*name=*/"TestIndirectDispatch_IndirectStep",
2122 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2123 /*resources=*/
2124 {{
2125 /*type=*/ResourceType::kIndirectBuffer,
2126 /*flow=*/DataFlow::kShared,
2127 /*policy=*/ResourcePolicy::kClear,
2128 /*slot=*/0,
2129 // TODO(armansito): Ideally the SSBO would have a single member of
2130 // the `IndirectDispatchArgs` struct type. SkSL modules don't
2131 // support struct declarations so this is currently not possible.
2132 /*sksl=*/"ssbo { uint indirect[]; }",
2133 }}) {}
2134 ~IndirectStep() override = default;
2135
2136 // Kernel that writes the global workgroup count (`kWorkgroupCount`) into the indirect
2137 // dispatch arguments consumed by `CountStep`.
2138 std::string computeSkSL() const override {
2139 return R"(
2140 // This needs to match `kWorkgroupCount` declared above.
2141 const uint kWorkgroupCount = 32;
2142
2143 void main() {
2144 if (sk_LocalInvocationID.x == 0) {
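// Write the x, y, and z workgroup counts consumed by the indirect dispatch.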
2145 indirect[0] = kWorkgroupCount;
2146 indirect[1] = 1;
2147 indirect[2] = 1;
2148 }
2149 }
2150 )";
2151 }
2152
2153 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2154 SkASSERT(index == 0);
2155 SkASSERT(r.fSlot == 0);
2156 SkASSERT(r.fFlow == DataFlow::kShared);
2157 return kIndirectDispatchArgumentSize;
2158 }
2159
2160 WorkgroupSize calculateGlobalDispatchSize() const override {
2161 return WorkgroupSize(1, 1, 1);
2162 }
2163 } indirectStep;
2164
2165 class CountStep : public ComputeStep {
2166 public:
2167 CountStep()
2168 : ComputeStep(
2169 /*name=*/"TestIndirectDispatch_CountStep",
2170 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2171 /*resources=*/
2172 {{
2173 /*type=*/ResourceType::kStorageBuffer,
2174 /*flow=*/DataFlow::kShared,
2175 /*policy=*/ResourcePolicy::kMapped,
2176 /*slot=*/1,
2177 /*sksl=*/"ssbo { atomicUint globalCounter; }",
2178 }}) {}
2179 ~CountStep() override = default;
2180
2181 std::string computeSkSL() const override {
2182 return R"(
2183 workgroup atomicUint localCounter;
2184
2185 void main() {
2186 // Initialize the local counter.
2187 if (sk_LocalInvocationID.x == 0) {
2188 atomicStore(localCounter, 0);
2189 }
2190
2191 // Synchronize the threads in the workgroup so they all see the initial value.
2192 workgroupBarrier();
2193
2194 // All threads increment the counter.
2195 atomicAdd(localCounter, 1);
2196
2197 // Synchronize the threads again to ensure they have all executed the increment
2198 // and the following load reads the same value across all threads in the
2199 // workgroup.
2200 workgroupBarrier();
2201
2202 // Add the workgroup-only tally to the global counter.
2203 if (sk_LocalInvocationID.x == 0) {
2204 atomicAdd(globalCounter, atomicLoad(localCounter));
2205 }
2206 }
2207 )";
2208 }
2209
2210 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
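// CountStep's only resource is at index 0 of its resource list but occupies shared slot 1.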
2211 SkASSERT(index == 0);
2212 SkASSERT(r.fSlot == 1);
2213 SkASSERT(r.fFlow == DataFlow::kShared);
2214 return sizeof(uint32_t);
2215 }
2216
2217 void prepareStorageBuffer(int resourceIndex,
2218 const ResourceDesc& r,
2219 void* buffer,
2220 size_t bufferSize) const override {
2221 SkASSERT(resourceIndex == 0);
2222 *static_cast<uint32_t*>(buffer) = 0;
2223 }
2224 } countStep;
2225
2226 DispatchGroup::Builder builder(recorder.get());
2227 builder.appendStep(&indirectStep);
2228 BindBufferInfo indirectBufferInfo = builder.getSharedBufferResource(0);
2229 if (!indirectBufferInfo) {
2230 ERRORF(reporter, "Shared resource at slot 0 is missing");
2231 return;
2232 }
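// Dispatch CountStep using the workgroup counts that IndirectStep writes into the indirect buffer.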
2233 builder.appendStepIndirect(&countStep, {indirectBufferInfo, kIndirectDispatchArgumentSize});
2234
2235 BindBufferInfo info = builder.getSharedBufferResource(1);
2236 if (!info) {
2237 ERRORF(reporter, "Shared resource at slot 1 is missing");
2238 return;
2239 }
2240
2241 // Record the compute pass task.
2242 ComputeTask::DispatchGroupList groups;
2243 groups.push_back(builder.finalize());
2244 recorder->priv().add(ComputeTask::Make(std::move(groups)));
2245
2246 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2247 auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2248
2249 // Submit the work and wait for it to complete.
2250 std::unique_ptr<Recording> recording = recorder->snap();
2251 if (!recording) {
2252 ERRORF(reporter, "Failed to make recording");
2253 return;
2254 }
2255
2256 InsertRecordingInfo insertInfo;
2257 insertInfo.fRecording = recording.get();
2258 context->insertRecording(insertInfo);
2259 testContext->syncedSubmit(context);
2260
2261 // Verify the contents of the output buffer.
2262 constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2263 const uint32_t result = static_cast<const uint32_t*>(
2264 map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2265 REPORTER_ASSERT(reporter,
2266 result == kExpectedCount,
2267 "expected '%u', found '%u'",
2268 kExpectedCount,
2269 result);
2270 }
2271
2272 DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal,
2273 reporter,
2274 context,
2275 testContext) {
2276 std::unique_ptr<Recorder> recorder = context->makeRecorder();
2277
2278 constexpr uint32_t kWorkgroupCount = 32;
2279 constexpr uint32_t kWorkgroupSize = 1024;
2280
2281 class TestComputeStep : public ComputeStep {
2282 public:
2283 TestComputeStep() : ComputeStep(
2284 /*name=*/"TestAtomicOperationsMetal",
2285 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2286 /*resources=*/{
2287 {
2288 /*type=*/ResourceType::kStorageBuffer,
2289 /*flow=*/DataFlow::kShared,
2290 /*policy=*/ResourcePolicy::kMapped,
2291 /*slot=*/0,
2292 }
2293 },
2294 /*workgroupBuffers=*/{},
2295 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2296 ~TestComputeStep() override = default;
2297
2298 NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2299 SkASSERT(format == NativeShaderFormat::kMSL);
2300 static constexpr std::string_view kSource = R"(
2301 #include <metal_stdlib>
2302
2303 using namespace metal;
2304
2305 kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
2306 device atomic_uint& globalCounter [[buffer(0)]]) {
2307 threadgroup atomic_uint localCounter;
2308
2309 // Initialize the local counter.
2310 if (localId.x == 0u) {
2311 atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
2312 }
2313
2314 // Synchronize the threads in the workgroup so they all see the initial value.
2315 threadgroup_barrier(mem_flags::mem_threadgroup);
2316
2317 // All threads increment the counter.
2318 atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
2319
2320 // Synchronize the threads again to ensure they have all executed the increment
2321 // and the following load reads the same value across all threads in the
2322 // workgroup.
2323 threadgroup_barrier(mem_flags::mem_threadgroup);
2324
2325 // Add the workgroup-only tally to the global counter.
2326 if (localId.x == 0u) {
2327 uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
2328 atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
2329 }
2330 }
2331 )";
2332 return {kSource, "atomicCount"};
2333 }
2334
2335 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2336 SkASSERT(index == 0);
2337 SkASSERT(r.fSlot == 0);
2338 SkASSERT(r.fFlow == DataFlow::kShared);
2339 return sizeof(uint32_t);
2340 }
2341
2342 WorkgroupSize calculateGlobalDispatchSize() const override {
2343 return WorkgroupSize(kWorkgroupCount, 1, 1);
2344 }
2345
2346 void prepareStorageBuffer(int resourceIndex,
2347 const ResourceDesc& r,
2348 void* buffer,
2349 size_t bufferSize) const override {
2350 SkASSERT(resourceIndex == 0);
2351 *static_cast<uint32_t*>(buffer) = 0;
2352 }
2353 } step;
2354
2355 DispatchGroup::Builder builder(recorder.get());
2356 builder.appendStep(&step);
2357
2358 BindBufferInfo info = builder.getSharedBufferResource(0);
2359 if (!info) {
2360 ERRORF(reporter, "shared resource at slot 0 is missing");
2361 return;
2362 }
2363
2364 // Record the compute pass task.
2365 ComputeTask::DispatchGroupList groups;
2366 groups.push_back(builder.finalize());
2367 recorder->priv().add(ComputeTask::Make(std::move(groups)));
2368
2369 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2370 auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2371
2372 // Submit the work and wait for it to complete.
2373 std::unique_ptr<Recording> recording = recorder->snap();
2374 if (!recording) {
2375 ERRORF(reporter, "Failed to make recording");
2376 return;
2377 }
2378
2379 InsertRecordingInfo insertInfo;
2380 insertInfo.fRecording = recording.get();
2381 context->insertRecording(insertInfo);
2382 testContext->syncedSubmit(context);
2383
2384 // Verify the contents of the output buffer.
2385 constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2386 const uint32_t result = static_cast<const uint32_t*>(
2387 map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2388 REPORTER_ASSERT(reporter,
2389 result == kExpectedCount,
2390 "expected '%u', found '%u'",
2391 kExpectedCount,
2392 result);
2393 }
2394
2395 DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal,
2396 reporter,
2397 context,
2398 testContext) {
2399 std::unique_ptr<Recorder> recorder = context->makeRecorder();
2400
2401 constexpr uint32_t kWorkgroupCount = 32;
2402 constexpr uint32_t kWorkgroupSize = 1024;
2403
2404 class TestComputeStep : public ComputeStep {
2405 public:
2406 TestComputeStep() : ComputeStep(
2407 /*name=*/"TestAtomicOperationsMetal",
2408 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2409 /*resources=*/{
2410 {
2411 /*type=*/ResourceType::kStorageBuffer,
2412 /*flow=*/DataFlow::kShared,
2413 /*policy=*/ResourcePolicy::kMapped,
2414 /*slot=*/0,
2415 }
2416 },
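// Declare one workgroup buffer (a single uint at threadgroup index 0) so the backend binds it explicitly.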
2417 /*workgroupBuffers=*/{
2418 {
2419 /*size=*/sizeof(uint32_t),
2420 /*index=*/0u,
2421 }
2422 },
2423 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2424 ~TestComputeStep() override = default;
2425
2426 // This is the same MSL kernel as in Compute_NativeShaderSourceMetal, except `localCounter`
2427 // is an entry-point parameter instead of a local variable. This forces the workgroup
2428 // binding to be encoded explicitly in the command encoder.
2429 NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2430 SkASSERT(format == NativeShaderFormat::kMSL);
2431 static constexpr std::string_view kSource = R"(
2432 #include <metal_stdlib>
2433
2434 using namespace metal;
2435
2436 kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
2437 device atomic_uint& globalCounter [[buffer(0)]],
2438 threadgroup atomic_uint& localCounter [[threadgroup(0)]]) {
2439 // Initialize the local counter.
2440 if (localId.x == 0u) {
2441 atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
2442 }
2443
2444 // Synchronize the threads in the workgroup so they all see the initial value.
2445 threadgroup_barrier(mem_flags::mem_threadgroup);
2446
2447 // All threads increment the counter.
2448 atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
2449
2450 // Synchronize the threads again to ensure they have all executed the increment
2451 // and the following load reads the same value across all threads in the
2452 // workgroup.
2453 threadgroup_barrier(mem_flags::mem_threadgroup);
2454
2455 // Add the workgroup-only tally to the global counter.
2456 if (localId.x == 0u) {
2457 uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
2458 atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
2459 }
2460 }
2461 )";
2462 return {kSource, "atomicCount"};
2463 }
2464
2465 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2466 SkASSERT(index == 0);
2467 SkASSERT(r.fSlot == 0);
2468 SkASSERT(r.fFlow == DataFlow::kShared);
2469 return sizeof(uint32_t);
2470 }
2471
2472 WorkgroupSize calculateGlobalDispatchSize() const override {
2473 return WorkgroupSize(kWorkgroupCount, 1, 1);
2474 }
2475
2476 void prepareStorageBuffer(int resourceIndex,
2477 const ResourceDesc& r,
2478 void* buffer,
2479 size_t bufferSize) const override {
2480 SkASSERT(resourceIndex == 0);
2481 *static_cast<uint32_t*>(buffer) = 0;
2482 }
2483 } step;
2484
2485 DispatchGroup::Builder builder(recorder.get());
2486 builder.appendStep(&step);
2487
2488 BindBufferInfo info = builder.getSharedBufferResource(0);
2489 if (!info) {
2490 ERRORF(reporter, "shared resource at slot 0 is missing");
2491 return;
2492 }
2493
2494 // Record the compute pass task.
2495 ComputeTask::DispatchGroupList groups;
2496 groups.push_back(builder.finalize());
2497 recorder->priv().add(ComputeTask::Make(std::move(groups)));
2498
2499 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2500 auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2501
2502 // Submit the work and wait for it to complete.
2503 std::unique_ptr<Recording> recording = recorder->snap();
2504 if (!recording) {
2505 ERRORF(reporter, "Failed to make recording");
2506 return;
2507 }
2508
2509 InsertRecordingInfo insertInfo;
2510 insertInfo.fRecording = recording.get();
2511 context->insertRecording(insertInfo);
2512 testContext->syncedSubmit(context);
2513
2514 // Verify the contents of the output buffer.
2515 constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2516 const uint32_t result = static_cast<const uint32_t*>(
2517 map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2518 REPORTER_ASSERT(reporter,
2519 result == kExpectedCount,
2520 "expected '%u', found '%u'",
2521 kExpectedCount,
2522 result);
2523 }
2524
2525 DEF_GRAPHITE_TEST_FOR_DAWN_CONTEXT(Compute_NativeShaderSourceWGSL, reporter, context, testContext) {
2526 // This fails on Dawn D3D11, b/315834710
2527 if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
2528 return;
2529 }
2530
2531 std::unique_ptr<Recorder> recorder = context->makeRecorder();
2532
2533 constexpr uint32_t kWorkgroupCount = 32;
2534 constexpr uint32_t kWorkgroupSize = 256; // The WebGPU default workgroup size limit is 256
2535
2536 class TestComputeStep : public ComputeStep {
2537 public:
2538 TestComputeStep() : ComputeStep(
2539 /*name=*/"TestAtomicOperationsWGSL",
2540 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2541 /*resources=*/{
2542 {
2543 /*type=*/ResourceType::kStorageBuffer,
2544 /*flow=*/DataFlow::kShared,
2545 /*policy=*/ResourcePolicy::kMapped,
2546 /*slot=*/0,
2547 }
2548 },
2549 /*workgroupBuffers=*/{},
2550 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2551 ~TestComputeStep() override = default;
2552
2553 NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2554 SkASSERT(format == NativeShaderFormat::kWGSL);
2555 static constexpr std::string_view kSource = R"(
2556 @group(0) @binding(0) var<storage, read_write> globalCounter: atomic<u32>;
2557
2558 var<workgroup> localCounter: atomic<u32>;
2559
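// This needs to match `kWorkgroupSize` declared above.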
2560 @compute @workgroup_size(256)
2561 fn atomicCount(@builtin(local_invocation_id) localId: vec3u) {
2562 // Initialize the local counter.
2563 if localId.x == 0u {
2564 atomicStore(&localCounter, 0u);
2565 }
2566
2567 // Synchronize the threads in the workgroup so they all see the initial value.
2568 workgroupBarrier();
2569
2570 // All threads increment the counter.
2571 atomicAdd(&localCounter, 1u);
2572
2573 // Synchronize the threads again to ensure they have all executed the increment
2574 // and the following load reads the same value across all threads in the
2575 // workgroup.
2576 workgroupBarrier();
2577
2578 // Add the workgroup-only tally to the global counter.
2579 if localId.x == 0u {
2580 let tally = atomicLoad(&localCounter);
2581 atomicAdd(&globalCounter, tally);
2582 }
2583 }
2584 )";
2585 return {kSource, "atomicCount"};
2586 }
2587
2588 size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2589 SkASSERT(index == 0);
2590 SkASSERT(r.fSlot == 0);
2591 SkASSERT(r.fFlow == DataFlow::kShared);
2592 return sizeof(uint32_t);
2593 }
2594
2595 WorkgroupSize calculateGlobalDispatchSize() const override {
2596 return WorkgroupSize(kWorkgroupCount, 1, 1);
2597 }
2598
2599 void prepareStorageBuffer(int resourceIndex,
2600 const ResourceDesc& r,
2601 void* buffer,
2602 size_t bufferSize) const override {
2603 SkASSERT(resourceIndex == 0);
2604 *static_cast<uint32_t*>(buffer) = 0;
2605 }
2606 } step;
2607
2608 DispatchGroup::Builder builder(recorder.get());
2609 builder.appendStep(&step);
2610
2611 BindBufferInfo info = builder.getSharedBufferResource(0);
2612 if (!info) {
2613 ERRORF(reporter, "shared resource at slot 0 is missing");
2614 return;
2615 }
2616
2617 // Record the compute pass task.
2618 ComputeTask::DispatchGroupList groups;
2619 groups.push_back(builder.finalize());
2620 recorder->priv().add(ComputeTask::Make(std::move(groups)));
2621
2622 // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2623 auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2624
2625 // Submit the work and wait for it to complete.
2626 std::unique_ptr<Recording> recording = recorder->snap();
2627 if (!recording) {
2628 ERRORF(reporter, "Failed to make recording");
2629 return;
2630 }
2631
2632 InsertRecordingInfo insertInfo;
2633 insertInfo.fRecording = recording.get();
2634 context->insertRecording(insertInfo);
2635 testContext->syncedSubmit(context);
2636
2637 // Verify the contents of the output buffer.
2638 constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2639 const uint32_t result = static_cast<const uint32_t*>(
2640 map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2641 REPORTER_ASSERT(reporter,
2642 result == kExpectedCount,
2643 "expected '%u', found '%u'",
2644 kExpectedCount,
2645 result);
2646 }
2647