• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2022 Google LLC
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "tests/Test.h"
9 
10 #include "include/core/SkBitmap.h"
11 #include "include/gpu/graphite/Context.h"
12 #include "include/gpu/graphite/Recorder.h"
13 #include "include/gpu/graphite/Recording.h"
14 #include "src/gpu/graphite/Buffer.h"
15 #include "src/gpu/graphite/Caps.h"
16 #include "src/gpu/graphite/ComputePipelineDesc.h"
17 #include "src/gpu/graphite/ComputeTypes.h"
18 #include "src/gpu/graphite/ContextPriv.h"
19 #include "src/gpu/graphite/RecorderPriv.h"
20 #include "src/gpu/graphite/ResourceProvider.h"
21 #include "src/gpu/graphite/UniformManager.h"
22 #include "src/gpu/graphite/compute/ComputeStep.h"
23 #include "src/gpu/graphite/compute/DispatchGroup.h"
24 #include "src/gpu/graphite/task/ComputeTask.h"
25 #include "src/gpu/graphite/task/CopyTask.h"
26 #include "src/gpu/graphite/task/SynchronizeToCpuTask.h"
27 #include "src/gpu/graphite/task/UploadTask.h"
28 
29 #include "tools/graphite/GraphiteTestContext.h"
30 
31 using namespace skgpu::graphite;
32 using namespace skiatest::graphite;
33 
34 namespace {
35 
map_buffer(Context * context,skiatest::graphite::GraphiteTestContext * testContext,Buffer * buffer,size_t offset)36 void* map_buffer(Context* context,
37                  skiatest::graphite::GraphiteTestContext* testContext,
38                  Buffer* buffer,
39                  size_t offset) {
40     SkASSERT(buffer);
41     if (context->priv().caps()->bufferMapsAreAsync()) {
42         buffer->asyncMap();
43         while (!buffer->isMapped()) {
44             testContext->tick();
45         }
46     }
47     std::byte* ptr = static_cast<std::byte*>(buffer->map());
48     SkASSERT(ptr);
49 
50     return ptr + offset;
51 }
52 
sync_buffer_to_cpu(Recorder * recorder,const Buffer * buffer)53 sk_sp<Buffer> sync_buffer_to_cpu(Recorder* recorder, const Buffer* buffer) {
54     if (recorder->priv().caps()->drawBufferCanBeMappedForReadback()) {
55         // `buffer` can be mapped directly, however it may still require a synchronization step
56         // by the underlying API (e.g. a managed buffer in Metal). SynchronizeToCpuTask
57         // automatically handles this for us.
58         recorder->priv().add(SynchronizeToCpuTask::Make(sk_ref_sp(buffer)));
59         return sk_ref_sp(buffer);
60     }
61 
62     // The backend requires a transfer buffer for CPU read-back
63     auto xferBuffer =
64             recorder->priv().resourceProvider()->findOrCreateBuffer(buffer->size(),
65                                                                     BufferType::kXferGpuToCpu,
66                                                                     AccessPattern::kHostVisible,
67                                                                     "ComputeTest_TransferToCpu");
68     SkASSERT(xferBuffer);
69 
70     recorder->priv().add(CopyBufferToBufferTask::Make(buffer,
71                                                       /*srcOffset=*/0,
72                                                       xferBuffer,
73                                                       /*dstOffset=*/0,
74                                                       buffer->size()));
75     return xferBuffer;
76 }
77 
submit_recording(Context * context,GraphiteTestContext * testContext,Recorder * recorder)78 std::unique_ptr<Recording> submit_recording(Context* context,
79                                             GraphiteTestContext* testContext,
80                                             Recorder* recorder) {
81     std::unique_ptr<Recording> recording = recorder->snap();
82     if (!recording) {
83         return nullptr;
84     }
85 
86     InsertRecordingInfo insertInfo;
87     insertInfo.fRecording = recording.get();
88     context->insertRecording(insertInfo);
89     testContext->syncedSubmit(context);
90 
91     return recording;
92 }
93 
is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType)94 bool is_dawn_or_metal_context_type(skiatest::GpuContextType ctxType) {
95     return skiatest::IsDawnContextType(ctxType) || skiatest::IsMetalContextType(ctxType);
96 }
97 
98 }  // namespace
99 
// Instantiates a Graphite test that only runs on the Dawn and Metal backends
// (per the TODOs below, the only backends that currently support compute
// programs here) and is never enforced by CTS.
#define DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(            \
        name, reporter, graphite_context, test_context)           \
    DEF_GRAPHITE_TEST_FOR_CONTEXTS(name,                          \
                                   is_dawn_or_metal_context_type, \
                                   reporter,                      \
                                   graphite_context,              \
                                   test_context,                  \
                                   CtsEnforcement::kNever)
108 
109 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
110 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SingleDispatchTest,reporter,context,testContext)111 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SingleDispatchTest,
112                                               reporter,
113                                               context,
114                                               testContext) {
115     constexpr uint32_t kProblemSize = 512;
116     constexpr float kFactor = 4.f;
117 
118     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
119     // processes 1 vector at a time.
120     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
121 
122     std::unique_ptr<Recorder> recorder = context->makeRecorder();
123 
124     class TestComputeStep : public ComputeStep {
125     public:
126         // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
127         // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
128         TestComputeStep() : ComputeStep(
129                 /*name=*/"TestArrayMultiply",
130                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
131                 /*resources=*/{
132                     // Input buffer:
133                     {
134                         // TODO(b/299979165): Declare this binding as read-only.
135                         /*type=*/ResourceType::kStorageBuffer,
136                         /*flow=*/DataFlow::kPrivate,
137                         /*policy=*/ResourcePolicy::kMapped,
138                         /*sksl=*/"inputBlock {\n"
139                             "    float factor;\n"
140                             "    layout(offset=16) float4 in_data[];\n"
141                             "}",
142                     },
143                     // Output buffer:
144                     {
145                         /*type=*/ResourceType::kStorageBuffer,
146                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
147                                                      // Builder
148                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
149                         /*slot=*/0,
150                         /*sksl=*/"outputBlock { float4 out_data[]; }",
151                     }
152                 }) {}
153         ~TestComputeStep() override = default;
154 
155         // A kernel that multiplies a large array of floats by a supplied factor.
156         std::string computeSkSL() const override {
157             return R"(
158                 void main() {
159                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
160                 }
161             )";
162         }
163 
164         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
165             if (index == 0) {
166                 SkASSERT(r.fFlow == DataFlow::kPrivate);
167                 return sizeof(float) * (kProblemSize + 4);
168             }
169             SkASSERT(index == 1);
170             SkASSERT(r.fSlot == 0);
171             SkASSERT(r.fFlow == DataFlow::kShared);
172             return sizeof(float) * kProblemSize;
173         }
174 
175         void prepareStorageBuffer(int resourceIndex,
176                                   const ResourceDesc& r,
177                                   void* buffer,
178                                   size_t bufferSize) const override {
179             // Only initialize the input buffer.
180             if (resourceIndex != 0) {
181                 return;
182             }
183             SkASSERT(r.fFlow == DataFlow::kPrivate);
184 
185             size_t dataCount = sizeof(float) * (kProblemSize + 4);
186             SkASSERT(bufferSize == dataCount);
187             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
188             inData[0] = kFactor;
189             for (unsigned int i = 0; i < kProblemSize; ++i) {
190                 inData[i + 4] = i + 1;
191             }
192         }
193 
194         WorkgroupSize calculateGlobalDispatchSize() const override {
195             return WorkgroupSize(1, 1, 1);
196         }
197     } step;
198 
199     DispatchGroup::Builder builder(recorder.get());
200     if (!builder.appendStep(&step)) {
201         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
202         return;
203     }
204 
205     // The output buffer should have been placed in the right output slot.
206     BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
207     if (!outputInfo) {
208         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
209         return;
210     }
211 
212     // Record the compute task
213     ComputeTask::DispatchGroupList groups;
214     groups.push_back(builder.finalize());
215     recorder->priv().add(ComputeTask::Make(std::move(groups)));
216 
217     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
218     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
219 
220     // Submit the work and wait for it to complete.
221     std::unique_ptr<Recording> recording = recorder->snap();
222     if (!recording) {
223         ERRORF(reporter, "Failed to make recording");
224         return;
225     }
226 
227     InsertRecordingInfo insertInfo;
228     insertInfo.fRecording = recording.get();
229     context->insertRecording(insertInfo);
230     testContext->syncedSubmit(context);
231 
232     // Verify the contents of the output buffer.
233     float* outData = static_cast<float*>(
234             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
235     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
236     for (unsigned int i = 0; i < kProblemSize; ++i) {
237         const float expected = (i + 1) * kFactor;
238         const float found = outData[i];
239         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
240     }
241 }
242 
243 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
244 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_DispatchGroupTest,reporter,context,testContext)245 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_DispatchGroupTest,
246                                               reporter,
247                                               context,
248                                               testContext) {
249     // TODO(b/315834710): This fails on Dawn D3D11
250     if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
251         return;
252     }
253 
254     constexpr uint32_t kProblemSize = 512;
255     constexpr float kFactor1 = 4.f;
256     constexpr float kFactor2 = 3.f;
257 
258     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
259     // processes 1 vector at a time.
260     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
261 
262     std::unique_ptr<Recorder> recorder = context->makeRecorder();
263 
264     // Define two steps that perform two multiplication passes over the same input.
265 
266     class TestComputeStep1 : public ComputeStep {
267     public:
268         // TODO(skia:40045541): SkSL doesn't support std430 layout well, so the buffers
269         // below all pack their data into vectors to be compatible with SPIR-V/WGSL.
270         TestComputeStep1() : ComputeStep(
271                 /*name=*/"TestArrayMultiplyFirstPass",
272                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
273                 /*resources=*/{
274                     // Input buffer:
275                     {
276                         // TODO(b/299979165): Declare this binding as read-only.
277                         /*type=*/ResourceType::kStorageBuffer,
278                         /*flow=*/DataFlow::kPrivate,
279                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
280                         /*sksl=*/"inputBlock {\n"
281                             "    float factor;\n"
282                             "    layout(offset=16) float4 in_data[];\n"
283                             "}",
284                     },
285                     // Output buffers:
286                     {
287                         /*type=*/ResourceType::kStorageBuffer,
288                         /*flow=*/DataFlow::kShared,
289                         /*policy=*/ResourcePolicy::kNone,  // GPU-only, read by second step
290                         /*slot=*/0,
291                         /*sksl=*/"outputBlock1 { float4 forward_data[]; }",
292                     },
293                     {
294                         /*type=*/ResourceType::kStorageBuffer,
295                         /*flow=*/DataFlow::kShared,
296                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
297                         /*slot=*/1,
298                         /*sksl=*/"outputBlock2 { float2 extra_data; }",
299                     }
300                 }) {}
301         ~TestComputeStep1() override = default;
302 
303         // A kernel that multiplies a large array of floats by a supplied factor.
304         std::string computeSkSL() const override {
305             return R"(
306                 void main() {
307                     uint idx = sk_GlobalInvocationID.x;
308                     forward_data[idx] = in_data[idx] * factor;
309                     if (idx == 0) {
310                         extra_data.x = factor;
311                         extra_data.y = 2 * factor;
312                     }
313                 }
314             )";
315         }
316 
317         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
318             if (index == 0) {
319                 SkASSERT(r.fFlow == DataFlow::kPrivate);
320                 return sizeof(float) * (kProblemSize + 4);
321             }
322             if (index == 1) {
323                 SkASSERT(r.fFlow == DataFlow::kShared);
324                 SkASSERT(r.fSlot == 0);
325                 return sizeof(float) * kProblemSize;
326             }
327 
328             SkASSERT(index == 2);
329             SkASSERT(r.fSlot == 1);
330             SkASSERT(r.fFlow == DataFlow::kShared);
331             return 2 * sizeof(float);
332         }
333 
334         void prepareStorageBuffer(int resourceIndex,
335                                   const ResourceDesc& r,
336                                   void* buffer,
337                                   size_t bufferSize) const override {
338             if (resourceIndex != 0) {
339                 return;
340             }
341 
342             size_t dataCount = sizeof(float) * (kProblemSize + 4);
343             SkASSERT(bufferSize == dataCount);
344             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
345             inData[0] = kFactor1;
346             for (unsigned int i = 0; i < kProblemSize; ++i) {
347                 inData[i + 4] = i + 1;
348             }
349         }
350 
351         WorkgroupSize calculateGlobalDispatchSize() const override {
352             return WorkgroupSize(1, 1, 1);
353         }
354     } step1;
355 
356     class TestComputeStep2 : public ComputeStep {
357     public:
358         TestComputeStep2() : ComputeStep(
359                 /*name=*/"TestArrayMultiplySecondPass",
360                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
361                 /*resources=*/{
362                     // Input buffer:
363                     {
364                         /*type=*/ResourceType::kStorageBuffer,
365                         /*flow=*/DataFlow::kShared,
366                         /*policy=*/ResourcePolicy::kNone,  // GPU-only
367                         /*slot=*/0, // this is the output from the first step
368                         /*sksl=*/"inputBlock { float4 in_data[]; }",
369                     },
370                     {
371                         /*type=*/ResourceType::kStorageBuffer,
372                         /*flow=*/DataFlow::kPrivate,
373                         /*policy=*/ResourcePolicy::kMapped,
374                         /*sksl=*/"factorBlock { float factor; }"
375                     },
376                     // Output buffer:
377                     {
378                         /*type=*/ResourceType::kStorageBuffer,
379                         /*flow=*/DataFlow::kShared,
380                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
381                         /*slot=*/2,
382                         /*sksl=*/"outputBlock { float4 out_data[]; }",
383                     }
384                 }) {}
385         ~TestComputeStep2() override = default;
386 
387         // A kernel that multiplies a large array of floats by a supplied factor.
388         std::string computeSkSL() const override {
389             return R"(
390                 void main() {
391                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
392                 }
393             )";
394         }
395 
396         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
397             SkASSERT(index != 0);
398             if (index == 1) {
399                 SkASSERT(r.fFlow == DataFlow::kPrivate);
400                 return sizeof(float) * 4;
401             }
402             SkASSERT(index == 2);
403             SkASSERT(r.fSlot == 2);
404             SkASSERT(r.fFlow == DataFlow::kShared);
405             return sizeof(float) * kProblemSize;
406         }
407 
408         void prepareStorageBuffer(int resourceIndex,
409                                   const ResourceDesc& r,
410                                   void* buffer,
411                                   size_t bufferSize) const override {
412             if (resourceIndex != 1) {
413                 return;
414             }
415             SkASSERT(r.fFlow == DataFlow::kPrivate);
416             *static_cast<float*>(buffer) = kFactor2;
417         }
418 
419         WorkgroupSize calculateGlobalDispatchSize() const override {
420             return WorkgroupSize(1, 1, 1);
421         }
422     } step2;
423 
424     DispatchGroup::Builder builder(recorder.get());
425     builder.appendStep(&step1);
426     builder.appendStep(&step2);
427 
428     // Slots 0, 1, and 2 should all contain shared buffers. Slot 1 contains the extra output buffer
429     // from step 1 while slot 2 contains the result of the second multiplication pass from step 1.
430     // Slot 0 is not mappable.
431     REPORTER_ASSERT(reporter,
432                     std::holds_alternative<BufferView>(builder.outputTable().fSharedSlots[0]),
433                     "shared resource at slot 0 is missing");
434     BindBufferInfo outputInfo = builder.getSharedBufferResource(2);
435     if (!outputInfo) {
436         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
437         return;
438     }
439 
440     // Extra output buffer from step 1 (corresponding to 'outputBlock2')
441     BindBufferInfo extraOutputInfo = builder.getSharedBufferResource(1);
442     if (!extraOutputInfo) {
443         ERRORF(reporter, "shared resource at slot 1 is missing");
444         return;
445     }
446 
447     // Record the compute task
448     ComputeTask::DispatchGroupList groups;
449     groups.push_back(builder.finalize());
450     recorder->priv().add(ComputeTask::Make(std::move(groups)));
451 
452     // Ensure the output buffers get synchronized to the CPU once the GPU submission has finished.
453     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
454     auto extraOutputBuffer = sync_buffer_to_cpu(recorder.get(), extraOutputInfo.fBuffer);
455 
456     // Submit the work and wait for it to complete.
457     std::unique_ptr<Recording> recording = recorder->snap();
458     if (!recording) {
459         ERRORF(reporter, "Failed to make recording");
460         return;
461     }
462 
463     InsertRecordingInfo insertInfo;
464     insertInfo.fRecording = recording.get();
465     context->insertRecording(insertInfo);
466     testContext->syncedSubmit(context);
467 
468     // Verify the contents of the output buffer from step 2
469     float* outData = static_cast<float*>(
470             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
471     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
472     for (unsigned int i = 0; i < kProblemSize; ++i) {
473         const float expected = (i + 1) * kFactor1 * kFactor2;
474         const float found = outData[i];
475         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
476     }
477 
478     // Verify the contents of the extra output buffer from step 1
479     float* extraOutData = static_cast<float*>(
480             map_buffer(context, testContext, extraOutputBuffer.get(), extraOutputInfo.fOffset));
481     SkASSERT(extraOutputBuffer->isMapped() && extraOutData != nullptr);
482     REPORTER_ASSERT(reporter,
483                     kFactor1 == extraOutData[0],
484                     "expected '%f', found '%f'",
485                     kFactor1,
486                     extraOutData[0]);
487     REPORTER_ASSERT(reporter,
488                     2 * kFactor1 == extraOutData[1],
489                     "expected '%f', found '%f'",
490                     2 * kFactor2,
491                     extraOutData[1]);
492 }
493 
494 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
495 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,reporter,context,testContext)496 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_UniformBufferTest,
497                                               reporter,
498                                               context,
499                                               testContext) {
500     // TODO(b/315834710): This fails on Dawn D3D11
501     if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
502         return;
503     }
504 
505     constexpr uint32_t kProblemSize = 512;
506     constexpr float kFactor = 4.f;
507 
508     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
509     // processes 1 vector at a time.
510     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
511 
512     std::unique_ptr<Recorder> recorder = context->makeRecorder();
513 
514     class TestComputeStep : public ComputeStep {
515     public:
516         TestComputeStep() : ComputeStep(
517                 /*name=*/"TestArrayMultiply",
518                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
519                 /*resources=*/{
520                     // Uniform buffer:
521                     {
522                         /*type=*/ResourceType::kUniformBuffer,
523                         /*flow=*/DataFlow::kPrivate,
524                         /*policy=*/ResourcePolicy::kMapped,
525                         /*sksl=*/"uniformBlock { float factor; }"
526                     },
527                     // Input buffer:
528                     {
529                         /*type=*/ResourceType::kStorageBuffer,
530                         /*flow=*/DataFlow::kPrivate,
531                         /*policy=*/ResourcePolicy::kMapped,
532                         /*sksl=*/"inputBlock { float4 in_data[]; }",
533                     },
534                     // Output buffer:
535                     {
536                         /*type=*/ResourceType::kStorageBuffer,
537                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
538                                                      // Builder
539                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
540                         /*slot=*/0,
541                         /*sksl=*/"outputBlock { float4 out_data[]; }",
542                     }
543                 }) {}
544         ~TestComputeStep() override = default;
545 
546         // A kernel that multiplies a large array of floats by a supplied factor.
547         std::string computeSkSL() const override {
548             return R"(
549                 void main() {
550                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
551                 }
552             )";
553         }
554 
555         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
556             if (index == 0) {
557                 SkASSERT(r.fFlow == DataFlow::kPrivate);
558                 return sizeof(float);
559             }
560             if (index == 1) {
561                 SkASSERT(r.fFlow == DataFlow::kPrivate);
562                 return sizeof(float) * kProblemSize;
563             }
564             SkASSERT(index == 2);
565             SkASSERT(r.fSlot == 0);
566             SkASSERT(r.fFlow == DataFlow::kShared);
567             return sizeof(float) * kProblemSize;
568         }
569 
570         void prepareStorageBuffer(int resourceIndex,
571                                   const ResourceDesc& r,
572                                   void* buffer,
573                                   size_t bufferSize) const override {
574             // Only initialize the input storage buffer.
575             if (resourceIndex != 1) {
576                 return;
577             }
578             SkASSERT(r.fFlow == DataFlow::kPrivate);
579             size_t dataCount = sizeof(float) * kProblemSize;
580             SkASSERT(bufferSize == dataCount);
581             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
582             for (unsigned int i = 0; i < kProblemSize; ++i) {
583                 inData[i] = i + 1;
584             }
585         }
586 
587         void prepareUniformBuffer(int resourceIndex,
588                                   const ResourceDesc&,
589                                   UniformManager* mgr) const override {
590             SkASSERT(resourceIndex == 0);
591             SkDEBUGCODE(
592                 const Uniform uniforms[] = {{"factor", SkSLType::kFloat}};
593                 mgr->setExpectedUniforms(uniforms);
594             )
595             mgr->write(kFactor);
596         }
597 
598         WorkgroupSize calculateGlobalDispatchSize() const override {
599             return WorkgroupSize(1, 1, 1);
600         }
601     } step;
602 
603     DispatchGroup::Builder builder(recorder.get());
604     if (!builder.appendStep(&step)) {
605         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
606         return;
607     }
608 
609     // The output buffer should have been placed in the right output slot.
610     BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
611     if (!outputInfo) {
612         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
613         return;
614     }
615 
616     // Record the compute task
617     ComputeTask::DispatchGroupList groups;
618     groups.push_back(builder.finalize());
619     recorder->priv().add(ComputeTask::Make(std::move(groups)));
620 
621     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
622     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
623 
624     // Submit the work and wait for it to complete.
625     std::unique_ptr<Recording> recording = recorder->snap();
626     if (!recording) {
627         ERRORF(reporter, "Failed to make recording");
628         return;
629     }
630 
631     InsertRecordingInfo insertInfo;
632     insertInfo.fRecording = recording.get();
633     context->insertRecording(insertInfo);
634     testContext->syncedSubmit(context);
635 
636     // Verify the contents of the output buffer.
637     float* outData = static_cast<float*>(
638             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
639     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
640     for (unsigned int i = 0; i < kProblemSize; ++i) {
641         const float expected = (i + 1) * kFactor;
642         const float found = outData[i];
643         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
644     }
645 }
646 
647 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
648 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ExternallyAssignedBuffer,reporter,context,testContext)649 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ExternallyAssignedBuffer,
650                                               reporter,
651                                               context,
652                                               testContext) {
653     constexpr uint32_t kProblemSize = 512;
654     constexpr float kFactor = 4.f;
655 
656     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
657     // processes 1 vector at a time.
658     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
659 
660     std::unique_ptr<Recorder> recorder = context->makeRecorder();
661 
662     class TestComputeStep : public ComputeStep {
663     public:
664         TestComputeStep() : ComputeStep(
665                 /*name=*/"ExternallyAssignedBuffer",
666                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
667                 /*resources=*/{
668                     // Input buffer:
669                     {
670                         /*type=*/ResourceType::kStorageBuffer,
671                         /*flow=*/DataFlow::kPrivate,
672                         /*policy=*/ResourcePolicy::kMapped,
673                         /*sksl=*/"inputBlock {\n"
674                                  "    float factor;\n"
675                                  "    layout(offset = 16) float4 in_data[];\n"
676                                  "}\n",
677                     },
678                     // Output buffer:
679                     {
680                         /*type=*/ResourceType::kStorageBuffer,
681                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
682                                                      // Builder
683                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
684                         /*slot=*/0,
685                         /*sksl=*/"outputBlock { float4 out_data[]; }",
686                     }
687                 }) {}
688         ~TestComputeStep() override = default;
689 
690         // A kernel that multiplies a large array of floats by a supplied factor.
691         std::string computeSkSL() const override {
692             return R"(
693                 void main() {
694                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x] * factor;
695                 }
696             )";
697         }
698 
699         size_t calculateBufferSize(int resourceIndex, const ResourceDesc& r) const override {
700             SkASSERT(resourceIndex == 0);
701             SkASSERT(r.fFlow == DataFlow::kPrivate);
702             return sizeof(float) * (kProblemSize + 4);
703         }
704 
705         void prepareStorageBuffer(int resourceIndex,
706                                   const ResourceDesc& r,
707                                   void* buffer,
708                                   size_t bufferSize) const override {
709             SkASSERT(resourceIndex == 0);
710             SkASSERT(r.fFlow == DataFlow::kPrivate);
711 
712             size_t dataCount = sizeof(float) * (kProblemSize + 4);
713             SkASSERT(bufferSize == dataCount);
714             SkSpan<float> inData(static_cast<float*>(buffer), dataCount);
715             inData[0] = kFactor;
716             for (unsigned int i = 0; i < kProblemSize; ++i) {
717                 inData[i + 4] = i + 1;
718             }
719         }
720     } step;
721 
722     // We allocate a buffer and directly assign it to the DispatchGroup::Builder. The ComputeStep
723     // will not participate in the creation of this buffer.
724     auto [_, outputInfo] =
725             recorder->priv().drawBufferManager()->getStoragePointer(sizeof(float) * kProblemSize);
726     REPORTER_ASSERT(reporter, outputInfo, "Failed to allocate output buffer");
727 
728     DispatchGroup::Builder builder(recorder.get());
729     builder.assignSharedBuffer({outputInfo, sizeof(float) * kProblemSize}, 0);
730 
731     // Initialize the step with a pre-determined global size
732     if (!builder.appendStep(&step, {WorkgroupSize(1, 1, 1)})) {
733         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
734         return;
735     }
736 
737     // Record the compute task
738     ComputeTask::DispatchGroupList groups;
739     groups.push_back(builder.finalize());
740     recorder->priv().add(ComputeTask::Make(std::move(groups)));
741 
742     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
743     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
744 
745     // Submit the work and wait for it to complete.
746     std::unique_ptr<Recording> recording = recorder->snap();
747     if (!recording) {
748         ERRORF(reporter, "Failed to make recording");
749         return;
750     }
751 
752     InsertRecordingInfo insertInfo;
753     insertInfo.fRecording = recording.get();
754     context->insertRecording(insertInfo);
755     testContext->syncedSubmit(context);
756 
757     // Verify the contents of the output buffer.
758     float* outData = static_cast<float*>(
759             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
760     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
761     for (unsigned int i = 0; i < kProblemSize; ++i) {
762         const float expected = (i + 1) * kFactor;
763         const float found = outData[i];
764         REPORTER_ASSERT(reporter, expected == found, "expected '%f', found '%f'", expected, found);
765     }
766 }
767 
768 // Tests the storage texture binding for a compute dispatch that writes the same color to every
769 // pixel of a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTexture,reporter,context,testContext)770 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTexture,
771                                               reporter,
772                                               context,
773                                               testContext) {
774     std::unique_ptr<Recorder> recorder = context->makeRecorder();
775 
776     // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
777     // size.
778     constexpr uint32_t kDim = 16;
779 
780     class TestComputeStep : public ComputeStep {
781     public:
782         TestComputeStep() : ComputeStep(
783                 /*name=*/"TestStorageTexture",
784                 /*localDispatchSize=*/{kDim, kDim, 1},
785                 /*resources=*/{
786                     {
787                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
788                         /*flow=*/DataFlow::kShared,
789                         /*policy=*/ResourcePolicy::kNone,
790                         /*slot=*/0,
791                         /*sksl=*/"dst",
792                     }
793                 }) {}
794         ~TestComputeStep() override = default;
795 
796         std::string computeSkSL() const override {
797             return R"(
798                 void main() {
799                     textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
800                 }
801             )";
802         }
803 
804         std::tuple<SkISize, SkColorType> calculateTextureParameters(
805                 int index, const ResourceDesc& r) const override {
806             return {{kDim, kDim}, kRGBA_8888_SkColorType};
807         }
808 
809         WorkgroupSize calculateGlobalDispatchSize() const override {
810             return WorkgroupSize(1, 1, 1);
811         }
812     } step;
813 
814     DispatchGroup::Builder builder(recorder.get());
815     if (!builder.appendStep(&step)) {
816         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
817         return;
818     }
819 
820     sk_sp<TextureProxy> texture = builder.getSharedTextureResource(0);
821     if (!texture) {
822         ERRORF(reporter, "Shared resource at slot 0 is missing");
823         return;
824     }
825 
826     // Record the compute task
827     ComputeTask::DispatchGroupList groups;
828     groups.push_back(builder.finalize());
829     recorder->priv().add(ComputeTask::Make(std::move(groups)));
830 
831     // Submit the work and wait for it to complete.
832     std::unique_ptr<Recording> recording = recorder->snap();
833     if (!recording) {
834         ERRORF(reporter, "Failed to make recording");
835         return;
836     }
837 
838     InsertRecordingInfo insertInfo;
839     insertInfo.fRecording = recording.get();
840     context->insertRecording(insertInfo);
841     testContext->syncedSubmit(context);
842 
843     SkBitmap bitmap;
844     SkImageInfo imgInfo =
845             SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
846     bitmap.allocPixels(imgInfo);
847 
848     SkPixmap pixels;
849     bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
850     REPORTER_ASSERT(reporter, peekPixelsSuccess);
851 
852     bool readPixelsSuccess = context->priv().readPixels(pixels, texture.get(), imgInfo, 0, 0);
853     REPORTER_ASSERT(reporter, readPixelsSuccess);
854 
855     for (uint32_t x = 0; x < kDim; ++x) {
856         for (uint32_t y = 0; y < kDim; ++y) {
857             SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
858             SkColor4f color = pixels.getColor4f(x, y);
859             REPORTER_ASSERT(reporter, expected == color,
860                             "At position {%u, %u}, "
861                             "expected {%.1f, %.1f, %.1f, %.1f}, "
862                             "found {%.1f, %.1f, %.1f, %.1f}",
863                             x, y,
864                             expected.fR, expected.fG, expected.fB, expected.fA,
865                             color.fR, color.fG, color.fB, color.fA);
866         }
867     }
868 }
869 
870 // Tests the readonly texture binding for a compute dispatch that random-access reads from a
871 // CPU-populated texture and copies it to a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureReadAndWrite,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 16;

    // Copies each texel of the readonly "src" texture (slot 0) to the writeonly "dst"
    // storage texture (slot 1), one texel per invocation.
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestStorageTextureReadAndWrite",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"src",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    half4 color = textureRead(src, sk_LocalInvocationID.xy);
                    textureWrite(dst, sk_LocalInvocationID.xy, color);
                }
            )";
        }

        // The "src" texture (index 0) is assigned externally via assignSharedTexture()
        // below, so the builder should only ask us to size "dst" (index 1).
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        // A single workgroup covers the whole tile (local size is kDim x kDim).
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    // Create and populate an input texture.
    SkBitmap srcBitmap;
    SkImageInfo srcInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    srcBitmap.allocPixels(srcInfo);
    SkPixmap srcPixels;
    bool srcPeekPixelsSuccess = srcBitmap.peekPixels(&srcPixels);
    REPORTER_ASSERT(reporter, srcPeekPixelsSuccess);
    // Fill the source with a per-pixel gradient so read-back can verify each texel
    // individually rather than a single constant color.
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            *srcPixels.writable_addr32(x, y) =
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0);
        }
    }

    auto texInfo = context->priv().caps()->getDefaultSampledTextureInfo(kRGBA_8888_SkColorType,
                                                                        skgpu::Mipmapped::kNo,
                                                                        skgpu::Protected::kNo,
                                                                        skgpu::Renderable::kNo);
    sk_sp<TextureProxy> srcProxy = TextureProxy::Make(context->priv().caps(),
                                                      recorder->priv().resourceProvider(),
                                                      {kDim, kDim},
                                                      texInfo,
                                                      "ComputeTestSrcProxy",
                                                      skgpu::Budgeted::kNo);
    // Record an upload of the CPU pixels into the source proxy. The upload task must
    // be added to the recorder before the compute task so it executes first.
    MipLevel mipLevel;
    mipLevel.fPixels = srcPixels.addr();
    mipLevel.fRowBytes = srcPixels.rowBytes();
    UploadInstance upload = UploadInstance::Make(recorder.get(),
                                                 srcProxy,
                                                 srcPixels.info().colorInfo(),
                                                 srcPixels.info().colorInfo(),
                                                 {mipLevel},
                                                 SkIRect::MakeWH(kDim, kDim),
                                                 std::make_unique<ImageUploadContext>());
    if (!upload.isValid()) {
        ERRORF(reporter, "Could not create UploadInstance");
        return;
    }
    recorder->priv().add(UploadTask::Make(std::move(upload)));

    DispatchGroup::Builder builder(recorder.get());

    // Assign the input texture to slot 0. This corresponds to the ComputeStep's "src" texture
    // binding.
    builder.assignSharedTexture(std::move(srcProxy), 0);

    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // "dst" (slot 1) was instantiated by the builder; keep it for read-back.
    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read the destination texture back into a CPU bitmap.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    // The kernel is a straight copy, so each read-back pixel must match the packed
    // value written into the source bitmap above (interpreted as RGBA bytes, the same
    // interpretation getColor4f applies to the kRGBA_8888 pixmap).
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected = SkColor4f::FromBytes_RGBA(
                    SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            REPORTER_ASSERT(reporter, expected == color,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
1027 
// Tests the read-only storage buffer binding: the dispatch reads CPU-populated packed
// RGBA values from a storage buffer and unpacks them into a storage texture.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ReadOnlyStorageBuffer,
                                              reporter,
                                              context,
                                              testContext) {
    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
    // size.
    constexpr uint32_t kDim = 16;

    // Reads one packed uint per invocation from "src" (slot 0), unpacks it as four
    // little-endian RGBA bytes, and writes the normalized color to "dst" (slot 1).
    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestReadOnlyStorageBuffer",
                /*localDispatchSize=*/{kDim, kDim, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kReadOnlyStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"src { uint in_data[]; }",
                    },
                    {
                        /*type=*/ResourceType::kWriteOnlyStorageTexture,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"dst",
                    }
                }) {}
        ~TestComputeStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    uint ix = sk_LocalInvocationID.y * 16 + sk_LocalInvocationID.x;
                    uint value = in_data[ix];
                    half4 splat = half4(
                        half(value & 0xFF),
                        half((value >> 8) & 0xFF),
                        half((value >> 16) & 0xFF),
                        half((value >> 24) & 0xFF)
                    );
                    textureWrite(dst, sk_LocalInvocationID.xy, splat / 255.0);
                }
            )";
        }

        // One uint per texel of the kDim x kDim tile.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            return kDim * kDim * sizeof(uint32_t);
        }

        // Fills the mapped buffer row-major with values packed to match the shader's
        // unpacking: byte 0 = x gradient (red), byte 1 = y gradient (green),
        // byte 3 = 255 (alpha); byte 2 (blue) stays 0.
        void prepareStorageBuffer(int index,
                                  const ResourceDesc&,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(index == 0);
            SkASSERT(bufferSize == kDim * kDim * sizeof(uint32_t));

            uint32_t* inputs = reinterpret_cast<uint32_t*>(buffer);
            for (uint32_t y = 0; y < kDim; ++y) {
                for (uint32_t x = 0; x < kDim; ++x) {
                    uint32_t value =
                            ((x * 256 / kDim) & 0xFF) | ((y * 256 / kDim) & 0xFF) << 8 | 255 << 24;
                    *(inputs++) = value;
                }
            }
        }

        // Only "dst" (index 1) is a texture; "src" (index 0) is sized by
        // calculateBufferSize() above.
        std::tuple<SkISize, SkColorType> calculateTextureParameters(
                int index, const ResourceDesc& r) const override {
            SkASSERT(index == 1);
            return {{kDim, kDim}, kRGBA_8888_SkColorType};
        }

        // A single workgroup covers the whole tile (local size is kDim x kDim).
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    if (!builder.appendStep(&step)) {
        ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
        return;
    }

    // Keep the builder-instantiated output texture for read-back.
    sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
    if (!dst) {
        ERRORF(reporter, "shared resource at slot 1 is missing");
        return;
    }

    // Record the compute task
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Read the output texture back into a CPU bitmap.
    SkBitmap bitmap;
    SkImageInfo imgInfo =
            SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
    bitmap.allocPixels(imgInfo);

    SkPixmap pixels;
    bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
    REPORTER_ASSERT(reporter, peekPixelsSuccess);

    bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
    REPORTER_ASSERT(reporter, readPixelsSuccess);

    // Each pixel should equal the gradient color packed into the buffer at the same
    // (x, y). The comparison is done per component here rather than with operator==.
    for (uint32_t x = 0; x < kDim; ++x) {
        for (uint32_t y = 0; y < kDim; ++y) {
            SkColor4f expected =
                    SkColor4f::FromColor(SkColorSetARGB(255, x * 256 / kDim, y * 256 / kDim, 0));
            SkColor4f color = pixels.getColor4f(x, y);
            bool pass = true;
            for (int i = 0; i < 4; i++) {
                pass &= color[i] == expected[i];
            }
            REPORTER_ASSERT(reporter, pass,
                            "At position {%u, %u}, "
                            "expected {%.1f, %.1f, %.1f, %.1f}, "
                            "found {%.1f, %.1f, %.1f, %.1f}",
                            x, y,
                            expected.fR, expected.fG, expected.fB, expected.fA,
                            color.fR, color.fG, color.fB, color.fA);
        }
    }
}
1170 
1171 // Tests that a texture written by one compute step can be sampled by a subsequent step.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,reporter,context,testContext)1172 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_StorageTextureMultipleComputeSteps,
1173                                               reporter,
1174                                               context,
1175                                               testContext) {
1176     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1177 
1178     // For this test we allocate a 16x16 tile which is written to by a single workgroup of the same
1179     // size.
1180     constexpr uint32_t kDim = 16;
1181 
1182     // Writes to a texture in slot 0.
1183     class TestComputeStep1 : public ComputeStep {
1184     public:
1185         TestComputeStep1() : ComputeStep(
1186                 /*name=*/"TestStorageTexturesFirstPass",
1187                 /*localDispatchSize=*/{kDim, kDim, 1},
1188                 /*resources=*/{
1189                     {
1190                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1191                         /*flow=*/DataFlow::kShared,
1192                         /*policy=*/ResourcePolicy::kNone,
1193                         /*slot=*/0,
1194                         /*sksl=*/"dst",
1195                     }
1196                 }) {}
1197         ~TestComputeStep1() override = default;
1198 
1199         std::string computeSkSL() const override {
1200             return R"(
1201                 void main() {
1202                     textureWrite(dst, sk_LocalInvocationID.xy, half4(0.0, 1.0, 0.0, 1.0));
1203                 }
1204             )";
1205         }
1206 
1207         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1208                 int index, const ResourceDesc& r) const override {
1209             SkASSERT(index == 0);
1210             return {{kDim, kDim}, kRGBA_8888_SkColorType};
1211         }
1212 
1213         WorkgroupSize calculateGlobalDispatchSize() const override {
1214             return WorkgroupSize(1, 1, 1);
1215         }
1216     } step1;
1217 
1218     // Reads from the texture in slot 0 and writes it to another texture in slot 1.
1219     class TestComputeStep2 : public ComputeStep {
1220     public:
1221         TestComputeStep2() : ComputeStep(
1222                 /*name=*/"TestStorageTexturesSecondPass",
1223                 /*localDispatchSize=*/{kDim, kDim, 1},
1224                 /*resources=*/{
1225                     {
1226                         /*type=*/ResourceType::kReadOnlyTexture,
1227                         /*flow=*/DataFlow::kShared,
1228                         /*policy=*/ResourcePolicy::kNone,
1229                         /*slot=*/0,
1230                         /*sksl=*/"src",
1231                     },
1232                     {
1233                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1234                         /*flow=*/DataFlow::kShared,
1235                         /*policy=*/ResourcePolicy::kNone,
1236                         /*slot=*/1,
1237                         /*sksl=*/"dst",
1238                     }
1239                 }) {}
1240         ~TestComputeStep2() override = default;
1241 
1242         std::string computeSkSL() const override {
1243             return R"(
1244                 void main() {
1245                     half4 color = textureRead(src, sk_LocalInvocationID.xy);
1246                     textureWrite(dst, sk_LocalInvocationID.xy, color);
1247                 }
1248             )";
1249         }
1250 
1251         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1252                 int index, const ResourceDesc& r) const override {
1253             SkASSERT(index == 1);
1254             return {{kDim, kDim}, kRGBA_8888_SkColorType};
1255         }
1256 
1257         WorkgroupSize calculateGlobalDispatchSize() const override {
1258             return WorkgroupSize(1, 1, 1);
1259         }
1260     } step2;
1261 
1262     DispatchGroup::Builder builder(recorder.get());
1263     builder.appendStep(&step1);
1264     builder.appendStep(&step2);
1265 
1266     sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
1267     if (!dst) {
1268         ERRORF(reporter, "shared resource at slot 1 is missing");
1269         return;
1270     }
1271 
1272     // Record the compute task
1273     ComputeTask::DispatchGroupList groups;
1274     groups.push_back(builder.finalize());
1275     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1276 
1277     // Submit the work and wait for it to complete.
1278     std::unique_ptr<Recording> recording = recorder->snap();
1279     if (!recording) {
1280         ERRORF(reporter, "Failed to make recording");
1281         return;
1282     }
1283 
1284     InsertRecordingInfo insertInfo;
1285     insertInfo.fRecording = recording.get();
1286     context->insertRecording(insertInfo);
1287     testContext->syncedSubmit(context);
1288 
1289     SkBitmap bitmap;
1290     SkImageInfo imgInfo =
1291             SkImageInfo::Make(kDim, kDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
1292     bitmap.allocPixels(imgInfo);
1293 
1294     SkPixmap pixels;
1295     bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
1296     REPORTER_ASSERT(reporter, peekPixelsSuccess);
1297 
1298     bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
1299     REPORTER_ASSERT(reporter, readPixelsSuccess);
1300 
1301     for (uint32_t x = 0; x < kDim; ++x) {
1302         for (uint32_t y = 0; y < kDim; ++y) {
1303             SkColor4f expected = SkColor4f::FromColor(SK_ColorGREEN);
1304             SkColor4f color = pixels.getColor4f(x, y);
1305             REPORTER_ASSERT(reporter, expected == color,
1306                             "At position {%u, %u}, "
1307                             "expected {%.1f, %.1f, %.1f, %.1f}, "
1308                             "found {%.1f, %.1f, %.1f, %.1f}",
1309                             x, y,
1310                             expected.fR, expected.fG, expected.fB, expected.fA,
1311                             color.fR, color.fG, color.fB, color.fA);
1312         }
1313     }
1314 }
1315 
1316 // Tests that a texture can be sampled by a compute step using a sampler.
1317 // TODO(armansito): Once the previous TODO is done, add additional tests that exercise mixed use of
1318 // texture, buffer, and sampler bindings.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SampledTexture,reporter,context,testContext)1319 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_SampledTexture,
1320                                               reporter,
1321                                               context,
1322                                               testContext) {
1323     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1324 
1325     // The first ComputeStep initializes a 16x16 texture with a checkerboard pattern of alternating
1326     // red and black pixels. The second ComputeStep downsamples this texture into a 4x4 using
1327     // bilinear filtering at pixel borders, intentionally averaging the values of each 4x4 tile in
1328     // the source texture, and writes the result to the destination texture.
1329     constexpr uint32_t kSrcDim = 16;
1330     constexpr uint32_t kDstDim = 4;
1331 
1332     class TestComputeStep1 : public ComputeStep {
1333     public:
1334         TestComputeStep1() : ComputeStep(
1335                 /*name=*/"Test_SampledTexture_Init",
1336                 /*localDispatchSize=*/{kSrcDim, kSrcDim, 1},
1337                 /*resources=*/{
1338                     {
1339                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1340                         /*flow=*/DataFlow::kShared,
1341                         /*policy=*/ResourcePolicy::kNone,
1342                         /*slot=*/0,
1343                         /*sksl=*/"dst",
1344                     }
1345                 }) {}
1346         ~TestComputeStep1() override = default;
1347 
1348         std::string computeSkSL() const override {
1349             return R"(
1350                 void main() {
1351                     uint2 c = sk_LocalInvocationID.xy;
1352                     uint checkerBoardColor = (c.x + (c.y % 2)) % 2;
1353                     textureWrite(dst, c, half4(checkerBoardColor, 0, 0, 1));
1354                 }
1355             )";
1356         }
1357 
1358         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1359                 int index, const ResourceDesc& r) const override {
1360             SkASSERT(index == 0);
1361             return {{kSrcDim, kSrcDim}, kRGBA_8888_SkColorType};
1362         }
1363 
1364         WorkgroupSize calculateGlobalDispatchSize() const override {
1365             return WorkgroupSize(1, 1, 1);
1366         }
1367     } step1;
1368 
1369     class TestComputeStep2 : public ComputeStep {
1370     public:
1371         TestComputeStep2() : ComputeStep(
1372                 /*name=*/"Test_SampledTexture_Sample",
1373                 /*localDispatchSize=*/{kDstDim, kDstDim, 1},
1374                 /*resources=*/{
1375                     // Declare the storage texture before the sampled texture. This tests that
1376                     // binding index assignment works consistently across all backends when a
1377                     // sampler-less texture and a texture+sampler pair are intermixed and sampler
1378                     // bindings aren't necessarily contiguous when the ranges are distinct.
1379                     {
1380                         /*type=*/ResourceType::kWriteOnlyStorageTexture,
1381                         /*flow=*/DataFlow::kShared,
1382                         /*policy=*/ResourcePolicy::kNone,
1383                         /*slot=*/1,
1384                         /*sksl=*/"dst",
1385                     },
1386                     {
1387                         /*type=*/ResourceType::kSampledTexture,
1388                         /*flow=*/DataFlow::kShared,
1389                         /*policy=*/ResourcePolicy::kNone,
1390                         /*slot=*/0,
1391                         /*sksl=*/"src",
1392                     }
1393                 }) {}
1394         ~TestComputeStep2() override = default;
1395 
1396         std::string computeSkSL() const override {
1397             return R"(
1398                 void main() {
1399                     // Normalize the 4x4 invocation indices and sample the source texture using
1400                     // that.
1401                     uint2 dstCoord = sk_LocalInvocationID.xy;
1402                     const float2 dstSizeInv = float2(0.25, 0.25);
1403                     float2 unormCoord = float2(dstCoord) * dstSizeInv;
1404 
1405                     // Use explicit LOD, as quad derivatives are not available to a compute shader.
1406                     half4 color = sampleLod(src, unormCoord, 0);
1407                     textureWrite(dst, dstCoord, color);
1408                 }
1409             )";
1410         }
1411 
1412         std::tuple<SkISize, SkColorType> calculateTextureParameters(
1413                 int index, const ResourceDesc& r) const override {
1414             SkASSERT(index == 0 || index == 1);
1415             return {{kDstDim, kDstDim}, kRGBA_8888_SkColorType};
1416         }
1417 
1418         SamplerDesc calculateSamplerParameters(int index, const ResourceDesc&) const override {
1419             SkASSERT(index == 1);
1420             // Use the repeat tile mode to sample an infinite checkerboard.
1421             constexpr SkTileMode kTileModes[2] = {SkTileMode::kRepeat, SkTileMode::kRepeat};
1422             return {SkFilterMode::kLinear, kTileModes};
1423         }
1424 
1425         WorkgroupSize calculateGlobalDispatchSize() const override {
1426             return WorkgroupSize(1, 1, 1);
1427         }
1428     } step2;
1429 
1430     DispatchGroup::Builder builder(recorder.get());
1431     builder.appendStep(&step1);
1432     builder.appendStep(&step2);
1433 
1434     sk_sp<TextureProxy> dst = builder.getSharedTextureResource(1);
1435     if (!dst) {
1436         ERRORF(reporter, "shared resource at slot 1 is missing");
1437         return;
1438     }
1439 
1440     // Record the compute task
1441     ComputeTask::DispatchGroupList groups;
1442     groups.push_back(builder.finalize());
1443     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1444 
1445     // Submit the work and wait for it to complete.
1446     std::unique_ptr<Recording> recording = recorder->snap();
1447     if (!recording) {
1448         ERRORF(reporter, "Failed to make recording");
1449         return;
1450     }
1451 
1452     InsertRecordingInfo insertInfo;
1453     insertInfo.fRecording = recording.get();
1454     context->insertRecording(insertInfo);
1455     testContext->syncedSubmit(context);
1456 
1457     SkBitmap bitmap;
1458     SkImageInfo imgInfo =
1459             SkImageInfo::Make(kDstDim, kDstDim, kRGBA_8888_SkColorType, kUnpremul_SkAlphaType);
1460     bitmap.allocPixels(imgInfo);
1461 
1462     SkPixmap pixels;
1463     bool peekPixelsSuccess = bitmap.peekPixels(&pixels);
1464     REPORTER_ASSERT(reporter, peekPixelsSuccess);
1465 
1466     bool readPixelsSuccess = context->priv().readPixels(pixels, dst.get(), imgInfo, 0, 0);
1467     REPORTER_ASSERT(reporter, readPixelsSuccess);
1468 
1469     for (uint32_t x = 0; x < kDstDim; ++x) {
1470         for (uint32_t y = 0; y < kDstDim; ++y) {
1471             SkColor4f color = pixels.getColor4f(x, y);
1472             REPORTER_ASSERT(reporter, color.fR > 0.49 && color.fR < 0.51,
1473                             "At position {%u, %u}, "
1474                             "expected red channel in range [0.49, 0.51], "
1475                             "found {%.3f}",
1476                             x, y, color.fR);
1477         }
1478     }
1479 }
1480 
1481 // TODO(b/260622403): The shader tested here is identical to
1482 // `resources/sksl/compute/AtomicsOperations.compute`. It would be nice to be able to exercise SkSL
1483 // features like this as part of SkSLTest.cpp instead of as a graphite test.
1484 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
1485 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Every invocation of every workgroup contributes 1 to a single global counter, so the
    // expected final value is kWorkgroupCount * kWorkgroupSize.
    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 256;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperations",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Single mapped SSBO holding the global counter; mapped so the test can
                    // both zero-initialize it and read the result back on the CPU.
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo { atomicUint globalCounter; }",
                    }
                }) {}
        ~TestComputeStep() override = default;

        // A kernel that increments a global (device memory) counter across multiple workgroups.
        // Each workgroup maintains its own independent tally in a workgroup-shared counter which
        // is then added to the global count.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        // The shared SSBO is exactly one 32-bit counter.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero the global counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
1610 
1611 // TODO(b/260622403): The shader tested here is identical to
1612 // `resources/sksl/compute/AtomicsOperationsOverArrayAndStruct.compute`. It would be nice to be able
1613 // to exercise SkSL features like this as part of SkSLTest.cpp instead of as a graphite test.
1614 // TODO(b/262427430, b/262429132): Enable this test on other backends once they all support
1615 // compute programs.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_AtomicOperationsOverArrayAndStructTest,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // Half the invocations of each workgroup increment the first counter, the other half the
    // second, so each counter should end at kWorkgroupCount * kWorkgroupSize / 2.
    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 256;

    class TestComputeStep : public ComputeStep {
    public:
        TestComputeStep() : ComputeStep(
                /*name=*/"TestAtomicOperationsOverArrayAndStruct",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    // Mapped SSBO holding both global counters, so the test can initialize
                    // them and read the results back on the CPU.
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kMapped,
                        /*slot=*/0,
                        /*sksl=*/"ssbo {\n"
                            "   atomicUint globalCountsFirstHalf;\n"
                            "   atomicUint globalCountsSecondHalf;\n"
                            "}\n"
                    }
                }) {}
        ~TestComputeStep() override = default;

        // Construct a kernel that increments two global (device memory) counters across multiple
        // workgroups. Each workgroup maintains its own independent tallies in workgroup-shared
        // counters which are then added to the global counts.
        //
        // This exercises atomic store/load/add and coherent reads and writes over memory in storage
        // and workgroup address spaces.
        std::string computeSkSL() const override {
            return R"(
                const uint WORKGROUP_SIZE = 256;

                workgroup atomicUint localCounts[2];

                void main() {
                    // Initialize the local counts.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounts[0], 0);
                        atomicStore(localCounts[1], 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // Each thread increments one of the local counters based on its invocation
                    // index.
                    uint idx = sk_LocalInvocationID.x < (WORKGROUP_SIZE / 2) ? 0 : 1;
                    atomicAdd(localCounts[idx], 1);

                    // Synchronize the threads again to ensure they have all executed the increments
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCountsFirstHalf, atomicLoad(localCounts[0]));
                        atomicAdd(globalCountsSecondHalf, atomicLoad(localCounts[1]));
                    }
                }
            )";
        }

        // The shared SSBO holds the two 32-bit global counters.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return 2 * sizeof(uint32_t);
        }

        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(kWorkgroupCount, 1, 1);
        }

        // Zero both global counters before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            uint32_t* data = static_cast<uint32_t*>(buffer);
            data[0] = 0;
            data[1] = 0;
        }
    } step;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&step);

    BindBufferInfo info = builder.getSharedBufferResource(0);
    if (!info) {
        ERRORF(reporter, "shared resource at slot 0 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize / 2;

    const uint32_t* ssboData = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset));
    const uint32_t firstHalfCount = ssboData[0];
    const uint32_t secondHalfCount = ssboData[1];
    REPORTER_ASSERT(reporter,
                    firstHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    firstHalfCount);
    REPORTER_ASSERT(reporter,
                    secondHalfCount == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    secondHalfCount);
}
1759 
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearedBuffer,reporter,context,testContext)1760 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearedBuffer,
1761                                               reporter,
1762                                               context,
1763                                               testContext) {
1764     constexpr uint32_t kProblemSize = 512;
1765 
1766     // The ComputeStep packs kProblemSize floats into kProblemSize / 4 vectors and each thread
1767     // processes 1 vector at a time.
1768     constexpr uint32_t kWorkgroupSize = kProblemSize / 4;
1769 
1770     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1771 
1772     // The ComputeStep requests an unmapped buffer that is zero-initialized. It writes the output to
1773     // a mapped buffer which test verifies.
1774     class TestComputeStep : public ComputeStep {
1775     public:
1776         TestComputeStep() : ComputeStep(
1777                 /*name=*/"TestClearedBuffer",
1778                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1779                 /*resources=*/{
1780                     // Zero initialized input buffer
1781                     {
1782                         // TODO(b/299979165): Declare this binding as read-only.
1783                         /*type=*/ResourceType::kStorageBuffer,
1784                         /*flow=*/DataFlow::kPrivate,
1785                         /*policy=*/ResourcePolicy::kClear,
1786                         /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
1787                     },
1788                     // Output buffer:
1789                     {
1790                         /*type=*/ResourceType::kStorageBuffer,
1791                         /*flow=*/DataFlow::kShared,  // shared to allow us to access it from the
1792                                                      // Builder
1793                         /*policy=*/ResourcePolicy::kMapped,  // mappable for read-back
1794                         /*slot=*/0,
1795                         /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1796                     }
1797                 }) {}
1798         ~TestComputeStep() override = default;
1799 
1800         std::string computeSkSL() const override {
1801             return R"(
1802                 void main() {
1803                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
1804                 }
1805             )";
1806         }
1807 
1808         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
1809             return sizeof(uint32_t) * kProblemSize;
1810         }
1811 
1812         void prepareStorageBuffer(int resourceIndex,
1813                                   const ResourceDesc& r,
1814                                   void* buffer,
1815                                   size_t bufferSize) const override {
1816             // Should receive this call only for the mapped buffer.
1817             SkASSERT(resourceIndex == 1);
1818         }
1819 
1820         WorkgroupSize calculateGlobalDispatchSize() const override {
1821             return WorkgroupSize(1, 1, 1);
1822         }
1823     } step;
1824 
1825     DispatchGroup::Builder builder(recorder.get());
1826     if (!builder.appendStep(&step)) {
1827         ERRORF(reporter, "Failed to add ComputeStep to DispatchGroup");
1828         return;
1829     }
1830 
1831     // The output buffer should have been placed in the right output slot.
1832     BindBufferInfo outputInfo = builder.getSharedBufferResource(0);
1833     if (!outputInfo) {
1834         ERRORF(reporter, "Failed to allocate an output buffer at slot 0");
1835         return;
1836     }
1837 
1838     // Record the compute task
1839     ComputeTask::DispatchGroupList groups;
1840     groups.push_back(builder.finalize());
1841     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1842 
1843     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
1844     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), outputInfo.fBuffer);
1845 
1846     // Submit the work and wait for it to complete.
1847     std::unique_ptr<Recording> recording = recorder->snap();
1848     if (!recording) {
1849         ERRORF(reporter, "Failed to make recording");
1850         return;
1851     }
1852 
1853     InsertRecordingInfo insertInfo;
1854     insertInfo.fRecording = recording.get();
1855     context->insertRecording(insertInfo);
1856     testContext->syncedSubmit(context);
1857 
1858     // Verify the contents of the output buffer.
1859     uint32_t* outData = static_cast<uint32_t*>(
1860             map_buffer(context, testContext, outputBuffer.get(), outputInfo.fOffset));
1861     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
1862     for (unsigned int i = 0; i < kProblemSize; ++i) {
1863         const uint32_t found = outData[i];
1864         REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
1865     }
1866 }
1867 
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,reporter,context,testContext)1868 DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrdering,
1869                                               reporter,
1870                                               context,
1871                                               testContext) {
1872     // Initiate two independent DispatchGroups operating on the same buffer. The first group
1873     // writes garbage to the buffer and the second group copies the contents to an output buffer.
1874     // This test validates that the reads, writes, and clear occur in the expected order.
1875     constexpr uint32_t kWorkgroupSize = 64;
1876 
1877     // Initialize buffer with non-zero data.
1878     class FillWithGarbage : public ComputeStep {
1879     public:
1880         FillWithGarbage() : ComputeStep(
1881                 /*name=*/"FillWithGarbage",
1882                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1883                 /*resources=*/{
1884                     {
1885                         /*type=*/ResourceType::kStorageBuffer,
1886                         /*flow=*/DataFlow::kShared,
1887                         /*policy=*/ResourcePolicy::kNone,
1888                         /*slot=*/0,
1889                         /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1890                     }
1891                 }) {}
1892         ~FillWithGarbage() override = default;
1893 
1894         std::string computeSkSL() const override {
1895             return R"(
1896                 void main() {
1897                     out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
1898                 }
1899             )";
1900         }
1901     } garbageStep;
1902 
1903     // Second stage just copies the data to a destination buffer. This is only to verify that this
1904     // stage, issued in a separate DispatchGroup, observes the clear.
1905     class CopyBuffer : public ComputeStep {
1906     public:
1907         CopyBuffer() : ComputeStep(
1908                 /*name=*/"CopyBuffer",
1909                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
1910                 /*resources=*/{
1911                     {
1912                         /*type=*/ResourceType::kStorageBuffer,
1913                         /*flow=*/DataFlow::kShared,
1914                         /*policy=*/ResourcePolicy::kNone,
1915                         /*slot=*/0,
1916                         /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
1917                     },
1918                     {
1919                         /*type=*/ResourceType::kStorageBuffer,
1920                         /*flow=*/DataFlow::kShared,
1921                         /*policy=*/ResourcePolicy::kNone,
1922                         /*slot=*/1,
1923                         /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
1924                     }
1925                 }) {}
1926         ~CopyBuffer() override = default;
1927 
1928         std::string computeSkSL() const override {
1929             return R"(
1930                 void main() {
1931                     out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
1932                 }
1933             )";
1934         }
1935     } copyStep;
1936 
1937     std::unique_ptr<Recorder> recorder = context->makeRecorder();
1938     DispatchGroup::Builder builder(recorder.get());
1939 
1940     constexpr size_t kElementCount = 4 * kWorkgroupSize;
1941     constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
1942     auto input = recorder->priv().drawBufferManager()->getStorage(kBufferSize);
1943     auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);
1944 
1945     ComputeTask::DispatchGroupList groups;
1946 
1947     // First group.
1948     builder.assignSharedBuffer({input, kBufferSize}, 0);
1949     builder.appendStep(&garbageStep, {{1, 1, 1}});
1950     groups.push_back(builder.finalize());
1951 
1952     // Second group.
1953     builder.reset();
1954     builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
1955     builder.assignSharedBuffer({output, kBufferSize}, 1);
1956     builder.appendStep(&copyStep, {{1, 1, 1}});
1957     groups.push_back(builder.finalize());
1958 
1959     recorder->priv().add(ComputeTask::Make(std::move(groups)));
1960     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
1961     auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);
1962 
1963     // Submit the work and wait for it to complete.
1964     std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
1965     if (!recording) {
1966         ERRORF(reporter, "Failed to make recording");
1967         return;
1968     }
1969 
1970     // Verify the contents of the output buffer.
1971     uint32_t* outData = static_cast<uint32_t*>(
1972             map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
1973     SkASSERT(outputBuffer->isMapped() && outData != nullptr);
1974     for (unsigned int i = 0; i < kElementCount; ++i) {
1975         const uint32_t found = outData[i];
1976         REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
1977     }
1978 }
1979 
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_ClearOrderingScratchBuffers,
                                              reporter,
                                              context,
                                              testContext) {
    // This test is the same as the ClearOrdering test but the two stages write to a recycled
    // ScratchBuffer. This is primarily to test ScratchBuffer reuse.
    constexpr uint32_t kWorkgroupSize = 64;

    // Initialize buffer with non-zero data.
    class FillWithGarbage : public ComputeStep {
    public:
        FillWithGarbage() : ComputeStep(
                /*name=*/"FillWithGarbage",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~FillWithGarbage() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = uint4(0xFE);
                }
            )";
        }
    } garbageStep;

    // Second stage just copies the data to a destination buffer. This is only to verify that this
    // stage (issued in a separate DispatchGroup) sees the changes.
    class CopyBuffer : public ComputeStep {
    public:
        CopyBuffer() : ComputeStep(
                /*name=*/"CopyBuffer",
                /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                /*resources=*/{
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/0,
                        /*sksl=*/"inputBlock { uint4 in_data[]; }\n",
                    },
                    {
                        /*type=*/ResourceType::kStorageBuffer,
                        /*flow=*/DataFlow::kShared,
                        /*policy=*/ResourcePolicy::kNone,
                        /*slot=*/1,
                        /*sksl=*/"outputBlock { uint4 out_data[]; }\n",
                    }
                }) {}
        ~CopyBuffer() override = default;

        std::string computeSkSL() const override {
            return R"(
                void main() {
                    out_data[sk_GlobalInvocationID.x] = in_data[sk_GlobalInvocationID.x];
                }
            )";
        }
    } copyStep;

    std::unique_ptr<Recorder> recorder = context->makeRecorder();
    DispatchGroup::Builder builder(recorder.get());

    constexpr size_t kElementCount = 4 * kWorkgroupSize;
    constexpr size_t kBufferSize = sizeof(uint32_t) * kElementCount;
    auto [_, output] = recorder->priv().drawBufferManager()->getStoragePointer(kBufferSize);

    ComputeTask::DispatchGroupList groups;

    // First group.
    {
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer({input, kBufferSize}, 0);

        // `scratch` returns to the scratch buffer pool when it goes out of scope
    }
    builder.appendStep(&garbageStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    // Second group.
    builder.reset();
    {
        // NOTE(review): since the first scratch allocation returned to the pool above, this
        // presumably recycles the same underlying buffer; the kYes policy must then clear the
        // garbage before the copy runs — confirm against DrawBufferManager's pooling behavior.
        auto scratch = recorder->priv().drawBufferManager()->getScratchStorage(kBufferSize);
        auto input = scratch.suballocate(kBufferSize);
        builder.assignSharedBuffer({input, kBufferSize}, 0, ClearBuffer::kYes);
    }
    builder.assignSharedBuffer({output, kBufferSize}, 1);
    builder.appendStep(&copyStep, {{1, 1, 1}});
    groups.push_back(builder.finalize());

    recorder->priv().add(ComputeTask::Make(std::move(groups)));
    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto outputBuffer = sync_buffer_to_cpu(recorder.get(), output.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = submit_recording(context, testContext, recorder.get());
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    // Verify the contents of the output buffer: all zero if the clear preceded the copy.
    uint32_t* outData = static_cast<uint32_t*>(
            map_buffer(context, testContext, outputBuffer.get(), output.fOffset));
    SkASSERT(outputBuffer->isMapped() && outData != nullptr);
    for (unsigned int i = 0; i < kElementCount; ++i) {
        const uint32_t found = outData[i];
        REPORTER_ASSERT(reporter, 0u == found, "expected '0u', found '%u'", found);
    }
}
2099 
// Exercises GPU-driven ("indirect") dispatch: `IndirectStep` runs first and writes the
// workgroup counts into an indirect-args buffer on the GPU; `CountStep` is then recorded
// with `DispatchGroup::Builder::appendStepIndirect()` so its grid size comes from that
// buffer at execution time rather than from the CPU.
DEF_GRAPHITE_TEST_FOR_DAWN_AND_METAL_CONTEXTS(Compute_IndirectDispatch,
                                              reporter,
                                              context,
                                              testContext) {
    // This fails on Dawn D3D11, b/315834710
    if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
        return;
    }

    std::unique_ptr<Recorder> recorder = context->makeRecorder();

    // `kWorkgroupCount` must stay in sync with the constant of the same name hard-coded
    // inside IndirectStep's SkSL below (the SkSL cannot reference this C++ declaration).
    constexpr uint32_t kWorkgroupCount = 32;
    constexpr uint32_t kWorkgroupSize = 64;

    // `IndirectStep` populates a buffer with the global workgroup count for `CountStep`.
    // `CountStep` is recorded using `DispatchGroup::appendStepIndirect()` and its workgroups get
    // dispatched according to the values computed by `IndirectStep` on the GPU.
    class IndirectStep : public ComputeStep {
    public:
        IndirectStep()
                : ComputeStep(
                          /*name=*/"TestIndirectDispatch_IndirectStep",
                          /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                          /*resources=*/
                          {{
                                  /*type=*/ResourceType::kIndirectBuffer,
                                  /*flow=*/DataFlow::kShared,
                                  // kClear so the args buffer starts zeroed before the kernel
                                  // fills in the workgroup counts.
                                  /*policy=*/ResourcePolicy::kClear,
                                  /*slot=*/0,
                                  // TODO(armansito): Ideally the SSBO would have a single member of
                                  // type `IndirectDispatchArgs` struct type. SkSL modules don't
                                  // support struct declarations so this is currently not possible.
                                  /*sksl=*/"ssbo { uint indirect[]; }",
                          }}) {}
        ~IndirectStep() override = default;

        // Kernel that specifies a workgroup size of `kWorkgroupCount` to be used by the indirect
        // dispatch.
        std::string computeSkSL() const override {
            return R"(
                // This needs to match `kWorkgroupCount` declared above.
                const uint kWorkgroupCount = 32;

                void main() {
                    if (sk_LocalInvocationID.x == 0) {
                        indirect[0] = kWorkgroupCount;
                        indirect[1] = 1;
                        indirect[2] = 1;
                    }
                }
            )";
        }

        // Sizes the shared indirect-args buffer (slot 0); the kernel above writes
        // entries [0..2] of it (the x, y, z workgroup counts).
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 0);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return kIndirectDispatchArgumentSize;
        }

        // A single workgroup suffices: only invocation 0 writes the args.
        WorkgroupSize calculateGlobalDispatchSize() const override {
            return WorkgroupSize(1, 1, 1);
        }
    } indirectStep;

    // Atomically tallies one count per invocation into a global counter. The number of
    // invocations is determined by the indirect args that `IndirectStep` produced.
    class CountStep : public ComputeStep {
    public:
        CountStep()
                : ComputeStep(
                          /*name=*/"TestIndirectDispatch_CountStep",
                          /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
                          /*resources=*/
                          {{
                                  /*type=*/ResourceType::kStorageBuffer,
                                  /*flow=*/DataFlow::kShared,
                                  // kMapped so `prepareStorageBuffer` below can initialize it.
                                  /*policy=*/ResourcePolicy::kMapped,
                                  /*slot=*/1,
                                  /*sksl=*/"ssbo { atomicUint globalCounter; }",
                          }}) {}
        ~CountStep() override = default;

        std::string computeSkSL() const override {
            return R"(
                workgroup atomicUint localCounter;

                void main() {
                    // Initialize the local counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicStore(localCounter, 0);
                    }

                    // Synchronize the threads in the workgroup so they all see the initial value.
                    workgroupBarrier();

                    // All threads increment the counter.
                    atomicAdd(localCounter, 1);

                    // Synchronize the threads again to ensure they have all executed the increment
                    // and the following load reads the same value across all threads in the
                    // workgroup.
                    workgroupBarrier();

                    // Add the workgroup-only tally to the global counter.
                    if (sk_LocalInvocationID.x == 0) {
                        atomicAdd(globalCounter, atomicLoad(localCounter));
                    }
                }
            )";
        }

        // The shared output buffer (slot 1) holds the single uint32_t global counter.
        size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
            SkASSERT(index == 0);
            SkASSERT(r.fSlot == 1);
            SkASSERT(r.fFlow == DataFlow::kShared);
            return sizeof(uint32_t);
        }

        // Zero the mapped counter before the dispatch runs.
        void prepareStorageBuffer(int resourceIndex,
                                  const ResourceDesc& r,
                                  void* buffer,
                                  size_t bufferSize) const override {
            SkASSERT(resourceIndex == 0);
            *static_cast<uint32_t*>(buffer) = 0;
        }
    } countStep;

    DispatchGroup::Builder builder(recorder.get());
    builder.appendStep(&indirectStep);
    // The indirect-args buffer that `IndirectStep` populates lives in shared slot 0.
    BindBufferInfo indirectBufferInfo = builder.getSharedBufferResource(0);
    if (!indirectBufferInfo) {
        ERRORF(reporter, "Shared resource at slot 0 is missing");
        return;
    }
    // Record `CountStep` so its workgroup counts are read from the indirect buffer at
    // execution time instead of being supplied here.
    builder.appendStepIndirect(&countStep, {indirectBufferInfo, kIndirectDispatchArgumentSize});

    BindBufferInfo info = builder.getSharedBufferResource(1);
    if (!info) {
        ERRORF(reporter, "Shared resource at slot 1 is missing");
        return;
    }

    // Record the compute pass task.
    ComputeTask::DispatchGroupList groups;
    groups.push_back(builder.finalize());
    recorder->priv().add(ComputeTask::Make(std::move(groups)));

    // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
    auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);

    // Submit the work and wait for it to complete.
    std::unique_ptr<Recording> recording = recorder->snap();
    if (!recording) {
        ERRORF(reporter, "Failed to make recording");
        return;
    }

    InsertRecordingInfo insertInfo;
    insertInfo.fRecording = recording.get();
    context->insertRecording(insertInfo);
    testContext->syncedSubmit(context);

    // Verify the contents of the output buffer.
    // Each of the kWorkgroupCount * kWorkgroupSize invocations contributes exactly 1.
    constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
    const uint32_t result = static_cast<const uint32_t*>(
            map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
    REPORTER_ASSERT(reporter,
                    result == kExpectedCount,
                    "expected '%u', found '%u'",
                    kExpectedCount,
                    result);
}
2271 
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal,reporter,context,testContext)2272 DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_NativeShaderSourceMetal,
2273                                     reporter,
2274                                     context,
2275                                     testContext) {
2276     std::unique_ptr<Recorder> recorder = context->makeRecorder();
2277 
2278     constexpr uint32_t kWorkgroupCount = 32;
2279     constexpr uint32_t kWorkgroupSize = 1024;
2280 
2281     class TestComputeStep : public ComputeStep {
2282     public:
2283         TestComputeStep() : ComputeStep(
2284                 /*name=*/"TestAtomicOperationsMetal",
2285                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2286                 /*resources=*/{
2287                     {
2288                         /*type=*/ResourceType::kStorageBuffer,
2289                         /*flow=*/DataFlow::kShared,
2290                         /*policy=*/ResourcePolicy::kMapped,
2291                         /*slot=*/0,
2292                     }
2293                 },
2294                 /*workgroupBuffers=*/{},
2295                 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2296         ~TestComputeStep() override = default;
2297 
2298         NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2299             SkASSERT(format == NativeShaderFormat::kMSL);
2300             static constexpr std::string_view kSource = R"(
2301                 #include <metal_stdlib>
2302 
2303                 using namespace metal;
2304 
2305                 kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
2306                                         device atomic_uint& globalCounter [[buffer(0)]]) {
2307                     threadgroup atomic_uint localCounter;
2308 
2309                     // Initialize the local counter.
2310                     if (localId.x == 0u) {
2311                         atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
2312                     }
2313 
2314                     // Synchronize the threads in the workgroup so they all see the initial value.
2315                     threadgroup_barrier(mem_flags::mem_threadgroup);
2316 
2317                     // All threads increment the counter.
2318                     atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
2319 
2320                     // Synchronize the threads again to ensure they have all executed the increment
2321                     // and the following load reads the same value across all threads in the
2322                     // workgroup.
2323                     threadgroup_barrier(mem_flags::mem_threadgroup);
2324 
2325                     // Add the workgroup-only tally to the global counter.
2326                     if (localId.x == 0u) {
2327                         uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
2328                         atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
2329                     }
2330                 }
2331             )";
2332             return {kSource, "atomicCount"};
2333         }
2334 
2335         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2336             SkASSERT(index == 0);
2337             SkASSERT(r.fSlot == 0);
2338             SkASSERT(r.fFlow == DataFlow::kShared);
2339             return sizeof(uint32_t);
2340         }
2341 
2342         WorkgroupSize calculateGlobalDispatchSize() const override {
2343             return WorkgroupSize(kWorkgroupCount, 1, 1);
2344         }
2345 
2346         void prepareStorageBuffer(int resourceIndex,
2347                                   const ResourceDesc& r,
2348                                   void* buffer,
2349                                   size_t bufferSize) const override {
2350             SkASSERT(resourceIndex == 0);
2351             *static_cast<uint32_t*>(buffer) = 0;
2352         }
2353     } step;
2354 
2355     DispatchGroup::Builder builder(recorder.get());
2356     builder.appendStep(&step);
2357 
2358     BindBufferInfo info = builder.getSharedBufferResource(0);
2359     if (!info) {
2360         ERRORF(reporter, "shared resource at slot 0 is missing");
2361         return;
2362     }
2363 
2364     // Record the compute pass task.
2365     ComputeTask::DispatchGroupList groups;
2366     groups.push_back(builder.finalize());
2367     recorder->priv().add(ComputeTask::Make(std::move(groups)));
2368 
2369     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2370     auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2371 
2372     // Submit the work and wait for it to complete.
2373     std::unique_ptr<Recording> recording = recorder->snap();
2374     if (!recording) {
2375         ERRORF(reporter, "Failed to make recording");
2376         return;
2377     }
2378 
2379     InsertRecordingInfo insertInfo;
2380     insertInfo.fRecording = recording.get();
2381     context->insertRecording(insertInfo);
2382     testContext->syncedSubmit(context);
2383 
2384     // Verify the contents of the output buffer.
2385     constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2386     const uint32_t result = static_cast<const uint32_t*>(
2387             map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2388     REPORTER_ASSERT(reporter,
2389                     result == kExpectedCount,
2390                     "expected '%u', found '%u'",
2391                     kExpectedCount,
2392                     result);
2393 }
2394 
DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal,reporter,context,testContext)2395 DEF_GRAPHITE_TEST_FOR_METAL_CONTEXT(Compute_WorkgroupBufferDescMetal,
2396                                     reporter,
2397                                     context,
2398                                     testContext) {
2399     std::unique_ptr<Recorder> recorder = context->makeRecorder();
2400 
2401     constexpr uint32_t kWorkgroupCount = 32;
2402     constexpr uint32_t kWorkgroupSize = 1024;
2403 
2404     class TestComputeStep : public ComputeStep {
2405     public:
2406         TestComputeStep() : ComputeStep(
2407                 /*name=*/"TestAtomicOperationsMetal",
2408                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2409                 /*resources=*/{
2410                     {
2411                         /*type=*/ResourceType::kStorageBuffer,
2412                         /*flow=*/DataFlow::kShared,
2413                         /*policy=*/ResourcePolicy::kMapped,
2414                         /*slot=*/0,
2415                     }
2416                 },
2417                 /*workgroupBuffers=*/{
2418                     {
2419                         /*size=*/sizeof(uint32_t),
2420                         /*index=*/0u,
2421                     }
2422                 },
2423                 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2424         ~TestComputeStep() override = default;
2425 
2426         // This is the same MSL kernel as in Compute_NativeShaderSourceMetal, except `localCounter`
2427         // is an entry-point parameter instead of a local variable. This forces the workgroup
2428         // binding to be encoded explicitly in the command encoder.
2429         NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2430             SkASSERT(format == NativeShaderFormat::kMSL);
2431             static constexpr std::string_view kSource = R"(
2432                 #include <metal_stdlib>
2433 
2434                 using namespace metal;
2435 
2436                 kernel void atomicCount(uint3 localId [[thread_position_in_threadgroup]],
2437                                         device atomic_uint& globalCounter [[buffer(0)]],
2438                                         threadgroup atomic_uint& localCounter [[threadgroup(0)]]) {
2439                     // Initialize the local counter.
2440                     if (localId.x == 0u) {
2441                         atomic_store_explicit(&localCounter, 0u, memory_order_relaxed);
2442                     }
2443 
2444                     // Synchronize the threads in the workgroup so they all see the initial value.
2445                     threadgroup_barrier(mem_flags::mem_threadgroup);
2446 
2447                     // All threads increment the counter.
2448                     atomic_fetch_add_explicit(&localCounter, 1u, memory_order_relaxed);
2449 
2450                     // Synchronize the threads again to ensure they have all executed the increment
2451                     // and the following load reads the same value across all threads in the
2452                     // workgroup.
2453                     threadgroup_barrier(mem_flags::mem_threadgroup);
2454 
2455                     // Add the workgroup-only tally to the global counter.
2456                     if (localId.x == 0u) {
2457                         uint tally = atomic_load_explicit(&localCounter, memory_order_relaxed);
2458                         atomic_fetch_add_explicit(&globalCounter, tally, memory_order_relaxed);
2459                     }
2460                 }
2461             )";
2462             return {kSource, "atomicCount"};
2463         }
2464 
2465         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2466             SkASSERT(index == 0);
2467             SkASSERT(r.fSlot == 0);
2468             SkASSERT(r.fFlow == DataFlow::kShared);
2469             return sizeof(uint32_t);
2470         }
2471 
2472         WorkgroupSize calculateGlobalDispatchSize() const override {
2473             return WorkgroupSize(kWorkgroupCount, 1, 1);
2474         }
2475 
2476         void prepareStorageBuffer(int resourceIndex,
2477                                   const ResourceDesc& r,
2478                                   void* buffer,
2479                                   size_t bufferSize) const override {
2480             SkASSERT(resourceIndex == 0);
2481             *static_cast<uint32_t*>(buffer) = 0;
2482         }
2483     } step;
2484 
2485     DispatchGroup::Builder builder(recorder.get());
2486     builder.appendStep(&step);
2487 
2488     BindBufferInfo info = builder.getSharedBufferResource(0);
2489     if (!info) {
2490         ERRORF(reporter, "shared resource at slot 0 is missing");
2491         return;
2492     }
2493 
2494     // Record the compute pass task.
2495     ComputeTask::DispatchGroupList groups;
2496     groups.push_back(builder.finalize());
2497     recorder->priv().add(ComputeTask::Make(std::move(groups)));
2498 
2499     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2500     auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2501 
2502     // Submit the work and wait for it to complete.
2503     std::unique_ptr<Recording> recording = recorder->snap();
2504     if (!recording) {
2505         ERRORF(reporter, "Failed to make recording");
2506         return;
2507     }
2508 
2509     InsertRecordingInfo insertInfo;
2510     insertInfo.fRecording = recording.get();
2511     context->insertRecording(insertInfo);
2512     testContext->syncedSubmit(context);
2513 
2514     // Verify the contents of the output buffer.
2515     constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2516     const uint32_t result = static_cast<const uint32_t*>(
2517             map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2518     REPORTER_ASSERT(reporter,
2519                     result == kExpectedCount,
2520                     "expected '%u', found '%u'",
2521                     kExpectedCount,
2522                     result);
2523 }
2524 
DEF_GRAPHITE_TEST_FOR_DAWN_CONTEXT(Compute_NativeShaderSourceWGSL,reporter,context,testContext)2525 DEF_GRAPHITE_TEST_FOR_DAWN_CONTEXT(Compute_NativeShaderSourceWGSL, reporter, context, testContext) {
2526     // This fails on Dawn D3D11, b/315834710
2527     if (testContext->contextType() == skgpu::ContextType::kDawn_D3D11) {
2528         return;
2529     }
2530 
2531     std::unique_ptr<Recorder> recorder = context->makeRecorder();
2532 
2533     constexpr uint32_t kWorkgroupCount = 32;
2534     constexpr uint32_t kWorkgroupSize = 256;  // The WebGPU default workgroup size limit is 256
2535 
2536     class TestComputeStep : public ComputeStep {
2537     public:
2538         TestComputeStep() : ComputeStep(
2539                 /*name=*/"TestAtomicOperationsWGSL",
2540                 /*localDispatchSize=*/{kWorkgroupSize, 1, 1},
2541                 /*resources=*/{
2542                     {
2543                         /*type=*/ResourceType::kStorageBuffer,
2544                         /*flow=*/DataFlow::kShared,
2545                         /*policy=*/ResourcePolicy::kMapped,
2546                         /*slot=*/0,
2547                     }
2548                 },
2549                 /*workgroupBuffers=*/{},
2550                 /*baseFlags=*/Flags::kSupportsNativeShader) {}
2551         ~TestComputeStep() override = default;
2552 
2553         NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
2554             SkASSERT(format == NativeShaderFormat::kWGSL);
2555             static constexpr std::string_view kSource = R"(
2556                 @group(0) @binding(0) var<storage, read_write> globalCounter: atomic<u32>;
2557 
2558                 var<workgroup> localCounter: atomic<u32>;
2559 
2560                 @compute @workgroup_size(256)
2561                 fn atomicCount(@builtin(local_invocation_id) localId: vec3u) {
2562                     // Initialize the local counter.
2563                     if localId.x == 0u {
2564                         atomicStore(&localCounter, 0u);
2565                     }
2566 
2567                     // Synchronize the threads in the workgroup so they all see the initial value.
2568                     workgroupBarrier();
2569 
2570                     // All threads increment the counter.
2571                     atomicAdd(&localCounter, 1u);
2572 
2573                     // Synchronize the threads again to ensure they have all executed the increment
2574                     // and the following load reads the same value across all threads in the
2575                     // workgroup.
2576                     workgroupBarrier();
2577 
2578                     // Add the workgroup-only tally to the global counter.
2579                     if localId.x == 0u {
2580                         let tally = atomicLoad(&localCounter);
2581                         atomicAdd(&globalCounter, tally);
2582                     }
2583                 }
2584             )";
2585             return {kSource, "atomicCount"};
2586         }
2587 
2588         size_t calculateBufferSize(int index, const ResourceDesc& r) const override {
2589             SkASSERT(index == 0);
2590             SkASSERT(r.fSlot == 0);
2591             SkASSERT(r.fFlow == DataFlow::kShared);
2592             return sizeof(uint32_t);
2593         }
2594 
2595         WorkgroupSize calculateGlobalDispatchSize() const override {
2596             return WorkgroupSize(kWorkgroupCount, 1, 1);
2597         }
2598 
2599         void prepareStorageBuffer(int resourceIndex,
2600                                   const ResourceDesc& r,
2601                                   void* buffer,
2602                                   size_t bufferSize) const override {
2603             SkASSERT(resourceIndex == 0);
2604             *static_cast<uint32_t*>(buffer) = 0;
2605         }
2606     } step;
2607 
2608     DispatchGroup::Builder builder(recorder.get());
2609     builder.appendStep(&step);
2610 
2611     BindBufferInfo info = builder.getSharedBufferResource(0);
2612     if (!info) {
2613         ERRORF(reporter, "shared resource at slot 0 is missing");
2614         return;
2615     }
2616 
2617     // Record the compute pass task.
2618     ComputeTask::DispatchGroupList groups;
2619     groups.push_back(builder.finalize());
2620     recorder->priv().add(ComputeTask::Make(std::move(groups)));
2621 
2622     // Ensure the output buffer is synchronized to the CPU once the GPU submission has finished.
2623     auto buffer = sync_buffer_to_cpu(recorder.get(), info.fBuffer);
2624 
2625     // Submit the work and wait for it to complete.
2626     std::unique_ptr<Recording> recording = recorder->snap();
2627     if (!recording) {
2628         ERRORF(reporter, "Failed to make recording");
2629         return;
2630     }
2631 
2632     InsertRecordingInfo insertInfo;
2633     insertInfo.fRecording = recording.get();
2634     context->insertRecording(insertInfo);
2635     testContext->syncedSubmit(context);
2636 
2637     // Verify the contents of the output buffer.
2638     constexpr uint32_t kExpectedCount = kWorkgroupCount * kWorkgroupSize;
2639     const uint32_t result = static_cast<const uint32_t*>(
2640             map_buffer(context, testContext, buffer.get(), info.fOffset))[0];
2641     REPORTER_ASSERT(reporter,
2642                     result == kExpectedCount,
2643                     "expected '%u', found '%u'",
2644                     kExpectedCount,
2645                     result);
2646 }
2647