/*
 * Copyright 2023 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef skgpu_graphite_compute_VelloComputeSteps_DEFINED
#define skgpu_graphite_compute_VelloComputeSteps_DEFINED

#include "include/core/SkSpan.h"
#include "include/private/base/SkTArray.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/compute/ComputeStep.h"
#include "third_party/vello/cpp/vello.h"

#include <string_view>

namespace skgpu::graphite {

// This file defines ComputeSteps for all Vello compute stages and their permutations. The
// declaration of each ComputeStep subclass mirrors the name of the pipeline stage as defined in the
// shader metadata.
//
// The compute stages all operate over a shared set of buffer and image resources. The
// `kVelloSlot_*` constant definitions below each uniquely identify a shared resource that must be
// instantiated when assembling the ComputeSteps into a DispatchGroup.
//
// === Monoids and Prefix Sums ===
//
// Vello's GPU algorithms make repeated use of parallel prefix sum techniques. These occur
// frequently in path rasterization (e.g. winding number accumulation across a scanline can be
// thought of as per-pixel prefix sums) but Vello also uses them to calculate buffer offsets for
// associated entries across its variable-length encoding streams.
//
// For instance, given a scene that contains Bézier paths, each path gets encoded as a transform,
// a sequence of path tags (verbs), and zero or more 2-D points associated with each tag. N paths
// will often map to N transforms, N + M tags, and N + M + L points (where N > 0, M > 0, L >= 0).
// These entries are stored in separate parallel transform, path tag, and path data streams. The
// correspondence between entries of these independent streams is implicit. To keep CPU encoding of
// these streams fast, the offsets into each buffer for a given "path object" are computed
// dynamically and in parallel on the GPU. Since the offsets for each object build additively on
// offsets that appear before it in the stream, parallel computation of offsets can be treated as a
// dynamic programming problem that maps well to parallel prefix sums, where each object is a
// "monoid" (https://en.wikipedia.org/wiki/Monoid) that supports algebraic addition/subtraction
// over data encoded in the path tags themselves.
//
// Once computed, a monoid contains the offsets into the input (and sometimes output) buffers for a
// given object. The parallel prefix sum operation is defined as a monoidal reduce + pre-scan pair.
// (Prefix Sums and Their Applications, Blelloch, G., https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf)
//
// While these concepts are an implementation detail, they are core to the Vello algorithm and are
// reflected in the pipeline names and data slot definitions.
//
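// As a minimal sketch of the technique (illustration only, not part of this header): computing
// where each tag's points begin in the point stream reduces to an exclusive prefix sum over the
// per-tag point counts.
//
//     uint32_t counts[] = {2, 1, 3, 2};  // points consumed by each of 4 tags
//     uint32_t offsets[4];
//     uint32_t total = 0;
//     for (int i = 0; i < 4; ++i) {
//         offsets[i] = total;  // exclusive scan: where tag i's points begin
//         total += counts[i];
//     }
//     // offsets == {0, 2, 3, 6}. The GPU computes the same result in a fixed
//     // number of dispatches using the reduce + scan pipelines described below.
//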
// === Full Pipeline ===
//
// The full Vello pipeline consists of the following stages, which should be dispatched in the
// order listed:
//
// I. Build the path monoid stream:
//   If the input fits within the workgroup size:
//     pathtag_reduce, pathtag_scan_small
//   else:
//     pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large
//
// II. Compute path bounding boxes, flatten path segments into lines:
//   bbox_clear, flatten
//
// III. Process the draw object stream to build the draw monoids and inputs to the clip stage:
//   draw_reduce, draw_leaf
//
// IV. Compute the bounding boxes for the clip stack from the input stream, if the scene contains
// clips:
//   clip_reduce, clip_leaf
//
// V. Allocate tile and segment buffers for the individual bins and prepare for coarse
// rasterization:
//   binning, tile_alloc, path_count_setup, path_count
//
// VI. Coarse rasterization:
//   backdrop_dyn, coarse
//
// VII. Per-tile segment assembly and fine rasterization:
//   path_tiling_setup, path_tiling, fine
//
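// In terms of the ComputeStep classes declared below, a small scene with no clips would therefore
// dispatch, in order (a sketch; VelloRenderer assembles the authoritative sequence):
//
//     VelloPathtagReduceStep, VelloPathtagScanSmallStep, VelloBboxClearStep, VelloFlattenStep,
//     VelloDrawReduceStep, VelloDrawLeafStep, VelloBinningStep, VelloTileAllocStep,
//     VelloPathCountSetupStep, VelloPathCountStep, VelloBackdropDynStep, VelloCoarseStep,
//     VelloPathTilingSetupStep, VelloPathTilingStep, VelloFineAreaStep
//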
// TODO: Document the coverage mask pipeline once it has been re-implemented.

// ***
// Shared buffers that are accessed by various stages.
//
// The render configuration uniform buffer.
constexpr int kVelloSlot_ConfigUniform = 0;

// The scene encoding buffer.
constexpr int kVelloSlot_Scene = 1;

// ***
// Buffers used during the element processing stage. This stage converts the variable-length
// streams of path tags, transforms, and brushes into a "path monoid" stream containing buffer
// offsets used by the subsequent stages to associate the input streams with individual draw
// elements. This stage performs a parallel prefix sum (reduce + scan), which can be performed in
// two dispatches if the entire input can be processed by a single workgroup per dispatch.
// Otherwise, the algorithm requires two additional dispatches to continue the traversal (this is
// due to a lack of primitives to synchronize execution across workgroups in MSL and WGSL).
//
// Single-pass variant pipelines: pathtag_reduce, pathtag_scan_small
// Multi-pass variant pipelines: pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large
constexpr int kVelloSlot_TagMonoid = 2;

// Single-pass variant slots:
constexpr int kVelloSlot_PathtagReduceOutput = 3;

// Multi-pass variant slots:
constexpr int kVelloSlot_LargePathtagReduceFirstPassOutput = kVelloSlot_PathtagReduceOutput;
constexpr int kVelloSlot_LargePathtagReduceSecondPassOutput = 4;
constexpr int kVelloSlot_LargePathtagScanFirstPassOutput = 5;
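
// A sketch of how the variant is chosen, assuming a hypothetical kWorkgroupSize that matches the
// shaders' workgroup size (the Vello shader metadata is authoritative):
//
//     bool needsMultiPassScan(size_t tagCount, size_t kWorkgroupSize) {
//         // One reduce dispatch folds kWorkgroupSize elements per workgroup and the single-pass
//         // scan processes the partial sums in a single workgroup, so inputs up to
//         // kWorkgroupSize^2 elements fit within the two-dispatch variant.
//         return tagCount > kWorkgroupSize * kWorkgroupSize;
//     }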

// ***
// The second part of element processing flattens path elements (moveTo, lineTo, quadTo, etc.) into
// line segments and computes their bounding boxes. This stage is where strokes get expanded to
// fills and stroke styles get applied. The output is an unordered "line soup" buffer and the tight
// device-space bounding box of each path.
//
// Pipelines: bbox_clear, flatten
constexpr int kVelloSlot_PathBBoxes = 6;
constexpr int kVelloSlot_Lines = 7;
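
// As an illustration of what lands in the line soup (sketch only, with a hypothetical evalQuad()
// evaluator and a fixed subdivision count; the real flattening is adaptive and runs on the GPU):
//
//     struct Line { uint32_t pathIndex; SkPoint p0, p1; };
//     std::vector<Line> soup;
//     constexpr int kSteps = 16;  // fixed subdivision, for illustration only
//     for (int i = 0; i < kSteps; ++i) {
//         float t0 = float(i) / kSteps;
//         float t1 = float(i + 1) / kSteps;
//         soup.push_back({pathIndex, evalQuad(p0, p1, p2, t0),
//                                    evalQuad(p0, p1, p2, t1)});
//     }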

// ***
// The next part prepares the draw object stream (entries in the per-tile command list aka PTCL)
// and additional metadata for the subsequent clipping and binning stages.
//
// Pipelines: draw_reduce, draw_leaf
constexpr int kVelloSlot_DrawReduceOutput = 8;
constexpr int kVelloSlot_DrawMonoid = 9;
constexpr int kVelloSlot_InfoBinData = 10;
constexpr int kVelloSlot_ClipInput = 11;

// ***
// Clipping. The outputs of this stage are the finalized draw monoid and the clip bounding boxes.
// Clipping involves evaluating the stack monoid; refer to the following references for the meaning
// of these buffers: https://arxiv.org/pdf/2205.11659.pdf and
// https://en.wikipedia.org/wiki/Bicyclic_semigroup
//
// Pipelines: clip_reduce, clip_leaf
constexpr int kVelloSlot_ClipBicyclic = 12;
constexpr int kVelloSlot_ClipElement = 13;
constexpr int kVelloSlot_ClipBBoxes = 14;
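
// As an intuition for the stack monoid (sketch only, not part of this header): clip begin/end
// pairs nest like parentheses, so a prefix sum over +1 (push) and -1 (pop) yields each element's
// nesting depth; the full stack monoid additionally recovers each element's matching open clip
// and, with it, the clip bounding box.
//
//     int depth = 0;
//     for (char op : std::string_view("(()())")) {  // '(' = clip push, ')' = clip pop
//         depth += (op == '(') ? 1 : -1;  // running nesting depth after this element
//     }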

// ***
// Buffers containing bump-allocated data: the inputs and outputs of the binning, coarse raster,
// and per-tile segment assembly stages.
//
// Pipelines: binning, tile_alloc, path_count, backdrop, coarse, path_tiling
constexpr int kVelloSlot_DrawBBoxes = 15;
constexpr int kVelloSlot_BumpAlloc = 16;
constexpr int kVelloSlot_BinHeader = 17;

constexpr int kVelloSlot_Path = 18;
constexpr int kVelloSlot_Tile = 19;
constexpr int kVelloSlot_SegmentCounts = 20;
constexpr int kVelloSlot_Segments = 21;
constexpr int kVelloSlot_PTCL = 22;

// ***
// Texture resources used by the fine rasterization stage. The gradient image needs to get
// populated on the CPU with pre-computed gradient ramps, and the image atlas is intended to hold
// pre-uploaded images that are composited into the scene.
//
// The output image contains the final render.
constexpr int kVelloSlot_OutputImage = 23;
constexpr int kVelloSlot_GradientImage = 24;
constexpr int kVelloSlot_ImageAtlas = 25;

// ***
// The indirect count buffer is used to issue an indirect dispatch of the path count and path
// tiling stages.
constexpr int kVelloSlot_IndirectCount = 26;

// ***
// The sample mask lookup table used in the MSAA modes of the fine rasterization stage.
constexpr int kVelloSlot_MaskLUT = 27;

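// Helpers that query per-stage information (name, workgroup-local size, workgroup buffer
// descriptors, and native shader source) from the Vello shader metadata.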
std::string_view VelloStageName(vello_cpp::ShaderStage);
WorkgroupSize VelloStageLocalSize(vello_cpp::ShaderStage);
skia_private::TArray<ComputeStep::WorkgroupBufferDesc> VelloWorkgroupBuffers(
        vello_cpp::ShaderStage);
ComputeStep::NativeShaderSource VelloNativeShaderSource(vello_cpp::ShaderStage,
                                                        ComputeStep::NativeShaderFormat);

template <vello_cpp::ShaderStage S>
class VelloStep : public ComputeStep {
public:
    ~VelloStep() override = default;

    NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
        return VelloNativeShaderSource(S, format);
    }

protected:
    explicit VelloStep(SkSpan<const ResourceDesc> resources)
            : ComputeStep(VelloStageName(S),
                          VelloStageLocalSize(S),
                          resources,
                          AsSpan<ComputeStep::WorkgroupBufferDesc>(VelloWorkgroupBuffers(S)),
                          Flags::kSupportsNativeShader) {}

private:
    // Helper that creates a SkSpan from a universal reference to a container. Generally, creating a
    // SkSpan from an rvalue reference is not safe since the pointer stored in the SkSpan will
    // dangle beyond the constructor expression. In our usage in the constructor above,
    // the lifetime of the temporary TArray should match that of the SkSpan, both of which should
    // live through the constructor call expression.
    //
    // From https://en.cppreference.com/w/cpp/language/reference_initialization#Lifetime_of_a_temporary:
    //
    //     a temporary bound to a reference parameter in a function call exists until the end of the
    //     full expression containing that function call
    //
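    // As a cautionary sketch (not real usage), binding the span to a temporary outside of a
    // single full expression would dangle:
    //
    //     SkSpan<const int> s = AsSpan<int>(std::vector<int>{1, 2, 3});
    //     // The temporary vector is destroyed here; `s` now points at freed storage.
    //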
    template <typename T, typename C>
    static SkSpan<const T> AsSpan(C&& container) {
        return SkSpan(std::data(container), std::size(container));
    }
};

#define VELLO_COMPUTE_STEP(stage)                                                      \
    class Vello##stage##Step final : public VelloStep<vello_cpp::ShaderStage::stage> { \
    public:                                                                            \
        Vello##stage##Step();                                                          \
    };
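
// For example, VELLO_COMPUTE_STEP(Coarse) declares:
//
//     class VelloCoarseStep final : public VelloStep<vello_cpp::ShaderStage::Coarse> {
//     public:
//         VelloCoarseStep();
//     };
//
// Each generated constructor is defined out-of-line, which is where the step's resource list is
// specified.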

VELLO_COMPUTE_STEP(BackdropDyn);
VELLO_COMPUTE_STEP(BboxClear);
VELLO_COMPUTE_STEP(Binning);
VELLO_COMPUTE_STEP(ClipLeaf);
VELLO_COMPUTE_STEP(ClipReduce);
VELLO_COMPUTE_STEP(Coarse);
VELLO_COMPUTE_STEP(Flatten);
VELLO_COMPUTE_STEP(DrawLeaf);
VELLO_COMPUTE_STEP(DrawReduce);
VELLO_COMPUTE_STEP(PathCount);
VELLO_COMPUTE_STEP(PathCountSetup);
VELLO_COMPUTE_STEP(PathTiling);
VELLO_COMPUTE_STEP(PathTilingSetup);
VELLO_COMPUTE_STEP(PathtagReduce);
VELLO_COMPUTE_STEP(PathtagReduce2);
VELLO_COMPUTE_STEP(PathtagScan1);
VELLO_COMPUTE_STEP(PathtagScanLarge);
VELLO_COMPUTE_STEP(PathtagScanSmall);
VELLO_COMPUTE_STEP(TileAlloc);

#undef VELLO_COMPUTE_STEP

template <vello_cpp::ShaderStage S, SkColorType T> class VelloFineStepBase : public VelloStep<S> {
public:
    // We need to return a texture format for the bound textures.
    std::tuple<SkISize, SkColorType> calculateTextureParameters(
            int index, const ComputeStep::ResourceDesc&) const override {
        SkASSERT(index == 4);
        // TODO: The texture dimensions are unknown here so this method returns 0 for the texture
        // size. In this case this field is unused since VelloRenderer assigns texture resources
        // directly to the DispatchGroupBuilder. The format must still be queried to describe the
        // ComputeStep's binding layout. This method could be improved to enable conditional
        // querying of optional/dynamic parameters.
        return {{}, T};
    }

protected:
    explicit VelloFineStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloStep<S>(resources) {}
};

template <vello_cpp::ShaderStage S, SkColorType T, ::rust::Vec<uint8_t> (*MaskLutBuilder)()>
class VelloFineMsaaStepBase : public VelloFineStepBase<S, T> {
public:
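    // The MSAA mask LUT is uploaded through a storage buffer: calculateBufferSize() reports the
    // LUT's byte size and prepareStorageBuffer() copies its contents into the mapped buffer.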
    size_t calculateBufferSize(int resourceIndex, const ComputeStep::ResourceDesc&) const override {
        SkASSERT(resourceIndex == 5);
        return fMaskLut.size();
    }

    void prepareStorageBuffer(int resourceIndex,
                              const ComputeStep::ResourceDesc&,
                              void* buffer,
                              size_t bufferSize) const override {
        SkASSERT(resourceIndex == 5);
        SkASSERT(fMaskLut.size() == bufferSize);
        memcpy(buffer, fMaskLut.data(), fMaskLut.size());
    }

protected:
    explicit VelloFineMsaaStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloFineStepBase<S, T>(resources), fMaskLut(MaskLutBuilder()) {}

private:
    ::rust::Vec<uint8_t> fMaskLut;
};

class VelloFineAreaStep final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineArea, kRGBA_8888_SkColorType> {
public:
    VelloFineAreaStep();
};

class VelloFineAreaAlpha8Step final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineAreaR8, kAlpha_8_SkColorType> {
public:
    VelloFineAreaAlpha8Step();
};

class VelloFineMsaa16Step final : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16,
                                                               kRGBA_8888_SkColorType,
                                                               vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Step();
};

class VelloFineMsaa16Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Alpha8Step();
};

class VelloFineMsaa8Step final : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8,
                                                              kRGBA_8888_SkColorType,
                                                              vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Step();
};

class VelloFineMsaa8Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Alpha8Step();
};

}  // namespace skgpu::graphite

#endif  // skgpu_graphite_compute_VelloComputeSteps_DEFINED