/*
 * Copyright 2023 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef skgpu_graphite_compute_VelloComputeSteps_DEFINED
#define skgpu_graphite_compute_VelloComputeSteps_DEFINED

#include "include/core/SkSpan.h"
#include "include/private/base/SkTArray.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/compute/ComputeStep.h"
#include "third_party/vello/cpp/vello.h"

#include <cstring>
#include <string_view>

namespace skgpu::graphite {

// This file defines ComputeSteps for all Vello compute stages and their permutations. The
// declaration of each ComputeStep subclass mirrors the name of the pipeline stage as defined in
// the shader metadata.
//
// The compute stages all operate over a shared set of buffer and image resources. The
// `kVelloSlot_*` constant definitions below each uniquely identify a shared resource that must be
// instantiated when assembling the ComputeSteps into a DispatchGroup.
//
// === Monoids and Prefix Sums ===
//
// Vello's GPU algorithms make repeated use of parallel prefix sum techniques. These occur
// frequently in path rasterization (e.g. winding number accumulation across a scanline can be
// thought of as a per-pixel prefix sum) but Vello also uses them to calculate buffer offsets for
// associated entries across its variable-length encoding streams.
//
// For instance, given a scene that contains Bézier paths, each path gets encoded as a transform,
// a sequence of path tags (verbs), and zero or more 2-D points associated with each tag. N paths
// will often map to N transforms, N + M tags, and N + M + L points (where N > 0, M > 0, L >= 0).
// These entries are stored in separate, parallel transform, path tag, and path data streams. The
// correspondence between entries of these independent streams is implicit. To keep CPU encoding
// of these streams fast, the offsets into each buffer for a given "path object" are computed
// dynamically and in parallel on the GPU. Since the offsets for each object build additively on
// the offsets that appear before it in the stream, the parallel computation of offsets can be
// treated as a dynamic programming problem that maps well to parallel prefix sums, where each
// object is a "monoid" (https://en.wikipedia.org/wiki/Monoid) that supports algebraic
// addition/subtraction over data encoded in the path tags themselves.
//
// Once computed, a monoid contains the offsets into the input (and sometimes output) buffers for
// a given object. The parallel prefix sum operation is defined as a monoidal reduce + pre-scan
// pair (Prefix Sums and Their Applications, Blelloch, G.,
// https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf).
//
// While these concepts are an implementation detail, they are core to the Vello algorithm and are
// reflected in the pipeline names and data slot definitions.
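//
// To make the monoid idea concrete, here is a minimal CPU-side sketch (illustrative only;
// `TagMonoid`, `combine`, and `exclusiveScan` are hypothetical names, not Vello's actual data
// layout). Each tag contributes per-stream element counts; combining is component-wise addition
// with the zero value as the identity; and an exclusive prefix scan over the tag stream yields,
// for each object, the offsets at which its data begins:
//
//     struct TagMonoid {
//         uint32_t pathIx = 0;   // number of paths encoded so far
//         uint32_t pointIx = 0;  // offset into the path data (points) stream
//     };
//
//     // The associative monoid operation.
//     TagMonoid combine(const TagMonoid& a, const TagMonoid& b) {
//         return {a.pathIx + b.pathIx, a.pointIx + b.pointIx};
//     }
//
//     // Serial reference for the scan that the GPU stages perform in parallel:
//     // out[i] accumulates every tag preceding tag i, i.e. object i's buffer offsets.
//     void exclusiveScan(SkSpan<const TagMonoid> in, SkSpan<TagMonoid> out) {
//         TagMonoid acc = {};  // the identity element
//         for (size_t i = 0; i < in.size(); ++i) {
//             out[i] = acc;
//             acc = combine(acc, in[i]);
//         }
//     }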
//
// === Full Pipeline ===
//
// The full Vello pipeline stages are as follows and should be dispatched in the following order:
//
// I. Build the path monoid stream:
//    If the input fits within the workgroup size:
//        pathtag_reduce, pathtag_scan_small
//    else:
//        pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large
//
// II. Compute path bounding boxes, convert path segments into cubics:
//        bbox_clear, pathseg
//
// III. Process the draw object stream to build the draw monoids and the inputs to the clip stage:
//        draw_reduce, draw_leaf
//
// IV. Compute the bounding boxes for the clip stack from the input stream, if the scene contains
//     clips:
//        clip_reduce, clip_leaf
//
// V. Allocate tile and segment buffers for the individual bins and prepare for coarse
//    rasterization:
//        binning, tile_alloc, path_coarse
//
// VI. Coarse rasterization:
//        backdrop_dyn, coarse
//
// VII. Fine rasterization:
//        fine
//
// TODO: Document the coverage mask pipeline once it has been re-implemented.

// ***
// Shared buffers that are accessed by various stages.
//
// The render configuration uniform buffer.
constexpr int kVelloSlot_ConfigUniform = 0;

// The scene encoding buffer.
constexpr int kVelloSlot_Scene = 1;

// ***
// Buffers used during the element processing stage. This stage converts the streams of
// variable-length path tags, transforms, and brushes into a "path monoid" stream containing
// buffer offsets for the subsequent stages that associate the input streams with individual draw
// elements. This stage performs a parallel prefix sum (reduce + scan), which can be completed in
// two dispatches if the entire input can be processed by a single workgroup per dispatch.
// Otherwise, the algorithm requires two additional dispatches to continue the traversal (this is
// due to a lack of primitives to synchronize execution across workgroups in MSL and WGSL).
//
// Single-pass variant pipelines: pathtag_reduce, pathtag_scan_small
// Multi-pass variant pipelines: pathtag_reduce, pathtag_reduce2, pathtag_scan1,
//                               pathtag_scan_large
constexpr int kVelloSlot_TagMonoid = 2;

// Single-pass variant slots:
constexpr int kVelloSlot_PathtagReduceOutput = 3;

// Multi-pass variant slots:
constexpr int kVelloSlot_LargePathtagReduceFirstPassOutput = kVelloSlot_PathtagReduceOutput;
constexpr int kVelloSlot_LargePathtagReduceSecondPassOutput = 4;
constexpr int kVelloSlot_LargePathtagScanFirstPassOutput = 5;
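// For illustration, host-side selection between the two variants might look like the sketch
// below (hypothetical pseudocode; `inputFitsInOneWorkgroup` and `dispatch` are assumed helpers,
// not part of this API). The step classes named here are declared later in this header:
//
//     if (inputFitsInOneWorkgroup) {
//         dispatch(VelloPathtagReduceStep());
//         dispatch(VelloPathtagScanSmallStep());
//     } else {
//         // Two extra dispatches continue the reduce/scan traversal, since MSL and WGSL
//         // cannot synchronize execution across workgroups within a single dispatch.
//         dispatch(VelloPathtagReduceStep());
//         dispatch(VelloPathtagReduce2Step());
//         dispatch(VelloPathtagScan1Step());
//         dispatch(VelloPathtagScanLargeStep());
//     }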
// ***
// The second part of element processing flattens path elements (moveTo, lineTo, quadTo, etc.)
// into line segments and computes their bounding boxes. This stage is where strokes get expanded
// to fills and stroke styles get applied. The output is an unordered "line soup" buffer and the
// tight device-space bounding box of each path.
//
// Pipelines: bbox_clear, flatten
constexpr int kVelloSlot_PathBBoxes = 6;
constexpr int kVelloSlot_Lines = 7;

// ***
// The next part prepares the draw object stream (entries in the per-tile command list, aka PTCL)
// and additional metadata for the subsequent clipping and binning stages.
//
// Pipelines: draw_reduce, draw_leaf
constexpr int kVelloSlot_DrawReduceOutput = 8;
constexpr int kVelloSlot_DrawMonoid = 9;
constexpr int kVelloSlot_InfoBinData = 10;
constexpr int kVelloSlot_ClipInput = 11;

// ***
// Clipping. The outputs of this stage are the finalized draw monoids and the clip bounding
// boxes. Clipping involves evaluating the stack monoid; refer to the following references for
// the meaning of these buffers: https://arxiv.org/pdf/2205.11659.pdf and
// https://en.wikipedia.org/wiki/Bicyclic_semigroup
//
// Pipelines: clip_reduce, clip_leaf
constexpr int kVelloSlot_ClipBicyclic = 12;
constexpr int kVelloSlot_ClipElement = 13;
constexpr int kVelloSlot_ClipBBoxes = 14;

// ***
// Buffers containing bump-allocated data: the inputs and outputs of the binning, coarse raster,
// and per-tile segment assembly stages.
//
// Pipelines: binning, tile_alloc, path_count, backdrop, coarse, path_tiling
constexpr int kVelloSlot_DrawBBoxes = 15;
constexpr int kVelloSlot_BumpAlloc = 16;
constexpr int kVelloSlot_BinHeader = 17;

constexpr int kVelloSlot_Path = 18;
constexpr int kVelloSlot_Tile = 19;
constexpr int kVelloSlot_SegmentCounts = 20;
constexpr int kVelloSlot_Segments = 21;
constexpr int kVelloSlot_PTCL = 22;

// ***
// Texture resources used by the fine rasterization stage. The gradient image needs to get
// populated on the CPU with pre-computed gradient ramps. The image atlas is intended to hold
// pre-uploaded images that are composited into the scene.
//
// The output image contains the final render.
constexpr int kVelloSlot_OutputImage = 23;
constexpr int kVelloSlot_GradientImage = 24;
constexpr int kVelloSlot_ImageAtlas = 25;

// ***
// The indirect count buffer is used to issue an indirect dispatch of the path count and path
// tiling stages.
constexpr int kVelloSlot_IndirectCount = 26;

// ***
// The sample mask lookup table used in the MSAA modes of the fine rasterization stage.
constexpr int kVelloSlot_MaskLUT = 27;

std::string_view VelloStageName(vello_cpp::ShaderStage);
WorkgroupSize VelloStageLocalSize(vello_cpp::ShaderStage);
skia_private::TArray<ComputeStep::WorkgroupBufferDesc> VelloWorkgroupBuffers(
        vello_cpp::ShaderStage);
ComputeStep::NativeShaderSource VelloNativeShaderSource(vello_cpp::ShaderStage,
                                                        ComputeStep::NativeShaderFormat);

template <vello_cpp::ShaderStage S>
class VelloStep : public ComputeStep {
public:
    ~VelloStep() override = default;

    NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
        return VelloNativeShaderSource(S, format);
    }

protected:
    explicit VelloStep(SkSpan<const ResourceDesc> resources)
            : ComputeStep(VelloStageName(S),
                          VelloStageLocalSize(S),
                          resources,
                          AsSpan<ComputeStep::WorkgroupBufferDesc>(VelloWorkgroupBuffers(S)),
                          Flags::kSupportsNativeShader) {}

private:
    // Helper that creates a SkSpan from a universal reference to a container. Generally, creating
    // a SkSpan from an rvalue reference is not safe, since the pointer stored in the SkSpan would
    // dangle beyond the constructor expression. In our usage in the constructor above, the
    // lifetime of the temporary TArray matches that of the SkSpan, both of which live through the
    // constructor call expression.
    //
    // From https://en.cppreference.com/w/cpp/language/reference_initialization#Lifetime_of_a_temporary:
    //
    //     a temporary bound to a reference parameter in a function call exists until the end of
    //     the full expression containing that function call
    //
    template <typename T, typename C>
    static SkSpan<const T> AsSpan(C&& container) {
        return SkSpan(std::data(container), std::size(container));
    }
};

#define VELLO_COMPUTE_STEP(stage)                                                      \
    class Vello##stage##Step final : public VelloStep<vello_cpp::ShaderStage::stage> { \
    public:                                                                            \
        Vello##stage##Step();                                                          \
    };

VELLO_COMPUTE_STEP(BackdropDyn);
VELLO_COMPUTE_STEP(BboxClear);
VELLO_COMPUTE_STEP(Binning);
VELLO_COMPUTE_STEP(ClipLeaf);
VELLO_COMPUTE_STEP(ClipReduce);
VELLO_COMPUTE_STEP(Coarse);
VELLO_COMPUTE_STEP(Flatten);
VELLO_COMPUTE_STEP(DrawLeaf);
VELLO_COMPUTE_STEP(DrawReduce);
VELLO_COMPUTE_STEP(PathCount);
VELLO_COMPUTE_STEP(PathCountSetup);
VELLO_COMPUTE_STEP(PathTiling);
VELLO_COMPUTE_STEP(PathTilingSetup);
VELLO_COMPUTE_STEP(PathtagReduce);
VELLO_COMPUTE_STEP(PathtagReduce2);
VELLO_COMPUTE_STEP(PathtagScan1);
VELLO_COMPUTE_STEP(PathtagScanLarge);
VELLO_COMPUTE_STEP(PathtagScanSmall);

#undef VELLO_COMPUTE_STEP
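// For reference, each VELLO_COMPUTE_STEP(stage) invocation above expands to a declaration of the
// following shape, e.g. for the Coarse stage:
//
//     class VelloCoarseStep final : public VelloStep<vello_cpp::ShaderStage::Coarse> {
//     public:
//         VelloCoarseStep();
//     };
//
// Each constructor is declared here but defined out of line (presumably in the accompanying
// .cpp file), where the step supplies the ResourceDesc list that binds it to the kVelloSlot_*
// constants declared above.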
template <vello_cpp::ShaderStage S, SkColorType T>
class VelloFineStepBase : public VelloStep<S> {
public:
    // We need to return a texture format for the bound textures.
    std::tuple<SkISize, SkColorType> calculateTextureParameters(
            int index, const ComputeStep::ResourceDesc&) const override {
        SkASSERT(index == 4);
        // TODO: The texture dimensions are unknown here, so this method returns 0 for the
        // texture size. In this case the size field is unused, since VelloRenderer assigns
        // texture resources directly to the DispatchGroupBuilder. The format must still be
        // queried to describe the ComputeStep's binding layout. This method could be improved to
        // enable conditional querying of optional/dynamic parameters.
        return {{}, T};
    }

protected:
    explicit VelloFineStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloStep<S>(resources) {}
};
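// The MSAA variants below additionally own a CPU-built sample mask LUT that is uploaded through
// ComputeStep's storage buffer hooks: the buffer bound to kVelloSlot_MaskLUT is sized via
// calculateBufferSize() and then filled via prepareStorageBuffer(). A minimal caller-side sketch
// (hypothetical pseudocode; `desc` and `mapBufferForSlot` are assumptions, not part of this API):
//
//     VelloFineMsaa16Step step;
//     size_t size = step.calculateBufferSize(/*resourceIndex=*/5, desc);
//     void* mapped = mapBufferForSlot(kVelloSlot_MaskLUT, size);
//     step.prepareStorageBuffer(/*resourceIndex=*/5, desc, mapped, size);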
template <vello_cpp::ShaderStage S, SkColorType T, ::rust::Vec<uint8_t> (*MaskLutBuilder)()>
class VelloFineMsaaStepBase : public VelloFineStepBase<S, T> {
public:
    size_t calculateBufferSize(int resourceIndex,
                               const ComputeStep::ResourceDesc&) const override {
        SkASSERT(resourceIndex == 5);
        return fMaskLut.size();
    }

    void prepareStorageBuffer(int resourceIndex,
                              const ComputeStep::ResourceDesc&,
                              void* buffer,
                              size_t bufferSize) const override {
        SkASSERT(resourceIndex == 5);
        SkASSERT(fMaskLut.size() == bufferSize);
        memcpy(buffer, fMaskLut.data(), fMaskLut.size());
    }

protected:
    explicit VelloFineMsaaStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloFineStepBase<S, T>(resources), fMaskLut(MaskLutBuilder()) {}

private:
    ::rust::Vec<uint8_t> fMaskLut;
};

class VelloFineAreaStep final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineArea, kRGBA_8888_SkColorType> {
public:
    VelloFineAreaStep();
};

class VelloFineAreaAlpha8Step final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineAreaR8, kAlpha_8_SkColorType> {
public:
    VelloFineAreaAlpha8Step();
};

class VelloFineMsaa16Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16,
                                       kRGBA_8888_SkColorType,
                                       vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Step();
};

class VelloFineMsaa16Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Alpha8Step();
};

class VelloFineMsaa8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8,
                                       kRGBA_8888_SkColorType,
                                       vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Step();
};

class VelloFineMsaa8Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Alpha8Step();
};

} // namespace skgpu::graphite

#endif // skgpu_graphite_compute_VelloComputeSteps_DEFINED