/*
 * Copyright 2022 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "include/core/SkSpan.h"
#include "include/core/SkTypes.h"
#include "include/private/base/SkTArray.h"
#include "src/base/SkUtils.h"
#include "src/core/SkRasterPipelineOpList.h"
#include "src/core/SkTHash.h"

#include <cstdint>
#include <initializer_list>
#include <memory>

class SkArenaAlloc;
class SkRasterPipeline;
class SkWStream;

namespace SkSL {

class SkRPDebugTrace;

namespace RP {

// A single scalar in our program consumes one slot.
using Slot = int;

// Sentinel meaning "no slot"; used as the default for unassigned slot operands.
constexpr Slot NA = -1;

// Scalars, vectors, and matrices can be represented as a range of slot indices.
struct SlotRange {
    Slot index = 0;  // first slot in the range
    int count = 0;   // number of consecutive slots
};

// An RP::Program will consist entirely of ProgramOps. The ProgramOps list is a superset of the
// native SkRasterPipelineOps op-list. It also has a few extra ops to indicate child-effect
// invocation, and a `label` op to indicate branch targets.
enum class ProgramOp {
    // A finished program can contain any native Raster Pipeline op...
#define M(stage) stage,
    SK_RASTER_PIPELINE_OPS_ALL(M)
#undef M

    // ... has branch targets...
    label,

    // ... and can invoke child programs.
    // NOTE: these three must stay in sync with the matching BuilderOp values; the static_asserts
    // below the BuilderOp enum enforce this.
    invoke_shader,
    invoke_color_filter,
    invoke_blender,
};

// BuilderOps are a superset of ProgramOps. They are used by the RP::Builder, which works in terms
// of Instructions; Instructions are slightly more expressive than raw SkRasterPipelineOps. In
// particular, the Builder supports stacks for pushing and popping scratch values.
// RP::Program::makeStages is responsible for rewriting Instructions/BuilderOps into an array of
// RP::Program::Stages, which will contain only native SkRasterPipelineOps and (optionally)
// child-effect invocations.
enum class BuilderOp {
    // An in-flight program can contain all the native Raster Pipeline ops...
#define M(stage) stage,
    SK_RASTER_PIPELINE_OPS_ALL(M)
#undef M

    // ... has branch targets...
    label,

    // ... can invoke child programs...
    invoke_shader,
    invoke_color_filter,
    invoke_blender,

    // ... and also has Builder-specific ops. These ops generally interface with the stack, and are
    // converted into ProgramOps during `makeStages`.
    push_literal,
    push_slots,
    push_uniform,
    push_zeros,
    push_clone,
    push_clone_from_stack,
    copy_stack_to_slots,
    copy_stack_to_slots_unmasked,
    swizzle_copy_stack_to_slots,
    discard_stack,
    select,
    push_condition_mask,
    pop_condition_mask,
    push_loop_mask,
    pop_loop_mask,
    pop_and_reenable_loop_mask,
    push_return_mask,
    pop_return_mask,
    push_src_rgba,
    push_dst_rgba,
    pop_src_rg,
    pop_src_rgba,
    pop_dst_rgba,
    set_current_stack,
    branch_if_no_active_lanes_on_stack_top_equal,
    unsupported
};

// If the child-invocation enums are not in sync between enums, program creation will not work.
// (The ops shared between ProgramOp and BuilderOp are passed through by integer value, so the
// corresponding enumerators must occupy identical positions.)
static_assert((int)ProgramOp::label == (int)BuilderOp::label);
static_assert((int)ProgramOp::invoke_shader == (int)BuilderOp::invoke_shader);
static_assert((int)ProgramOp::invoke_color_filter == (int)BuilderOp::invoke_color_filter);
static_assert((int)ProgramOp::invoke_blender == (int)BuilderOp::invoke_blender);

// Represents a single raster-pipeline SkSL instruction.
struct Instruction {
    // Up to three slot operands may be passed; they populate fSlotA/fSlotB/fSlotC in order.
    // Passing more than three slots is a programming error (asserted below).
    Instruction(BuilderOp op, std::initializer_list<Slot> slots, int a = 0, int b = 0, int c = 0)
            : fOp(op), fImmA(a), fImmB(b), fImmC(c) {
        auto iter = slots.begin();
        if (iter != slots.end()) { fSlotA = *iter++; }
        if (iter != slots.end()) { fSlotB = *iter++; }
        if (iter != slots.end()) { fSlotC = *iter++; }
        SkASSERT(iter == slots.end());
    }

    BuilderOp fOp;
    Slot      fSlotA = NA;  // slot operands; NA when unused
    Slot      fSlotB = NA;
    Slot      fSlotC = NA;
    int       fImmA = 0;    // immediate (non-slot) operands
    int       fImmB = 0;
    int       fImmC = 0;
};

// Host-supplied hooks used while appending a Program to a pipeline (see Program::appendStages).
// The append* callbacks handle child-effect invocation; to/fromLinearSrgb handle color-space
// transform stages.
class Callbacks {
public:
    virtual ~Callbacks() = default;

    // Each returns true if the child effect at `index` was appended successfully.
    virtual bool appendShader(int index) = 0;
    virtual bool appendColorFilter(int index) = 0;
    virtual bool appendBlender(int index) = 0;

    virtual void toLinearSrgb() = 0;
    virtual void fromLinearSrgb() = 0;
};

// A finalized Raster Pipeline program, produced by Builder::finish. Converts its Instruction
// list into native SkRasterPipeline stages on demand.
class Program {
public:
    Program(SkTArray<Instruction> instrs,
            int numValueSlots,
            int numUniformSlots,
            int numLabels,
            SkRPDebugTrace* debugTrace);

#if !defined(SKSL_STANDALONE)
    // Appends this program onto `pipeline`. `callbacks` handles child-effect invocation;
    // `uniforms` supplies the uniform slot values.
    bool appendStages(SkRasterPipeline* pipeline,
                      SkArenaAlloc* alloc,
                      Callbacks* callbacks,
                      SkSpan<const float> uniforms) const;
#endif

    // Dumps a human-readable listing of the program to `out`.
    void dump(SkWStream* out) const;

private:
    using StackDepthMap = SkTHashMap<int, int>; // <stack index, depth of stack>

    struct SlotData {
        SkSpan<float> values;  // backing storage for value slots
        SkSpan<float> stack;   // backing storage for the temp stacks
    };
    SlotData allocateSlotData(SkArenaAlloc* alloc) const;

    struct Stage {
        ProgramOp op;
        void*     ctx;  // op-specific context, allocated from the arena
    };
    // Rewrites fInstructions into an array of Stages containing only native ops
    // (plus, optionally, child-effect invocations).
    void makeStages(SkTArray<Stage>* pipeline,
                    SkArenaAlloc* alloc,
                    SkSpan<const float> uniforms,
                    const SlotData& slots) const;
    void optimize();
    StackDepthMap tempStackMaxDepths() const;

    // These methods are used to split up large multi-slot operations into multiple ops as needed.
    void appendCopy(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                    ProgramOp baseStage,
                    float* dst, int dstStride, const float* src, int srcStride,
                    int numSlots) const;
    void appendCopySlotsUnmasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                 float* dst, const float* src, int numSlots) const;
    void appendCopySlotsMasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                               float* dst, const float* src, int numSlots) const;
    void appendCopyConstants(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                             float* dst, const float* src, int numSlots) const;

    // Appends a single-slot single-input math operation to the pipeline. The op `stage` will be
    // appended `numSlots` times, starting at position `dst` and advancing one slot for each
    // subsequent invocation.
    void appendSingleSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp stage,
                                 float* dst, int numSlots) const;

    // Appends a multi-slot single-input math operation to the pipeline. `baseStage` must refer to
    // a single-slot "apply_op" stage, which must be immediately followed by specializations for
    // 2-4 slots. For instance, {`zero_slot`, `zero_2_slots`, `zero_3_slots`, `zero_4_slots`}
    // must be contiguous ops in the stage list, listed in that order; pass `zero_slot` and we
    // pick the appropriate op based on `numSlots`.
    void appendMultiSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp baseStage,
                                float* dst, int numSlots) const;

    // Appends a two-input math operation to the pipeline. `src` must be _immediately_ after `dst`
    // in memory. `stage` must refer to an unbounded "apply_to_n_slots" stage. A BinaryOpCtx
    // will be used to pass pointers to the destination and source; the delta between the two
    // pointers implicitly gives the number of slots.
    void appendAdjacentNWayBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                    ProgramOp stage,
                                    float* dst, const float* src, int numSlots) const;

    // Appends a multi-slot two-input math operation to the pipeline. `src` must be _immediately_
    // after `dst` in memory. `baseStage` must refer to an unbounded "apply_to_n_slots" stage, which
    // must be immediately followed by specializations for 1-4 slots. For instance, {`add_n_floats`,
    // `add_float`, `add_2_floats`, `add_3_floats`, `add_4_floats`} must be contiguous ops in the
    // stage list, listed in that order; pass `add_n_floats` and we pick the appropriate op based on
    // `numSlots`.
    void appendAdjacentMultiSlotBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                         ProgramOp baseStage,
                                         float* dst, const float* src, int numSlots) const;

    // Appends a multi-slot math operation having three inputs (dst, src0, src1) and one output
    // (dst) to the pipeline. The three inputs must be _immediately_ adjacent in memory. `stage`
    // must refer to an unbounded "apply_to_n_slots" stage, which must be immediately followed by
    // specializations for 1-4 slots.
    void appendAdjacentMultiSlotTernaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                          ProgramOp stage, float* dst,
                                          const float* src0, const float* src1,
                                          int numSlots) const;

    // Appends a stack_rewind op on platforms where it is needed (when SK_HAS_MUSTTAIL is not set).
    void appendStackRewind(SkTArray<Stage>* pipeline) const;

    SkTArray<Instruction> fInstructions;
    int fNumValueSlots = 0;
    int fNumUniformSlots = 0;
    int fNumTempStackSlots = 0;
    int fNumLabels = 0;
    SkTHashMap<int, int> fTempStackMaxDepths;
    SkRPDebugTrace* fDebugTrace = nullptr;
};

// Accumulates Instructions one at a time, then converts them into a Program via finish().
class Builder {
public:
    /** Finalizes and optimizes the program. */
    std::unique_ptr<Program> finish(int numValueSlots,
                                    int numUniformSlots,
                                    SkRPDebugTrace* debugTrace = nullptr);
    /**
     * Peels off a label ID for use in the program. Set the label's position in the program with
     * the `label` instruction. Actually branch to the target with an instruction like
     * `branch_if_any_active_lanes` or `jump`.
     */
    int nextLabelID() {
        return fNumLabels++;
    }

    /**
     * The builder keeps track of the state of execution masks; when we know that the execution
     * mask is unaltered, we can generate simpler code. Code which alters the execution mask is
     * required to enable this flag.
     */
    void enableExecutionMaskWrites() {
        ++fExecutionMaskWritesEnabled;
    }

    void disableExecutionMaskWrites() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        --fExecutionMaskWritesEnabled;
    }

    // True while at least one enableExecutionMaskWrites() call remains unbalanced.
    bool executionMaskWritesAreEnabled() {
        return fExecutionMaskWritesEnabled > 0;
    }

    /** Assemble a program from the Raster Pipeline instructions below. */
    void init_lane_masks() {
        fInstructions.push_back({BuilderOp::init_lane_masks, {}});
    }

    void store_src_rg(SlotRange slots) {
        SkASSERT(slots.count == 2);
        fInstructions.push_back({BuilderOp::store_src_rg, {slots.index}});
    }

    void store_src(SlotRange slots) {
        SkASSERT(slots.count == 4);
        fInstructions.push_back({BuilderOp::store_src, {slots.index}});
    }

    void store_dst(SlotRange slots) {
        SkASSERT(slots.count == 4);
        fInstructions.push_back({BuilderOp::store_dst, {slots.index}});
    }

    void store_device_xy01(SlotRange slots) {
        SkASSERT(slots.count == 4);
        fInstructions.push_back({BuilderOp::store_device_xy01, {slots.index}});
    }

    void load_src(SlotRange slots) {
        SkASSERT(slots.count == 4);
        fInstructions.push_back({BuilderOp::load_src, {slots.index}});
    }

    void load_dst(SlotRange slots) {
        SkASSERT(slots.count == 4);
        fInstructions.push_back({BuilderOp::load_dst, {slots.index}});
    }

    // Switches subsequent stack operations over to the temp stack at `stackIdx`.
    void set_current_stack(int stackIdx) {
        fInstructions.push_back({BuilderOp::set_current_stack, {}, stackIdx});
    }

    // Inserts a label into the instruction stream.
    void label(int labelID);

    // Unconditionally branches to a label.
    void jump(int labelID);

    // Branches to a label if the execution mask is active in any lane.
    void branch_if_any_active_lanes(int labelID);

    // Branches to a label if the execution mask is inactive across all lanes.
    void branch_if_no_active_lanes(int labelID);

    // Branches to a label if the top value on the stack is _not_ equal to `value` in any lane.
    void branch_if_no_active_lanes_on_stack_top_equal(int value, int labelID);

    // We use the same SkRasterPipeline op regardless of the literal type, and bitcast the value.
    void push_literal_f(float val) {
        this->push_literal_i(sk_bit_cast<int32_t>(val));
    }

    void push_literal_i(int32_t val) {
        if (val == 0) {
            // Zero is common enough that a dedicated (and coalescable) op exists for it.
            this->push_zeros(1);
        } else {
            fInstructions.push_back({BuilderOp::push_literal, {}, val});
        }
    }

    void push_literal_u(uint32_t val) {
        this->push_literal_i(sk_bit_cast<int32_t>(val));
    }

    // Translates into copy_constants (from uniforms into temp stack) in Raster Pipeline.
    void push_uniform(SlotRange src);

    void push_zeros(int count) {
        // Translates into zero_slot_unmasked in Raster Pipeline.
        if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::push_zeros) {
            // Coalesce adjacent push_zero ops into a single op.
            fInstructions.back().fImmA += count;
        } else {
            fInstructions.push_back({BuilderOp::push_zeros, {}, count});
        }
    }

    // Translates into copy_slots_unmasked (from values into temp stack) in Raster Pipeline.
    void push_slots(SlotRange src);

    // Translates into copy_slots_masked (from temp stack to values) in Raster Pipeline.
    // Does not discard any values on the temp stack.
    void copy_stack_to_slots(SlotRange dst) {
        this->copy_stack_to_slots(dst, /*offsetFromStackTop=*/dst.count);
    }

    void copy_stack_to_slots(SlotRange dst, int offsetFromStackTop);

    // Translates into swizzle_copy_slots_masked (from temp stack to values) in Raster Pipeline.
    // Does not discard any values on the temp stack.
    void swizzle_copy_stack_to_slots(SlotRange dst,
                                     SkSpan<const int8_t> components,
                                     int offsetFromStackTop);

    // Translates into copy_slots_unmasked (from temp stack to values) in Raster Pipeline.
    // Does not discard any values on the temp stack.
    void copy_stack_to_slots_unmasked(SlotRange dst) {
        this->copy_stack_to_slots_unmasked(dst, /*offsetFromStackTop=*/dst.count);
    }

    void copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop);

    // Performs a unary op (like `bitwise_not`), given a slot count of `slots`. The stack top is
    // replaced with the result.
    void unary_op(BuilderOp op, int32_t slots);

    // Performs a binary op (like `add_n_floats` or `cmpeq_n_ints`), given a slot count of
    // `slots`. Two n-slot input values are consumed, and the result is pushed onto the stack.
    void binary_op(BuilderOp op, int32_t slots);

    // Performs a ternary op (like `mix` or `smoothstep`), given a slot count of
    // `slots`. Three n-slot input values are consumed, and the result is pushed onto the stack.
    void ternary_op(BuilderOp op, int32_t slots);

    // Computes a dot product on the stack. The slots consumed (`slots`) must be between 1 and 4.
    // Two n-slot input vectors are consumed, and a scalar result is pushed onto the stack.
    void dot_floats(int32_t slots);

    // Shrinks the temp stack, discarding values on top.
    void discard_stack(int32_t count = 1);

    // Copies values from the temp stack into slots, and then shrinks the temp stack.
    void pop_slots(SlotRange dst);

    // Creates many clones of the top single-slot item on the temp stack.
    void push_duplicates(int count);

    // Creates a single clone of an item on the current temp stack. The cloned item can consist of
    // any number of slots, and can be copied from an earlier position on the stack.
    void push_clone(int numSlots, int offsetFromStackTop = 0) {
        // fImmB holds the total distance from the stack top to the start of the cloned item.
        fInstructions.push_back({BuilderOp::push_clone, {}, numSlots,
                                 numSlots + offsetFromStackTop});
    }

    // Creates a single clone from an item on any temp stack. The cloned item can consist of any
    // number of slots.
    void push_clone_from_stack(int numSlots, int otherStackIndex, int offsetFromStackTop = 0);

    // Compares the stack top with the passed-in value; if it matches, enables the loop mask.
    void case_op(int value) {
        fInstructions.push_back({BuilderOp::case_op, {}, value});
    }

    void select(int slots) {
        // Overlays the top two entries on the stack, making one hybrid entry. The execution mask
        // is used to select which lanes are preserved.
        SkASSERT(slots > 0);
        fInstructions.push_back({BuilderOp::select, {}, slots});
    }

    // The opposite of push_slots; copies values from the temp stack into value slots, then
    // shrinks the temp stack.
    void pop_slots_unmasked(SlotRange dst);

    void copy_slots_masked(SlotRange dst, SlotRange src) {
        SkASSERT(dst.count == src.count);
        fInstructions.push_back({BuilderOp::copy_slot_masked, {dst.index, src.index}, dst.count});
    }

    void copy_slots_unmasked(SlotRange dst, SlotRange src);

    void copy_constant(Slot slot, int constantValue) {
        fInstructions.push_back({BuilderOp::copy_constant, {slot}, constantValue});
    }

    // Stores zeros across the entire slot range.
    void zero_slots_unmasked(SlotRange dst);

    // Consumes `consumedSlots` elements on the stack, then generates `components.size()` elements.
    void swizzle(int consumedSlots, SkSpan<const int8_t> components);

    // Transposes a matrix of size CxR on the stack (into a matrix of size RxC).
    void transpose(int columns, int rows);

    // Generates a CxR diagonal matrix from the top two scalars on the stack. The second scalar is
    // used as the diagonal value; the first scalar (usually zero) fills in the rest of the slots.
    void diagonal_matrix(int columns, int rows);

    // Resizes a CxR matrix at the top of the stack to C'xR'.
    void matrix_resize(int origColumns, int origRows, int newColumns, int newRows);

    void push_condition_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::push_condition_mask, {}});
    }

    void pop_condition_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::pop_condition_mask, {}});
    }

    void merge_condition_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::merge_condition_mask, {}});
    }

    void push_loop_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::push_loop_mask, {}});
    }

    void pop_loop_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::pop_loop_mask, {}});
    }

    void push_src_rgba() {
        fInstructions.push_back({BuilderOp::push_src_rgba, {}});
    }

    void push_dst_rgba() {
        fInstructions.push_back({BuilderOp::push_dst_rgba, {}});
    }

    void pop_src_rg() {
        fInstructions.push_back({BuilderOp::pop_src_rg, {}});
    }

    void pop_src_rgba() {
        fInstructions.push_back({BuilderOp::pop_src_rgba, {}});
    }

    void pop_dst_rgba() {
        fInstructions.push_back({BuilderOp::pop_dst_rgba, {}});
    }

    void mask_off_loop_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::mask_off_loop_mask, {}});
    }

    void reenable_loop_mask(SlotRange src) {
        SkASSERT(this->executionMaskWritesAreEnabled());
        SkASSERT(src.count == 1);
        fInstructions.push_back({BuilderOp::reenable_loop_mask, {src.index}});
    }

    void pop_and_reenable_loop_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::pop_and_reenable_loop_mask, {}});
    }

    void merge_loop_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::merge_loop_mask, {}});
    }

    void push_return_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::push_return_mask, {}});
    }

    void pop_return_mask();

    void mask_off_return_mask() {
        SkASSERT(this->executionMaskWritesAreEnabled());
        fInstructions.push_back({BuilderOp::mask_off_return_mask, {}});
    }

    void invoke_shader(int childIdx) {
        fInstructions.push_back({BuilderOp::invoke_shader, {}, childIdx});
    }

    void invoke_color_filter(int childIdx) {
        fInstructions.push_back({BuilderOp::invoke_color_filter, {}, childIdx});
    }

    void invoke_blender(int childIdx) {
        fInstructions.push_back({BuilderOp::invoke_blender, {}, childIdx});
    }

private:
    void simplifyPopSlotsUnmasked(SlotRange* dst);

    SkTArray<Instruction> fInstructions;
    int fNumLabels = 0;
    // Tracks balanced enable/disableExecutionMaskWrites calls; see executionMaskWritesAreEnabled.
    int fExecutionMaskWritesEnabled = 0;
};

}  // namespace RP
}  // namespace SkSL