• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2022 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkSpan.h"
9 #include "include/core/SkTypes.h"
10 #include "include/private/base/SkTArray.h"
11 #include "src/base/SkUtils.h"
12 #include "src/core/SkRasterPipelineOpList.h"
13 #include "src/core/SkTHash.h"
14 
15 #include <cstdint>
16 #include <initializer_list>
17 #include <memory>
18 
19 class SkArenaAlloc;
20 class SkRasterPipeline;
21 class SkWStream;
22 
23 namespace SkSL {
24 
25 class SkRPDebugTrace;
26 
27 namespace RP {
28 
// A single scalar in our program consumes one slot.
using Slot = int;
// Sentinel value: indicates that a slot index is not applicable / unassigned.
constexpr Slot NA = -1;
32 
// Scalars, vectors, and matrices can be represented as a range of slot indices.
struct SlotRange {
    Slot index = 0;  // first slot in the range
    int count = 0;   // number of consecutive slots covered by the range
};
38 
// An RP::Program will consist entirely of ProgramOps. The ProgramOps list is a superset of the
// native SkRasterPipelineOps op-list. It also has a few extra ops to indicate child-effect
// invocation, and a `label` op to indicate branch targets.
enum class ProgramOp {
    // A finished program can contain any native Raster Pipeline op...
    #define M(stage) stage,
        SK_RASTER_PIPELINE_OPS_ALL(M)
    #undef M

    // ... has branch targets...
    label,

    // ... and can invoke child programs.
    // NOTE: `label` and the invoke_* values must keep the same integer values as their BuilderOp
    // counterparts; this is enforced by static_asserts below.
    invoke_shader,
    invoke_color_filter,
    invoke_blender,
};
56 
// BuilderOps are a superset of ProgramOps. They are used by the RP::Builder, which works in terms
// of Instructions; Instructions are slightly more expressive than raw SkRasterPipelineOps. In
// particular, the Builder supports stacks for pushing and popping scratch values.
// RP::Program::makeStages is responsible for rewriting Instructions/BuilderOps into an array of
// RP::Program::Stages, which will contain only native SkRasterPipelineOps and (optionally)
// child-effect invocations.
enum class BuilderOp {
    // An in-flight program can contain all the native Raster Pipeline ops...
    #define M(stage) stage,
        SK_RASTER_PIPELINE_OPS_ALL(M)
    #undef M

    // ... has branch targets...
    label,

    // ... can invoke child programs...
    // NOTE: `label` and the invoke_* values must keep the same integer values as their ProgramOp
    // counterparts; this is enforced by static_asserts below.
    invoke_shader,
    invoke_color_filter,
    invoke_blender,

    // ... and also has Builder-specific ops. These ops generally interface with the stack, and are
    // converted into ProgramOps during `makeStages`.
    push_literal,
    push_slots,
    push_uniform,
    push_zeros,
    push_clone,
    push_clone_from_stack,
    copy_stack_to_slots,
    copy_stack_to_slots_unmasked,
    swizzle_copy_stack_to_slots,
    discard_stack,
    select,
    push_condition_mask,
    pop_condition_mask,
    push_loop_mask,
    pop_loop_mask,
    pop_and_reenable_loop_mask,
    push_return_mask,
    pop_return_mask,
    push_src_rgba,
    push_dst_rgba,
    pop_src_rg,
    pop_src_rgba,
    pop_dst_rgba,
    set_current_stack,
    branch_if_no_active_lanes_on_stack_top_equal,
    unsupported
};
106 
// If the child-invocation enums are not in sync between enums, program creation will not work.
// (ProgramOps and BuilderOps with shared meaning must share integer values so that a BuilderOp can
// be converted to a ProgramOp by a simple cast.)
static_assert((int)ProgramOp::label               == (int)BuilderOp::label);
static_assert((int)ProgramOp::invoke_shader       == (int)BuilderOp::invoke_shader);
static_assert((int)ProgramOp::invoke_color_filter == (int)BuilderOp::invoke_color_filter);
static_assert((int)ProgramOp::invoke_blender      == (int)BuilderOp::invoke_blender);
112 
113 // Represents a single raster-pipeline SkSL instruction.
114 struct Instruction {
115     Instruction(BuilderOp op, std::initializer_list<Slot> slots, int a = 0, int b = 0, int c = 0)
fOpInstruction116             : fOp(op), fImmA(a), fImmB(b), fImmC(c) {
117         auto iter = slots.begin();
118         if (iter != slots.end()) { fSlotA = *iter++; }
119         if (iter != slots.end()) { fSlotB = *iter++; }
120         if (iter != slots.end()) { fSlotC = *iter++; }
121         SkASSERT(iter == slots.end());
122     }
123 
124     BuilderOp fOp;
125     Slot      fSlotA = NA;
126     Slot      fSlotB = NA;
127     Slot      fSlotC = NA;
128     int       fImmA = 0;
129     int       fImmB = 0;
130     int       fImmC = 0;
131 };
132 
// Interface through which a Program hands work back to its caller: invoking child effects and
// performing linear-sRGB color-space conversion.
class Callbacks {
public:
    virtual ~Callbacks() = default;

    // Handles the child effect at `index`.
    // NOTE(review): presumably returns true on success — confirm against implementations.
    virtual bool appendShader(int index) = 0;
    virtual bool appendColorFilter(int index) = 0;
    virtual bool appendBlender(int index) = 0;

    // Names suggest these append to/from-linear-sRGB conversion stages — confirm with callers.
    virtual void toLinearSrgb() = 0;
    virtual void fromLinearSrgb() = 0;
};
144 
class Program {
public:
    Program(SkTArray<Instruction> instrs,
            int numValueSlots,
            int numUniformSlots,
            int numLabels,
            SkRPDebugTrace* debugTrace);

#if !defined(SKSL_STANDALONE)
    // Appends this program onto `pipeline`, allocating slot storage from `alloc`. `callbacks`
    // handles child-effect invocation; `uniforms` supplies the uniform slot values.
    bool appendStages(SkRasterPipeline* pipeline,
                      SkArenaAlloc* alloc,
                      Callbacks* callbacks,
                      SkSpan<const float> uniforms) const;
#endif

    // Writes a human-readable listing of the program to `out`.
    void dump(SkWStream* out) const;

private:
    using StackDepthMap = SkTHashMap<int, int>; // <stack index, depth of stack>

    struct SlotData {
        SkSpan<float> values;
        SkSpan<float> stack;
    };
    SlotData allocateSlotData(SkArenaAlloc* alloc) const;

    struct Stage {
        ProgramOp op;
        void*     ctx;
    };
    void makeStages(SkTArray<Stage>* pipeline,
                    SkArenaAlloc* alloc,
                    SkSpan<const float> uniforms,
                    const SlotData& slots) const;
    void optimize();
    StackDepthMap tempStackMaxDepths() const;

    // These methods are used to split up large multi-slot operations into multiple ops as needed.
    void appendCopy(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                    ProgramOp baseStage,
                    float* dst, int dstStride, const float* src, int srcStride, int numSlots) const;
    void appendCopySlotsUnmasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                 float* dst, const float* src, int numSlots) const;
    void appendCopySlotsMasked(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                               float* dst, const float* src, int numSlots) const;
    void appendCopyConstants(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                             float* dst, const float* src, int numSlots) const;

    // Appends a single-slot single-input math operation to the pipeline. The op `stage` will be
    // appended `numSlots` times, starting at position `dst` and advancing one slot for each
    // subsequent invocation.
    void appendSingleSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp stage,
                                 float* dst, int numSlots) const;

    // Appends a multi-slot single-input math operation to the pipeline. `baseStage` must refer to
    // a single-slot "apply_op" stage, which must be immediately followed by specializations for
    // 2-4 slots. For instance, {`zero_slot`, `zero_2_slots`, `zero_3_slots`, `zero_4_slots`}
    // must be contiguous ops in the stage list, listed in that order; pass `zero_slot` and we
    // pick the appropriate op based on `numSlots`.
    void appendMultiSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp baseStage,
                                float* dst, int numSlots) const;

    // Appends a two-input math operation to the pipeline. `src` must be _immediately_ after `dst`
    // in memory. `stage` must refer to an unbounded "apply_to_n_slots" stage. A BinaryOpCtx
    // will be used to pass pointers to the destination and source; the delta between the two
    // pointers implicitly gives the number of slots.
    void appendAdjacentNWayBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                    ProgramOp stage,
                                    float* dst, const float* src, int numSlots) const;

    // Appends a multi-slot two-input math operation to the pipeline. `src` must be _immediately_
    // after `dst` in memory. `baseStage` must refer to an unbounded "apply_to_n_slots" stage, which
    // must be immediately followed by specializations for 1-4 slots. For instance, {`add_n_floats`,
    // `add_float`, `add_2_floats`, `add_3_floats`, `add_4_floats`} must be contiguous ops in the
    // stage list, listed in that order; pass `add_n_floats` and we pick the appropriate op based on
    // `numSlots`.
    void appendAdjacentMultiSlotBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                         ProgramOp baseStage,
                                         float* dst, const float* src, int numSlots) const;

    // Appends a multi-slot math operation having three inputs (dst, src0, src1) and one output
    // (dst) to the pipeline. The three inputs must be _immediately_ adjacent in memory. `stage`
    // must refer to an unbounded "apply_to_n_slots" stage, which must be immediately followed by
    // specializations for 1-4 slots.
    void appendAdjacentMultiSlotTernaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                          ProgramOp stage, float* dst,
                                          const float* src0, const float* src1, int numSlots) const;

    // Appends a stack_rewind op on platforms where it is needed (when SK_HAS_MUSTTAIL is not set).
    void appendStackRewind(SkTArray<Stage>* pipeline) const;

    SkTArray<Instruction> fInstructions;
    int fNumValueSlots = 0;
    int fNumUniformSlots = 0;
    int fNumTempStackSlots = 0;
    int fNumLabels = 0;
    SkTHashMap<int, int> fTempStackMaxDepths;
    SkRPDebugTrace* fDebugTrace = nullptr;
};
244 
245 class Builder {
246 public:
247     /** Finalizes and optimizes the program. */
248     std::unique_ptr<Program> finish(int numValueSlots,
249                                     int numUniformSlots,
250                                     SkRPDebugTrace* debugTrace = nullptr);
251     /**
252      * Peels off a label ID for use in the program. Set the label's position in the program with
253      * the `label` instruction. Actually branch to the target with an instruction like
254      * `branch_if_any_active_lanes` or `jump`.
255      */
nextLabelID()256     int nextLabelID() {
257         return fNumLabels++;
258     }
259 
260     /**
261      * The builder keeps track of the state of execution masks; when we know that the execution
262      * mask is unaltered, we can generate simpler code. Code which alters the execution mask is
263      * required to enable this flag.
264      */
enableExecutionMaskWrites()265     void enableExecutionMaskWrites() {
266         ++fExecutionMaskWritesEnabled;
267     }
268 
disableExecutionMaskWrites()269     void disableExecutionMaskWrites() {
270         SkASSERT(this->executionMaskWritesAreEnabled());
271         --fExecutionMaskWritesEnabled;
272     }
273 
executionMaskWritesAreEnabled()274     bool executionMaskWritesAreEnabled() {
275         return fExecutionMaskWritesEnabled > 0;
276     }
277 
278     /** Assemble a program from the Raster Pipeline instructions below. */
init_lane_masks()279     void init_lane_masks() {
280         fInstructions.push_back({BuilderOp::init_lane_masks, {}});
281     }
282 
store_src_rg(SlotRange slots)283     void store_src_rg(SlotRange slots) {
284         SkASSERT(slots.count == 2);
285         fInstructions.push_back({BuilderOp::store_src_rg, {slots.index}});
286     }
287 
store_src(SlotRange slots)288     void store_src(SlotRange slots) {
289         SkASSERT(slots.count == 4);
290         fInstructions.push_back({BuilderOp::store_src, {slots.index}});
291     }
292 
store_dst(SlotRange slots)293     void store_dst(SlotRange slots) {
294         SkASSERT(slots.count == 4);
295         fInstructions.push_back({BuilderOp::store_dst, {slots.index}});
296     }
297 
store_device_xy01(SlotRange slots)298     void store_device_xy01(SlotRange slots) {
299         SkASSERT(slots.count == 4);
300         fInstructions.push_back({BuilderOp::store_device_xy01, {slots.index}});
301     }
302 
load_src(SlotRange slots)303     void load_src(SlotRange slots) {
304         SkASSERT(slots.count == 4);
305         fInstructions.push_back({BuilderOp::load_src, {slots.index}});
306     }
307 
load_dst(SlotRange slots)308     void load_dst(SlotRange slots) {
309         SkASSERT(slots.count == 4);
310         fInstructions.push_back({BuilderOp::load_dst, {slots.index}});
311     }
312 
set_current_stack(int stackIdx)313     void set_current_stack(int stackIdx) {
314         fInstructions.push_back({BuilderOp::set_current_stack, {}, stackIdx});
315     }
316 
317     // Inserts a label into the instruction stream.
318     void label(int labelID);
319 
320     // Unconditionally branches to a label.
321     void jump(int labelID);
322 
323     // Branches to a label if the execution mask is active in any lane.
324     void branch_if_any_active_lanes(int labelID);
325 
326     // Branches to a label if the execution mask is inactive across all lanes.
327     void branch_if_no_active_lanes(int labelID);
328 
329     // Branches to a label if the top value on the stack is _not_ equal to `value` in any lane.
330     void branch_if_no_active_lanes_on_stack_top_equal(int value, int labelID);
331 
332     // We use the same SkRasterPipeline op regardless of the literal type, and bitcast the value.
push_literal_f(float val)333     void push_literal_f(float val) {
334         this->push_literal_i(sk_bit_cast<int32_t>(val));
335     }
336 
push_literal_i(int32_t val)337     void push_literal_i(int32_t val) {
338         if (val == 0) {
339             this->push_zeros(1);
340         } else {
341             fInstructions.push_back({BuilderOp::push_literal, {}, val});
342         }
343     }
344 
push_literal_u(uint32_t val)345     void push_literal_u(uint32_t val) {
346         this->push_literal_i(sk_bit_cast<int32_t>(val));
347     }
348 
349     // Translates into copy_constants (from uniforms into temp stack) in Raster Pipeline.
350     void push_uniform(SlotRange src);
351 
push_zeros(int count)352     void push_zeros(int count) {
353         // Translates into zero_slot_unmasked in Raster Pipeline.
354         if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::push_zeros) {
355             // Coalesce adjacent push_zero ops into a single op.
356             fInstructions.back().fImmA += count;
357         } else {
358             fInstructions.push_back({BuilderOp::push_zeros, {}, count});
359         }
360     }
361 
362     // Translates into copy_slots_unmasked (from values into temp stack) in Raster Pipeline.
363     void push_slots(SlotRange src);
364 
365     // Translates into copy_slots_masked (from temp stack to values) in Raster Pipeline.
366     // Does not discard any values on the temp stack.
copy_stack_to_slots(SlotRange dst)367     void copy_stack_to_slots(SlotRange dst) {
368         this->copy_stack_to_slots(dst, /*offsetFromStackTop=*/dst.count);
369     }
370 
371     void copy_stack_to_slots(SlotRange dst, int offsetFromStackTop);
372 
373     // Translates into swizzle_copy_slots_masked (from temp stack to values) in Raster Pipeline.
374     // Does not discard any values on the temp stack.
375     void swizzle_copy_stack_to_slots(SlotRange dst,
376                                      SkSpan<const int8_t> components,
377                                      int offsetFromStackTop);
378 
379     // Translates into copy_slots_unmasked (from temp stack to values) in Raster Pipeline.
380     // Does not discard any values on the temp stack.
copy_stack_to_slots_unmasked(SlotRange dst)381     void copy_stack_to_slots_unmasked(SlotRange dst) {
382         this->copy_stack_to_slots_unmasked(dst, /*offsetFromStackTop=*/dst.count);
383     }
384 
385     void copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop);
386 
387     // Performs a unary op (like `bitwise_not`), given a slot count of `slots`. The stack top is
388     // replaced with the result.
389     void unary_op(BuilderOp op, int32_t slots);
390 
391     // Performs a binary op (like `add_n_floats` or `cmpeq_n_ints`), given a slot count of
392     // `slots`. Two n-slot input values are consumed, and the result is pushed onto the stack.
393     void binary_op(BuilderOp op, int32_t slots);
394 
395     // Performs a ternary op (like `mix` or `smoothstep`), given a slot count of
396     // `slots`. Three n-slot input values are consumed, and the result is pushed onto the stack.
397     void ternary_op(BuilderOp op, int32_t slots);
398 
399     // Computes a dot product on the stack. The slots consumed (`slots`) must be between 1 and 4.
400     // Two n-slot input vectors are consumed, and a scalar result is pushed onto the stack.
401     void dot_floats(int32_t slots);
402 
403     // Shrinks the temp stack, discarding values on top.
404     void discard_stack(int32_t count = 1);
405 
406     // Copies vales from the temp stack into slots, and then shrinks the temp stack.
407     void pop_slots(SlotRange dst);
408 
409     // Creates many clones of the top single-slot item on the temp stack.
410     void push_duplicates(int count);
411 
412     // Creates a single clone of an item on the current temp stack. The cloned item can consist of
413     // any number of slots, and can be copied from an earlier position on the stack.
414     void push_clone(int numSlots, int offsetFromStackTop = 0) {
415         fInstructions.push_back({BuilderOp::push_clone, {}, numSlots,
416                                  numSlots + offsetFromStackTop});
417     }
418 
419     // Creates a single clone from an item on any temp stack. The cloned item can consist of any
420     // number of slots.
421     void push_clone_from_stack(int numSlots, int otherStackIndex, int offsetFromStackTop = 0);
422 
423     // Compares the stack top with the passed-in value; if it matches, enables the loop mask.
case_op(int value)424     void case_op(int value) {
425         fInstructions.push_back({BuilderOp::case_op, {}, value});
426     }
427 
select(int slots)428     void select(int slots) {
429         // Overlays the top two entries on the stack, making one hybrid entry. The execution mask
430         // is used to select which lanes are preserved.
431         SkASSERT(slots > 0);
432         fInstructions.push_back({BuilderOp::select, {}, slots});
433     }
434 
435     // The opposite of push_slots; copies values from the temp stack into value slots, then
436     // shrinks the temp stack.
437     void pop_slots_unmasked(SlotRange dst);
438 
copy_slots_masked(SlotRange dst,SlotRange src)439     void copy_slots_masked(SlotRange dst, SlotRange src) {
440         SkASSERT(dst.count == src.count);
441         fInstructions.push_back({BuilderOp::copy_slot_masked, {dst.index, src.index}, dst.count});
442     }
443 
444     void copy_slots_unmasked(SlotRange dst, SlotRange src);
445 
copy_constant(Slot slot,int constantValue)446     void copy_constant(Slot slot, int constantValue) {
447         fInstructions.push_back({BuilderOp::copy_constant, {slot}, constantValue});
448     }
449 
450     // Stores zeros across the entire slot range.
451     void zero_slots_unmasked(SlotRange dst);
452 
453     // Consumes `consumedSlots` elements on the stack, then generates `components.size()` elements.
454     void swizzle(int consumedSlots, SkSpan<const int8_t> components);
455 
456     // Transposes a matrix of size CxR on the stack (into a matrix of size RxC).
457     void transpose(int columns, int rows);
458 
459     // Generates a CxR diagonal matrix from the top two scalars on the stack. The second scalar is
460     // used as the diagonal value; the first scalar (usually zero) fills in the rest of the slots.
461     void diagonal_matrix(int columns, int rows);
462 
463     // Resizes a CxR matrix at the top of the stack to C'xR'.
464     void matrix_resize(int origColumns, int origRows, int newColumns, int newRows);
465 
push_condition_mask()466     void push_condition_mask() {
467         SkASSERT(this->executionMaskWritesAreEnabled());
468         fInstructions.push_back({BuilderOp::push_condition_mask, {}});
469     }
470 
pop_condition_mask()471     void pop_condition_mask() {
472         SkASSERT(this->executionMaskWritesAreEnabled());
473         fInstructions.push_back({BuilderOp::pop_condition_mask, {}});
474     }
475 
merge_condition_mask()476     void merge_condition_mask() {
477         SkASSERT(this->executionMaskWritesAreEnabled());
478         fInstructions.push_back({BuilderOp::merge_condition_mask, {}});
479     }
480 
push_loop_mask()481     void push_loop_mask() {
482         SkASSERT(this->executionMaskWritesAreEnabled());
483         fInstructions.push_back({BuilderOp::push_loop_mask, {}});
484     }
485 
pop_loop_mask()486     void pop_loop_mask() {
487         SkASSERT(this->executionMaskWritesAreEnabled());
488         fInstructions.push_back({BuilderOp::pop_loop_mask, {}});
489     }
490 
push_src_rgba()491     void push_src_rgba() {
492         fInstructions.push_back({BuilderOp::push_src_rgba, {}});
493     }
494 
push_dst_rgba()495     void push_dst_rgba() {
496         fInstructions.push_back({BuilderOp::push_dst_rgba, {}});
497     }
498 
pop_src_rg()499     void pop_src_rg() {
500         fInstructions.push_back({BuilderOp::pop_src_rg, {}});
501     }
502 
pop_src_rgba()503     void pop_src_rgba() {
504         fInstructions.push_back({BuilderOp::pop_src_rgba, {}});
505     }
506 
pop_dst_rgba()507     void pop_dst_rgba() {
508         fInstructions.push_back({BuilderOp::pop_dst_rgba, {}});
509     }
510 
mask_off_loop_mask()511     void mask_off_loop_mask() {
512         SkASSERT(this->executionMaskWritesAreEnabled());
513         fInstructions.push_back({BuilderOp::mask_off_loop_mask, {}});
514     }
515 
reenable_loop_mask(SlotRange src)516     void reenable_loop_mask(SlotRange src) {
517         SkASSERT(this->executionMaskWritesAreEnabled());
518         SkASSERT(src.count == 1);
519         fInstructions.push_back({BuilderOp::reenable_loop_mask, {src.index}});
520     }
521 
pop_and_reenable_loop_mask()522     void pop_and_reenable_loop_mask() {
523         SkASSERT(this->executionMaskWritesAreEnabled());
524         fInstructions.push_back({BuilderOp::pop_and_reenable_loop_mask, {}});
525     }
526 
merge_loop_mask()527     void merge_loop_mask() {
528         SkASSERT(this->executionMaskWritesAreEnabled());
529         fInstructions.push_back({BuilderOp::merge_loop_mask, {}});
530     }
531 
push_return_mask()532     void push_return_mask() {
533         SkASSERT(this->executionMaskWritesAreEnabled());
534         fInstructions.push_back({BuilderOp::push_return_mask, {}});
535     }
536 
537     void pop_return_mask();
538 
mask_off_return_mask()539     void mask_off_return_mask() {
540         SkASSERT(this->executionMaskWritesAreEnabled());
541         fInstructions.push_back({BuilderOp::mask_off_return_mask, {}});
542     }
543 
invoke_shader(int childIdx)544     void invoke_shader(int childIdx) {
545         fInstructions.push_back({BuilderOp::invoke_shader, {}, childIdx});
546     }
547 
invoke_color_filter(int childIdx)548     void invoke_color_filter(int childIdx) {
549         fInstructions.push_back({BuilderOp::invoke_color_filter, {}, childIdx});
550     }
551 
invoke_blender(int childIdx)552     void invoke_blender(int childIdx) {
553         fInstructions.push_back({BuilderOp::invoke_blender, {}, childIdx});
554     }
555 
556 private:
557     void simplifyPopSlotsUnmasked(SlotRange* dst);
558 
559     SkTArray<Instruction> fInstructions;
560     int fNumLabels = 0;
561     int fExecutionMaskWritesEnabled = 0;
562 };
563 
564 }  // namespace RP
565 }  // namespace SkSL
566