1 /*
2  * Copyright 2022 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkStream.h"
9 #include "include/private/SkSLString.h"
10 #include "include/private/base/SkMalloc.h"
11 #include "include/private/base/SkTo.h"
12 #include "include/sksl/SkSLPosition.h"
13 #include "src/base/SkArenaAlloc.h"
14 #include "src/core/SkOpts.h"
15 #include "src/core/SkRasterPipelineOpContexts.h"
16 #include "src/core/SkRasterPipelineOpList.h"
17 #include "src/sksl/codegen/SkSLRasterPipelineBuilder.h"
18 #include "src/sksl/tracing/SkRPDebugTrace.h"
19 #include "src/sksl/tracing/SkSLDebugInfo.h"
20 #include "src/utils/SkBitSet.h"
21 
22 #if !defined(SKSL_STANDALONE)
23 #include "src/core/SkRasterPipeline.h"
24 #endif
25 
26 #include <algorithm>
27 #include <cmath>
28 #include <cstring>
29 #include <iterator>
30 #include <string>
31 #include <string_view>
32 #include <tuple>
33 #include <utility>
34 #include <vector>
35 
36 namespace SkSL {
37 namespace RP {
38 
39 #define ALL_SINGLE_SLOT_UNARY_OP_CASES  \
40          BuilderOp::atan_float:         \
41     case BuilderOp::cos_float:          \
42     case BuilderOp::exp_float:          \
43     case BuilderOp::sin_float:          \
44     case BuilderOp::sqrt_float:         \
45     case BuilderOp::tan_float
46 
47 #define ALL_MULTI_SLOT_UNARY_OP_CASES        \
48          BuilderOp::abs_float:               \
49     case BuilderOp::abs_int:                 \
50     case BuilderOp::bitwise_not_int:         \
51     case BuilderOp::cast_to_float_from_int:  \
52     case BuilderOp::cast_to_float_from_uint: \
53     case BuilderOp::cast_to_int_from_float:  \
54     case BuilderOp::cast_to_uint_from_float: \
55     case BuilderOp::ceil_float:              \
56     case BuilderOp::floor_float              \
57 
58 #define ALL_N_WAY_BINARY_OP_CASES   \
59          BuilderOp::atan2_n_floats: \
60     case BuilderOp::pow_n_floats
61 
62 #define ALL_MULTI_SLOT_BINARY_OP_CASES  \
63          BuilderOp::add_n_floats:       \
64     case BuilderOp::add_n_ints:         \
65     case BuilderOp::sub_n_floats:       \
66     case BuilderOp::sub_n_ints:         \
67     case BuilderOp::mul_n_floats:       \
68     case BuilderOp::mul_n_ints:         \
69     case BuilderOp::div_n_floats:       \
70     case BuilderOp::div_n_ints:         \
71     case BuilderOp::div_n_uints:        \
72     case BuilderOp::bitwise_and_n_ints: \
73     case BuilderOp::bitwise_or_n_ints:  \
74     case BuilderOp::bitwise_xor_n_ints: \
75     case BuilderOp::min_n_floats:       \
76     case BuilderOp::min_n_ints:         \
77     case BuilderOp::min_n_uints:        \
78     case BuilderOp::max_n_floats:       \
79     case BuilderOp::max_n_ints:         \
80     case BuilderOp::max_n_uints:        \
81     case BuilderOp::cmple_n_floats:     \
82     case BuilderOp::cmple_n_ints:       \
83     case BuilderOp::cmple_n_uints:      \
84     case BuilderOp::cmplt_n_floats:     \
85     case BuilderOp::cmplt_n_ints:       \
86     case BuilderOp::cmplt_n_uints:      \
87     case BuilderOp::cmpeq_n_floats:     \
88     case BuilderOp::cmpeq_n_ints:       \
89     case BuilderOp::cmpne_n_floats:     \
90     case BuilderOp::cmpne_n_ints
91 
92 #define ALL_MULTI_SLOT_TERNARY_OP_CASES \
93          BuilderOp::mix_n_floats:       \
94     case BuilderOp::mix_n_ints
95 
96 void Builder::unary_op(BuilderOp op, int32_t slots) {
97     switch (op) {
98         case ALL_SINGLE_SLOT_UNARY_OP_CASES:
99         case ALL_MULTI_SLOT_UNARY_OP_CASES:
100             fInstructions.push_back({op, {}, slots});
101             break;
102 
103         default:
104             SkDEBUGFAIL("not a unary op");
105             break;
106     }
107 }
108 
109 void Builder::binary_op(BuilderOp op, int32_t slots) {
110     switch (op) {
111         case ALL_N_WAY_BINARY_OP_CASES:
112         case ALL_MULTI_SLOT_BINARY_OP_CASES:
113             fInstructions.push_back({op, {}, slots});
114             break;
115 
116         default:
117             SkDEBUGFAIL("not a binary op");
118             break;
119     }
120 }
121 
122 void Builder::ternary_op(BuilderOp op, int32_t slots) {
123     switch (op) {
124         case ALL_MULTI_SLOT_TERNARY_OP_CASES:
125             fInstructions.push_back({op, {}, slots});
126             break;
127 
128         default:
129             SkDEBUGFAIL("not a ternary op");
130             break;
131     }
132 }
133 
134 void Builder::dot_floats(int32_t slots) {
135     switch (slots) {
136         case 1: fInstructions.push_back({BuilderOp::mul_n_floats, {}, slots}); break;
137         case 2: fInstructions.push_back({BuilderOp::dot_2_floats, {}, slots}); break;
138         case 3: fInstructions.push_back({BuilderOp::dot_3_floats, {}, slots}); break;
139         case 4: fInstructions.push_back({BuilderOp::dot_4_floats, {}, slots}); break;
140 
141         default:
142             SkDEBUGFAIL("invalid number of slots");
143             break;
144     }
145 }
146 
147 void Builder::discard_stack(int32_t count) {
148     // If we pushed something onto the stack and then immediately discarded part of it, we can
149     // shrink or eliminate the push.
150     while (count > 0 && !fInstructions.empty()) {
151         Instruction& lastInstruction = fInstructions.back();
152 
153         switch (lastInstruction.fOp) {
154             case BuilderOp::discard_stack:
155                 // Our last op was actually a separate discard_stack; combine the discards.
156                 lastInstruction.fImmA += count;
157                 return;
158 
159             case BuilderOp::push_zeros:
160             case BuilderOp::push_clone:
161             case BuilderOp::push_clone_from_stack:
162             case BuilderOp::push_slots:
163             case BuilderOp::push_uniform:
164                 // Our last op was a multi-slot push; cancel out one discard and eliminate the op
165                 // if its count reached zero.
166                 --count;
167                 --lastInstruction.fImmA;
168                 if (lastInstruction.fImmA == 0) {
169                     fInstructions.pop_back();
170                 }
171                 continue;
172 
173             case BuilderOp::push_literal:
174             case BuilderOp::push_condition_mask:
175             case BuilderOp::push_loop_mask:
176             case BuilderOp::push_return_mask:
177                 // Our last op was a single-slot push; cancel out one discard and eliminate the op.
178                 --count;
179                 fInstructions.pop_back();
180                 continue;
181 
182             default:
183                 break;
184         }
185 
186         // This instruction wasn't a push.
187         break;
188     }
189 
190     if (count > 0) {
191         fInstructions.push_back({BuilderOp::discard_stack, {}, count});
192     }
193 }
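// For example, a three-slot `push_slots` followed immediately by `discard_stack(1)` is rewritten
// here as a two-slot push, and a `discard_stack(3)` would remove the push instruction entirely.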
194 
195 void Builder::label(int labelID) {
196     SkASSERT(labelID >= 0 && labelID < fNumLabels);
197 
198     // If the previous instruction was a branch to this label, it's a no-op; jumping to the very
199     // next instruction is effectively meaningless.
200     while (!fInstructions.empty()) {
201         Instruction& lastInstruction = fInstructions.back();
202         switch (lastInstruction.fOp) {
203             case BuilderOp::jump:
204             case BuilderOp::branch_if_any_active_lanes:
205             case BuilderOp::branch_if_no_active_lanes:
206             case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal:
207                 if (lastInstruction.fImmA == labelID) {
208                     fInstructions.pop_back();
209                     continue;
210                 }
211                 break;
212 
213             default:
214                 break;
215         }
216         break;
217     }
218     fInstructions.push_back({BuilderOp::label, {}, labelID});
219 }
220 
221 void Builder::jump(int labelID) {
222     SkASSERT(labelID >= 0 && labelID < fNumLabels);
223     if (!fInstructions.empty() && fInstructions.back().fOp == BuilderOp::jump) {
224         // The previous instruction was also `jump`, so this branch could never possibly occur.
225         return;
226     }
227     fInstructions.push_back({BuilderOp::jump, {}, labelID});
228 }
229 
230 void Builder::branch_if_any_active_lanes(int labelID) {
231     if (!this->executionMaskWritesAreEnabled()) {
232         this->jump(labelID);
233         return;
234     }
235 
236     SkASSERT(labelID >= 0 && labelID < fNumLabels);
237     if (!fInstructions.empty() &&
238         (fInstructions.back().fOp == BuilderOp::branch_if_any_active_lanes ||
239          fInstructions.back().fOp == BuilderOp::jump)) {
240         // The previous instruction was `jump` or `branch_if_any_active_lanes`, so this branch
241         // could never possibly occur.
242         return;
243     }
244     fInstructions.push_back({BuilderOp::branch_if_any_active_lanes, {}, labelID});
245 }
246 
247 void Builder::branch_if_no_active_lanes(int labelID) {
248     if (!this->executionMaskWritesAreEnabled()) {
249         return;
250     }
251 
252     SkASSERT(labelID >= 0 && labelID < fNumLabels);
253     if (!fInstructions.empty() &&
254         (fInstructions.back().fOp == BuilderOp::branch_if_no_active_lanes ||
255          fInstructions.back().fOp == BuilderOp::jump)) {
256         // The previous instruction was `jump` or `branch_if_no_active_lanes`, so this branch
257         // could never possibly occur.
258         return;
259     }
260     fInstructions.push_back({BuilderOp::branch_if_no_active_lanes, {}, labelID});
261 }
262 
263 void Builder::branch_if_no_active_lanes_on_stack_top_equal(int value, int labelID) {
264     SkASSERT(labelID >= 0 && labelID < fNumLabels);
265     if (!fInstructions.empty() &&
266         (fInstructions.back().fOp == BuilderOp::jump ||
267          (fInstructions.back().fOp == BuilderOp::branch_if_no_active_lanes_on_stack_top_equal &&
268           fInstructions.back().fImmB == value))) {
269         // The previous instruction was `jump` or `branch_if_no_active_lanes_on_stack_top_equal`
270         // (checking against the same value), so this branch could never possibly occur.
271         return;
272     }
273     fInstructions.push_back({BuilderOp::branch_if_no_active_lanes_on_stack_top_equal,
274                              {}, labelID, value});
275 }
276 
277 void Builder::push_slots(SlotRange src) {
278     SkASSERT(src.count >= 0);
279     if (!fInstructions.empty()) {
280         Instruction& lastInstruction = fInstructions.back();
281 
282         // If the previous instruction was pushing slots contiguous to this range, we can collapse
283         // the two pushes into one larger push.
284         if (lastInstruction.fOp == BuilderOp::push_slots &&
285             lastInstruction.fSlotA + lastInstruction.fImmA == src.index) {
286             lastInstruction.fImmA += src.count;
287             return;
288         }
289 
290         // If the previous instruction was discarding an equal number of slots...
291         if (lastInstruction.fOp == BuilderOp::discard_stack && lastInstruction.fImmA == src.count) {
292             // ... and the instruction before that was copying from the stack to the same slots...
293             Instruction& prevInstruction = fInstructions.fromBack(1);
294             if ((prevInstruction.fOp == BuilderOp::copy_stack_to_slots ||
295                  prevInstruction.fOp == BuilderOp::copy_stack_to_slots_unmasked) &&
296                 prevInstruction.fSlotA == src.index &&
297                 prevInstruction.fImmA == src.count) {
298                 // ... we are emitting `copy stack to X, discard stack, copy X to stack`. This is a
299                 // common pattern when multiple operations in a row affect the same variable. We can
300                 // eliminate the discard and just leave X on the stack.
301                 fInstructions.pop_back();
302                 return;
303             }
304         }
305     }
306 
307     if (src.count > 0) {
308         fInstructions.push_back({BuilderOp::push_slots, {src.index}, src.count});
309     }
310 }
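// For example, `push_slots({10, 2})` followed immediately by `push_slots({12, 2})` merges into a
// single four-slot push covering slots 10..13.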
311 
312 void Builder::push_uniform(SlotRange src) {
313     SkASSERT(src.count >= 0);
314     if (!fInstructions.empty()) {
315         Instruction& lastInstruction = fInstructions.back();
316 
317         // If the previous instruction was pushing uniforms contiguous to this range, we can
318         // collapse the two pushes into one larger push.
319         if (lastInstruction.fOp == BuilderOp::push_uniform &&
320             lastInstruction.fSlotA + lastInstruction.fImmA == src.index) {
321             lastInstruction.fImmA += src.count;
322             return;
323         }
324     }
325 
326     if (src.count > 0) {
327         fInstructions.push_back({BuilderOp::push_uniform, {src.index}, src.count});
328     }
329 }
330 
331 void Builder::push_duplicates(int count) {
332     if (!fInstructions.empty()) {
333         Instruction& lastInstruction = fInstructions.back();
334 
335         // If the previous op is pushing a zero, we can just push more of them.
336         if (lastInstruction.fOp == BuilderOp::push_zeros) {
337             lastInstruction.fImmA += count;
338             return;
339         }
340     }
341     SkASSERT(count >= 0);
342     if (count >= 3) {
343         // Use a swizzle to splat the input into a 4-slot value.
344         this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0});
345         count -= 3;
346     }
347     for (; count >= 4; count -= 4) {
348         // Clone the splatted value four slots at a time.
349         this->push_clone(/*numSlots=*/4);
350     }
351     // Use a swizzle or clone to handle the trailing items.
352     switch (count) {
353         case 3:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0}); break;
354         case 2:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0});    break;
355         case 1:  this->push_clone(/*numSlots=*/1);              break;
356         default: break;
357     }
358 }
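// For example, `push_duplicates(5)` on a scalar emits a splatting swizzle (one slot becomes four)
// followed by a second swizzle (one slot becomes three), leaving six copies of the value on the
// stack: the original plus five duplicates.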
359 
360 void Builder::push_clone_from_stack(int numSlots, int otherStackIndex, int offsetFromStackTop) {
361     offsetFromStackTop += numSlots;
362 
363     if (!fInstructions.empty()) {
364         Instruction& lastInstruction = fInstructions.back();
365 
366         // If the previous op is also pushing a clone...
367         if (lastInstruction.fOp == BuilderOp::push_clone_from_stack &&
368             // ... from the same stack...
369             lastInstruction.fImmB == otherStackIndex &&
370             // ... and this clone starts at the same place that the last clone ends...
371             lastInstruction.fImmC - lastInstruction.fImmA == offsetFromStackTop) {
372             // ... just extend the existing clone-op.
373             lastInstruction.fImmA += numSlots;
374             return;
375         }
376     }
377 
378     fInstructions.push_back({BuilderOp::push_clone_from_stack, {},
379                              numSlots, otherStackIndex, offsetFromStackTop});
380 }
381 
382 void Builder::pop_slots(SlotRange dst) {
383     if (!this->executionMaskWritesAreEnabled()) {
384         this->pop_slots_unmasked(dst);
385         return;
386     }
387 
388     this->copy_stack_to_slots(dst);
389     this->discard_stack(dst.count);
390 }
391 
392 void Builder::simplifyPopSlotsUnmasked(SlotRange* dst) {
393     if (!dst->count || fInstructions.empty()) {
394         // There's nothing left to simplify.
395         return;
396     }
397 
398     Instruction& lastInstruction = fInstructions.back();
399 
400     // If the last instruction is pushing a constant, we can simplify it by copying the constant
401     // directly into the destination slot.
402     if (lastInstruction.fOp == BuilderOp::push_literal) {
403         // Remove the constant-push instruction.
404         int value = lastInstruction.fImmA;
405         fInstructions.pop_back();
406 
407         // Consume one destination slot.
408         dst->count--;
409         Slot destinationSlot = dst->index + dst->count;
410 
411         // Continue simplifying if possible.
412         this->simplifyPopSlotsUnmasked(dst);
413 
414         // Write the constant directly to the destination slot.
415         this->copy_constant(destinationSlot, value);
416         return;
417     }
418 
419     // If the last instruction is pushing a zero, we can save a step by directly zeroing out
420     // the destination slot.
421     if (lastInstruction.fOp == BuilderOp::push_zeros) {
422         // Remove one zero-push.
423         lastInstruction.fImmA--;
424         if (lastInstruction.fImmA == 0) {
425             fInstructions.pop_back();
426         }
427 
428         // Consume one destination slot.
429         dst->count--;
430         Slot destinationSlot = dst->index + dst->count;
431 
432         // Continue simplifying if possible.
433         this->simplifyPopSlotsUnmasked(dst);
434 
435         // Zero the destination slot directly.
436         this->zero_slots_unmasked({destinationSlot, 1});
437         return;
438     }
439 
440     // If the last instruction is pushing a slot, we can just copy that slot.
441     if (lastInstruction.fOp == BuilderOp::push_slots) {
442         // Get the last slot.
443         Slot sourceSlot = lastInstruction.fSlotA + lastInstruction.fImmA - 1;
444         lastInstruction.fImmA--;
445         if (lastInstruction.fImmA == 0) {
446             fInstructions.pop_back();
447         }
448 
449         // Consume one destination slot.
450         dst->count--;
451         Slot destinationSlot = dst->index + dst->count;
452 
453         // Try once more.
454         this->simplifyPopSlotsUnmasked(dst);
455 
456         // Copy the slot directly.
457         if (destinationSlot != sourceSlot) {
458             this->copy_slots_unmasked({destinationSlot, 1}, {sourceSlot, 1});
459         }
460         return;
461     }
462 }
463 
464 void Builder::pop_slots_unmasked(SlotRange dst) {
465     SkASSERT(dst.count >= 0);
466 
467     // If we are popping immediately after a push, we can simplify the code by writing the pushed
468     // value directly to the destination range.
469     this->simplifyPopSlotsUnmasked(&dst);
470 
471     // Pop from the stack normally.
472     if (dst.count > 0) {
473         this->copy_stack_to_slots_unmasked(dst);
474         this->discard_stack(dst.count);
475     }
476 }
477 
478 void Builder::copy_stack_to_slots(SlotRange dst, int offsetFromStackTop) {
479     // If the execution mask is known to be all-true, then we can ignore the write mask.
480     if (!this->executionMaskWritesAreEnabled()) {
481         this->copy_stack_to_slots_unmasked(dst, offsetFromStackTop);
482         return;
483     }
484 
485     // If the last instruction copied the previous stack slots, just extend it.
486     if (!fInstructions.empty()) {
487         Instruction& lastInstruction = fInstructions.back();
488 
489         // If the last op is copy-stack-to-slots...
490         if (lastInstruction.fOp == BuilderOp::copy_stack_to_slots &&
491             // and this op's destination is immediately after the last copy-slots-op's destination
492             lastInstruction.fSlotA + lastInstruction.fImmA == dst.index &&
493             // and this op's source is immediately after the last copy-slots-op's source
494             lastInstruction.fImmB - lastInstruction.fImmA == offsetFromStackTop) {
495             // then we can just extend the copy!
496             lastInstruction.fImmA += dst.count;
497             return;
498         }
499     }
500 
501     fInstructions.push_back({BuilderOp::copy_stack_to_slots, {dst.index},
502                              dst.count, offsetFromStackTop});
503 }
504 
505 static bool slot_ranges_overlap(SlotRange x, SlotRange y) {
506     return x.index < y.index + y.count &&
507            y.index < x.index + x.count;
508 }
509 
510 void Builder::copy_slots_unmasked(SlotRange dst, SlotRange src) {
511     // If the last instruction copied adjacent slots, just extend it.
512     if (!fInstructions.empty()) {
513         Instruction& lastInstr = fInstructions.back();
514 
515         // If the last op is copy-slots-unmasked...
516         if (lastInstr.fOp == BuilderOp::copy_slot_unmasked &&
517             // and this op's destination is immediately after the last copy-slots-op's destination
518             lastInstr.fSlotA + lastInstr.fImmA == dst.index &&
519             // and this op's source is immediately after the last copy-slots-op's source
520             lastInstr.fSlotB + lastInstr.fImmA == src.index &&
521             // and the source/dest ranges will not overlap
522             !slot_ranges_overlap({lastInstr.fSlotB, lastInstr.fImmA + dst.count},
523                                  {lastInstr.fSlotA, lastInstr.fImmA + dst.count})) {
524             // then we can just extend the copy!
525             lastInstr.fImmA += dst.count;
526             return;
527         }
528     }
529 
530     SkASSERT(dst.count == src.count);
531     fInstructions.push_back({BuilderOp::copy_slot_unmasked, {dst.index, src.index}, dst.count});
532 }
533 
534 void Builder::copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop) {
535     // If the last instruction copied the previous stack slots, just extend it.
536     if (!fInstructions.empty()) {
537         Instruction& lastInstruction = fInstructions.back();
538 
539         // If the last op is copy-stack-to-slots-unmasked...
540         if (lastInstruction.fOp == BuilderOp::copy_stack_to_slots_unmasked &&
541             // and this op's destination is immediately after the last copy-slots-op's destination
542             lastInstruction.fSlotA + lastInstruction.fImmA == dst.index &&
543             // and this op's source is immediately after the last copy-slots-op's source
544             lastInstruction.fImmB - lastInstruction.fImmA == offsetFromStackTop) {
545             // then we can just extend the copy!
546             lastInstruction.fImmA += dst.count;
547             return;
548         }
549     }
550 
551     fInstructions.push_back({BuilderOp::copy_stack_to_slots_unmasked, {dst.index},
552                              dst.count, offsetFromStackTop});
553 }
554 
555 void Builder::pop_return_mask() {
556     SkASSERT(this->executionMaskWritesAreEnabled());
557 
558     // This instruction is going to overwrite the return mask. If the previous instruction was
559     // masking off the return mask, that's wasted work and it can be eliminated.
560     if (!fInstructions.empty()) {
561         Instruction& lastInstruction = fInstructions.back();
562 
563         if (lastInstruction.fOp == BuilderOp::mask_off_return_mask) {
564             fInstructions.pop_back();
565         }
566     }
567 
568     fInstructions.push_back({BuilderOp::pop_return_mask, {}});
569 }
570 
571 void Builder::zero_slots_unmasked(SlotRange dst) {
572     if (!fInstructions.empty()) {
573         Instruction& lastInstruction = fInstructions.back();
574 
575         if (lastInstruction.fOp == BuilderOp::zero_slot_unmasked) {
576             if (lastInstruction.fSlotA + lastInstruction.fImmA == dst.index) {
577                 // The previous instruction was zeroing the range immediately before this range.
578                 // Combine the ranges.
579                 lastInstruction.fImmA += dst.count;
580                 return;
581             }
582         }
583 
584         if (lastInstruction.fOp == BuilderOp::zero_slot_unmasked) {
585             if (lastInstruction.fSlotA == dst.index + dst.count) {
586                 // The previous instruction was zeroing the range immediately after this range.
587                 // Combine the ranges.
588                 lastInstruction.fSlotA = dst.index;
589                 lastInstruction.fImmA += dst.count;
590                 return;
591             }
592         }
593     }
594 
595     fInstructions.push_back({BuilderOp::zero_slot_unmasked, {dst.index}, dst.count});
596 }
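// For example, zeroing {10, 2} and then {12, 3} collapses into a single zero_slot_unmasked op
// covering slots 10..14; zeroing {8, 2} afterwards extends that same range downward to slot 8.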
597 
598 static int pack_nybbles(SkSpan<const int8_t> components) {
599     // Pack up to 8 elements into nybbles, in reverse order.
600     int packed = 0;
601     for (auto iter = components.rbegin(); iter != components.rend(); ++iter) {
602         SkASSERT(*iter >= 0 && *iter <= 0xF);
603         packed <<= 4;
604         packed |= *iter;
605     }
606     return packed;
607 }
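// For example, packing the components {1, 2, 0, 3} yields 0x3021: component 0 lands in the lowest
// nybble, so unpacking from the low end recovers the components in their original order.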
608 
609 static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<uint16_t> offsets) {
610     // Unpack component nybbles into byte-offsets pointing at stack slots.
611     for (size_t index = 0; index < offsets.size(); ++index) {
612         offsets[index] = (components & 0xF) * SkOpts::raster_pipeline_highp_stride * sizeof(float);
613         components >>= 4;
614     }
615 }
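// Each unpacked nybble k becomes the byte offset k * N * sizeof(float), where N is the SIMD lane
// count (SkOpts::raster_pipeline_highp_stride) -- i.e. the start of the k-th slot on the stack.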
616 
617 void Builder::swizzle_copy_stack_to_slots(SlotRange dst,
618                                           SkSpan<const int8_t> components,
619                                           int offsetFromStackTop) {
620     // An unmasked version of this op could squeeze out a little bit of extra speed, if needed.
621     fInstructions.push_back({BuilderOp::swizzle_copy_stack_to_slots, {dst.index},
622                              (int)components.size(), offsetFromStackTop, pack_nybbles(components)});
623 }
624 
625 void Builder::swizzle(int consumedSlots, SkSpan<const int8_t> components) {
626     // Consumes `consumedSlots` elements on the stack, then generates `components.size()` elements.
627     SkASSERT(consumedSlots >= 0);
628 
629     // We only allow up to 16 elements, and they can only reach 0-15 slots, due to nybble packing.
630     int numElements = components.size();
631     SkASSERT(numElements <= 16);
632     SkASSERT(std::all_of(components.begin(), components.end(), [](int8_t e){ return e >= 0; }));
633     SkASSERT(std::all_of(components.begin(), components.end(), [](int8_t e){ return e <= 0xF; }));
634 
635     // Make a local copy of the element array.
636     int8_t elements[16] = {};
637     std::copy(components.begin(), components.end(), std::begin(elements));
638 
639     while (numElements > 0) {
640         // If the first element of the swizzle is zero...
641         if (elements[0] != 0) {
642             break;
643         }
644         // ...and zero isn't used elsewhere in the swizzle...
645         if (std::any_of(&elements[1], &elements[numElements], [](int8_t e) { return e == 0; })) {
646             break;
647         }
648         // We can omit the first slot from the swizzle entirely.
649         // Slide everything forward by one slot, and reduce the element index by one.
650         for (int index = 1; index < numElements; ++index) {
651             elements[index - 1] = elements[index] - 1;
652         }
653         elements[numElements - 1] = 0;
654         --consumedSlots;
655         --numElements;
656     }
657 
658     // A completely empty swizzle is a no-op.
659     if (numElements == 0) {
660         this->discard_stack(consumedSlots);
661         return;
662     }
663 
664     if (consumedSlots <= 4 && numElements <= 4) {
665         // We can fit everything into a little swizzle.
666         int op = (int)BuilderOp::swizzle_1 + numElements - 1;
667         fInstructions.push_back({(BuilderOp)op, {}, consumedSlots,
668                                  pack_nybbles(SkSpan(elements, numElements))});
669         return;
670     }
671 
672     // This is a big swizzle. We use the `shuffle` op to handle these.
673     // Slot usage is packed into immA. The top 16 bits of immA count the consumed slots; the bottom
674     // 16 bits count the generated slots.
675     int slotUsage = consumedSlots << 16;
676     slotUsage |= numElements;
677 
678     // Pack immB and immC with the shuffle list in packed-nybble form.
679     fInstructions.push_back({BuilderOp::shuffle, {}, slotUsage,
680                              pack_nybbles(SkSpan(&elements[0], 8)),
681                              pack_nybbles(SkSpan(&elements[8], 8))});
682 }
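// As an example of the `shuffle` encoding: consuming nine slots and generating six packs immA as
// (9 << 16) | 6, and the six element indices occupy the low nybbles of immB.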
683 
684 void Builder::transpose(int columns, int rows) {
685     // Transposes a matrix of size CxR on the stack (into a matrix of size RxC).
686     int8_t elements[16] = {};
687     size_t index = 0;
688     for (int r = 0; r < rows; ++r) {
689         for (int c = 0; c < columns; ++c) {
690             elements[index++] = (c * rows) + r;
691         }
692     }
693     this->swizzle(/*consumedSlots=*/columns * rows, SkSpan(elements, index));
694 }
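// For example, transpose(2, 3) -- a two-column, three-row matrix -- emits the swizzle
// {0, 3, 1, 4, 2, 5}, which lays out the 3x2 transpose in column-major order.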
695 
696 void Builder::diagonal_matrix(int columns, int rows) {
697     // Generates a CxR diagonal matrix from the top two scalars on the stack.
698     int8_t elements[16] = {};
699     size_t index = 0;
700     for (int c = 0; c < columns; ++c) {
701         for (int r = 0; r < rows; ++r) {
702             elements[index++] = (c == r) ? 1 : 0;
703         }
704     }
705     this->swizzle(/*consumedSlots=*/2, SkSpan(elements, index));
706 }
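// The top-of-stack scalar (element 1) is placed on the diagonal and the scalar beneath it
// (element 0) everywhere else; e.g. diagonal_matrix(2, 2) emits the swizzle {1, 0, 0, 1}.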
707 
708 void Builder::matrix_resize(int origColumns, int origRows, int newColumns, int newRows) {
709     // Resizes a CxR matrix at the top of the stack to C'xR'.
710     int8_t elements[16] = {};
711     size_t index = 0;
712 
713     size_t consumedSlots = origColumns * origRows;
714     size_t zeroOffset = 0, oneOffset = 0;
715 
716     for (int c = 0; c < newColumns; ++c) {
717         for (int r = 0; r < newRows; ++r) {
718             if (c < origColumns && r < origRows) {
719                 // Push an element from the original matrix.
720                 elements[index++] = (c * origRows) + r;
721             } else {
722                 // This element is outside the original matrix; push 1 or 0.
723                 if (c == r) {
724                     // We need to synthesize a literal 1.
725                     if (oneOffset == 0) {
726                         this->push_literal_f(1.0f);
727                         oneOffset = consumedSlots++;
728                     }
729                     elements[index++] = oneOffset;
730                 } else {
731                     // We need to synthesize a literal 0.
732                     if (zeroOffset == 0) {
733                         this->push_zeros(1);
734                         zeroOffset = consumedSlots++;
735                     }
736                     elements[index++] = zeroOffset;
737                 }
738             }
739         }
740     }
741     this->swizzle(consumedSlots, SkSpan(elements, index));
742 }
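// For example, growing a 2x2 matrix to 3x3 pushes one zero and one literal 1.0 above the four
// original slots, then emits a nine-element swizzle that fills the padding cells from those two
// extra slots (the zero at offset 4, the one at offset 5).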
743 
744 std::unique_ptr<Program> Builder::finish(int numValueSlots,
745                                          int numUniformSlots,
746                                          SkRPDebugTrace* debugTrace) {
747     // Verify that calls to enableExecutionMaskWrites and disableExecutionMaskWrites are balanced.
748     SkASSERT(fExecutionMaskWritesEnabled == 0);
749 
750     return std::make_unique<Program>(std::move(fInstructions), numValueSlots, numUniformSlots,
751                                      fNumLabels, debugTrace);
752 }
753 
754 void Program::optimize() {
755     // TODO(johnstiles): perform any last-minute cleanup of the instruction stream here
756 }
757 
758 static int stack_usage(const Instruction& inst) {
759     switch (inst.fOp) {
760         case BuilderOp::push_literal:
761         case BuilderOp::push_condition_mask:
762         case BuilderOp::push_loop_mask:
763         case BuilderOp::push_return_mask:
764             return 1;
765 
766         case BuilderOp::push_src_rgba:
767         case BuilderOp::push_dst_rgba:
768             return 4;
769 
770         case BuilderOp::push_slots:
771         case BuilderOp::push_uniform:
772         case BuilderOp::push_zeros:
773         case BuilderOp::push_clone:
774         case BuilderOp::push_clone_from_stack:
775             return inst.fImmA;
776 
777         case BuilderOp::pop_condition_mask:
778         case BuilderOp::pop_loop_mask:
779         case BuilderOp::pop_and_reenable_loop_mask:
780         case BuilderOp::pop_return_mask:
781             return -1;
782 
783         case BuilderOp::pop_src_rg:
784             return -2;
785 
786         case BuilderOp::pop_src_rgba:
787         case BuilderOp::pop_dst_rgba:
788             return -4;
789 
790         case ALL_N_WAY_BINARY_OP_CASES:
791         case ALL_MULTI_SLOT_BINARY_OP_CASES:
792         case BuilderOp::discard_stack:
793         case BuilderOp::select:
794             return -inst.fImmA;
795 
796         case ALL_MULTI_SLOT_TERNARY_OP_CASES:
797             return 2 * -inst.fImmA;
798 
799         case BuilderOp::swizzle_1:
800             return 1 - inst.fImmA;  // consumes immA slots and emits a scalar
801         case BuilderOp::swizzle_2:
802             return 2 - inst.fImmA;  // consumes immA slots and emits a 2-slot vector
803         case BuilderOp::swizzle_3:
804             return 3 - inst.fImmA;  // consumes immA slots and emits a 3-slot vector
805         case BuilderOp::swizzle_4:
806             return 4 - inst.fImmA;  // consumes immA slots and emits a 4-slot vector
807 
808         case BuilderOp::dot_2_floats:
809             return -3;  // consumes two 2-slot vectors and emits one scalar
810         case BuilderOp::dot_3_floats:
811             return -5;  // consumes two 3-slot vectors and emits one scalar
812         case BuilderOp::dot_4_floats:
813             return -7;  // consumes two 4-slot vectors and emits one scalar
814 
815         case BuilderOp::shuffle: {
816             int consumed = inst.fImmA >> 16;
817             int generated = inst.fImmA & 0xFFFF;
818             return generated - consumed;
819         }
820         case ALL_SINGLE_SLOT_UNARY_OP_CASES:
821         case ALL_MULTI_SLOT_UNARY_OP_CASES:
822         default:
823             return 0;
824     }
825 }
826 
827 Program::StackDepthMap Program::tempStackMaxDepths() const {
828     StackDepthMap largest;
829     StackDepthMap current;
830 
831     int curIdx = 0;
832     for (const Instruction& inst : fInstructions) {
833         if (inst.fOp == BuilderOp::set_current_stack) {
834             curIdx = inst.fImmA;
835         }
836         current[curIdx] += stack_usage(inst);
837         largest[curIdx] = std::max(current[curIdx], largest[curIdx]);
838         SkASSERTF(current[curIdx] >= 0, "unbalanced temp stack push/pop on stack %d", curIdx);
839     }
840 
841     for (const auto& [stackIdx, depth] : current) {
842         (void)stackIdx;
843         SkASSERTF(depth == 0, "unbalanced temp stack push/pop");
844     }
845 
846     return largest;
847 }
848 
849 Program::Program(SkTArray<Instruction> instrs,
850                  int numValueSlots,
851                  int numUniformSlots,
852                  int numLabels,
853                  SkRPDebugTrace* debugTrace)
854         : fInstructions(std::move(instrs))
855         , fNumValueSlots(numValueSlots)
856         , fNumUniformSlots(numUniformSlots)
857         , fNumLabels(numLabels)
858         , fDebugTrace(debugTrace) {
859     this->optimize();
860 
861     fTempStackMaxDepths = this->tempStackMaxDepths();
862 
863     fNumTempStackSlots = 0;
864     for (const auto& [stackIdx, depth] : fTempStackMaxDepths) {
865         (void)stackIdx;
866         fNumTempStackSlots += depth;
867     }
868 }
869 
870 void Program::appendCopy(SkTArray<Stage>* pipeline,
871                          SkArenaAlloc* alloc,
872                          ProgramOp baseStage,
873                          float* dst, int dstStride,
874                          const float* src, int srcStride,
875                          int numSlots) const {
876     SkASSERT(numSlots >= 0);
877     while (numSlots > 4) {
878         this->appendCopy(pipeline, alloc, baseStage, dst, dstStride, src, srcStride,/*numSlots=*/4);
879         dst += 4 * dstStride;
880         src += 4 * srcStride;
881         numSlots -= 4;
882     }
883 
884     if (numSlots > 0) {
885         SkASSERT(numSlots <= 4);
886         auto stage = (ProgramOp)((int)baseStage + numSlots - 1);
887         auto* ctx = alloc->make<SkRasterPipeline_BinaryOpCtx>();
888         ctx->dst = dst;
889         ctx->src = src;
890         pipeline->push_back({stage, ctx});
891     }
892 }
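// For example, a ten-slot copy is emitted as two four-slot copy stages followed by one two-slot
// copy stage, each with its own BinaryOpCtx holding the advancing dst/src pointers.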
893 
894 void Program::appendCopySlotsUnmasked(SkTArray<Stage>* pipeline,
895                                       SkArenaAlloc* alloc,
896                                       float* dst,
897                                       const float* src,
898                                       int numSlots) const {
899     this->appendCopy(pipeline, alloc,
900                      ProgramOp::copy_slot_unmasked,
901                      dst, /*dstStride=*/SkOpts::raster_pipeline_highp_stride,
902                      src, /*srcStride=*/SkOpts::raster_pipeline_highp_stride,
903                      numSlots);
904 }
905 
906 void Program::appendCopySlotsMasked(SkTArray<Stage>* pipeline,
907                                     SkArenaAlloc* alloc,
908                                     float* dst,
909                                     const float* src,
910                                     int numSlots) const {
911     this->appendCopy(pipeline, alloc,
912                      ProgramOp::copy_slot_masked,
913                      dst, /*dstStride=*/SkOpts::raster_pipeline_highp_stride,
914                      src, /*srcStride=*/SkOpts::raster_pipeline_highp_stride,
915                      numSlots);
916 }
917 
918 void Program::appendCopyConstants(SkTArray<Stage>* pipeline,
919                                   SkArenaAlloc* alloc,
920                                   float* dst,
921                                   const float* src,
922                                   int numSlots) const {
923     this->appendCopy(pipeline, alloc,
924                      ProgramOp::copy_constant,
925                      dst, /*dstStride=*/SkOpts::raster_pipeline_highp_stride,
926                      src, /*srcStride=*/1,
927                      numSlots);
928 }
929 
930 void Program::appendSingleSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp stage,
931                                       float* dst, int numSlots) const {
932     SkASSERT(numSlots >= 0);
933     while (numSlots--) {
934         pipeline->push_back({stage, dst});
935         dst += SkOpts::raster_pipeline_highp_stride;
936     }
937 }
938 
939 void Program::appendMultiSlotUnaryOp(SkTArray<Stage>* pipeline, ProgramOp baseStage,
940                                      float* dst, int numSlots) const {
941     SkASSERT(numSlots >= 0);
942     while (numSlots > 4) {
943         this->appendMultiSlotUnaryOp(pipeline, baseStage, dst, /*numSlots=*/4);
944         dst += 4 * SkOpts::raster_pipeline_highp_stride;
945         numSlots -= 4;
946     }
947 
948     SkASSERT(numSlots <= 4);
949     auto stage = (ProgramOp)((int)baseStage + numSlots - 1);
950     pipeline->push_back({stage, dst});
951 }
952 
953 void Program::appendAdjacentNWayBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
954                                          ProgramOp stage,
955                                          float* dst, const float* src, int numSlots) const {
956     // The source and destination must be directly next to one another.
957     SkASSERT(numSlots >= 0);
958     SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots) == src);
959 
960     if (numSlots > 0) {
961         auto ctx = alloc->make<SkRasterPipeline_BinaryOpCtx>();
962         ctx->dst = dst;
963         ctx->src = src;
964         pipeline->push_back({stage, ctx});
965         return;
966     }
967 }
968 
969 void Program::appendAdjacentMultiSlotBinaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
970                                               ProgramOp baseStage,
971                                               float* dst, const float* src, int numSlots) const {
972     // The source and destination must be directly next to one another.
973     SkASSERT(numSlots >= 0);
974     SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots) == src);
975 
976     if (numSlots > 4) {
977         this->appendAdjacentNWayBinaryOp(pipeline, alloc, baseStage, dst, src, numSlots);
978         return;
979     }
980     if (numSlots > 0) {
981         auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
982         pipeline->push_back({specializedStage, dst});
983     }
984 }
985 
986 void Program::appendAdjacentMultiSlotTernaryOp(SkTArray<Stage>* pipeline, SkArenaAlloc* alloc,
987                                                ProgramOp baseStage, float* dst, const float* src0,
988                                                const float* src1, int numSlots) const {
989     // The float pointers must all be immediately adjacent to each other.
990     SkASSERT(numSlots >= 0);
991     SkASSERT((dst  + SkOpts::raster_pipeline_highp_stride * numSlots) == src0);
992     SkASSERT((src0 + SkOpts::raster_pipeline_highp_stride * numSlots) == src1);
993 
994     if (numSlots > 4) {
995         auto ctx = alloc->make<SkRasterPipeline_TernaryOpCtx>();
996         ctx->dst = dst;
997         ctx->src0 = src0;
998         ctx->src1 = src1;
999         pipeline->push_back({baseStage, ctx});
1000         return;
1001     }
1002     if (numSlots > 0) {
1003         auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
1004         pipeline->push_back({specializedStage, dst});
1005     }
1006 }
1007 
1008 void Program::appendStackRewind(SkTArray<Stage>* pipeline) const {
1009 #if defined(SKSL_STANDALONE) || !SK_HAS_MUSTTAIL
1010     pipeline->push_back({ProgramOp::stack_rewind, nullptr});
1011 #endif
1012 }
1013 
1014 static void* context_bit_pun(intptr_t val) {
1015     return sk_bit_cast<void*>(val);
1016 }
1017 
1018 Program::SlotData Program::allocateSlotData(SkArenaAlloc* alloc) const {
1019     // Allocate a contiguous slab of slot data for values and stack entries.
1020     const int N = SkOpts::raster_pipeline_highp_stride;
1021     const int vectorWidth = N * sizeof(float);
1022     const int allocSize = vectorWidth * (fNumValueSlots + fNumTempStackSlots);
1023     float* slotPtr = static_cast<float*>(alloc->makeBytesAlignedTo(allocSize, vectorWidth));
1024     sk_bzero(slotPtr, allocSize);
1025 
1026     // Store the temp stack immediately after the values.
1027     SlotData s;
1028     s.values = SkSpan{slotPtr,        N * fNumValueSlots};
1029     s.stack  = SkSpan{s.values.end(), N * fNumTempStackSlots};
1030     return s;
1031 }
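// For example, with eight SIMD lanes (N = 8), ten value slots, and six temp-stack slots, this
// allocates 32 * 16 = 512 zeroed bytes: the first 320 bytes back `values`, the rest back `stack`.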
1032 
1033 #if !defined(SKSL_STANDALONE)
1034 
1035 bool Program::appendStages(SkRasterPipeline* pipeline,
1036                            SkArenaAlloc* alloc,
1037                            RP::Callbacks* callbacks,
1038                            SkSpan<const float> uniforms) const {
1039     // Convert our Instruction list to an array of ProgramOps.
1040     SkTArray<Stage> stages;
1041     this->makeStages(&stages, alloc, uniforms, this->allocateSlotData(alloc));
1042 
1043     // Allocate buffers for branch targets and labels; these are needed to convert labels into
1044     // actual offsets into the pipeline and fix up branches.
1045     SkTArray<SkRasterPipeline_BranchCtx*> branchContexts;
1046     branchContexts.reserve_back(fNumLabels);
1047     SkTArray<int> labelOffsets;
1048     labelOffsets.push_back_n(fNumLabels, -1);
1049     SkTArray<int> branchGoesToLabel;
1050     branchGoesToLabel.reserve_back(fNumLabels);
1051 
1052     for (const Stage& stage : stages) {
1053         switch (stage.op) {
1054             case ProgramOp::stack_rewind:
1055                 pipeline->append_stack_rewind();
1056                 break;
1057 
1058             case ProgramOp::invoke_shader:
1059                 if (!callbacks || !callbacks->appendShader(sk_bit_cast<intptr_t>(stage.ctx))) {
1060                     return false;
1061                 }
1062                 break;
1063 
1064             case ProgramOp::invoke_color_filter:
1065                 if (!callbacks || !callbacks->appendColorFilter(sk_bit_cast<intptr_t>(stage.ctx))) {
1066                     return false;
1067                 }
1068                 break;
1069 
1070             case ProgramOp::invoke_blender:
1071                 if (!callbacks || !callbacks->appendBlender(sk_bit_cast<intptr_t>(stage.ctx))) {
1072                     return false;
1073                 }
1074                 break;
1075 
1076             case ProgramOp::label: {
1077                 // Remember the absolute pipeline position of this label.
1078                 int labelID = sk_bit_cast<intptr_t>(stage.ctx);
1079                 SkASSERT(labelID >= 0 && labelID < fNumLabels);
1080                 labelOffsets[labelID] = pipeline->getNumStages();
1081                 break;
1082             }
1083             case ProgramOp::jump:
1084             case ProgramOp::branch_if_any_active_lanes:
1085             case ProgramOp::branch_if_no_active_lanes:
1086             case ProgramOp::branch_if_no_active_lanes_eq: {
1087                 // The branch context contains a valid label ID at this point.
1088                 auto* branchCtx = static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx);
1089                 int labelID = branchCtx->offset;
1090                 SkASSERT(labelID >= 0 && labelID < fNumLabels);
1091 
1092                 // Replace the label ID in the branch context with the absolute pipeline position.
1093                 // We will go back over the branch targets at the end and fix them up.
1094                 branchCtx->offset = pipeline->getNumStages();
1095 
1096                 SkASSERT(branchContexts.size() == branchGoesToLabel.size());
1097                 branchContexts.push_back(branchCtx);
1098                 branchGoesToLabel.push_back(labelID);
1099                 [[fallthrough]];
1100             }
1101             default:
1102                 // Append a regular op to the program.
1103                 SkASSERT((int)stage.op < kNumRasterPipelineHighpOps);
1104                 pipeline->append((SkRasterPipelineOp)stage.op, stage.ctx);
1105                 break;
1106         }
1107     }
1108 
1109     // Now that we have assembled the program and know the pipeline positions of each label and
1110     // branch, fix up every branch target.
1111     SkASSERT(branchContexts.size() == branchGoesToLabel.size());
1112     for (int index = 0; index < branchContexts.size(); ++index) {
1113         int branchFromIdx = branchContexts[index]->offset;
1114         int branchToIdx = labelOffsets[branchGoesToLabel[index]];
1115         branchContexts[index]->offset = branchToIdx - branchFromIdx;
1116     }
1117 
1118     return true;
1119 }
1120 
1121 #endif
1122 
1123 void Program::makeStages(SkTArray<Stage>* pipeline,
1124                          SkArenaAlloc* alloc,
1125                          SkSpan<const float> uniforms,
1126                          const SlotData& slots) const {
1127     SkASSERT(fNumUniformSlots == SkToInt(uniforms.size()));
1128 
1129     const int N = SkOpts::raster_pipeline_highp_stride;
1130     StackDepthMap tempStackDepth;
1131     int currentStack = 0;
1132     int mostRecentRewind = 0;
1133 
1134     // Assemble a map holding the current stack-top for each temporary stack. Position each temp
1135     // stack immediately after the previous temp stack; temp stacks are never allowed to overlap.
1136     int pos = 0;
1137     SkTHashMap<int, float*> tempStackMap;
1138     for (auto& [idx, depth] : fTempStackMaxDepths) {
1139         tempStackMap[idx] = slots.stack.begin() + (pos * N);
1140         pos += depth;
1141     }
1142 
1143     // Track labels that we have reached in processing.
1144     SkBitSet labelsEncountered(fNumLabels);
1145 
1146     auto EmitStackRewindForBackwardsBranch = [&](int labelID) {
1147         // If we have already encountered the label associated with this branch, this is a
1148         // backwards branch. Add a stack-rewind immediately before the branch to ensure that
1149         // long-running loops don't use an unbounded amount of stack space.
1150         if (labelsEncountered.test(labelID)) {
1151             this->appendStackRewind(pipeline);
1152             mostRecentRewind = pipeline->size();
1153         }
1154     };
1155 
1156     // We can reuse constants from our arena by placing them in this map.
1157     SkTHashMap<int, int*> constantLookupMap; // <constant value, pointer into arena>
1158 
1159     // Write each BuilderOp to the pipeline array.
1160     pipeline->reserve_back(fInstructions.size());
1161     for (const Instruction& inst : fInstructions) {
1162         auto SlotA    = [&]() { return &slots.values[N * inst.fSlotA]; };
1163         auto SlotB    = [&]() { return &slots.values[N * inst.fSlotB]; };
1164         auto UniformA = [&]() { return &uniforms[inst.fSlotA]; };
1165         float*& tempStackPtr = tempStackMap[currentStack];
1166 
1167         switch (inst.fOp) {
1168             case BuilderOp::label:
1169                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1170                 labelsEncountered.set(inst.fImmA);
1171                 pipeline->push_back({ProgramOp::label, context_bit_pun(inst.fImmA)});
1172                 break;
1173 
1174             case BuilderOp::jump:
1175             case BuilderOp::branch_if_any_active_lanes:
1176             case BuilderOp::branch_if_no_active_lanes: {
1177                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1178                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1179 
1180                 auto* ctx = alloc->make<SkRasterPipeline_BranchCtx>();
1181                 ctx->offset = inst.fImmA;
1182                 pipeline->push_back({(ProgramOp)inst.fOp, ctx});
1183                 break;
1184             }
1185             case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal: {
1186                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1187                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1188 
1189                 auto* ctx = alloc->make<SkRasterPipeline_BranchIfEqualCtx>();
1190                 ctx->offset = inst.fImmA;
1191                 ctx->value = inst.fImmB;
1192                 ctx->ptr = reinterpret_cast<int*>(tempStackPtr - N);
1193                 pipeline->push_back({ProgramOp::branch_if_no_active_lanes_eq, ctx});
1194                 break;
1195             }
1196             case BuilderOp::init_lane_masks:
1197                 pipeline->push_back({ProgramOp::init_lane_masks, nullptr});
1198                 break;
1199 
1200             case BuilderOp::store_src_rg:
1201                 pipeline->push_back({ProgramOp::store_src_rg, SlotA()});
1202                 break;
1203 
1204             case BuilderOp::store_src:
1205                 pipeline->push_back({ProgramOp::store_src, SlotA()});
1206                 break;
1207 
1208             case BuilderOp::store_dst:
1209                 pipeline->push_back({ProgramOp::store_dst, SlotA()});
1210                 break;
1211 
1212             case BuilderOp::store_device_xy01:
1213                 pipeline->push_back({ProgramOp::store_device_xy01, SlotA()});
1214                 break;
1215 
1216             case BuilderOp::load_src:
1217                 pipeline->push_back({ProgramOp::load_src, SlotA()});
1218                 break;
1219 
1220             case BuilderOp::load_dst:
1221                 pipeline->push_back({ProgramOp::load_dst, SlotA()});
1222                 break;
1223 
1224             case ALL_SINGLE_SLOT_UNARY_OP_CASES: {
1225                 float* dst = tempStackPtr - (inst.fImmA * N);
1226                 this->appendSingleSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
1227                 break;
1228             }
1229             case ALL_MULTI_SLOT_UNARY_OP_CASES: {
1230                 float* dst = tempStackPtr - (inst.fImmA * N);
1231                 this->appendMultiSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
1232                 break;
1233             }
1234             case ALL_N_WAY_BINARY_OP_CASES: {
1235                 float* src = tempStackPtr - (inst.fImmA * N);
1236                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1237                 this->appendAdjacentNWayBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1238                                                  dst, src, inst.fImmA);
1239                 break;
1240             }
1241             case ALL_MULTI_SLOT_BINARY_OP_CASES: {
1242                 float* src = tempStackPtr - (inst.fImmA * N);
1243                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1244                 this->appendAdjacentMultiSlotBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1245                                                       dst, src, inst.fImmA);
1246                 break;
1247             }
1248             case ALL_MULTI_SLOT_TERNARY_OP_CASES: {
1249                 float* src1 = tempStackPtr - (inst.fImmA * N);
1250                 float* src0 = tempStackPtr - (inst.fImmA * 2 * N);
1251                 float* dst  = tempStackPtr - (inst.fImmA * 3 * N);
1252                 this->appendAdjacentMultiSlotTernaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1253                                                        dst, src0, src1, inst.fImmA);
1254                 break;
1255             }
1256             case BuilderOp::select: {
1257                 float* src = tempStackPtr - (inst.fImmA * N);
1258                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1259                 this->appendCopySlotsMasked(pipeline, alloc, dst, src, inst.fImmA);
1260                 break;
1261             }
1262             case BuilderOp::copy_slot_masked:
1263                 this->appendCopySlotsMasked(pipeline, alloc, SlotA(), SlotB(), inst.fImmA);
1264                 break;
1265 
1266             case BuilderOp::copy_slot_unmasked:
1267                 this->appendCopySlotsUnmasked(pipeline, alloc, SlotA(), SlotB(), inst.fImmA);
1268                 break;
1269 
1270             case BuilderOp::zero_slot_unmasked:
1271                 this->appendMultiSlotUnaryOp(pipeline, ProgramOp::zero_slot_unmasked,
1272                                              SlotA(), inst.fImmA);
1273                 break;
1274 
1275             case BuilderOp::dot_2_floats:
1276             case BuilderOp::dot_3_floats:
1277             case BuilderOp::dot_4_floats: {
1278                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1279                 pipeline->push_back({(ProgramOp)inst.fOp, dst});
1280                 break;
1281             }
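                 // For swizzle_1..swizzle_4, fImmA tells how many slots back from the stack top
                 // the value being swizzled starts, the op itself encodes how many components
                 // are produced, and fImmB packs the component indices as nybbles.
                 // unpack_nybbles_to_offsets() presumably expands each index k into a byte
                 // offset of k * N * sizeof(float), matching the decode in dump() below.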
1282             case BuilderOp::swizzle_1:
1283             case BuilderOp::swizzle_2:
1284             case BuilderOp::swizzle_3:
1285             case BuilderOp::swizzle_4: {
1286                 auto* ctx = alloc->make<SkRasterPipeline_SwizzleCtx>();
1287                 ctx->ptr = tempStackPtr - (N * inst.fImmA);
1288                 // Unpack component nybbles into byte-offsets pointing at stack slots.
1289                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
1290                 pipeline->push_back({(ProgramOp)inst.fOp, ctx});
1291                 break;
1292             }
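                 // For shuffle, fImmA packs two counts: consumed slots in the high 16 bits and
                 // generated slots in the low 16. fImmB/fImmC together hold up to 16 nybble
                 // indices, expanded below into per-slot byte offsets.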
1293             case BuilderOp::shuffle: {
1294                 int consumed = inst.fImmA >> 16;
1295                 int generated = inst.fImmA & 0xFFFF;
1296 
1297                 auto* ctx = alloc->make<SkRasterPipeline_ShuffleCtx>();
1298                 ctx->ptr = tempStackPtr - (N * consumed);
1299                 ctx->count = generated;
1300                 // Unpack immB and immC from nybble form into the offset array.
1301                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(&ctx->offsets[0], 8));
1302                 unpack_nybbles_to_offsets(inst.fImmC, SkSpan(&ctx->offsets[8], 8));
1303                 pipeline->push_back({ProgramOp::shuffle, ctx});
1304                 break;
1305             }
1306             case BuilderOp::push_src_rgba: {
1307                 float* dst = tempStackPtr;
1308                 pipeline->push_back({ProgramOp::store_src, dst});
1309                 break;
1310             }
1311             case BuilderOp::push_dst_rgba: {
1312                 float* dst = tempStackPtr;
1313                 pipeline->push_back({ProgramOp::store_dst, dst});
1314                 break;
1315             }
1316             case BuilderOp::pop_src_rg: {
1317                 float* dst = tempStackPtr - (2 * N);
1318                 pipeline->push_back({ProgramOp::load_src_rg, dst});
1319                 break;
1320             }
1321             case BuilderOp::pop_src_rgba: {
1322                 float* dst = tempStackPtr - (4 * N);
1323                 pipeline->push_back({ProgramOp::load_src, dst});
1324                 break;
1325             }
1326             case BuilderOp::pop_dst_rgba: {
1327                 float* dst = tempStackPtr - (4 * N);
1328                 pipeline->push_back({ProgramOp::load_dst, dst});
1329                 break;
1330             }
1331             case BuilderOp::push_slots: {
1332                 float* dst = tempStackPtr;
1333                 this->appendCopySlotsUnmasked(pipeline, alloc, dst, SlotA(), inst.fImmA);
1334                 break;
1335             }
1336             case BuilderOp::push_uniform: {
1337                 float* dst = tempStackPtr;
1338                 this->appendCopyConstants(pipeline, alloc, dst, UniformA(), inst.fImmA);
1339                 break;
1340             }
1341             case BuilderOp::push_zeros: {
1342                 float* dst = tempStackPtr;
1343                 this->appendMultiSlotUnaryOp(pipeline, ProgramOp::zero_slot_unmasked, dst,
1344                                              inst.fImmA);
1345                 break;
1346             }
1347             case BuilderOp::push_condition_mask: {
1348                 float* dst = tempStackPtr;
1349                 pipeline->push_back({ProgramOp::store_condition_mask, dst});
1350                 break;
1351             }
1352             case BuilderOp::pop_condition_mask: {
1353                 float* src = tempStackPtr - (1 * N);
1354                 pipeline->push_back({ProgramOp::load_condition_mask, src});
1355                 break;
1356             }
1357             case BuilderOp::merge_condition_mask: {
1358                 float* ptr = tempStackPtr - (2 * N);
1359                 pipeline->push_back({ProgramOp::merge_condition_mask, ptr});
1360                 break;
1361             }
1362             case BuilderOp::push_loop_mask: {
1363                 float* dst = tempStackPtr;
1364                 pipeline->push_back({ProgramOp::store_loop_mask, dst});
1365                 break;
1366             }
1367             case BuilderOp::pop_loop_mask: {
1368                 float* src = tempStackPtr - (1 * N);
1369                 pipeline->push_back({ProgramOp::load_loop_mask, src});
1370                 break;
1371             }
1372             case BuilderOp::pop_and_reenable_loop_mask: {
1373                 float* src = tempStackPtr - (1 * N);
1374                 pipeline->push_back({ProgramOp::reenable_loop_mask, src});
1375                 break;
1376             }
1377             case BuilderOp::reenable_loop_mask:
1378                 pipeline->push_back({ProgramOp::reenable_loop_mask, SlotA()});
1379                 break;
1380 
1381             case BuilderOp::mask_off_loop_mask:
1382                 pipeline->push_back({ProgramOp::mask_off_loop_mask, nullptr});
1383                 break;
1384 
1385             case BuilderOp::merge_loop_mask: {
1386                 float* src = tempStackPtr - (1 * N);
1387                 pipeline->push_back({ProgramOp::merge_loop_mask, src});
1388                 break;
1389             }
1390             case BuilderOp::push_return_mask: {
1391                 float* dst = tempStackPtr;
1392                 pipeline->push_back({ProgramOp::store_return_mask, dst});
1393                 break;
1394             }
1395             case BuilderOp::pop_return_mask: {
1396                 float* src = tempStackPtr - (1 * N);
1397                 pipeline->push_back({ProgramOp::load_return_mask, src});
1398                 break;
1399             }
1400             case BuilderOp::mask_off_return_mask:
1401                 pipeline->push_back({ProgramOp::mask_off_return_mask, nullptr});
1402                 break;
1403 
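                 // copy_constant and push_literal share one arena-allocated copy of each
                 // distinct 32-bit constant: constantLookupMap is keyed on the raw bit pattern
                 // in fImmA, so repeated literals (whether float or int) all reference the same
                 // backing slot.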
1404             case BuilderOp::copy_constant:
1405             case BuilderOp::push_literal: {
1406                 float* dst = (inst.fOp == BuilderOp::push_literal) ? tempStackPtr : SlotA();
1407                 int* constantPtr;
1408                 if (int** lookup = constantLookupMap.find(inst.fImmA)) {
1409                     constantPtr = *lookup;
1410                 } else {
1411                     constantPtr = alloc->make<int>(inst.fImmA);
1412                     constantLookupMap[inst.fImmA] = constantPtr;
1413                 }
1414                 SkASSERT(constantPtr);
1415                 this->appendCopyConstants(pipeline, alloc, dst, (float*)constantPtr,/*numSlots=*/1);
1416                 break;
1417             }
1418             case BuilderOp::copy_stack_to_slots: {
1419                 float* src = tempStackPtr - (inst.fImmB * N);
1420                 this->appendCopySlotsMasked(pipeline, alloc, SlotA(), src, inst.fImmA);
1421                 break;
1422             }
1423             case BuilderOp::copy_stack_to_slots_unmasked: {
1424                 float* src = tempStackPtr - (inst.fImmB * N);
1425                 this->appendCopySlotsUnmasked(pipeline, alloc, SlotA(), src, inst.fImmA);
1426                 break;
1427             }
1428             case BuilderOp::swizzle_copy_stack_to_slots: {
1429                 auto stage = (ProgramOp)((int)ProgramOp::swizzle_copy_slot_masked + inst.fImmA - 1);
1430                 auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyCtx>();
1431                 ctx->src = tempStackPtr - (inst.fImmB * N);
1432                 ctx->dst = SlotA();
1433                 unpack_nybbles_to_offsets(inst.fImmC, SkSpan(ctx->offsets));
1434                 pipeline->push_back({stage, ctx});
1435                 break;
1436             }
1437             case BuilderOp::push_clone: {
1438                 float* src = tempStackPtr - (inst.fImmB * N);
1439                 float* dst = tempStackPtr;
1440                 this->appendCopySlotsUnmasked(pipeline, alloc, dst, src, inst.fImmA);
1441                 break;
1442             }
1443             case BuilderOp::push_clone_from_stack: {
1444                 float* sourceStackPtr = tempStackMap[inst.fImmB];
1445                 float* src = sourceStackPtr - (inst.fImmC * N);
1446                 float* dst = tempStackPtr;
1447                 this->appendCopySlotsUnmasked(pipeline, alloc, dst, src, inst.fImmA);
1448                 break;
1449             }
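                 // case_op compares a stack slot against fImmA. ctx->ptr spans two slots; per
                 // the dump logic further down, the first slot appears to hold the value being
                 // switched on and the second a "default case" flag that is cleared on a match.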
1450             case BuilderOp::case_op: {
1451                 auto* ctx = alloc->make<SkRasterPipeline_CaseOpCtx>();
1452                 ctx->ptr = reinterpret_cast<int*>(tempStackPtr - 2 * N);
1453                 ctx->expectedValue = inst.fImmA;
1454                 pipeline->push_back({ProgramOp::case_op, ctx});
1455                 break;
1456             }
1457             case BuilderOp::discard_stack:
1458                 break;
1459 
1460             case BuilderOp::set_current_stack:
1461                 currentStack = inst.fImmA;
1462                 break;
1463 
1464             case BuilderOp::invoke_shader:
1465             case BuilderOp::invoke_color_filter:
1466             case BuilderOp::invoke_blender:
1467                 pipeline->push_back({(ProgramOp)inst.fOp, context_bit_pun(inst.fImmA)});
1468                 break;
1469 
1470             default:
1471                 SkDEBUGFAILF("Raster Pipeline: unsupported instruction %d", (int)inst.fOp);
1472                 break;
1473         }
1474 
1475         tempStackPtr += stack_usage(inst) * N;
1476         SkASSERT(tempStackPtr >= slots.stack.begin());
1477         SkASSERT(tempStackPtr <= slots.stack.end());
1478 
1479         // Rewind the stack periodically (roughly every 500 pipeline stages). When
1480         // SK_HAS_MUSTTAIL is set, rewinds are not actually used; the appendStackRewind call
1481         // becomes a no-op. On platforms that don't support SK_HAS_MUSTTAIL, rewinding the
1482         // stack periodically prevents a potential stack overflow when running a long program.
1483         int numPipelineStages = pipeline->size();
1484         if (numPipelineStages - mostRecentRewind > 500) {
1485             this->appendStackRewind(pipeline);
1486             mostRecentRewind = numPipelineStages;
1487         }
1488     }
1489 }
1490 
1491 // Finds duplicate names in the program and disambiguates them with subscripts.
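     // For example, if the trace contains two distinct variables that are both named `x`,
     // the second one comes back as `x₁` (Unicode subscript digits) so a dump can tell
     // them apart.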
1492 SkTArray<std::string> build_unique_slot_name_list(const SkRPDebugTrace* debugTrace) {
1493     SkTArray<std::string> slotName;
1494     if (debugTrace) {
1495         slotName.reserve_back(debugTrace->fSlotInfo.size());
1496 
1497         // The map consists of <variable name, <source position, unique name>>.
1498         SkTHashMap<std::string_view, SkTHashMap<int, std::string>> uniqueNameMap;
1499 
1500         for (const SlotDebugInfo& slotInfo : debugTrace->fSlotInfo) {
1501             // Look up this variable by its name and source position.
1502             int pos = slotInfo.pos.valid() ? slotInfo.pos.startOffset() : 0;
1503             SkTHashMap<int, std::string>& positionMap = uniqueNameMap[slotInfo.name];
1504             std::string& uniqueName = positionMap[pos];
1505 
1506             // Have we seen this variable name/position combination before?
1507             if (uniqueName.empty()) {
1508                 // This is a unique name/position pair.
1509                 uniqueName = slotInfo.name;
1510 
1511                 // But if it's not a unique _name_, it deserves a subscript to disambiguate it.
1512                 int subscript = positionMap.count() - 1;
1513                 if (subscript > 0) {
1514                     for (char digit : std::to_string(subscript)) {
1515                         // U+2080 through U+2089 (₀₁₂₃₄₅₆₇₈₉) in UTF-8:
1516                         uniqueName.push_back((char)0xE2);
1517                         uniqueName.push_back((char)0x82);
1518                         uniqueName.push_back((char)(0x80 + digit - '0'));
1519                     }
1520                 }
1521             }
1522 
1523             slotName.push_back(uniqueName);
1524         }
1525     }
1526     return slotName;
1527 }
1528 
1529 void Program::dump(SkWStream* out) const {
1530     // Allocate memory for the slot and uniform data, even though the program won't ever be
1531     // executed. The program requires pointer ranges for managing its data, and ASAN will report
1532     // errors if those pointers are pointing at unallocated memory.
1533     SkArenaAlloc alloc(/*firstHeapAllocation=*/1000);
1534     const int N = SkOpts::raster_pipeline_highp_stride;
1535     SlotData slots = this->allocateSlotData(&alloc);
1536     float* uniformPtr = alloc.makeArray<float>(fNumUniformSlots);
1537     SkSpan<float> uniforms = SkSpan(uniformPtr, fNumUniformSlots);
1538 
1539     // Turn this program into an array of Raster Pipeline stages.
1540     SkTArray<Stage> stages;
1541     this->makeStages(&stages, &alloc, uniforms, slots);
1542 
1543     // Find the labels in the program, and keep track of their offsets.
1544     SkTHashMap<int, int> labelToStageMap; // <label ID, stage index>
1545     for (int index = 0; index < stages.size(); ++index) {
1546         if (stages[index].op == ProgramOp::label) {
1547             int labelID = sk_bit_cast<intptr_t>(stages[index].ctx);
1548             SkASSERT(!labelToStageMap.find(labelID));
1549             labelToStageMap[labelID] = index;
1550         }
1551     }
1552 
1553     // Assign unique names to each variable slot; our trace might have multiple variables with the
1554     // same name, which can make a dump hard to read.
1555     SkTArray<std::string> slotName = build_unique_slot_name_list(fDebugTrace);
1556 
1557     // Emit the program's instruction list.
1558     for (int index = 0; index < stages.size(); ++index) {
1559         const Stage& stage = stages[index];
1560 
1561         // Interpret the context value as a branch offset.
1562         auto BranchOffset = [&](const SkRasterPipeline_BranchCtx* ctx) -> std::string {
1563             // The context's offset field contains a label ID.
1564             int labelID = ctx->offset;
1565             SkASSERT(labelToStageMap.find(labelID));
1566             int labelIndex = labelToStageMap[labelID];
1567             return SkSL::String::printf("%+d (label %d at #%d)",
1568                                         labelIndex - index, labelID, labelIndex + 1);
1569         };
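             // e.g. if the current stage index is 9 and label 3 lives at stage index 12, this
             // renders as "+3 (label 3 at #13)".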
1570 
1571         // Print a 32-bit immediate value of unknown type (int/float).
1572         auto Imm = [&](float immFloat, bool showAsFloat = true) -> std::string {
1573             // Start with `0x3F800000` as a baseline.
1574             uint32_t immUnsigned;
1575             memcpy(&immUnsigned, &immFloat, sizeof(uint32_t));
1576             auto text = SkSL::String::printf("0x%08X", immUnsigned);
1577 
1578             // Extend it to `0x3F800000 (1.0)` for finite floating point values.
1579             if (showAsFloat && std::isfinite(immFloat)) {
1580                 text += " (";
1581                 text += skstd::to_string(immFloat);
1582                 text += ")";
1583             }
1584             return text;
1585         };
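             // Non-finite values are left as bare hex; e.g. a quiet NaN prints as "0x7FC00000"
             // with no float suffix.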
1586 
1587         // Interpret the context pointer as a 32-bit immediate value of unknown type (int/float).
1588         auto ImmCtx = [&](const void* ctx, bool showAsFloat = true) -> std::string {
1589             float f;
1590             memcpy(&f, &ctx, sizeof(float));
1591             return Imm(f, showAsFloat);
1592         };
1593 
1594         // Print `1` for single slots and `1..3` for ranges of slots.
1595         auto AsRange = [](int first, int count) -> std::string {
1596             std::string text = std::to_string(first);
1597             if (count > 1) {
1598                 text += ".." + std::to_string(first + count - 1);
1599             }
1600             return text;
1601         };
1602 
1603         // Come up with a reasonable name for a range of slots, e.g.:
1604         // `val`: slot range points at one variable, named val
1605         // `val(0..1)`: slot range points at the first and second slot of val (which has 3+ slots)
1606         // `foo, bar`: slot range fully covers two variables, named foo and bar
1607         // `foo(3), bar(0)`: slot range covers the fourth slot of foo and the first slot of bar
1608         auto SlotName = [&](SkSpan<const SlotDebugInfo> debugInfo,
1609                             SkSpan<const std::string> names,
1610                             SlotRange range) -> std::string {
1611             SkASSERT(range.index >= 0 && (range.index + range.count) <= (int)debugInfo.size());
1612 
1613             std::string text;
1614             auto separator = SkSL::String::Separator();
1615             while (range.count > 0) {
1616                 const SlotDebugInfo& slotInfo = debugInfo[range.index];
1617                 text += separator();
1618                 text += names.empty() ? slotInfo.name : names[range.index];
1619 
1620                 // Figure out how many slots we can chomp in this iteration.
1621                 int entireVariable = slotInfo.columns * slotInfo.rows;
1622                 int slotsToChomp = std::min(range.count, entireVariable - slotInfo.componentIndex);
1623                 // If we aren't consuming an entire variable, from first slot to last...
1624                 if (slotsToChomp != entireVariable) {
1625                     // ... decorate it with a range suffix.
1626                     text += "(" + AsRange(slotInfo.componentIndex, slotsToChomp) + ")";
1627                 }
1628                 range.index += slotsToChomp;
1629                 range.count -= slotsToChomp;
1630             }
1631 
1632             return text;
1633         };
1634 
1635         // Attempts to interpret the passed-in pointer as a uniform range.
1636         auto UniformPtrCtx = [&](const float* ptr, int numSlots) -> std::string {
1637             const float* end = ptr + numSlots;
1638             if (ptr >= uniforms.begin() && end <= uniforms.end()) {
1639                 int uniformIdx = ptr - uniforms.begin();
1640                 if (fDebugTrace) {
1641                     // Handle pointers to named uniform slots.
1642                     std::string name = SlotName(fDebugTrace->fUniformInfo, /*names=*/{},
1643                                                 {uniformIdx, numSlots});
1644                     if (!name.empty()) {
1645                         return name;
1646                     }
1647                 }
1648                 // Handle pointers to uniforms (when no debug info exists).
1649                 return "u" + AsRange(uniformIdx, numSlots);
1650             }
1651             return {};
1652         };
1653 
1654         // Attempts to interpret the passed-in pointer as a value slot range.
1655         auto ValuePtrCtx = [&](const float* ptr, int numSlots) -> std::string {
1656             const float* end = ptr + (N * numSlots);
1657             if (ptr >= slots.values.begin() && end <= slots.values.end()) {
1658                 int valueIdx = ptr - slots.values.begin();
1659                 SkASSERT((valueIdx % N) == 0);
1660                 valueIdx /= N;
1661                 if (fDebugTrace) {
1662                     // Handle pointers to named value slots.
1663                     std::string name = SlotName(fDebugTrace->fSlotInfo, slotName,
1664                                                 {valueIdx, numSlots});
1665                     if (!name.empty()) {
1666                         return name;
1667                     }
1668                 }
1669                 // Handle pointers to value slots (when no debug info exists).
1670                 return "v" + AsRange(valueIdx, numSlots);
1671             }
1672             return {};
1673         };
1674 
1675         // Interpret the context value as a pointer to `count` immediate values.
1676         auto MultiImmCtx = [&](const float* ptr, int count) -> std::string {
1677             // If this is a uniform, print it by name.
1678             if (std::string text = UniformPtrCtx(ptr, count); !text.empty()) {
1679                 return text;
1680             }
1681             // Emit a single unbracketed immediate.
1682             if (count == 1) {
1683                 return Imm(*ptr);
1684             }
1685             // Emit a list like `[0x00000000 (0.0), 0x3F800000 (1.0)]`.
1686             std::string text = "[";
1687             auto separator = SkSL::String::Separator();
1688             while (count--) {
1689                 text += separator();
1690                 text += Imm(*ptr++);
1691             }
1692             return text + "]";
1693         };
1694 
1695         // Interpret the context value as a generic pointer.
1696         auto PtrCtx = [&](const void* ctx, int numSlots) -> std::string {
1697             const float *ctxAsSlot = static_cast<const float*>(ctx);
1698             // Check for uniform and value pointers.
1699             if (std::string uniform = UniformPtrCtx(ctxAsSlot, numSlots); !uniform.empty()) {
1700                 return uniform;
1701             }
1702             if (std::string value = ValuePtrCtx(ctxAsSlot, numSlots); !value.empty()) {
1703                 return value;
1704             }
1705             // Handle pointers to temporary stack slots.
1706             if (ctxAsSlot >= slots.stack.begin() && ctxAsSlot < slots.stack.end()) {
1707                 int stackIdx = ctxAsSlot - slots.stack.begin();
1708                 SkASSERT((stackIdx % N) == 0);
1709                 return "$" + AsRange(stackIdx / N, numSlots);
1710             }
1711             // This pointer is outside any range we recognize; that generally shouldn't happen.
1712             return "ExternalPtr(" + AsRange(0, numSlots) + ")";
1713         };
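             // Net effect: uniforms print as "u0..3"-style ranges, value slots as "v0..3",
             // temp-stack slots as "$0..3", and unknown pointers as "ExternalPtr(...)";
             // named variables are used instead when debug info is available.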
1714 
1715         // Interpret the context value as a pointer to two adjacent values.
1716         auto AdjacentPtrCtx = [&](const void* ctx,
1717                                   int numSlots) -> std::tuple<std::string, std::string> {
1718             const float *ctxAsSlot = static_cast<const float*>(ctx);
1719             return std::make_tuple(PtrCtx(ctxAsSlot, numSlots),
1720                                    PtrCtx(ctxAsSlot + (N * numSlots), numSlots));
1721         };
1722 
1723         // Interpret the context value as a pointer to three adjacent values.
1724         auto Adjacent3PtrCtx = [&](const void* ctx, int numSlots) ->
1725                                   std::tuple<std::string, std::string, std::string> {
1726             const float *ctxAsSlot = static_cast<const float*>(ctx);
1727             return std::make_tuple(PtrCtx(ctxAsSlot, numSlots),
1728                                    PtrCtx(ctxAsSlot + (N * numSlots), numSlots),
1729                                    PtrCtx(ctxAsSlot + (2 * N * numSlots), numSlots));
1730         };
1731 
1732         // Interpret the context value as a BinaryOp structure for copy_n_slots (numSlots is
1733         // dictated by the op itself).
1734         auto BinaryOpCtx = [&](const void* v,
1735                                int numSlots) -> std::tuple<std::string, std::string> {
1736             const auto *ctx = static_cast<const SkRasterPipeline_BinaryOpCtx*>(v);
1737             return std::make_tuple(PtrCtx(ctx->dst, numSlots),
1738                                    PtrCtx(ctx->src, numSlots));
1739         };
1740 
1741         // Interpret the context value as a BinaryOp structure for copy_n_constants (numSlots is
1742         // dictated by the op itself).
1743         auto CopyConstantCtx = [&](const void* v,
1744                                    int numSlots) -> std::tuple<std::string, std::string> {
1745             const auto *ctx = static_cast<const SkRasterPipeline_BinaryOpCtx*>(v);
1746             return std::make_tuple(PtrCtx(ctx->dst, numSlots),
1747                                    MultiImmCtx(ctx->src, numSlots));
1748         };
1749 
1750         // Interpret the context value as a BinaryOp structure (numSlots is inferred from the
1751         // distance between pointers).
1752         auto AdjacentBinaryOpCtx = [&](const void* v) -> std::tuple<std::string, std::string> {
1753             const auto *ctx = static_cast<const SkRasterPipeline_BinaryOpCtx*>(v);
1754             int numSlots = (ctx->src - ctx->dst) / N;
1755             return AdjacentPtrCtx(ctx->dst, numSlots);
1756         };
1757 
1758         // Interpret the context value as a TernaryOp structure (numSlots is inferred from the
1759         // distance between pointers).
1760         auto AdjacentTernaryOpCtx = [&](const void* v) ->
1761                                        std::tuple<std::string, std::string, std::string> {
1762             const auto* ctx = static_cast<const SkRasterPipeline_TernaryOpCtx*>(v);
1763             int numSlots = (ctx->src0 - ctx->dst) / N;
1764             return Adjacent3PtrCtx(ctx->dst, numSlots);
1765         };
1766 
1767         // Stringize a swizzled pointer. Note that the slot-width of the original expression is not
1768         // preserved in the instruction encoding, so we need to do our best using the data we have.
1769         // (e.g., myFloat4.y would be indistinguishable from myFloat2.y.)
1770         auto SwizzlePtr = [&](const float* ptr, SkSpan<const uint16_t> offsets) {
1771             size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
1772                                       (N * sizeof(float));
1773 
1774             std::string src = "(" + PtrCtx(ptr, std::max(offsets.size(), highestComponent + 1)) +
1775                               ").";
1776             for (uint16_t offset : offsets) {
1777                 if (offset == (0 * N * sizeof(float))) {
1778                     src.push_back('x');
1779                 } else if (offset == (1 * N * sizeof(float))) {
1780                     src.push_back('y');
1781                 } else if (offset == (2 * N * sizeof(float))) {
1782                     src.push_back('z');
1783                 } else if (offset == (3 * N * sizeof(float))) {
1784                     src.push_back('w');
1785                 } else {
1786                     src.push_back('?');
1787                 }
1788             }
1789             return src;
1790         };
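             // e.g. offsets selecting components {2, 2, 0, 1} of a value starting at v0 would
             // stringize as "(v0..3).zzxy" (assuming no debug names are present).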
1791 
1792         // Interpret the context value as a Swizzle structure.
1793         auto SwizzleCtx = [&](ProgramOp op, const void* v) -> std::tuple<std::string, std::string> {
1794             const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCtx*>(v);
1795             int destSlots = (int)op - (int)BuilderOp::swizzle_1 + 1;
1796 
1797             return std::make_tuple(PtrCtx(ctx->ptr, destSlots),
1798                                    SwizzlePtr(ctx->ptr, SkSpan(ctx->offsets, destSlots)));
1799         };
1800 
1801         // Interpret the context value as a SwizzleCopy structure.
1802         auto SwizzleCopyCtx = [&](ProgramOp op,
1803                                   const void* v) -> std::tuple<std::string, std::string> {
1804             const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCopyCtx*>(v);
1805             int destSlots = (int)op - (int)BuilderOp::swizzle_copy_slot_masked + 1;
1806 
1807             return std::make_tuple(SwizzlePtr(ctx->dst, SkSpan(ctx->offsets, destSlots)),
1808                                    PtrCtx(ctx->src, destSlots));
1809         };
1810 
1811         // Interpret the context value as a Shuffle structure.
1812         auto ShuffleCtx = [&](const void* v) -> std::tuple<std::string, std::string> {
1813             const auto* ctx = static_cast<const SkRasterPipeline_ShuffleCtx*>(v);
1814 
1815             std::string dst = PtrCtx(ctx->ptr, ctx->count);
1816             std::string src = "(" + dst + ")[";
1817             for (int index = 0; index < ctx->count; ++index) {
1818                 if (ctx->offsets[index] % (N * sizeof(float))) {
1819                     src.push_back('?');
1820                 } else {
1821                     src += std::to_string(ctx->offsets[index] / (N * sizeof(float)));
1822                 }
1823                 src.push_back(' ');
1824             }
1825             src.back() = ']';
1826             return std::make_tuple(dst, src);
1827         };
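             // e.g. a shuffle writing four slots from source components {0, 2, 1, 3} would
             // print as "(v0..3)[0 2 1 3]".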
1828 
1829         std::string opArg1, opArg2, opArg3;
1830         using POp = ProgramOp;
1831         switch (stage.op) {
1832             case POp::label:
1833             case POp::invoke_shader:
1834             case POp::invoke_color_filter:
1835             case POp::invoke_blender:
1836                 opArg1 = ImmCtx(stage.ctx, /*showAsFloat=*/false);
1837                 break;
1838 
1839             case POp::case_op: {
1840                 const auto* ctx = static_cast<SkRasterPipeline_CaseOpCtx*>(stage.ctx);
1841                 opArg1 = PtrCtx(ctx->ptr, 1);
1842                 opArg2 = PtrCtx(ctx->ptr + N, 1);
1843                 opArg3 = Imm(sk_bit_cast<float>(ctx->expectedValue), /*showAsFloat=*/false);
1844                 break;
1845             }
1846             case POp::swizzle_1:
1847             case POp::swizzle_2:
1848             case POp::swizzle_3:
1849             case POp::swizzle_4:
1850                 std::tie(opArg1, opArg2) = SwizzleCtx(stage.op, stage.ctx);
1851                 break;
1852 
1853             case POp::swizzle_copy_slot_masked:
1854             case POp::swizzle_copy_2_slots_masked:
1855             case POp::swizzle_copy_3_slots_masked:
1856             case POp::swizzle_copy_4_slots_masked:
1857                 std::tie(opArg1, opArg2) = SwizzleCopyCtx(stage.op, stage.ctx);
1858                 break;
1859 
1860             case POp::dot_2_floats:
1861                 opArg1 = PtrCtx(stage.ctx, 1);
1862                 std::tie(opArg2, opArg3) = AdjacentPtrCtx(stage.ctx, 2);
1863                 break;
1864 
1865             case POp::dot_3_floats:
1866                 opArg1 = PtrCtx(stage.ctx, 1);
1867                 std::tie(opArg2, opArg3) = AdjacentPtrCtx(stage.ctx, 3);
1868                 break;
1869 
1870             case POp::dot_4_floats:
1871                 opArg1 = PtrCtx(stage.ctx, 1);
1872                 std::tie(opArg2, opArg3) = AdjacentPtrCtx(stage.ctx, 4);
1873                 break;
1874 
1875             case POp::shuffle:
1876                 std::tie(opArg1, opArg2) = ShuffleCtx(stage.ctx);
1877                 break;
1878 
1879             case POp::load_condition_mask:
1880             case POp::store_condition_mask:
1881             case POp::load_loop_mask:
1882             case POp::store_loop_mask:
1883             case POp::merge_loop_mask:
1884             case POp::reenable_loop_mask:
1885             case POp::load_return_mask:
1886             case POp::store_return_mask:
1887             case POp::zero_slot_unmasked:
1888             case POp::bitwise_not_int:
1889             case POp::cast_to_float_from_int: case POp::cast_to_float_from_uint:
1890             case POp::cast_to_int_from_float: case POp::cast_to_uint_from_float:
1891             case POp::abs_float:              case POp::abs_int:
1892             case POp::atan_float:
1893             case POp::ceil_float:
1894             case POp::cos_float:
1895             case POp::exp_float:
1896             case POp::floor_float:
1897             case POp::sin_float:
1898             case POp::sqrt_float:
1899             case POp::tan_float:
1900                 opArg1 = PtrCtx(stage.ctx, 1);
1901                 break;
1902 
1903             case POp::zero_2_slots_unmasked:
1904             case POp::bitwise_not_2_ints:
1905             case POp::load_src_rg:               case POp::store_src_rg:
1906             case POp::cast_to_float_from_2_ints: case POp::cast_to_float_from_2_uints:
1907             case POp::cast_to_int_from_2_floats: case POp::cast_to_uint_from_2_floats:
1908             case POp::abs_2_floats:              case POp::abs_2_ints:
1909             case POp::ceil_2_floats:
1910             case POp::floor_2_floats:
1911                 opArg1 = PtrCtx(stage.ctx, 2);
1912                 break;
1913 
1914             case POp::zero_3_slots_unmasked:
1915             case POp::bitwise_not_3_ints:
1916             case POp::cast_to_float_from_3_ints: case POp::cast_to_float_from_3_uints:
1917             case POp::cast_to_int_from_3_floats: case POp::cast_to_uint_from_3_floats:
1918             case POp::abs_3_floats:              case POp::abs_3_ints:
1919             case POp::ceil_3_floats:
1920             case POp::floor_3_floats:
1921                 opArg1 = PtrCtx(stage.ctx, 3);
1922                 break;
1923 
1924             case POp::load_src:
1925             case POp::load_dst:
1926             case POp::store_src:
1927             case POp::store_dst:
1928             case POp::store_device_xy01:
1929             case POp::zero_4_slots_unmasked:
1930             case POp::bitwise_not_4_ints:
1931             case POp::cast_to_float_from_4_ints: case POp::cast_to_float_from_4_uints:
1932             case POp::cast_to_int_from_4_floats: case POp::cast_to_uint_from_4_floats:
1933             case POp::abs_4_floats:              case POp::abs_4_ints:
1934             case POp::ceil_4_floats:
1935             case POp::floor_4_floats:
1936                 opArg1 = PtrCtx(stage.ctx, 4);
1937                 break;
1938 
1939             case POp::copy_constant:
1940                 std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 1);
1941                 break;
1942 
1943             case POp::copy_2_constants:
1944                 std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 2);
1945                 break;
1946 
1947             case POp::copy_3_constants:
1948                 std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 3);
1949                 break;
1950 
1951             case POp::copy_4_constants:
1952                 std::tie(opArg1, opArg2) = CopyConstantCtx(stage.ctx, 4);
1953                 break;
1954 
1955             case POp::copy_slot_masked:
1956             case POp::copy_slot_unmasked:
1957                 std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 1);
1958                 break;
1959 
1960             case POp::copy_2_slots_masked:
1961             case POp::copy_2_slots_unmasked:
1962                 std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 2);
1963                 break;
1964 
1965             case POp::copy_3_slots_masked:
1966             case POp::copy_3_slots_unmasked:
1967                 std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 3);
1968                 break;
1969 
1970             case POp::copy_4_slots_masked:
1971             case POp::copy_4_slots_unmasked:
1972                 std::tie(opArg1, opArg2) = BinaryOpCtx(stage.ctx, 4);
1973                 break;
1974 
1975             case POp::merge_condition_mask:
1976             case POp::add_float:   case POp::add_int:
1977             case POp::sub_float:   case POp::sub_int:
1978             case POp::mul_float:   case POp::mul_int:
1979             case POp::div_float:   case POp::div_int:   case POp::div_uint:
1980                                    case POp::bitwise_and_int:
1981                                    case POp::bitwise_or_int:
1982                                    case POp::bitwise_xor_int:
1983             case POp::min_float:   case POp::min_int:   case POp::min_uint:
1984             case POp::max_float:   case POp::max_int:   case POp::max_uint:
1985             case POp::cmplt_float: case POp::cmplt_int: case POp::cmplt_uint:
1986             case POp::cmple_float: case POp::cmple_int: case POp::cmple_uint:
1987             case POp::cmpeq_float: case POp::cmpeq_int:
1988             case POp::cmpne_float: case POp::cmpne_int:
1989                 std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 1);
1990                 break;
1991 
1992             case POp::mix_float:   case POp::mix_int:
1993                 std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 1);
1994                 break;
1995 
1996             case POp::add_2_floats:   case POp::add_2_ints:
1997             case POp::sub_2_floats:   case POp::sub_2_ints:
1998             case POp::mul_2_floats:   case POp::mul_2_ints:
1999             case POp::div_2_floats:   case POp::div_2_ints:   case POp::div_2_uints:
2000                                       case POp::bitwise_and_2_ints:
2001                                       case POp::bitwise_or_2_ints:
2002                                       case POp::bitwise_xor_2_ints:
2003             case POp::min_2_floats:   case POp::min_2_ints:   case POp::min_2_uints:
2004             case POp::max_2_floats:   case POp::max_2_ints:   case POp::max_2_uints:
2005             case POp::cmplt_2_floats: case POp::cmplt_2_ints: case POp::cmplt_2_uints:
2006             case POp::cmple_2_floats: case POp::cmple_2_ints: case POp::cmple_2_uints:
2007             case POp::cmpeq_2_floats: case POp::cmpeq_2_ints:
2008             case POp::cmpne_2_floats: case POp::cmpne_2_ints:
2009                 std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 2);
2010                 break;
2011 
2012             case POp::mix_2_floats:   case POp::mix_2_ints:
2013                 std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 2);
2014                 break;
2015 
2016             case POp::add_3_floats:   case POp::add_3_ints:
2017             case POp::sub_3_floats:   case POp::sub_3_ints:
2018             case POp::mul_3_floats:   case POp::mul_3_ints:
2019             case POp::div_3_floats:   case POp::div_3_ints:   case POp::div_3_uints:
2020                                       case POp::bitwise_and_3_ints:
2021                                       case POp::bitwise_or_3_ints:
2022                                       case POp::bitwise_xor_3_ints:
2023             case POp::min_3_floats:   case POp::min_3_ints:   case POp::min_3_uints:
2024             case POp::max_3_floats:   case POp::max_3_ints:   case POp::max_3_uints:
2025             case POp::cmplt_3_floats: case POp::cmplt_3_ints: case POp::cmplt_3_uints:
2026             case POp::cmple_3_floats: case POp::cmple_3_ints: case POp::cmple_3_uints:
2027             case POp::cmpeq_3_floats: case POp::cmpeq_3_ints:
2028             case POp::cmpne_3_floats: case POp::cmpne_3_ints:
2029                 std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 3);
2030                 break;
2031 
2032             case POp::mix_3_floats:   case POp::mix_3_ints:
2033                 std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 3);
2034                 break;
2035 
2036             case POp::add_4_floats:   case POp::add_4_ints:
2037             case POp::sub_4_floats:   case POp::sub_4_ints:
2038             case POp::mul_4_floats:   case POp::mul_4_ints:
2039             case POp::div_4_floats:   case POp::div_4_ints:   case POp::div_4_uints:
2040                                       case POp::bitwise_and_4_ints:
2041                                       case POp::bitwise_or_4_ints:
2042                                       case POp::bitwise_xor_4_ints:
2043             case POp::min_4_floats:   case POp::min_4_ints:   case POp::min_4_uints:
2044             case POp::max_4_floats:   case POp::max_4_ints:   case POp::max_4_uints:
2045             case POp::cmplt_4_floats: case POp::cmplt_4_ints: case POp::cmplt_4_uints:
2046             case POp::cmple_4_floats: case POp::cmple_4_ints: case POp::cmple_4_uints:
2047             case POp::cmpeq_4_floats: case POp::cmpeq_4_ints:
2048             case POp::cmpne_4_floats: case POp::cmpne_4_ints:
2049                 std::tie(opArg1, opArg2) = AdjacentPtrCtx(stage.ctx, 4);
2050                 break;
2051 
2052             case POp::mix_4_floats:   case POp::mix_4_ints:
2053                 std::tie(opArg1, opArg2, opArg3) = Adjacent3PtrCtx(stage.ctx, 4);
2054                 break;
2055 
2056             case POp::add_n_floats:   case POp::add_n_ints:
2057             case POp::sub_n_floats:   case POp::sub_n_ints:
2058             case POp::mul_n_floats:   case POp::mul_n_ints:
2059             case POp::div_n_floats:   case POp::div_n_ints:   case POp::div_n_uints:
2060                                       case POp::bitwise_and_n_ints:
2061                                       case POp::bitwise_or_n_ints:
2062                                       case POp::bitwise_xor_n_ints:
2063             case POp::min_n_floats:   case POp::min_n_ints:   case POp::min_n_uints:
2064             case POp::max_n_floats:   case POp::max_n_ints:   case POp::max_n_uints:
2065             case POp::cmplt_n_floats: case POp::cmplt_n_ints: case POp::cmplt_n_uints:
2066             case POp::cmple_n_floats: case POp::cmple_n_ints: case POp::cmple_n_uints:
2067             case POp::cmpeq_n_floats: case POp::cmpeq_n_ints:
2068             case POp::cmpne_n_floats: case POp::cmpne_n_ints:
2069             case POp::atan2_n_floats:
2070             case POp::pow_n_floats:
2071                 std::tie(opArg1, opArg2) = AdjacentBinaryOpCtx(stage.ctx);
2072                 break;
2073 
2074             case POp::mix_n_floats:   case POp::mix_n_ints:
2075                 std::tie(opArg1, opArg2, opArg3) = AdjacentTernaryOpCtx(stage.ctx);
2076                 break;
2077 
2078             case POp::jump:
2079             case POp::branch_if_any_active_lanes:
2080             case POp::branch_if_no_active_lanes:
2081                 opArg1 = BranchOffset(static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx));
2082                 break;
2083 
2084             case POp::branch_if_no_active_lanes_eq: {
2085                 const auto* ctx = static_cast<SkRasterPipeline_BranchIfEqualCtx*>(stage.ctx);
2086                 opArg1 = BranchOffset(ctx);
2087                 opArg2 = PtrCtx(ctx->ptr, 1);
2088                 opArg3 = Imm(sk_bit_cast<float>(ctx->value));
2089                 break;
2090             }
2091             default:
2092                 break;
2093         }
2094 
2095         const char* opName = "";
2096         switch (stage.op) {
2097         #define M(x) case POp::x: opName = #x; break;
2098             SK_RASTER_PIPELINE_OPS_ALL(M)
2099         #undef M
2100             case POp::label:               opName = "label";               break;
2101             case POp::invoke_shader:       opName = "invoke_shader";       break;
2102             case POp::invoke_color_filter: opName = "invoke_color_filter"; break;
2103             case POp::invoke_blender:      opName = "invoke_blender";      break;
2104         }
2105 
2106         std::string opText;
2107         switch (stage.op) {
2108             case POp::init_lane_masks:
2109                 opText = "CondMask = LoopMask = RetMask = true";
2110                 break;
2111 
2112             case POp::load_condition_mask:
2113                 opText = "CondMask = " + opArg1;
2114                 break;
2115 
2116             case POp::store_condition_mask:
2117                 opText = opArg1 + " = CondMask";
2118                 break;
2119 
2120             case POp::merge_condition_mask:
2121                 opText = "CondMask = " + opArg1 + " & " + opArg2;
2122                 break;
2123 
2124             case POp::load_loop_mask:
2125                 opText = "LoopMask = " + opArg1;
2126                 break;
2127 
2128             case POp::store_loop_mask:
2129                 opText = opArg1 + " = LoopMask";
2130                 break;
2131 
2132             case POp::mask_off_loop_mask:
2133                 opText = "LoopMask &= ~(CondMask & LoopMask & RetMask)";
2134                 break;
2135 
2136             case POp::reenable_loop_mask:
2137                 opText = "LoopMask |= " + opArg1;
2138                 break;
2139 
2140             case POp::merge_loop_mask:
2141                 opText = "LoopMask &= " + opArg1;
2142                 break;
2143 
2144             case POp::load_return_mask:
2145                 opText = "RetMask = " + opArg1;
2146                 break;
2147 
2148             case POp::store_return_mask:
2149                 opText = opArg1 + " = RetMask";
2150                 break;
2151 
2152             case POp::mask_off_return_mask:
2153                 opText = "RetMask &= ~(CondMask & LoopMask & RetMask)";
2154                 break;
2155 
2156             case POp::store_src_rg:
2157                 opText = opArg1 + " = src.rg";
2158                 break;
2159 
2160             case POp::store_src:
2161                 opText = opArg1 + " = src.rgba";
2162                 break;
2163 
2164             case POp::store_dst:
2165                 opText = opArg1 + " = dst.rgba";
2166                 break;
2167 
2168             case POp::store_device_xy01:
2169                 opText = opArg1 + " = DeviceCoords.xy01";
2170                 break;
2171 
2172             case POp::load_src_rg:
2173                 opText = "src.rg = " + opArg1;
2174                 break;
2175 
2176             case POp::load_src:
2177                 opText = "src.rgba = " + opArg1;
2178                 break;
2179 
2180             case POp::load_dst:
2181                 opText = "dst.rgba = " + opArg1;
2182                 break;
2183 
2184             case POp::bitwise_and_int:
2185             case POp::bitwise_and_2_ints:
2186             case POp::bitwise_and_3_ints:
2187             case POp::bitwise_and_4_ints:
2188             case POp::bitwise_and_n_ints:
2189                 opText = opArg1 + " &= " + opArg2;
2190                 break;
2191 
2192             case POp::bitwise_or_int:
2193             case POp::bitwise_or_2_ints:
2194             case POp::bitwise_or_3_ints:
2195             case POp::bitwise_or_4_ints:
2196             case POp::bitwise_or_n_ints:
2197                 opText = opArg1 + " |= " + opArg2;
2198                 break;
2199 
2200             case POp::bitwise_xor_int:
2201             case POp::bitwise_xor_2_ints:
2202             case POp::bitwise_xor_3_ints:
2203             case POp::bitwise_xor_4_ints:
2204             case POp::bitwise_xor_n_ints:
2205                 opText = opArg1 + " ^= " + opArg2;
2206                 break;
2207 
2208             case POp::bitwise_not_int:
2209             case POp::bitwise_not_2_ints:
2210             case POp::bitwise_not_3_ints:
2211             case POp::bitwise_not_4_ints:
2212                 opText = opArg1 + " = ~" + opArg1;
2213                 break;
2214 
2215             case POp::cast_to_float_from_int:
2216             case POp::cast_to_float_from_2_ints:
2217             case POp::cast_to_float_from_3_ints:
2218             case POp::cast_to_float_from_4_ints:
2219                 opText = opArg1 + " = IntToFloat(" + opArg1 + ")";
2220                 break;
2221 
2222             case POp::cast_to_float_from_uint:
2223             case POp::cast_to_float_from_2_uints:
2224             case POp::cast_to_float_from_3_uints:
2225             case POp::cast_to_float_from_4_uints:
2226                 opText = opArg1 + " = UintToFloat(" + opArg1 + ")";
2227                 break;
2228 
2229             case POp::cast_to_int_from_float:
2230             case POp::cast_to_int_from_2_floats:
2231             case POp::cast_to_int_from_3_floats:
2232             case POp::cast_to_int_from_4_floats:
2233                 opText = opArg1 + " = FloatToInt(" + opArg1 + ")";
2234                 break;
2235 
2236             case POp::cast_to_uint_from_float:
2237             case POp::cast_to_uint_from_2_floats:
2238             case POp::cast_to_uint_from_3_floats:
2239             case POp::cast_to_uint_from_4_floats:
2240                 opText = opArg1 + " = FloatToUint(" + opArg1 + ")";
2241                 break;
2242 
2243             case POp::copy_slot_masked:            case POp::copy_2_slots_masked:
2244             case POp::copy_3_slots_masked:         case POp::copy_4_slots_masked:
2245             case POp::swizzle_copy_slot_masked:    case POp::swizzle_copy_2_slots_masked:
2246             case POp::swizzle_copy_3_slots_masked: case POp::swizzle_copy_4_slots_masked:
2247                 opText = opArg1 + " = Mask(" + opArg2 + ")";
2248                 break;
2249 
2250             case POp::copy_constant:               case POp::copy_2_constants:
2251             case POp::copy_3_constants:            case POp::copy_4_constants:
2252             case POp::copy_slot_unmasked:          case POp::copy_2_slots_unmasked:
2253             case POp::copy_3_slots_unmasked:       case POp::copy_4_slots_unmasked:
2254             case POp::swizzle_1:                   case POp::swizzle_2:
2255             case POp::swizzle_3:                   case POp::swizzle_4:
2256             case POp::shuffle:
2257                 opText = opArg1 + " = " + opArg2;
2258                 break;
2259 
2260             case POp::zero_slot_unmasked:    case POp::zero_2_slots_unmasked:
2261             case POp::zero_3_slots_unmasked: case POp::zero_4_slots_unmasked:
2262                 opText = opArg1 + " = 0";
2263                 break;
2264 
2265             case POp::abs_float:    case POp::abs_int:
2266             case POp::abs_2_floats: case POp::abs_2_ints:
2267             case POp::abs_3_floats: case POp::abs_3_ints:
2268             case POp::abs_4_floats: case POp::abs_4_ints:
2269                 opText = opArg1 + " = abs(" + opArg1 + ")";
2270                 break;
2271 
2272             case POp::atan_float:
2273                 opText = opArg1 + " = atan(" + opArg1 + ")";
2274                 break;
2275 
2276             case POp::atan2_n_floats:
2277                 opText = opArg1 + " = atan2(" + opArg1 + ", " + opArg2 + ")";
2278                 break;
2279 
2280             case POp::ceil_float:
2281             case POp::ceil_2_floats:
2282             case POp::ceil_3_floats:
2283             case POp::ceil_4_floats:
2284                 opText = opArg1 + " = ceil(" + opArg1 + ")";
2285                 break;
2286 
2287             case POp::cos_float:
2288                 opText = opArg1 + " = cos(" + opArg1 + ")";
2289                 break;
2290 
2291             case POp::dot_2_floats:
2292             case POp::dot_3_floats:
2293             case POp::dot_4_floats:
2294                 opText = opArg1 + " = dot(" + opArg2 + ", " + opArg3 + ")";
2295                 break;
2296 
2297             case POp::exp_float:
2298                 opText = opArg1 + " = exp(" + opArg1 + ")";
2299                 break;
2300 
2301             case POp::pow_n_floats:
2302                 opText = opArg1 + " = pow(" + opArg1 + ", " + opArg2 + ")";
2303                 break;
2304 
2305             case POp::sin_float:
2306                 opText = opArg1 + " = sin(" + opArg1 + ")";
2307                 break;
2308 
2309             case POp::sqrt_float:
2310                 opText = opArg1 + " = sqrt(" + opArg1 + ")";
2311                 break;
2312 
2313             case POp::tan_float:
2314                 opText = opArg1 + " = tan(" + opArg1 + ")";
2315                 break;
2316 
2317             case POp::floor_float:
2318             case POp::floor_2_floats:
2319             case POp::floor_3_floats:
2320             case POp::floor_4_floats:
2321                 opText = opArg1 + " = floor(" + opArg1 + ")";
2322                 break;
2323 
2324             case POp::add_float:    case POp::add_int:
2325             case POp::add_2_floats: case POp::add_2_ints:
2326             case POp::add_3_floats: case POp::add_3_ints:
2327             case POp::add_4_floats: case POp::add_4_ints:
2328             case POp::add_n_floats: case POp::add_n_ints:
2329                 opText = opArg1 + " += " + opArg2;
2330                 break;
2331 
2332             case POp::sub_float:    case POp::sub_int:
2333             case POp::sub_2_floats: case POp::sub_2_ints:
2334             case POp::sub_3_floats: case POp::sub_3_ints:
2335             case POp::sub_4_floats: case POp::sub_4_ints:
2336             case POp::sub_n_floats: case POp::sub_n_ints:
2337                 opText = opArg1 + " -= " + opArg2;
2338                 break;
2339 
2340             case POp::mul_float:    case POp::mul_int:
2341             case POp::mul_2_floats: case POp::mul_2_ints:
2342             case POp::mul_3_floats: case POp::mul_3_ints:
2343             case POp::mul_4_floats: case POp::mul_4_ints:
2344             case POp::mul_n_floats: case POp::mul_n_ints:
2345                 opText = opArg1 + " *= " + opArg2;
2346                 break;
2347 
2348             case POp::div_float:    case POp::div_int:    case POp::div_uint:
2349             case POp::div_2_floats: case POp::div_2_ints: case POp::div_2_uints:
2350             case POp::div_3_floats: case POp::div_3_ints: case POp::div_3_uints:
2351             case POp::div_4_floats: case POp::div_4_ints: case POp::div_4_uints:
2352             case POp::div_n_floats: case POp::div_n_ints: case POp::div_n_uints:
2353                 opText = opArg1 + " /= " + opArg2;
2354                 break;
2355 
2356             case POp::min_float:    case POp::min_int:    case POp::min_uint:
2357             case POp::min_2_floats: case POp::min_2_ints: case POp::min_2_uints:
2358             case POp::min_3_floats: case POp::min_3_ints: case POp::min_3_uints:
2359             case POp::min_4_floats: case POp::min_4_ints: case POp::min_4_uints:
2360             case POp::min_n_floats: case POp::min_n_ints: case POp::min_n_uints:
2361                 opText = opArg1 + " = min(" + opArg1 + ", " + opArg2 + ")";
2362                 break;
2363 
2364             case POp::max_float:    case POp::max_int:    case POp::max_uint:
2365             case POp::max_2_floats: case POp::max_2_ints: case POp::max_2_uints:
2366             case POp::max_3_floats: case POp::max_3_ints: case POp::max_3_uints:
2367             case POp::max_4_floats: case POp::max_4_ints: case POp::max_4_uints:
2368             case POp::max_n_floats: case POp::max_n_ints: case POp::max_n_uints:
2369                 opText = opArg1 + " = max(" + opArg1 + ", " + opArg2 + ")";
2370                 break;
2371 
2372             case POp::cmplt_float:    case POp::cmplt_int:    case POp::cmplt_uint:
2373             case POp::cmplt_2_floats: case POp::cmplt_2_ints: case POp::cmplt_2_uints:
2374             case POp::cmplt_3_floats: case POp::cmplt_3_ints: case POp::cmplt_3_uints:
2375             case POp::cmplt_4_floats: case POp::cmplt_4_ints: case POp::cmplt_4_uints:
2376             case POp::cmplt_n_floats: case POp::cmplt_n_ints: case POp::cmplt_n_uints:
2377                 opText = opArg1 + " = lessThan(" + opArg1 + ", " + opArg2 + ")";
2378                 break;
2379 
2380             case POp::cmple_float:    case POp::cmple_int:    case POp::cmple_uint:
2381             case POp::cmple_2_floats: case POp::cmple_2_ints: case POp::cmple_2_uints:
2382             case POp::cmple_3_floats: case POp::cmple_3_ints: case POp::cmple_3_uints:
2383             case POp::cmple_4_floats: case POp::cmple_4_ints: case POp::cmple_4_uints:
2384             case POp::cmple_n_floats: case POp::cmple_n_ints: case POp::cmple_n_uints:
2385                 opText = opArg1 + " = lessThanEqual(" + opArg1 + ", " + opArg2 + ")";
2386                 break;
2387 
2388             case POp::cmpeq_float:    case POp::cmpeq_int:
2389             case POp::cmpeq_2_floats: case POp::cmpeq_2_ints:
2390             case POp::cmpeq_3_floats: case POp::cmpeq_3_ints:
2391             case POp::cmpeq_4_floats: case POp::cmpeq_4_ints:
2392             case POp::cmpeq_n_floats: case POp::cmpeq_n_ints:
2393                 opText = opArg1 + " = equal(" + opArg1 + ", " + opArg2 + ")";
2394                 break;
2395 
2396             case POp::cmpne_float:    case POp::cmpne_int:
2397             case POp::cmpne_2_floats: case POp::cmpne_2_ints:
2398             case POp::cmpne_3_floats: case POp::cmpne_3_ints:
2399             case POp::cmpne_4_floats: case POp::cmpne_4_ints:
2400             case POp::cmpne_n_floats: case POp::cmpne_n_ints:
2401                 opText = opArg1 + " = notEqual(" + opArg1 + ", " + opArg2 + ")";
2402                 break;
2403 
2404             case POp::mix_float:      case POp::mix_int:
2405             case POp::mix_2_floats:   case POp::mix_2_ints:
2406             case POp::mix_3_floats:   case POp::mix_3_ints:
2407             case POp::mix_4_floats:   case POp::mix_4_ints:
2408             case POp::mix_n_floats:   case POp::mix_n_ints:
2409                 opText = opArg1 + " = mix(" + opArg2 + ", " + opArg3 + ", " + opArg1 + ")";
2410                 break;
2411 
2412             case POp::jump:
2413             case POp::branch_if_any_active_lanes:
2414             case POp::branch_if_no_active_lanes:
2415             case POp::invoke_shader:
2416             case POp::invoke_color_filter:
2417             case POp::invoke_blender:
2418                 opText = std::string(opName) + " " + opArg1;
2419                 break;
2420 
2421             case POp::branch_if_no_active_lanes_eq:
2422                 opText = "branch " + opArg1 + " if no lanes of " + opArg2 + " == " + opArg3;
2423                 break;
2424 
2425             case POp::label:
2426                 opText = "label " + opArg1;
2427                 break;
2428 
2429             case POp::case_op: {
2430                 opText = "if (" + opArg1 + " == " + opArg3 +
2431                          ") { LoopMask = true; " + opArg2 + " = false; }";
2432                 break;
2433             }
2434             default:
2435                 break;
2436         }
2437 
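             // Each dumped line is "<stage #>. <op name padded to 30 chars> <text>", so the
             // output looks roughly like:
             //     7. copy_constant                  x = 0x3F800000 (1.0)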
2438         std::string line = !opText.empty()
2439                 ? SkSL::String::printf("% 5d. %-30s %s\n", index + 1, opName, opText.c_str())
2440                 : SkSL::String::printf("% 5d. %s\n", index + 1, opName);
2441 
2442         out->writeText(line.c_str());
2443     }
2444 }
2445 
2446 }  // namespace RP
2447 }  // namespace SkSL
2448