• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2022 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "src/sksl/codegen/SkSLRasterPipelineBuilder.h"
9 
10 #include "include/core/SkStream.h"
11 #include "include/private/base/SkMalloc.h"
12 #include "include/private/base/SkTo.h"
13 #include "src/base/SkArenaAlloc.h"
14 #include "src/core/SkOpts.h"
15 #include "src/core/SkRasterPipelineContextUtils.h"
16 #include "src/core/SkRasterPipelineOpContexts.h"
17 #include "src/core/SkRasterPipelineOpList.h"
18 #include "src/core/SkTHash.h"
19 #include "src/sksl/SkSLPosition.h"
20 #include "src/sksl/SkSLString.h"
21 #include "src/sksl/tracing/SkSLDebugTracePriv.h"
22 #include "src/sksl/tracing/SkSLTraceHook.h"
23 #include "src/utils/SkBitSet.h"
24 
25 #if !defined(SKSL_STANDALONE)
26 #include "src/core/SkRasterPipeline.h"
27 #endif
28 
29 #include <algorithm>
30 #include <cmath>
31 #include <cstddef>
32 #include <cstring>
33 #include <iterator>
34 #include <string>
35 #include <string_view>
36 #include <tuple>
37 #include <utility>
38 #include <vector>
39 
40 using namespace skia_private;
41 
42 namespace SkSL::RP {
43 
// Groups of related BuilderOps, for use as `case` labels in switch statements.
// Each macro expands to `FirstOp: case SecondOp: ... case LastOp` (note that the
// leading `case` is intentionally omitted from the first entry), so the usage
// pattern is `case ALL_..._CASES:`.

// Unary ops which are only implemented for one slot at a time.
#define ALL_SINGLE_SLOT_UNARY_OP_CASES  \
         BuilderOp::acos_float:         \
    case BuilderOp::asin_float:         \
    case BuilderOp::atan_float:         \
    case BuilderOp::cos_float:          \
    case BuilderOp::exp_float:          \
    case BuilderOp::exp2_float:         \
    case BuilderOp::log_float:          \
    case BuilderOp::log2_float:         \
    case BuilderOp::sin_float:          \
    case BuilderOp::sqrt_float:         \
    case BuilderOp::tan_float

// Unary ops which can operate on a contiguous range of slots.
#define ALL_MULTI_SLOT_UNARY_OP_CASES        \
         BuilderOp::abs_int:                 \
    case BuilderOp::cast_to_float_from_int:  \
    case BuilderOp::cast_to_float_from_uint: \
    case BuilderOp::cast_to_int_from_float:  \
    case BuilderOp::cast_to_uint_from_float: \
    case BuilderOp::ceil_float:              \
    case BuilderOp::floor_float:             \
    case BuilderOp::invsqrt_float

// Binary ops which take both operands from the stack, one slot at a time.
#define ALL_N_WAY_BINARY_OP_CASES   \
         BuilderOp::atan2_n_floats: \
    case BuilderOp::pow_n_floats

// Binary ops which can operate on a contiguous range of slots.
#define ALL_MULTI_SLOT_BINARY_OP_CASES  \
         BuilderOp::add_n_floats:       \
    case BuilderOp::add_n_ints:         \
    case BuilderOp::sub_n_floats:       \
    case BuilderOp::sub_n_ints:         \
    case BuilderOp::mul_n_floats:       \
    case BuilderOp::mul_n_ints:         \
    case BuilderOp::div_n_floats:       \
    case BuilderOp::div_n_ints:         \
    case BuilderOp::div_n_uints:        \
    case BuilderOp::bitwise_and_n_ints: \
    case BuilderOp::bitwise_or_n_ints:  \
    case BuilderOp::bitwise_xor_n_ints: \
    case BuilderOp::mod_n_floats:       \
    case BuilderOp::min_n_floats:       \
    case BuilderOp::min_n_ints:         \
    case BuilderOp::min_n_uints:        \
    case BuilderOp::max_n_floats:       \
    case BuilderOp::max_n_ints:         \
    case BuilderOp::max_n_uints:        \
    case BuilderOp::cmple_n_floats:     \
    case BuilderOp::cmple_n_ints:       \
    case BuilderOp::cmple_n_uints:      \
    case BuilderOp::cmplt_n_floats:     \
    case BuilderOp::cmplt_n_ints:       \
    case BuilderOp::cmplt_n_uints:      \
    case BuilderOp::cmpeq_n_floats:     \
    case BuilderOp::cmpeq_n_ints:       \
    case BuilderOp::cmpne_n_floats:     \
    case BuilderOp::cmpne_n_ints

// Binary ops which take their right-hand operand as an immediate value baked
// into the instruction, rather than from the stack.
#define ALL_IMMEDIATE_BINARY_OP_CASES    \
         BuilderOp::add_imm_float:       \
    case BuilderOp::add_imm_int:         \
    case BuilderOp::mul_imm_float:       \
    case BuilderOp::mul_imm_int:         \
    case BuilderOp::bitwise_and_imm_int: \
    case BuilderOp::bitwise_xor_imm_int: \
    case BuilderOp::min_imm_float:       \
    case BuilderOp::max_imm_float:       \
    case BuilderOp::cmple_imm_float:     \
    case BuilderOp::cmple_imm_int:       \
    case BuilderOp::cmple_imm_uint:      \
    case BuilderOp::cmplt_imm_float:     \
    case BuilderOp::cmplt_imm_int:       \
    case BuilderOp::cmplt_imm_uint:      \
    case BuilderOp::cmpeq_imm_float:     \
    case BuilderOp::cmpeq_imm_int:       \
    case BuilderOp::cmpne_imm_float:     \
    case BuilderOp::cmpne_imm_int

// The subset of immediate binary ops which can apply to multiple slots at once.
#define ALL_IMMEDIATE_MULTI_SLOT_BINARY_OP_CASES \
         BuilderOp::bitwise_and_imm_int

// Ternary ops which take all three operands from the stack, one slot at a time.
#define ALL_N_WAY_TERNARY_OP_CASES       \
         BuilderOp::smoothstep_n_floats

// Ternary ops which can operate on a contiguous range of slots.
#define ALL_MULTI_SLOT_TERNARY_OP_CASES \
         BuilderOp::mix_n_floats:       \
    case BuilderOp::mix_n_ints
131 
is_immediate_op(BuilderOp op)132 static bool is_immediate_op(BuilderOp op) {
133     switch (op) {
134         case ALL_IMMEDIATE_BINARY_OP_CASES: return true;
135         default:                            return false;
136     }
137 }
138 
is_multi_slot_immediate_op(BuilderOp op)139 static bool is_multi_slot_immediate_op(BuilderOp op) {
140     switch (op) {
141         case ALL_IMMEDIATE_MULTI_SLOT_BINARY_OP_CASES: return true;
142         default:                                       return false;
143     }
144 }
145 
convert_n_way_op_to_immediate(BuilderOp op,int slots,int32_t * constantValue)146 static BuilderOp convert_n_way_op_to_immediate(BuilderOp op, int slots, int32_t* constantValue) {
147     // We rely on the exact ordering of SkRP ops here; the immediate-mode op must always come
148     // directly before the n-way op. (If we have more than one, the increasing-slot variations
149     // continue backwards from there.)
150     BuilderOp immOp = (BuilderOp)((int)op - 1);
151 
152     // Some immediate ops support multiple slots.
153     if (is_multi_slot_immediate_op(immOp)) {
154         return immOp;
155     }
156 
157     // Most immediate ops only directly support a single slot. However, it's still faster to execute
158     // `add_imm_int, add_imm_int` instead of `splat_2_ints, add_2_ints`, so we allow those
159     // conversions as well.
160     if (slots <= 2) {
161         if (is_immediate_op(immOp)) {
162             return immOp;
163         }
164 
165         // We also allow for immediate-mode subtraction, by adding a negative value.
166         switch (op) {
167             case BuilderOp::sub_n_ints:
168                 *constantValue *= -1;
169                 return BuilderOp::add_imm_int;
170 
171             case BuilderOp::sub_n_floats: {
172                 // This negates the floating-point value by inverting its sign bit.
173                 *constantValue ^= 0x80000000;
174                 return BuilderOp::add_imm_float;
175             }
176             default:
177                 break;
178         }
179     }
180 
181     // We don't have an immediate-mode version of this op.
182     return op;
183 }
184 
appendInstruction(BuilderOp op,SlotList slots,int immA,int immB,int immC,int immD)185 void Builder::appendInstruction(BuilderOp op, SlotList slots,
186                                 int immA, int immB, int immC, int immD) {
187     fInstructions.push_back({op, slots.fSlotA, slots.fSlotB,
188                              immA, immB, immC, immD, fCurrentStackID});
189 }
190 
lastInstruction(int fromBack)191 Instruction* Builder::lastInstruction(int fromBack) {
192     if (fInstructions.size() <= fromBack) {
193         return nullptr;
194     }
195     Instruction* inst = &fInstructions.fromBack(fromBack);
196     if (inst->fStackID != fCurrentStackID) {
197         return nullptr;
198     }
199     return inst;
200 }
201 
lastInstructionOnAnyStack(int fromBack)202 Instruction* Builder::lastInstructionOnAnyStack(int fromBack) {
203     if (fInstructions.size() <= fromBack) {
204         return nullptr;
205     }
206     return &fInstructions.fromBack(fromBack);
207 }
208 
unary_op(BuilderOp op,int32_t slots)209 void Builder::unary_op(BuilderOp op, int32_t slots) {
210     switch (op) {
211         case ALL_SINGLE_SLOT_UNARY_OP_CASES:
212         case ALL_MULTI_SLOT_UNARY_OP_CASES:
213             this->appendInstruction(op, {}, slots);
214             break;
215 
216         default:
217             SkDEBUGFAIL("not a unary op");
218             break;
219     }
220 }
221 
binary_op(BuilderOp op,int32_t slots)222 void Builder::binary_op(BuilderOp op, int32_t slots) {
223     if (Instruction* lastInstruction = this->lastInstruction()) {
224         // If we just pushed or splatted a constant onto the stack...
225         if (lastInstruction->fOp == BuilderOp::push_constant &&
226             lastInstruction->fImmA >= slots) {
227             // ... and this op has an immediate-mode equivalent...
228             int32_t constantValue = lastInstruction->fImmB;
229             BuilderOp immOp = convert_n_way_op_to_immediate(op, slots, &constantValue);
230             if (immOp != op) {
231                 // ... discard the constants from the stack, and use an immediate-mode op.
232                 this->discard_stack(slots);
233                 this->appendInstruction(immOp, {}, slots, constantValue);
234                 return;
235             }
236         }
237     }
238 
239     switch (op) {
240         case ALL_N_WAY_BINARY_OP_CASES:
241         case ALL_MULTI_SLOT_BINARY_OP_CASES:
242             this->appendInstruction(op, {}, slots);
243             break;
244 
245         default:
246             SkDEBUGFAIL("not a binary op");
247             break;
248     }
249 }
250 
ternary_op(BuilderOp op,int32_t slots)251 void Builder::ternary_op(BuilderOp op, int32_t slots) {
252     switch (op) {
253         case ALL_N_WAY_TERNARY_OP_CASES:
254         case ALL_MULTI_SLOT_TERNARY_OP_CASES:
255             this->appendInstruction(op, {}, slots);
256             break;
257 
258         default:
259             SkDEBUGFAIL("not a ternary op");
260             break;
261     }
262 }
263 
dot_floats(int32_t slots)264 void Builder::dot_floats(int32_t slots) {
265     switch (slots) {
266         case 1: this->appendInstruction(BuilderOp::mul_n_floats, {}, slots); break;
267         case 2: this->appendInstruction(BuilderOp::dot_2_floats, {}, slots); break;
268         case 3: this->appendInstruction(BuilderOp::dot_3_floats, {}, slots); break;
269         case 4: this->appendInstruction(BuilderOp::dot_4_floats, {}, slots); break;
270 
271         default:
272             SkDEBUGFAIL("invalid number of slots");
273             break;
274     }
275 }
276 
refract_floats()277 void Builder::refract_floats() {
278     this->appendInstruction(BuilderOp::refract_4_floats, {});
279 }
280 
inverse_matrix(int32_t n)281 void Builder::inverse_matrix(int32_t n) {
282     switch (n) {
283         case 2:  this->appendInstruction(BuilderOp::inverse_mat2, {}, 4);  break;
284         case 3:  this->appendInstruction(BuilderOp::inverse_mat3, {}, 9);  break;
285         case 4:  this->appendInstruction(BuilderOp::inverse_mat4, {}, 16); break;
286         default: SkUNREACHABLE;
287     }
288 }
289 
pad_stack(int32_t count)290 void Builder::pad_stack(int32_t count) {
291     if (count > 0) {
292         this->appendInstruction(BuilderOp::pad_stack, {}, count);
293     }
294 }
295 
// Peephole optimization: rewrites a trailing `push_slots, immediate-op,
// copy_stack_to_slots_unmasked` sequence so the immediate op is applied
// directly to the value slots, bypassing the stack. Returns true if the
// program was rewritten; the caller is expected to cancel the remaining
// push/discard pair.
bool Builder::simplifyImmediateUnmaskedOp() {
    // The pattern spans three instructions, so anything shorter can't match.
    if (fInstructions.size() < 3) {
        return false;
    }

    // If we detect a pattern of 'push, immediate-op, unmasked pop', then we can
    // convert it into an immediate-op directly onto the value slots and take the
    // stack entirely out of the equation.
    Instruction* popInstruction  = this->lastInstruction(/*fromBack=*/0);
    Instruction* immInstruction  = this->lastInstruction(/*fromBack=*/1);
    Instruction* pushInstruction = this->lastInstruction(/*fromBack=*/2);

    // If the last instruction is an unmasked pop...
    if (popInstruction && immInstruction && pushInstruction &&
        popInstruction->fOp == BuilderOp::copy_stack_to_slots_unmasked) {
        // ... and the prior instruction was an immediate-mode op, with the same number of slots...
        if (is_immediate_op(immInstruction->fOp) &&
            immInstruction->fImmA == popInstruction->fImmA) {
            // ... and we support multiple-slot immediates (if this op calls for it)...
            if (immInstruction->fImmA == 1 || is_multi_slot_immediate_op(immInstruction->fOp)) {
                // ... and the prior instruction was `push_slots` or `push_immutable` of at least
                // that many slots...
                if ((pushInstruction->fOp == BuilderOp::push_slots ||
                     pushInstruction->fOp == BuilderOp::push_immutable) &&
                    pushInstruction->fImmA >= popInstruction->fImmA) {
                    // ... onto the same slot range...
                    // (Both are one-past-the-end of their respective ranges.)
                    Slot immSlot = popInstruction->fSlotA + popInstruction->fImmA;
                    Slot pushSlot = pushInstruction->fSlotA + pushInstruction->fImmA;
                    if (immSlot == pushSlot) {
                        // ... we can shrink the push, eliminate the pop, and perform the immediate
                        // op in-place instead.
                        pushInstruction->fImmA -= immInstruction->fImmA;
                        immInstruction->fSlotA = immSlot - immInstruction->fImmA;
                        fInstructions.pop_back();
                        return true;
                    }
                }
            }
        }
    }

    return false;
}
339 
// Removes `count` slots from the top of the stack identified by `stackID`.
// Before emitting a discard_stack op, this repeatedly peepholes the tail of
// the instruction list: discards are merged, push/discard pairs cancel out,
// and unmasked pops are simplified where possible.
void Builder::discard_stack(int32_t count, int stackID) {
    // If we pushed something onto the stack and then immediately discarded part of it, we can
    // shrink or eliminate the push.
    while (count > 0) {
        Instruction* lastInstruction = this->lastInstructionOnAnyStack();
        // Only fold into instructions operating on the same stack.
        if (!lastInstruction || lastInstruction->fStackID != stackID) {
            break;
        }

        switch (lastInstruction->fOp) {
            case BuilderOp::discard_stack:
                // Our last op was actually a separate discard_stack; combine the discards.
                lastInstruction->fImmA += count;
                return;

            case BuilderOp::push_clone:
            case BuilderOp::push_clone_from_stack:
            case BuilderOp::push_clone_indirect_from_stack:
            case BuilderOp::push_constant:
            case BuilderOp::push_immutable:
            case BuilderOp::push_immutable_indirect:
            case BuilderOp::push_slots:
            case BuilderOp::push_slots_indirect:
            case BuilderOp::push_uniform:
            case BuilderOp::push_uniform_indirect:
            case BuilderOp::pad_stack: {
                // Our last op was a multi-slot push; these cancel out. Eliminate the op if its
                // count reached zero.
                int cancelOut = std::min(count, lastInstruction->fImmA);
                count                  -= cancelOut;
                lastInstruction->fImmA -= cancelOut;
                if (lastInstruction->fImmA == 0) {
                    fInstructions.pop_back();
                }
                // Keep looping; there may be more push ops to cancel against.
                continue;
            }
            case BuilderOp::push_condition_mask:
            case BuilderOp::push_loop_mask:
            case BuilderOp::push_return_mask:
                // Our last op was a single-slot push; cancel out one discard and eliminate the op.
                --count;
                fInstructions.pop_back();
                continue;

            case BuilderOp::copy_stack_to_slots_unmasked: {
                // Look for a pattern of `push, immediate-ops, pop` and simplify it down to an
                // immediate-op directly to the value slot.
                if (count == 1) {
                    if (this->simplifyImmediateUnmaskedOp()) {
                        return;
                    }
                }

                // A `copy_stack_to_slots_unmasked` op, followed immediately by a `discard_stack`
                // op with an equal number of slots, is interpreted as an unmasked stack pop.
                // We can simplify pops in a variety of ways. First, temporarily get rid of
                // `copy_stack_to_slots_unmasked`.
                if (count == lastInstruction->fImmA) {
                    SlotRange dst{lastInstruction->fSlotA, lastInstruction->fImmA};
                    fInstructions.pop_back();

                    // See if we can write this pop in a simpler way.
                    this->simplifyPopSlotsUnmasked(&dst);

                    // If simplification consumed the entire range, we're done!
                    if (dst.count == 0) {
                        return;
                    }

                    // Simplification did not consume the entire range. We are still responsible for
                    // copying-back and discarding any remaining slots.
                    this->copy_stack_to_slots_unmasked(dst);
                    count = dst.count;
                }
                break;
            }
            default:
                break;
        }

        // This instruction wasn't a push.
        break;
    }

    // Emit a discard for whatever wasn't canceled out above.
    if (count > 0) {
        this->appendInstruction(BuilderOp::discard_stack, {}, count);
    }
}
428 
label(int labelID)429 void Builder::label(int labelID) {
430     SkASSERT(labelID >= 0 && labelID < fNumLabels);
431 
432     // If the previous instruction was a branch to this label, it's a no-op; jumping to the very
433     // next instruction is effectively meaningless.
434     while (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
435         switch (lastInstruction->fOp) {
436             case BuilderOp::jump:
437             case BuilderOp::branch_if_all_lanes_active:
438             case BuilderOp::branch_if_any_lanes_active:
439             case BuilderOp::branch_if_no_lanes_active:
440             case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal:
441                 if (lastInstruction->fImmA == labelID) {
442                     fInstructions.pop_back();
443                     continue;
444                 }
445                 break;
446 
447             default:
448                 break;
449         }
450         break;
451     }
452     this->appendInstruction(BuilderOp::label, {}, labelID);
453 }
454 
jump(int labelID)455 void Builder::jump(int labelID) {
456     SkASSERT(labelID >= 0 && labelID < fNumLabels);
457     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
458         if (lastInstruction->fOp == BuilderOp::jump) {
459             // The previous instruction was also `jump`, so this branch could never possibly occur.
460             return;
461         }
462     }
463     this->appendInstruction(BuilderOp::jump, {}, labelID);
464 }
465 
branch_if_any_lanes_active(int labelID)466 void Builder::branch_if_any_lanes_active(int labelID) {
467     if (!this->executionMaskWritesAreEnabled()) {
468         this->jump(labelID);
469         return;
470     }
471 
472     SkASSERT(labelID >= 0 && labelID < fNumLabels);
473     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
474         if (lastInstruction->fOp == BuilderOp::branch_if_any_lanes_active ||
475             lastInstruction->fOp == BuilderOp::jump) {
476             // The previous instruction was `jump` or `branch_if_any_lanes_active`, so this branch
477             // could never possibly occur.
478             return;
479         }
480     }
481     this->appendInstruction(BuilderOp::branch_if_any_lanes_active, {}, labelID);
482 }
483 
branch_if_all_lanes_active(int labelID)484 void Builder::branch_if_all_lanes_active(int labelID) {
485     if (!this->executionMaskWritesAreEnabled()) {
486         this->jump(labelID);
487         return;
488     }
489 
490     SkASSERT(labelID >= 0 && labelID < fNumLabels);
491     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
492         if (lastInstruction->fOp == BuilderOp::branch_if_all_lanes_active ||
493             lastInstruction->fOp == BuilderOp::jump) {
494             // The previous instruction was `jump` or `branch_if_all_lanes_active`, so this branch
495             // could never possibly occur.
496             return;
497         }
498     }
499     this->appendInstruction(BuilderOp::branch_if_all_lanes_active, {}, labelID);
500 }
501 
branch_if_no_lanes_active(int labelID)502 void Builder::branch_if_no_lanes_active(int labelID) {
503     if (!this->executionMaskWritesAreEnabled()) {
504         return;
505     }
506 
507     SkASSERT(labelID >= 0 && labelID < fNumLabels);
508     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
509         if (lastInstruction->fOp == BuilderOp::branch_if_no_lanes_active ||
510             lastInstruction->fOp == BuilderOp::jump) {
511             // The previous instruction was `jump` or `branch_if_no_lanes_active`, so this branch
512             // could never possibly occur.
513             return;
514         }
515     }
516     this->appendInstruction(BuilderOp::branch_if_no_lanes_active, {}, labelID);
517 }
518 
branch_if_no_active_lanes_on_stack_top_equal(int value,int labelID)519 void Builder::branch_if_no_active_lanes_on_stack_top_equal(int value, int labelID) {
520     SkASSERT(labelID >= 0 && labelID < fNumLabels);
521     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
522         if (lastInstruction->fOp == BuilderOp::jump ||
523             (lastInstruction->fOp == BuilderOp::branch_if_no_active_lanes_on_stack_top_equal &&
524              lastInstruction->fImmB == value)) {
525             // The previous instruction was `jump` or `branch_if_no_active_lanes_on_stack_top_equal`
526             // (checking against the same value), so this branch could never possibly occur.
527             return;
528         }
529     }
530     this->appendInstruction(BuilderOp::branch_if_no_active_lanes_on_stack_top_equal,
531                             {}, labelID, value);
532 }
533 
// Pushes the slot range `src` onto the stack via `op` (push_slots or
// push_immutable). Applies two peepholes: adjacent pushes of contiguous slot
// ranges are merged, and a trailing `copy_stack_to_slots, discard_stack,
// push_slots` round-trip over the same range is eliminated entirely.
void Builder::push_slots_or_immutable(SlotRange src, BuilderOp op) {
    SkASSERT(src.count >= 0);
    if (Instruction* lastInstruction = this->lastInstruction()) {
        // If the previous instruction was pushing slots contiguous to this range, we can collapse
        // the two pushes into one larger push.
        if (lastInstruction->fOp == op &&
            lastInstruction->fSlotA + lastInstruction->fImmA == src.index) {
            lastInstruction->fImmA += src.count;
            // Mark the range as consumed so no separate push is emitted below.
            src.count = 0;
        }
    }

    if (src.count > 0) {
        this->appendInstruction(op, {src.index}, src.count);
    }

    // Look for a sequence of "copy stack to X, discard stack, copy X to stack". This is a common
    // pattern when multiple operations in a row affect the same variable. When we see this, we can
    // eliminate both the discard and the push.
    if (fInstructions.size() >= 3) {
        const Instruction* pushInst        = this->lastInstruction(/*fromBack=*/0);
        const Instruction* discardInst     = this->lastInstruction(/*fromBack=*/1);
        const Instruction* copyToSlotsInst = this->lastInstruction(/*fromBack=*/2);

        if (pushInst && discardInst && copyToSlotsInst && pushInst->fOp == BuilderOp::push_slots) {
            int pushIndex = pushInst->fSlotA;
            int pushCount = pushInst->fImmA;

            // Look for a `discard_stack` matching our push count.
            if (discardInst->fOp == BuilderOp::discard_stack && discardInst->fImmA == pushCount) {
                // Look for a `copy_stack_to_slots` matching our push.
                if ((copyToSlotsInst->fOp == BuilderOp::copy_stack_to_slots ||
                     copyToSlotsInst->fOp == BuilderOp::copy_stack_to_slots_unmasked) &&
                    copyToSlotsInst->fSlotA == pushIndex && copyToSlotsInst->fImmA == pushCount) {
                    // We found a matching sequence. Remove the discard and push.
                    fInstructions.pop_back();
                    fInstructions.pop_back();
                    return;
                }
            }
        }
    }
}
577 
push_slots_or_immutable_indirect(SlotRange fixedRange,int dynamicStackID,SlotRange limitRange,BuilderOp op)578 void Builder::push_slots_or_immutable_indirect(SlotRange fixedRange,
579                                                int dynamicStackID,
580                                                SlotRange limitRange,
581                                                BuilderOp op) {
582     // SlotA: fixed-range start
583     // SlotB: limit-range end
584     // immA: number of slots
585     // immB: dynamic stack ID
586     this->appendInstruction(op,
587                             {fixedRange.index, limitRange.index + limitRange.count},
588                             fixedRange.count,
589                             dynamicStackID);
590 }
591 
push_uniform(SlotRange src)592 void Builder::push_uniform(SlotRange src) {
593     SkASSERT(src.count >= 0);
594     if (Instruction* lastInstruction = this->lastInstruction()) {
595         // If the previous instruction was pushing uniforms contiguous to this range, we can
596         // collapse the two pushes into one larger push.
597         if (lastInstruction->fOp == BuilderOp::push_uniform &&
598             lastInstruction->fSlotA + lastInstruction->fImmA == src.index) {
599             lastInstruction->fImmA += src.count;
600             return;
601         }
602     }
603 
604     if (src.count > 0) {
605         this->appendInstruction(BuilderOp::push_uniform, {src.index}, src.count);
606     }
607 }
608 
push_uniform_indirect(SlotRange fixedRange,int dynamicStackID,SlotRange limitRange)609 void Builder::push_uniform_indirect(SlotRange fixedRange,
610                                     int dynamicStackID,
611                                     SlotRange limitRange) {
612     // SlotA: fixed-range start
613     // SlotB: limit-range end
614     // immA: number of slots
615     // immB: dynamic stack ID
616     this->appendInstruction(BuilderOp::push_uniform_indirect,
617                             {fixedRange.index, limitRange.index + limitRange.count},
618                             fixedRange.count,
619                             dynamicStackID);
620 }
621 
trace_var_indirect(int traceMaskStackID,SlotRange fixedRange,int dynamicStackID,SlotRange limitRange)622 void Builder::trace_var_indirect(int traceMaskStackID,
623                                  SlotRange fixedRange,
624                                  int dynamicStackID,
625                                  SlotRange limitRange) {
626     // SlotA: fixed-range start
627     // SlotB: limit-range end
628     // immA: trace-mask stack ID
629     // immB: number of slots
630     // immC: dynamic stack ID
631     this->appendInstruction(BuilderOp::trace_var_indirect,
632                             {fixedRange.index, limitRange.index + limitRange.count},
633                             traceMaskStackID,
634                             fixedRange.count,
635                             dynamicStackID);
636 }
637 
push_constant_i(int32_t val,int count)638 void Builder::push_constant_i(int32_t val, int count) {
639     SkASSERT(count >= 0);
640     if (count > 0) {
641         if (Instruction* lastInstruction = this->lastInstruction()) {
642             // If the previous op is pushing the same value, we can just push more of them.
643             if (lastInstruction->fOp == BuilderOp::push_constant && lastInstruction->fImmB == val) {
644                 lastInstruction->fImmA += count;
645                 return;
646             }
647         }
648         this->appendInstruction(BuilderOp::push_constant, {}, count, val);
649     }
650 }
651 
// Pushes `count` additional copies of the current stack-top value, choosing
// the cheapest mix of swizzle-splats and clone ops to produce them.
void Builder::push_duplicates(int count) {
    if (Instruction* lastInstruction = this->lastInstruction()) {
        // If the previous op is pushing a constant, we can just push more of them.
        if (lastInstruction->fOp == BuilderOp::push_constant) {
            lastInstruction->fImmA += count;
            return;
        }
    }
    SkASSERT(count >= 0);
    if (count >= 3) {
        // Use a swizzle to splat the input into a 4-slot value.
        // (This consumes one slot and produces four, netting three duplicates.)
        this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0});
        count -= 3;
    }
    for (; count >= 4; count -= 4) {
        // Clone the splatted value four slots at a time.
        this->push_clone(/*numSlots=*/4);
    }
    // Use a swizzle or clone to handle the trailing items.
    switch (count) {
        case 3:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0}); break;
        case 2:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0});    break;
        case 1:  this->push_clone(/*numSlots=*/1);                 break;
        default: break;
    }
}
678 
push_clone(int numSlots,int offsetFromStackTop)679 void Builder::push_clone(int numSlots, int offsetFromStackTop) {
680     // If we are cloning the stack top...
681     if (numSlots == 1 && offsetFromStackTop == 0) {
682         // ... and the previous op is pushing a constant...
683         if (Instruction* lastInstruction = this->lastInstruction()) {
684             if (lastInstruction->fOp == BuilderOp::push_constant) {
685                 // ... we can just push more of them.
686                 lastInstruction->fImmA += 1;
687                 return;
688             }
689         }
690     }
691     this->appendInstruction(BuilderOp::push_clone, {}, numSlots, numSlots + offsetFromStackTop);
692 }
693 
push_clone_from_stack(SlotRange range,int otherStackID,int offsetFromStackTop)694 void Builder::push_clone_from_stack(SlotRange range, int otherStackID, int offsetFromStackTop) {
695     // immA: number of slots
696     // immB: other stack ID
697     // immC: offset from stack top
698     offsetFromStackTop -= range.index;
699 
700     if (Instruction* lastInstruction = this->lastInstruction()) {
701         // If the previous op is also pushing a clone...
702         if (lastInstruction->fOp == BuilderOp::push_clone_from_stack &&
703             // ... from the same stack...
704             lastInstruction->fImmB == otherStackID &&
705             // ... and this clone starts at the same place that the last clone ends...
706             lastInstruction->fImmC - lastInstruction->fImmA == offsetFromStackTop) {
707             // ... just extend the existing clone-op.
708             lastInstruction->fImmA += range.count;
709             return;
710         }
711     }
712 
713     this->appendInstruction(BuilderOp::push_clone_from_stack, {},
714                             range.count, otherStackID, offsetFromStackTop);
715 }
716 
push_clone_indirect_from_stack(SlotRange fixedOffset,int dynamicStackID,int otherStackID,int offsetFromStackTop)717 void Builder::push_clone_indirect_from_stack(SlotRange fixedOffset,
718                                              int dynamicStackID,
719                                              int otherStackID,
720                                              int offsetFromStackTop) {
721     // immA: number of slots
722     // immB: other stack ID
723     // immC: offset from stack top
724     // immD: dynamic stack ID
725     offsetFromStackTop -= fixedOffset.index;
726 
727     this->appendInstruction(BuilderOp::push_clone_indirect_from_stack, {},
728                             fixedOffset.count, otherStackID, offsetFromStackTop, dynamicStackID);
729 }
730 
pop_slots(SlotRange dst)731 void Builder::pop_slots(SlotRange dst) {
732     if (!this->executionMaskWritesAreEnabled()) {
733         this->pop_slots_unmasked(dst);
734         return;
735     }
736 
737     this->copy_stack_to_slots(dst);
738     this->discard_stack(dst.count);
739 }
740 
void Builder::simplifyPopSlotsUnmasked(SlotRange* dst) {
    // Simplifies a pop-to-`dst` by walking backwards from the newest instruction: trailing push
    // ops are peeled off one slot at a time and replaced with direct copies into the tail of
    // `dst`. On return, `dst` has been shrunk to cover only the slots that still need to be
    // popped conventionally (possibly zero).
    if (!dst->count) {
        // There's nothing left to simplify.
        return;
    }
    Instruction* lastInstruction = this->lastInstruction();
    if (!lastInstruction) {
        // There's nothing left to simplify.
        return;
    }
    BuilderOp lastOp = lastInstruction->fOp;

    // If the last instruction is pushing a constant, we can simplify it by copying the constant
    // directly into the destination slot.
    if (lastOp == BuilderOp::push_constant) {
        // Get the last slot. (A push_constant holds its value in immB and its slot count in
        // immA; every slot it pushes holds the same value, so peeling one slot off is trivial.)
        int32_t value = lastInstruction->fImmB;
        lastInstruction->fImmA--;
        if (lastInstruction->fImmA == 0) {
            // The push is now empty; remove the instruction entirely.
            fInstructions.pop_back();
        }

        // Consume one destination slot.
        dst->count--;
        Slot destinationSlot = dst->index + dst->count;

        // Continue simplifying if possible.
        this->simplifyPopSlotsUnmasked(dst);

        // Write the constant directly to the destination slot.
        this->copy_constant(destinationSlot, value);
        return;
    }

    // If the last instruction is pushing a uniform, we can simplify it by copying the uniform
    // directly into the destination slot.
    if (lastOp == BuilderOp::push_uniform) {
        // Get the last slot of the pushed uniform range.
        Slot sourceSlot = lastInstruction->fSlotA + lastInstruction->fImmA - 1;
        lastInstruction->fImmA--;
        if (lastInstruction->fImmA == 0) {
            // The push is now empty; remove the instruction entirely.
            fInstructions.pop_back();
        }

        // Consume one destination slot.
        dst->count--;
        Slot destinationSlot = dst->index + dst->count;

        // Continue simplifying if possible.
        this->simplifyPopSlotsUnmasked(dst);

        // Write the uniform directly to the destination slot.
        this->copy_uniform_to_slots_unmasked({destinationSlot, 1}, {sourceSlot, 1});
        return;
    }

    // If the last instruction is pushing a slot or immutable, we can just copy that slot.
    if (lastOp == BuilderOp::push_slots || lastOp == BuilderOp::push_immutable) {
        // Get the last slot of the pushed range.
        Slot sourceSlot = lastInstruction->fSlotA + lastInstruction->fImmA - 1;
        lastInstruction->fImmA--;
        if (lastInstruction->fImmA == 0) {
            // The push is now empty; remove the instruction entirely.
            fInstructions.pop_back();
        }

        // Consume one destination slot.
        dst->count--;
        Slot destinationSlot = dst->index + dst->count;

        // Try once more.
        this->simplifyPopSlotsUnmasked(dst);

        // Copy the slot directly.
        if (lastOp == BuilderOp::push_slots) {
            if (destinationSlot != sourceSlot) {
                this->copy_slots_unmasked({destinationSlot, 1}, {sourceSlot, 1});
            } else {
                // Copying from a value-slot into the same value-slot is a no-op.
            }
        } else {
            // Copy from immutable data directly to the destination slot.
            this->copy_immutable_unmasked({destinationSlot, 1}, {sourceSlot, 1});
        }
        return;
    }
}
827 
pop_slots_unmasked(SlotRange dst)828 void Builder::pop_slots_unmasked(SlotRange dst) {
829     SkASSERT(dst.count >= 0);
830     this->copy_stack_to_slots_unmasked(dst);
831     this->discard_stack(dst.count);
832 }
833 
exchange_src()834 void Builder::exchange_src() {
835     if (Instruction* lastInstruction = this->lastInstruction()) {
836         // If the previous op is also an exchange-src...
837         if (lastInstruction->fOp == BuilderOp::exchange_src) {
838             // ... both ops can be eliminated. A double-swap is a no-op.
839             fInstructions.pop_back();
840             return;
841         }
842     }
843 
844     this->appendInstruction(BuilderOp::exchange_src, {});
845 }
846 
pop_src_rgba()847 void Builder::pop_src_rgba() {
848     if (Instruction* lastInstruction = this->lastInstruction()) {
849         // If the previous op is exchanging src.rgba with the stack...
850         if (lastInstruction->fOp == BuilderOp::exchange_src) {
851             // ... both ops can be eliminated. It's just sliding the color back and forth.
852             fInstructions.pop_back();
853             this->discard_stack(4);
854             return;
855         }
856     }
857 
858     this->appendInstruction(BuilderOp::pop_src_rgba, {});
859 }
860 
copy_stack_to_slots(SlotRange dst,int offsetFromStackTop)861 void Builder::copy_stack_to_slots(SlotRange dst, int offsetFromStackTop) {
862     // If the execution mask is known to be all-true, then we can ignore the write mask.
863     if (!this->executionMaskWritesAreEnabled()) {
864         this->copy_stack_to_slots_unmasked(dst, offsetFromStackTop);
865         return;
866     }
867 
868     // If the last instruction copied the previous stack slots, just extend it.
869     if (Instruction* lastInstruction = this->lastInstruction()) {
870         // If the last op is copy-stack-to-slots...
871         if (lastInstruction->fOp == BuilderOp::copy_stack_to_slots &&
872             // and this op's destination is immediately after the last copy-slots-op's destination
873             lastInstruction->fSlotA + lastInstruction->fImmA == dst.index &&
874             // and this op's source is immediately after the last copy-slots-op's source
875             lastInstruction->fImmB - lastInstruction->fImmA == offsetFromStackTop) {
876             // then we can just extend the copy!
877             lastInstruction->fImmA += dst.count;
878             return;
879         }
880     }
881 
882     this->appendInstruction(BuilderOp::copy_stack_to_slots, {dst.index},
883                             dst.count, offsetFromStackTop);
884 }
885 
copy_stack_to_slots_indirect(SlotRange fixedRange,int dynamicStackID,SlotRange limitRange)886 void Builder::copy_stack_to_slots_indirect(SlotRange fixedRange,
887                                            int dynamicStackID,
888                                            SlotRange limitRange) {
889     // SlotA: fixed-range start
890     // SlotB: limit-range end
891     // immA: number of slots
892     // immB: dynamic stack ID
893     this->appendInstruction(BuilderOp::copy_stack_to_slots_indirect,
894                             {fixedRange.index, limitRange.index + limitRange.count},
895                             fixedRange.count,
896                             dynamicStackID);
897 }
898 
slot_ranges_overlap(SlotRange x,SlotRange y)899 static bool slot_ranges_overlap(SlotRange x, SlotRange y) {
900     return x.index < y.index + y.count &&
901            y.index < x.index + x.count;
902 }
903 
copy_constant(Slot slot,int constantValue)904 void Builder::copy_constant(Slot slot, int constantValue) {
905     // If the last instruction copied the same constant, just extend it.
906     if (Instruction* lastInstr = this->lastInstruction()) {
907         // If the last op is copy-constant...
908         if (lastInstr->fOp == BuilderOp::copy_constant &&
909             // ... and has the same value...
910             lastInstr->fImmB == constantValue &&
911             // ... and the slot is immediately after the last copy-constant's destination...
912             lastInstr->fSlotA + lastInstr->fImmA == slot) {
913             // ... then we can extend the copy!
914             lastInstr->fImmA += 1;
915             return;
916         }
917     }
918 
919     this->appendInstruction(BuilderOp::copy_constant, {slot}, 1, constantValue);
920 }
921 
void Builder::copy_slots_unmasked(SlotRange dst, SlotRange src) {
    // If the last instruction copied adjacent slots, just extend it.
    if (Instruction* lastInstr = this->lastInstruction()) {
        // If the last op is a match...
        if (lastInstr->fOp == BuilderOp::copy_slot_unmasked &&
            // and this op's destination is immediately after the last copy-slots-op's destination
            lastInstr->fSlotA + lastInstr->fImmA == dst.index &&
            // and this op's source is immediately after the last copy-slots-op's source
            lastInstr->fSlotB + lastInstr->fImmA == src.index &&
            // and the *merged* source/dest ranges (old count plus new count) will not overlap —
            // the widened instruction must remain safe to execute as one block copy
            !slot_ranges_overlap({lastInstr->fSlotB, lastInstr->fImmA + dst.count},
                                 {lastInstr->fSlotA, lastInstr->fImmA + dst.count})) {
            // then we can just extend the copy!
            lastInstr->fImmA += dst.count;
            return;
        }
    }

    SkASSERT(dst.count == src.count);
    this->appendInstruction(BuilderOp::copy_slot_unmasked, {dst.index, src.index}, dst.count);
}
943 
copy_immutable_unmasked(SlotRange dst,SlotRange src)944 void Builder::copy_immutable_unmasked(SlotRange dst, SlotRange src) {
945     // If the last instruction copied adjacent immutable data, just extend it.
946     if (Instruction* lastInstr = this->lastInstruction()) {
947         // If the last op is a match...
948         if (lastInstr->fOp == BuilderOp::copy_immutable_unmasked &&
949             // and this op's destination is immediately after the last copy-slots-op's destination
950             lastInstr->fSlotA + lastInstr->fImmA == dst.index &&
951             // and this op's source is immediately after the last copy-slots-op's source
952             lastInstr->fSlotB + lastInstr->fImmA == src.index) {
953             // then we can just extend the copy!
954             lastInstr->fImmA += dst.count;
955             return;
956         }
957     }
958 
959     SkASSERT(dst.count == src.count);
960     this->appendInstruction(BuilderOp::copy_immutable_unmasked, {dst.index, src.index}, dst.count);
961 }
962 
copy_uniform_to_slots_unmasked(SlotRange dst,SlotRange src)963 void Builder::copy_uniform_to_slots_unmasked(SlotRange dst, SlotRange src) {
964     // If the last instruction copied adjacent uniforms, just extend it.
965     if (Instruction* lastInstr = this->lastInstruction()) {
966         // If the last op is copy-constant...
967         if (lastInstr->fOp == BuilderOp::copy_uniform_to_slots_unmasked &&
968             // and this op's destination is immediately after the last copy-constant's destination
969             lastInstr->fSlotB + lastInstr->fImmA == dst.index &&
970             // and this op's source is immediately after the last copy-constant's source
971             lastInstr->fSlotA + lastInstr->fImmA == src.index) {
972             // then we can just extend the copy!
973             lastInstr->fImmA += dst.count;
974             return;
975         }
976     }
977 
978     SkASSERT(dst.count == src.count);
979     this->appendInstruction(BuilderOp::copy_uniform_to_slots_unmasked, {src.index, dst.index},
980                             dst.count);
981 }
982 
copy_stack_to_slots_unmasked(SlotRange dst,int offsetFromStackTop)983 void Builder::copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop) {
984     // If the last instruction copied the previous stack slots, just extend it.
985     if (Instruction* lastInstr = this->lastInstruction()) {
986         // If the last op is copy-stack-to-slots-unmasked...
987         if (lastInstr->fOp == BuilderOp::copy_stack_to_slots_unmasked &&
988             // and this op's destination is immediately after the last copy-slots-op's destination
989             lastInstr->fSlotA + lastInstr->fImmA == dst.index &&
990             // and this op's source is immediately after the last copy-slots-op's source
991             lastInstr->fImmB - lastInstr->fImmA == offsetFromStackTop) {
992             // then we can just extend the copy!
993             lastInstr->fImmA += dst.count;
994             return;
995         }
996     }
997 
998     this->appendInstruction(BuilderOp::copy_stack_to_slots_unmasked, {dst.index},
999                             dst.count, offsetFromStackTop);
1000 }
1001 
pop_return_mask()1002 void Builder::pop_return_mask() {
1003     SkASSERT(this->executionMaskWritesAreEnabled());
1004 
1005     // This instruction is going to overwrite the return mask. If the previous instruction was
1006     // masking off the return mask, that's wasted work and it can be eliminated.
1007     if (Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
1008         if (lastInstruction->fOp == BuilderOp::mask_off_return_mask) {
1009             fInstructions.pop_back();
1010         }
1011     }
1012 
1013     this->appendInstruction(BuilderOp::pop_return_mask, {});
1014 }
1015 
push_condition_mask()1016 void Builder::push_condition_mask() {
1017     SkASSERT(this->executionMaskWritesAreEnabled());
1018 
1019     // If the previous instruction is popping the condition mask, we can restore it onto the stack
1020     // "for free" instead of copying it.
1021     if (Instruction* lastInstruction = this->lastInstruction()) {
1022         if (lastInstruction->fOp == BuilderOp::pop_condition_mask) {
1023             this->pad_stack(1);
1024             return;
1025         }
1026     }
1027     this->appendInstruction(BuilderOp::push_condition_mask, {});
1028 }
1029 
merge_condition_mask()1030 void Builder::merge_condition_mask() {
1031     SkASSERT(this->executionMaskWritesAreEnabled());
1032 
1033     // This instruction is going to overwrite the condition mask. If the previous instruction was
1034     // loading the condition mask, that's wasted work and it can be eliminated.
1035     if (Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
1036         if (lastInstruction->fOp == BuilderOp::pop_condition_mask) {
1037             int stackID = lastInstruction->fStackID;
1038             fInstructions.pop_back();
1039             this->discard_stack(/*count=*/1, stackID);
1040         }
1041     }
1042 
1043     this->appendInstruction(BuilderOp::merge_condition_mask, {});
1044 }
1045 
zero_slots_unmasked(SlotRange dst)1046 void Builder::zero_slots_unmasked(SlotRange dst) {
1047     if (Instruction* lastInstruction = this->lastInstruction()) {
1048         if (lastInstruction->fOp == BuilderOp::copy_constant && lastInstruction->fImmB == 0) {
1049             if (lastInstruction->fSlotA + lastInstruction->fImmA == dst.index) {
1050                 // The previous instruction was zeroing the range immediately before this range.
1051                 // Combine the ranges.
1052                 lastInstruction->fImmA += dst.count;
1053                 return;
1054             }
1055 
1056             if (lastInstruction->fSlotA == dst.index + dst.count) {
1057                 // The previous instruction was zeroing the range immediately after this range.
1058                 // Combine the ranges.
1059                 lastInstruction->fSlotA = dst.index;
1060                 lastInstruction->fImmA += dst.count;
1061                 return;
1062             }
1063         }
1064     }
1065 
1066     this->appendInstruction(BuilderOp::copy_constant, {dst.index}, dst.count, 0);
1067 }
1068 
pack_nybbles(SkSpan<const int8_t> components)1069 static int pack_nybbles(SkSpan<const int8_t> components) {
1070     // Pack up to 8 elements into nybbles, in reverse order.
1071     int packed = 0;
1072     for (auto iter = components.rbegin(); iter != components.rend(); ++iter) {
1073         SkASSERT(*iter >= 0 && *iter <= 0xF);
1074         packed <<= 4;
1075         packed |= *iter;
1076     }
1077     return packed;
1078 }
1079 
1080 template <typename T>
unpack_nybbles_to_offsets(uint32_t components,SkSpan<T> offsets)1081 static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<T> offsets) {
1082     // Unpack component nybbles into byte-offsets pointing at stack slots.
1083     for (size_t index = 0; index < offsets.size(); ++index) {
1084         offsets[index] = (components & 0xF) * SkOpts::raster_pipeline_highp_stride * sizeof(float);
1085         components >>= 4;
1086     }
1087 }
1088 
// Returns the largest value found among the low `numComponents` nybbles of `components`.
static int max_packed_nybble(uint32_t components, size_t numComponents) {
    int maxValue = 0;
    while (numComponents--) {
        maxValue = std::max<int>(maxValue, components & 0xF);
        components >>= 4;
    }
    return maxValue;
}
1097 
swizzle_copy_stack_to_slots(SlotRange dst,SkSpan<const int8_t> components,int offsetFromStackTop)1098 void Builder::swizzle_copy_stack_to_slots(SlotRange dst,
1099                                           SkSpan<const int8_t> components,
1100                                           int offsetFromStackTop) {
1101     // When the execution-mask writes-enabled flag is off, we could squeeze out a little bit of
1102     // extra speed here by implementing and using an unmasked version of this op.
1103 
1104     // SlotA: fixed-range start
1105     // immA: number of swizzle components
1106     // immB: swizzle components
1107     // immC: offset from stack top
1108     this->appendInstruction(BuilderOp::swizzle_copy_stack_to_slots, {dst.index},
1109                             (int)components.size(),
1110                             pack_nybbles(components),
1111                             offsetFromStackTop);
1112 }
1113 
swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,int dynamicStackID,SlotRange limitRange,SkSpan<const int8_t> components,int offsetFromStackTop)1114 void Builder::swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,
1115                                                    int dynamicStackID,
1116                                                    SlotRange limitRange,
1117                                                    SkSpan<const int8_t> components,
1118                                                    int offsetFromStackTop) {
1119     // When the execution-mask writes-enabled flag is off, we could squeeze out a little bit of
1120     // extra speed here by implementing and using an unmasked version of this op.
1121 
1122     // SlotA: fixed-range start
1123     // SlotB: limit-range end
1124     // immA: number of swizzle components
1125     // immB: swizzle components
1126     // immC: offset from stack top
1127     // immD: dynamic stack ID
1128     this->appendInstruction(BuilderOp::swizzle_copy_stack_to_slots_indirect,
1129                             {fixedRange.index, limitRange.index + limitRange.count},
1130                             (int)components.size(),
1131                             pack_nybbles(components),
1132                             offsetFromStackTop,
1133                             dynamicStackID);
1134 }
1135 
void Builder::swizzle(int consumedSlots, SkSpan<const int8_t> components) {
    // Consumes `consumedSlots` elements on the stack, then generates `components.size()`
    // elements by rearranging (and possibly duplicating or dropping) the consumed slots.
    SkASSERT(consumedSlots >= 0);

    // We only allow up to 16 elements, and they can only reach 0-15 slots, due to nybble packing.
    int numElements = components.size();
    SkASSERT(numElements <= 16);
    SkASSERT(std::all_of(components.begin(), components.end(), [](int8_t e){ return e >= 0; }));
    SkASSERT(std::all_of(components.begin(), components.end(), [](int8_t e){ return e <= 0xF; }));

    // Make a local copy of the element array.
    int8_t elements[16] = {};
    std::copy(components.begin(), components.end(), std::begin(elements));

    // Peel off leading slots which the swizzle leaves in place: whenever the first element
    // selects slot 0 and no other element references slot 0, that slot can stay where it is,
    // and the remaining swizzle operates on a one-slot-smaller range (all indices shift down).
    while (numElements > 0) {
        // If the first element of the swizzle is zero...
        if (elements[0] != 0) {
            break;
        }
        // ...and zero isn't used elsewhere in the swizzle...
        if (std::any_of(&elements[1], &elements[numElements], [](int8_t e) { return e == 0; })) {
            break;
        }
        // We can omit the first slot from the swizzle entirely.
        // Slide everything forward by one slot, and reduce the element index by one.
        for (int index = 1; index < numElements; ++index) {
            elements[index - 1] = elements[index] - 1;
        }
        elements[numElements - 1] = 0;
        --consumedSlots;
        --numElements;
    }

    // A completely empty swizzle is a discard.
    if (numElements == 0) {
        this->discard_stack(consumedSlots);
        return;
    }

    if (consumedSlots <= 4 && numElements <= 4) {
        // We can fit everything into a little swizzle (ops swizzle_1 through swizzle_4).
        int op = (int)BuilderOp::swizzle_1 + numElements - 1;
        this->appendInstruction((BuilderOp)op, {}, consumedSlots,
                                pack_nybbles(SkSpan(elements, numElements)));
        return;
    }

    // This is a big swizzle. We use the `shuffle` op to handle these. immA counts the consumed
    // slots. immB counts the generated slots. immC and immD hold packed-nybble shuffle values.
    this->appendInstruction(BuilderOp::shuffle, {},
                            consumedSlots, numElements,
                            pack_nybbles(SkSpan(&elements[0], 8)),
                            pack_nybbles(SkSpan(&elements[8], 8)));
}
1190 
transpose(int columns,int rows)1191 void Builder::transpose(int columns, int rows) {
1192     // Transposes a matrix of size CxR on the stack (into a matrix of size RxC).
1193     int8_t elements[16] = {};
1194     size_t index = 0;
1195     for (int r = 0; r < rows; ++r) {
1196         for (int c = 0; c < columns; ++c) {
1197             elements[index++] = (c * rows) + r;
1198         }
1199     }
1200     this->swizzle(/*consumedSlots=*/columns * rows, SkSpan(elements, index));
1201 }
1202 
diagonal_matrix(int columns,int rows)1203 void Builder::diagonal_matrix(int columns, int rows) {
1204     // Generates a CxR diagonal matrix from the top two scalars on the stack.
1205     int8_t elements[16] = {};
1206     size_t index = 0;
1207     for (int c = 0; c < columns; ++c) {
1208         for (int r = 0; r < rows; ++r) {
1209             elements[index++] = (c == r) ? 1 : 0;
1210         }
1211     }
1212     this->swizzle(/*consumedSlots=*/2, SkSpan(elements, index));
1213 }
1214 
void Builder::matrix_resize(int origColumns, int origRows, int newColumns, int newRows) {
    // Resizes a CxR matrix at the top of the stack to C'xR'. Cells inside the original matrix
    // are carried over; new cells are filled with a synthesized 1 (on the diagonal) or 0.
    int8_t elements[16] = {};
    size_t index = 0;

    size_t consumedSlots = origColumns * origRows;
    // Stack offsets of the synthesized literal-0 and literal-1 slots. A value of 0 doubles as
    // "not yet created"; this is unambiguous because offset 0 falls inside the original matrix
    // data (offsets [0, C*R)), so a synthesized slot can never legitimately have offset 0.
    size_t zeroOffset = 0, oneOffset = 0;

    for (int c = 0; c < newColumns; ++c) {
        for (int r = 0; r < newRows; ++r) {
            if (c < origColumns && r < origRows) {
                // Push an element from the original matrix.
                elements[index++] = (c * origRows) + r;
            } else {
                // This element is outside the original matrix; push 1 or 0.
                if (c == r) {
                    // We need to synthesize a literal 1.
                    if (oneOffset == 0) {
                        this->push_constant_f(1.0f);
                        oneOffset = consumedSlots++;
                    }
                    elements[index++] = oneOffset;
                } else {
                    // We need to synthesize a literal 0.
                    if (zeroOffset == 0) {
                        this->push_constant_f(0.0f);
                        zeroOffset = consumedSlots++;
                    }
                    elements[index++] = zeroOffset;
                }
            }
        }
    }
    this->swizzle(consumedSlots, SkSpan(elements, index));
}
1250 
matrix_multiply(int leftColumns,int leftRows,int rightColumns,int rightRows)1251 void Builder::matrix_multiply(int leftColumns, int leftRows, int rightColumns, int rightRows) {
1252     BuilderOp op;
1253     switch (leftColumns) {
1254         case 2:  op = BuilderOp::matrix_multiply_2; break;
1255         case 3:  op = BuilderOp::matrix_multiply_3; break;
1256         case 4:  op = BuilderOp::matrix_multiply_4; break;
1257         default: SkDEBUGFAIL("unsupported matrix dimensions"); return;
1258     }
1259 
1260     this->appendInstruction(op, {}, leftColumns, leftRows, rightColumns, rightRows);
1261 }
1262 
// Assembles the accumulated instruction stream into an executable Program, transferring
// ownership of the instructions to it.
std::unique_ptr<Program> Builder::finish(int numValueSlots,
                                         int numUniformSlots,
                                         int numImmutableSlots,
                                         DebugTracePriv* debugTrace) {
    // Verify that calls to enableExecutionMaskWrites and disableExecutionMaskWrites are balanced.
    SkASSERT(fExecutionMaskWritesEnabled == 0);

    return std::make_unique<Program>(std::move(fInstructions), numValueSlots, numUniformSlots,
                                     numImmutableSlots, fNumLabels, debugTrace);
}
1273 
// Currently a no-op; reserved for final cleanup of the instruction stream.
void Program::optimize() {
    // TODO(johnstiles): perform any last-minute cleanup of the instruction stream here
}
1277 
// Returns the net effect `inst` has on the depth of its temp stack: positive values push,
// negative values pop, and zero means the op replaces its inputs in place.
static int stack_usage(const Instruction& inst) {
    switch (inst.fOp) {
        // Mask pushes occupy a single slot.
        case BuilderOp::push_condition_mask:
        case BuilderOp::push_loop_mask:
        case BuilderOp::push_return_mask:
            return 1;

        // Color/coordinate pushes occupy four slots.
        case BuilderOp::push_src_rgba:
        case BuilderOp::push_dst_rgba:
        case BuilderOp::push_device_xy01:
            return 4;

        // Variable-width pushes carry their slot count in immA.
        case BuilderOp::push_immutable:
        case BuilderOp::push_immutable_indirect:
        case BuilderOp::push_constant:
        case BuilderOp::push_slots:
        case BuilderOp::push_slots_indirect:
        case BuilderOp::push_uniform:
        case BuilderOp::push_uniform_indirect:
        case BuilderOp::push_clone:
        case BuilderOp::push_clone_from_stack:
        case BuilderOp::push_clone_indirect_from_stack:
        case BuilderOp::pad_stack:
            return inst.fImmA;

        case BuilderOp::pop_condition_mask:
        case BuilderOp::pop_loop_mask:
        case BuilderOp::pop_and_reenable_loop_mask:
        case BuilderOp::pop_return_mask:
            return -1;

        case BuilderOp::pop_src_rgba:
        case BuilderOp::pop_dst_rgba:
            return -4;

        // Binary ops consume two immA-sized operands and leave one result in place.
        case ALL_N_WAY_BINARY_OP_CASES:
        case ALL_MULTI_SLOT_BINARY_OP_CASES:
        case BuilderOp::discard_stack:
        case BuilderOp::select:
            return -inst.fImmA;

        // Ternary ops consume three immA-sized operands and leave one result in place.
        case ALL_N_WAY_TERNARY_OP_CASES:
        case ALL_MULTI_SLOT_TERNARY_OP_CASES:
            return 2 * -inst.fImmA;

        case BuilderOp::swizzle_1:
            return 1 - inst.fImmA;  // consumes immA slots and emits a scalar
        case BuilderOp::swizzle_2:
            return 2 - inst.fImmA;  // consumes immA slots and emits a 2-slot vector
        case BuilderOp::swizzle_3:
            return 3 - inst.fImmA;  // consumes immA slots and emits a 3-slot vector
        case BuilderOp::swizzle_4:
            return 4 - inst.fImmA;  // consumes immA slots and emits a 4-slot vector

        case BuilderOp::dot_2_floats:
            return -3;  // consumes two 2-slot vectors and emits one scalar
        case BuilderOp::dot_3_floats:
            return -5;  // consumes two 3-slot vectors and emits one scalar
        case BuilderOp::dot_4_floats:
            return -7;  // consumes two 4-slot vectors and emits one scalar

        case BuilderOp::refract_4_floats:
            return -5;  // consumes nine slots (N + I + eta) and emits a 4-slot vector (R)

        case BuilderOp::matrix_multiply_2:
        case BuilderOp::matrix_multiply_3:
        case BuilderOp::matrix_multiply_4:
            // consumes the left- and right-matrices; emits result over existing padding slots
            return -(inst.fImmA * inst.fImmB + inst.fImmC * inst.fImmD);

        case BuilderOp::shuffle: {
            int consumed = inst.fImmA;
            int generated = inst.fImmB;
            return generated - consumed;
        }
        // Unary and immediate-mode ops rewrite their operand(s) in place; net depth change is 0.
        case ALL_SINGLE_SLOT_UNARY_OP_CASES:
        case ALL_MULTI_SLOT_UNARY_OP_CASES:
        case ALL_IMMEDIATE_BINARY_OP_CASES:
        default:
            return 0;
    }
}
1360 
tempStackMaxDepths() const1361 Program::StackDepths Program::tempStackMaxDepths() const {
1362     // Count the number of separate temp stacks that the program uses.
1363     int numStacks = 1;
1364     for (const Instruction& inst : fInstructions) {
1365         numStacks = std::max(numStacks, inst.fStackID + 1);
1366     }
1367 
1368     // Walk the program and calculate how deep each stack can potentially get.
1369     StackDepths largest, current;
1370     largest.push_back_n(numStacks, 0);
1371     current.push_back_n(numStacks, 0);
1372 
1373     for (const Instruction& inst : fInstructions) {
1374         int stackID = inst.fStackID;
1375         current[stackID] += stack_usage(inst);
1376         largest[stackID] = std::max(current[stackID], largest[stackID]);
1377         // If we assert here, the generated program has popped off the top of the stack.
1378         SkASSERTF(current[stackID] >= 0, "unbalanced temp stack push/pop on stack %d", stackID);
1379     }
1380 
1381     // Ensure that when the program is complete, our stacks are fully balanced.
1382     for (int stackID = 0; stackID < numStacks; ++stackID) {
1383         // If we assert here, the generated program has pushed more data than it has popped.
1384         SkASSERTF(current[stackID] == 0, "unbalanced temp stack push/pop on stack %d", stackID);
1385     }
1386 
1387     return largest;
1388 }
1389 
Program::Program(TArray<Instruction> instrs,
                 int numValueSlots,
                 int numUniformSlots,
                 int numImmutableSlots,
                 int numLabels,
                 DebugTracePriv* debugTrace)
        : fInstructions(std::move(instrs))
        , fNumValueSlots(numValueSlots)
        , fNumUniformSlots(numUniformSlots)
        , fNumImmutableSlots(numImmutableSlots)
        , fNumLabels(numLabels)
        , fDebugTrace(debugTrace) {
    this->optimize();

    // Precompute the worst-case depth of each temp stack, plus the total number of slots needed
    // to hold all of the stacks at their maximum depths simultaneously.
    fTempStackMaxDepths = this->tempStackMaxDepths();

    fNumTempStackSlots = 0;
    for (const int depth : fTempStackMaxDepths) {
        fNumTempStackSlots += depth;
    }

    // When debug tracing was requested, install a trace hook that records into the trace info.
    if (fDebugTrace) {
        fTraceHook = SkSL::Tracer::Make(&fDebugTrace->fTraceInfo);
    }
}
1415 
// Defined out-of-line so member types that may be incomplete in the header —
// presumably the fTraceHook smart pointer's pointee — are destroyed here, where
// their definitions are visible. NOTE(review): inference; confirm against header.
1416 Program::~Program() = default;
1417 
// Returns true when all `numSlots` values starting at `immutablePtr` are
// bit-identical, meaning the range can be initialized with a single splat
// instead of a copy.
static bool immutable_data_is_splattable(int32_t* immutablePtr, int numSlots) {
    return std::all_of(immutablePtr, immutablePtr + numSlots,
                       [immutablePtr](int32_t value) { return value == immutablePtr[0]; });
}
1428 
appendCopy(TArray<Stage> * pipeline,SkArenaAlloc * alloc,std::byte * basePtr,ProgramOp baseStage,SkRPOffset dst,int dstStride,SkRPOffset src,int srcStride,int numSlots) const1429 void Program::appendCopy(TArray<Stage>* pipeline,
1430                          SkArenaAlloc* alloc,
1431                          std::byte* basePtr,  // only used for immutable-value copies
1432                          ProgramOp baseStage,
1433                          SkRPOffset dst, int dstStride,
1434                          SkRPOffset src, int srcStride,
1435                          int numSlots) const {
1436     SkASSERT(numSlots >= 0);
1437     while (numSlots > 4) {
1438         // If we are appending a large copy, split it up into groups of four at a time.
1439         this->appendCopy(pipeline, alloc, basePtr,
1440                          baseStage,
1441                          dst, dstStride,
1442                          src, srcStride,
1443                          /*numSlots=*/4);
1444         dst += 4 * dstStride * sizeof(float);
1445         src += 4 * srcStride * sizeof(float);
1446         numSlots -= 4;
1447     }
1448 
1449     SkASSERT(numSlots <= 4);
1450 
1451     if (numSlots > 0) {
1452         // If we are copying immutable data, it might be representable by a splat; this is
1453         // preferable, since splats are a tiny bit faster than regular copies.
1454         if (basePtr) {
1455             SkASSERT(srcStride == 1);
1456             int32_t* immutablePtr = reinterpret_cast<int32_t*>(basePtr + src);
1457             if (immutable_data_is_splattable(immutablePtr, numSlots)) {
1458                 auto stage = (ProgramOp)((int)ProgramOp::copy_constant + numSlots - 1);
1459                 SkRasterPipeline_ConstantCtx ctx;
1460                 ctx.dst = dst;
1461                 ctx.value = *immutablePtr;
1462                 pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
1463                 return;
1464             }
1465         }
1466 
1467         // We can't use a splat, so emit the requested copy op.
1468         auto stage = (ProgramOp)((int)baseStage + numSlots - 1);
1469         SkRasterPipeline_BinaryOpCtx ctx;
1470         ctx.dst = dst;
1471         ctx.src = src;
1472         pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
1473     }
1474 }
1475 
appendCopySlotsUnmasked(TArray<Stage> * pipeline,SkArenaAlloc * alloc,SkRPOffset dst,SkRPOffset src,int numSlots) const1476 void Program::appendCopySlotsUnmasked(TArray<Stage>* pipeline,
1477                                       SkArenaAlloc* alloc,
1478                                       SkRPOffset dst,
1479                                       SkRPOffset src,
1480                                       int numSlots) const {
1481     this->appendCopy(pipeline, alloc, /*basePtr=*/nullptr,
1482                      ProgramOp::copy_slot_unmasked,
1483                      dst, SkOpts::raster_pipeline_highp_stride,
1484                      src, SkOpts::raster_pipeline_highp_stride,
1485                      numSlots);
1486 }
1487 
appendCopyImmutableUnmasked(TArray<Stage> * pipeline,SkArenaAlloc * alloc,std::byte * basePtr,SkRPOffset dst,SkRPOffset src,int numSlots) const1488 void Program::appendCopyImmutableUnmasked(TArray<Stage>* pipeline,
1489                                           SkArenaAlloc* alloc,
1490                                           std::byte* basePtr,
1491                                           SkRPOffset dst,
1492                                           SkRPOffset src,
1493                                           int numSlots) const {
1494     this->appendCopy(pipeline, alloc, basePtr,
1495                      ProgramOp::copy_immutable_unmasked,
1496                      dst, SkOpts::raster_pipeline_highp_stride,
1497                      src, 1,
1498                      numSlots);
1499 }
1500 
appendCopySlotsMasked(TArray<Stage> * pipeline,SkArenaAlloc * alloc,SkRPOffset dst,SkRPOffset src,int numSlots) const1501 void Program::appendCopySlotsMasked(TArray<Stage>* pipeline,
1502                                     SkArenaAlloc* alloc,
1503                                     SkRPOffset dst,
1504                                     SkRPOffset src,
1505                                     int numSlots) const {
1506     this->appendCopy(pipeline, alloc, /*basePtr=*/nullptr,
1507                      ProgramOp::copy_slot_masked,
1508                      dst, SkOpts::raster_pipeline_highp_stride,
1509                      src, SkOpts::raster_pipeline_highp_stride,
1510                      numSlots);
1511 }
1512 
appendSingleSlotUnaryOp(TArray<Stage> * pipeline,ProgramOp stage,float * dst,int numSlots) const1513 void Program::appendSingleSlotUnaryOp(TArray<Stage>* pipeline, ProgramOp stage,
1514                                       float* dst, int numSlots) const {
1515     SkASSERT(numSlots >= 0);
1516     while (numSlots--) {
1517         pipeline->push_back({stage, dst});
1518         dst += SkOpts::raster_pipeline_highp_stride;
1519     }
1520 }
1521 
appendMultiSlotUnaryOp(TArray<Stage> * pipeline,ProgramOp baseStage,float * dst,int numSlots) const1522 void Program::appendMultiSlotUnaryOp(TArray<Stage>* pipeline, ProgramOp baseStage,
1523                                      float* dst, int numSlots) const {
1524     SkASSERT(numSlots >= 0);
1525     while (numSlots > 0) {
1526         int currentSlots = std::min(numSlots, 4);
1527         auto stage = (ProgramOp)((int)baseStage + currentSlots - 1);
1528         pipeline->push_back({stage, dst});
1529 
1530         dst += 4 * SkOpts::raster_pipeline_highp_stride;
1531         numSlots -= 4;
1532     }
1533 }
1534 
appendImmediateBinaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp baseStage,SkRPOffset dst,int32_t value,int numSlots) const1535 void Program::appendImmediateBinaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1536                                       ProgramOp baseStage,
1537                                       SkRPOffset dst, int32_t value, int numSlots) const {
1538     SkASSERT(is_immediate_op((BuilderOp)baseStage));
1539     int slotsPerStage = is_multi_slot_immediate_op((BuilderOp)baseStage) ? 4 : 1;
1540 
1541     SkRasterPipeline_ConstantCtx ctx;
1542     ctx.dst = dst;
1543     ctx.value = value;
1544 
1545     SkASSERT(numSlots >= 0);
1546     while (numSlots > 0) {
1547         int currentSlots = std::min(numSlots, slotsPerStage);
1548         auto stage = (ProgramOp)((int)baseStage - (currentSlots - 1));
1549         pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
1550 
1551         ctx.dst += slotsPerStage * SkOpts::raster_pipeline_highp_stride * sizeof(float);
1552         numSlots -= slotsPerStage;
1553     }
1554 }
1555 
appendAdjacentNWayBinaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp stage,SkRPOffset dst,SkRPOffset src,int numSlots) const1556 void Program::appendAdjacentNWayBinaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1557                                          ProgramOp stage,
1558                                          SkRPOffset dst, SkRPOffset src, int numSlots) const {
1559     // The source and destination must be directly next to one another.
1560     SkASSERT(numSlots >= 0);
1561     SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src);
1562 
1563     if (numSlots > 0) {
1564         SkRasterPipeline_BinaryOpCtx ctx;
1565         ctx.dst = dst;
1566         ctx.src = src;
1567         pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
1568     }
1569 }
1570 
appendAdjacentMultiSlotBinaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp baseStage,std::byte * basePtr,SkRPOffset dst,SkRPOffset src,int numSlots) const1571 void Program::appendAdjacentMultiSlotBinaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1572                                               ProgramOp baseStage, std::byte* basePtr,
1573                                               SkRPOffset dst, SkRPOffset src, int numSlots) const {
1574     // The source and destination must be directly next to one another.
1575     SkASSERT(numSlots >= 0);
1576     SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src);
1577 
1578     if (numSlots > 4) {
1579         this->appendAdjacentNWayBinaryOp(pipeline, alloc, baseStage, dst, src, numSlots);
1580         return;
1581     }
1582     if (numSlots > 0) {
1583         auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
1584         pipeline->push_back({specializedStage, basePtr + dst});
1585     }
1586 }
1587 
appendAdjacentNWayTernaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp stage,std::byte * basePtr,SkRPOffset dst,SkRPOffset src0,SkRPOffset src1,int numSlots) const1588 void Program::appendAdjacentNWayTernaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1589                                           ProgramOp stage, std::byte* basePtr, SkRPOffset dst,
1590                                           SkRPOffset src0, SkRPOffset src1, int numSlots) const {
1591     // The float pointers must all be immediately adjacent to each other.
1592     SkASSERT(numSlots >= 0);
1593     SkASSERT((dst  + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src0);
1594     SkASSERT((src0 + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src1);
1595 
1596     if (numSlots > 0) {
1597         SkRasterPipeline_TernaryOpCtx ctx;
1598         ctx.dst = dst;
1599         ctx.delta = src0 - dst;
1600         pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
1601     }
1602 }
1603 
appendAdjacentMultiSlotTernaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp baseStage,std::byte * basePtr,SkRPOffset dst,SkRPOffset src0,SkRPOffset src1,int numSlots) const1604 void Program::appendAdjacentMultiSlotTernaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1605                                                ProgramOp baseStage, std::byte* basePtr,
1606                                                SkRPOffset dst, SkRPOffset src0, SkRPOffset src1,
1607                                                int numSlots) const {
1608     // The float pointers must all be immediately adjacent to each other.
1609     SkASSERT(numSlots >= 0);
1610     SkASSERT((dst  + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src0);
1611     SkASSERT((src0 + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src1);
1612 
1613     if (numSlots > 4) {
1614         this->appendAdjacentNWayTernaryOp(pipeline, alloc, baseStage, basePtr,
1615                                           dst, src0, src1, numSlots);
1616         return;
1617     }
1618     if (numSlots > 0) {
1619         auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
1620         pipeline->push_back({specializedStage, basePtr + dst});
1621     }
1622 }
1623 
// Appends a stack-rewind op. It is only emitted in the standalone build or when
// SK_HAS_MUSTTAIL is false; otherwise this is a no-op. NOTE(review): presumably
// musttail-based pipelines keep native stack usage bounded without an explicit
// rewind — confirm against SkRasterPipeline.
appendStackRewind(TArray<Stage> * pipeline) const1624 void Program::appendStackRewind(TArray<Stage>* pipeline) const {
1625 #if defined(SKSL_STANDALONE) || !SK_HAS_MUSTTAIL
1626     pipeline->push_back({ProgramOp::stack_rewind, nullptr});
1627 #endif
1628 }
1629 
// Stores a small integer (e.g. a label ID) directly in a stage's context-pointer
// field by bit-casting it, avoiding a separate allocation for the value.
context_bit_pun(intptr_t val)1630 static void* context_bit_pun(intptr_t val) {
1631     return sk_bit_cast<void*>(val);
1632 }
1633 
allocateSlotData(SkArenaAlloc * alloc) const1634 Program::SlotData Program::allocateSlotData(SkArenaAlloc* alloc) const {
1635     // Allocate a contiguous slab of slot data for immutables, values, and stack entries.
1636     const int N = SkOpts::raster_pipeline_highp_stride;
1637     const int scalarWidth = 1 * sizeof(float);
1638     const int vectorWidth = N * sizeof(float);
1639     const int allocSize = vectorWidth * (fNumValueSlots + fNumTempStackSlots) +
1640                           scalarWidth * fNumImmutableSlots;
1641     float* slotPtr = static_cast<float*>(alloc->makeBytesAlignedTo(allocSize, vectorWidth));
1642     sk_bzero(slotPtr, allocSize);
1643 
1644     // Store the temp stack immediately after the values, and immutable data after the stack.
1645     SlotData s;
1646     s.values    = SkSpan{slotPtr,        N * fNumValueSlots};
1647     s.stack     = SkSpan{s.values.end(), N * fNumTempStackSlots};
1648     s.immutable = SkSpan{s.stack.end(),  1 * fNumImmutableSlots};
1649     return s;
1650 }
1651 
// Assembles this Program into `pipeline`. Converts the instruction list into
// Stage records, appends them as raster-pipeline ops, resolves label/branch
// targets into relative offsets, and dispatches child-effect ops through
// `callbacks`. Returns false if a required callback is missing or fails
// (only compiled to a real body when not building standalone).
appendStages(SkRasterPipeline * pipeline,SkArenaAlloc * alloc,RP::Callbacks * callbacks,SkSpan<const float> uniforms) const1652 bool Program::appendStages(SkRasterPipeline* pipeline,
1653                            SkArenaAlloc* alloc,
1654                            RP::Callbacks* callbacks,
1655                            SkSpan<const float> uniforms) const {
1656 #if defined(SKSL_STANDALONE)
1657     return false;
1658 #else
1659     // Convert our Instruction list to an array of ProgramOps.
1660     TArray<Stage> stages;
1661     SlotData slotData = this->allocateSlotData(alloc);
1662     this->makeStages(&stages, alloc, uniforms, slotData);
1663 
1664     // Allocate buffers for branch targets and labels; these are needed to convert labels into
1665     // actual offsets into the pipeline and fix up branches.
1666     TArray<SkRasterPipeline_BranchCtx*> branchContexts;
1667     branchContexts.reserve_exact(fNumLabels);
1668     TArray<int> labelOffsets;
1669     labelOffsets.push_back_n(fNumLabels, -1);
1670     TArray<int> branchGoesToLabel;
1671     branchGoesToLabel.reserve_exact(fNumLabels);
1672 
1673     auto resetBasePointer = [&]() {
1674         // Whenever we hand off control to another shader, we have to assume that it might overwrite
1675         // the base pointer (if it uses SkSL, it will!), so we reset it on return.
1676         pipeline->append(SkRasterPipelineOp::set_base_pointer, slotData.values.data());
1677     };
1678 
1679     resetBasePointer();
1680 
1681     for (const Stage& stage : stages) {
1682         switch (stage.op) {
1683             case ProgramOp::stack_rewind:
1684                 pipeline->appendStackRewind();
1685                 break;
1686 
             // For the invoke_* ops, stage.ctx carries a child-effect index that
             // was bit-punned into the pointer; hand it to the matching callback.
1687             case ProgramOp::invoke_shader:
1688                 if (!callbacks || !callbacks->appendShader(sk_bit_cast<intptr_t>(stage.ctx))) {
1689                     return false;
1690                 }
1691                 resetBasePointer();
1692                 break;
1693 
1694             case ProgramOp::invoke_color_filter:
1695                 if (!callbacks || !callbacks->appendColorFilter(sk_bit_cast<intptr_t>(stage.ctx))) {
1696                     return false;
1697                 }
1698                 resetBasePointer();
1699                 break;
1700 
1701             case ProgramOp::invoke_blender:
1702                 if (!callbacks || !callbacks->appendBlender(sk_bit_cast<intptr_t>(stage.ctx))) {
1703                     return false;
1704                 }
1705                 resetBasePointer();
1706                 break;
1707 
1708             case ProgramOp::invoke_to_linear_srgb:
1709                 if (!callbacks) {
1710                     return false;
1711                 }
1712                 callbacks->toLinearSrgb(stage.ctx);
1713                 // A ColorSpaceXform shouldn't ever alter the base pointer, so we don't need to call
1714                 // resetBasePointer here.
1715                 break;
1716 
1717             case ProgramOp::invoke_from_linear_srgb:
1718                 if (!callbacks) {
1719                     return false;
1720                 }
1721                 callbacks->fromLinearSrgb(stage.ctx);
1722                 // A ColorSpaceXform shouldn't ever alter the base pointer, so we don't need to call
1723                 // resetBasePointer here.
1724                 break;
1725 
1726             case ProgramOp::label: {
1727                 // Remember the absolute pipeline position of this label.
1728                 int labelID = sk_bit_cast<intptr_t>(stage.ctx);
1729                 SkASSERT(labelID >= 0 && labelID < fNumLabels);
1730                 labelOffsets[labelID] = pipeline->getNumStages();
1731                 break;
1732             }
1733             case ProgramOp::jump:
1734             case ProgramOp::branch_if_all_lanes_active:
1735             case ProgramOp::branch_if_any_lanes_active:
1736             case ProgramOp::branch_if_no_lanes_active:
1737             case ProgramOp::branch_if_no_active_lanes_eq: {
1738                 // The branch context contains a valid label ID at this point.
1739                 auto* branchCtx = static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx);
1740                 int labelID = branchCtx->offset;
1741                 SkASSERT(labelID >= 0 && labelID < fNumLabels);
1742 
1743                 // Replace the label ID in the branch context with the absolute pipeline position.
1744                 // We will go back over the branch targets at the end and fix them up.
1745                 branchCtx->offset = pipeline->getNumStages();
1746 
1747                 SkASSERT(branchContexts.size() == branchGoesToLabel.size());
1748                 branchContexts.push_back(branchCtx);
1749                 branchGoesToLabel.push_back(labelID);
                 // Deliberate: fall into `default` so the branch op itself is
                 // appended to the pipeline after its context is recorded.
1750                 [[fallthrough]];
1751             }
1752             default:
1753                 // Append a regular op to the program.
1754                 SkASSERT((int)stage.op < kNumRasterPipelineHighpOps);
1755                 pipeline->append((SkRasterPipelineOp)stage.op, stage.ctx);
1756                 break;
1757         }
1758     }
1759 
1760     // Now that we have assembled the program and know the pipeline positions of each label and
1761     // branch, fix up every branch target.
1762     SkASSERT(branchContexts.size() == branchGoesToLabel.size());
1763     for (int index = 0; index < branchContexts.size(); ++index) {
             // Each context currently holds the branch's own absolute position;
             // rewrite it as the relative distance (target - branch).
1764         int branchFromIdx = branchContexts[index]->offset;
1765         int branchToIdx = labelOffsets[branchGoesToLabel[index]];
1766         branchContexts[index]->offset = branchToIdx - branchFromIdx;
1767     }
1768 
1769     return true;
1770 #endif
1771 }
1772 
makeStages(TArray<Stage> * pipeline,SkArenaAlloc * alloc,SkSpan<const float> uniforms,const SlotData & slots) const1773 void Program::makeStages(TArray<Stage>* pipeline,
1774                          SkArenaAlloc* alloc,
1775                          SkSpan<const float> uniforms,
1776                          const SlotData& slots) const {
1777     SkASSERT(fNumUniformSlots == SkToInt(uniforms.size()));
1778 
1779     const int N = SkOpts::raster_pipeline_highp_stride;
1780     int mostRecentRewind = 0;
1781 
1782     // Assemble a map holding the current stack-top for each temporary stack. Position each temp
1783     // stack immediately after the previous temp stack; temp stacks are never allowed to overlap.
1784     int pos = 0;
1785     TArray<float*> tempStackMap;
1786     tempStackMap.resize(fTempStackMaxDepths.size());
1787     for (int idx = 0; idx < fTempStackMaxDepths.size(); ++idx) {
1788         tempStackMap[idx] = slots.stack.begin() + (pos * N);
1789         pos += fTempStackMaxDepths[idx];
1790     }
1791 
1792     // Track labels that we have reached in processing.
1793     SkBitSet labelsEncountered(fNumLabels);
1794 
1795     auto EmitStackRewindForBackwardsBranch = [&](int labelID) {
1796         // If we have already encountered the label associated with this branch, this is a
1797         // backwards branch. Add a stack-rewind immediately before the branch to ensure that
1798         // long-running loops don't use an unbounded amount of stack space.
1799         if (labelsEncountered.test(labelID)) {
1800             this->appendStackRewind(pipeline);
1801             mostRecentRewind = pipeline->size();
1802         }
1803     };
1804 
1805     auto* const basePtr = (std::byte*)slots.values.data();
1806     auto OffsetFromBase = [&](const void* ptr) -> SkRPOffset {
1807         return (SkRPOffset)((const std::byte*)ptr - basePtr);
1808     };
1809 
1810     // Copy all immutable values into the immutable slots.
1811     for (const Instruction& inst : fInstructions) {
1812         if (inst.fOp == BuilderOp::store_immutable_value) {
1813             slots.immutable[inst.fSlotA] = sk_bit_cast<float>(inst.fImmA);
1814         }
1815     }
1816 
1817     // Write each BuilderOp to the pipeline array.
1818     pipeline->reserve_exact(pipeline->size() + fInstructions.size());
1819     for (const Instruction& inst : fInstructions) {
1820         auto ImmutableA = [&]() { return &slots.immutable[1 * inst.fSlotA]; };
1821         auto ImmutableB = [&]() { return &slots.immutable[1 * inst.fSlotB]; };
1822         auto SlotA      = [&]() { return &slots.values[N * inst.fSlotA]; };
1823         auto SlotB      = [&]() { return &slots.values[N * inst.fSlotB]; };
1824         auto UniformA   = [&]() { return &uniforms[inst.fSlotA]; };
1825         auto AllocTraceContext = [&](auto* ctx) {
1826             // We pass `ctx` solely for its type; the value is unused.
1827             using ContextType = typename std::remove_reference<decltype(*ctx)>::type;
1828             ctx = alloc->make<ContextType>();
1829             ctx->traceMask = reinterpret_cast<int*>(tempStackMap[inst.fImmA] - N);
1830             ctx->traceHook = fTraceHook.get();
1831             return ctx;
1832         };
1833         float*& tempStackPtr = tempStackMap[inst.fStackID];
1834 
1835         switch (inst.fOp) {
1836             case BuilderOp::label:
1837                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1838                 labelsEncountered.set(inst.fImmA);
1839                 pipeline->push_back({ProgramOp::label, context_bit_pun(inst.fImmA)});
1840                 break;
1841 
1842             case BuilderOp::jump:
1843             case BuilderOp::branch_if_any_lanes_active:
1844             case BuilderOp::branch_if_no_lanes_active: {
1845                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1846                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1847 
1848                 auto* ctx = alloc->make<SkRasterPipeline_BranchCtx>();
1849                 ctx->offset = inst.fImmA;
1850                 pipeline->push_back({(ProgramOp)inst.fOp, ctx});
1851                 break;
1852             }
1853             case BuilderOp::branch_if_all_lanes_active: {
1854                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1855                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1856 
1857                 auto* ctx = alloc->make<SkRasterPipeline_BranchIfAllLanesActiveCtx>();
1858                 ctx->offset = inst.fImmA;
1859                 pipeline->push_back({ProgramOp::branch_if_all_lanes_active, ctx});
1860                 break;
1861             }
1862             case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal: {
1863                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1864                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1865 
1866                 auto* ctx = alloc->make<SkRasterPipeline_BranchIfEqualCtx>();
1867                 ctx->offset = inst.fImmA;
1868                 ctx->value = inst.fImmB;
1869                 ctx->ptr = reinterpret_cast<int*>(tempStackPtr - N);
1870                 pipeline->push_back({ProgramOp::branch_if_no_active_lanes_eq, ctx});
1871                 break;
1872             }
1873             case BuilderOp::init_lane_masks: {
1874                 auto* ctx = alloc->make<SkRasterPipeline_InitLaneMasksCtx>();
1875                 pipeline->push_back({ProgramOp::init_lane_masks, ctx});
1876                 break;
1877             }
1878             case BuilderOp::store_src_rg:
1879                 pipeline->push_back({ProgramOp::store_src_rg, SlotA()});
1880                 break;
1881 
1882             case BuilderOp::store_src:
1883                 pipeline->push_back({ProgramOp::store_src, SlotA()});
1884                 break;
1885 
1886             case BuilderOp::store_dst:
1887                 pipeline->push_back({ProgramOp::store_dst, SlotA()});
1888                 break;
1889 
1890             case BuilderOp::store_device_xy01:
1891                 pipeline->push_back({ProgramOp::store_device_xy01, SlotA()});
1892                 break;
1893 
1894             case BuilderOp::store_immutable_value:
1895                 // The immutable slots were populated in an earlier pass.
1896                 break;
1897 
1898             case BuilderOp::load_src:
1899                 pipeline->push_back({ProgramOp::load_src, SlotA()});
1900                 break;
1901 
1902             case BuilderOp::load_dst:
1903                 pipeline->push_back({ProgramOp::load_dst, SlotA()});
1904                 break;
1905 
1906             case ALL_SINGLE_SLOT_UNARY_OP_CASES: {
1907                 float* dst = tempStackPtr - (inst.fImmA * N);
1908                 this->appendSingleSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
1909                 break;
1910             }
1911             case ALL_MULTI_SLOT_UNARY_OP_CASES: {
1912                 float* dst = tempStackPtr - (inst.fImmA * N);
1913                 this->appendMultiSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
1914                 break;
1915             }
1916             case ALL_IMMEDIATE_BINARY_OP_CASES: {
1917                 float* dst = (inst.fSlotA == NA) ? tempStackPtr - (inst.fImmA * N)
1918                                                  : SlotA();
1919 
1920                 this->appendImmediateBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1921                                               OffsetFromBase(dst), inst.fImmB, inst.fImmA);
1922                 break;
1923             }
1924             case ALL_N_WAY_BINARY_OP_CASES: {
1925                 float* src = tempStackPtr - (inst.fImmA * N);
1926                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1927                 this->appendAdjacentNWayBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1928                                                  OffsetFromBase(dst), OffsetFromBase(src),
1929                                                  inst.fImmA);
1930                 break;
1931             }
1932             case ALL_MULTI_SLOT_BINARY_OP_CASES: {
1933                 float* src = tempStackPtr - (inst.fImmA * N);
1934                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1935                 this->appendAdjacentMultiSlotBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1936                                                       basePtr,
1937                                                       OffsetFromBase(dst),
1938                                                       OffsetFromBase(src),
1939                                                       inst.fImmA);
1940                 break;
1941             }
1942             case ALL_N_WAY_TERNARY_OP_CASES: {
1943                 float* src1 = tempStackPtr - (inst.fImmA * N);
1944                 float* src0 = tempStackPtr - (inst.fImmA * 2 * N);
1945                 float* dst  = tempStackPtr - (inst.fImmA * 3 * N);
1946                 this->appendAdjacentNWayTernaryOp(pipeline, alloc, (ProgramOp)inst.fOp, basePtr,
1947                                                   OffsetFromBase(dst),
1948                                                   OffsetFromBase(src0),
1949                                                   OffsetFromBase(src1),
1950                                                   inst.fImmA);
1951                 break;
1952             }
1953             case ALL_MULTI_SLOT_TERNARY_OP_CASES: {
1954                 float* src1 = tempStackPtr - (inst.fImmA * N);
1955                 float* src0 = tempStackPtr - (inst.fImmA * 2 * N);
1956                 float* dst  = tempStackPtr - (inst.fImmA * 3 * N);
1957                 this->appendAdjacentMultiSlotTernaryOp(pipeline, alloc,(ProgramOp)inst.fOp, basePtr,
1958                                                        OffsetFromBase(dst),
1959                                                        OffsetFromBase(src0),
1960                                                        OffsetFromBase(src1),
1961                                                        inst.fImmA);
1962                 break;
1963             }
1964             case BuilderOp::select: {
1965                 float* src = tempStackPtr - (inst.fImmA * N);
1966                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1967                 this->appendCopySlotsMasked(pipeline, alloc,
1968                                             OffsetFromBase(dst),
1969                                             OffsetFromBase(src),
1970                                             inst.fImmA);
1971                 break;
1972             }
1973             case BuilderOp::copy_slot_masked:
1974                 this->appendCopySlotsMasked(pipeline, alloc,
1975                                             OffsetFromBase(SlotA()),
1976                                             OffsetFromBase(SlotB()),
1977                                             inst.fImmA);
1978                 break;
1979 
1980             case BuilderOp::copy_slot_unmasked:
1981                 this->appendCopySlotsUnmasked(pipeline, alloc,
1982                                               OffsetFromBase(SlotA()),
1983                                               OffsetFromBase(SlotB()),
1984                                               inst.fImmA);
1985                 break;
1986 
1987             case BuilderOp::copy_immutable_unmasked:
1988                 this->appendCopyImmutableUnmasked(pipeline, alloc, basePtr,
1989                                                   OffsetFromBase(SlotA()),
1990                                                   OffsetFromBase(ImmutableB()),
1991                                                   inst.fImmA);
1992                 break;
1993 
1994             case BuilderOp::refract_4_floats: {
1995                 float* dst = tempStackPtr - (9 * N);
1996                 pipeline->push_back({ProgramOp::refract_4_floats, dst});
1997                 break;
1998             }
1999             case BuilderOp::inverse_mat2:
2000             case BuilderOp::inverse_mat3:
2001             case BuilderOp::inverse_mat4: {
2002                 float* dst = tempStackPtr - (inst.fImmA * N);
2003                 pipeline->push_back({(ProgramOp)inst.fOp, dst});
2004                 break;
2005             }
2006             case BuilderOp::dot_2_floats:
2007             case BuilderOp::dot_3_floats:
2008             case BuilderOp::dot_4_floats: {
2009                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
2010                 pipeline->push_back({(ProgramOp)inst.fOp, dst});
2011                 break;
2012             }
2013             case BuilderOp::swizzle_1: {
2014                 // A single-component swizzle just copies a slot and shrinks the stack; we can
2015                 // slightly improve codegen by making that simplification here.
2016                 int offset = inst.fImmB;
2017                 SkASSERT(offset >= 0 && offset <= 15);
2018                 float* dst = tempStackPtr - (inst.fImmA * N);
2019                 float* src = dst + (offset * N);
2020                 if (src != dst) {
2021                     this->appendCopySlotsUnmasked(pipeline, alloc,
2022                                                   OffsetFromBase(dst),
2023                                                   OffsetFromBase(src),
2024                                                   /*numSlots=*/1);
2025                 }
2026                 break;
2027             }
2028             case BuilderOp::swizzle_2:
2029             case BuilderOp::swizzle_3:
2030             case BuilderOp::swizzle_4: {
2031                 SkRasterPipeline_SwizzleCtx ctx;
2032                 ctx.dst = OffsetFromBase(tempStackPtr - (N * inst.fImmA));
2033                 // Unpack component nybbles into byte-offsets pointing at stack slots.
2034                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx.offsets));
2035                 pipeline->push_back({(ProgramOp)inst.fOp, SkRPCtxUtils::Pack(ctx, alloc)});
2036                 break;
2037             }
2038             case BuilderOp::shuffle: {
2039                 int consumed = inst.fImmA;
2040                 int generated = inst.fImmB;
2041 
2042                 auto* ctx = alloc->make<SkRasterPipeline_ShuffleCtx>();
2043                 ctx->ptr = reinterpret_cast<int32_t*>(tempStackPtr) - (N * consumed);
2044                 ctx->count = generated;
2045                 // Unpack immB and immC from nybble form into the offset array.
2046                 unpack_nybbles_to_offsets(inst.fImmC, SkSpan(&ctx->offsets[0], 8));
2047                 unpack_nybbles_to_offsets(inst.fImmD, SkSpan(&ctx->offsets[8], 8));
2048                 pipeline->push_back({ProgramOp::shuffle, ctx});
2049                 break;
2050             }
2051             case BuilderOp::matrix_multiply_2:
2052             case BuilderOp::matrix_multiply_3:
2053             case BuilderOp::matrix_multiply_4: {
2054                 int consumed = (inst.fImmB * inst.fImmC) +  // result
2055                                (inst.fImmA * inst.fImmB) +  // left-matrix
2056                                (inst.fImmC * inst.fImmD);   // right-matrix
2057 
2058                 SkRasterPipeline_MatrixMultiplyCtx ctx;
2059                 ctx.dst = OffsetFromBase(tempStackPtr - (N * consumed));
2060                 ctx.leftColumns  = inst.fImmA;
2061                 ctx.leftRows     = inst.fImmB;
2062                 ctx.rightColumns = inst.fImmC;
2063                 ctx.rightRows    = inst.fImmD;
2064                 pipeline->push_back({(ProgramOp)inst.fOp, SkRPCtxUtils::Pack(ctx, alloc)});
2065                 break;
2066             }
2067             case BuilderOp::exchange_src: {
2068                 float* dst = tempStackPtr - (4 * N);
2069                 pipeline->push_back({ProgramOp::exchange_src, dst});
2070                 break;
2071             }
2072             case BuilderOp::push_src_rgba: {
2073                 float* dst = tempStackPtr;
2074                 pipeline->push_back({ProgramOp::store_src, dst});
2075                 break;
2076             }
2077             case BuilderOp::push_dst_rgba: {
2078                 float* dst = tempStackPtr;
2079                 pipeline->push_back({ProgramOp::store_dst, dst});
2080                 break;
2081             }
2082             case BuilderOp::push_device_xy01: {
2083                 float* dst = tempStackPtr;
2084                 pipeline->push_back({ProgramOp::store_device_xy01, dst});
2085                 break;
2086             }
2087             case BuilderOp::pop_src_rgba: {
2088                 float* src = tempStackPtr - (4 * N);
2089                 pipeline->push_back({ProgramOp::load_src, src});
2090                 break;
2091             }
2092             case BuilderOp::pop_dst_rgba: {
2093                 float* src = tempStackPtr - (4 * N);
2094                 pipeline->push_back({ProgramOp::load_dst, src});
2095                 break;
2096             }
2097             case BuilderOp::push_slots: {
2098                 float* dst = tempStackPtr;
2099                 this->appendCopySlotsUnmasked(pipeline, alloc,
2100                                               OffsetFromBase(dst),
2101                                               OffsetFromBase(SlotA()),
2102                                               inst.fImmA);
2103                 break;
2104             }
2105             case BuilderOp::push_immutable: {
2106                 float* dst = tempStackPtr;
2107                 this->appendCopyImmutableUnmasked(pipeline, alloc, basePtr,
2108                                                   OffsetFromBase(dst),
2109                                                   OffsetFromBase(ImmutableA()),
2110                                                   inst.fImmA);
2111                 break;
2112             }
2113             case BuilderOp::copy_stack_to_slots_indirect:
2114             case BuilderOp::push_immutable_indirect:
2115             case BuilderOp::push_slots_indirect:
2116             case BuilderOp::push_uniform_indirect: {
2117                 // SlotA: fixed-range start
2118                 // SlotB: limit-range end
2119                 //  immA: number of slots to copy
2120                 //  immB: dynamic stack ID
2121                 ProgramOp op;
2122                 auto* ctx = alloc->make<SkRasterPipeline_CopyIndirectCtx>();
2123                 ctx->indirectOffset =
2124                         reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmB]) - (1 * N);
2125                 ctx->indirectLimit = inst.fSlotB - inst.fSlotA - inst.fImmA;
2126                 ctx->slots = inst.fImmA;
2127                 if (inst.fOp == BuilderOp::push_slots_indirect) {
2128                     op = ProgramOp::copy_from_indirect_unmasked;
2129                     ctx->src = reinterpret_cast<const int32_t*>(SlotA());
2130                     ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2131                 } else if (inst.fOp == BuilderOp::push_immutable_indirect) {
2132                     // We reuse the indirect-uniform op for indirect copies of immutable data.
2133                     op = ProgramOp::copy_from_indirect_uniform_unmasked;
2134                     ctx->src = reinterpret_cast<const int32_t*>(ImmutableA());
2135                     ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2136                 } else if (inst.fOp == BuilderOp::push_uniform_indirect) {
2137                     op = ProgramOp::copy_from_indirect_uniform_unmasked;
2138                     ctx->src = reinterpret_cast<const int32_t*>(UniformA());
2139                     ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2140                 } else {
2141                     op = ProgramOp::copy_to_indirect_masked;
2142                     ctx->src = reinterpret_cast<const int32_t*>(tempStackPtr) - (ctx->slots * N);
2143                     ctx->dst = reinterpret_cast<int32_t*>(SlotA());
2144                 }
2145                 pipeline->push_back({op, ctx});
2146                 break;
2147             }
2148             case BuilderOp::push_uniform:
2149             case BuilderOp::copy_uniform_to_slots_unmasked: {
2150                 const float* src = UniformA();
2151                 float* dst = (inst.fOp == BuilderOp::push_uniform) ? tempStackPtr : SlotB();
2152 
2153                 for (int remaining = inst.fImmA; remaining > 0; remaining -= 4) {
2154                     auto ctx = alloc->make<SkRasterPipeline_UniformCtx>();
2155                     ctx->dst = reinterpret_cast<int32_t*>(dst);
2156                     ctx->src = reinterpret_cast<const int32_t*>(src);
2157                     switch (remaining) {
2158                         case 1:  pipeline->push_back({ProgramOp::copy_uniform,    ctx}); break;
2159                         case 2:  pipeline->push_back({ProgramOp::copy_2_uniforms, ctx}); break;
2160                         case 3:  pipeline->push_back({ProgramOp::copy_3_uniforms, ctx}); break;
2161                         default: pipeline->push_back({ProgramOp::copy_4_uniforms, ctx}); break;
2162                     }
2163                     dst += 4 * N;
2164                     src += 4;
2165                 }
2166                 break;
2167             }
2168             case BuilderOp::push_condition_mask: {
2169                 float* dst = tempStackPtr;
2170                 pipeline->push_back({ProgramOp::store_condition_mask, dst});
2171                 break;
2172             }
2173             case BuilderOp::pop_condition_mask: {
2174                 float* src = tempStackPtr - (1 * N);
2175                 pipeline->push_back({ProgramOp::load_condition_mask, src});
2176                 break;
2177             }
2178             case BuilderOp::merge_condition_mask:
2179             case BuilderOp::merge_inv_condition_mask: {
2180                 float* ptr = tempStackPtr - (2 * N);
2181                 pipeline->push_back({(ProgramOp)inst.fOp, ptr});
2182                 break;
2183             }
2184             case BuilderOp::push_loop_mask: {
2185                 float* dst = tempStackPtr;
2186                 pipeline->push_back({ProgramOp::store_loop_mask, dst});
2187                 break;
2188             }
2189             case BuilderOp::pop_loop_mask: {
2190                 float* src = tempStackPtr - (1 * N);
2191                 pipeline->push_back({ProgramOp::load_loop_mask, src});
2192                 break;
2193             }
2194             case BuilderOp::pop_and_reenable_loop_mask: {
2195                 float* src = tempStackPtr - (1 * N);
2196                 pipeline->push_back({ProgramOp::reenable_loop_mask, src});
2197                 break;
2198             }
2199             case BuilderOp::reenable_loop_mask:
2200                 pipeline->push_back({ProgramOp::reenable_loop_mask, SlotA()});
2201                 break;
2202 
2203             case BuilderOp::mask_off_loop_mask:
2204                 pipeline->push_back({ProgramOp::mask_off_loop_mask, nullptr});
2205                 break;
2206 
2207             case BuilderOp::merge_loop_mask: {
2208                 float* src = tempStackPtr - (1 * N);
2209                 pipeline->push_back({ProgramOp::merge_loop_mask, src});
2210                 break;
2211             }
2212             case BuilderOp::push_return_mask: {
2213                 float* dst = tempStackPtr;
2214                 pipeline->push_back({ProgramOp::store_return_mask, dst});
2215                 break;
2216             }
2217             case BuilderOp::pop_return_mask: {
2218                 float* src = tempStackPtr - (1 * N);
2219                 pipeline->push_back({ProgramOp::load_return_mask, src});
2220                 break;
2221             }
2222             case BuilderOp::mask_off_return_mask:
2223                 pipeline->push_back({ProgramOp::mask_off_return_mask, nullptr});
2224                 break;
2225 
2226             case BuilderOp::copy_constant:
2227             case BuilderOp::push_constant: {
2228                 float* dst = (inst.fOp == BuilderOp::copy_constant) ? SlotA() : tempStackPtr;
2229                 // Splat constant values onto the stack.
2230                 for (int remaining = inst.fImmA; remaining > 0; remaining -= 4) {
2231                     SkRasterPipeline_ConstantCtx ctx;
2232                     ctx.dst = OffsetFromBase(dst);
2233                     ctx.value = inst.fImmB;
2234                     void* ptr = SkRPCtxUtils::Pack(ctx, alloc);
2235                     switch (remaining) {
2236                         case 1:  pipeline->push_back({ProgramOp::copy_constant,     ptr}); break;
2237                         case 2:  pipeline->push_back({ProgramOp::splat_2_constants, ptr}); break;
2238                         case 3:  pipeline->push_back({ProgramOp::splat_3_constants, ptr}); break;
2239                         default: pipeline->push_back({ProgramOp::splat_4_constants, ptr}); break;
2240                     }
2241                     dst += 4 * N;
2242                 }
2243                 break;
2244             }
2245             case BuilderOp::copy_stack_to_slots: {
2246                 float* src = tempStackPtr - (inst.fImmB * N);
2247                 this->appendCopySlotsMasked(pipeline, alloc,
2248                                             OffsetFromBase(SlotA()),
2249                                             OffsetFromBase(src),
2250                                             inst.fImmA);
2251                 break;
2252             }
2253             case BuilderOp::copy_stack_to_slots_unmasked: {
2254                 float* src = tempStackPtr - (inst.fImmB * N);
2255                 this->appendCopySlotsUnmasked(pipeline, alloc,
2256                                               OffsetFromBase(SlotA()),
2257                                               OffsetFromBase(src),
2258                                               inst.fImmA);
2259                 break;
2260             }
2261             case BuilderOp::swizzle_copy_stack_to_slots: {
2262                 // SlotA: fixed-range start
2263                 // immA: number of swizzle components
2264                 // immB: swizzle components
2265                 // immC: offset from stack top
2266                 auto stage = (ProgramOp)((int)ProgramOp::swizzle_copy_slot_masked + inst.fImmA - 1);
2267                 auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyCtx>();
2268                 ctx->src = reinterpret_cast<const int32_t*>(tempStackPtr) - (inst.fImmC * N);
2269                 ctx->dst = reinterpret_cast<int32_t*>(SlotA());
2270                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
2271                 pipeline->push_back({stage, ctx});
2272                 break;
2273             }
2274             case BuilderOp::push_clone: {
2275                 float* src = tempStackPtr - (inst.fImmB * N);
2276                 float* dst = tempStackPtr;
2277                 this->appendCopySlotsUnmasked(pipeline, alloc,
2278                                               OffsetFromBase(dst),
2279                                               OffsetFromBase(src),
2280                                               inst.fImmA);
2281                 break;
2282             }
2283             case BuilderOp::push_clone_from_stack: {
2284                 // immA: number of slots
2285                 // immB: other stack ID
2286                 // immC: offset from stack top
2287                 float* sourceStackPtr = tempStackMap[inst.fImmB];
2288                 float* src = sourceStackPtr - (inst.fImmC * N);
2289                 float* dst = tempStackPtr;
2290                 this->appendCopySlotsUnmasked(pipeline, alloc,
2291                                               OffsetFromBase(dst),
2292                                               OffsetFromBase(src),
2293                                               inst.fImmA);
2294                 break;
2295             }
2296             case BuilderOp::push_clone_indirect_from_stack: {
2297                 // immA: number of slots
2298                 // immB: other stack ID
2299                 // immC: offset from stack top
2300                 // immD: dynamic stack ID
2301                 float* sourceStackPtr = tempStackMap[inst.fImmB];
2302 
2303                 auto* ctx = alloc->make<SkRasterPipeline_CopyIndirectCtx>();
2304                 ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2305                 ctx->src = reinterpret_cast<const int32_t*>(sourceStackPtr) - (inst.fImmC * N);
2306                 ctx->indirectOffset =
2307                         reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
2308                 ctx->indirectLimit = inst.fImmC - inst.fImmA;
2309                 ctx->slots = inst.fImmA;
2310                 pipeline->push_back({ProgramOp::copy_from_indirect_unmasked, ctx});
2311                 break;
2312             }
2313             case BuilderOp::swizzle_copy_stack_to_slots_indirect: {
2314                 // SlotA: fixed-range start
2315                 // SlotB: limit-range end
2316                 // immA: number of swizzle components
2317                 // immB: swizzle components
2318                 // immC: offset from stack top
2319                 // immD: dynamic stack ID
2320                 auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyIndirectCtx>();
2321                 ctx->src = reinterpret_cast<const int32_t*>(tempStackPtr) - (inst.fImmC * N);
2322                 ctx->dst = reinterpret_cast<int32_t*>(SlotA());
2323                 ctx->indirectOffset =
2324                         reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
2325                 ctx->indirectLimit =
2326                         inst.fSlotB - inst.fSlotA - (max_packed_nybble(inst.fImmB, inst.fImmA) + 1);
2327                 ctx->slots = inst.fImmA;
2328                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
2329                 pipeline->push_back({ProgramOp::swizzle_copy_to_indirect_masked, ctx});
2330                 break;
2331             }
2332             case BuilderOp::case_op: {
2333                 SkRasterPipeline_CaseOpCtx ctx;
2334                 ctx.expectedValue = inst.fImmA;
2335                 ctx.offset = OffsetFromBase(tempStackPtr - (2 * N));
2336                 pipeline->push_back({ProgramOp::case_op, SkRPCtxUtils::Pack(ctx, alloc)});
2337                 break;
2338             }
2339             case BuilderOp::continue_op:
2340                 pipeline->push_back({ProgramOp::continue_op, tempStackMap[inst.fImmA] - (1 * N)});
2341                 break;
2342 
2343             case BuilderOp::pad_stack:
2344             case BuilderOp::discard_stack:
2345                 break;
2346 
2347             case BuilderOp::invoke_shader:
2348             case BuilderOp::invoke_color_filter:
2349             case BuilderOp::invoke_blender:
2350                 pipeline->push_back({(ProgramOp)inst.fOp, context_bit_pun(inst.fImmA)});
2351                 break;
2352 
2353             case BuilderOp::invoke_to_linear_srgb:
2354             case BuilderOp::invoke_from_linear_srgb:
2355                 pipeline->push_back({(ProgramOp)inst.fOp, tempStackMap[inst.fImmA] - (4 * N)});
2356                 break;
2357 
2358             case BuilderOp::trace_line: {
2359                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceLineCtx*)nullptr);
2360                 ctx->lineNumber = inst.fImmB;
2361                 pipeline->push_back({ProgramOp::trace_line, ctx});
2362                 break;
2363             }
2364             case BuilderOp::trace_scope: {
2365                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceScopeCtx*)nullptr);
2366                 ctx->delta = inst.fImmB;
2367                 pipeline->push_back({ProgramOp::trace_scope, ctx});
2368                 break;
2369             }
2370             case BuilderOp::trace_enter:
2371             case BuilderOp::trace_exit: {
2372                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceFuncCtx*)nullptr);
2373                 ctx->funcIdx = inst.fImmB;
2374                 pipeline->push_back({(ProgramOp)inst.fOp, ctx});
2375                 break;
2376             }
2377             case BuilderOp::trace_var:
2378             case BuilderOp::trace_var_indirect: {
2379                 // SlotA: fixed-range start
2380                 // SlotB: limit-range end
2381                 // immA: trace-mask stack ID
2382                 // immB: number of slots
2383                 // immC: dynamic stack ID
2384                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceVarCtx*)nullptr);
2385                 ctx->slotIdx = inst.fSlotA;
2386                 ctx->numSlots = inst.fImmB;
2387                 ctx->data = reinterpret_cast<int*>(SlotA());
2388                 if (inst.fOp == BuilderOp::trace_var_indirect) {
2389                     ctx->indirectOffset =
2390                             reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmC]) - (1 * N);
2391                     ctx->indirectLimit = inst.fSlotB - inst.fSlotA - inst.fImmB;
2392                 } else {
2393                     ctx->indirectOffset = nullptr;
2394                     ctx->indirectLimit = 0;
2395                 }
2396                 pipeline->push_back({ProgramOp::trace_var, ctx});
2397                 break;
2398             }
2399             default:
2400                 SkDEBUGFAILF("Raster Pipeline: unsupported instruction %d", (int)inst.fOp);
2401                 break;
2402         }
2403 
2404         int stackUsage = stack_usage(inst);
2405         if (stackUsage != 0) {
2406             tempStackPtr += stackUsage * N;
2407             SkASSERT(tempStackPtr >= slots.stack.begin());
2408             SkASSERT(tempStackPtr <= slots.stack.end());
2409         }
2410 
2411         // Periodically rewind the stack every 500 instructions. When SK_HAS_MUSTTAIL is set,
2412         // rewinds are not actually used; the appendStackRewind call becomes a no-op. On platforms
2413         // that don't support SK_HAS_MUSTTAIL, rewinding the stack periodically can prevent a
2414         // potential stack overflow when running a long program.
2415         int numPipelineStages = pipeline->size();
2416         if (numPipelineStages - mostRecentRewind > 500) {
2417             this->appendStackRewind(pipeline);
2418             mostRecentRewind = numPipelineStages;
2419         }
2420     }
2421 }
2422 
2423 class Program::Dumper {
2424 public:
    // Binds this Dumper to the program it will print.
    // NOTE(review): fProgram appears to retain `p` by reference, so `p` must outlive this
    // Dumper — confirm against the member declaration.
    Dumper(const Program& p) : fProgram(p) {}
2426 
2427     void dump(SkWStream* out, bool writeInstructionCount);
2428 
2429     // Finds the labels in the program, and keeps track of their offsets.
buildLabelToStageMap()2430     void buildLabelToStageMap() {
2431         for (int index = 0; index < fStages.size(); ++index) {
2432             if (fStages[index].op == ProgramOp::label) {
2433                 int labelID = sk_bit_cast<intptr_t>(fStages[index].ctx);
2434                 SkASSERT(!fLabelToStageMap.find(labelID));
2435                 fLabelToStageMap[labelID] = index;
2436             }
2437         }
2438     }
2439 
2440     // Assign unique names to each variable slot; our trace might have multiple variables with the
2441     // same name, which can make a dump hard to read. We disambiguate them with subscripts.
    void buildUniqueSlotNameList() {
        if (fProgram.fDebugTrace) {
            fSlotNameList.reserve_exact(fProgram.fDebugTrace->fSlotInfo.size());

            // The map consists of <variable name, <source position, unique name>>.
            THashMap<std::string_view, THashMap<int, std::string>> uniqueNameMap;

            for (const SlotDebugInfo& slotInfo : fProgram.fDebugTrace->fSlotInfo) {
                // Look up this variable by its name and source position. (Zero stands in for
                // slots whose source position is invalid.)
                int pos = slotInfo.pos.valid() ? slotInfo.pos.startOffset() : 0;
                THashMap<int, std::string>& positionMap = uniqueNameMap[slotInfo.name];
                std::string& uniqueName = positionMap[pos];

                // Have we seen this variable name/position combination before? (An empty string
                // means `positionMap[pos]` was just default-constructed by the lookup above.)
                if (uniqueName.empty()) {
                    // This is a unique name/position pair.
                    uniqueName = slotInfo.name;

                    // But if it's not a unique _name_, it deserves a subscript to disambiguate it.
                    // The first occurrence of a name gets no subscript; later ones get 1, 2, 3...
                    int subscript = positionMap.count() - 1;
                    if (subscript > 0) {
                        for (char digit : std::to_string(subscript)) {
                            // U+2080 through U+2089 (₀₁₂₃₄₅₆₇₈₉) in UTF8:
                            uniqueName.push_back((char)0xE2);
                            uniqueName.push_back((char)0x82);
                            uniqueName.push_back((char)(0x80 + digit - '0'));
                        }
                    }
                }

                // Slots sharing a name/position reuse the same disambiguated name.
                fSlotNameList.push_back(uniqueName);
            }
        }
    }
2476 
2477     // Interprets the context value as a branch offset.
branchOffset(const SkRasterPipeline_BranchCtx * ctx,int index) const2478     std::string branchOffset(const SkRasterPipeline_BranchCtx* ctx, int index) const {
2479         // The context's offset field contains a label ID
2480         int labelID = ctx->offset;
2481         const int* targetIndex = fLabelToStageMap.find(labelID);
2482         SkASSERT(targetIndex);
2483         return SkSL::String::printf("%+d (label %d at #%d)", *targetIndex - index, labelID,
2484                                                              *targetIndex + 1);
2485     }
2486 
2487     // Prints a 32-bit immediate value of unknown type (int/float).
imm(float immFloat,bool showAsFloat=true) const2488     std::string imm(float immFloat, bool showAsFloat = true) const {
2489         // Special case exact zero as "0" for readability (vs `0x00000000 (0.0)`).
2490         if (sk_bit_cast<int32_t>(immFloat) == 0) {
2491             return "0";
2492         }
2493         // Start with `0x3F800000` as a baseline.
2494         uint32_t immUnsigned;
2495         memcpy(&immUnsigned, &immFloat, sizeof(uint32_t));
2496         auto text = SkSL::String::printf("0x%08X", immUnsigned);
2497 
2498         // Extend it to `0x3F800000 (1.0)` for finite floating point values.
2499         if (showAsFloat && std::isfinite(immFloat)) {
2500             text += " (";
2501             text += skstd::to_string(immFloat);
2502             text += ')';
2503         }
2504         return text;
2505     }
2506 
2507     // Interprets the context pointer as a 32-bit immediate value of unknown type (int/float).
immCtx(const void * ctx,bool showAsFloat=true) const2508     std::string immCtx(const void* ctx, bool showAsFloat = true) const {
2509         float f;
2510         memcpy(&f, &ctx, sizeof(float));
2511         return this->imm(f, showAsFloat);
2512     }
2513 
2514     // Prints `1` for single slots and `1..3` for ranges of slots.
asRange(int first,int count) const2515     std::string asRange(int first, int count) const {
2516         std::string text = std::to_string(first);
2517         if (count > 1) {
2518             text += ".." + std::to_string(first + count - 1);
2519         }
2520         return text;
2521     }
2522 
2523     // Generates a reasonable name for a range of slots or uniforms, e.g.:
2524     // `val`: slot range points at one variable, named val
2525     // `val(0..1)`: slot range points at the first and second slot of val (which has 3+ slots)
2526     // `foo, bar`: slot range fully covers two variables, named foo and bar
2527     // `foo(3), bar(0)`: slot range covers the fourth slot of foo and the first slot of bar
    std::string slotOrUniformName(SkSpan<const SlotDebugInfo> debugInfo,
                                  SkSpan<const std::string> names,
                                  SlotRange range) const {
        // The requested range must fall entirely within the debug info we were handed.
        SkASSERT(range.index >= 0 && (range.index + range.count) <= (int)debugInfo.size());

        std::string text;
        auto separator = SkSL::String::Separator();
        while (range.count > 0) {
            const SlotDebugInfo& slotInfo = debugInfo[range.index];
            text += separator();
            // `names` (when provided) holds per-slot disambiguated names; otherwise fall back
            // to the raw variable name from the debug info.
            text += names.empty() ? slotInfo.name : names[range.index];

            // Figure out how many slots we can chomp in this iteration.
            int entireVariable = slotInfo.columns * slotInfo.rows;
            int slotsToChomp = std::min(range.count, entireVariable - slotInfo.componentIndex);
            // If we aren't consuming an entire variable, from first slot to last...
            if (slotsToChomp != entireVariable) {
                // ... decorate it with a range suffix, e.g. `foo(0..1)`.
                text += '(' + this->asRange(slotInfo.componentIndex, slotsToChomp) + ')';
            }
            // Advance past the slots consumed by this variable and continue with the next one.
            range.index += slotsToChomp;
            range.count -= slotsToChomp;
        }

        return text;
    }
2554 
2555     // Generates a reasonable name for a range of slots.
slotName(SlotRange range) const2556     std::string slotName(SlotRange range) const {
2557         return this->slotOrUniformName(fProgram.fDebugTrace->fSlotInfo, fSlotNameList, range);
2558     }
2559 
2560     // Generates a reasonable name for a range of uniforms.
uniformName(SlotRange range) const2561     std::string uniformName(SlotRange range) const {
2562         return this->slotOrUniformName(fProgram.fDebugTrace->fUniformInfo, /*names=*/{}, range);
2563     }
2564 
2565     // Attempts to interpret the passed-in pointer as a uniform range.
uniformPtrCtx(const float * ptr,int numSlots) const2566     std::string uniformPtrCtx(const float* ptr, int numSlots) const {
2567         const float* end = ptr + numSlots;
2568         if (ptr >= fUniforms.begin() && end <= fUniforms.end()) {
2569             int uniformIdx = ptr - fUniforms.begin();
2570             if (fProgram.fDebugTrace) {
2571                 // Handle pointers to named uniform slots.
2572                 std::string name = this->uniformName({uniformIdx, numSlots});
2573                 if (!name.empty()) {
2574                     return name;
2575                 }
2576             }
2577             // Handle pointers to uniforms (when no debug info exists).
2578             return 'u' + this->asRange(uniformIdx, numSlots);
2579         }
2580         return {};
2581     }
2582 
2583     // Attempts to interpret the passed-in pointer as a value slot range.
valuePtrCtx(const float * ptr,int numSlots) const2584     std::string valuePtrCtx(const float* ptr, int numSlots) const {
2585         const float* end = ptr + (N * numSlots);
2586         if (ptr >= fSlots.values.begin() && end <= fSlots.values.end()) {
2587             int valueIdx = ptr - fSlots.values.begin();
2588             SkASSERT((valueIdx % N) == 0);
2589             valueIdx /= N;
2590             if (fProgram.fDebugTrace) {
2591                 // Handle pointers to named value slots.
2592                 std::string name = this->slotName({valueIdx, numSlots});
2593                 if (!name.empty()) {
2594                     return name;
2595                 }
2596             }
2597             // Handle pointers to value slots (when no debug info exists).
2598             return 'v' + this->asRange(valueIdx, numSlots);
2599         }
2600         return {};
2601     }
2602 
2603     // Attempts to interpret the passed-in pointer as a immutable slot range.
immutablePtrCtx(const float * ptr,int numSlots) const2604     std::string immutablePtrCtx(const float* ptr, int numSlots) const {
2605         const float* end = ptr + numSlots;
2606         if (ptr >= fSlots.immutable.begin() && end <= fSlots.immutable.end()) {
2607             int index = ptr - fSlots.immutable.begin();
2608             return 'i' + this->asRange(index, numSlots) + ' ' +
2609                    this->multiImmCtx(ptr, numSlots);
2610         }
2611         return {};
2612     }
2613 
2614     // Interprets the context value as a pointer to `count` immediate values.
multiImmCtx(const float * ptr,int count) const2615     std::string multiImmCtx(const float* ptr, int count) const {
2616         // If this is a uniform, print it by name.
2617         if (std::string text = this->uniformPtrCtx(ptr, count); !text.empty()) {
2618             return text;
2619         }
2620         // Emit a single bracketed immediate.
2621         if (count == 1) {
2622             return '[' + this->imm(*ptr) + ']';
2623         }
2624         // Emit a list like `[0x00000000 (0.0), 0x3F80000 (1.0)]`.
2625         std::string text = "[";
2626         auto separator = SkSL::String::Separator();
2627         while (count--) {
2628             text += separator();
2629             text += this->imm(*ptr++);
2630         }
2631         return text + ']';
2632     }
2633 
2634     // Interprets the context value as a generic pointer.
ptrCtx(const void * ctx,int numSlots) const2635     std::string ptrCtx(const void* ctx, int numSlots) const {
2636         const float *ctxAsSlot = static_cast<const float*>(ctx);
2637         // Check for uniform, value, and immutable pointers.
2638         if (std::string uniform = this->uniformPtrCtx(ctxAsSlot, numSlots); !uniform.empty()) {
2639             return uniform;
2640         }
2641         if (std::string value = this->valuePtrCtx(ctxAsSlot, numSlots); !value.empty()) {
2642             return value;
2643         }
2644         if (std::string value = this->immutablePtrCtx(ctxAsSlot, numSlots); !value.empty()) {
2645             return value;
2646         }
2647         // Handle pointers to temporary stack slots.
2648         if (ctxAsSlot >= fSlots.stack.begin() && ctxAsSlot < fSlots.stack.end()) {
2649             int stackIdx = ctxAsSlot - fSlots.stack.begin();
2650             SkASSERT((stackIdx % N) == 0);
2651             return '$' + this->asRange(stackIdx / N, numSlots);
2652         }
2653         // This pointer is out of our expected bounds; this generally isn't expected to happen.
2654         return "ExternalPtr(" + this->asRange(0, numSlots) + ")";
2655     }
2656 
2657     // Converts an SkRPOffset to a pointer into the value-slot range.
offsetToPtr(SkRPOffset offset) const2658     std::byte* offsetToPtr(SkRPOffset offset) const {
2659         return (std::byte*)fSlots.values.data() + offset;
2660     }
2661 
2662     // Interprets a slab offset as a slot range.
offsetCtx(SkRPOffset offset,int numSlots) const2663     std::string offsetCtx(SkRPOffset offset, int numSlots) const {
2664         return this->ptrCtx(this->offsetToPtr(offset), numSlots);
2665     }
2666 
2667     // Interprets the context value as a packed ConstantCtx structure.
constantCtx(const void * v,int slots,bool showAsFloat=true) const2668     std::tuple<std::string, std::string> constantCtx(const void* v,
2669                                                      int slots,
2670                                                      bool showAsFloat = true) const {
2671         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_ConstantCtx*)v);
2672         return {this->offsetCtx(ctx.dst, slots),
2673                 this->imm(sk_bit_cast<float>(ctx.value), showAsFloat)};
2674     }
2675 
2676     // Interprets the context value as a BinaryOp structure for copy_n_slots (numSlots is dictated
2677     // by the op itself).
binaryOpCtx(const void * v,int numSlots) const2678     std::tuple<std::string, std::string> binaryOpCtx(const void* v, int numSlots) const {
2679         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_BinaryOpCtx*)v);
2680         return {this->offsetCtx(ctx.dst, numSlots),
2681                 this->offsetCtx(ctx.src, numSlots)};
2682     }
2683 
2684     // Interprets the context value as a BinaryOp structure for copy_n_uniforms (numSlots is
2685     // dictated by the op itself).
copyUniformCtx(const void * v,int numSlots) const2686     std::tuple<std::string, std::string> copyUniformCtx(const void* v, int numSlots) const {
2687         const auto *ctx = static_cast<const SkRasterPipeline_UniformCtx*>(v);
2688         return {this->ptrCtx(ctx->dst, numSlots),
2689                 this->multiImmCtx(reinterpret_cast<const float*>(ctx->src), numSlots)};
2690     }
2691 
2692     // Interprets the context value as a pointer to two adjacent values.
adjacentPtrCtx(const void * ctx,int numSlots) const2693     std::tuple<std::string, std::string> adjacentPtrCtx(const void* ctx, int numSlots) const {
2694         const float *ctxAsSlot = static_cast<const float*>(ctx);
2695         return std::make_tuple(this->ptrCtx(ctxAsSlot, numSlots),
2696                                this->ptrCtx(ctxAsSlot + (N * numSlots), numSlots));
2697     }
2698 
2699     // Interprets a slab offset as two adjacent slot ranges.
adjacentOffsetCtx(SkRPOffset offset,int numSlots) const2700     std::tuple<std::string, std::string> adjacentOffsetCtx(SkRPOffset offset, int numSlots) const {
2701         return this->adjacentPtrCtx((std::byte*)fSlots.values.data() + offset, numSlots);
2702     }
2703 
2704     // Interprets the context value as a BinaryOp structure (numSlots is inferred from the distance
2705     // between pointers).
adjacentBinaryOpCtx(const void * v) const2706     std::tuple<std::string, std::string> adjacentBinaryOpCtx(const void* v) const {
2707         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_BinaryOpCtx*)v);
2708         int numSlots = (ctx.src - ctx.dst) / (N * sizeof(float));
2709         return this->adjacentOffsetCtx(ctx.dst, numSlots);
2710     }
2711 
2712     // Interprets the context value as a pointer to three adjacent values.
adjacent3PtrCtx(const void * ctx,int numSlots) const2713     std::tuple<std::string, std::string, std::string> adjacent3PtrCtx(const void* ctx,
2714                                                                       int numSlots) const {
2715         const float *ctxAsSlot = static_cast<const float*>(ctx);
2716         return {this->ptrCtx(ctxAsSlot, numSlots),
2717                 this->ptrCtx(ctxAsSlot + (N * numSlots), numSlots),
2718                 this->ptrCtx(ctxAsSlot + (2 * N * numSlots), numSlots)};
2719     }
2720 
2721     // Interprets a slab offset as three adjacent slot ranges.
adjacent3OffsetCtx(SkRPOffset offset,int numSlots) const2722     std::tuple<std::string, std::string, std::string> adjacent3OffsetCtx(SkRPOffset offset,
2723                                                                          int numSlots) const {
2724         return this->adjacent3PtrCtx((std::byte*)fSlots.values.data() + offset, numSlots);
2725     }
2726 
2727     // Interprets the context value as a TernaryOp structure (numSlots is inferred from `delta`).
adjacentTernaryOpCtx(const void * v) const2728     std::tuple<std::string, std::string, std::string> adjacentTernaryOpCtx(const void* v) const {
2729         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_TernaryOpCtx*)v);
2730         int numSlots = ctx.delta / (sizeof(float) * N);
2731         return this->adjacent3OffsetCtx(ctx.dst, numSlots);
2732     }
2733 
2734     // Stringizes a span of swizzle offsets to the textual equivalent (`xyzw`).
2735     template <typename T>
swizzleOffsetSpan(SkSpan<T> offsets) const2736     std::string swizzleOffsetSpan(SkSpan<T> offsets) const {
2737         std::string src;
2738         for (uint16_t offset : offsets) {
2739             if (offset == (0 * N * sizeof(float))) {
2740                 src.push_back('x');
2741             } else if (offset == (1 * N * sizeof(float))) {
2742                 src.push_back('y');
2743             } else if (offset == (2 * N * sizeof(float))) {
2744                 src.push_back('z');
2745             } else if (offset == (3 * N * sizeof(float))) {
2746                 src.push_back('w');
2747             } else {
2748                 src.push_back('?');
2749             }
2750         }
2751         return src;
2752     }
2753 
2754     // Determines the effective width of a swizzle op. When we decode a swizzle, we don't know the
2755     // slot width of the original value; that's not preserved in the instruction encoding. (e.g.,
2756     // myFloat4.y would be indistinguishable from myFloat2.y.) We do our best to make a readable
2757     // dump using the data we have.
2758     template <typename T>
swizzleWidth(SkSpan<T> offsets) const2759     size_t swizzleWidth(SkSpan<T> offsets) const {
2760         size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
2761                                   (N * sizeof(float));
2762         size_t swizzleWidth = offsets.size();
2763         return std::max(swizzleWidth, highestComponent + 1);
2764     }
2765 
2766     // Stringizes a swizzled pointer.
2767     template <typename T>
swizzlePtr(const void * ptr,SkSpan<T> offsets) const2768     std::string swizzlePtr(const void* ptr, SkSpan<T> offsets) const {
2769         return "(" + this->ptrCtx(ptr, this->swizzleWidth(SkSpan(offsets))) + ")." +
2770                this->swizzleOffsetSpan(SkSpan(offsets));
2771     }
2772 
2773     // Interprets the context value as a SwizzleCtx structure.
swizzleCtx(ProgramOp op,const void * v) const2774     std::tuple<std::string, std::string> swizzleCtx(ProgramOp op, const void* v) const {
2775         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_SwizzleCtx*)v);
2776         int destSlots = (int)op - (int)BuilderOp::swizzle_1 + 1;
2777         return {this->offsetCtx(ctx.dst, destSlots),
2778                 this->swizzlePtr(this->offsetToPtr(ctx.dst), SkSpan(ctx.offsets, destSlots))};
2779     }
2780 
2781     // Interprets the context value as a SwizzleCopyCtx structure.
swizzleCopyCtx(ProgramOp op,const void * v) const2782     std::tuple<std::string, std::string> swizzleCopyCtx(ProgramOp op, const void* v) const {
2783         const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCopyCtx*>(v);
2784         int destSlots = (int)op - (int)BuilderOp::swizzle_copy_slot_masked + 1;
2785 
2786         return {this->swizzlePtr(ctx->dst, SkSpan(ctx->offsets, destSlots)),
2787                 this->ptrCtx(ctx->src, destSlots)};
2788     }
2789 
2790     // Interprets the context value as a ShuffleCtx structure.
shuffleCtx(const void * v) const2791     std::tuple<std::string, std::string> shuffleCtx(const void* v) const {
2792         const auto* ctx = static_cast<const SkRasterPipeline_ShuffleCtx*>(v);
2793 
2794         std::string dst = this->ptrCtx(ctx->ptr, ctx->count);
2795         std::string src = "(" + dst + ")[";
2796         for (int index = 0; index < ctx->count; ++index) {
2797             if (ctx->offsets[index] % (N * sizeof(float))) {
2798                 src.push_back('?');
2799             } else {
2800                 src += std::to_string(ctx->offsets[index] / (N * sizeof(float)));
2801             }
2802             src.push_back(' ');
2803         }
2804         src.back() = ']';
2805         return std::make_tuple(dst, src);
2806     }
2807 
2808     // Interprets the context value as a packed MatrixMultiplyCtx structure.
matrixMultiply(const void * v) const2809     std::tuple<std::string, std::string, std::string> matrixMultiply(const void* v) const {
2810         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_MatrixMultiplyCtx*)v);
2811         int leftMatrix = ctx.leftColumns * ctx.leftRows;
2812         int rightMatrix = ctx.rightColumns * ctx.rightRows;
2813         int resultMatrix = ctx.rightColumns * ctx.leftRows;
2814         SkRPOffset leftOffset = ctx.dst + (ctx.rightColumns * ctx.leftRows * sizeof(float) * N);
2815         SkRPOffset rightOffset = leftOffset + (ctx.leftColumns * ctx.leftRows * sizeof(float) * N);
2816         return {SkSL::String::printf("mat%dx%d(%s)",
2817                                      ctx.rightColumns,
2818                                      ctx.leftRows,
2819                                      this->offsetCtx(ctx.dst, resultMatrix).c_str()),
2820                 SkSL::String::printf("mat%dx%d(%s)",
2821                                      ctx.leftColumns,
2822                                      ctx.leftRows,
2823                                      this->offsetCtx(leftOffset, leftMatrix).c_str()),
2824                 SkSL::String::printf("mat%dx%d(%s)",
2825                                      ctx.rightColumns,
2826                                      ctx.rightRows,
2827                                      this->offsetCtx(rightOffset, rightMatrix).c_str())};
2828     }
2829 
private:
    // Number of consecutive floats occupied by one value/stack slot; pointer math
    // throughout this class steps by `N * numSlots`.
    const int N = SkOpts::raster_pipeline_highp_stride;
    const Program& fProgram;              // the program being dumped
    TArray<Stage> fStages;                // the program lowered into Raster Pipeline stages
    TArray<std::string> fSlotNameList;    // unique slot names (built by buildUniqueSlotNameList)
    THashMap<int, int> fLabelToStageMap;  // <label ID, stage index>
    SlotData fSlots;                      // backing storage for value/immutable/stack slots
    SkSpan<float> fUniforms;              // backing storage for uniform slots
};
2839 
dump(SkWStream * out,bool writeInstructionCount)2840 void Program::Dumper::dump(SkWStream* out, bool writeInstructionCount) {
2841     using POp = ProgramOp;
2842 
2843     // Allocate memory for the slot and uniform data, even though the program won't ever be
2844     // executed. The program requires pointer ranges for managing its data, and ASAN will report
2845     // errors if those pointers are pointing at unallocated memory.
2846     SkArenaAlloc alloc(/*firstHeapAllocation=*/1000);
2847     fSlots = fProgram.allocateSlotData(&alloc);
2848     float* uniformPtr = alloc.makeArray<float>(fProgram.fNumUniformSlots);
2849     fUniforms = SkSpan(uniformPtr, fProgram.fNumUniformSlots);
2850 
2851     // Turn this program into an array of Raster Pipeline stages.
2852     fProgram.makeStages(&fStages, &alloc, fUniforms, fSlots);
2853 
2854     // Assemble lookup tables for program labels and slot names.
2855     this->buildLabelToStageMap();
2856     this->buildUniqueSlotNameList();
2857 
2858     // Emit the program's instruction count.
2859     if (writeInstructionCount) {
2860         int invocationCount = 0, instructionCount = 0;
2861         for (const Stage& stage : fStages) {
2862             switch (stage.op) {
2863                 case POp::label:
2864                     // consumes zero instructions
2865                     break;
2866 
2867                 case POp::invoke_shader:
2868                 case POp::invoke_color_filter:
2869                 case POp::invoke_blender:
2870                 case POp::invoke_to_linear_srgb:
2871                 case POp::invoke_from_linear_srgb:
2872                     ++invocationCount;
2873                     break;
2874 
2875                 default:
2876                     ++instructionCount;
2877                     break;
2878             }
2879         }
2880 
2881         out->writeText(std::to_string(instructionCount).c_str());
2882         out->writeText(" instructions");
2883         if (invocationCount > 0) {
2884             out->writeText(", ");
2885             out->writeText(std::to_string(invocationCount).c_str());
2886             out->writeText(" invocations");
2887         }
2888         out->writeText("\n\n");
2889     }
2890 
2891     // Emit all of the program's immutable data.
2892     const char* header = "[immutable slots]\n";
2893     const char* footer = "";
2894     for (const Instruction& inst : fProgram.fInstructions) {
2895         if (inst.fOp == BuilderOp::store_immutable_value) {
2896             out->writeText(header);
2897             out->writeText("i");
2898             out->writeText(std::to_string(inst.fSlotA).c_str());
2899             out->writeText(" = ");
2900             out->writeText(this->imm(sk_bit_cast<float>(inst.fImmA)).c_str());
2901             out->writeText("\n");
2902 
2903             header = "";
2904             footer = "\n";
2905         }
2906     }
2907     out->writeText(footer);
2908 
2909     // Emit the program's instruction list.
2910     for (int index = 0; index < fStages.size(); ++index) {
2911         const Stage& stage = fStages[index];
2912 
2913         std::string opArg1, opArg2, opArg3, opSwizzle;
2914         switch (stage.op) {
2915             case POp::label:
2916             case POp::invoke_shader:
2917             case POp::invoke_color_filter:
2918             case POp::invoke_blender:
2919                 opArg1 = this->immCtx(stage.ctx, /*showAsFloat=*/false);
2920                 break;
2921 
2922             case POp::case_op: {
2923                 auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_CaseOpCtx*)stage.ctx);
2924                 opArg1 = this->offsetCtx(ctx.offset, 1);
2925                 opArg2 = this->offsetCtx(ctx.offset + sizeof(int32_t) * N, 1);
2926                 opArg3 = this->imm(sk_bit_cast<float>(ctx.expectedValue), /*showAsFloat=*/false);
2927                 break;
2928             }
2929             case POp::swizzle_1:
2930             case POp::swizzle_2:
2931             case POp::swizzle_3:
2932             case POp::swizzle_4:
2933                 std::tie(opArg1, opArg2) = this->swizzleCtx(stage.op, stage.ctx);
2934                 break;
2935 
2936             case POp::swizzle_copy_slot_masked:
2937             case POp::swizzle_copy_2_slots_masked:
2938             case POp::swizzle_copy_3_slots_masked:
2939             case POp::swizzle_copy_4_slots_masked:
2940                 std::tie(opArg1, opArg2) = this->swizzleCopyCtx(stage.op, stage.ctx);
2941                 break;
2942 
2943             case POp::refract_4_floats:
2944                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 4);
2945                 opArg3 = this->ptrCtx((const float*)(stage.ctx) + (8 * N), 1);
2946                 break;
2947 
2948             case POp::dot_2_floats:
2949                 opArg1 = this->ptrCtx(stage.ctx, 1);
2950                 std::tie(opArg2, opArg3) = this->adjacentPtrCtx(stage.ctx, 2);
2951                 break;
2952 
2953             case POp::dot_3_floats:
2954                 opArg1 = this->ptrCtx(stage.ctx, 1);
2955                 std::tie(opArg2, opArg3) = this->adjacentPtrCtx(stage.ctx, 3);
2956                 break;
2957 
2958             case POp::dot_4_floats:
2959                 opArg1 = this->ptrCtx(stage.ctx, 1);
2960                 std::tie(opArg2, opArg3) = this->adjacentPtrCtx(stage.ctx, 4);
2961                 break;
2962 
2963             case POp::shuffle:
2964                 std::tie(opArg1, opArg2) = this->shuffleCtx(stage.ctx);
2965                 break;
2966 
2967             case POp::matrix_multiply_2:
2968             case POp::matrix_multiply_3:
2969             case POp::matrix_multiply_4:
2970                 std::tie(opArg1, opArg2, opArg3) = this->matrixMultiply(stage.ctx);
2971                 break;
2972 
2973             case POp::load_condition_mask:
2974             case POp::store_condition_mask:
2975             case POp::load_loop_mask:
2976             case POp::store_loop_mask:
2977             case POp::merge_loop_mask:
2978             case POp::reenable_loop_mask:
2979             case POp::load_return_mask:
2980             case POp::store_return_mask:
2981             case POp::continue_op:
2982             case POp::cast_to_float_from_int: case POp::cast_to_float_from_uint:
2983             case POp::cast_to_int_from_float: case POp::cast_to_uint_from_float:
2984             case POp::abs_int:
2985             case POp::acos_float:
2986             case POp::asin_float:
2987             case POp::atan_float:
2988             case POp::ceil_float:
2989             case POp::cos_float:
2990             case POp::exp_float:
2991             case POp::exp2_float:
2992             case POp::log_float:
2993             case POp::log2_float:
2994             case POp::floor_float:
2995             case POp::invsqrt_float:
2996             case POp::sin_float:
2997             case POp::sqrt_float:
2998             case POp::tan_float:
2999                 opArg1 = this->ptrCtx(stage.ctx, 1);
3000                 break;
3001 
3002             case POp::store_src_rg:
3003             case POp::cast_to_float_from_2_ints: case POp::cast_to_float_from_2_uints:
3004             case POp::cast_to_int_from_2_floats: case POp::cast_to_uint_from_2_floats:
3005             case POp::abs_2_ints:
3006             case POp::ceil_2_floats:
3007             case POp::floor_2_floats:
3008             case POp::invsqrt_2_floats:
3009                 opArg1 = this->ptrCtx(stage.ctx, 2);
3010                 break;
3011 
3012             case POp::cast_to_float_from_3_ints: case POp::cast_to_float_from_3_uints:
3013             case POp::cast_to_int_from_3_floats: case POp::cast_to_uint_from_3_floats:
3014             case POp::abs_3_ints:
3015             case POp::ceil_3_floats:
3016             case POp::floor_3_floats:
3017             case POp::invsqrt_3_floats:
3018                 opArg1 = this->ptrCtx(stage.ctx, 3);
3019                 break;
3020 
3021             case POp::load_src:
3022             case POp::load_dst:
3023             case POp::exchange_src:
3024             case POp::store_src:
3025             case POp::store_dst:
3026             case POp::store_device_xy01:
3027             case POp::invoke_to_linear_srgb:
3028             case POp::invoke_from_linear_srgb:
3029             case POp::cast_to_float_from_4_ints: case POp::cast_to_float_from_4_uints:
3030             case POp::cast_to_int_from_4_floats: case POp::cast_to_uint_from_4_floats:
3031             case POp::abs_4_ints:
3032             case POp::ceil_4_floats:
3033             case POp::floor_4_floats:
3034             case POp::invsqrt_4_floats:
3035             case POp::inverse_mat2:
3036                 opArg1 = this->ptrCtx(stage.ctx, 4);
3037                 break;
3038 
3039             case POp::inverse_mat3:
3040                 opArg1 = this->ptrCtx(stage.ctx, 9);
3041                 break;
3042 
3043             case POp::inverse_mat4:
3044                 opArg1 = this->ptrCtx(stage.ctx, 16);
3045                 break;
3046 
3047             case POp::copy_constant:
3048             case POp::add_imm_float:
3049             case POp::mul_imm_float:
3050             case POp::cmple_imm_float:
3051             case POp::cmplt_imm_float:
3052             case POp::cmpeq_imm_float:
3053             case POp::cmpne_imm_float:
3054             case POp::min_imm_float:
3055             case POp::max_imm_float:
3056                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 1);
3057                 break;
3058 
3059             case POp::add_imm_int:
3060             case POp::mul_imm_int:
3061             case POp::bitwise_and_imm_int:
3062             case POp::bitwise_xor_imm_int:
3063             case POp::cmple_imm_int:
3064             case POp::cmple_imm_uint:
3065             case POp::cmplt_imm_int:
3066             case POp::cmplt_imm_uint:
3067             case POp::cmpeq_imm_int:
3068             case POp::cmpne_imm_int:
3069                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 1, /*showAsFloat=*/false);
3070                 break;
3071 
3072             case POp::splat_2_constants:
3073             case POp::bitwise_and_imm_2_ints:
3074                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 2);
3075                 break;
3076 
3077             case POp::splat_3_constants:
3078             case POp::bitwise_and_imm_3_ints:
3079                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 3);
3080                 break;
3081 
3082             case POp::splat_4_constants:
3083             case POp::bitwise_and_imm_4_ints:
3084                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 4);
3085                 break;
3086 
3087             case POp::copy_uniform:
3088                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 1);
3089                 break;
3090 
3091             case POp::copy_2_uniforms:
3092                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 2);
3093                 break;
3094 
3095             case POp::copy_3_uniforms:
3096                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 3);
3097                 break;
3098 
3099             case POp::copy_4_uniforms:
3100                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 4);
3101                 break;
3102 
3103             case POp::copy_slot_masked:
3104             case POp::copy_slot_unmasked:
3105             case POp::copy_immutable_unmasked:
3106                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 1);
3107                 break;
3108 
3109             case POp::copy_2_slots_masked:
3110             case POp::copy_2_slots_unmasked:
3111             case POp::copy_2_immutables_unmasked:
3112                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 2);
3113                 break;
3114 
3115             case POp::copy_3_slots_masked:
3116             case POp::copy_3_slots_unmasked:
3117             case POp::copy_3_immutables_unmasked:
3118                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 3);
3119                 break;
3120 
3121             case POp::copy_4_slots_masked:
3122             case POp::copy_4_slots_unmasked:
3123             case POp::copy_4_immutables_unmasked:
3124                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 4);
3125                 break;
3126 
3127             case POp::copy_from_indirect_uniform_unmasked:
3128             case POp::copy_from_indirect_unmasked:
3129             case POp::copy_to_indirect_masked: {
3130                 const auto* ctx = static_cast<SkRasterPipeline_CopyIndirectCtx*>(stage.ctx);
3131                 // We don't incorporate the indirect-limit in the output
3132                 opArg1 = this->ptrCtx(ctx->dst, ctx->slots);
3133                 opArg2 = this->ptrCtx(ctx->src, ctx->slots);
3134                 opArg3 = this->ptrCtx(ctx->indirectOffset, 1);
3135                 break;
3136             }
3137             case POp::swizzle_copy_to_indirect_masked: {
3138                 const auto* ctx = static_cast<SkRasterPipeline_SwizzleCopyIndirectCtx*>(stage.ctx);
3139                 opArg1 = this->ptrCtx(ctx->dst, this->swizzleWidth(SkSpan(ctx->offsets,
3140                                                                           ctx->slots)));
3141                 opArg2 = this->ptrCtx(ctx->src, ctx->slots);
3142                 opArg3 = this->ptrCtx(ctx->indirectOffset, 1);
3143                 opSwizzle = this->swizzleOffsetSpan(SkSpan(ctx->offsets, ctx->slots));
3144                 break;
3145             }
3146             case POp::merge_condition_mask:
3147             case POp::merge_inv_condition_mask:
3148             case POp::add_float:   case POp::add_int:
3149             case POp::sub_float:   case POp::sub_int:
3150             case POp::mul_float:   case POp::mul_int:
3151             case POp::div_float:   case POp::div_int:   case POp::div_uint:
3152                                    case POp::bitwise_and_int:
3153                                    case POp::bitwise_or_int:
3154                                    case POp::bitwise_xor_int:
3155             case POp::mod_float:
3156             case POp::min_float:   case POp::min_int:   case POp::min_uint:
3157             case POp::max_float:   case POp::max_int:   case POp::max_uint:
3158             case POp::cmplt_float: case POp::cmplt_int: case POp::cmplt_uint:
3159             case POp::cmple_float: case POp::cmple_int: case POp::cmple_uint:
3160             case POp::cmpeq_float: case POp::cmpeq_int:
3161             case POp::cmpne_float: case POp::cmpne_int:
3162                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 1);
3163                 break;
3164 
3165             case POp::mix_float:   case POp::mix_int:
3166                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 1);
3167                 break;
3168 
3169             case POp::add_2_floats:   case POp::add_2_ints:
3170             case POp::sub_2_floats:   case POp::sub_2_ints:
3171             case POp::mul_2_floats:   case POp::mul_2_ints:
3172             case POp::div_2_floats:   case POp::div_2_ints:   case POp::div_2_uints:
3173                                       case POp::bitwise_and_2_ints:
3174                                       case POp::bitwise_or_2_ints:
3175                                       case POp::bitwise_xor_2_ints:
3176             case POp::mod_2_floats:
3177             case POp::min_2_floats:   case POp::min_2_ints:   case POp::min_2_uints:
3178             case POp::max_2_floats:   case POp::max_2_ints:   case POp::max_2_uints:
3179             case POp::cmplt_2_floats: case POp::cmplt_2_ints: case POp::cmplt_2_uints:
3180             case POp::cmple_2_floats: case POp::cmple_2_ints: case POp::cmple_2_uints:
3181             case POp::cmpeq_2_floats: case POp::cmpeq_2_ints:
3182             case POp::cmpne_2_floats: case POp::cmpne_2_ints:
3183                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 2);
3184                 break;
3185 
3186             case POp::mix_2_floats:   case POp::mix_2_ints:
3187                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 2);
3188                 break;
3189 
3190             case POp::add_3_floats:   case POp::add_3_ints:
3191             case POp::sub_3_floats:   case POp::sub_3_ints:
3192             case POp::mul_3_floats:   case POp::mul_3_ints:
3193             case POp::div_3_floats:   case POp::div_3_ints:   case POp::div_3_uints:
3194                                       case POp::bitwise_and_3_ints:
3195                                       case POp::bitwise_or_3_ints:
3196                                       case POp::bitwise_xor_3_ints:
3197             case POp::mod_3_floats:
3198             case POp::min_3_floats:   case POp::min_3_ints:   case POp::min_3_uints:
3199             case POp::max_3_floats:   case POp::max_3_ints:   case POp::max_3_uints:
3200             case POp::cmplt_3_floats: case POp::cmplt_3_ints: case POp::cmplt_3_uints:
3201             case POp::cmple_3_floats: case POp::cmple_3_ints: case POp::cmple_3_uints:
3202             case POp::cmpeq_3_floats: case POp::cmpeq_3_ints:
3203             case POp::cmpne_3_floats: case POp::cmpne_3_ints:
3204                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 3);
3205                 break;
3206 
3207             case POp::mix_3_floats:   case POp::mix_3_ints:
3208                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 3);
3209                 break;
3210 
3211             case POp::add_4_floats:   case POp::add_4_ints:
3212             case POp::sub_4_floats:   case POp::sub_4_ints:
3213             case POp::mul_4_floats:   case POp::mul_4_ints:
3214             case POp::div_4_floats:   case POp::div_4_ints:   case POp::div_4_uints:
3215                                       case POp::bitwise_and_4_ints:
3216                                       case POp::bitwise_or_4_ints:
3217                                       case POp::bitwise_xor_4_ints:
3218             case POp::mod_4_floats:
3219             case POp::min_4_floats:   case POp::min_4_ints:   case POp::min_4_uints:
3220             case POp::max_4_floats:   case POp::max_4_ints:   case POp::max_4_uints:
3221             case POp::cmplt_4_floats: case POp::cmplt_4_ints: case POp::cmplt_4_uints:
3222             case POp::cmple_4_floats: case POp::cmple_4_ints: case POp::cmple_4_uints:
3223             case POp::cmpeq_4_floats: case POp::cmpeq_4_ints:
3224             case POp::cmpne_4_floats: case POp::cmpne_4_ints:
3225                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 4);
3226                 break;
3227 
3228             case POp::mix_4_floats:   case POp::mix_4_ints:
3229                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 4);
3230                 break;
3231 
3232             case POp::add_n_floats:   case POp::add_n_ints:
3233             case POp::sub_n_floats:   case POp::sub_n_ints:
3234             case POp::mul_n_floats:   case POp::mul_n_ints:
3235             case POp::div_n_floats:   case POp::div_n_ints:   case POp::div_n_uints:
3236                                       case POp::bitwise_and_n_ints:
3237                                       case POp::bitwise_or_n_ints:
3238                                       case POp::bitwise_xor_n_ints:
3239             case POp::mod_n_floats:
3240             case POp::min_n_floats:   case POp::min_n_ints:   case POp::min_n_uints:
3241             case POp::max_n_floats:   case POp::max_n_ints:   case POp::max_n_uints:
3242             case POp::cmplt_n_floats: case POp::cmplt_n_ints: case POp::cmplt_n_uints:
3243             case POp::cmple_n_floats: case POp::cmple_n_ints: case POp::cmple_n_uints:
3244             case POp::cmpeq_n_floats: case POp::cmpeq_n_ints:
3245             case POp::cmpne_n_floats: case POp::cmpne_n_ints:
3246             case POp::atan2_n_floats:
3247             case POp::pow_n_floats:
3248                 std::tie(opArg1, opArg2) = this->adjacentBinaryOpCtx(stage.ctx);
3249                 break;
3250 
3251             case POp::mix_n_floats:        case POp::mix_n_ints:
3252             case POp::smoothstep_n_floats:
3253                 std::tie(opArg1, opArg2, opArg3) = this->adjacentTernaryOpCtx(stage.ctx);
3254                 break;
3255 
3256             case POp::jump:
3257             case POp::branch_if_all_lanes_active:
3258             case POp::branch_if_any_lanes_active:
3259             case POp::branch_if_no_lanes_active:
3260                 opArg1 = this->branchOffset(static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx),
3261                                             index);
3262                 break;
3263 
3264             case POp::branch_if_no_active_lanes_eq: {
3265                 const auto* ctx = static_cast<SkRasterPipeline_BranchIfEqualCtx*>(stage.ctx);
3266                 opArg1 = this->branchOffset(ctx, index);
3267                 opArg2 = this->ptrCtx(ctx->ptr, 1);
3268                 opArg3 = this->imm(sk_bit_cast<float>(ctx->value));
3269                 break;
3270             }
3271             case POp::trace_var: {
3272                 const auto* ctx = static_cast<SkRasterPipeline_TraceVarCtx*>(stage.ctx);
3273                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3274                 opArg2 = this->ptrCtx(ctx->data, ctx->numSlots);
3275                 if (ctx->indirectOffset != nullptr) {
3276                     opArg3 = " + " + this->ptrCtx(ctx->indirectOffset, 1);
3277                 }
3278                 break;
3279             }
3280             case POp::trace_line: {
3281                 const auto* ctx = static_cast<SkRasterPipeline_TraceLineCtx*>(stage.ctx);
3282                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3283                 opArg2 = std::to_string(ctx->lineNumber);
3284                 break;
3285             }
3286             case POp::trace_enter:
3287             case POp::trace_exit: {
3288                 const auto* ctx = static_cast<SkRasterPipeline_TraceFuncCtx*>(stage.ctx);
3289                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3290                 opArg2 = (fProgram.fDebugTrace &&
3291                           ctx->funcIdx >= 0 &&
3292                           ctx->funcIdx < (int)fProgram.fDebugTrace->fFuncInfo.size())
3293                                  ? fProgram.fDebugTrace->fFuncInfo[ctx->funcIdx].name
3294                                  : "???";
3295                 break;
3296             }
3297             case POp::trace_scope: {
3298                 const auto* ctx = static_cast<SkRasterPipeline_TraceScopeCtx*>(stage.ctx);
3299                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3300                 opArg2 = SkSL::String::printf("%+d", ctx->delta);
3301                 break;
3302             }
3303             default:
3304                 break;
3305         }
3306 
3307         std::string_view opName;
3308         switch (stage.op) {
3309         #define M(x) case POp::x: opName = #x; break;
3310             SK_RASTER_PIPELINE_OPS_ALL(M)
3311             SKRP_EXTENDED_OPS(M)
3312         #undef M
3313         }
3314 
3315         std::string opText;
3316         switch (stage.op) {
3317             case POp::trace_var:
3318                 opText = "TraceVar(" + opArg2 + opArg3 + ") when " + opArg1 + " is true";
3319                 break;
3320 
3321             case POp::trace_line:
3322                 opText = "TraceLine(" + opArg2 + ") when " + opArg1 + " is true";
3323                 break;
3324 
3325             case POp::trace_enter:
3326                 opText = "TraceEnter(" + opArg2 + ") when " + opArg1 + " is true";
3327                 break;
3328 
3329             case POp::trace_exit:
3330                 opText = "TraceExit(" + opArg2 + ") when " + opArg1 + " is true";
3331                 break;
3332 
3333             case POp::trace_scope:
3334                 opText = "TraceScope(" + opArg2 + ") when " + opArg1 + " is true";
3335                 break;
3336 
3337             case POp::init_lane_masks:
3338                 opText = "CondMask = LoopMask = RetMask = true";
3339                 break;
3340 
3341             case POp::load_condition_mask:
3342                 opText = "CondMask = " + opArg1;
3343                 break;
3344 
3345             case POp::store_condition_mask:
3346                 opText = opArg1 + " = CondMask";
3347                 break;
3348 
3349             case POp::merge_condition_mask:
3350                 opText = "CondMask = " + opArg1 + " & " + opArg2;
3351                 break;
3352 
3353             case POp::merge_inv_condition_mask:
3354                 opText = "CondMask = " + opArg1 + " & ~" + opArg2;
3355                 break;
3356 
3357             case POp::load_loop_mask:
3358                 opText = "LoopMask = " + opArg1;
3359                 break;
3360 
3361             case POp::store_loop_mask:
3362                 opText = opArg1 + " = LoopMask";
3363                 break;
3364 
3365             case POp::mask_off_loop_mask:
3366                 opText = "LoopMask &= ~(CondMask & LoopMask & RetMask)";
3367                 break;
3368 
3369             case POp::reenable_loop_mask:
3370                 opText = "LoopMask |= " + opArg1;
3371                 break;
3372 
3373             case POp::merge_loop_mask:
3374                 opText = "LoopMask &= " + opArg1;
3375                 break;
3376 
3377             case POp::load_return_mask:
3378                 opText = "RetMask = " + opArg1;
3379                 break;
3380 
3381             case POp::store_return_mask:
3382                 opText = opArg1 + " = RetMask";
3383                 break;
3384 
3385             case POp::mask_off_return_mask:
3386                 opText = "RetMask &= ~(CondMask & LoopMask & RetMask)";
3387                 break;
3388 
3389             case POp::store_src_rg:
3390                 opText = opArg1 + " = src.rg";
3391                 break;
3392 
3393             case POp::exchange_src:
3394                 opText = "swap(src.rgba, " + opArg1 + ")";
3395                 break;
3396 
3397             case POp::store_src:
3398                 opText = opArg1 + " = src.rgba";
3399                 break;
3400 
3401             case POp::store_dst:
3402                 opText = opArg1 + " = dst.rgba";
3403                 break;
3404 
3405             case POp::store_device_xy01:
3406                 opText = opArg1 + " = DeviceCoords.xy01";
3407                 break;
3408 
3409             case POp::load_src:
3410                 opText = "src.rgba = " + opArg1;
3411                 break;
3412 
3413             case POp::load_dst:
3414                 opText = "dst.rgba = " + opArg1;
3415                 break;
3416 
3417             case POp::bitwise_and_int:
3418             case POp::bitwise_and_2_ints:
3419             case POp::bitwise_and_3_ints:
3420             case POp::bitwise_and_4_ints:
3421             case POp::bitwise_and_n_ints:
3422             case POp::bitwise_and_imm_int:
3423             case POp::bitwise_and_imm_2_ints:
3424             case POp::bitwise_and_imm_3_ints:
3425             case POp::bitwise_and_imm_4_ints:
3426                 opText = opArg1 + " &= " + opArg2;
3427                 break;
3428 
3429             case POp::bitwise_or_int:
3430             case POp::bitwise_or_2_ints:
3431             case POp::bitwise_or_3_ints:
3432             case POp::bitwise_or_4_ints:
3433             case POp::bitwise_or_n_ints:
3434                 opText = opArg1 + " |= " + opArg2;
3435                 break;
3436 
3437             case POp::bitwise_xor_int:
3438             case POp::bitwise_xor_2_ints:
3439             case POp::bitwise_xor_3_ints:
3440             case POp::bitwise_xor_4_ints:
3441             case POp::bitwise_xor_n_ints:
3442             case POp::bitwise_xor_imm_int:
3443                 opText = opArg1 + " ^= " + opArg2;
3444                 break;
3445 
3446             case POp::cast_to_float_from_int:
3447             case POp::cast_to_float_from_2_ints:
3448             case POp::cast_to_float_from_3_ints:
3449             case POp::cast_to_float_from_4_ints:
3450                 opText = opArg1 + " = IntToFloat(" + opArg1 + ")";
3451                 break;
3452 
3453             case POp::cast_to_float_from_uint:
3454             case POp::cast_to_float_from_2_uints:
3455             case POp::cast_to_float_from_3_uints:
3456             case POp::cast_to_float_from_4_uints:
3457                 opText = opArg1 + " = UintToFloat(" + opArg1 + ")";
3458                 break;
3459 
3460             case POp::cast_to_int_from_float:
3461             case POp::cast_to_int_from_2_floats:
3462             case POp::cast_to_int_from_3_floats:
3463             case POp::cast_to_int_from_4_floats:
3464                 opText = opArg1 + " = FloatToInt(" + opArg1 + ")";
3465                 break;
3466 
3467             case POp::cast_to_uint_from_float:
3468             case POp::cast_to_uint_from_2_floats:
3469             case POp::cast_to_uint_from_3_floats:
3470             case POp::cast_to_uint_from_4_floats:
3471                 opText = opArg1 + " = FloatToUint(" + opArg1 + ")";
3472                 break;
3473 
3474             case POp::copy_slot_masked:            case POp::copy_2_slots_masked:
3475             case POp::copy_3_slots_masked:         case POp::copy_4_slots_masked:
3476             case POp::swizzle_copy_slot_masked:    case POp::swizzle_copy_2_slots_masked:
3477             case POp::swizzle_copy_3_slots_masked: case POp::swizzle_copy_4_slots_masked:
3478                 opText = opArg1 + " = Mask(" + opArg2 + ")";
3479                 break;
3480 
3481             case POp::copy_uniform:                case POp::copy_2_uniforms:
3482             case POp::copy_3_uniforms:             case POp::copy_4_uniforms:
3483             case POp::copy_slot_unmasked:          case POp::copy_2_slots_unmasked:
3484             case POp::copy_3_slots_unmasked:       case POp::copy_4_slots_unmasked:
3485             case POp::copy_immutable_unmasked:     case POp::copy_2_immutables_unmasked:
3486             case POp::copy_3_immutables_unmasked:  case POp::copy_4_immutables_unmasked:
3487             case POp::copy_constant:               case POp::splat_2_constants:
3488             case POp::splat_3_constants:           case POp::splat_4_constants:
3489             case POp::swizzle_1:                   case POp::swizzle_2:
3490             case POp::swizzle_3:                   case POp::swizzle_4:
3491             case POp::shuffle:
3492                 opText = opArg1 + " = " + opArg2;
3493                 break;
3494 
3495             case POp::copy_from_indirect_unmasked:
3496             case POp::copy_from_indirect_uniform_unmasked:
3497                 opText = opArg1 + " = Indirect(" + opArg2 + " + " + opArg3 + ")";
3498                 break;
3499 
3500             case POp::copy_to_indirect_masked:
3501                 opText = "Indirect(" + opArg1 + " + " + opArg3 + ") = Mask(" + opArg2 + ")";
3502                 break;
3503 
3504             case POp::swizzle_copy_to_indirect_masked:
3505                 opText = "Indirect(" + opArg1 + " + " + opArg3 + ")." + opSwizzle + " = Mask(" +
3506                          opArg2 + ")";
3507                 break;
3508 
3509             case POp::abs_int:
3510             case POp::abs_2_ints:
3511             case POp::abs_3_ints:
3512             case POp::abs_4_ints:
3513                 opText = opArg1 + " = abs(" + opArg1 + ")";
3514                 break;
3515 
3516             case POp::acos_float:
3517                 opText = opArg1 + " = acos(" + opArg1 + ")";
3518                 break;
3519 
3520             case POp::asin_float:
3521                 opText = opArg1 + " = asin(" + opArg1 + ")";
3522                 break;
3523 
3524             case POp::atan_float:
3525                 opText = opArg1 + " = atan(" + opArg1 + ")";
3526                 break;
3527 
3528             case POp::atan2_n_floats:
3529                 opText = opArg1 + " = atan2(" + opArg1 + ", " + opArg2 + ")";
3530                 break;
3531 
3532             case POp::ceil_float:
3533             case POp::ceil_2_floats:
3534             case POp::ceil_3_floats:
3535             case POp::ceil_4_floats:
3536                 opText = opArg1 + " = ceil(" + opArg1 + ")";
3537                 break;
3538 
3539             case POp::cos_float:
3540                 opText = opArg1 + " = cos(" + opArg1 + ")";
3541                 break;
3542 
3543             case POp::refract_4_floats:
3544                 opText = opArg1 + " = refract(" + opArg1 + ", " + opArg2 + ", " + opArg3 + ")";
3545                 break;
3546 
3547             case POp::dot_2_floats:
3548             case POp::dot_3_floats:
3549             case POp::dot_4_floats:
3550                 opText = opArg1 + " = dot(" + opArg2 + ", " + opArg3 + ")";
3551                 break;
3552 
3553             case POp::exp_float:
3554                 opText = opArg1 + " = exp(" + opArg1 + ")";
3555                 break;
3556 
3557             case POp::exp2_float:
3558                 opText = opArg1 + " = exp2(" + opArg1 + ")";
3559                 break;
3560 
3561             case POp::log_float:
3562                 opText = opArg1 + " = log(" + opArg1 + ")";
3563                 break;
3564 
3565             case POp::log2_float:
3566                 opText = opArg1 + " = log2(" + opArg1 + ")";
3567                 break;
3568 
3569             case POp::pow_n_floats:
3570                 opText = opArg1 + " = pow(" + opArg1 + ", " + opArg2 + ")";
3571                 break;
3572 
3573             case POp::sin_float:
3574                 opText = opArg1 + " = sin(" + opArg1 + ")";
3575                 break;
3576 
3577             case POp::sqrt_float:
3578                 opText = opArg1 + " = sqrt(" + opArg1 + ")";
3579                 break;
3580 
3581             case POp::tan_float:
3582                 opText = opArg1 + " = tan(" + opArg1 + ")";
3583                 break;
3584 
3585             case POp::floor_float:
3586             case POp::floor_2_floats:
3587             case POp::floor_3_floats:
3588             case POp::floor_4_floats:
3589                 opText = opArg1 + " = floor(" + opArg1 + ")";
3590                 break;
3591 
3592             case POp::invsqrt_float:
3593             case POp::invsqrt_2_floats:
3594             case POp::invsqrt_3_floats:
3595             case POp::invsqrt_4_floats:
3596                 opText = opArg1 + " = inversesqrt(" + opArg1 + ")";
3597                 break;
3598 
3599             case POp::inverse_mat2:
3600             case POp::inverse_mat3:
3601             case POp::inverse_mat4:
3602                 opText = opArg1 + " = inverse(" + opArg1 + ")";
3603                 break;
3604 
3605             case POp::add_float:     case POp::add_int:
3606             case POp::add_2_floats:  case POp::add_2_ints:
3607             case POp::add_3_floats:  case POp::add_3_ints:
3608             case POp::add_4_floats:  case POp::add_4_ints:
3609             case POp::add_n_floats:  case POp::add_n_ints:
3610             case POp::add_imm_float: case POp::add_imm_int:
3611                 opText = opArg1 + " += " + opArg2;
3612                 break;
3613 
3614             case POp::sub_float:    case POp::sub_int:
3615             case POp::sub_2_floats: case POp::sub_2_ints:
3616             case POp::sub_3_floats: case POp::sub_3_ints:
3617             case POp::sub_4_floats: case POp::sub_4_ints:
3618             case POp::sub_n_floats: case POp::sub_n_ints:
3619                 opText = opArg1 + " -= " + opArg2;
3620                 break;
3621 
3622             case POp::mul_float:     case POp::mul_int:
3623             case POp::mul_2_floats:  case POp::mul_2_ints:
3624             case POp::mul_3_floats:  case POp::mul_3_ints:
3625             case POp::mul_4_floats:  case POp::mul_4_ints:
3626             case POp::mul_n_floats:  case POp::mul_n_ints:
3627             case POp::mul_imm_float: case POp::mul_imm_int:
3628                 opText = opArg1 + " *= " + opArg2;
3629                 break;
3630 
3631             case POp::div_float:    case POp::div_int:    case POp::div_uint:
3632             case POp::div_2_floats: case POp::div_2_ints: case POp::div_2_uints:
3633             case POp::div_3_floats: case POp::div_3_ints: case POp::div_3_uints:
3634             case POp::div_4_floats: case POp::div_4_ints: case POp::div_4_uints:
3635             case POp::div_n_floats: case POp::div_n_ints: case POp::div_n_uints:
3636                 opText = opArg1 + " /= " + opArg2;
3637                 break;
3638 
3639             case POp::matrix_multiply_2:
3640             case POp::matrix_multiply_3:
3641             case POp::matrix_multiply_4:
3642                 opText = opArg1 + " = " + opArg2 + " * " + opArg3;
3643                 break;
3644 
3645             case POp::mod_float:
3646             case POp::mod_2_floats:
3647             case POp::mod_3_floats:
3648             case POp::mod_4_floats:
3649             case POp::mod_n_floats:
3650                 opText = opArg1 + " = mod(" + opArg1 + ", " + opArg2 + ")";
3651                 break;
3652 
3653             case POp::min_float:        case POp::min_int:          case POp::min_uint:
3654             case POp::min_2_floats:     case POp::min_2_ints:       case POp::min_2_uints:
3655             case POp::min_3_floats:     case POp::min_3_ints:       case POp::min_3_uints:
3656             case POp::min_4_floats:     case POp::min_4_ints:       case POp::min_4_uints:
3657             case POp::min_n_floats:     case POp::min_n_ints:       case POp::min_n_uints:
3658             case POp::min_imm_float:
3659                 opText = opArg1 + " = min(" + opArg1 + ", " + opArg2 + ")";
3660                 break;
3661 
3662             case POp::max_float:        case POp::max_int:          case POp::max_uint:
3663             case POp::max_2_floats:     case POp::max_2_ints:       case POp::max_2_uints:
3664             case POp::max_3_floats:     case POp::max_3_ints:       case POp::max_3_uints:
3665             case POp::max_4_floats:     case POp::max_4_ints:       case POp::max_4_uints:
3666             case POp::max_n_floats:     case POp::max_n_ints:       case POp::max_n_uints:
3667             case POp::max_imm_float:
3668                 opText = opArg1 + " = max(" + opArg1 + ", " + opArg2 + ")";
3669                 break;
3670 
3671             case POp::cmplt_float:     case POp::cmplt_int:     case POp::cmplt_uint:
3672             case POp::cmplt_2_floats:  case POp::cmplt_2_ints:  case POp::cmplt_2_uints:
3673             case POp::cmplt_3_floats:  case POp::cmplt_3_ints:  case POp::cmplt_3_uints:
3674             case POp::cmplt_4_floats:  case POp::cmplt_4_ints:  case POp::cmplt_4_uints:
3675             case POp::cmplt_n_floats:  case POp::cmplt_n_ints:  case POp::cmplt_n_uints:
3676             case POp::cmplt_imm_float: case POp::cmplt_imm_int: case POp::cmplt_imm_uint:
3677                 opText = opArg1 + " = lessThan(" + opArg1 + ", " + opArg2 + ")";
3678                 break;
3679 
3680             case POp::cmple_float:     case POp::cmple_int:     case POp::cmple_uint:
3681             case POp::cmple_2_floats:  case POp::cmple_2_ints:  case POp::cmple_2_uints:
3682             case POp::cmple_3_floats:  case POp::cmple_3_ints:  case POp::cmple_3_uints:
3683             case POp::cmple_4_floats:  case POp::cmple_4_ints:  case POp::cmple_4_uints:
3684             case POp::cmple_n_floats:  case POp::cmple_n_ints:  case POp::cmple_n_uints:
3685             case POp::cmple_imm_float: case POp::cmple_imm_int: case POp::cmple_imm_uint:
3686                 opText = opArg1 + " = lessThanEqual(" + opArg1 + ", " + opArg2 + ")";
3687                 break;
3688 
3689             case POp::cmpeq_float:     case POp::cmpeq_int:
3690             case POp::cmpeq_2_floats:  case POp::cmpeq_2_ints:
3691             case POp::cmpeq_3_floats:  case POp::cmpeq_3_ints:
3692             case POp::cmpeq_4_floats:  case POp::cmpeq_4_ints:
3693             case POp::cmpeq_n_floats:  case POp::cmpeq_n_ints:
3694             case POp::cmpeq_imm_float: case POp::cmpeq_imm_int:
3695                 opText = opArg1 + " = equal(" + opArg1 + ", " + opArg2 + ")";
3696                 break;
3697 
3698             case POp::cmpne_float:     case POp::cmpne_int:
3699             case POp::cmpne_2_floats:  case POp::cmpne_2_ints:
3700             case POp::cmpne_3_floats:  case POp::cmpne_3_ints:
3701             case POp::cmpne_4_floats:  case POp::cmpne_4_ints:
3702             case POp::cmpne_n_floats:  case POp::cmpne_n_ints:
3703             case POp::cmpne_imm_float: case POp::cmpne_imm_int:
3704                 opText = opArg1 + " = notEqual(" + opArg1 + ", " + opArg2 + ")";
3705                 break;
3706 
3707             case POp::mix_float:      case POp::mix_int:
3708             case POp::mix_2_floats:   case POp::mix_2_ints:
3709             case POp::mix_3_floats:   case POp::mix_3_ints:
3710             case POp::mix_4_floats:   case POp::mix_4_ints:
3711             case POp::mix_n_floats:   case POp::mix_n_ints:
3712                 opText = opArg1 + " = mix(" + opArg2 + ", " + opArg3 + ", " + opArg1 + ")";
3713                 break;
3714 
3715             case POp::smoothstep_n_floats:
3716                 opText = opArg1 + " = smoothstep(" + opArg1 + ", " + opArg2 + ", " + opArg3 + ")";
3717                 break;
3718 
3719             case POp::jump:
3720             case POp::branch_if_all_lanes_active:
3721             case POp::branch_if_any_lanes_active:
3722             case POp::branch_if_no_lanes_active:
3723             case POp::invoke_shader:
3724             case POp::invoke_color_filter:
3725             case POp::invoke_blender:
3726                 opText = std::string(opName) + " " + opArg1;
3727                 break;
3728 
3729             case POp::invoke_to_linear_srgb:
3730                 opText = opArg1 + " = toLinearSrgb(" + opArg1 + ")";
3731                 break;
3732 
3733             case POp::invoke_from_linear_srgb:
3734                 opText = opArg1 + " = fromLinearSrgb(" + opArg1 + ")";
3735                 break;
3736 
3737             case POp::branch_if_no_active_lanes_eq:
3738                 opText = "branch " + opArg1 + " if no lanes of " + opArg2 + " == " + opArg3;
3739                 break;
3740 
3741             case POp::label:
3742                 opText = "label " + opArg1;
3743                 break;
3744 
3745             case POp::case_op:
3746                 opText = "if (" + opArg1 + " == " + opArg3 +
3747                          ") { LoopMask = true; " + opArg2 + " = false; }";
3748                 break;
3749 
3750             case POp::continue_op:
3751                 opText = opArg1 +
3752                          " |= Mask(0xFFFFFFFF); LoopMask &= ~(CondMask & LoopMask & RetMask)";
3753                 break;
3754 
3755             default:
3756                 break;
3757         }
3758 
3759         opName = opName.substr(0, 30);
3760         if (!opText.empty()) {
3761             out->writeText(SkSL::String::printf("%-30.*s %s\n",
3762                                                 (int)opName.size(), opName.data(),
3763                                                 opText.c_str()).c_str());
3764         } else {
3765             out->writeText(SkSL::String::printf("%.*s\n",
3766                                                 (int)opName.size(), opName.data()).c_str());
3767         }
3768     }
3769 }
3770 
dump(SkWStream * out,bool writeInstructionCount) const3771 void Program::dump(SkWStream* out, bool writeInstructionCount) const {
3772     Dumper(*this).dump(out, writeInstructionCount);
3773 }
3774 
3775 }  // namespace SkSL::RP
3776