1 /*
2  * Copyright (C) 2022 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #pragma once
25 
26 #if !defined(PAN_ARCH) || PAN_ARCH < 10
27 #error "cs_builder.h requires PAN_ARCH >= 10"
28 #endif
29 
30 #include "gen_macros.h"
31 
32 #include "util/bitset.h"
33 #include "util/u_dynarray.h"
34 
35 /*
36  * cs_builder implements a builder for CSF command streams. It manages the
37  * allocation and overflow behaviour of queues and provides helpers for emitting
38  * commands to run on the CSF pipe.
39  *
40  * Users are responsible for the CS buffer allocation and must initialize the
41  * command stream with an initial buffer using cs_builder_init(). The CS can
42  * be extended with new buffers allocated with cs_builder_conf::alloc_buffer()
43  * if the builder runs out of memory.
44  */
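
/*
 * A minimal usage sketch (not part of the API): my_alloc_cs_buffer(),
 * my_device, my_submit() and the numbers below are hypothetical; they only
 * illustrate how the pieces fit together.
 *
 *    static struct cs_buffer
 *    my_alloc_cs_buffer(void *cookie)
 *    {
 *       struct my_device *dev = cookie;
 *
 *       // Return a mapped, GPU-visible buffer; capacity is expressed in
 *       // 64-bit instruction slots.
 *       return (struct cs_buffer){
 *          .cpu = my_bo_map(dev),
 *          .gpu = my_bo_gpu_va(dev),
 *          .capacity = 4096,
 *       };
 *    }
 *
 *    struct cs_builder_conf conf = {
 *       .nr_registers = 96,
 *       .nr_kernel_registers = 4,
 *       .alloc_buffer = my_alloc_cs_buffer,
 *       .cookie = dev,
 *    };
 *    struct cs_builder b;
 *
 *    cs_builder_init(&b, &conf, my_alloc_cs_buffer(dev));
 *    // ... emit instructions with the cs_*() helpers ...
 *    cs_finish(&b);
 *
 *    if (cs_is_valid(&b))
 *       my_submit(dev, cs_root_chunk_gpu_addr(&b), cs_root_chunk_size(&b));
 */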
45 
46 struct cs_buffer {
47    /* CPU pointer */
48    uint64_t *cpu;
49 
50    /* GPU pointer */
51    uint64_t gpu;
52 
53    /* Capacity in number of 64-bit instructions */
54    uint32_t capacity;
55 };
56 
57 /**
58  * This is used to check that:
59  * 1. registers are not used as a source after being loaded without a
60  *    WAIT(<ls_scoreboard>) in the middle
61  * 2. registers are not reused (used as a destination) after they served as a
62  *    STORE() source without a WAIT(<ls_scoreboard>) in the middle
63  */
64 struct cs_load_store_tracker {
65    BITSET_DECLARE(pending_loads, 256);
66    BITSET_DECLARE(pending_stores, 256);
67    uint8_t sb_slot;
68 };
69 
70 /**
71  * This is used to determine which registers have been written to (i.e. used
72  * as an instruction's destination).
73  */
74 struct cs_dirty_tracker {
75    BITSET_DECLARE(regs, 256);
76 };
77 
78 enum cs_reg_perm {
79    CS_REG_NO_ACCESS = 0,
80    CS_REG_RD = BITFIELD_BIT(1),
81    CS_REG_WR = BITFIELD_BIT(2),
82    CS_REG_RW = CS_REG_RD | CS_REG_WR,
83 };
84 
85 struct cs_builder;
86 
87 typedef enum cs_reg_perm (*reg_perm_cb_t)(struct cs_builder *b, unsigned reg);
88 
89 struct cs_builder_conf {
90    /* Number of 32-bit registers in the hardware register file */
91    uint8_t nr_registers;
92 
93    /* Number of 32-bit registers used by the kernel at submission time */
94    uint8_t nr_kernel_registers;
95 
96    /* CS buffer allocator */
97    struct cs_buffer (*alloc_buffer)(void *cookie);
98 
99    /* Optional load/store tracker. */
100    struct cs_load_store_tracker *ls_tracker;
101 
102    /* Optional dirty registers tracker. */
103    struct cs_dirty_tracker *dirty_tracker;
104 
105    /* Optional register access checker. */
106    reg_perm_cb_t reg_perm;
107 
108    /* Cookie passed back to alloc_buffer() */
109    void *cookie;
110 };
111 
112 /* The CS is formed of one or more CS chunks linked with JUMP instructions.
113  * The builder keeps track of the current chunk and the position inside this
114  * chunk, so it can emit new instructions, and decide when a new chunk needs
115  * to be allocated.
116  */
117 struct cs_chunk {
118    /* CS buffer object backing this chunk */
119    struct cs_buffer buffer;
120 
121    union {
122       /* Current position in the buffer object when the chunk is active. */
123       uint32_t pos;
124 
125       /* Chunk size when the chunk was wrapped. */
126       uint32_t size;
127    };
128 };
129 
130 /* Monolithic sequence of instructions. Must live in a virtually contiguous
131  * portion of code.
132  */
133 struct cs_block {
134    /* Used to insert the block in the block stack. */
135    struct cs_block *next;
136 };
137 
138 #define CS_LABEL_INVALID_POS ~0u
139 
140 /* Labels can only be used inside a cs_block. They can be defined and
141  * referenced before they are set to point to a specific position
142  * in the block. */
143 struct cs_label {
144    /* The last reference we have seen pointing to this label before
145     * it was set. If set to CS_LABEL_INVALID_POS, no forward reference
146     * pointing to this label exists.
147     */
148    uint32_t last_forward_ref;
149 
150    /* The label target. If set to CS_LABEL_INVALID_POS, the label has
151     * not been set yet.
152     */
153    uint32_t target;
154 };
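
/*
 * Forward-branch sketch (register numbers are illustrative and assume a
 * builder `b` that was already initialized): a label can be referenced by
 * cs_branch_label() before cs_set_label() pins it to a position.
 *
 *    struct cs_block block;
 *    struct cs_label skip;
 *
 *    cs_block_start(b, &block);
 *    cs_label_init(&skip);
 *    // Forward reference: the offset is patched when the label is set.
 *    cs_branch_label(b, &skip, MALI_CS_CONDITION_EQUAL, cs_reg32(b, 42));
 *    cs_move32_to(b, cs_reg32(b, 43), 0xcafe);
 *    cs_set_label(b, &skip);
 *    cs_block_end(b, &block);
 */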
155 
156 /* CS if/else block. */
157 struct cs_if_else {
158    struct cs_block block;
159    struct cs_label end_label;
160 };
161 
162 struct cs_builder {
163    /* CS builder configuration */
164    struct cs_builder_conf conf;
165 
166    /* True if an allocation failed, making the whole CS invalid. */
167    bool invalid;
168 
169    /* Initial (root) CS chunk. */
170    struct cs_chunk root_chunk;
171 
172    /* Current CS chunk. */
173    struct cs_chunk cur_chunk;
174 
175    /* Temporary storage for inner blocks that need to be built
176     * and copied in one monolithic sequence of instructions with no
177     * jump in the middle.
178     */
179    struct {
180       struct cs_block *stack;
181       struct util_dynarray instrs;
182       struct cs_if_else pending_if;
183       unsigned last_load_ip_target;
184    } blocks;
185 
186    /* Move immediate instruction at the end of the last CS chunk that needs to
187     * be patched with the final length of the current CS chunk in order to
188     * facilitate correct overflow behaviour.
189     */
190    uint32_t *length_patch;
191 
192    /* Used as temporary storage when the allocator couldn't allocate a new
193     * CS chunk.
194     */
195    uint64_t discard_instr_slot;
196 };
197 
198 static inline void
199 cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
200                 struct cs_buffer root_buffer)
201 {
202    *b = (struct cs_builder){
203       .conf = *conf,
204       .root_chunk.buffer = root_buffer,
205       .cur_chunk.buffer = root_buffer,
206    };
207 
208    /* We need at least 3 registers for CS chunk linking. Assume the kernel needs
209     * at least that too.
210     */
211    b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);
212 
213    util_dynarray_init(&b->blocks.instrs, NULL);
214 }
215 
216 static inline bool
217 cs_is_valid(struct cs_builder *b)
218 {
219    return !b->invalid;
220 }
221 
222 static inline bool
223 cs_is_empty(struct cs_builder *b)
224 {
225    return b->cur_chunk.pos == 0 &&
226           b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu;
227 }
228 
229 static inline uint64_t
230 cs_root_chunk_gpu_addr(struct cs_builder *b)
231 {
232    return b->root_chunk.buffer.gpu;
233 }
234 
235 static inline uint32_t
236 cs_root_chunk_size(struct cs_builder *b)
237 {
238    /* Make sure cs_finish() was called. */
239    assert(!memcmp(&b->cur_chunk, &(struct cs_chunk){0}, sizeof(b->cur_chunk)));
240 
241    return b->root_chunk.size * sizeof(uint64_t);
242 }
243 
244 /*
245  * Wrap the current queue. External users shouldn't call this function
246  * directly; they should call cs_finish() when they are done building
247  * the command stream, which will in turn call cs_wrap_chunk().
248  *
249  * Internally, this is also used to finalize internal CS chunks when
250  * allocating new sub-chunks. See cs_alloc_chunk() for details.
251  *
252  * This notably requires patching the previous chunk with the length
253  * we ended up emitting for this chunk.
254  */
255 static inline void
256 cs_wrap_chunk(struct cs_builder *b)
257 {
258    if (!cs_is_valid(b))
259       return;
260 
261    if (b->length_patch) {
262       *b->length_patch = (b->cur_chunk.pos * 8);
263       b->length_patch = NULL;
264    }
265 
266    if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu)
267       b->root_chunk.size = b->cur_chunk.size;
268 }
269 
270 enum cs_index_type {
271    CS_INDEX_REGISTER = 0,
272    CS_INDEX_UNDEF,
273 };
274 
275 struct cs_index {
276    enum cs_index_type type;
277 
278    /* Number of 32-bit words in the index, must be nonzero */
279    uint8_t size;
280 
281    union {
282       uint64_t imm;
283       uint8_t reg;
284    };
285 };
286 
287 static inline struct cs_index
288 cs_undef(void)
289 {
290    return (struct cs_index){
291       .type = CS_INDEX_UNDEF,
292    };
293 }
294 
295 static inline uint8_t
296 cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size)
297 {
298    assert(idx.type == CS_INDEX_REGISTER);
299    assert(idx.size == expected_size);
300 
301    return idx.reg;
302 }
303 
304 static inline unsigned
305 cs_src_tuple(struct cs_builder *b, struct cs_index src, ASSERTED unsigned count,
306              uint16_t mask)
307 {
308    unsigned reg = cs_to_reg_tuple(src, count);
309 
310    if (unlikely(b->conf.reg_perm)) {
311       for (unsigned i = reg; i < reg + count; i++) {
312          if (mask & BITFIELD_BIT(i - reg)) {
313             assert((b->conf.reg_perm(b, i) & CS_REG_RD) ||
314                    !"Trying to read a restricted register");
315          }
316       }
317    }
318 
319    struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;
320 
321    if (unlikely(ls_tracker)) {
322       for (unsigned i = reg; i < reg + count; i++) {
323          if ((mask & BITFIELD_BIT(i - reg)) &&
324              BITSET_TEST(ls_tracker->pending_loads, i))
325             assert(!"register used as a source before flushing loads\n");
326       }
327    }
328 
329    return reg;
330 }
331 
332 static inline unsigned
333 cs_src32(struct cs_builder *b, struct cs_index src)
334 {
335    return cs_src_tuple(b, src, 1, BITFIELD_MASK(1));
336 }
337 
338 static inline unsigned
339 cs_src64(struct cs_builder *b, struct cs_index src)
340 {
341    return cs_src_tuple(b, src, 2, BITFIELD_MASK(2));
342 }
343 
344 static inline unsigned
345 cs_dst_tuple(struct cs_builder *b, struct cs_index dst, ASSERTED unsigned count,
346              uint16_t mask)
347 {
348    unsigned reg = cs_to_reg_tuple(dst, count);
349 
350    if (unlikely(b->conf.reg_perm)) {
351       for (unsigned i = reg; i < reg + count; i++) {
352          if (mask & BITFIELD_BIT(i - reg)) {
353             assert((b->conf.reg_perm(b, i) & CS_REG_WR) ||
354                    !"Trying to write a restricted register");
355          }
356       }
357    }
358 
359    struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;
360 
361    if (unlikely(ls_tracker)) {
362       for (unsigned i = reg; i < reg + count; i++) {
363          if ((mask & BITFIELD_BIT(i - reg)) &&
364              BITSET_TEST(ls_tracker->pending_stores, i))
365             assert(
366                !"register reused as a destination before flushing stores\n");
367       }
368    }
369 
370    if (unlikely(b->conf.dirty_tracker)) {
371       for (unsigned i = reg; i < reg + count; i++) {
372          if (mask & BITFIELD_BIT(i - reg))
373             BITSET_SET(b->conf.dirty_tracker->regs, i);
374       }
375    }
376 
377    return reg;
378 }
379 
380 static inline unsigned
381 cs_dst32(struct cs_builder *b, struct cs_index dst)
382 {
383    return cs_dst_tuple(b, dst, 1, BITFIELD_MASK(1));
384 }
385 
386 static inline unsigned
387 cs_dst64(struct cs_builder *b, struct cs_index dst)
388 {
389    return cs_dst_tuple(b, dst, 2, BITFIELD_MASK(2));
390 }
391 
392 static inline struct cs_index
393 cs_reg_tuple(ASSERTED struct cs_builder *b, unsigned reg, unsigned size)
394 {
395    assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers &&
396           "overflowed register file");
397    assert(size <= 16 && "unsupported");
398 
399    return (struct cs_index){
400       .type = CS_INDEX_REGISTER,
401       .size = size,
402       .reg = reg,
403    };
404 }
405 
406 static inline struct cs_index
407 cs_reg32(struct cs_builder *b, unsigned reg)
408 {
409    return cs_reg_tuple(b, reg, 1);
410 }
411 
412 static inline struct cs_index
413 cs_reg64(struct cs_builder *b, unsigned reg)
414 {
415    assert((reg % 2) == 0 && "unaligned 64-bit reg");
416    return cs_reg_tuple(b, reg, 2);
417 }
418 
419 /*
420  * The top of the register file is reserved for cs_builder internal use. We
421  * need 3 spare registers for handling command queue overflow. These are
422  * available here.
423  */
424 static inline uint8_t
425 cs_overflow_address_reg(struct cs_builder *b)
426 {
427    return b->conf.nr_registers - 2;
428 }
429 
430 static inline uint8_t
431 cs_overflow_length_reg(struct cs_builder *b)
432 {
433    return b->conf.nr_registers - 3;
434 }
435 
436 static inline struct cs_index
437 cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word)
438 {
439    assert(idx.type == CS_INDEX_REGISTER && "unsupported");
440    assert(word < idx.size && "overrun");
441 
442    return cs_reg32(b, idx.reg + word);
443 }
444 
445 static inline struct cs_block *
446 cs_cur_block(struct cs_builder *b)
447 {
448    return b->blocks.stack;
449 }
450 
451 #define JUMP_SEQ_INSTR_COUNT 4
452 
453 static inline bool
454 cs_reserve_instrs(struct cs_builder *b, uint32_t num_instrs)
455 {
456    /* Don't call this function with num_instrs=0. */
457    assert(num_instrs > 0);
458    assert(cs_cur_block(b) == NULL);
459 
460    /* If an allocation failure happened before, we just discard all following
461     * instructions.
462     */
463    if (unlikely(!cs_is_valid(b)))
464       return false;
465 
466    /* Lazy root chunk allocation. */
467    if (unlikely(!b->root_chunk.buffer.cpu)) {
468       b->root_chunk.buffer = b->conf.alloc_buffer(b->conf.cookie);
469       b->cur_chunk.buffer = b->root_chunk.buffer;
470       if (!b->cur_chunk.buffer.cpu) {
471          b->invalid = true;
472          return false;
473       }
474    }
475 
476    /* Make sure the instruction sequence fits in a single chunk. */
477    assert(b->cur_chunk.buffer.capacity >= num_instrs);
478 
479    /* If the current chunk runs out of space, allocate a new one and jump to it.
480     * We actually do this a few instructions before running out, because the
481     * sequence to jump to a new queue takes multiple instructions.
482     */
483    if (unlikely((b->cur_chunk.size + num_instrs + JUMP_SEQ_INSTR_COUNT) >
484                 b->cur_chunk.buffer.capacity)) {
485       /* Now, allocate a new chunk */
486       struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie);
487 
488       /* Allocation failure, from now on, all new instructions will be
489        * discarded.
490        */
491       if (unlikely(!newbuf.cpu)) {
492          b->invalid = true;
493          return false;
494       }
495 
496       uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);
497 
498       pan_cast_and_pack(ptr, CS_MOVE, I) {
499          I.destination = cs_overflow_address_reg(b);
500          I.immediate = newbuf.gpu;
501       }
502 
503       ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);
504 
505       pan_cast_and_pack(ptr, CS_MOVE32, I) {
506          I.destination = cs_overflow_length_reg(b);
507       }
508 
509       /* The length will be patched in later */
510       uint32_t *length_patch = (uint32_t *)ptr;
511 
512       ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);
513 
514       pan_cast_and_pack(ptr, CS_JUMP, I) {
515          I.length = cs_overflow_length_reg(b);
516          I.address = cs_overflow_address_reg(b);
517       }
518 
519       /* Now that we've emitted everything, finish up the previous queue */
520       cs_wrap_chunk(b);
521 
522       /* And make this one current */
523       b->length_patch = length_patch;
524       b->cur_chunk.buffer = newbuf;
525       b->cur_chunk.pos = 0;
526    }
527 
528    return true;
529 }
530 
531 static inline void *
532 cs_alloc_ins_block(struct cs_builder *b, uint32_t num_instrs)
533 {
534    if (cs_cur_block(b))
535       return util_dynarray_grow(&b->blocks.instrs, uint64_t, num_instrs);
536 
537    if (!cs_reserve_instrs(b, num_instrs))
538       return NULL;
539 
540    assert(b->cur_chunk.size + num_instrs - 1 < b->cur_chunk.buffer.capacity);
541    uint32_t pos = b->cur_chunk.pos;
542    b->cur_chunk.pos += num_instrs;
543    return b->cur_chunk.buffer.cpu + pos;
544 }
545 
546 static inline void
547 cs_flush_block_instrs(struct cs_builder *b)
548 {
549    if (cs_cur_block(b) != NULL)
550       return;
551 
552    uint32_t num_instrs =
553       util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
554    if (!num_instrs)
555       return;
556 
557    /* If LOAD_IP is the last instruction in the block, we reserve one more
558     * slot to make sure the next instruction won't point to a CS chunk linking
559     * sequence. */
560    if (unlikely(b->blocks.last_load_ip_target >= num_instrs)) {
561       if (!cs_reserve_instrs(b, num_instrs + 1))
562          return;
563    }
564 
565    void *buffer = cs_alloc_ins_block(b, num_instrs);
566 
567    if (likely(buffer != NULL)) {
568       /* If we have a LOAD_IP chain, we need to patch each LOAD_IP
569        * instruction before we copy the block to the final memory
570        * region. */
571       while (unlikely(b->blocks.last_load_ip_target)) {
572          uint64_t *instr = util_dynarray_element(
573             &b->blocks.instrs, uint64_t, b->blocks.last_load_ip_target - 1);
574          unsigned prev_load_ip_target = *instr & BITFIELD_MASK(32);
575          uint64_t ip =
576             b->cur_chunk.buffer.gpu +
577             ((b->cur_chunk.pos - num_instrs + b->blocks.last_load_ip_target) *
578              sizeof(uint64_t));
579 
580          /* Drop the prev_load_ip_target value and replace it by the final
581           * IP. */
582          *instr &= ~BITFIELD64_MASK(32);
583          *instr |= ip;
584 
585          b->blocks.last_load_ip_target = prev_load_ip_target;
586       }
587 
588       memcpy(buffer, b->blocks.instrs.data, b->blocks.instrs.size);
589    }
590 
591    util_dynarray_clear(&b->blocks.instrs);
592 }
593 
594 static inline uint32_t
595 cs_block_next_pos(struct cs_builder *b)
596 {
597    assert(cs_cur_block(b) != NULL);
598 
599    return util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
600 }
601 
602 static inline void
603 cs_label_init(struct cs_label *label)
604 {
605    label->last_forward_ref = CS_LABEL_INVALID_POS;
606    label->target = CS_LABEL_INVALID_POS;
607 }
608 
609 static inline void
610 cs_set_label(struct cs_builder *b, struct cs_label *label)
611 {
612    assert(label->target == CS_LABEL_INVALID_POS);
613    label->target = cs_block_next_pos(b);
614 
615    for (uint32_t next_forward_ref, forward_ref = label->last_forward_ref;
616         forward_ref != CS_LABEL_INVALID_POS; forward_ref = next_forward_ref) {
617       uint64_t *ins =
618          util_dynarray_element(&b->blocks.instrs, uint64_t, forward_ref);
619 
620       assert(forward_ref < label->target);
621       assert(label->target - forward_ref <= INT16_MAX);
622 
623       /* Save the next forward reference to this target before overwriting
624        * it with the final offset.
625        */
626       int16_t offset = *ins & BITFIELD64_MASK(16);
627 
628       next_forward_ref =
629          offset > 0 ? forward_ref - offset : CS_LABEL_INVALID_POS;
630 
631       assert(next_forward_ref == CS_LABEL_INVALID_POS ||
632              next_forward_ref < forward_ref);
633 
634       *ins &= ~BITFIELD64_MASK(16);
635       *ins |= label->target - forward_ref - 1;
636    }
637 }
638 
639 static inline void
640 cs_flush_pending_if(struct cs_builder *b)
641 {
642    if (likely(cs_cur_block(b) != &b->blocks.pending_if.block))
643       return;
644 
645    cs_set_label(b, &b->blocks.pending_if.end_label);
646    b->blocks.stack = b->blocks.pending_if.block.next;
647    cs_flush_block_instrs(b);
648 }
649 
650 static inline void *
651 cs_alloc_ins(struct cs_builder *b)
652 {
653    /* If an instruction is emitted after an if_end(), it flushes the pending if,
654     * causing any further cs_else_start() call to be invalid. */
655    cs_flush_pending_if(b);
656 
657    return cs_alloc_ins_block(b, 1) ?: &b->discard_instr_slot;
658 }
659 
660 /* Call this when you are done building a command stream and want to prepare
661  * it for submission.
662  */
663 static inline void
664 cs_finish(struct cs_builder *b)
665 {
666    if (!cs_is_valid(b))
667       return;
668 
669    cs_flush_pending_if(b);
670    cs_wrap_chunk(b);
671 
672    /* This prevents adding instructions after that point. */
673    memset(&b->cur_chunk, 0, sizeof(b->cur_chunk));
674 
675    util_dynarray_fini(&b->blocks.instrs);
676 }
677 
678 /*
679  * Helper to emit a new instruction into the command queue. The allocation needs
680  * to be separated out because pan_pack can evaluate its argument multiple
681  * times, yet cs_alloc_ins() has side effects.
682  */
683 #define cs_emit(b, T, cfg) pan_cast_and_pack(cs_alloc_ins(b), CS_##T, cfg)
684 
685 /* Asynchronous operations take a mask of scoreboard slots to wait on
686  * before executing the instruction, and signal a scoreboard slot when
687  * the operation is complete.
688  * A wait_mask of zero means the operation is synchronous, and signal_slot
689  * is ignored in that case.
690  */
691 struct cs_async_op {
692    uint16_t wait_mask;
693    uint8_t signal_slot;
694 };
695 
696 static inline struct cs_async_op
697 cs_defer(unsigned wait_mask, unsigned signal_slot)
698 {
699    /* The scoreboard slot to signal is incremented before the wait operation,
700     * so waiting on it would cause an infinite wait.
701     */
702    assert(!(wait_mask & BITFIELD_BIT(signal_slot)));
703 
704    return (struct cs_async_op){
705       .wait_mask = wait_mask,
706       .signal_slot = signal_slot,
707    };
708 }
709 
710 static inline struct cs_async_op
711 cs_now(void)
712 {
713    return (struct cs_async_op){
714       .wait_mask = 0,
715       .signal_slot = ~0,
716    };
717 }
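
/*
 * Usage sketch (scoreboard slot numbers are illustrative): helpers that take
 * a struct cs_async_op, such as cs_sync32_add() or cs_finish_fragment(),
 * accept either of these descriptors.
 *
 *    // Execute synchronously, right away.
 *    struct cs_async_op now = cs_now();
 *
 *    // Wait on scoreboard slots 0 and 1, signal slot 2 on completion.
 *    struct cs_async_op deferred =
 *       cs_defer(BITFIELD_BIT(0) | BITFIELD_BIT(1), 2);
 */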
718 
719 static inline bool
720 cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
721 {
722    switch (opcode) {
723    case MALI_CS_OPCODE_FLUSH_CACHE2:
724    case MALI_CS_OPCODE_FINISH_TILING:
725    case MALI_CS_OPCODE_LOAD_MULTIPLE:
726    case MALI_CS_OPCODE_STORE_MULTIPLE:
727    case MALI_CS_OPCODE_RUN_COMPUTE:
728    case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
729    case MALI_CS_OPCODE_RUN_FRAGMENT:
730    case MALI_CS_OPCODE_RUN_FULLSCREEN:
731    case MALI_CS_OPCODE_RUN_IDVS:
732    case MALI_CS_OPCODE_RUN_TILING:
733       /* Always asynchronous. */
734       return true;
735 
736    case MALI_CS_OPCODE_FINISH_FRAGMENT:
737    case MALI_CS_OPCODE_SYNC_ADD32:
738    case MALI_CS_OPCODE_SYNC_SET32:
739    case MALI_CS_OPCODE_SYNC_ADD64:
740    case MALI_CS_OPCODE_SYNC_SET64:
741    case MALI_CS_OPCODE_STORE_STATE:
742    case MALI_CS_OPCODE_TRACE_POINT:
743    case MALI_CS_OPCODE_HEAP_OPERATION:
744       /* Asynchronous only if wait_mask != 0. */
745       return wait_mask != 0;
746 
747    default:
748       return false;
749    }
750 }
751 
752 #define cs_apply_async(I, async)                                               \
753    do {                                                                        \
754       I.wait_mask = async.wait_mask;                                           \
755       I.signal_slot = cs_instr_is_asynchronous(I.opcode, I.wait_mask)          \
756                          ? async.signal_slot                                   \
757                          : 0;                                                  \
758       assert(I.signal_slot != ~0 ||                                            \
759              !"Can't use cs_now() on pure async instructions");                \
760    } while (0)
761 
762 static inline void
763 cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm)
764 {
765    cs_emit(b, MOVE32, I) {
766       I.destination = cs_dst32(b, dest);
767       I.immediate = imm;
768    }
769 }
770 
771 static inline void
772 cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
773 {
774    cs_emit(b, MOVE, I) {
775       I.destination = cs_dst64(b, dest);
776       I.immediate = imm;
777    }
778 }
779 
780 static inline void
781 cs_load_ip_to(struct cs_builder *b, struct cs_index dest)
782 {
783    /* If a load_ip instruction is emitted after an if_end(), it flushes the
784     * pending if, causing any further cs_else_start() call to be invalid.
785     */
786    cs_flush_pending_if(b);
787 
788    if (likely(cs_cur_block(b) == NULL)) {
789       if (!cs_reserve_instrs(b, 2))
790          return;
791 
792       /* We make IP point to the instruction right after our MOVE. */
793       uint64_t ip =
794          b->cur_chunk.buffer.gpu + (sizeof(uint64_t) * (b->cur_chunk.pos + 1));
795       cs_move48_to(b, dest, ip);
796    } else {
797       cs_move48_to(b, dest, b->blocks.last_load_ip_target);
798       b->blocks.last_load_ip_target =
799          util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
800    }
801 }
802 
803 static inline void
804 cs_block_start(struct cs_builder *b, struct cs_block *block)
805 {
806    cs_flush_pending_if(b);
807    block->next = b->blocks.stack;
808    b->blocks.stack = block;
809 }
810 
811 static inline void
812 cs_block_end(struct cs_builder *b, struct cs_block *block)
813 {
814    cs_flush_pending_if(b);
815 
816    assert(cs_cur_block(b) == block);
817 
818    b->blocks.stack = block->next;
819 
820    cs_flush_block_instrs(b);
821 }
822 
823 static inline void
824 cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond,
825           struct cs_index val)
826 {
827    cs_emit(b, BRANCH, I) {
828       I.offset = offset;
829       I.condition = cond;
830       I.value = cs_src32(b, val);
831    }
832 }
833 
834 static inline void
835 cs_branch_label(struct cs_builder *b, struct cs_label *label,
836                 enum mali_cs_condition cond, struct cs_index val)
837 {
838    assert(cs_cur_block(b) != NULL);
839 
840    if (label->target == CS_LABEL_INVALID_POS) {
841       uint32_t branch_ins_pos = cs_block_next_pos(b);
842 
843       /* Instead of emitting a BRANCH with the final offset, we record the
844        * diff between the current branch and the previous branch that was
845        * referencing this unset label. This way we build a singly linked list
846        * that can be walked when the label is set with cs_set_label().
847        * We use -1 as the end-of-list marker.
848        */
849       int16_t offset = -1;
850       if (label->last_forward_ref != CS_LABEL_INVALID_POS) {
851          assert(label->last_forward_ref < branch_ins_pos);
852          assert(branch_ins_pos - label->last_forward_ref <= INT16_MAX);
853          offset = branch_ins_pos - label->last_forward_ref;
854       }
855 
856       cs_emit(b, BRANCH, I) {
857          I.offset = offset;
858          I.condition = cond;
859          I.value = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0;
860       }
861 
862       label->last_forward_ref = branch_ins_pos;
863    } else {
864       int32_t offset = label->target - cs_block_next_pos(b) - 1;
865 
866       /* The branch target is encoded in a 16-bit signed integer, so make sure we
867        * don't underflow.
868        */
869       assert(offset >= INT16_MIN);
870 
871       /* Backward references are easy, we can emit them immediately. */
872       cs_emit(b, BRANCH, I) {
873          I.offset = offset;
874          I.condition = cond;
875          I.value = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0;
876       }
877    }
878 }
879 
880 static inline enum mali_cs_condition
881 cs_invert_cond(enum mali_cs_condition cond)
882 {
883    switch (cond) {
884    case MALI_CS_CONDITION_LEQUAL:
885       return MALI_CS_CONDITION_GREATER;
886    case MALI_CS_CONDITION_EQUAL:
887       return MALI_CS_CONDITION_NEQUAL;
888    case MALI_CS_CONDITION_LESS:
889       return MALI_CS_CONDITION_GEQUAL;
890    case MALI_CS_CONDITION_GREATER:
891       return MALI_CS_CONDITION_LEQUAL;
892    case MALI_CS_CONDITION_NEQUAL:
893       return MALI_CS_CONDITION_EQUAL;
894    case MALI_CS_CONDITION_GEQUAL:
895       return MALI_CS_CONDITION_LESS;
896    case MALI_CS_CONDITION_ALWAYS:
897       unreachable("cannot invert ALWAYS");
898    default:
899       unreachable("invalid cond");
900    }
901 }
902 
903 static inline struct cs_if_else *
904 cs_if_start(struct cs_builder *b, struct cs_if_else *if_else,
905             enum mali_cs_condition cond, struct cs_index val)
906 {
907    cs_block_start(b, &if_else->block);
908    cs_label_init(&if_else->end_label);
909    cs_branch_label(b, &if_else->end_label, cs_invert_cond(cond), val);
910    return if_else;
911 }
912 
913 static inline void
914 cs_if_end(struct cs_builder *b, struct cs_if_else *if_else)
915 {
916    assert(cs_cur_block(b) == &if_else->block);
917 
918    b->blocks.pending_if.block.next = if_else->block.next;
919    b->blocks.stack = &b->blocks.pending_if.block;
920    b->blocks.pending_if.end_label = if_else->end_label;
921 }
922 
923 static inline struct cs_if_else *
924 cs_else_start(struct cs_builder *b, struct cs_if_else *if_else)
925 {
926    assert(cs_cur_block(b) == &b->blocks.pending_if.block);
927 
928    if_else->block.next = b->blocks.pending_if.block.next;
929    b->blocks.stack = &if_else->block;
930    cs_label_init(&if_else->end_label);
931    cs_branch_label(b, &if_else->end_label, MALI_CS_CONDITION_ALWAYS,
932                    cs_undef());
933    cs_set_label(b, &b->blocks.pending_if.end_label);
934    cs_label_init(&b->blocks.pending_if.end_label);
935 
936    return if_else;
937 }
938 
939 static inline void
940 cs_else_end(struct cs_builder *b, struct cs_if_else *if_else)
941 {
942    cs_set_label(b, &if_else->end_label);
943    cs_block_end(b, &if_else->block);
944 }
945 
946 #define cs_if(__b, __cond, __val)                                              \
947    for (struct cs_if_else __storage,                                           \
948         *__if_else = cs_if_start(__b, &__storage, __cond, __val);              \
949         __if_else != NULL; cs_if_end(__b, __if_else), __if_else = NULL)
950 
951 #define cs_else(__b)                                                           \
952    for (struct cs_if_else __storage,                                           \
953         *__if_else = cs_else_start(__b, &__storage);                           \
954         __if_else != NULL; cs_else_end(__b, __if_else), __if_else = NULL)
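
/*
 * Usage sketch (register number is illustrative): cs_else() must directly
 * follow the cs_if() body, since any instruction emitted in between flushes
 * the pending if.
 *
 *    struct cs_index counter = cs_reg32(b, 40);
 *
 *    cs_if(b, MALI_CS_CONDITION_GREATER, counter) {
 *       cs_add32(b, counter, counter, 1);
 *    }
 *    cs_else(b) {
 *       cs_move32_to(b, counter, 0);
 *    }
 */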
955 
956 struct cs_loop {
957    struct cs_label start, end;
958    struct cs_block block;
959    enum mali_cs_condition cond;
960    struct cs_index val;
961    struct cs_load_store_tracker *orig_ls_state;
962    struct cs_load_store_tracker ls_state;
963 };
964 
965 static inline void
966 cs_loop_diverge_ls_update(struct cs_builder *b, struct cs_loop *loop)
967 {
968    if (likely(!b->conf.ls_tracker))
969       return;
970 
971    if (!loop->orig_ls_state) {
972       loop->orig_ls_state = b->conf.ls_tracker;
973       loop->ls_state = *loop->orig_ls_state;
974       b->conf.ls_tracker = &loop->ls_state;
975    } else {
976       BITSET_OR(loop->orig_ls_state->pending_loads,
977                 loop->orig_ls_state->pending_loads,
978                 loop->ls_state.pending_loads);
979       BITSET_OR(loop->orig_ls_state->pending_stores,
980                 loop->orig_ls_state->pending_stores,
981                 loop->ls_state.pending_stores);
982    }
983 }
984 
985 static inline struct cs_loop *
986 cs_do_while_start(struct cs_builder *b, struct cs_loop *loop,
987                   enum mali_cs_condition cond, struct cs_index val)
988 {
989    *loop = (struct cs_loop){
990       .cond = cond,
991       .val = val,
992    };
993 
994    cs_block_start(b, &loop->block);
995    cs_label_init(&loop->start);
996    cs_label_init(&loop->end);
997    cs_set_label(b, &loop->start);
998    return loop;
999 }
1000 
1001 static inline struct cs_loop *
1002 cs_while_start(struct cs_builder *b, struct cs_loop *loop,
1003                enum mali_cs_condition cond, struct cs_index val)
1004 {
1005    cs_do_while_start(b, loop, cond, val);
1006 
1007    /* Do an initial check on the condition, and if it's false, jump to
1008     * the end of the loop block. For 'while(true)' loops, skip the
1009     * conditional branch.
1010     */
1011    if (cond != MALI_CS_CONDITION_ALWAYS) {
1012       cs_branch_label(b, &loop->end, cs_invert_cond(cond), val);
1013       cs_loop_diverge_ls_update(b, loop);
1014    }
1015 
1016    return loop;
1017 }
1018 
1019 static inline void
1020 cs_loop_conditional_continue(struct cs_builder *b, struct cs_loop *loop,
1021                              enum mali_cs_condition cond, struct cs_index val)
1022 {
1023    cs_flush_pending_if(b);
1024    cs_branch_label(b, &loop->start, cond, val);
1025    cs_loop_diverge_ls_update(b, loop);
1026 }
1027 
1028 static inline void
1029 cs_loop_conditional_break(struct cs_builder *b, struct cs_loop *loop,
1030                           enum mali_cs_condition cond, struct cs_index val)
1031 {
1032    cs_flush_pending_if(b);
1033    cs_branch_label(b, &loop->end, cond, val);
1034    cs_loop_diverge_ls_update(b, loop);
1035 }
1036 
1037 static inline void
1038 cs_while_end(struct cs_builder *b, struct cs_loop *loop)
1039 {
1040    cs_flush_pending_if(b);
1041    cs_branch_label(b, &loop->start, loop->cond, loop->val);
1042    cs_set_label(b, &loop->end);
1043    cs_block_end(b, &loop->block);
1044 
1045    if (unlikely(loop->orig_ls_state)) {
1046       BITSET_OR(loop->orig_ls_state->pending_loads,
1047                 loop->orig_ls_state->pending_loads,
1048                 loop->ls_state.pending_loads);
1049       BITSET_OR(loop->orig_ls_state->pending_stores,
1050                 loop->orig_ls_state->pending_stores,
1051                 loop->ls_state.pending_stores);
1052       b->conf.ls_tracker = loop->orig_ls_state;
1053    }
1054 }
1055 
1056 #define cs_while(__b, __cond, __val)                                           \
1057    for (struct cs_loop __loop_storage,                                         \
1058         *__loop = cs_while_start(__b, &__loop_storage, __cond, __val);         \
1059         __loop != NULL; cs_while_end(__b, __loop), __loop = NULL)
1060 
1061 #define cs_continue(__b)                                                       \
1062    cs_loop_conditional_continue(__b, __loop, MALI_CS_CONDITION_ALWAYS,         \
1063                                 cs_undef())
1064 
1065 #define cs_break(__b)                                                          \
1066    cs_loop_conditional_break(__b, __loop, MALI_CS_CONDITION_ALWAYS, cs_undef())
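
/*
 * Usage sketch (register numbers are illustrative): loop while `remaining`
 * is greater than zero, with an early break. cs_break()/cs_continue() must
 * be used lexically inside the cs_while() body.
 *
 *    struct cs_index remaining = cs_reg32(b, 40);
 *    struct cs_index error = cs_reg32(b, 41);
 *
 *    cs_while(b, MALI_CS_CONDITION_GREATER, remaining) {
 *       cs_add32(b, remaining, remaining, -1);
 *       cs_if(b, MALI_CS_CONDITION_NEQUAL, error)
 *          cs_break(b);
 *    }
 */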
1067 
1068 /* Pseudoinstructions follow */
1069 
1070 static inline void
1071 cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
1072 {
1073    if (imm < (1ull << 48)) {
1074       /* Zero extends */
1075       cs_move48_to(b, dest, imm);
1076    } else {
1077       cs_move32_to(b, cs_extract32(b, dest, 0), imm);
1078       cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32);
1079    }
1080 }
1081 
1082 static inline void
1083 cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc)
1084 {
1085    struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;
1086 
1087    cs_emit(b, WAIT, I) {
1088       I.wait_mask = wait_mask;
1089       I.progress_increment = progress_inc;
1090    }
1091 
1092    /* We don't do advanced tracking of cs_defer(), and assume that
1093     * load/store will be flushed with an explicit wait on the load/store
1094     * scoreboard. */
1095    if (unlikely(ls_tracker) &&
1096        (wait_mask & BITFIELD_BIT(ls_tracker->sb_slot))) {
1097       BITSET_CLEAR_RANGE(ls_tracker->pending_loads, 0, 255);
1098       BITSET_CLEAR_RANGE(ls_tracker->pending_stores, 0, 255);
1099    }
1100 }
1101 
1102 static inline void
1103 cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc)
1104 {
1105    assert(slot < 8 && "invalid slot");
1106 
1107    cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc);
1108 }
1109 
1110 struct cs_shader_res_sel {
1111    uint8_t srt, fau, spd, tsd;
1112 };
1113 
1114 static inline struct cs_shader_res_sel
1115 cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd)
1116 {
1117    return (struct cs_shader_res_sel){
1118       .srt = srt,
1119       .fau = fau,
1120       .spd = spd,
1121       .tsd = tsd,
1122    };
1123 }
1124 
1125 static inline void
1126 cs_run_compute(struct cs_builder *b, unsigned task_increment,
1127                enum mali_task_axis task_axis, bool progress_inc,
1128                struct cs_shader_res_sel res_sel)
1129 {
1130    cs_emit(b, RUN_COMPUTE, I) {
1131       I.task_increment = task_increment;
1132       I.task_axis = task_axis;
1133       I.progress_increment = progress_inc;
1134       I.srt_select = res_sel.srt;
1135       I.spd_select = res_sel.spd;
1136       I.tsd_select = res_sel.tsd;
1137       I.fau_select = res_sel.fau;
1138    }
1139 }
1140 
1141 static inline void
1142 cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
1143               struct cs_shader_res_sel res_sel)
1144 {
1145    cs_emit(b, RUN_TILING, I) {
1146       I.flags_override = flags_override;
1147       I.progress_increment = progress_inc;
1148       I.srt_select = res_sel.srt;
1149       I.spd_select = res_sel.spd;
1150       I.tsd_select = res_sel.tsd;
1151       I.fau_select = res_sel.fau;
1152    }
1153 }
1154 
1155 static inline void
1156 cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
1157             bool malloc_enable, struct cs_shader_res_sel varying_sel,
1158             struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
1159 {
1160    cs_emit(b, RUN_IDVS, I) {
1161       I.flags_override = flags_override;
1162       I.progress_increment = progress_inc;
1163       I.malloc_enable = malloc_enable;
1164 
1165       if (draw_id.type == CS_INDEX_UNDEF) {
1166          I.draw_id_register_enable = false;
1167       } else {
1168          I.draw_id_register_enable = true;
1169          I.draw_id = cs_src32(b, draw_id);
1170       }
1171 
1172       assert(varying_sel.spd == 1);
1173       assert(varying_sel.fau == 0 || varying_sel.fau == 1);
1174       assert(varying_sel.srt == 0 || varying_sel.srt == 1);
1175       assert(varying_sel.tsd == 0 || varying_sel.tsd == 1);
1176       I.varying_fau_select = varying_sel.fau == 1;
1177       I.varying_srt_select = varying_sel.srt == 1;
1178       I.varying_tsd_select = varying_sel.tsd == 1;
1179 
1180       assert(frag_sel.spd == 2);
1181       assert(frag_sel.fau == 2);
1182       assert(frag_sel.srt == 2 || frag_sel.srt == 0);
1183       assert(frag_sel.tsd == 2 || frag_sel.tsd == 0);
1184       I.fragment_srt_select = frag_sel.srt == 2;
1185       I.fragment_tsd_select = frag_sel.tsd == 2;
1186    }
1187 }
1188 
1189 static inline void
1190 cs_run_fragment(struct cs_builder *b, bool enable_tem,
1191                 enum mali_tile_render_order tile_order, bool progress_inc)
1192 {
1193    cs_emit(b, RUN_FRAGMENT, I) {
1194       I.enable_tem = enable_tem;
1195       I.tile_order = tile_order;
1196       I.progress_increment = progress_inc;
1197    }
1198 }
1199 
1200 static inline void
1201 cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
1202                   bool progress_inc, struct cs_index dcd)
1203 {
1204    cs_emit(b, RUN_FULLSCREEN, I) {
1205       I.flags_override = flags_override;
1206       I.progress_increment = progress_inc;
1207       I.dcd = cs_src64(b, dcd);
1208    }
1209 }
1210 
1211 static inline void
1212 cs_finish_tiling(struct cs_builder *b, bool progress_inc)
1213 {
1214    cs_emit(b, FINISH_TILING, I)
1215       I.progress_increment = progress_inc;
1216 }
1217 
1218 static inline void
1219 cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed,
1220                    struct cs_index first_free_heap_chunk,
1221                    struct cs_index last_free_heap_chunk,
1222                    struct cs_async_op async)
1223 {
1224    cs_emit(b, FINISH_FRAGMENT, I) {
1225       I.increment_fragment_completed = increment_frag_completed;
1226       cs_apply_async(I, async);
1227       I.first_heap_chunk = cs_src64(b, first_free_heap_chunk);
1228       I.last_heap_chunk = cs_src64(b, last_free_heap_chunk);
1229    }
1230 }
1231 
1232 static inline void
1233 cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src,
1234          unsigned imm)
1235 {
1236    cs_emit(b, ADD_IMMEDIATE32, I) {
1237       I.destination = cs_dst32(b, dest);
1238       I.source = cs_src32(b, src);
1239       I.immediate = imm;
1240    }
1241 }
1242 
1243 static inline void
1244 cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src,
1245          unsigned imm)
1246 {
1247    cs_emit(b, ADD_IMMEDIATE64, I) {
1248       I.destination = cs_dst64(b, dest);
1249       I.source = cs_src64(b, src);
1250       I.immediate = imm;
1251    }
1252 }
1253 
1254 static inline void
1255 cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
1256           struct cs_index src2)
1257 {
1258    cs_emit(b, UMIN32, I) {
1259       I.destination = cs_dst32(b, dest);
1260       I.source_1 = cs_src32(b, src1);
1261       I.source_2 = cs_src32(b, src2);
1262    }
1263 }
1264 
1265 static inline void
1266 cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address,
1267            unsigned mask, int offset)
1268 {
1269    unsigned count = util_last_bit(mask);
1270    unsigned base_reg = cs_dst_tuple(b, dest, count, mask);
1271 
1272    cs_emit(b, LOAD_MULTIPLE, I) {
1273       I.base_register = base_reg;
1274       I.address = cs_src64(b, address);
1275       I.mask = mask;
1276       I.offset = offset;
1277    }
1278 
1279    if (unlikely(b->conf.ls_tracker)) {
1280       for (unsigned i = 0; i < count; i++) {
1281          if (mask & BITFIELD_BIT(i))
1282             BITSET_SET(b->conf.ls_tracker->pending_loads, base_reg + i);
1283       }
1284    }
1285 }
1286 
1287 static inline void
1288 cs_load32_to(struct cs_builder *b, struct cs_index dest,
1289              struct cs_index address, int offset)
1290 {
1291    cs_load_to(b, dest, address, BITFIELD_MASK(1), offset);
1292 }
1293 
1294 static inline void
1295 cs_load64_to(struct cs_builder *b, struct cs_index dest,
1296              struct cs_index address, int offset)
1297 {
1298    cs_load_to(b, dest, address, BITFIELD_MASK(2), offset);
1299 }
1300 
1301 static inline void
1302 cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address,
1303          unsigned mask, int offset)
1304 {
1305    unsigned count = util_last_bit(mask);
1306    unsigned base_reg = cs_src_tuple(b, data, count, mask);
1307 
1308    cs_emit(b, STORE_MULTIPLE, I) {
1309       I.base_register = base_reg;
1310       I.address = cs_src64(b, address);
1311       I.mask = mask;
1312       I.offset = offset;
1313    }
1314 
1315    if (unlikely(b->conf.ls_tracker)) {
1316       for (unsigned i = 0; i < count; i++) {
1317          if (mask & BITFIELD_BIT(i))
1318             BITSET_SET(b->conf.ls_tracker->pending_stores, base_reg + i);
1319       }
1320    }
1321 }
1322 
1323 static inline void
1324 cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address,
1325            int offset)
1326 {
1327    cs_store(b, data, address, BITFIELD_MASK(1), offset);
1328 }
1329 
1330 static inline void
1331 cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address,
1332            int offset)
1333 {
1334    cs_store(b, data, address, BITFIELD_MASK(2), offset);
1335 }
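
/*
 * Typical load/use sketch (registers and scoreboard slot are illustrative;
 * slot 0 is assumed to be the load/store scoreboard entry): LOAD_MULTIPLE
 * and STORE_MULTIPLE are asynchronous, so the corresponding slot must be
 * waited on before the registers are reused.
 *
 *    struct cs_index addr = cs_reg64(b, 50);
 *    struct cs_index vals = cs_reg_tuple(b, 40, 4);
 *
 *    cs_load_to(b, vals, addr, BITFIELD_MASK(4), 0);
 *    cs_wait_slot(b, 0, false);
 *    cs_store(b, vals, addr, BITFIELD_MASK(4), 64);
 *    cs_wait_slot(b, 0, false);
 */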
1336 
1337 /*
1338  * Select which scoreboard entry will track endpoint tasks and other tasks
1339  * respectively. Pass the slot to cs_wait_slot()/cs_wait_slots() to wait later.
1340  */
1341 static inline void
1342 cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other)
1343 {
1344    assert(ep < 8 && "invalid slot");
1345    assert(other < 8 && "invalid slot");
1346 
1347    cs_emit(b, SET_SB_ENTRY, I) {
1348       I.endpoint_entry = ep;
1349       I.other_entry = other;
1350    }
1351 
1352    /* We assume the load/store scoreboard entry is static to keep things
1353     * simple. */
1354    if (unlikely(b->conf.ls_tracker))
1355       assert(b->conf.ls_tracker->sb_slot == other);
1356 }
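
/*
 * Usage sketch (slot numbers are illustrative): track endpoint work on
 * scoreboard slot 2 and other asynchronous operations on slot 0, then wait
 * for the endpoint work to land.
 *
 *    cs_set_scoreboard_entry(b, 2, 0);
 *    // ... queue RUN_* endpoint work tracked on slot 2 ...
 *    cs_wait_slot(b, 2, false);
 */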
1357 
1358 static inline void
1359 cs_progress_wait(struct cs_builder *b, unsigned queue, struct cs_index ref)
1360 {
1361    cs_emit(b, PROGRESS_WAIT, I) {
1362       I.source = cs_src64(b, ref);
1363       I.queue = queue;
1364    }
1365 }
1366 
1367 static inline void
1368 cs_set_exception_handler(struct cs_builder *b,
1369                          enum mali_cs_exception_type exception_type,
1370                          struct cs_index address, struct cs_index length)
1371 {
1372    cs_emit(b, SET_EXCEPTION_HANDLER, I) {
1373       I.exception_type = exception_type;
1374       I.address = cs_src64(b, address);
1375       I.length = cs_src32(b, length);
1376    }
1377 }
1378 
1379 static inline void
1380 cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length)
1381 {
1382    cs_emit(b, CALL, I) {
1383       I.address = cs_src64(b, address);
1384       I.length = cs_src32(b, length);
1385    }
1386 }
1387 
1388 static inline void
1389 cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
1390 {
1391    cs_emit(b, JUMP, I) {
1392       I.address = cs_src64(b, address);
1393       I.length = cs_src32(b, length);
1394    }
1395 }
1396 
1397 enum cs_res_id {
1398    CS_COMPUTE_RES = BITFIELD_BIT(0),
1399    CS_FRAG_RES = BITFIELD_BIT(1),
1400    CS_TILER_RES = BITFIELD_BIT(2),
1401    CS_IDVS_RES = BITFIELD_BIT(3),
1402 };
1403 
1404 static inline void
1405 cs_req_res(struct cs_builder *b, uint32_t res_mask)
1406 {
1407    cs_emit(b, REQ_RESOURCE, I) {
1408       I.compute = res_mask & CS_COMPUTE_RES;
1409       I.tiler = res_mask & CS_TILER_RES;
1410       I.idvs = res_mask & CS_IDVS_RES;
1411       I.fragment = res_mask & CS_FRAG_RES;
1412    }
1413 }
1414 
1415 static inline void
1416 cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2,
1417                 enum mali_cs_flush_mode lsc, bool other_inv,
1418                 struct cs_index flush_id, struct cs_async_op async)
1419 {
1420    cs_emit(b, FLUSH_CACHE2, I) {
1421       I.l2_flush_mode = l2;
1422       I.lsc_flush_mode = lsc;
1423       I.other_invalidate = other_inv;
1424       I.latest_flush_id = cs_src32(b, flush_id);
1425       cs_apply_async(I, async);
1426    }
1427 }
1428 
1429 #define CS_SYNC_OPS(__cnt_width)                                               \
1430    static inline void cs_sync##__cnt_width##_set(                              \
1431       struct cs_builder *b, bool propagate_error,                              \
1432       enum mali_cs_sync_scope scope, struct cs_index val,                      \
1433       struct cs_index addr, struct cs_async_op async)                          \
1434    {                                                                           \
1435       cs_emit(b, SYNC_SET##__cnt_width, I) {                                   \
1436          I.error_propagate = propagate_error;                                  \
1437          I.scope = scope;                                                      \
1438          I.data = cs_src##__cnt_width(b, val);                                 \
1439          I.address = cs_src64(b, addr);                                        \
1440          cs_apply_async(I, async);                                             \
1441       }                                                                        \
1442    }                                                                           \
1443                                                                                \
1444    static inline void cs_sync##__cnt_width##_add(                              \
1445       struct cs_builder *b, bool propagate_error,                              \
1446       enum mali_cs_sync_scope scope, struct cs_index val,                      \
1447       struct cs_index addr, struct cs_async_op async)                          \
1448    {                                                                           \
1449       cs_emit(b, SYNC_ADD##__cnt_width, I) {                                   \
1450          I.error_propagate = propagate_error;                                  \
1451          I.scope = scope;                                                      \
1452          I.data = cs_src##__cnt_width(b, val);                                 \
1453          I.address = cs_src64(b, addr);                                        \
1454          cs_apply_async(I, async);                                             \
1455       }                                                                        \
1456    }                                                                           \
1457                                                                                \
1458    static inline void cs_sync##__cnt_width##_wait(                             \
1459       struct cs_builder *b, bool reject_error, enum mali_cs_condition cond,    \
1460       struct cs_index ref, struct cs_index addr)                               \
1461    {                                                                           \
1462       assert(cond == MALI_CS_CONDITION_LEQUAL ||                               \
1463              cond == MALI_CS_CONDITION_GREATER);                               \
1464       cs_emit(b, SYNC_WAIT##__cnt_width, I) {                                  \
1465          I.error_reject = reject_error;                                        \
1466          I.condition = cond;                                                   \
1467          I.data = cs_src##__cnt_width(b, ref);                                 \
1468          I.address = cs_src64(b, addr);                                        \
1469       }                                                                        \
1470    }
1471 
1472 CS_SYNC_OPS(32)
1473 CS_SYNC_OPS(64)
1474 
1475 static inline void
1476 cs_store_state(struct cs_builder *b, struct cs_index address, int offset,
1477                enum mali_cs_state state, struct cs_async_op async)
1478 {
1479    cs_emit(b, STORE_STATE, I) {
1480       I.offset = offset;
1481       I.state = state;
1482       I.address = cs_src64(b, address);
1483       cs_apply_async(I, async);
1484    }
1485 }
1486 
1487 static inline void
1488 cs_prot_region(struct cs_builder *b, unsigned size)
1489 {
1490    cs_emit(b, PROT_REGION, I) {
1491       I.size = size;
1492    }
1493 }
1494 
1495 static inline void
1496 cs_progress_store(struct cs_builder *b, struct cs_index src)
1497 {
1498    cs_emit(b, PROGRESS_STORE, I)
1499       I.source = cs_src64(b, src);
1500 }
1501 
1502 static inline void
1503 cs_progress_load(struct cs_builder *b, struct cs_index dst)
1504 {
1505    cs_emit(b, PROGRESS_LOAD, I)
1506       I.destination = cs_dst64(b, dst);
1507 }
1508 
1509 static inline void
1510 cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
1511                         bool progress_inc, struct cs_shader_res_sel res_sel)
1512 {
1513    cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
1514       I.workgroups_per_task = wg_per_task;
1515       I.progress_increment = progress_inc;
1516       I.srt_select = res_sel.srt;
1517       I.spd_select = res_sel.spd;
1518       I.tsd_select = res_sel.tsd;
1519       I.fau_select = res_sel.fau;
1520    }
1521 }
1522 
1523 static inline void
1524 cs_error_barrier(struct cs_builder *b)
1525 {
1526    cs_emit(b, ERROR_BARRIER, _)
1527       ;
1528 }
1529 
1530 static inline void
1531 cs_heap_set(struct cs_builder *b, struct cs_index address)
1532 {
1533    cs_emit(b, HEAP_SET, I) {
1534       I.address = cs_src64(b, address);
1535    }
1536 }
1537 
1538 static inline void
1539 cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation,
1540                   struct cs_async_op async)
1541 {
1542    cs_emit(b, HEAP_OPERATION, I) {
1543       I.operation = operation;
1544       cs_apply_async(I, async);
1545    }
1546 }
1547 
1548 static inline void
1549 cs_vt_start(struct cs_builder *b, struct cs_async_op async)
1550 {
1551    cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async);
1552 }
1553 
1554 static inline void
1555 cs_vt_end(struct cs_builder *b, struct cs_async_op async)
1556 {
1557    cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async);
1558 }
1559 
1560 static inline void
1561 cs_frag_end(struct cs_builder *b, struct cs_async_op async)
1562 {
1563    cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async);
1564 }
1565 
1566 static inline void
1567 cs_trace_point(struct cs_builder *b, struct cs_index regs,
1568                struct cs_async_op async)
1569 {
1570    cs_emit(b, TRACE_POINT, I) {
1571       I.base_register =
1572          cs_src_tuple(b, regs, regs.size, BITFIELD_MASK(regs.size));
1573       I.register_count = regs.size;
1574       cs_apply_async(I, async);
1575    }
1576 }
1577 
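/* Internal bookkeeping for the cs_match()/cs_case()/cs_default() construct
 * defined below; instances are created and managed by the cs_match() macro
 * rather than used directly. */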
1578 struct cs_match {
1579    struct cs_block block;
1580    struct cs_label break_label;
1581    struct cs_block case_block;
1582    struct cs_label next_case_label;
1583    struct cs_index val;
1584    struct cs_index scratch_reg;
1585    struct cs_load_store_tracker case_ls_state;
1586    struct cs_load_store_tracker ls_state;
1587    struct cs_load_store_tracker *orig_ls_state;
1588    bool default_emitted;
1589 };
1590 
1591 static inline struct cs_match *
1592 cs_match_start(struct cs_builder *b, struct cs_match *match,
1593                struct cs_index val, struct cs_index scratch_reg)
1594 {
1595    *match = (struct cs_match){
1596       .val = val,
1597       .scratch_reg = scratch_reg,
1598       .orig_ls_state = b->conf.ls_tracker,
1599    };
1600 
1601    cs_block_start(b, &match->block);
1602    cs_label_init(&match->break_label);
1603    cs_label_init(&match->next_case_label);
1604 
1605    return match;
1606 }
1607 
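/* If load/store tracking is enabled, fork the tracker state on case entry so
 * each case is validated against the state that existed before the match,
 * independently of the other cases. */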
1608 static inline void
1609 cs_match_case_ls_set(struct cs_builder *b, struct cs_match *match)
1610 {
1611    if (unlikely(match->orig_ls_state)) {
1612       match->case_ls_state = *match->orig_ls_state;
1613       b->conf.ls_tracker = &match->case_ls_state;
1614    }
1615 }
1616 
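/* Fold the pending loads/stores accumulated by the case that just ended back
 * into the match-wide tracker state. */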
1617 static inline void
1618 cs_match_case_ls_get(struct cs_match *match)
1619 {
1620    if (unlikely(match->orig_ls_state)) {
1621       BITSET_OR(match->ls_state.pending_loads,
1622                 match->case_ls_state.pending_loads,
1623                 match->ls_state.pending_loads);
1624       BITSET_OR(match->ls_state.pending_stores,
1625                 match->case_ls_state.pending_stores,
1626                 match->ls_state.pending_stores);
1627    }
1628 }
1629 
1630 static inline void
1631 cs_match_case(struct cs_builder *b, struct cs_match *match, uint32_t id)
1632 {
1633    assert(!match->default_emitted || !"default case must be last");
1634    if (match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS) {
1635       cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
1636                       cs_undef());
1637       cs_block_end(b, &match->case_block);
1638       cs_match_case_ls_get(match);
1639       cs_set_label(b, &match->next_case_label);
1640       cs_label_init(&match->next_case_label);
1641    }
1642 
1643    if (id)
1644       cs_add32(b, match->scratch_reg, match->val, -id);
1645 
1646    cs_branch_label(b, &match->next_case_label, MALI_CS_CONDITION_NEQUAL,
1647                    id ? match->scratch_reg : match->val);
1648 
1649    cs_match_case_ls_set(b, match);
1650    cs_block_start(b, &match->case_block);
1651 }
1652 
1653 static inline void
1654 cs_match_default(struct cs_builder *b, struct cs_match *match)
1655 {
1656    assert(match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS ||
1657           !"default case requires at least one other case");
1658    cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
1659                    cs_undef());
1660 
1661    if (cs_cur_block(b) == &match->case_block) {
1662       cs_block_end(b, &match->case_block);
1663       cs_match_case_ls_get(match);
1664    }
1665 
1666    cs_set_label(b, &match->next_case_label);
1667    cs_label_init(&match->next_case_label);
1668    cs_match_case_ls_set(b, match);
1669    cs_block_start(b, &match->case_block);
1670    match->default_emitted = true;
1671 }
1672 
1673 static inline void
1674 cs_match_end(struct cs_builder *b, struct cs_match *match)
1675 {
1676    if (cs_cur_block(b) == &match->case_block) {
1677       cs_match_case_ls_get(match);
1678       cs_block_end(b, &match->case_block);
1679    }
1680 
1681    if (unlikely(match->orig_ls_state)) {
1682       if (!match->default_emitted) {
1683          /* If we don't have a default, assume we don't handle all possible cases
1684           * and merge the match load/store state into the original load/store state.
1685           */
1686          BITSET_OR(match->orig_ls_state->pending_loads,
1687                    match->ls_state.pending_loads,
1688                    match->orig_ls_state->pending_loads);
1689          BITSET_OR(match->orig_ls_state->pending_stores,
1690                    match->ls_state.pending_stores,
1691                    match->orig_ls_state->pending_stores);
1692       } else {
1693          *match->orig_ls_state = match->ls_state;
1694       }
1695 
1696       b->conf.ls_tracker = match->orig_ls_state;
1697    }
1698 
1699    cs_set_label(b, &match->next_case_label);
1700    cs_set_label(b, &match->break_label);
1701 
1702    cs_block_end(b, &match->block);
1703 }
1704 
1705 #define cs_match(__b, __val, __scratch)                                        \
1706    for (struct cs_match __match_storage,                                       \
1707         *__match = cs_match_start(__b, &__match_storage, __val, __scratch);    \
1708         __match != NULL; cs_match_end(__b, &__match_storage), __match = NULL)
1709 
1710 #define cs_case(__b, __ref)                                                    \
1711    for (bool __case_defined = ({                                               \
1712            cs_match_case(__b, __match, __ref);                                 \
1713            false;                                                              \
1714         });                                                                    \
1715         !__case_defined; __case_defined = true)
1716 
1717 #define cs_default(__b)                                                        \
1718    for (bool __default_defined = ({                                            \
1719            cs_match_default(__b, __match);                                     \
1720            false;                                                              \
1721         });                                                                    \
1722         !__default_defined; __default_defined = true)
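/* Usage sketch (illustrative only, not part of the original header): branch on
 * the value held in the cs_index register `cmd`, using `tmp` as the scratch
 * register needed for the per-case comparisons.
 *
 *    cs_match(b, cmd, tmp) {
 *       cs_case(b, 1) {
 *          ... instructions emitted when cmd == 1 ...
 *       }
 *       cs_case(b, 2) {
 *          ... instructions emitted when cmd == 2 ...
 *       }
 *       cs_default(b) {
 *          ... fallback when no case matched ...
 *       }
 *    }
 *
 * cs_default(), when used, must come after at least one cs_case() and must be
 * the last case of the match.
 */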
1723 
1724 static inline void
1725 cs_nop(struct cs_builder *b)
1726 {
1727    cs_emit(b, NOP, I) {};
1728 }
1729 
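/* Describes where an exception handler saves/restores the registers it
 * clobbers: the 64-bit dump buffer address is loaded from
 * `ctx_reg + dump_addr_offset`, and `ls_sb_slot` is the scoreboard slot waited
 * on around those loads/stores. */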
1730 struct cs_exception_handler_ctx {
1731    struct cs_index ctx_reg;
1732    unsigned dump_addr_offset;
1733    uint8_t ls_sb_slot;
1734 };
1735 
1736 struct cs_exception_handler {
1737    struct cs_block block;
1738    struct cs_dirty_tracker dirty;
1739    struct cs_exception_handler_ctx ctx;
1740    unsigned dump_size;
1741    uint64_t address;
1742    uint32_t length;
1743 };
1744 
1745 static inline struct cs_exception_handler *
1746 cs_exception_handler_start(struct cs_builder *b,
1747                            struct cs_exception_handler *handler,
1748                            struct cs_exception_handler_ctx ctx)
1749 {
1750    assert(cs_cur_block(b) == NULL);
1751    assert(b->conf.dirty_tracker == NULL);
1752 
1753    *handler = (struct cs_exception_handler){
1754       .ctx = ctx,
1755    };
1756 
1757    cs_block_start(b, &handler->block);
1758 
1759    b->conf.dirty_tracker = &handler->dirty;
1760 
1761    return handler;
1762 }
1763 
1764 #define SAVE_RESTORE_MAX_OPS (256 / 16)
1765 
1766 static inline void
1767 cs_exception_handler_end(struct cs_builder *b,
1768                          struct cs_exception_handler *handler)
1769 {
1770    struct cs_index ranges[SAVE_RESTORE_MAX_OPS];
1771    uint16_t masks[SAVE_RESTORE_MAX_OPS];
1772    unsigned num_ranges = 0;
1773    uint32_t num_instrs =
1774       util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
1775    struct cs_index addr_reg = {
1776       .type = CS_INDEX_REGISTER,
1777       .size = 2,
1778       .reg = b->conf.nr_registers - 2,
1779    };
1780 
1781    /* Manual cs_block_end() without an instruction flush. We do that to insert
1782     * the preamble without having to move memory in b->blocks.instrs. The flush
1783     * will be done after the preamble has been emitted. */
1784    assert(cs_cur_block(b) == &handler->block);
1785    assert(handler->block.next == NULL);
1786    b->blocks.stack = NULL;
1787 
1788    if (!num_instrs)
1789       return;
1790 
1791    /* Try to minimize the number of loads/stores by grouping them */
1792    unsigned nregs = b->conf.nr_registers - b->conf.nr_kernel_registers;
1793    unsigned pos, last = 0;
1794 
1795    BITSET_FOREACH_SET(pos, handler->dirty.regs, nregs) {
1796       unsigned range = MIN2(nregs - pos, 16);
1797       unsigned word = BITSET_BITWORD(pos);
1798       unsigned bit = pos % BITSET_WORDBITS;
1799       unsigned remaining_bits = BITSET_WORDBITS - bit;
1800 
1801       if (pos < last)
1802          continue;
1803 
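      /* Extract up to `range` dirty bits starting at `pos`; the mask may
       * straddle two bitset words, so stitch in the high part from the next
       * word when needed. */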
1804       masks[num_ranges] = handler->dirty.regs[word] >> bit;
1805       if (remaining_bits < range)
1806          masks[num_ranges] |= handler->dirty.regs[word + 1] << remaining_bits;
1807       masks[num_ranges] &= BITFIELD_MASK(range);
1808 
1809       ranges[num_ranges] =
1810          cs_reg_tuple(b, pos, util_last_bit(masks[num_ranges]));
1811       num_ranges++;
1812       last = pos + range;
1813    }
1814 
1815    handler->dump_size = BITSET_COUNT(handler->dirty.regs) * sizeof(uint32_t);
1816 
1817    /* Make sure the current chunk is able to accommodate the block
1818     * instructions as well as the preamble and postamble.
1819     * Adding 4 instructions (2x wait_slot and the move for the address) as
1820     * the move might actually be translated to two MOVE32 instructions. */
1821    num_instrs += (num_ranges * 2) + 4;
1822 
1823    /* Align things on a cache-line in case the buffer contains more than one
1824     * exception handler (64 bytes = 8 instructions). */
1825    uint32_t padded_num_instrs = ALIGN_POT(num_instrs, 8);
1826 
1827    if (!cs_reserve_instrs(b, padded_num_instrs))
1828       return;
1829 
1830    handler->address =
1831       b->cur_chunk.buffer.gpu + (b->cur_chunk.pos * sizeof(uint64_t));
1832 
1833    /* Preamble: backup modified registers */
1834    if (num_ranges > 0) {
1835       unsigned offset = 0;
1836 
1837       cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
1838                    handler->ctx.dump_addr_offset);
1839       cs_wait_slot(b, handler->ctx.ls_sb_slot, false);
1840 
1841       for (unsigned i = 0; i < num_ranges; ++i) {
1842          unsigned reg_count = util_bitcount(masks[i]);
1843 
1844          cs_store(b, ranges[i], addr_reg, masks[i], offset);
1845          offset += reg_count * 4;
1846       }
1847 
1848       cs_wait_slot(b, handler->ctx.ls_sb_slot, false);
1849    }
1850 
1851    /* Now that the preamble is emitted, we can flush the instructions we have in
1852     * our exception handler block. */
1853    cs_flush_block_instrs(b);
1854 
1855    /* Postamble: restore modified registers */
1856    if (num_ranges > 0) {
1857       unsigned offset = 0;
1858 
1859       cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
1860                    handler->ctx.dump_addr_offset);
1861       cs_wait_slot(b, handler->ctx.ls_sb_slot, false);
1862 
1863       for (unsigned i = 0; i < num_ranges; ++i) {
1864          unsigned reg_count = util_bitcount(masks[i]);
1865 
1866          cs_load_to(b, ranges[i], addr_reg, masks[i], offset);
1867          offset += reg_count * 4;
1868       }
1869 
1870       cs_wait_slot(b, handler->ctx.ls_sb_slot, false);
1871    }
1872 
1873    /* Fill the rest of the buffer with NOPs. */
1874    for (; num_instrs < padded_num_instrs; num_instrs++)
1875       cs_nop(b);
1876 
1877    handler->length = padded_num_instrs;
1878 }
1879 
1880 #define cs_exception_handler_def(__b, __handler, __ctx)                        \
1881    for (struct cs_exception_handler *__ehandler =                              \
1882            cs_exception_handler_start(__b, __handler, __ctx);                  \
1883         __ehandler != NULL;                                                    \
1884         cs_exception_handler_end(__b, __handler), __ehandler = NULL)
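/* Usage sketch (illustrative only, not part of the original header): the
 * register number and offsets below are placeholders chosen for the example.
 *
 *    struct cs_exception_handler handler;
 *    struct cs_exception_handler_ctx ctx = {
 *       .ctx_reg = cs_reg64(b, 60),   // example context register
 *       .dump_addr_offset = 0,        // offset of the dump buffer pointer
 *       .ls_sb_slot = 0,              // scoreboard slot for the save/restore
 *    };
 *
 *    cs_exception_handler_def(b, &handler, ctx) {
 *       ... handler body; any register written here is tracked and
 *           automatically saved/restored around the body ...
 *    }
 *
 * cs_exception_handler_start() requires that no block is active and no dirty
 * tracker is installed when the handler is opened; after
 * cs_exception_handler_end(), handler.address and handler.length describe the
 * emitted handler.
 */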
1885 
1886 struct cs_tracing_ctx {
1887    bool enabled;
1888    struct cs_index ctx_reg;
1889    unsigned tracebuf_addr_offset;
1890    uint8_t ls_sb_slot;
1891 };
1892 
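/* Common preamble for the cs_trace_run_*() helpers: bumps the trace buffer
 * pointer stored at `ctx_reg + tracebuf_addr_offset` by `trace_size` up front
 * and leaves the incremented pointer in the first two scratch registers, so
 * callers address the new record with the negative offsets produced by
 * cs_trace_field_offset(). */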
1893 static inline void
1894 cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
1895                   struct cs_index scratch_regs, unsigned trace_size)
1896 {
1897    assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size &&
1898           trace_size < INT16_MAX);
1899    assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1));
1900 
1901    struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
1902 
1903    /* We always update the tracebuf position first, so we can easily detect OOB
1904     * access. Use cs_trace_field_offset() to get an offset taking this
1905     * pre-increment into account. */
1906    cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
1907    cs_wait_slot(b, ctx->ls_sb_slot, false);
1908    cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
1909    cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
1910    cs_wait_slot(b, ctx->ls_sb_slot, false);
1911 }
1912 
1913 #define cs_trace_field_offset(__type, __field)                                 \
1914    (int16_t)(offsetof(struct cs_##__type##_trace, __field) -                   \
1915              sizeof(struct cs_##__type##_trace))
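/* Because cs_trace_preamble() advances the trace buffer pointer by the full
 * record size before any field is written, trace fields are addressed with
 * negative offsets relative to the updated pointer. For example, with
 * struct cs_run_fragment_trace (defined just below, sizeof == 64 due to the
 * 64-byte alignment):
 *
 *    cs_trace_field_offset(run_fragment, ip)
 *       == offsetof(struct cs_run_fragment_trace, ip) - 64
 *       == -64
 */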
1916 
1917 struct cs_run_fragment_trace {
1918    uint64_t ip;
1919    uint32_t sr[7];
1920 } __attribute__((aligned(64)));
1921 
1922 static inline void
1923 cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
1924                       struct cs_index scratch_regs, bool enable_tem,
1925                       enum mali_tile_render_order tile_order, bool progress_inc)
1926 {
1927    if (likely(!ctx->enabled)) {
1928       cs_run_fragment(b, enable_tem, tile_order, progress_inc);
1929       return;
1930    }
1931 
1932    struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
1933    struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
1934 
1935    cs_trace_preamble(b, ctx, scratch_regs,
1936                      sizeof(struct cs_run_fragment_trace));
1937 
1938    /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
1939     * won't point to the right instruction. */
1940    cs_load_ip_to(b, data);
1941    cs_run_fragment(b, enable_tem, tile_order, progress_inc);
1942    cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip));
1943 
1944    cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
1945             cs_trace_field_offset(run_fragment, sr));
1946    cs_wait_slot(b, ctx->ls_sb_slot, false);
1947 }
1948 
1949 struct cs_run_idvs_trace {
1950    uint64_t ip;
1951    uint32_t draw_id;
1952    uint32_t pad;
1953    uint32_t sr[61];
1954 } __attribute__((aligned(64)));
1955 
1956 static inline void
1957 cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
1958                   struct cs_index scratch_regs, uint32_t flags_override,
1959                   bool progress_inc, bool malloc_enable,
1960                   struct cs_shader_res_sel varying_sel,
1961                   struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
1962 {
1963    if (likely(!ctx->enabled)) {
1964       cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
1965                   frag_sel, draw_id);
1966       return;
1967    }
1968 
1969    struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
1970    struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
1971 
1972    cs_trace_preamble(b, ctx, scratch_regs,
1973                      sizeof(struct cs_run_idvs_trace));
1974 
1975    /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
1976     * won't point to the right instruction. */
1977    cs_load_ip_to(b, data);
1978    cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
1979                frag_sel, draw_id);
1980    cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip));
1981 
1982    if (draw_id.type != CS_INDEX_UNDEF)
1983       cs_store32(b, draw_id, tracebuf_addr,
1984                  cs_trace_field_offset(run_idvs, draw_id));
1985 
1986    for (unsigned i = 0; i < 48; i += 16)
1987       cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
1988                cs_trace_field_offset(run_idvs, sr[i]));
1989    cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
1990             cs_trace_field_offset(run_idvs, sr[48]));
1991    cs_wait_slot(b, ctx->ls_sb_slot, false);
1992 }
1993 
1994 struct cs_run_compute_trace {
1995    uint64_t ip;
1996    uint32_t sr[40];
1997 } __attribute__((aligned(64)));
1998 
1999 static inline void
2000 cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
2001                      struct cs_index scratch_regs, unsigned task_increment,
2002                      enum mali_task_axis task_axis, bool progress_inc,
2003                      struct cs_shader_res_sel res_sel)
2004 {
2005    if (likely(!ctx->enabled)) {
2006       cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
2007       return;
2008    }
2009 
2010    struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
2011    struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
2012 
2013    cs_trace_preamble(b, ctx, scratch_regs,
2014                      sizeof(struct cs_run_compute_trace));
2015 
2016    /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
2017     * won't point to the right instruction. */
2018    cs_load_ip_to(b, data);
2019    cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
2020    cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
2021 
2022    for (unsigned i = 0; i < 32; i += 16)
2023       cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
2024                cs_trace_field_offset(run_compute, sr[i]));
2025    cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
2026             cs_trace_field_offset(run_compute, sr[32]));
2027    cs_wait_slot(b, ctx->ls_sb_slot, false);
2028 }
2029 
2030 static inline void
2031 cs_trace_run_compute_indirect(struct cs_builder *b,
2032                               const struct cs_tracing_ctx *ctx,
2033                               struct cs_index scratch_regs,
2034                               unsigned wg_per_task, bool progress_inc,
2035                               struct cs_shader_res_sel res_sel)
2036 {
2037    if (likely(!ctx->enabled)) {
2038       cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
2039       return;
2040    }
2041 
2042    struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
2043    struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);
2044 
2045    cs_trace_preamble(b, ctx, scratch_regs,
2046                      sizeof(struct cs_run_compute_trace));
2047 
2048    /* cs_run_xx() must immediately follow cs_load_ip_to() otherwise the IP
2049     * won't point to the right instruction. */
2050    cs_load_ip_to(b, data);
2051    cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
2052    cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));
2053 
2054    for (unsigned i = 0; i < 32; i += 16)
2055       cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
2056                cs_trace_field_offset(run_compute, sr[i]));
2057    cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
2058             cs_trace_field_offset(run_compute, sr[32]));
2059    cs_wait_slot(b, ctx->ls_sb_slot, false);
2060 }
2061