/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#pragma once

#if !defined(PAN_ARCH) || PAN_ARCH < 10
#error "cs_builder.h requires PAN_ARCH >= 10"
#endif

#include "gen_macros.h"

#include "util/bitset.h"
#include "util/u_dynarray.h"

/*
 * cs_builder implements a builder for CSF command streams. It manages the
 * allocation and overflow behaviour of queues and provides helpers for
 * emitting commands to run on the CSF pipe.
 *
 * Users are responsible for the CS buffer allocation and must initialize the
 * command stream with an initial buffer using cs_builder_init(). The CS can
 * be extended with new buffers allocated with cs_builder_conf::alloc_buffer()
 * if the builder runs out of memory.
 */
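
/*
 * Example usage (illustrative sketch): my_alloc_cs_buffer() and my_device
 * are hypothetical, and the register counts are placeholders.
 *
 *    struct cs_builder_conf conf = {
 *       .nr_registers = 96,
 *       .nr_kernel_registers = 4,
 *       .alloc_buffer = my_alloc_cs_buffer,
 *       .cookie = my_device,
 *    };
 *    struct cs_builder b;
 *
 *    cs_builder_init(&b, &conf, my_alloc_cs_buffer(my_device));
 *    cs_move32_to(&b, cs_reg32(&b, 0), 42);
 *    cs_finish(&b);
 *
 *    // cs_root_chunk_gpu_addr(&b) and cs_root_chunk_size(&b) then describe
 *    // the root chunk to pass to the kernel for execution.
 */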

struct cs_buffer {
   /* CPU pointer */
   uint64_t *cpu;

   /* GPU pointer */
   uint64_t gpu;

   /* Capacity in number of 64-bit instructions */
   uint32_t capacity;
};

/**
 * This is used to check that:
 * 1. registers are not used as a source after being loaded without a
 *    WAIT(<ls_scoreboard>) in the middle
 * 2. registers are not reused (used as a destination) after they served as a
 *    STORE() source without a WAIT(<ls_scoreboard>) in the middle
 */
struct cs_load_store_tracker {
   BITSET_DECLARE(pending_loads, 256);
   BITSET_DECLARE(pending_stores, 256);
   uint8_t sb_slot;
};

/**
 * This is used to determine which registers have been written to (a.k.a. used
 * as an instruction's destination).
 */
struct cs_dirty_tracker {
   BITSET_DECLARE(regs, 256);
};

enum cs_reg_perm {
   CS_REG_NO_ACCESS = 0,
   CS_REG_RD = BITFIELD_BIT(1),
   CS_REG_WR = BITFIELD_BIT(2),
   CS_REG_RW = CS_REG_RD | CS_REG_WR,
};

struct cs_builder;

typedef enum cs_reg_perm (*reg_perm_cb_t)(struct cs_builder *b, unsigned reg);

struct cs_builder_conf {
   /* Number of 32-bit registers in the hardware register file */
   uint8_t nr_registers;

   /* Number of 32-bit registers used by the kernel at submission time */
   uint8_t nr_kernel_registers;

   /* CS buffer allocator */
   struct cs_buffer (*alloc_buffer)(void *cookie);

   /* Optional load/store tracker. */
   struct cs_load_store_tracker *ls_tracker;

   /* Optional dirty registers tracker. */
   struct cs_dirty_tracker *dirty_tracker;

   /* Optional register access checker. */
   reg_perm_cb_t reg_perm;

   /* Cookie passed back to alloc_buffer() */
   void *cookie;
};

/* The CS is formed of one or more CS chunks linked with JUMP instructions.
 * The builder keeps track of the current chunk and the position inside this
 * chunk, so it can emit new instructions, and decide when a new chunk needs
 * to be allocated.
 */
struct cs_chunk {
   /* CS buffer object backing this chunk */
   struct cs_buffer buffer;

   union {
      /* Current position in the buffer object when the chunk is active. */
      uint32_t pos;

      /* Chunk size when the chunk was wrapped. */
      uint32_t size;
   };
};

/* Monolithic sequence of instructions. Must live in a virtually contiguous
 * portion of code.
 */
struct cs_block {
   /* Used to insert the block in the block stack. */
   struct cs_block *next;
};

#define CS_LABEL_INVALID_POS ~0u

/* Labels can only be used inside a cs_block. They can be defined and
 * referenced before they are set to point to a specific position
 * in the block. */
struct cs_label {
   /* The last reference we have seen pointing to this label before
    * it was set. If set to CS_LABEL_INVALID_POS, no forward reference
    * pointing to this label exists.
    */
   uint32_t last_forward_ref;

   /* The label target. If set to CS_LABEL_INVALID_POS, the label has
    * not been set yet.
    */
   uint32_t target;
};
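
/*
 * Example (illustrative sketch): a forward branch to a label inside a
 * block. All helpers used here are defined below; "skip_reg" is a
 * hypothetical 32-bit register operand.
 *
 *    struct cs_block blk;
 *    struct cs_label skip;
 *
 *    cs_block_start(b, &blk);
 *    cs_label_init(&skip);
 *    cs_branch_label(b, &skip, MALI_CS_CONDITION_EQUAL, skip_reg);
 *    ... instructions skipped when the branch is taken ...
 *    cs_set_label(b, &skip);
 *    cs_block_end(b, &blk);
 */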

/* CS if/else block. */
struct cs_if_else {
   struct cs_block block;
   struct cs_label end_label;
};

struct cs_builder {
   /* CS builder configuration */
   struct cs_builder_conf conf;

   /* True if an allocation failed, making the whole CS invalid. */
   bool invalid;

   /* Initial (root) CS chunk. */
   struct cs_chunk root_chunk;

   /* Current CS chunk. */
   struct cs_chunk cur_chunk;

   /* Temporary storage for inner blocks that need to be built
    * and copied in one monolithic sequence of instructions with no
    * jump in the middle.
    */
   struct {
      struct cs_block *stack;
      struct util_dynarray instrs;
      struct cs_if_else pending_if;
      unsigned last_load_ip_target;
   } blocks;

   /* Move immediate instruction at the end of the last CS chunk that needs to
    * be patched with the final length of the current CS chunk in order to
    * facilitate correct overflow behaviour.
    */
   uint32_t *length_patch;

   /* Used as temporary storage when the allocator couldn't allocate a new
    * CS chunk.
    */
   uint64_t discard_instr_slot;
};

static inline void
cs_builder_init(struct cs_builder *b, const struct cs_builder_conf *conf,
                struct cs_buffer root_buffer)
{
   *b = (struct cs_builder){
      .conf = *conf,
      .root_chunk.buffer = root_buffer,
      .cur_chunk.buffer = root_buffer,
   };

   /* We need at least 3 registers for CS chunk linking. Assume the kernel
    * needs at least that many too.
    */
   b->conf.nr_kernel_registers = MAX2(b->conf.nr_kernel_registers, 3);

   util_dynarray_init(&b->blocks.instrs, NULL);
}

static inline bool
cs_is_valid(struct cs_builder *b)
{
   return !b->invalid;
}

static inline bool
cs_is_empty(struct cs_builder *b)
{
   return b->cur_chunk.pos == 0 &&
          b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu;
}

static inline uint64_t
cs_root_chunk_gpu_addr(struct cs_builder *b)
{
   return b->root_chunk.buffer.gpu;
}

static inline uint32_t
cs_root_chunk_size(struct cs_builder *b)
{
   /* Make sure cs_finish() was called. */
   assert(!memcmp(&b->cur_chunk, &(struct cs_chunk){0}, sizeof(b->cur_chunk)));

   return b->root_chunk.size * sizeof(uint64_t);
}

/*
 * Wrap the current queue. External users shouldn't call this function
 * directly, they should call cs_finish() when they are done building
 * the command stream, which will in turn call cs_wrap_chunk().
 *
 * Internally, this is also used to finalize internal CS chunks when
 * allocating new sub-chunks. See cs_reserve_instrs() for details.
 *
 * This notably requires patching the previous chunk with the length
 * we ended up emitting for this chunk.
 */
static inline void
cs_wrap_chunk(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   if (b->length_patch) {
      *b->length_patch = (b->cur_chunk.pos * 8);
      b->length_patch = NULL;
   }

   if (b->root_chunk.buffer.gpu == b->cur_chunk.buffer.gpu)
      b->root_chunk.size = b->cur_chunk.size;
}

enum cs_index_type {
   CS_INDEX_REGISTER = 0,
   CS_INDEX_UNDEF,
};

struct cs_index {
   enum cs_index_type type;

   /* Number of 32-bit words in the index, must be nonzero */
   uint8_t size;

   union {
      uint64_t imm;
      uint8_t reg;
   };
};

static inline struct cs_index
cs_undef(void)
{
   return (struct cs_index){
      .type = CS_INDEX_UNDEF,
   };
}

static inline uint8_t
cs_to_reg_tuple(struct cs_index idx, ASSERTED unsigned expected_size)
{
   assert(idx.type == CS_INDEX_REGISTER);
   assert(idx.size == expected_size);

   return idx.reg;
}

static inline unsigned
cs_src_tuple(struct cs_builder *b, struct cs_index src, ASSERTED unsigned count,
             uint16_t mask)
{
   unsigned reg = cs_to_reg_tuple(src, count);

   if (unlikely(b->conf.reg_perm)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if (mask & BITFIELD_BIT(i - reg)) {
            assert((b->conf.reg_perm(b, i) & CS_REG_RD) ||
                   !"Trying to read a restricted register");
         }
      }
   }

   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   if (unlikely(ls_tracker)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if ((mask & BITFIELD_BIT(i - reg)) &&
             BITSET_TEST(ls_tracker->pending_loads, i))
            assert(!"register used as a source before flushing loads\n");
      }
   }

   return reg;
}

static inline unsigned
cs_src32(struct cs_builder *b, struct cs_index src)
{
   return cs_src_tuple(b, src, 1, BITFIELD_MASK(1));
}

static inline unsigned
cs_src64(struct cs_builder *b, struct cs_index src)
{
   return cs_src_tuple(b, src, 2, BITFIELD_MASK(2));
}

static inline unsigned
cs_dst_tuple(struct cs_builder *b, struct cs_index dst, ASSERTED unsigned count,
             uint16_t mask)
{
   unsigned reg = cs_to_reg_tuple(dst, count);

   if (unlikely(b->conf.reg_perm)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if (mask & BITFIELD_BIT(i - reg)) {
            assert((b->conf.reg_perm(b, i) & CS_REG_WR) ||
                   !"Trying to write a restricted register");
         }
      }
   }

   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   if (unlikely(ls_tracker)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if ((mask & BITFIELD_BIT(i - reg)) &&
             BITSET_TEST(ls_tracker->pending_stores, i))
            assert(
               !"register reused as a destination before flushing stores\n");
      }
   }

   if (unlikely(b->conf.dirty_tracker)) {
      for (unsigned i = reg; i < reg + count; i++) {
         if (mask & BITFIELD_BIT(i - reg))
            BITSET_SET(b->conf.dirty_tracker->regs, i);
      }
   }

   return reg;
}

static inline unsigned
cs_dst32(struct cs_builder *b, struct cs_index dst)
{
   return cs_dst_tuple(b, dst, 1, BITFIELD_MASK(1));
}

static inline unsigned
cs_dst64(struct cs_builder *b, struct cs_index dst)
{
   return cs_dst_tuple(b, dst, 2, BITFIELD_MASK(2));
}

static inline struct cs_index
cs_reg_tuple(ASSERTED struct cs_builder *b, unsigned reg, unsigned size)
{
   assert(reg + size <= b->conf.nr_registers - b->conf.nr_kernel_registers &&
          "overflowed register file");
   assert(size <= 16 && "unsupported");

   return (struct cs_index){
      .type = CS_INDEX_REGISTER,
      .size = size,
      .reg = reg,
   };
}

static inline struct cs_index
cs_reg32(struct cs_builder *b, unsigned reg)
{
   return cs_reg_tuple(b, reg, 1);
}

static inline struct cs_index
cs_reg64(struct cs_builder *b, unsigned reg)
{
   assert((reg % 2) == 0 && "unaligned 64-bit reg");
   return cs_reg_tuple(b, reg, 2);
}

/*
 * The top of the register file is reserved for cs_builder internal use. We
 * need 3 spare registers for handling command queue overflow. These are
 * available here.
 */
static inline uint8_t
cs_overflow_address_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 2;
}

static inline uint8_t
cs_overflow_length_reg(struct cs_builder *b)
{
   return b->conf.nr_registers - 3;
}

static inline struct cs_index
cs_extract32(struct cs_builder *b, struct cs_index idx, unsigned word)
{
   assert(idx.type == CS_INDEX_REGISTER && "unsupported");
   assert(word < idx.size && "overrun");

   return cs_reg32(b, idx.reg + word);
}

static inline struct cs_block *
cs_cur_block(struct cs_builder *b)
{
   return b->blocks.stack;
}

#define JUMP_SEQ_INSTR_COUNT 4

static inline bool
cs_reserve_instrs(struct cs_builder *b, uint32_t num_instrs)
{
   /* Don't call this function with num_instrs=0. */
   assert(num_instrs > 0);
   assert(cs_cur_block(b) == NULL);

   /* If an allocation failure happened before, we just discard all following
    * instructions.
    */
   if (unlikely(!cs_is_valid(b)))
      return false;

   /* Lazy root chunk allocation. */
   if (unlikely(!b->root_chunk.buffer.cpu)) {
      b->root_chunk.buffer = b->conf.alloc_buffer(b->conf.cookie);
      b->cur_chunk.buffer = b->root_chunk.buffer;
      if (!b->cur_chunk.buffer.cpu) {
         b->invalid = true;
         return false;
      }
   }

   /* Make sure the instruction sequence fits in a single chunk. */
   assert(b->cur_chunk.buffer.capacity >= num_instrs);

   /* If the current chunk runs out of space, allocate a new one and jump to it.
    * We actually do this a few instructions before running out, because the
    * sequence to jump to a new queue takes multiple instructions.
    */
   if (unlikely((b->cur_chunk.size + num_instrs + JUMP_SEQ_INSTR_COUNT) >
                b->cur_chunk.buffer.capacity)) {
      /* Now, allocate a new chunk */
      struct cs_buffer newbuf = b->conf.alloc_buffer(b->conf.cookie);

      /* Allocation failure, from now on, all new instructions will be
       * discarded.
       */
      if (unlikely(!newbuf.cpu)) {
         b->invalid = true;
         return false;
      }

      uint64_t *ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_cast_and_pack(ptr, CS_MOVE, I) {
         I.destination = cs_overflow_address_reg(b);
         I.immediate = newbuf.gpu;
      }

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_cast_and_pack(ptr, CS_MOVE32, I) {
         I.destination = cs_overflow_length_reg(b);
      }

      /* The length will be patched in later */
      uint32_t *length_patch = (uint32_t *)ptr;

      ptr = b->cur_chunk.buffer.cpu + (b->cur_chunk.pos++);

      pan_cast_and_pack(ptr, CS_JUMP, I) {
         I.length = cs_overflow_length_reg(b);
         I.address = cs_overflow_address_reg(b);
      }

      /* Now that we've emitted everything, finish up the previous queue */
      cs_wrap_chunk(b);

      /* And make this one current */
      b->length_patch = length_patch;
      b->cur_chunk.buffer = newbuf;
      b->cur_chunk.pos = 0;
   }

   return true;
}

static inline void *
cs_alloc_ins_block(struct cs_builder *b, uint32_t num_instrs)
{
   if (cs_cur_block(b))
      return util_dynarray_grow(&b->blocks.instrs, uint64_t, num_instrs);

   if (!cs_reserve_instrs(b, num_instrs))
      return NULL;

   assert(b->cur_chunk.size + num_instrs - 1 < b->cur_chunk.buffer.capacity);
   uint32_t pos = b->cur_chunk.pos;
   b->cur_chunk.pos += num_instrs;
   return b->cur_chunk.buffer.cpu + pos;
}

static inline void
cs_flush_block_instrs(struct cs_builder *b)
{
   if (cs_cur_block(b) != NULL)
      return;

   uint32_t num_instrs =
      util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
   if (!num_instrs)
      return;

   /* If LOAD_IP is the last instruction in the block, we reserve one more
    * slot to make sure the next instruction won't point to a CS chunk linking
    * sequence. */
   if (unlikely(b->blocks.last_load_ip_target >= num_instrs)) {
      if (!cs_reserve_instrs(b, num_instrs + 1))
         return;
   }

   void *buffer = cs_alloc_ins_block(b, num_instrs);

   if (likely(buffer != NULL)) {
      /* If we have a LOAD_IP chain, we need to patch each LOAD_IP
       * instruction before we copy the block to the final memory
       * region. */
      while (unlikely(b->blocks.last_load_ip_target)) {
         uint64_t *instr = util_dynarray_element(
            &b->blocks.instrs, uint64_t, b->blocks.last_load_ip_target - 1);
         unsigned prev_load_ip_target = *instr & BITFIELD_MASK(32);
         uint64_t ip =
            b->cur_chunk.buffer.gpu +
            ((b->cur_chunk.pos - num_instrs + b->blocks.last_load_ip_target) *
             sizeof(uint64_t));

         /* Drop the prev_load_ip_target value and replace it by the final
          * IP. */
         *instr &= ~BITFIELD64_MASK(32);
         *instr |= ip;

         b->blocks.last_load_ip_target = prev_load_ip_target;
      }

      memcpy(buffer, b->blocks.instrs.data, b->blocks.instrs.size);
   }

   util_dynarray_clear(&b->blocks.instrs);
}

static inline uint32_t
cs_block_next_pos(struct cs_builder *b)
{
   assert(cs_cur_block(b) != NULL);

   return util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
}

static inline void
cs_label_init(struct cs_label *label)
{
   label->last_forward_ref = CS_LABEL_INVALID_POS;
   label->target = CS_LABEL_INVALID_POS;
}

static inline void
cs_set_label(struct cs_builder *b, struct cs_label *label)
{
   assert(label->target == CS_LABEL_INVALID_POS);
   label->target = cs_block_next_pos(b);

   for (uint32_t next_forward_ref, forward_ref = label->last_forward_ref;
        forward_ref != CS_LABEL_INVALID_POS; forward_ref = next_forward_ref) {
      uint64_t *ins =
         util_dynarray_element(&b->blocks.instrs, uint64_t, forward_ref);

      assert(forward_ref < label->target);
      assert(label->target - forward_ref <= INT16_MAX);

      /* Save the next forward reference to this target before overwriting
       * it with the final offset.
       */
      int16_t offset = *ins & BITFIELD64_MASK(16);

      next_forward_ref =
         offset > 0 ? forward_ref - offset : CS_LABEL_INVALID_POS;

      assert(next_forward_ref == CS_LABEL_INVALID_POS ||
             next_forward_ref < forward_ref);

      *ins &= ~BITFIELD64_MASK(16);
      *ins |= label->target - forward_ref - 1;
   }
}

static inline void
cs_flush_pending_if(struct cs_builder *b)
{
   if (likely(cs_cur_block(b) != &b->blocks.pending_if.block))
      return;

   cs_set_label(b, &b->blocks.pending_if.end_label);
   b->blocks.stack = b->blocks.pending_if.block.next;
   cs_flush_block_instrs(b);
}

static inline void *
cs_alloc_ins(struct cs_builder *b)
{
   /* If an instruction is emitted after an if_end(), it flushes the pending if,
    * causing further cs_else_start() instructions to be invalid. */
   cs_flush_pending_if(b);

   return cs_alloc_ins_block(b, 1) ?: &b->discard_instr_slot;
}

/* Call this when you are done building a command stream and want to prepare
 * it for submission.
 */
static inline void
cs_finish(struct cs_builder *b)
{
   if (!cs_is_valid(b))
      return;

   cs_flush_pending_if(b);
   cs_wrap_chunk(b);

   /* This prevents adding instructions after that point. */
   memset(&b->cur_chunk, 0, sizeof(b->cur_chunk));

   util_dynarray_fini(&b->blocks.instrs);
}

/*
 * Helper to emit a new instruction into the command queue. The allocation
 * needs to be separated out because pan_cast_and_pack() can evaluate its
 * argument multiple times, yet cs_alloc_ins() has side effects.
 */
#define cs_emit(b, T, cfg) pan_cast_and_pack(cs_alloc_ins(b), CS_##T, cfg)

/* Asynchronous operations take a mask of scoreboard slots to wait on
 * before executing the instruction, and signal a scoreboard slot when
 * the operation is complete.
 * A wait_mask of zero means the operation is synchronous, and signal_slot
 * is ignored in that case.
 */
struct cs_async_op {
   uint16_t wait_mask;
   uint8_t signal_slot;
};

static inline struct cs_async_op
cs_defer(unsigned wait_mask, unsigned signal_slot)
{
   /* The scoreboard slot to signal is incremented before the wait operation;
    * waiting on it would cause an infinite wait.
    */
   assert(!(wait_mask & BITFIELD_BIT(signal_slot)));

   return (struct cs_async_op){
      .wait_mask = wait_mask,
      .signal_slot = signal_slot,
   };
}

static inline struct cs_async_op
cs_now(void)
{
   return (struct cs_async_op){
      .wait_mask = 0,
      .signal_slot = ~0,
   };
}
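
/*
 * Example (illustrative sketch): defer a cache flush so it waits on
 * scoreboard slot 0 and signals slot 1, then wait on slot 1 before the
 * next instruction. The flush-mode arguments and the flush_id register
 * are placeholders.
 *
 *    cs_flush_caches(b, l2_mode, lsc_mode, false, flush_id,
 *                    cs_defer(BITFIELD_BIT(0), 1));
 *    cs_wait_slot(b, 1, false);
 *
 * cs_now() is the synchronous variant: no wait mask, and it cannot be used
 * with instructions that are always asynchronous (cs_apply_async() asserts
 * in that case).
 */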

static inline bool
cs_instr_is_asynchronous(enum mali_cs_opcode opcode, uint16_t wait_mask)
{
   switch (opcode) {
   case MALI_CS_OPCODE_FLUSH_CACHE2:
   case MALI_CS_OPCODE_FINISH_TILING:
   case MALI_CS_OPCODE_LOAD_MULTIPLE:
   case MALI_CS_OPCODE_STORE_MULTIPLE:
   case MALI_CS_OPCODE_RUN_COMPUTE:
   case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
   case MALI_CS_OPCODE_RUN_FRAGMENT:
   case MALI_CS_OPCODE_RUN_FULLSCREEN:
   case MALI_CS_OPCODE_RUN_IDVS:
   case MALI_CS_OPCODE_RUN_TILING:
      /* Always asynchronous. */
      return true;

   case MALI_CS_OPCODE_FINISH_FRAGMENT:
   case MALI_CS_OPCODE_SYNC_ADD32:
   case MALI_CS_OPCODE_SYNC_SET32:
   case MALI_CS_OPCODE_SYNC_ADD64:
   case MALI_CS_OPCODE_SYNC_SET64:
   case MALI_CS_OPCODE_STORE_STATE:
   case MALI_CS_OPCODE_TRACE_POINT:
   case MALI_CS_OPCODE_HEAP_OPERATION:
      /* Asynchronous only if wait_mask != 0. */
      return wait_mask != 0;

   default:
      return false;
   }
}

#define cs_apply_async(I, async)                                               \
   do {                                                                        \
      I.wait_mask = async.wait_mask;                                           \
      I.signal_slot = cs_instr_is_asynchronous(I.opcode, I.wait_mask)          \
                         ? async.signal_slot                                   \
                         : 0;                                                  \
      assert(I.signal_slot != ~0 ||                                            \
             !"Can't use cs_now() on pure async instructions");                \
   } while (0)

static inline void
cs_move32_to(struct cs_builder *b, struct cs_index dest, unsigned imm)
{
   cs_emit(b, MOVE32, I) {
      I.destination = cs_dst32(b, dest);
      I.immediate = imm;
   }
}

static inline void
cs_move48_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   cs_emit(b, MOVE, I) {
      I.destination = cs_dst64(b, dest);
      I.immediate = imm;
   }
}

static inline void
cs_load_ip_to(struct cs_builder *b, struct cs_index dest)
{
   /* If a load_ip instruction is emitted after an if_end(), it flushes the
    * pending if, causing further cs_else_start() instructions to be invalid.
    */
   cs_flush_pending_if(b);

   if (likely(cs_cur_block(b) == NULL)) {
      if (!cs_reserve_instrs(b, 2))
         return;

      /* We make IP point to the instruction right after our MOVE. */
      uint64_t ip =
         b->cur_chunk.buffer.gpu + (sizeof(uint64_t) * (b->cur_chunk.pos + 1));
      cs_move48_to(b, dest, ip);
   } else {
      cs_move48_to(b, dest, b->blocks.last_load_ip_target);
      b->blocks.last_load_ip_target =
         util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
   }
}

static inline void
cs_block_start(struct cs_builder *b, struct cs_block *block)
{
   cs_flush_pending_if(b);
   block->next = b->blocks.stack;
   b->blocks.stack = block;
}

static inline void
cs_block_end(struct cs_builder *b, struct cs_block *block)
{
   cs_flush_pending_if(b);

   assert(cs_cur_block(b) == block);

   b->blocks.stack = block->next;

   cs_flush_block_instrs(b);
}

static inline void
cs_branch(struct cs_builder *b, int offset, enum mali_cs_condition cond,
          struct cs_index val)
{
   cs_emit(b, BRANCH, I) {
      I.offset = offset;
      I.condition = cond;
      I.value = cs_src32(b, val);
   }
}

static inline void
cs_branch_label(struct cs_builder *b, struct cs_label *label,
                enum mali_cs_condition cond, struct cs_index val)
{
   assert(cs_cur_block(b) != NULL);

   if (label->target == CS_LABEL_INVALID_POS) {
      uint32_t branch_ins_pos = cs_block_next_pos(b);

      /* Instead of emitting a BRANCH with the final offset, we record the
       * diff between the current branch, and the previous branch that was
       * referencing this unset label. This way we build a single linked list
       * that can be walked when the label is set with cs_set_label().
       * We use -1 as the end-of-list marker.
       */
      int16_t offset = -1;
      if (label->last_forward_ref != CS_LABEL_INVALID_POS) {
         assert(label->last_forward_ref < branch_ins_pos);
         assert(branch_ins_pos - label->last_forward_ref <= INT16_MAX);
         offset = branch_ins_pos - label->last_forward_ref;
      }

      cs_emit(b, BRANCH, I) {
         I.offset = offset;
         I.condition = cond;
         I.value = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0;
      }

      label->last_forward_ref = branch_ins_pos;
   } else {
      int32_t offset = label->target - cs_block_next_pos(b) - 1;

      /* The branch target is encoded in a 16-bit signed integer, make sure we
       * don't underflow.
       */
      assert(offset >= INT16_MIN);

      /* Backward references are easy, we can emit them immediately. */
      cs_emit(b, BRANCH, I) {
         I.offset = offset;
         I.condition = cond;
         I.value = cond != MALI_CS_CONDITION_ALWAYS ? cs_src32(b, val) : 0;
      }
   }
}

static inline enum mali_cs_condition
cs_invert_cond(enum mali_cs_condition cond)
{
   switch (cond) {
   case MALI_CS_CONDITION_LEQUAL:
      return MALI_CS_CONDITION_GREATER;
   case MALI_CS_CONDITION_EQUAL:
      return MALI_CS_CONDITION_NEQUAL;
   case MALI_CS_CONDITION_LESS:
      return MALI_CS_CONDITION_GEQUAL;
   case MALI_CS_CONDITION_GREATER:
      return MALI_CS_CONDITION_LEQUAL;
   case MALI_CS_CONDITION_NEQUAL:
      return MALI_CS_CONDITION_EQUAL;
   case MALI_CS_CONDITION_GEQUAL:
      return MALI_CS_CONDITION_LESS;
   case MALI_CS_CONDITION_ALWAYS:
      unreachable("cannot invert ALWAYS");
   default:
      unreachable("invalid cond");
   }
}

static inline struct cs_if_else *
cs_if_start(struct cs_builder *b, struct cs_if_else *if_else,
            enum mali_cs_condition cond, struct cs_index val)
{
   cs_block_start(b, &if_else->block);
   cs_label_init(&if_else->end_label);
   cs_branch_label(b, &if_else->end_label, cs_invert_cond(cond), val);
   return if_else;
}

static inline void
cs_if_end(struct cs_builder *b, struct cs_if_else *if_else)
{
   assert(cs_cur_block(b) == &if_else->block);

   b->blocks.pending_if.block.next = if_else->block.next;
   b->blocks.stack = &b->blocks.pending_if.block;
   b->blocks.pending_if.end_label = if_else->end_label;
}

static inline struct cs_if_else *
cs_else_start(struct cs_builder *b, struct cs_if_else *if_else)
{
   assert(cs_cur_block(b) == &b->blocks.pending_if.block);

   if_else->block.next = b->blocks.pending_if.block.next;
   b->blocks.stack = &if_else->block;
   cs_label_init(&if_else->end_label);
   cs_branch_label(b, &if_else->end_label, MALI_CS_CONDITION_ALWAYS,
                   cs_undef());
   cs_set_label(b, &b->blocks.pending_if.end_label);
   cs_label_init(&b->blocks.pending_if.end_label);

   return if_else;
}

static inline void
cs_else_end(struct cs_builder *b, struct cs_if_else *if_else)
{
   cs_set_label(b, &if_else->end_label);
   cs_block_end(b, &if_else->block);
}

#define cs_if(__b, __cond, __val)                                              \
   for (struct cs_if_else __storage,                                           \
        *__if_else = cs_if_start(__b, &__storage, __cond, __val);              \
        __if_else != NULL; cs_if_end(__b, __if_else), __if_else = NULL)

#define cs_else(__b)                                                           \
   for (struct cs_if_else __storage,                                           \
        *__if_else = cs_else_start(__b, &__storage);                           \
        __if_else != NULL; cs_else_end(__b, __if_else), __if_else = NULL)
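
/*
 * Example (illustrative sketch): conditionally emit instructions based on
 * the value in a 32-bit register. "flags" is a hypothetical register
 * operand loaded elsewhere, and the register indices are placeholders.
 *
 *    cs_if(b, MALI_CS_CONDITION_GREATER, flags) {
 *       cs_move32_to(b, cs_reg32(b, 10), 1);
 *    }
 *    cs_else(b) {
 *       cs_move32_to(b, cs_reg32(b, 10), 0);
 *    }
 *
 * The cs_else() block must immediately follow the cs_if() block, since it
 * relies on the pending-if state kept in the builder.
 */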

struct cs_loop {
   struct cs_label start, end;
   struct cs_block block;
   enum mali_cs_condition cond;
   struct cs_index val;
   struct cs_load_store_tracker *orig_ls_state;
   struct cs_load_store_tracker ls_state;
};

static inline void
cs_loop_diverge_ls_update(struct cs_builder *b, struct cs_loop *loop)
{
   if (likely(!b->conf.ls_tracker))
      return;

   if (!loop->orig_ls_state) {
      loop->orig_ls_state = b->conf.ls_tracker;
      loop->ls_state = *loop->orig_ls_state;
      b->conf.ls_tracker = &loop->ls_state;
   } else {
      BITSET_OR(loop->orig_ls_state->pending_loads,
                loop->orig_ls_state->pending_loads,
                loop->ls_state.pending_loads);
      BITSET_OR(loop->orig_ls_state->pending_stores,
                loop->orig_ls_state->pending_stores,
                loop->ls_state.pending_stores);
   }
}

static inline struct cs_loop *
cs_do_while_start(struct cs_builder *b, struct cs_loop *loop,
                  enum mali_cs_condition cond, struct cs_index val)
{
   *loop = (struct cs_loop){
      .cond = cond,
      .val = val,
   };

   cs_block_start(b, &loop->block);
   cs_label_init(&loop->start);
   cs_label_init(&loop->end);
   cs_set_label(b, &loop->start);
   return loop;
}

static inline struct cs_loop *
cs_while_start(struct cs_builder *b, struct cs_loop *loop,
               enum mali_cs_condition cond, struct cs_index val)
{
   cs_do_while_start(b, loop, cond, val);

   /* Do an initial check on the condition, and if it's false, jump to
    * the end of the loop block. For 'while(true)' loops, skip the
    * conditional branch.
    */
   if (cond != MALI_CS_CONDITION_ALWAYS) {
      cs_branch_label(b, &loop->end, cs_invert_cond(cond), val);
      cs_loop_diverge_ls_update(b, loop);
   }

   return loop;
}

static inline void
cs_loop_conditional_continue(struct cs_builder *b, struct cs_loop *loop,
                             enum mali_cs_condition cond, struct cs_index val)
{
   cs_flush_pending_if(b);
   cs_branch_label(b, &loop->start, cond, val);
   cs_loop_diverge_ls_update(b, loop);
}

static inline void
cs_loop_conditional_break(struct cs_builder *b, struct cs_loop *loop,
                          enum mali_cs_condition cond, struct cs_index val)
{
   cs_flush_pending_if(b);
   cs_branch_label(b, &loop->end, cond, val);
   cs_loop_diverge_ls_update(b, loop);
}

static inline void
cs_while_end(struct cs_builder *b, struct cs_loop *loop)
{
   cs_flush_pending_if(b);
   cs_branch_label(b, &loop->start, loop->cond, loop->val);
   cs_set_label(b, &loop->end);
   cs_block_end(b, &loop->block);

   if (unlikely(loop->orig_ls_state)) {
      BITSET_OR(loop->orig_ls_state->pending_loads,
                loop->orig_ls_state->pending_loads,
                loop->ls_state.pending_loads);
      BITSET_OR(loop->orig_ls_state->pending_stores,
                loop->orig_ls_state->pending_stores,
                loop->ls_state.pending_stores);
      b->conf.ls_tracker = loop->orig_ls_state;
   }
}

#define cs_while(__b, __cond, __val)                                           \
   for (struct cs_loop __loop_storage,                                         \
        *__loop = cs_while_start(__b, &__loop_storage, __cond, __val);         \
        __loop != NULL; cs_while_end(__b, __loop), __loop = NULL)

#define cs_continue(__b)                                                       \
   cs_loop_conditional_continue(__b, __loop, MALI_CS_CONDITION_ALWAYS,         \
                                cs_undef())

#define cs_break(__b)                                                          \
   cs_loop_conditional_break(__b, __loop, MALI_CS_CONDITION_ALWAYS, cs_undef())
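
/*
 * Example (illustrative sketch): decrement a counter register until it
 * reaches zero. "counter" is a hypothetical 32-bit register operand that
 * was initialized before the loop.
 *
 *    cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
 *       ... loop body ...
 *       cs_add32(b, counter, counter, -1);
 *    }
 *
 * cs_break(b) and cs_continue(b) can be used inside the body; they expand
 * to unconditional branches to the loop end/start labels.
 */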

/* Pseudoinstructions follow */

static inline void
cs_move64_to(struct cs_builder *b, struct cs_index dest, uint64_t imm)
{
   if (imm < (1ull << 48)) {
      /* Zero extends */
      cs_move48_to(b, dest, imm);
   } else {
      cs_move32_to(b, cs_extract32(b, dest, 0), imm);
      cs_move32_to(b, cs_extract32(b, dest, 1), imm >> 32);
   }
}

static inline void
cs_wait_slots(struct cs_builder *b, unsigned wait_mask, bool progress_inc)
{
   struct cs_load_store_tracker *ls_tracker = b->conf.ls_tracker;

   cs_emit(b, WAIT, I) {
      I.wait_mask = wait_mask;
      I.progress_increment = progress_inc;
   }

   /* We don't do advanced tracking of cs_defer(), and assume that
    * load/store will be flushed with an explicit wait on the load/store
    * scoreboard. */
   if (unlikely(ls_tracker) &&
       (wait_mask & BITFIELD_BIT(ls_tracker->sb_slot))) {
      BITSET_CLEAR_RANGE(ls_tracker->pending_loads, 0, 255);
      BITSET_CLEAR_RANGE(ls_tracker->pending_stores, 0, 255);
   }
}

static inline void
cs_wait_slot(struct cs_builder *b, unsigned slot, bool progress_inc)
{
   assert(slot < 8 && "invalid slot");

   cs_wait_slots(b, BITFIELD_BIT(slot), progress_inc);
}

struct cs_shader_res_sel {
   uint8_t srt, fau, spd, tsd;
};

static inline struct cs_shader_res_sel
cs_shader_res_sel(unsigned srt, unsigned fau, unsigned spd, unsigned tsd)
{
   return (struct cs_shader_res_sel){
      .srt = srt,
      .fau = fau,
      .spd = spd,
      .tsd = tsd,
   };
}

static inline void
cs_run_compute(struct cs_builder *b, unsigned task_increment,
               enum mali_task_axis task_axis, bool progress_inc,
               struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE, I) {
      I.task_increment = task_increment;
      I.task_axis = task_axis;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_tiling(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
              struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_TILING, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_run_idvs(struct cs_builder *b, uint32_t flags_override, bool progress_inc,
            bool malloc_enable, struct cs_shader_res_sel varying_sel,
            struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
   cs_emit(b, RUN_IDVS, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.malloc_enable = malloc_enable;

      if (draw_id.type == CS_INDEX_UNDEF) {
         I.draw_id_register_enable = false;
      } else {
         I.draw_id_register_enable = true;
         I.draw_id = cs_src32(b, draw_id);
      }

      assert(varying_sel.spd == 1);
      assert(varying_sel.fau == 0 || varying_sel.fau == 1);
      assert(varying_sel.srt == 0 || varying_sel.srt == 1);
      assert(varying_sel.tsd == 0 || varying_sel.tsd == 1);
      I.varying_fau_select = varying_sel.fau == 1;
      I.varying_srt_select = varying_sel.srt == 1;
      I.varying_tsd_select = varying_sel.tsd == 1;

      assert(frag_sel.spd == 2);
      assert(frag_sel.fau == 2);
      assert(frag_sel.srt == 2 || frag_sel.srt == 0);
      assert(frag_sel.tsd == 2 || frag_sel.tsd == 0);
      I.fragment_srt_select = frag_sel.srt == 2;
      I.fragment_tsd_select = frag_sel.tsd == 2;
   }
}

static inline void
cs_run_fragment(struct cs_builder *b, bool enable_tem,
                enum mali_tile_render_order tile_order, bool progress_inc)
{
   cs_emit(b, RUN_FRAGMENT, I) {
      I.enable_tem = enable_tem;
      I.tile_order = tile_order;
      I.progress_increment = progress_inc;
   }
}

static inline void
cs_run_fullscreen(struct cs_builder *b, uint32_t flags_override,
                  bool progress_inc, struct cs_index dcd)
{
   cs_emit(b, RUN_FULLSCREEN, I) {
      I.flags_override = flags_override;
      I.progress_increment = progress_inc;
      I.dcd = cs_src64(b, dcd);
   }
}

static inline void
cs_finish_tiling(struct cs_builder *b, bool progress_inc)
{
   cs_emit(b, FINISH_TILING, I)
      I.progress_increment = progress_inc;
}

static inline void
cs_finish_fragment(struct cs_builder *b, bool increment_frag_completed,
                   struct cs_index first_free_heap_chunk,
                   struct cs_index last_free_heap_chunk,
                   struct cs_async_op async)
{
   cs_emit(b, FINISH_FRAGMENT, I) {
      I.increment_fragment_completed = increment_frag_completed;
      cs_apply_async(I, async);
      I.first_heap_chunk = cs_src64(b, first_free_heap_chunk);
      I.last_heap_chunk = cs_src64(b, last_free_heap_chunk);
   }
}

static inline void
cs_add32(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE32, I) {
      I.destination = cs_dst32(b, dest);
      I.source = cs_src32(b, src);
      I.immediate = imm;
   }
}

static inline void
cs_add64(struct cs_builder *b, struct cs_index dest, struct cs_index src,
         unsigned imm)
{
   cs_emit(b, ADD_IMMEDIATE64, I) {
      I.destination = cs_dst64(b, dest);
      I.source = cs_src64(b, src);
      I.immediate = imm;
   }
}

static inline void
cs_umin32(struct cs_builder *b, struct cs_index dest, struct cs_index src1,
          struct cs_index src2)
{
   cs_emit(b, UMIN32, I) {
      I.destination = cs_dst32(b, dest);
      I.source_1 = cs_src32(b, src1);
      I.source_2 = cs_src32(b, src2);
   }
}

static inline void
cs_load_to(struct cs_builder *b, struct cs_index dest, struct cs_index address,
           unsigned mask, int offset)
{
   unsigned count = util_last_bit(mask);
   unsigned base_reg = cs_dst_tuple(b, dest, count, mask);

   cs_emit(b, LOAD_MULTIPLE, I) {
      I.base_register = base_reg;
      I.address = cs_src64(b, address);
      I.mask = mask;
      I.offset = offset;
   }

   if (unlikely(b->conf.ls_tracker)) {
      for (unsigned i = 0; i < count; i++) {
         if (mask & BITFIELD_BIT(i))
            BITSET_SET(b->conf.ls_tracker->pending_loads, base_reg + i);
      }
   }
}

static inline void
cs_load32_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_load64_to(struct cs_builder *b, struct cs_index dest,
             struct cs_index address, int offset)
{
   cs_load_to(b, dest, address, BITFIELD_MASK(2), offset);
}

static inline void
cs_store(struct cs_builder *b, struct cs_index data, struct cs_index address,
         unsigned mask, int offset)
{
   unsigned count = util_last_bit(mask);
   unsigned base_reg = cs_src_tuple(b, data, count, mask);

   cs_emit(b, STORE_MULTIPLE, I) {
      I.base_register = base_reg;
      I.address = cs_src64(b, address);
      I.mask = mask;
      I.offset = offset;
   }

   if (unlikely(b->conf.ls_tracker)) {
      for (unsigned i = 0; i < count; i++) {
         if (mask & BITFIELD_BIT(i))
            BITSET_SET(b->conf.ls_tracker->pending_stores, base_reg + i);
      }
   }
}

static inline void
cs_store32(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(1), offset);
}

static inline void
cs_store64(struct cs_builder *b, struct cs_index data, struct cs_index address,
           int offset)
{
   cs_store(b, data, address, BITFIELD_MASK(2), offset);
}
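
/*
 * Example (illustrative sketch): load a 64-bit value and wait on the
 * load/store scoreboard slot before consuming it, which is exactly the
 * hazard the optional cs_load_store_tracker checks for. The register
 * indices and the slot number (0 here) are placeholders.
 *
 *    struct cs_index addr = cs_reg64(b, 40);
 *    struct cs_index val = cs_reg64(b, 42);
 *
 *    cs_load64_to(b, val, addr, 0);
 *    cs_wait_slot(b, 0, false);
 *    cs_add64(b, val, val, 16);
 */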

/*
 * Select which scoreboard entry will track endpoint tasks and other tasks
 * respectively. Pass the corresponding slot to cs_wait_slot()/cs_wait_slots()
 * to wait on it later.
 */
static inline void
cs_set_scoreboard_entry(struct cs_builder *b, unsigned ep, unsigned other)
{
   assert(ep < 8 && "invalid slot");
   assert(other < 8 && "invalid slot");

   cs_emit(b, SET_SB_ENTRY, I) {
      I.endpoint_entry = ep;
      I.other_entry = other;
   }

   /* We assume the load/store scoreboard entry is static to keep things
    * simple. */
   if (unlikely(b->conf.ls_tracker))
      assert(b->conf.ls_tracker->sb_slot == other);
}

static inline void
cs_progress_wait(struct cs_builder *b, unsigned queue, struct cs_index ref)
{
   cs_emit(b, PROGRESS_WAIT, I) {
      I.source = cs_src64(b, ref);
      I.queue = queue;
   }
}

static inline void
cs_set_exception_handler(struct cs_builder *b,
                         enum mali_cs_exception_type exception_type,
                         struct cs_index address, struct cs_index length)
{
   cs_emit(b, SET_EXCEPTION_HANDLER, I) {
      I.exception_type = exception_type;
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

static inline void
cs_call(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, CALL, I) {
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

static inline void
cs_jump(struct cs_builder *b, struct cs_index address, struct cs_index length)
{
   cs_emit(b, JUMP, I) {
      I.address = cs_src64(b, address);
      I.length = cs_src32(b, length);
   }
}

enum cs_res_id {
   CS_COMPUTE_RES = BITFIELD_BIT(0),
   CS_FRAG_RES = BITFIELD_BIT(1),
   CS_TILER_RES = BITFIELD_BIT(2),
   CS_IDVS_RES = BITFIELD_BIT(3),
};

static inline void
cs_req_res(struct cs_builder *b, uint32_t res_mask)
{
   cs_emit(b, REQ_RESOURCE, I) {
      I.compute = res_mask & CS_COMPUTE_RES;
      I.tiler = res_mask & CS_TILER_RES;
      I.idvs = res_mask & CS_IDVS_RES;
      I.fragment = res_mask & CS_FRAG_RES;
   }
}

static inline void
cs_flush_caches(struct cs_builder *b, enum mali_cs_flush_mode l2,
                enum mali_cs_flush_mode lsc, bool other_inv,
                struct cs_index flush_id, struct cs_async_op async)
{
   cs_emit(b, FLUSH_CACHE2, I) {
      I.l2_flush_mode = l2;
      I.lsc_flush_mode = lsc;
      I.other_invalidate = other_inv;
      I.latest_flush_id = cs_src32(b, flush_id);
      cs_apply_async(I, async);
   }
}

#define CS_SYNC_OPS(__cnt_width)                                               \
   static inline void cs_sync##__cnt_width##_set(                              \
      struct cs_builder *b, bool propagate_error,                              \
      enum mali_cs_sync_scope scope, struct cs_index val,                      \
      struct cs_index addr, struct cs_async_op async)                          \
   {                                                                           \
      cs_emit(b, SYNC_SET##__cnt_width, I) {                                   \
         I.error_propagate = propagate_error;                                  \
         I.scope = scope;                                                      \
         I.data = cs_src##__cnt_width(b, val);                                 \
         I.address = cs_src64(b, addr);                                        \
         cs_apply_async(I, async);                                             \
      }                                                                        \
   }                                                                           \
                                                                               \
   static inline void cs_sync##__cnt_width##_add(                              \
      struct cs_builder *b, bool propagate_error,                              \
      enum mali_cs_sync_scope scope, struct cs_index val,                      \
      struct cs_index addr, struct cs_async_op async)                          \
   {                                                                           \
      cs_emit(b, SYNC_ADD##__cnt_width, I) {                                   \
         I.error_propagate = propagate_error;                                  \
         I.scope = scope;                                                      \
         I.data = cs_src##__cnt_width(b, val);                                 \
         I.address = cs_src64(b, addr);                                        \
         cs_apply_async(I, async);                                             \
      }                                                                        \
   }                                                                           \
                                                                               \
   static inline void cs_sync##__cnt_width##_wait(                             \
      struct cs_builder *b, bool reject_error, enum mali_cs_condition cond,    \
      struct cs_index ref, struct cs_index addr)                               \
   {                                                                           \
      assert(cond == MALI_CS_CONDITION_LEQUAL ||                               \
             cond == MALI_CS_CONDITION_GREATER);                               \
      cs_emit(b, SYNC_WAIT##__cnt_width, I) {                                  \
         I.error_reject = reject_error;                                        \
         I.condition = cond;                                                   \
         I.data = cs_src##__cnt_width(b, ref);                                 \
         I.address = cs_src64(b, addr);                                        \
      }                                                                        \
   }

CS_SYNC_OPS(32)
CS_SYNC_OPS(64)

static inline void
cs_store_state(struct cs_builder *b, struct cs_index address, int offset,
               enum mali_cs_state state, struct cs_async_op async)
{
   cs_emit(b, STORE_STATE, I) {
      I.offset = offset;
      I.state = state;
      I.address = cs_src64(b, address);
      cs_apply_async(I, async);
   }
}

static inline void
cs_prot_region(struct cs_builder *b, unsigned size)
{
   cs_emit(b, PROT_REGION, I) {
      I.size = size;
   }
}

static inline void
cs_progress_store(struct cs_builder *b, struct cs_index src)
{
   cs_emit(b, PROGRESS_STORE, I)
      I.source = cs_src64(b, src);
}

static inline void
cs_progress_load(struct cs_builder *b, struct cs_index dst)
{
   cs_emit(b, PROGRESS_LOAD, I)
      I.destination = cs_dst64(b, dst);
}

static inline void
cs_run_compute_indirect(struct cs_builder *b, unsigned wg_per_task,
                        bool progress_inc, struct cs_shader_res_sel res_sel)
{
   cs_emit(b, RUN_COMPUTE_INDIRECT, I) {
      I.workgroups_per_task = wg_per_task;
      I.progress_increment = progress_inc;
      I.srt_select = res_sel.srt;
      I.spd_select = res_sel.spd;
      I.tsd_select = res_sel.tsd;
      I.fau_select = res_sel.fau;
   }
}

static inline void
cs_error_barrier(struct cs_builder *b)
{
   cs_emit(b, ERROR_BARRIER, _)
      ;
}

static inline void
cs_heap_set(struct cs_builder *b, struct cs_index address)
{
   cs_emit(b, HEAP_SET, I) {
      I.address = cs_src64(b, address);
   }
}

static inline void
cs_heap_operation(struct cs_builder *b, enum mali_cs_heap_operation operation,
                  struct cs_async_op async)
{
   cs_emit(b, HEAP_OPERATION, I) {
      I.operation = operation;
      cs_apply_async(I, async);
   }
}

static inline void
cs_vt_start(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, async);
}

static inline void
cs_vt_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED, async);
}

static inline void
cs_frag_end(struct cs_builder *b, struct cs_async_op async)
{
   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_FRAGMENT_COMPLETED, async);
}

static inline void
cs_trace_point(struct cs_builder *b, struct cs_index regs,
               struct cs_async_op async)
{
   cs_emit(b, TRACE_POINT, I) {
      I.base_register =
         cs_src_tuple(b, regs, regs.size, BITFIELD_MASK(regs.size));
      I.register_count = regs.size;
      cs_apply_async(I, async);
   }
}

struct cs_match {
   struct cs_block block;
   struct cs_label break_label;
   struct cs_block case_block;
   struct cs_label next_case_label;
   struct cs_index val;
   struct cs_index scratch_reg;
   struct cs_load_store_tracker case_ls_state;
   struct cs_load_store_tracker ls_state;
   struct cs_load_store_tracker *orig_ls_state;
   bool default_emitted;
};

static inline struct cs_match *
cs_match_start(struct cs_builder *b, struct cs_match *match,
               struct cs_index val, struct cs_index scratch_reg)
{
   *match = (struct cs_match){
      .val = val,
      .scratch_reg = scratch_reg,
      .orig_ls_state = b->conf.ls_tracker,
   };

   cs_block_start(b, &match->block);
   cs_label_init(&match->break_label);
   cs_label_init(&match->next_case_label);

   return match;
}

static inline void
cs_match_case_ls_set(struct cs_builder *b, struct cs_match *match)
{
   if (unlikely(match->orig_ls_state)) {
      match->case_ls_state = *match->orig_ls_state;
      b->conf.ls_tracker = &match->case_ls_state;
   }
}

static inline void
cs_match_case_ls_get(struct cs_match *match)
{
   if (unlikely(match->orig_ls_state)) {
      BITSET_OR(match->ls_state.pending_loads,
                match->case_ls_state.pending_loads,
                match->ls_state.pending_loads);
      BITSET_OR(match->ls_state.pending_stores,
                match->case_ls_state.pending_stores,
                match->ls_state.pending_stores);
   }
}

static inline void
cs_match_case(struct cs_builder *b, struct cs_match *match, uint32_t id)
{
   assert(!match->default_emitted || !"default case must be last");
   if (match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS) {
      cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
                      cs_undef());
      cs_block_end(b, &match->case_block);
      cs_match_case_ls_get(match);
      cs_set_label(b, &match->next_case_label);
      cs_label_init(&match->next_case_label);
   }

   if (id)
      cs_add32(b, match->scratch_reg, match->val, -id);

   cs_branch_label(b, &match->next_case_label, MALI_CS_CONDITION_NEQUAL,
                   id ? match->scratch_reg : match->val);

   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
}

static inline void
cs_match_default(struct cs_builder *b, struct cs_match *match)
{
   assert(match->next_case_label.last_forward_ref != CS_LABEL_INVALID_POS ||
          !"default case requires at least one other case");
   cs_branch_label(b, &match->break_label, MALI_CS_CONDITION_ALWAYS,
                   cs_undef());

   if (cs_cur_block(b) == &match->case_block) {
      cs_block_end(b, &match->case_block);
      cs_match_case_ls_get(match);
   }

   cs_set_label(b, &match->next_case_label);
   cs_label_init(&match->next_case_label);
   cs_match_case_ls_set(b, match);
   cs_block_start(b, &match->case_block);
   match->default_emitted = true;
}
1672
1673 static inline void
cs_match_end(struct cs_builder * b,struct cs_match * match)1674 cs_match_end(struct cs_builder *b, struct cs_match *match)
1675 {
1676 if (cs_cur_block(b) == &match->case_block) {
1677 cs_match_case_ls_get(match);
1678 cs_block_end(b, &match->case_block);
1679 }
1680
1681 if (unlikely(match->orig_ls_state)) {
1682 if (!match->default_emitted) {
         /* If we don't have a default, assume we don't handle all possible
          * cases and merge the match load/store state into the original
          * load/store state.
          */
         BITSET_OR(match->orig_ls_state->pending_loads,
                   match->ls_state.pending_loads,
                   match->orig_ls_state->pending_loads);
         BITSET_OR(match->orig_ls_state->pending_stores,
                   match->ls_state.pending_stores,
                   match->orig_ls_state->pending_stores);
      } else {
         *match->orig_ls_state = match->ls_state;
      }

      b->conf.ls_tracker = match->orig_ls_state;
   }

   cs_set_label(b, &match->next_case_label);
   cs_set_label(b, &match->break_label);

   cs_block_end(b, &match->block);
}

#define cs_match(__b, __val, __scratch)                                        \
   for (struct cs_match __match_storage,                                       \
           *__match = cs_match_start(__b, &__match_storage, __val, __scratch); \
        __match != NULL; cs_match_end(__b, &__match_storage), __match = NULL)

#define cs_case(__b, __ref)                                                    \
   for (bool __case_defined = ({                                               \
           cs_match_case(__b, __match, __ref);                                 \
           false;                                                              \
        });                                                                    \
        !__case_defined; __case_defined = true)

#define cs_default(__b)                                                        \
   for (bool __default_defined = ({                                            \
           cs_match_default(__b, __match);                                     \
           false;                                                              \
        });                                                                    \
        !__default_defined; __default_defined = true)
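
/*
 * Usage sketch for the match construct above, purely illustrative: the
 * register numbers are arbitrary, and cs_reg32() is assumed to be the 32-bit
 * counterpart of the cs_reg64() helper used further down in this file.
 *
 *    struct cs_index val = cs_reg32(b, 42);
 *    struct cs_index scratch = cs_reg32(b, 43);
 *
 *    cs_match(b, val, scratch) {
 *       cs_case(b, 1) {
 *          ... instructions executed only when val == 1 ...
 *       }
 *       cs_case(b, 2) {
 *          ... instructions executed only when val == 2 ...
 *       }
 *       cs_default(b) {
 *          ... fallback; a default, if present, must come last ...
 *       }
 *    }
 */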

static inline void
cs_nop(struct cs_builder *b)
{
   cs_emit(b, NOP, I) {};
}

struct cs_exception_handler_ctx {
   struct cs_index ctx_reg;
   unsigned dump_addr_offset;
   uint8_t ls_sb_slot;
};

struct cs_exception_handler {
   struct cs_block block;
   struct cs_dirty_tracker dirty;
   struct cs_exception_handler_ctx ctx;
   unsigned dump_size;
   uint64_t address;
   uint32_t length;
};

static inline struct cs_exception_handler *
cs_exception_handler_start(struct cs_builder *b,
                           struct cs_exception_handler *handler,
                           struct cs_exception_handler_ctx ctx)
{
   assert(cs_cur_block(b) == NULL);
   assert(b->conf.dirty_tracker == NULL);

   *handler = (struct cs_exception_handler){
      .ctx = ctx,
   };

   cs_block_start(b, &handler->block);

   b->conf.dirty_tracker = &handler->dirty;

   return handler;
}

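/* A single cs_store()/cs_load_to() takes a 16-bit register mask, so one
 * save/restore operation covers at most 16 registers; with a register file of
 * at most 256 entries we therefore never need more than 256 / 16 ranges. */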
#define SAVE_RESTORE_MAX_OPS (256 / 16)

static inline void
cs_exception_handler_end(struct cs_builder *b,
                         struct cs_exception_handler *handler)
{
   struct cs_index ranges[SAVE_RESTORE_MAX_OPS];
   uint16_t masks[SAVE_RESTORE_MAX_OPS];
   unsigned num_ranges = 0;
   uint32_t num_instrs =
      util_dynarray_num_elements(&b->blocks.instrs, uint64_t);
   struct cs_index addr_reg = {
      .type = CS_INDEX_REGISTER,
      .size = 2,
      .reg = b->conf.nr_registers - 2,
   };

   /* Manual cs_block_end() without an instruction flush. We do that to insert
    * the preamble without having to move memory in b->blocks.instrs. The flush
    * will be done after the preamble has been emitted. */
   assert(cs_cur_block(b) == &handler->block);
   assert(handler->block.next == NULL);
   b->blocks.stack = NULL;

   if (!num_instrs)
      return;

   /* Try to minimize number of load/store by grouping them */
   unsigned nregs = b->conf.nr_registers - b->conf.nr_kernel_registers;
   unsigned pos, last = 0;

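   /* Worked example of the grouping below, with made-up register numbers: if
    * r3, r5 and r20 are dirty, the first iteration opens a 16-register window
    * at r3 with mask 0b101 (so only r3 and r5 are actually saved/restored),
    * and r20 then opens a second window of its own, giving two ranges instead
    * of three single-register operations. */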
   BITSET_FOREACH_SET(pos, handler->dirty.regs, nregs) {
      unsigned range = MIN2(nregs - pos, 16);
      unsigned word = BITSET_BITWORD(pos);
      unsigned bit = pos % BITSET_WORDBITS;
      unsigned remaining_bits = BITSET_WORDBITS - bit;

      if (pos < last)
         continue;

      masks[num_ranges] = handler->dirty.regs[word] >> bit;
      if (remaining_bits < range)
         masks[num_ranges] |= handler->dirty.regs[word + 1] << remaining_bits;
      masks[num_ranges] &= BITFIELD_MASK(range);

      ranges[num_ranges] =
         cs_reg_tuple(b, pos, util_last_bit(masks[num_ranges]));
      num_ranges++;
      last = pos + range;
   }

   handler->dump_size = BITSET_COUNT(handler->dirty.regs) * sizeof(uint32_t);

   /* Make sure the current chunk is able to accommodate the block
    * instructions as well as the preamble and postamble. We add 4
    * instructions (2x wait_slot plus the move for the address), as the move
    * might actually be translated into two MOVE32 instructions. */
   num_instrs += (num_ranges * 2) + 4;

   /* Align things on a cache-line in case the buffer contains more than one
    * exception handler (64 bytes = 8 instructions). */
   uint32_t padded_num_instrs = ALIGN_POT(num_instrs, 8);

   if (!cs_reserve_instrs(b, padded_num_instrs))
      return;

   handler->address =
      b->cur_chunk.buffer.gpu + (b->cur_chunk.pos * sizeof(uint64_t));

   /* Preamble: backup modified registers */
   if (num_ranges > 0) {
      unsigned offset = 0;

      cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
                   handler->ctx.dump_addr_offset);
      cs_wait_slot(b, handler->ctx.ls_sb_slot, false);

      for (unsigned i = 0; i < num_ranges; ++i) {
         unsigned reg_count = util_bitcount(masks[i]);

         cs_store(b, ranges[i], addr_reg, masks[i], offset);
         offset += reg_count * 4;
      }

      cs_wait_slot(b, handler->ctx.ls_sb_slot, false);
   }

   /* Now that the preamble is emitted, we can flush the instructions we have
    * in our exception handler block. */
   cs_flush_block_instrs(b);

   /* Postamble: restore modified registers */
   if (num_ranges > 0) {
      unsigned offset = 0;

      cs_load64_to(b, addr_reg, handler->ctx.ctx_reg,
                   handler->ctx.dump_addr_offset);
      cs_wait_slot(b, handler->ctx.ls_sb_slot, false);

      for (unsigned i = 0; i < num_ranges; ++i) {
         unsigned reg_count = util_bitcount(masks[i]);

         cs_load_to(b, ranges[i], addr_reg, masks[i], offset);
         offset += reg_count * 4;
      }

      cs_wait_slot(b, handler->ctx.ls_sb_slot, false);
   }

   /* Fill the rest of the buffer with NOPs. */
   for (; num_instrs < padded_num_instrs; num_instrs++)
      cs_nop(b);

   handler->length = padded_num_instrs;
}

#define cs_exception_handler_def(__b, __handler, __ctx)                        \
   for (struct cs_exception_handler *__ehandler =                              \
           cs_exception_handler_start(__b, __handler, __ctx);                  \
        __ehandler != NULL;                                                    \
        cs_exception_handler_end(__b, __handler), __ehandler = NULL)
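
/*
 * Usage sketch for the exception handler helpers, purely illustrative: the
 * register number, offset and scoreboard slot are made-up values, and the
 * caller is expected to have placed the address of a register-dump buffer at
 * dump_addr_offset inside the context pointed to by ctx_reg. The handler must
 * be defined outside of any block (see the assert in
 * cs_exception_handler_start()).
 *
 *    struct cs_exception_handler handler;
 *    struct cs_exception_handler_ctx handler_ctx = {
 *       .ctx_reg = cs_reg64(b, 84),
 *       .dump_addr_offset = 0,
 *       .ls_sb_slot = 0,
 *    };
 *
 *    cs_exception_handler_def(b, &handler, handler_ctx) {
 *       ... handler body; any register written here is saved to the dump
 *           buffer on entry and restored from it on exit ...
 *    }
 *
 * After the block, handler.address and handler.length give the GPU address
 * and size (in 64-bit instructions) of the emitted handler.
 */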

struct cs_tracing_ctx {
   bool enabled;
   struct cs_index ctx_reg;
   unsigned tracebuf_addr_offset;
   uint8_t ls_sb_slot;
};
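
/*
 * The cs_trace_run_*() helpers below behave exactly like the corresponding
 * cs_run_*() helpers when ctx->enabled is false. When tracing is enabled,
 * they additionally append a fixed-size trace record (instruction pointer
 * plus the relevant shader registers) to the trace buffer whose address is
 * stored at ctx_reg + tracebuf_addr_offset, using ls_sb_slot to order the
 * load/store operations.
 */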

static inline void
cs_trace_preamble(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                  struct cs_index scratch_regs, unsigned trace_size)
{
   assert(trace_size > 0 && ALIGN_POT(trace_size, 64) == trace_size &&
          trace_size < INT16_MAX);
   assert(scratch_regs.size >= 4 && !(scratch_regs.reg & 1));

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);

   /* We always update the tracebuf position first, so we can easily detect
    * OOB access. Use cs_trace_field_offset() to get an offset taking this
    * pre-increment into account. */
   cs_load64_to(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
   cs_wait_slot(b, ctx->ls_sb_slot, false);
   cs_add64(b, tracebuf_addr, tracebuf_addr, trace_size);
   cs_store64(b, tracebuf_addr, ctx->ctx_reg, ctx->tracebuf_addr_offset);
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

#define cs_trace_field_offset(__type, __field)                                 \
   (int16_t)(offsetof(struct cs_##__type##_trace, __field) -                   \
             sizeof(struct cs_##__type##_trace))
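
/*
 * For example, with the cs_run_fragment_trace layout below (padded to 64
 * bytes by its alignment attribute), cs_trace_field_offset(run_fragment, ip)
 * evaluates to 0 - 64 = -64 and cs_trace_field_offset(run_fragment, sr) to
 * 8 - 64 = -56. Since cs_trace_preamble() has already advanced the trace
 * buffer pointer past the record being written, these negative offsets land
 * the stores in the record that was just claimed.
 */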

struct cs_run_fragment_trace {
   uint64_t ip;
   uint32_t sr[7];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_fragment(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                      struct cs_index scratch_regs, bool enable_tem,
                      enum mali_tile_render_order tile_order, bool progress_inc)
{
   if (likely(!ctx->enabled)) {
      cs_run_fragment(b, enable_tem, tile_order, progress_inc);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_fragment_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_fragment(b, enable_tem, tile_order, progress_inc);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_fragment, ip));

   cs_store(b, cs_reg_tuple(b, 40, 7), tracebuf_addr, BITFIELD_MASK(7),
            cs_trace_field_offset(run_fragment, sr));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}
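
/*
 * Example call, with made-up context and register numbers. As asserted in
 * cs_trace_preamble(), the scratch tuple must cover at least four 32-bit
 * registers and start on an even register so that two 64-bit pairs can be
 * carved out of it:
 *
 *    cs_trace_run_fragment(b, &trace_ctx, cs_reg_tuple(b, 84, 4),
 *                          enable_tem, tile_order, progress_inc);
 */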

struct cs_run_idvs_trace {
   uint64_t ip;
   uint32_t draw_id;
   uint32_t pad;
   uint32_t sr[61];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_idvs(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                  struct cs_index scratch_regs, uint32_t flags_override,
                  bool progress_inc, bool malloc_enable,
                  struct cs_shader_res_sel varying_sel,
                  struct cs_shader_res_sel frag_sel, struct cs_index draw_id)
{
   if (likely(!ctx->enabled)) {
      cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
                  frag_sel, draw_id);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_idvs_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_idvs(b, flags_override, progress_inc, malloc_enable, varying_sel,
               frag_sel, draw_id);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_idvs, ip));

   if (draw_id.type != CS_INDEX_UNDEF)
      cs_store32(b, draw_id, tracebuf_addr,
                 cs_trace_field_offset(run_idvs, draw_id));

   for (unsigned i = 0; i < 48; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_idvs, sr[i]));
   cs_store(b, cs_reg_tuple(b, 48, 13), tracebuf_addr, BITFIELD_MASK(13),
            cs_trace_field_offset(run_idvs, sr[48]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

struct cs_run_compute_trace {
   uint64_t ip;
   uint32_t sr[40];
} __attribute__((aligned(64)));

static inline void
cs_trace_run_compute(struct cs_builder *b, const struct cs_tracing_ctx *ctx,
                     struct cs_index scratch_regs, unsigned task_increment,
                     enum mali_task_axis task_axis, bool progress_inc,
                     struct cs_shader_res_sel res_sel)
{
   if (likely(!ctx->enabled)) {
      cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_compute_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_compute(b, task_increment, task_axis, progress_inc, res_sel);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));

   for (unsigned i = 0; i < 32; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}

static inline void
cs_trace_run_compute_indirect(struct cs_builder *b,
                              const struct cs_tracing_ctx *ctx,
                              struct cs_index scratch_regs,
                              unsigned wg_per_task, bool progress_inc,
                              struct cs_shader_res_sel res_sel)
{
   if (likely(!ctx->enabled)) {
      cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
      return;
   }

   struct cs_index tracebuf_addr = cs_reg64(b, scratch_regs.reg);
   struct cs_index data = cs_reg64(b, scratch_regs.reg + 2);

   cs_trace_preamble(b, ctx, scratch_regs,
                     sizeof(struct cs_run_compute_trace));

   /* cs_run_xx() must immediately follow cs_load_ip_to(), otherwise the IP
    * won't point to the right instruction. */
   cs_load_ip_to(b, data);
   cs_run_compute_indirect(b, wg_per_task, progress_inc, res_sel);
   cs_store64(b, data, tracebuf_addr, cs_trace_field_offset(run_compute, ip));

   for (unsigned i = 0; i < 32; i += 16)
      cs_store(b, cs_reg_tuple(b, i, 16), tracebuf_addr, BITFIELD_MASK(16),
               cs_trace_field_offset(run_compute, sr[i]));
   cs_store(b, cs_reg_tuple(b, 32, 8), tracebuf_addr, BITFIELD_MASK(8),
            cs_trace_field_offset(run_compute, sr[32]));
   cs_wait_slot(b, ctx->ls_sb_slot, false);
}
