/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bi_builder.h"
#include "va_compiler.h"
#include "valhall.h"
#include "valhall_enums.h"

/* This file contains the final passes of the compiler. Running after
 * scheduling and RA, the IR is now finalized, so we need to emit it to actual
 * bits on the wire (as well as fix up branches).
 */

/*
 * Unreachable for encoding failures, when hitting an invalid instruction.
 * Prints the (first) failing instruction to aid debugging.
 */
NORETURN static void PRINTFLIKE(2, 3)
invalid_instruction(const bi_instr *I, const char *cause, ...)
{
   fputs("\nInvalid ", stderr);

   va_list ap;
   va_start(ap, cause);
   vfprintf(stderr, cause, ap);
   va_end(ap);

   fputs(":\n\t", stderr);
   bi_print_instr(I, stderr);
   fprintf(stderr, "\n");

   unreachable("Invalid instruction");
}

/*
 * Like assert, but prints the instruction if the assertion fails to aid
 * debugging invalid inputs to the packing module.
 */
#define pack_assert(I, cond)                                                   \
   if (!(cond))                                                                \
      invalid_instruction(I, "invariant " #cond);

/*
 * Validate that two adjacent 32-bit sources form an aligned 64-bit register
 * pair. This is a compiler invariant, required on Valhall but not on Bifrost.
 */
static void
va_validate_register_pair(const bi_instr *I, unsigned s)
{
   ASSERTED bi_index lo = I->src[s], hi = I->src[s + 1];

   pack_assert(I, lo.type == hi.type);

   if (lo.type == BI_INDEX_REGISTER) {
      pack_assert(I, hi.value & 1);
      pack_assert(I, hi.value == lo.value + 1);
   } else if (lo.type == BI_INDEX_FAU && lo.value & BIR_FAU_IMMEDIATE) {
      /* Small constants are zero extended, so the top word encodes zero */
      pack_assert(I, hi.value == (BIR_FAU_IMMEDIATE | 0));
   } else {
      pack_assert(I, hi.offset & 1);
      pack_assert(I, hi.offset == lo.offset + 1);
   }
}

static unsigned
va_pack_reg(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_REGISTER);
   pack_assert(I, idx.value < 64);

   return idx.value;
}

static unsigned
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
{
   switch (fau) {
   case BIR_FAU_ATEST_PARAM:
      return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM;
   case BIR_FAU_TLS_PTR:
      return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER;
   case BIR_FAU_WLS_PTR:
      return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER;
   case BIR_FAU_LANE_ID:
      return VA_FAU_SPECIAL_PAGE_3_LANE_ID;
   case BIR_FAU_PROGRAM_COUNTER:
      return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER;
   case BIR_FAU_SAMPLE_POS_ARRAY:
      return VA_FAU_SPECIAL_PAGE_0_SAMPLE;

   case BIR_FAU_BLEND_0 ... (BIR_FAU_BLEND_0 + 7):
      return VA_FAU_SPECIAL_PAGE_0_BLEND_DESCRIPTOR_0 + (fau - BIR_FAU_BLEND_0);

   default:
      invalid_instruction(I, "FAU");
   }
}

/*
 * Encode a 64-bit FAU source. The offset is ignored, so this function can be
 * used to encode a 32-bit FAU source by or'ing in the appropriate offset.
 */
static unsigned
va_pack_fau_64(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_FAU);

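   /* As encoded below: immediates set 0b11 in bits 7:6 and uniforms set 0b10
    * in bits 7:6, each with the 5-bit FAU index in bits 5:1. Special FAU
    * values set 0b111 in bits 7:5 with the special selector shifted into the
    * low bits. Bit 0 is left clear for the 32-bit word offset. */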
   unsigned val = (idx.value & BITFIELD_MASK(5));

   if (idx.value & BIR_FAU_IMMEDIATE)
      return (0x3 << 6) | (val << 1);
   else if (idx.value & BIR_FAU_UNIFORM)
      return (0x2 << 6) | (val << 1);
   else
      return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
}

static unsigned
va_pack_src(const bi_instr *I, unsigned s)
{
   bi_index idx = I->src[s];

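   /* Register sources pack the register number in the low 6 bits, setting
    * bit 6 for sources marked discard. FAU sources reuse the 64-bit FAU
    * encoding and OR the word offset into bit 0. */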
   if (idx.type == BI_INDEX_REGISTER) {
      unsigned value = va_pack_reg(I, idx);
      if (idx.discard)
         value |= (1 << 6);
      return value;
   } else if (idx.type == BI_INDEX_FAU) {
      pack_assert(I, idx.offset <= 1);
      return va_pack_fau_64(I, idx) | idx.offset;
   }

   invalid_instruction(I, "type of source %u", s);
}

static unsigned
va_pack_wrmask(const bi_instr *I)
{
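   /* The destination swizzle selects which 16-bit halves are written: bit 0
    * of the mask enables the low half and bit 1 enables the high half. */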
   switch (I->dest[0].swizzle) {
   case BI_SWIZZLE_H00:
      return 0x1;
   case BI_SWIZZLE_H11:
      return 0x2;
   case BI_SWIZZLE_H01:
      return 0x3;
   default:
      invalid_instruction(I, "write mask");
   }
}

static enum va_atomic_operation
va_pack_atom_opc(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AADD:
      return VA_ATOMIC_OPERATION_AADD;
   case BI_ATOM_OPC_ASMIN:
      return VA_ATOMIC_OPERATION_ASMIN;
   case BI_ATOM_OPC_ASMAX:
      return VA_ATOMIC_OPERATION_ASMAX;
   case BI_ATOM_OPC_AUMIN:
      return VA_ATOMIC_OPERATION_AUMIN;
   case BI_ATOM_OPC_AUMAX:
      return VA_ATOMIC_OPERATION_AUMAX;
   case BI_ATOM_OPC_AAND:
      return VA_ATOMIC_OPERATION_AAND;
   case BI_ATOM_OPC_AOR:
      return VA_ATOMIC_OPERATION_AOR;
   case BI_ATOM_OPC_AXOR:
      return VA_ATOMIC_OPERATION_AXOR;
   case BI_ATOM_OPC_ACMPXCHG:
   case BI_ATOM_OPC_AXCHG:
      return VA_ATOMIC_OPERATION_AXCHG;
   default:
      invalid_instruction(I, "atomic opcode");
   }
}

static enum va_atomic_operation_with_1
va_pack_atom_opc_1(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AINC:
      return VA_ATOMIC_OPERATION_WITH_1_AINC;
   case BI_ATOM_OPC_ADEC:
      return VA_ATOMIC_OPERATION_WITH_1_ADEC;
   case BI_ATOM_OPC_AUMAX1:
      return VA_ATOMIC_OPERATION_WITH_1_AUMAX1;
   case BI_ATOM_OPC_ASMAX1:
      return VA_ATOMIC_OPERATION_WITH_1_ASMAX1;
   case BI_ATOM_OPC_AOR1:
      return VA_ATOMIC_OPERATION_WITH_1_AOR1;
   default:
      invalid_instruction(I, "atomic opcode with implicit 1");
   }
}

static unsigned
va_pack_dest(const bi_instr *I)
{
   assert(I->nr_dests);
   return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
}

static enum va_widen
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:
      return VA_WIDEN_NONE;
   case BI_SWIZZLE_H00:
      return VA_WIDEN_H0;
   case BI_SWIZZLE_H11:
      return VA_WIDEN_H1;
   default:
      invalid_instruction(I, "widen");
   }
}

static enum va_swizzles_16_bit
va_pack_swizzle_f16(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H00:
      return VA_SWIZZLES_16_BIT_H00;
   case BI_SWIZZLE_H10:
      return VA_SWIZZLES_16_BIT_H10;
   case BI_SWIZZLE_H01:
      return VA_SWIZZLES_16_BIT_H01;
   case BI_SWIZZLE_H11:
      return VA_SWIZZLES_16_BIT_H11;
   default:
      invalid_instruction(I, "16-bit swizzle");
   }
}

static unsigned
va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
{
   if (size == VA_SIZE_8) {
      switch (swz) {
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_8_BIT_B0123;
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_8_BIT_B0101;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_8_BIT_B2323;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_8_BIT_B0000;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_8_BIT_B1111;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_8_BIT_B2222;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_8_BIT_B3333;
      default:
         invalid_instruction(I, "8-bit widen");
      }
   } else if (size == VA_SIZE_16) {
      switch (swz) {
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_16_BIT_H00;
      case BI_SWIZZLE_H10:
         return VA_SWIZZLES_16_BIT_H10;
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_16_BIT_H01;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_16_BIT_H11;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_16_BIT_B00;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_16_BIT_B11;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_16_BIT_B22;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_16_BIT_B33;
      default:
         invalid_instruction(I, "16-bit widen");
      }
   } else if (size == VA_SIZE_32) {
      switch (swz) {
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_32_BIT_NONE;
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_32_BIT_H0;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_32_BIT_H1;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_32_BIT_B0;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_32_BIT_B1;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_32_BIT_B2;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_32_BIT_B3;
      default:
         invalid_instruction(I, "32-bit widen");
      }
   } else {
      invalid_instruction(I, "type size for widen");
   }
}

static enum va_half_swizzles_8_bit
va_pack_halfswizzle(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_B0000:
      return VA_HALF_SWIZZLES_8_BIT_B00;
   case BI_SWIZZLE_B1111:
      return VA_HALF_SWIZZLES_8_BIT_B11;
   case BI_SWIZZLE_B2222:
      return VA_HALF_SWIZZLES_8_BIT_B22;
   case BI_SWIZZLE_B3333:
      return VA_HALF_SWIZZLES_8_BIT_B33;
   case BI_SWIZZLE_B0011:
      return VA_HALF_SWIZZLES_8_BIT_B01;
   case BI_SWIZZLE_B2233:
      return VA_HALF_SWIZZLES_8_BIT_B23;
   case BI_SWIZZLE_B0022:
      return VA_HALF_SWIZZLES_8_BIT_B02;
   default:
      invalid_instruction(I, "v2u8 swizzle");
   }
}

static enum va_lanes_8_bit
va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:
      return VA_LANES_8_BIT_B02;
   case BI_SWIZZLE_B0000:
      return VA_LANES_8_BIT_B00;
   case BI_SWIZZLE_B1111:
      return VA_LANES_8_BIT_B11;
   case BI_SWIZZLE_B2222:
      return VA_LANES_8_BIT_B22;
   case BI_SWIZZLE_B3333:
      return VA_LANES_8_BIT_B33;
   default:
      invalid_instruction(I, "lane shift");
   }
}

static enum va_combine
va_pack_combine(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:
      return VA_COMBINE_NONE;
   case BI_SWIZZLE_H00:
      return VA_COMBINE_H0;
   case BI_SWIZZLE_H11:
      return VA_COMBINE_H1;
   default:
      invalid_instruction(I, "branch lane");
   }
}

static enum va_source_format
va_pack_source_format(const bi_instr *I)
{
   switch (I->source_format) {
   case BI_SOURCE_FORMAT_FLAT32:
      return VA_SOURCE_FORMAT_SRC_FLAT32;
   case BI_SOURCE_FORMAT_FLAT16:
      return VA_SOURCE_FORMAT_SRC_FLAT16;
   case BI_SOURCE_FORMAT_F32:
      return VA_SOURCE_FORMAT_SRC_F32;
   case BI_SOURCE_FORMAT_F16:
      return VA_SOURCE_FORMAT_SRC_F16;
   }

   invalid_instruction(I, "source format");
}

static uint64_t
va_pack_rhadd(const bi_instr *I)
{
   switch (I->round) {
   case BI_ROUND_RTN:
      return 0; /* hadd */
   case BI_ROUND_RTP:
      return BITFIELD_BIT(30); /* rhadd */
   default:
      unreachable("Invalid round for HADD");
   }
}

static uint64_t
va_pack_alu(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];
   uint64_t hex = 0;

   switch (I->op) {
   /* Add FREXP flags */
   case BI_OPCODE_FREXPE_F32:
   case BI_OPCODE_FREXPE_V2F16:
   case BI_OPCODE_FREXPM_F32:
   case BI_OPCODE_FREXPM_V2F16:
      if (I->sqrt)
         hex |= 1ull << 24;
      if (I->log)
         hex |= 1ull << 25;
      break;

   case BI_OPCODE_FLUSH_F32:
   case BI_OPCODE_FLUSH_V2F16:
      hex |= I->nan_mode << 8;
      if (I->ftz)
         hex |= 1ull << 10;
      if (I->flush_inf)
         hex |= 1ull << 11;
      break;

   /* Add mux type */
   case BI_OPCODE_MUX_I32:
   case BI_OPCODE_MUX_V2I16:
   case BI_OPCODE_MUX_V4I8:
      hex |= (uint64_t)I->mux << 32;
      break;

   /* Add .eq flag */
   case BI_OPCODE_BRANCHZ_I16:
   case BI_OPCODE_BRANCHZI:
      pack_assert(I, I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE);

      if (I->cmpf == BI_CMPF_EQ)
         hex |= (1ull << 36);

      if (I->op == BI_OPCODE_BRANCHZI)
         hex |= (0x1ull << 40); /* Absolute */
      else
         hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8;

      break;

   /* Add arithmetic flag */
   case BI_OPCODE_RSHIFT_AND_I32:
   case BI_OPCODE_RSHIFT_AND_V2I16:
   case BI_OPCODE_RSHIFT_AND_V4I8:
   case BI_OPCODE_RSHIFT_OR_I32:
   case BI_OPCODE_RSHIFT_OR_V2I16:
   case BI_OPCODE_RSHIFT_OR_V4I8:
   case BI_OPCODE_RSHIFT_XOR_I32:
   case BI_OPCODE_RSHIFT_XOR_V2I16:
   case BI_OPCODE_RSHIFT_XOR_V4I8:
      hex |= (uint64_t)I->arithmetic << 34;
      break;

   case BI_OPCODE_LEA_BUF_IMM:
      /* Buffer table index */
      hex |= 0xD << 8;
      break;

   case BI_OPCODE_LEA_ATTR_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->attribute_index) << 20;
      break;

   case BI_OPCODE_IADD_IMM_I32:
   case BI_OPCODE_IADD_IMM_V2I16:
   case BI_OPCODE_IADD_IMM_V4I8:
   case BI_OPCODE_FADD_IMM_F32:
   case BI_OPCODE_FADD_IMM_V2F16:
      hex |= ((uint64_t)I->index) << 8;
      break;

   case BI_OPCODE_CLPER_I32:
      hex |= ((uint64_t)I->inactive_result) << 22;
      hex |= ((uint64_t)I->lane_op) << 32;
      hex |= ((uint64_t)I->subgroup) << 36;
      break;

   case BI_OPCODE_LD_VAR:
   case BI_OPCODE_LD_VAR_FLAT:
   case BI_OPCODE_LD_VAR_IMM:
   case BI_OPCODE_LD_VAR_FLAT_IMM:
   case BI_OPCODE_LD_VAR_BUF_F16:
   case BI_OPCODE_LD_VAR_BUF_F32:
   case BI_OPCODE_LD_VAR_BUF_IMM_F16:
   case BI_OPCODE_LD_VAR_BUF_IMM_F32:
   case BI_OPCODE_LD_VAR_SPECIAL:
      if (I->op == BI_OPCODE_LD_VAR_SPECIAL)
         hex |= ((uint64_t)I->varying_name) << 12; /* instead of index */
      else if (I->op == BI_OPCODE_LD_VAR_BUF_IMM_F16 ||
               I->op == BI_OPCODE_LD_VAR_BUF_IMM_F32) {
         hex |= ((uint64_t)I->index) << 16;
      } else if (I->op == BI_OPCODE_LD_VAR_IMM ||
                 I->op == BI_OPCODE_LD_VAR_FLAT_IMM) {
         hex |= ((uint64_t)I->table) << 8;
         hex |= ((uint64_t)I->index) << 12;
      }

      hex |= ((uint64_t)va_pack_source_format(I)) << 24;
      hex |= ((uint64_t)I->update) << 36;
      hex |= ((uint64_t)I->sample) << 38;
      break;

   case BI_OPCODE_LD_ATTR_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->attribute_index) << 20;
      break;

   case BI_OPCODE_LD_TEX_IMM:
   case BI_OPCODE_LEA_TEX_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->texture_index) << 20;
      break;

   case BI_OPCODE_ZS_EMIT:
      if (I->stencil)
         hex |= (1 << 24);
      if (I->z)
         hex |= (1 << 25);
      break;

   default:
      break;
   }

   /* FMA_RSCALE.f32 special modes treated as extra opcodes */
   if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
      pack_assert(I, I->special < 4);
      hex |= ((uint64_t)I->special) << 48;
   }

   /* Add the normal destination or a placeholder. Staging destinations are
    * added elsewhere, as they require special handling for control fields.
    */
   if (info.has_dest && info.nr_staging_dests == 0) {
      hex |= (uint64_t)va_pack_dest(I) << 40;
   } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
      pack_assert(I, I->nr_dests == 0);
      hex |= 0xC0ull << 40; /* Placeholder */
   }

   bool swap12 = va_swap_12(I->op);
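   /* Some opcodes encode their second and third sources in swapped order;
    * for those, the loop below permutes the logical source index. */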

   /* First src is staging if we read, skip it when packing sources */
   unsigned src_offset = bi_opcode_props[I->op].sr_read ? 1 : 0;

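   /* Per-source modifier fields (negate/absolute value and swizzles) are
    * packed at offsets counted down from the last source slot, hence the
    * (2 - i) terms below. */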
   for (unsigned i = 0; i < info.nr_srcs; ++i) {
      unsigned logical_i = (swap12 && i == 1) ? 2 : (swap12 && i == 2) ? 1 : i;

      struct va_src_info src_info = info.srcs[i];
      enum va_size size = src_info.size;

      bi_index src = I->src[logical_i + src_offset];
      hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);

      if (src_info.notted) {
         if (src.neg)
            hex |= (1ull << 35);
      } else if (src_info.absneg) {
         unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
         unsigned abs_offs = 33 + 2 + ((2 - i) * 2);

         if (src.neg)
            hex |= 1ull << neg_offs;
         if (src.abs)
            hex |= 1ull << abs_offs;
      } else {
         if (src.neg)
            invalid_instruction(I, "negate");
         if (src.abs)
            invalid_instruction(I, "absolute value");
      }

      if (src_info.swizzle) {
         unsigned offs = 24 + ((2 - i) * 2);
         unsigned S = src.swizzle;
         pack_assert(I, size == VA_SIZE_16 || size == VA_SIZE_32);

         uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S)
                                          : va_pack_swizzle_f16(I, S));
         hex |= v << offs;
      } else if (src_info.widen) {
         unsigned offs = (i == 1) ? 26 : 36;
         hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs;
      } else if (src_info.lane) {
         unsigned offs =
            (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36) : 28;

         if (src_info.size == VA_SIZE_16) {
            hex |= (src.swizzle == BI_SWIZZLE_H11 ? 1 : 0) << offs;
         } else if (I->op == BI_OPCODE_BRANCHZ_I16) {
            hex |= ((uint64_t)va_pack_combine(I, src.swizzle) << 37);
         } else {
            pack_assert(I, src_info.size == VA_SIZE_8);
            unsigned comp = src.swizzle - BI_SWIZZLE_B0000;
            pack_assert(I, comp < 4);
            hex |= (uint64_t)comp << offs;
         }
      } else if (src_info.lanes) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 1);
         hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
      } else if (src_info.combine) {
         /* Treat as swizzle, subgroup ops not yet supported */
         pack_assert(I, src_info.size == VA_SIZE_32);
         pack_assert(I, i == 0);
         hex |= (uint64_t)va_pack_widen_f32(I, src.swizzle) << 37;
      } else if (src_info.halfswizzle) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 0);
         hex |= (uint64_t)va_pack_halfswizzle(I, src.swizzle) << 36;
      } else if (src.swizzle != BI_SWIZZLE_H01) {
         invalid_instruction(I, "swizzle");
      }
   }

   if (info.saturate)
      hex |= (uint64_t)I->saturate << 30;
   if (info.rhadd)
      hex |= va_pack_rhadd(I);
   if (info.clamp)
      hex |= (uint64_t)I->clamp << 32;
   if (info.round_mode)
      hex |= (uint64_t)I->round << 30;
   if (info.condition)
      hex |= (uint64_t)I->cmpf << 32;
   if (info.result_type)
      hex |= (uint64_t)I->result_type << 30;

   return hex;
}

static uint64_t
va_pack_byte_offset(const bi_instr *I)
{
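   /* The byte offset must round-trip through int16_t, i.e. it must fit in a
    * signed 16-bit immediate. */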
   int16_t offset = I->byte_offset;
   if (offset != I->byte_offset)
      invalid_instruction(I, "byte offset");

   uint16_t offset_as_u16 = offset;
   return ((uint64_t)offset_as_u16) << 8;
}

static uint64_t
va_pack_byte_offset_8(const bi_instr *I)
{
   uint8_t offset = I->byte_offset;
   if (offset != I->byte_offset)
      invalid_instruction(I, "byte offset");

   return ((uint64_t)offset) << 8;
}

static uint64_t
va_pack_load(const bi_instr *I, bool buffer_descriptor)
{
   const uint8_t load_lane_identity[8] = {
      VA_LOAD_LANE_8_BIT_B0,        VA_LOAD_LANE_16_BIT_H0,
      VA_LOAD_LANE_24_BIT_IDENTITY, VA_LOAD_LANE_32_BIT_W0,
      VA_LOAD_LANE_48_BIT_IDENTITY, VA_LOAD_LANE_64_BIT_IDENTITY,
      VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY,
   };

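   /* The access size is encoded in the opcode itself (3 bits starting at bit
    * 27 of the exact encoding); use it to select the identity lane above. */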
   unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
   uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36;

   // unsigned
   hex |= (1ull << 39);

   if (!buffer_descriptor)
      hex |= va_pack_byte_offset(I);

   hex |= (uint64_t)va_pack_src(I, 0) << 0;

   if (buffer_descriptor)
      hex |= (uint64_t)va_pack_src(I, 1) << 8;

   return hex;
}

static uint64_t
va_pack_memory_access(const bi_instr *I)
{
   switch (I->seg) {
   case BI_SEG_TL:
      return VA_MEMORY_ACCESS_FORCE;
   case BI_SEG_POS:
      return VA_MEMORY_ACCESS_ISTREAM;
   case BI_SEG_VARY:
      return VA_MEMORY_ACCESS_ESTREAM;
   default:
      return VA_MEMORY_ACCESS_NONE;
   }
}

static uint64_t
va_pack_store(const bi_instr *I)
{
   uint64_t hex = va_pack_memory_access(I) << 24;

   va_validate_register_pair(I, 1);
   hex |= (uint64_t)va_pack_src(I, 1) << 0;

   hex |= va_pack_byte_offset(I);

   return hex;
}

static enum va_lod_mode
va_pack_lod_mode(const bi_instr *I)
{
   switch (I->va_lod_mode) {
   case BI_VA_LOD_MODE_ZERO_LOD:
      return VA_LOD_MODE_ZERO;
   case BI_VA_LOD_MODE_COMPUTED_LOD:
      return VA_LOD_MODE_COMPUTED;
   case BI_VA_LOD_MODE_EXPLICIT:
      return VA_LOD_MODE_EXPLICIT;
   case BI_VA_LOD_MODE_COMPUTED_BIAS:
      return VA_LOD_MODE_COMPUTED_BIAS;
   case BI_VA_LOD_MODE_GRDESC:
      return VA_LOD_MODE_GRDESC;
   }

   invalid_instruction(I, "LOD mode");
}

static enum va_register_type
va_pack_register_type(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_F16:
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_TYPE_F;

   case BI_REGISTER_FORMAT_U16:
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_TYPE_U;

   case BI_REGISTER_FORMAT_S16:
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_TYPE_S;

   default:
      invalid_instruction(I, "register type");
   }
}

static enum va_register_format
va_pack_register_format(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_AUTO:
      return VA_REGISTER_FORMAT_AUTO;
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_FORMAT_F32;
   case BI_REGISTER_FORMAT_F16:
      return VA_REGISTER_FORMAT_F16;
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_FORMAT_S32;
   case BI_REGISTER_FORMAT_S16:
      return VA_REGISTER_FORMAT_S16;
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_FORMAT_U32;
   case BI_REGISTER_FORMAT_U16:
      return VA_REGISTER_FORMAT_U16;
   default:
      invalid_instruction(I, "register format");
   }
}

uint64_t
va_pack_instr(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];

   uint64_t hex = info.exact | (((uint64_t)I->flow) << 59);
   hex |= ((uint64_t)va_select_fau_page(I)) << 57;
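   /* Metadata common to every instruction: the flow control field starts at
    * bit 59 and the FAU page selector at bit 57. */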

   if (info.slot)
      hex |= ((uint64_t)I->slot << 30);

   if (info.sr_count) {
      bool read = bi_opcode_props[I->op].sr_read;
      bi_index sr = read ? I->src[0] : I->dest[0];

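      /* Derive the staging register count by counting the registers read
       * from (or written to) the staging slot in the IR. */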
      unsigned count =
         read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0);

      hex |= ((uint64_t)count << 33);
      hex |= (uint64_t)va_pack_reg(I, sr) << 40;
      hex |= ((uint64_t)info.sr_control << 46);
   }

   if (info.sr_write_count) {
      hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36;
      hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
   }

   if (info.vecsize)
      hex |= ((uint64_t)I->vecsize << 28);

   if (info.register_format)
      hex |= ((uint64_t)va_pack_register_format(I)) << 24;

   switch (I->op) {
   case BI_OPCODE_LOAD_I8:
   case BI_OPCODE_LOAD_I16:
   case BI_OPCODE_LOAD_I24:
   case BI_OPCODE_LOAD_I32:
   case BI_OPCODE_LOAD_I48:
   case BI_OPCODE_LOAD_I64:
   case BI_OPCODE_LOAD_I96:
   case BI_OPCODE_LOAD_I128:
      hex |= va_pack_load(I, false);
      break;

   case BI_OPCODE_LD_BUFFER_I8:
   case BI_OPCODE_LD_BUFFER_I16:
   case BI_OPCODE_LD_BUFFER_I24:
   case BI_OPCODE_LD_BUFFER_I32:
   case BI_OPCODE_LD_BUFFER_I48:
   case BI_OPCODE_LD_BUFFER_I64:
   case BI_OPCODE_LD_BUFFER_I96:
   case BI_OPCODE_LD_BUFFER_I128:
      hex |= va_pack_load(I, true);
      break;

   case BI_OPCODE_STORE_I8:
   case BI_OPCODE_STORE_I16:
   case BI_OPCODE_STORE_I24:
   case BI_OPCODE_STORE_I32:
   case BI_OPCODE_STORE_I48:
   case BI_OPCODE_STORE_I64:
   case BI_OPCODE_STORE_I96:
   case BI_OPCODE_STORE_I128:
      hex |= va_pack_store(I);
      break;

   case BI_OPCODE_ATOM1_RETURN_I32:
      /* Permit omitting the destination for plain ATOM1 */
      if (!bi_count_write_registers(I, 0)) {
         hex |= (0x40ull << 40); // fake read
      }

      /* 64-bit source */
      va_validate_register_pair(I, 0);
      hex |= (uint64_t)va_pack_src(I, 0) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
      break;

   case BI_OPCODE_ATOM_I32:
   case BI_OPCODE_ATOM_RETURN_I32:
      /* 64-bit source */
      va_validate_register_pair(I, 1);
      hex |= (uint64_t)va_pack_src(I, 1) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;

      if (I->op == BI_OPCODE_ATOM_RETURN_I32)
         hex |= (0xc0ull << 40); // flags

      if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
         hex |= (1 << 26); /* .compare */

      break;

   case BI_OPCODE_ST_CVT:
      /* Staging read */
      hex |= va_pack_store(I);

      /* Conversion descriptor */
      hex |= (uint64_t)va_pack_src(I, 3) << 16;
      break;

   case BI_OPCODE_BLEND: {
      /* Source 0 - Blend descriptor (64-bit) */
      hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
      va_validate_register_pair(I, 2);

      /* Target */
      if (I->branch_offset & 0x7)
         invalid_instruction(I, "unaligned branch");
      hex |= ((I->branch_offset >> 3) << 8);

      /* Source 2 - coverage mask */
      hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;

      /* Vector size */
      unsigned vecsize = 4;
      hex |= ((uint64_t)(vecsize - 1) << 28);

      break;
   }

   case BI_OPCODE_TEX_GRADIENT:
   case BI_OPCODE_TEX_SINGLE:
   case BI_OPCODE_TEX_FETCH:
   case BI_OPCODE_TEX_GATHER: {
      /* Image to read from */
      hex |= ((uint64_t)va_pack_src(I, 1)) << 0;

      if ((I->op == BI_OPCODE_TEX_FETCH || I->op == BI_OPCODE_TEX_GRADIENT) &&
          I->shadow)
         invalid_instruction(I, "texture instruction does not support .shadow");

      if (I->wide_indices)
         hex |= (1ull << 8);
      if (I->array_enable)
         hex |= (1ull << 10);
      if (I->texel_offset)
         hex |= (1ull << 11);
      if (I->shadow)
         hex |= (1ull << 12);
      if (I->skip)
         hex |= (1ull << 39);
      if (!bi_is_regfmt_16(I->register_format))
         hex |= (1ull << 46);

      if (I->op == BI_OPCODE_TEX_GRADIENT) {
         if (I->force_delta_enable)
            hex |= (1ull << 12);
         if (I->lod_bias_disable)
            hex |= (1ull << 13);
         if (I->lod_clamp_disable)
            hex |= (1ull << 14);
         if (I->derivative_enable)
            hex |= (1ull << 15);
      }

      if (I->op == BI_OPCODE_TEX_SINGLE)
         hex |= ((uint64_t)va_pack_lod_mode(I)) << 13;

      if (I->op == BI_OPCODE_TEX_GATHER) {
         if (I->integer_coordinates)
            hex |= (1 << 13);
         hex |= ((uint64_t)I->fetch_component) << 14;
      }

      hex |= (I->write_mask << 22);
      hex |= ((uint64_t)I->dimension) << 28;

      break;
   }

   default:
      if (!info.exact && I->op != BI_OPCODE_NOP)
         invalid_instruction(I, "opcode");

      hex |= va_pack_alu(I);
      break;
   }

   return hex;
}

static unsigned
va_instructions_in_block(bi_block *block)
{
   unsigned offset = 0;

   bi_foreach_instr_in_block(block, _) {
      offset++;
   }

   return offset;
}

/* Calculate branch_offset from a branch_target for a direct relative branch */

static void
va_lower_branch_target(bi_context *ctx, bi_block *start, bi_instr *I)
{
   /* Precondition: unlowered relative branch */
   bi_block *target = I->branch_target;
   assert(target != NULL);

   /* Signed since we might jump backwards */
   signed offset = 0;

   /* Determine if the target block is strictly greater in source order */
   bool forwards = target->index > start->index;

   if (forwards) {
      /* We have to jump through this block */
      bi_foreach_instr_in_block_from(start, _, I) {
         offset++;
      }

      /* We then need to jump over every following block until the target */
      bi_foreach_block_from(ctx, start, blk) {
         /* End just before the target */
         if (blk == target)
            break;

         /* Count other blocks */
         if (blk != start)
            offset += va_instructions_in_block(blk);
      }
   } else {
      /* Jump through the beginning of this block */
      bi_foreach_instr_in_block_from_rev(start, ins, I) {
         if (ins != I)
            offset--;
      }

      /* Jump over preceding blocks up to and including the target to get to
       * the beginning of the target */
      bi_foreach_block_from_rev(ctx, start, blk) {
         if (blk == start)
            continue;

         offset -= va_instructions_in_block(blk);

         /* End just after the target */
         if (blk == target)
            break;
      }
   }

   /* Offset is relative to the next instruction, so bias */
   offset--;

   /* Update the instruction */
   I->branch_offset = offset;
}

/*
 * Late lowering to insert blend shader calls after BLEND instructions. Required
 * to support blend shaders, so this pass may be omitted if it is known that
 * blend shaders are never used.
 *
 * This lowering runs late because it introduces control flow changes without
 * modifying the control flow graph. It hardcodes registers, meaning running
 * after RA makes sense. Finally, it hardcodes a manually sized instruction
 * sequence, requiring it to run after scheduling.
 *
 * As it is Valhall specific, running it as a pre-pack lowering is sensible.
 */
static void
va_lower_blend(bi_context *ctx)
{
   /* Program counter for *next* instruction */
   bi_index pc = bi_fau(BIR_FAU_PROGRAM_COUNTER, false);

   bi_foreach_instr_global_safe(ctx, I) {
      if (I->op != BI_OPCODE_BLEND)
         continue;

      bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

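      /* Byte size of the two-instruction call sequence built below (each
       * instruction packs to one 64-bit word): the return address setup and
       * the branch to the blend shader. */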
      unsigned prolog_length = 2 * 8;

      /* By ABI, r48 is the link register shared with blend shaders */
      assert(bi_is_equiv(I->dest[0], bi_register(48)));

      if (I->flow == VA_FLOW_END)
         bi_iadd_imm_i32_to(&b, I->dest[0], va_zero_lut(), 0);
      else
         bi_iadd_imm_i32_to(&b, I->dest[0], pc, prolog_length - 8);

      bi_branchzi(&b, va_zero_lut(), I->src[3], BI_CMPF_EQ);

      /* For fixed function: skip the prologue, or return */
      if (I->flow != VA_FLOW_END)
         I->branch_offset = prolog_length;
   }
}

void
bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission)
{
   unsigned orig_size = emission->size;

   va_validate(stderr, ctx);

   /* Late lowering */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
      va_lower_blend(ctx);

   bi_foreach_block(ctx, block) {
      bi_foreach_instr_in_block(block, I) {
         if (I->op == BI_OPCODE_BRANCHZ_I16)
            va_lower_branch_target(ctx, block, I);

         uint64_t hex = va_pack_instr(I);
         util_dynarray_append(emission, uint64_t, hex);
      }
   }

   /* Pad with zeroes, but keep empty programs empty so they may be omitted
    * altogether. Failing to do this would result in a program containing only
    * zeroes, which is invalid and will raise an encoding fault.
    *
    * Pad an extra 16 bytes (one instruction) to separate primary and secondary
    * shader disassemblies. This is not strictly necessary, but it's a good
    * practice. 128 bytes is the optimal program alignment on Trym, so pad
    * secondary shaders up to 128 bytes. This may help the instruction cache.
    */
   if (orig_size != emission->size) {
      unsigned aligned = ALIGN_POT(emission->size + 16, 128);
      unsigned count = aligned - emission->size;

      memset(util_dynarray_grow(emission, uint8_t, count), 0, count);
   }
}