/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bi_builder.h"
#include "va_compiler.h"
#include "valhall.h"
#include "valhall_enums.h"

/* This file contains the final passes of the compiler. Running after
 * scheduling and RA, the IR is now finalized, so we need to emit it to actual
 * bits on the wire (as well as fix up branches).
 */

/*
 * Unreachable path for encoding failures, hit when an invalid instruction
 * reaches packing. Prints the (first) failing instruction to aid debugging.
 */
NORETURN static void PRINTFLIKE(2, 3)
invalid_instruction(const bi_instr *I, const char *cause, ...)
{
   fputs("\nInvalid ", stderr);

   va_list ap;
   va_start(ap, cause);
   vfprintf(stderr, cause, ap);
   va_end(ap);

   fputs(":\n\t", stderr);
   bi_print_instr(I, stderr);
   fprintf(stderr, "\n");

   unreachable("Invalid instruction");
}

/*
 * Like assert, but prints the instruction if the assertion fails to aid
 * debugging invalid inputs to the packing module.
 */
#define pack_assert(I, cond)                                                   \
   if (!(cond))                                                                \
      invalid_instruction(I, "invariant " #cond);
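
/* Usage example: pack_assert(I, idx.value < 64) calls invalid_instruction()
 * with a dump of I if the register index is out of range, as in va_pack_reg()
 * below.
 */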

/*
 * Validate that two adjacent 32-bit sources form an aligned 64-bit register
 * pair. This is a compiler invariant, required on Valhall but not on Bifrost.
 */
static void
va_validate_register_pair(const bi_instr *I, unsigned s)
{
   ASSERTED bi_index lo = I->src[s], hi = I->src[s + 1];

   pack_assert(I, lo.type == hi.type);

   if (lo.type == BI_INDEX_REGISTER) {
      pack_assert(I, hi.value & 1);
      pack_assert(I, hi.value == lo.value + 1);
   } else if (lo.type == BI_INDEX_FAU && lo.value & BIR_FAU_IMMEDIATE) {
      /* Small constants are zero extended, so the top word encodes zero */
      pack_assert(I, hi.value == (BIR_FAU_IMMEDIATE | 0));
   } else {
      pack_assert(I, hi.offset & 1);
      pack_assert(I, hi.offset == lo.offset + 1);
   }
}

static unsigned
va_pack_reg(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_REGISTER);
   pack_assert(I, idx.value < 64);

   return idx.value;
}

static unsigned
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
{
   switch (fau) {
   case BIR_FAU_ATEST_PARAM:
      return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM;
   case BIR_FAU_TLS_PTR:
      return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER;
   case BIR_FAU_WLS_PTR:
      return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER;
   case BIR_FAU_LANE_ID:
      return VA_FAU_SPECIAL_PAGE_3_LANE_ID;
   case BIR_FAU_PROGRAM_COUNTER:
      return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER;
   case BIR_FAU_SAMPLE_POS_ARRAY:
      return VA_FAU_SPECIAL_PAGE_0_SAMPLE;

   case BIR_FAU_BLEND_0 ... (BIR_FAU_BLEND_0 + 7):
      return VA_FAU_SPECIAL_PAGE_0_BLEND_DESCRIPTOR_0 + (fau - BIR_FAU_BLEND_0);

   default:
      invalid_instruction(I, "FAU");
   }
}

/*
 * Encode a 64-bit FAU source. The offset is ignored, so this function can be
 * used to encode a 32-bit FAU source by or'ing in the appropriate offset.
 */
static unsigned
va_pack_fau_64(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_FAU);

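   /* Bit 0 (OR'd in later by va_pack_src) selects the 32-bit half, bits 1..5
    * carry the slot value, and the top bits distinguish the immediate,
    * uniform and special FAU pages.
    */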
   unsigned val = (idx.value & BITFIELD_MASK(5));

   if (idx.value & BIR_FAU_IMMEDIATE)
      return (0x3 << 6) | (val << 1);
   else if (idx.value & BIR_FAU_UNIFORM)
      return (0x2 << 6) | (val << 1);
   else
      return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
}

static unsigned
va_pack_src(const bi_instr *I, unsigned s)
{
   bi_index idx = I->src[s];

   if (idx.type == BI_INDEX_REGISTER) {
      unsigned value = va_pack_reg(I, idx);
      if (idx.discard)
         value |= (1 << 6);
      return value;
   } else if (idx.type == BI_INDEX_FAU) {
      pack_assert(I, idx.offset <= 1);
      return va_pack_fau_64(I, idx) | idx.offset;
   }

   invalid_instruction(I, "type of source %u", s);
}

static unsigned
va_pack_wrmask(const bi_instr *I)
{
   switch (I->dest[0].swizzle) {
   case BI_SWIZZLE_H00:
      return 0x1;
   case BI_SWIZZLE_H11:
      return 0x2;
   case BI_SWIZZLE_H01:
      return 0x3;
   default:
      invalid_instruction(I, "write mask");
   }
}

static enum va_atomic_operation
va_pack_atom_opc(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AADD:
      return VA_ATOMIC_OPERATION_AADD;
   case BI_ATOM_OPC_ASMIN:
      return VA_ATOMIC_OPERATION_ASMIN;
   case BI_ATOM_OPC_ASMAX:
      return VA_ATOMIC_OPERATION_ASMAX;
   case BI_ATOM_OPC_AUMIN:
      return VA_ATOMIC_OPERATION_AUMIN;
   case BI_ATOM_OPC_AUMAX:
      return VA_ATOMIC_OPERATION_AUMAX;
   case BI_ATOM_OPC_AAND:
      return VA_ATOMIC_OPERATION_AAND;
   case BI_ATOM_OPC_AOR:
      return VA_ATOMIC_OPERATION_AOR;
   case BI_ATOM_OPC_AXOR:
      return VA_ATOMIC_OPERATION_AXOR;
   case BI_ATOM_OPC_ACMPXCHG:
   case BI_ATOM_OPC_AXCHG:
      return VA_ATOMIC_OPERATION_AXCHG;
   default:
      invalid_instruction(I, "atomic opcode");
   }
}

static enum va_atomic_operation_with_1
va_pack_atom_opc_1(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AINC:
      return VA_ATOMIC_OPERATION_WITH_1_AINC;
   case BI_ATOM_OPC_ADEC:
      return VA_ATOMIC_OPERATION_WITH_1_ADEC;
   case BI_ATOM_OPC_AUMAX1:
      return VA_ATOMIC_OPERATION_WITH_1_AUMAX1;
   case BI_ATOM_OPC_ASMAX1:
      return VA_ATOMIC_OPERATION_WITH_1_ASMAX1;
   case BI_ATOM_OPC_AOR1:
      return VA_ATOMIC_OPERATION_WITH_1_AOR1;
   default:
      invalid_instruction(I, "atomic opcode with implicit 1");
   }
}

static unsigned
va_pack_dest(const bi_instr *I)
{
   assert(I->nr_dests);
   return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
}

static enum va_widen
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:
      return VA_WIDEN_NONE;
   case BI_SWIZZLE_H00:
      return VA_WIDEN_H0;
   case BI_SWIZZLE_H11:
      return VA_WIDEN_H1;
   default:
      invalid_instruction(I, "widen");
   }
}

static enum va_swizzles_16_bit
va_pack_swizzle_f16(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H00:
      return VA_SWIZZLES_16_BIT_H00;
   case BI_SWIZZLE_H10:
      return VA_SWIZZLES_16_BIT_H10;
   case BI_SWIZZLE_H01:
      return VA_SWIZZLES_16_BIT_H01;
   case BI_SWIZZLE_H11:
      return VA_SWIZZLES_16_BIT_H11;
   default:
      invalid_instruction(I, "16-bit swizzle");
   }
}

static unsigned
va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
{
   if (size == VA_SIZE_8) {
      switch (swz) {
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_8_BIT_B0123;
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_8_BIT_B0101;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_8_BIT_B2323;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_8_BIT_B0000;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_8_BIT_B1111;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_8_BIT_B2222;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_8_BIT_B3333;
      default:
         invalid_instruction(I, "8-bit widen");
      }
   } else if (size == VA_SIZE_16) {
      switch (swz) {
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_16_BIT_H00;
      case BI_SWIZZLE_H10:
         return VA_SWIZZLES_16_BIT_H10;
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_16_BIT_H01;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_16_BIT_H11;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_16_BIT_B00;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_16_BIT_B11;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_16_BIT_B22;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_16_BIT_B33;
      default:
         invalid_instruction(I, "16-bit widen");
      }
   } else if (size == VA_SIZE_32) {
      switch (swz) {
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_32_BIT_NONE;
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_32_BIT_H0;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_32_BIT_H1;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_32_BIT_B0;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_32_BIT_B1;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_32_BIT_B2;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_32_BIT_B3;
      default:
         invalid_instruction(I, "32-bit widen");
      }
   } else {
      invalid_instruction(I, "type size for widen");
   }
}

static enum va_half_swizzles_8_bit
va_pack_halfswizzle(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_B0000:
      return VA_HALF_SWIZZLES_8_BIT_B00;
   case BI_SWIZZLE_B1111:
      return VA_HALF_SWIZZLES_8_BIT_B11;
   case BI_SWIZZLE_B2222:
      return VA_HALF_SWIZZLES_8_BIT_B22;
   case BI_SWIZZLE_B3333:
      return VA_HALF_SWIZZLES_8_BIT_B33;
   case BI_SWIZZLE_B0011:
      return VA_HALF_SWIZZLES_8_BIT_B01;
   case BI_SWIZZLE_B2233:
      return VA_HALF_SWIZZLES_8_BIT_B23;
   case BI_SWIZZLE_B0022:
      return VA_HALF_SWIZZLES_8_BIT_B02;
   default:
      invalid_instruction(I, "v2u8 swizzle");
   }
}

static enum va_lanes_8_bit
va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_B0000:
      return VA_LANES_8_BIT_B00;
   case BI_SWIZZLE_B1111:
      return VA_LANES_8_BIT_B11;
   case BI_SWIZZLE_B2222:
      return VA_LANES_8_BIT_B22;
   case BI_SWIZZLE_B3333:
      return VA_LANES_8_BIT_B33;
   default:
      invalid_instruction(I, "lane shift");
   }
}

static enum va_combine
va_pack_combine(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:
      return VA_COMBINE_NONE;
   case BI_SWIZZLE_H00:
      return VA_COMBINE_H0;
   case BI_SWIZZLE_H11:
      return VA_COMBINE_H1;
   default:
      invalid_instruction(I, "branch lane");
   }
}

static enum va_source_format
va_pack_source_format(const bi_instr *I)
{
   switch (I->source_format) {
   case BI_SOURCE_FORMAT_FLAT32:
      return VA_SOURCE_FORMAT_SRC_FLAT32;
   case BI_SOURCE_FORMAT_FLAT16:
      return VA_SOURCE_FORMAT_SRC_FLAT16;
   case BI_SOURCE_FORMAT_F32:
      return VA_SOURCE_FORMAT_SRC_F32;
   case BI_SOURCE_FORMAT_F16:
      return VA_SOURCE_FORMAT_SRC_F16;
   }

   invalid_instruction(I, "source format");
}

static uint64_t
va_pack_rhadd(const bi_instr *I)
{
   switch (I->round) {
   case BI_ROUND_RTN:
      return 0; /* hadd */
   case BI_ROUND_RTP:
      return BITFIELD_BIT(30); /* rhadd */
   default:
      unreachable("Invalid round for HADD");
   }
}

static uint64_t
va_pack_alu(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];
   uint64_t hex = 0;

   switch (I->op) {
   /* Add FREXP flags */
   case BI_OPCODE_FREXPE_F32:
   case BI_OPCODE_FREXPE_V2F16:
   case BI_OPCODE_FREXPM_F32:
   case BI_OPCODE_FREXPM_V2F16:
      if (I->sqrt)
         hex |= 1ull << 24;
      if (I->log)
         hex |= 1ull << 25;
      break;

   case BI_OPCODE_FLUSH_F32:
   case BI_OPCODE_FLUSH_V2F16:
      hex |= I->nan_mode << 8;
      if (I->ftz)
         hex |= 1ull << 10;
      if (I->flush_inf)
         hex |= 1ull << 11;
      break;

   /* Add mux type */
   case BI_OPCODE_MUX_I32:
   case BI_OPCODE_MUX_V2I16:
   case BI_OPCODE_MUX_V4I8:
      hex |= (uint64_t)I->mux << 32;
      break;

   /* Add .eq flag */
   case BI_OPCODE_BRANCHZ_I16:
   case BI_OPCODE_BRANCHZI:
      pack_assert(I, I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE);

      if (I->cmpf == BI_CMPF_EQ)
         hex |= (1ull << 36);

      if (I->op == BI_OPCODE_BRANCHZI)
         hex |= (0x1ull << 40); /* Absolute */
      else
         hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8;

      break;

   /* Add arithmetic flag */
   case BI_OPCODE_RSHIFT_AND_I32:
   case BI_OPCODE_RSHIFT_AND_V2I16:
   case BI_OPCODE_RSHIFT_AND_V4I8:
   case BI_OPCODE_RSHIFT_OR_I32:
   case BI_OPCODE_RSHIFT_OR_V2I16:
   case BI_OPCODE_RSHIFT_OR_V4I8:
   case BI_OPCODE_RSHIFT_XOR_I32:
   case BI_OPCODE_RSHIFT_XOR_V2I16:
   case BI_OPCODE_RSHIFT_XOR_V4I8:
      hex |= (uint64_t)I->arithmetic << 34;
      break;

   case BI_OPCODE_LEA_BUF_IMM:
      hex |= ((uint64_t)I->table) << 8;
      hex |= ((uint64_t)I->index) << 12;
      break;

   case BI_OPCODE_LEA_ATTR_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->attribute_index) << 20;
      break;

   case BI_OPCODE_IADD_IMM_I32:
   case BI_OPCODE_IADD_IMM_V2I16:
   case BI_OPCODE_IADD_IMM_V4I8:
   case BI_OPCODE_FADD_IMM_F32:
   case BI_OPCODE_FADD_IMM_V2F16:
      hex |= ((uint64_t)I->index) << 8;
      break;

   case BI_OPCODE_CLPER_I32:
      hex |= ((uint64_t)I->inactive_result) << 22;
      hex |= ((uint64_t)I->lane_op) << 32;
      hex |= ((uint64_t)I->subgroup) << 36;
      break;

   case BI_OPCODE_LD_VAR:
   case BI_OPCODE_LD_VAR_FLAT:
   case BI_OPCODE_LD_VAR_IMM:
   case BI_OPCODE_LD_VAR_FLAT_IMM:
   case BI_OPCODE_LD_VAR_BUF_F16:
   case BI_OPCODE_LD_VAR_BUF_F32:
   case BI_OPCODE_LD_VAR_BUF_IMM_F16:
   case BI_OPCODE_LD_VAR_BUF_IMM_F32:
   case BI_OPCODE_LD_VAR_SPECIAL:
      if (I->op == BI_OPCODE_LD_VAR_SPECIAL)
         hex |= ((uint64_t)I->varying_name) << 12; /* instead of index */
      else if (I->op == BI_OPCODE_LD_VAR_BUF_IMM_F16 ||
               I->op == BI_OPCODE_LD_VAR_BUF_IMM_F32) {
         hex |= ((uint64_t)I->index) << 16;
      } else if (I->op == BI_OPCODE_LD_VAR_IMM ||
                 I->op == BI_OPCODE_LD_VAR_FLAT_IMM) {
         hex |= ((uint64_t)I->table) << 8;
         hex |= ((uint64_t)I->index) << 12;
      }

      hex |= ((uint64_t)va_pack_source_format(I)) << 24;
      hex |= ((uint64_t)I->update) << 36;
      hex |= ((uint64_t)I->sample) << 38;
      break;

   case BI_OPCODE_LD_ATTR_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->attribute_index) << 20;
      break;

   case BI_OPCODE_LD_TEX_IMM:
   case BI_OPCODE_LEA_TEX_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->texture_index) << 20;
      break;

   case BI_OPCODE_WMASK:
      hex |= ((uint64_t)I->subgroup) << 36;
      break;

   case BI_OPCODE_ZS_EMIT:
      if (I->stencil)
         hex |= (1 << 24);
      if (I->z)
         hex |= (1 << 25);
      break;

   default:
      break;
   }

   /* FMA_RSCALE.f32 special modes treated as extra opcodes */
   if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
      pack_assert(I, I->special < 4);
      hex |= ((uint64_t)I->special) << 48;
   }

   /* Add the normal destination or a placeholder. Staging destinations are
    * added elsewhere, as they require special handling for control fields.
    */
   if (info.has_dest && info.nr_staging_dests == 0) {
      hex |= (uint64_t)va_pack_dest(I) << 40;
   } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
      pack_assert(I, I->nr_dests == 0);
      hex |= 0xC0ull << 40; /* Placeholder */
   }

   bool swap12 = va_swap_12(I->op);

   /* First src is staging if we read, skip it when packing sources */
   unsigned src_offset = bi_opcode_props[I->op].sr_read ? 1 : 0;

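   /* For opcodes where va_swap_12() is set, encoding source slots 1 and 2 are
    * exchanged relative to the IR sources, so logical_i below maps each
    * encoding slot back to the corresponding IR source.
    */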
   for (unsigned i = 0; i < info.nr_srcs; ++i) {
      unsigned logical_i = (swap12 && i == 1) ? 2 : (swap12 && i == 2) ? 1 : i;

      struct va_src_info src_info = info.srcs[i];
      enum va_size size = src_info.size;

      bi_index src = I->src[logical_i + src_offset];
      hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);

      if (src_info.notted) {
         if (src.neg)
            hex |= (1ull << 35);
      } else if (src_info.absneg) {
         unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
         unsigned abs_offs = 33 + 2 + ((2 - i) * 2);

         if (src.neg)
            hex |= 1ull << neg_offs;
         if (src.abs)
            hex |= 1ull << abs_offs;
      } else {
         if (src.neg)
            invalid_instruction(I, "negate");
         if (src.abs)
            invalid_instruction(I, "absolute value");
      }

      if (src_info.swizzle) {
         unsigned offs = 24 + ((2 - i) * 2);
         unsigned S = src.swizzle;
         pack_assert(I, size == VA_SIZE_16 || size == VA_SIZE_32);

         uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S)
                                          : va_pack_swizzle_f16(I, S));
         hex |= v << offs;
      } else if (src_info.widen) {
         unsigned offs = (i == 1) ? 26 : 36;
         hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs;
      } else if (src_info.lane) {
         unsigned offs =
            (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36) : 28;

         if (src_info.size == VA_SIZE_16) {
            hex |= (src.swizzle == BI_SWIZZLE_H11 ? 1 : 0) << offs;
         } else if (I->op == BI_OPCODE_BRANCHZ_I16) {
            hex |= ((uint64_t)va_pack_combine(I, src.swizzle) << 37);
         } else {
            pack_assert(I, src_info.size == VA_SIZE_8);
            unsigned comp = src.swizzle - BI_SWIZZLE_B0000;
            pack_assert(I, comp < 4);
            hex |= (uint64_t)comp << offs;
         }
      } else if (src_info.lanes) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 1);
         hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
      } else if (src_info.combine) {
         /* Treat as swizzle, subgroup ops not yet supported */
         pack_assert(I, src_info.size == VA_SIZE_32);
         pack_assert(I, i == 0);
         hex |= (uint64_t)va_pack_widen_f32(I, src.swizzle) << 37;
      } else if (src_info.halfswizzle) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 0);
         hex |= (uint64_t)va_pack_halfswizzle(I, src.swizzle) << 36;
      } else if (src.swizzle != BI_SWIZZLE_H01) {
         invalid_instruction(I, "swizzle");
      }
   }

   if (info.saturate)
      hex |= (uint64_t)I->saturate << 30;
   if (info.rhadd)
      hex |= va_pack_rhadd(I);
   if (info.clamp)
      hex |= (uint64_t)I->clamp << 32;
   if (info.round_mode)
      hex |= (uint64_t)I->round << 30;
   if (info.condition)
      hex |= (uint64_t)I->cmpf << 32;
   if (info.result_type)
      hex |= (uint64_t)I->result_type << 30;

   return hex;
}

static uint64_t
va_pack_byte_offset(const bi_instr *I)
{
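   /* The round-trip through int16_t checks that the offset fits in a signed
    * 16-bit immediate; anything wider cannot be encoded.
    */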
   int16_t offset = I->byte_offset;
   if (offset != I->byte_offset)
      invalid_instruction(I, "byte offset");

   uint16_t offset_as_u16 = offset;
   return ((uint64_t)offset_as_u16) << 8;
}

static uint64_t
va_pack_byte_offset_8(const bi_instr *I)
{
   uint8_t offset = I->byte_offset;
   if (offset != I->byte_offset)
      invalid_instruction(I, "byte offset");

   return ((uint64_t)offset) << 8;
}

static uint64_t
va_pack_load(const bi_instr *I, bool buffer_descriptor)
{
   const uint8_t load_lane_identity[8] = {
      VA_LOAD_LANE_8_BIT_B0,        VA_LOAD_LANE_16_BIT_H0,
      VA_LOAD_LANE_24_BIT_IDENTITY, VA_LOAD_LANE_32_BIT_W0,
      VA_LOAD_LANE_48_BIT_IDENTITY, VA_LOAD_LANE_64_BIT_IDENTITY,
      VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY,
   };

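   /* The access size is encoded in bits 27..29 of the opcode's exact bits;
    * recover it to select the identity lane mode for this load width.
    */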
   unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
   uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36;

   // unsigned
   hex |= (1ull << 39);

   if (!buffer_descriptor)
      hex |= va_pack_byte_offset(I);

   hex |= (uint64_t)va_pack_src(I, 0) << 0;

   if (buffer_descriptor)
      hex |= (uint64_t)va_pack_src(I, 1) << 8;

   return hex;
}

static uint64_t
va_pack_memory_access(const bi_instr *I)
{
   switch (I->seg) {
   case BI_SEG_TL:
      return VA_MEMORY_ACCESS_FORCE;
   case BI_SEG_POS:
      return VA_MEMORY_ACCESS_ISTREAM;
   case BI_SEG_VARY:
      return VA_MEMORY_ACCESS_ESTREAM;
   default:
      return VA_MEMORY_ACCESS_NONE;
   }
}

static uint64_t
va_pack_store(const bi_instr *I)
{
   uint64_t hex = va_pack_memory_access(I) << 24;

   va_validate_register_pair(I, 1);
   hex |= (uint64_t)va_pack_src(I, 1) << 0;

   hex |= va_pack_byte_offset(I);

   return hex;
}

static enum va_lod_mode
va_pack_lod_mode(const bi_instr *I)
{
   switch (I->va_lod_mode) {
   case BI_VA_LOD_MODE_ZERO_LOD:
      return VA_LOD_MODE_ZERO;
   case BI_VA_LOD_MODE_COMPUTED_LOD:
      return VA_LOD_MODE_COMPUTED;
   case BI_VA_LOD_MODE_EXPLICIT:
      return VA_LOD_MODE_EXPLICIT;
   case BI_VA_LOD_MODE_COMPUTED_BIAS:
      return VA_LOD_MODE_COMPUTED_BIAS;
   case BI_VA_LOD_MODE_GRDESC:
      return VA_LOD_MODE_GRDESC;
   }

   invalid_instruction(I, "LOD mode");
}

static enum va_register_type
va_pack_register_type(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_F16:
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_TYPE_F;

   case BI_REGISTER_FORMAT_U16:
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_TYPE_U;

   case BI_REGISTER_FORMAT_S16:
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_TYPE_S;

   default:
      invalid_instruction(I, "register type");
   }
}

static enum va_register_format
va_pack_register_format(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_AUTO:
      return VA_REGISTER_FORMAT_AUTO;
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_FORMAT_F32;
   case BI_REGISTER_FORMAT_F16:
      return VA_REGISTER_FORMAT_F16;
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_FORMAT_S32;
   case BI_REGISTER_FORMAT_S16:
      return VA_REGISTER_FORMAT_S16;
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_FORMAT_U32;
   case BI_REGISTER_FORMAT_U16:
      return VA_REGISTER_FORMAT_U16;
   default:
      invalid_instruction(I, "register format");
   }
}

uint64_t
va_pack_instr(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];

   uint64_t hex = info.exact | (((uint64_t)I->flow) << 59);
   hex |= ((uint64_t)va_select_fau_page(I)) << 57;

   if (info.slot)
      hex |= ((uint64_t)I->slot << 30);

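   /* For staging instructions, pack the count of consecutive staging
    * registers accessed, the base staging register, and the opcode's staging
    * control mode.
    */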
   if (info.sr_count) {
      bool read = bi_opcode_props[I->op].sr_read;
      bi_index sr = read ? I->src[0] : I->dest[0];

      unsigned count =
         read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0);

      hex |= ((uint64_t)count << 33);
      hex |= (uint64_t)va_pack_reg(I, sr) << 40;
      hex |= ((uint64_t)info.sr_control << 46);
   }

   if (info.sr_write_count) {
      hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36;
      hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
   }

   if (info.vecsize)
      hex |= ((uint64_t)I->vecsize << 28);

   if (info.register_format)
      hex |= ((uint64_t)va_pack_register_format(I)) << 24;

   switch (I->op) {
   case BI_OPCODE_LOAD_I8:
   case BI_OPCODE_LOAD_I16:
   case BI_OPCODE_LOAD_I24:
   case BI_OPCODE_LOAD_I32:
   case BI_OPCODE_LOAD_I48:
   case BI_OPCODE_LOAD_I64:
   case BI_OPCODE_LOAD_I96:
   case BI_OPCODE_LOAD_I128:
      hex |= va_pack_load(I, false);
      break;

   case BI_OPCODE_LD_BUFFER_I8:
   case BI_OPCODE_LD_BUFFER_I16:
   case BI_OPCODE_LD_BUFFER_I24:
   case BI_OPCODE_LD_BUFFER_I32:
   case BI_OPCODE_LD_BUFFER_I48:
   case BI_OPCODE_LD_BUFFER_I64:
   case BI_OPCODE_LD_BUFFER_I96:
   case BI_OPCODE_LD_BUFFER_I128:
      hex |= va_pack_load(I, true);
      break;

   case BI_OPCODE_STORE_I8:
   case BI_OPCODE_STORE_I16:
   case BI_OPCODE_STORE_I24:
   case BI_OPCODE_STORE_I32:
   case BI_OPCODE_STORE_I48:
   case BI_OPCODE_STORE_I64:
   case BI_OPCODE_STORE_I96:
   case BI_OPCODE_STORE_I128:
      hex |= va_pack_store(I);
      break;

   case BI_OPCODE_ATOM1_RETURN_I32:
      /* Permit omitting the destination for plain ATOM1 */
      if (!bi_count_write_registers(I, 0)) {
         hex |= (0x40ull << 40); // fake read
      }

      /* 64-bit source */
      va_validate_register_pair(I, 0);
      hex |= (uint64_t)va_pack_src(I, 0) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
      break;

   case BI_OPCODE_ATOM_I32:
   case BI_OPCODE_ATOM_RETURN_I32:
      /* 64-bit source */
      va_validate_register_pair(I, 1);
      hex |= (uint64_t)va_pack_src(I, 1) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;

      if (I->op == BI_OPCODE_ATOM_RETURN_I32)
         hex |= (0xc0ull << 40); // flags

      if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
         hex |= (1 << 26); /* .compare */

      break;

   case BI_OPCODE_ST_CVT:
      /* Staging read */
      hex |= va_pack_store(I);

      /* Conversion descriptor */
      hex |= (uint64_t)va_pack_src(I, 3) << 16;
      break;

   case BI_OPCODE_BLEND: {
      /* Source 0 - Blend descriptor (64-bit) */
      hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
      va_validate_register_pair(I, 2);

      /* Target */
      if (I->branch_offset & 0x7)
         invalid_instruction(I, "unaligned branch");
      hex |= ((I->branch_offset >> 3) << 8);

      /* Source 2 - coverage mask */
      hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;

      /* Vector size */
      unsigned vecsize = 4;
      hex |= ((uint64_t)(vecsize - 1) << 28);

      break;
   }

   case BI_OPCODE_TEX_GRADIENT:
   case BI_OPCODE_TEX_SINGLE:
   case BI_OPCODE_TEX_FETCH:
   case BI_OPCODE_TEX_GATHER: {
      /* Image to read from */
      hex |= ((uint64_t)va_pack_src(I, 1)) << 0;

      if ((I->op == BI_OPCODE_TEX_FETCH || I->op == BI_OPCODE_TEX_GRADIENT) &&
          I->shadow)
         invalid_instruction(I, "texture instruction does not support .shadow");

      if (I->wide_indices)
         hex |= (1ull << 8);
      if (I->array_enable)
         hex |= (1ull << 10);
      if (I->texel_offset)
         hex |= (1ull << 11);
      if (I->shadow)
         hex |= (1ull << 12);
      if (I->skip)
         hex |= (1ull << 39);
      if (!bi_is_regfmt_16(I->register_format))
         hex |= (1ull << 46);

      if (I->op == BI_OPCODE_TEX_GRADIENT) {
         if (I->force_delta_enable)
            hex |= (1ull << 12);
         if (I->lod_bias_disable)
            hex |= (1ull << 13);
         if (I->lod_clamp_disable)
            hex |= (1ull << 14);
         if (I->derivative_enable)
            hex |= (1ull << 15);
      }

      if (I->op == BI_OPCODE_TEX_SINGLE)
         hex |= ((uint64_t)va_pack_lod_mode(I)) << 13;

      if (I->op == BI_OPCODE_TEX_GATHER) {
         if (I->integer_coordinates)
            hex |= (1 << 13);
         hex |= ((uint64_t)I->fetch_component) << 14;
      }

      hex |= (I->write_mask << 22);
      hex |= ((uint64_t)I->dimension) << 28;

      break;
   }

   default:
      if (!info.exact && I->op != BI_OPCODE_NOP)
         invalid_instruction(I, "opcode");

      hex |= va_pack_alu(I);
      break;
   }

   return hex;
}

static unsigned
va_instructions_in_block(bi_block *block)
{
   unsigned offset = 0;

   bi_foreach_instr_in_block(block, _) {
      offset++;
   }

   return offset;
}

/* Calculate branch_offset from a branch_target for a direct relative branch */

static void
va_lower_branch_target(bi_context *ctx, bi_block *start, bi_instr *I)
{
   /* Precondition: unlowered relative branch */
   bi_block *target = I->branch_target;
   assert(target != NULL);

   /* Signed since we might jump backwards */
   signed offset = 0;

   /* Determine if the target block is strictly greater in source order */
   bool forwards = target->index > start->index;

   if (forwards) {
      /* We have to jump through this block */
      bi_foreach_instr_in_block_from(start, _, I) {
         offset++;
      }

      /* We then need to jump over every following block until the target */
      bi_foreach_block_from(ctx, start, blk) {
         /* End just before the target */
         if (blk == target)
            break;

         /* Count other blocks */
         if (blk != start)
            offset += va_instructions_in_block(blk);
      }
   } else {
      /* Jump through the beginning of this block */
      bi_foreach_instr_in_block_from_rev(start, ins, I) {
         if (ins != I)
            offset--;
      }

      /* Jump over preceding blocks up to and including the target to get to
       * the beginning of the target */
      bi_foreach_block_from_rev(ctx, start, blk) {
         if (blk == start)
            continue;

         offset -= va_instructions_in_block(blk);

         /* End just after the target */
         if (blk == target)
            break;
      }
   }

   /* Offset is relative to the next instruction, so bias */
   offset--;

   /* Update the instruction */
   I->branch_offset = offset;
}

/*
 * Late lowering to insert blend shader calls after BLEND instructions. Required
 * to support blend shaders, so this pass may be omitted if it is known that
 * blend shaders are never used.
 *
 * This lowering runs late because it introduces control flow changes without
 * modifying the control flow graph. It hardcodes registers, meaning running
 * after RA makes sense. Finally, it hardcodes a manually sized instruction
 * sequence, requiring it to run after scheduling.
 *
 * As it is Valhall specific, running it as a pre-pack lowering is sensible.
 */
static void
va_lower_blend(bi_context *ctx)
{
   /* Program counter for *next* instruction */
   bi_index pc = bi_fau(BIR_FAU_PROGRAM_COUNTER, false);

   bi_foreach_instr_global_safe(ctx, I) {
      if (I->op != BI_OPCODE_BLEND)
         continue;

      bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

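      /* The prologue emitted below is two packed instructions (an IADD_IMM.i32
       * and a BRANCHZI), 8 bytes each, hence the manual sizing here.
       */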
      unsigned prolog_length = 2 * 8;

      /* By ABI, r48 is the link register shared with blend shaders */
      assert(bi_is_equiv(I->dest[0], bi_register(48)));

      if (I->flow == VA_FLOW_END)
         bi_iadd_imm_i32_to(&b, I->dest[0], va_zero_lut(), 0);
      else
         bi_iadd_imm_i32_to(&b, I->dest[0], pc, prolog_length - 8);

      bi_branchzi(&b, va_zero_lut(), I->src[3], BI_CMPF_EQ);

      /* For fixed function: skip the prologue, or return */
      if (I->flow != VA_FLOW_END)
         I->branch_offset = prolog_length;
   }
}

void
bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission)
{
   unsigned orig_size = emission->size;

   va_validate(stderr, ctx);

   /* Late lowering */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
      va_lower_blend(ctx);

   bi_foreach_block(ctx, block) {
      bi_foreach_instr_in_block(block, I) {
         if (I->op == BI_OPCODE_BRANCHZ_I16)
            va_lower_branch_target(ctx, block, I);

         uint64_t hex = va_pack_instr(I);
         util_dynarray_append(emission, uint64_t, hex);
      }
   }

   /* Pad with zeroes, but keep empty programs empty so they may be omitted
    * altogether. Failing to do this would result in a program containing only
    * zeroes, which is invalid and will raise an encoding fault.
    *
    * Pad an extra 16 bytes (one instruction) to separate primary and secondary
    * shader disassemblies. This is not strictly necessary, but it's good
    * practice. 128 bytes is the optimal program alignment on Trym, so pad
    * secondary shaders up to 128 bytes. This may help the instruction cache.
    */
   if (orig_size != emission->size) {
      unsigned aligned = ALIGN_POT(emission->size + 16, 128);
      unsigned count = aligned - emission->size;

      memset(util_dynarray_grow(emission, uint8_t, count), 0, count);
   }
}