/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "va_compiler.h"
#include "valhall.h"
#include "valhall_enums.h"
#include "bi_builder.h"

/* This file contains the final passes of the compiler. Running after
 * scheduling and RA, the IR is now finalized, so we need to emit it to actual
 * bits on the wire (as well as fix up branches).
 */

/*
 * Unreachable for encoding failures, when hitting an invalid instruction.
 * Prints the (first) failing instruction to aid debugging.
 */
NORETURN static void PRINTFLIKE(2, 3)
invalid_instruction(const bi_instr *I, const char *cause, ...)
{
   fputs("\nInvalid ", stderr);

   va_list ap;
   va_start(ap, cause);
   vfprintf(stderr, cause, ap);
   va_end(ap);

   fputs(":\n\t", stderr);
   bi_print_instr(I, stderr);
   fprintf(stderr, "\n");

   unreachable("Invalid instruction");
}

/*
 * Like assert, but prints the instruction if the assertion fails to aid
 * debugging invalid inputs to the packing module.
 */
#define pack_assert(I, cond) \
   if (!(cond)) invalid_instruction(I, "invariant " #cond);

/*
 * Validate that two adjacent 32-bit sources form an aligned 64-bit register
 * pair. This is a compiler invariant, required on Valhall but not on Bifrost.
 */
static void
va_validate_register_pair(const bi_instr *I, unsigned s)
{
   ASSERTED bi_index lo = I->src[s], hi = I->src[s + 1];

   pack_assert(I, lo.type == hi.type);

   if (lo.type == BI_INDEX_REGISTER) {
      pack_assert(I, hi.value & 1);
      pack_assert(I, hi.value == lo.value + 1);
   } else if (lo.type == BI_INDEX_FAU && lo.value & BIR_FAU_IMMEDIATE) {
      /* Small constants are zero-extended, so the top word encodes zero */
      pack_assert(I, hi.value == (BIR_FAU_IMMEDIATE | 0));
   } else {
      pack_assert(I, hi.offset & 1);
      pack_assert(I, hi.offset == lo.offset + 1);
   }
}

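/* Encode a register operand, asserting the index is a register below r64 */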
static unsigned
va_pack_reg(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_REGISTER);
   pack_assert(I, idx.value < 64);

   return idx.value;
}

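/* Map special FAU (Fast Access Uniform) slots to their Valhall page encodings */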
static unsigned
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
{
   switch (fau) {
   case BIR_FAU_ATEST_PARAM: return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM;
   case BIR_FAU_TLS_PTR: return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER;
   case BIR_FAU_WLS_PTR: return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER;
   case BIR_FAU_LANE_ID: return VA_FAU_SPECIAL_PAGE_3_LANE_ID;
   case BIR_FAU_PROGRAM_COUNTER: return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER;
   case BIR_FAU_SAMPLE_POS_ARRAY: return VA_FAU_SPECIAL_PAGE_0_SAMPLE;

   case BIR_FAU_BLEND_0 ... (BIR_FAU_BLEND_0 + 7):
      return VA_FAU_SPECIAL_PAGE_0_BLEND_DESCRIPTOR_0 + (fau - BIR_FAU_BLEND_0);

   default:
      invalid_instruction(I, "FAU");
   }
}

/*
 * Encode a 64-bit FAU source. The offset is ignored, so this function can be
 * used to encode a 32-bit FAU source by or'ing in the appropriate offset.
 */
static unsigned
va_pack_fau_64(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_FAU);

   unsigned val = (idx.value & BITFIELD_MASK(5));

   if (idx.value & BIR_FAU_IMMEDIATE)
      return (0x3 << 6) | (val << 1);
   else if (idx.value & BIR_FAU_UNIFORM)
      return (0x2 << 6) | (val << 1);
   else
      return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
}

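/*
 * Encode an 8-bit source operand: either a general-purpose register (with an
 * optional discard bit) or a word of an FAU slot.
 */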
static unsigned
va_pack_src(const bi_instr *I, unsigned s)
{
   bi_index idx = I->src[s];

   if (idx.type == BI_INDEX_REGISTER) {
      unsigned value = va_pack_reg(I, idx);
      if (idx.discard) value |= (1 << 6);
      return value;
   } else if (idx.type == BI_INDEX_FAU) {
      pack_assert(I, idx.offset <= 1);
      return va_pack_fau_64(I, idx) | idx.offset;
   }

   invalid_instruction(I, "type of source %u", s);
}

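/* Derive the 2-bit halfword write mask from the destination swizzle */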
static unsigned
va_pack_wrmask(const bi_instr *I)
{
   switch (I->dest[0].swizzle) {
   case BI_SWIZZLE_H00: return 0x1;
   case BI_SWIZZLE_H11: return 0x2;
   case BI_SWIZZLE_H01: return 0x3;
   default: invalid_instruction(I, "write mask");
   }
}

static enum va_atomic_operation
va_pack_atom_opc(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AADD: return VA_ATOMIC_OPERATION_AADD;
   case BI_ATOM_OPC_ASMIN: return VA_ATOMIC_OPERATION_ASMIN;
   case BI_ATOM_OPC_ASMAX: return VA_ATOMIC_OPERATION_ASMAX;
   case BI_ATOM_OPC_AUMIN: return VA_ATOMIC_OPERATION_AUMIN;
   case BI_ATOM_OPC_AUMAX: return VA_ATOMIC_OPERATION_AUMAX;
   case BI_ATOM_OPC_AAND: return VA_ATOMIC_OPERATION_AAND;
   case BI_ATOM_OPC_AOR: return VA_ATOMIC_OPERATION_AOR;
   case BI_ATOM_OPC_AXOR: return VA_ATOMIC_OPERATION_AXOR;
   case BI_ATOM_OPC_ACMPXCHG:
   case BI_ATOM_OPC_AXCHG: return VA_ATOMIC_OPERATION_AXCHG;
   default: invalid_instruction(I, "atomic opcode");
   }
}

static enum va_atomic_operation_with_1
va_pack_atom_opc_1(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AINC: return VA_ATOMIC_OPERATION_WITH_1_AINC;
   case BI_ATOM_OPC_ADEC: return VA_ATOMIC_OPERATION_WITH_1_ADEC;
   case BI_ATOM_OPC_AUMAX1: return VA_ATOMIC_OPERATION_WITH_1_AUMAX1;
   case BI_ATOM_OPC_ASMAX1: return VA_ATOMIC_OPERATION_WITH_1_ASMAX1;
   case BI_ATOM_OPC_AOR1: return VA_ATOMIC_OPERATION_WITH_1_AOR1;
   default: invalid_instruction(I, "atomic opcode with implicit 1");
   }
}

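/* Encode the destination register together with its write mask */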
static unsigned
va_pack_dest(const bi_instr *I)
{
   return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
}

static enum va_widen
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01: return VA_WIDEN_NONE;
   case BI_SWIZZLE_H00: return VA_WIDEN_H0;
   case BI_SWIZZLE_H11: return VA_WIDEN_H1;
   default: invalid_instruction(I, "widen");
   }
}

static enum va_swizzles_16_bit
va_pack_swizzle_f16(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H00: return VA_SWIZZLES_16_BIT_H00;
   case BI_SWIZZLE_H10: return VA_SWIZZLES_16_BIT_H10;
   case BI_SWIZZLE_H01: return VA_SWIZZLES_16_BIT_H01;
   case BI_SWIZZLE_H11: return VA_SWIZZLES_16_BIT_H11;
   default: invalid_instruction(I, "16-bit swizzle");
   }
}

static unsigned
va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
{
   if (size == VA_SIZE_8) {
      switch (swz) {
      case BI_SWIZZLE_H01: return VA_SWIZZLES_8_BIT_B0123;
      case BI_SWIZZLE_H00: return VA_SWIZZLES_8_BIT_B0101;
      case BI_SWIZZLE_H11: return VA_SWIZZLES_8_BIT_B2323;
      case BI_SWIZZLE_B0000: return VA_SWIZZLES_8_BIT_B0000;
      case BI_SWIZZLE_B1111: return VA_SWIZZLES_8_BIT_B1111;
      case BI_SWIZZLE_B2222: return VA_SWIZZLES_8_BIT_B2222;
      case BI_SWIZZLE_B3333: return VA_SWIZZLES_8_BIT_B3333;
      default: invalid_instruction(I, "8-bit widen");
      }
   } else if (size == VA_SIZE_16) {
      switch (swz) {
      case BI_SWIZZLE_H00: return VA_SWIZZLES_16_BIT_H00;
      case BI_SWIZZLE_H10: return VA_SWIZZLES_16_BIT_H10;
      case BI_SWIZZLE_H01: return VA_SWIZZLES_16_BIT_H01;
      case BI_SWIZZLE_H11: return VA_SWIZZLES_16_BIT_H11;
      case BI_SWIZZLE_B0000: return VA_SWIZZLES_16_BIT_B00;
      case BI_SWIZZLE_B1111: return VA_SWIZZLES_16_BIT_B11;
      case BI_SWIZZLE_B2222: return VA_SWIZZLES_16_BIT_B22;
      case BI_SWIZZLE_B3333: return VA_SWIZZLES_16_BIT_B33;
      default: invalid_instruction(I, "16-bit widen");
      }
   } else if (size == VA_SIZE_32) {
      switch (swz) {
      case BI_SWIZZLE_H01: return VA_SWIZZLES_32_BIT_NONE;
      case BI_SWIZZLE_H00: return VA_SWIZZLES_32_BIT_H0;
      case BI_SWIZZLE_H11: return VA_SWIZZLES_32_BIT_H1;
      case BI_SWIZZLE_B0000: return VA_SWIZZLES_32_BIT_B0;
      case BI_SWIZZLE_B1111: return VA_SWIZZLES_32_BIT_B1;
      case BI_SWIZZLE_B2222: return VA_SWIZZLES_32_BIT_B2;
      case BI_SWIZZLE_B3333: return VA_SWIZZLES_32_BIT_B3;
      default: invalid_instruction(I, "32-bit widen");
      }
   } else {
      invalid_instruction(I, "type size for widen");
   }
}

static enum va_half_swizzles_8_bit
va_pack_halfswizzle(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_B0000: return VA_HALF_SWIZZLES_8_BIT_B00;
   case BI_SWIZZLE_B1111: return VA_HALF_SWIZZLES_8_BIT_B11;
   case BI_SWIZZLE_B2222: return VA_HALF_SWIZZLES_8_BIT_B22;
   case BI_SWIZZLE_B3333: return VA_HALF_SWIZZLES_8_BIT_B33;
   case BI_SWIZZLE_B0011: return VA_HALF_SWIZZLES_8_BIT_B01;
   case BI_SWIZZLE_B2233: return VA_HALF_SWIZZLES_8_BIT_B23;
   case BI_SWIZZLE_B0022: return VA_HALF_SWIZZLES_8_BIT_B02;
   default: invalid_instruction(I, "v2u8 swizzle");
   }
}

static enum va_lanes_8_bit
va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01: return VA_LANES_8_BIT_B02;
   case BI_SWIZZLE_B0000: return VA_LANES_8_BIT_B00;
   case BI_SWIZZLE_B1111: return VA_LANES_8_BIT_B11;
   case BI_SWIZZLE_B2222: return VA_LANES_8_BIT_B22;
   case BI_SWIZZLE_B3333: return VA_LANES_8_BIT_B33;
   default: invalid_instruction(I, "lane shift");
   }
}

static enum va_combine
va_pack_combine(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01: return VA_COMBINE_NONE;
   case BI_SWIZZLE_H00: return VA_COMBINE_H0;
   case BI_SWIZZLE_H11: return VA_COMBINE_H1;
   default: invalid_instruction(I, "branch lane");
   }
}

static enum va_source_format
va_pack_source_format(const bi_instr *I)
{
   switch (I->source_format) {
   case BI_SOURCE_FORMAT_FLAT32: return VA_SOURCE_FORMAT_SRC_FLAT32;
   case BI_SOURCE_FORMAT_FLAT16: return VA_SOURCE_FORMAT_SRC_FLAT16;
   case BI_SOURCE_FORMAT_F32: return VA_SOURCE_FORMAT_SRC_F32;
   case BI_SOURCE_FORMAT_F16: return VA_SOURCE_FORMAT_SRC_F16;
   }

   invalid_instruction(I, "source format");
}

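/*
 * Pack the ALU-style portion of an instruction: opcode-specific immediates and
 * flags, the (non-staging) destination, and each source with its modifiers
 * (abs/neg, swizzle, widen, lane select).
 */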
static uint64_t
va_pack_alu(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];
   uint64_t hex = 0;

   switch (I->op) {
   /* Add FREXP flags */
   case BI_OPCODE_FREXPE_F32:
   case BI_OPCODE_FREXPE_V2F16:
   case BI_OPCODE_FREXPM_F32:
   case BI_OPCODE_FREXPM_V2F16:
      if (I->sqrt) hex |= 1ull << 24;
      if (I->log) hex |= 1ull << 25;
      break;

   /* Add mux type */
   case BI_OPCODE_MUX_I32:
   case BI_OPCODE_MUX_V2I16:
   case BI_OPCODE_MUX_V4I8:
      hex |= (uint64_t) I->mux << 32;
      break;

   /* Add .eq flag */
   case BI_OPCODE_BRANCHZ_I16:
   case BI_OPCODE_BRANCHZI:
      pack_assert(I, I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE);

      if (I->cmpf == BI_CMPF_EQ) hex |= (1ull << 36);

      if (I->op == BI_OPCODE_BRANCHZI)
         hex |= (0x1ull << 40); /* Absolute */
      else
         hex |= ((uint64_t) I->branch_offset & BITFIELD_MASK(27)) << 8;

      break;

   /* Add arithmetic flag */
   case BI_OPCODE_RSHIFT_AND_I32:
   case BI_OPCODE_RSHIFT_AND_V2I16:
   case BI_OPCODE_RSHIFT_AND_V4I8:
   case BI_OPCODE_RSHIFT_OR_I32:
   case BI_OPCODE_RSHIFT_OR_V2I16:
   case BI_OPCODE_RSHIFT_OR_V4I8:
   case BI_OPCODE_RSHIFT_XOR_I32:
   case BI_OPCODE_RSHIFT_XOR_V2I16:
   case BI_OPCODE_RSHIFT_XOR_V4I8:
      hex |= (uint64_t) I->arithmetic << 34;
      break;

   case BI_OPCODE_LEA_BUF_IMM:
      /* Buffer table index */
      hex |= 0xD << 8;
      break;

   case BI_OPCODE_LEA_ATTR_IMM:
      hex |= ((uint64_t) I->table) << 16;
      hex |= ((uint64_t) I->attribute_index) << 20;
      break;

   case BI_OPCODE_IADD_IMM_I32:
   case BI_OPCODE_IADD_IMM_V2I16:
   case BI_OPCODE_IADD_IMM_V4I8:
   case BI_OPCODE_FADD_IMM_F32:
   case BI_OPCODE_FADD_IMM_V2F16:
      hex |= ((uint64_t) I->index) << 8;
      break;

   case BI_OPCODE_CLPER_I32:
      hex |= ((uint64_t) I->inactive_result) << 22;
      hex |= ((uint64_t) I->lane_op) << 32;
      hex |= ((uint64_t) I->subgroup) << 36;
      break;

   case BI_OPCODE_LD_VAR:
   case BI_OPCODE_LD_VAR_FLAT:
   case BI_OPCODE_LD_VAR_IMM:
   case BI_OPCODE_LD_VAR_FLAT_IMM:
   case BI_OPCODE_LD_VAR_BUF_F16:
   case BI_OPCODE_LD_VAR_BUF_F32:
   case BI_OPCODE_LD_VAR_BUF_IMM_F16:
   case BI_OPCODE_LD_VAR_BUF_IMM_F32:
   case BI_OPCODE_LD_VAR_SPECIAL:
      if (I->op == BI_OPCODE_LD_VAR_SPECIAL)
         hex |= ((uint64_t) I->varying_name) << 12; /* instead of index */
      else if (I->op == BI_OPCODE_LD_VAR_BUF_IMM_F16 ||
               I->op == BI_OPCODE_LD_VAR_BUF_IMM_F32) {
         hex |= ((uint64_t) I->index) << 16;
      } else if (I->op == BI_OPCODE_LD_VAR_IMM ||
                 I->op == BI_OPCODE_LD_VAR_FLAT_IMM) {
         hex |= ((uint64_t) I->table) << 8;
         hex |= ((uint64_t) I->index) << 12;
      }

      hex |= ((uint64_t) va_pack_source_format(I)) << 24;
      hex |= ((uint64_t) I->update) << 36;
      hex |= ((uint64_t) I->sample) << 38;
      break;

   case BI_OPCODE_LD_ATTR_IMM:
      hex |= ((uint64_t) I->table) << 16;
      hex |= ((uint64_t) I->attribute_index) << 20;
      break;

   case BI_OPCODE_LD_TEX_IMM:
   case BI_OPCODE_LEA_TEX_IMM:
      hex |= ((uint64_t) I->table) << 16;
      hex |= ((uint64_t) I->texture_index) << 20;
      break;

   case BI_OPCODE_ZS_EMIT:
      if (I->stencil) hex |= (1 << 24);
      if (I->z) hex |= (1 << 25);
      break;

   default:
      break;
   }

   /* FMA_RSCALE.f32 special modes treated as extra opcodes */
   if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
      pack_assert(I, I->special < 4);
      hex |= ((uint64_t) I->special) << 48;
   }

   /* Add the normal destination or a placeholder. Staging destinations are
    * added elsewhere, as they require special handling for control fields.
    */
   if (info.has_dest && info.nr_staging_dests == 0) {
      hex |= (uint64_t) va_pack_dest(I) << 40;
   } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
      pack_assert(I, bi_is_null(I->dest[0]));
      hex |= 0xC0ull << 40; /* Placeholder */
   }

   bool swap12 = va_swap_12(I->op);

   /* First src is staging if we read, skip it when packing sources */
   unsigned src_offset = bi_opcode_props[I->op].sr_read ? 1 : 0;

   for (unsigned i = 0; i < info.nr_srcs; ++i) {
      unsigned logical_i = (swap12 && i == 1) ? 2 : (swap12 && i == 2) ? 1 : i;

      struct va_src_info src_info = info.srcs[i];
      enum va_size size = src_info.size;

      bi_index src = I->src[logical_i + src_offset];
      hex |= (uint64_t) va_pack_src(I, logical_i + src_offset) << (8 * i);

      if (src_info.notted) {
         if (src.neg) hex |= (1ull << 35);
      } else if (src_info.absneg) {
         unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
         unsigned abs_offs = 33 + 2 + ((2 - i) * 2);

         if (src.neg) hex |= 1ull << neg_offs;
         if (src.abs) hex |= 1ull << abs_offs;
      } else {
         if (src.neg) invalid_instruction(I, "negate");
         if (src.abs) invalid_instruction(I, "absolute value");
      }

      if (src_info.swizzle) {
         unsigned offs = 24 + ((2 - i) * 2);
         unsigned S = src.swizzle;
         pack_assert(I, size == VA_SIZE_16 || size == VA_SIZE_32);

         uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S)
                                          : va_pack_swizzle_f16(I, S));
         hex |= v << offs;
      } else if (src_info.widen) {
         unsigned offs = (i == 1) ? 26 : 36;
         hex |= (uint64_t) va_pack_widen(I, src.swizzle, src_info.size) << offs;
      } else if (src_info.lane) {
         unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ?
                         ((i == 0) ? 38 : 36) :
                         28;

         if (src_info.size == VA_SIZE_16) {
            hex |= (src.swizzle == BI_SWIZZLE_H11 ? 1 : 0) << offs;
         } else if (I->op == BI_OPCODE_BRANCHZ_I16) {
            hex |= ((uint64_t) va_pack_combine(I, src.swizzle) << 37);
         } else {
            pack_assert(I, src_info.size == VA_SIZE_8);
            unsigned comp = src.swizzle - BI_SWIZZLE_B0000;
            pack_assert(I, comp < 4);
            hex |= (uint64_t) comp << offs;
         }
      } else if (src_info.lanes) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 1);
         hex |= (uint64_t) va_pack_shift_lanes(I, src.swizzle) << 26;
      } else if (src_info.combine) {
         /* Treat as swizzle, subgroup ops not yet supported */
         pack_assert(I, src_info.size == VA_SIZE_32);
         pack_assert(I, i == 0);
         hex |= (uint64_t) va_pack_widen_f32(I, src.swizzle) << 37;
      } else if (src_info.halfswizzle) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 0);
         hex |= (uint64_t) va_pack_halfswizzle(I, src.swizzle) << 36;
      } else if (src.swizzle != BI_SWIZZLE_H01) {
         invalid_instruction(I, "swizzle");
      }
   }

   if (info.clamp) hex |= (uint64_t) I->clamp << 32;
   if (info.round_mode) hex |= (uint64_t) I->round << 30;
   if (info.condition) hex |= (uint64_t) I->cmpf << 32;
   if (info.result_type) hex |= (uint64_t) I->result_type << 30;

   return hex;
}

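/*
 * Encode the byte offset of a memory access, which must fit in a signed
 * 16-bit immediate (or, for the atomics below, an unsigned 8-bit immediate).
 */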
static uint64_t
va_pack_byte_offset(const bi_instr *I)
{
   int16_t offset = I->byte_offset;
   if (offset != I->byte_offset) invalid_instruction(I, "byte offset");

   uint16_t offset_as_u16 = offset;
   return ((uint64_t) offset_as_u16) << 8;
}

static uint64_t
va_pack_byte_offset_8(const bi_instr *I)
{
   uint8_t offset = I->byte_offset;
   if (offset != I->byte_offset) invalid_instruction(I, "byte offset");

   return ((uint64_t) offset) << 8;
}

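/*
 * Pack the load-specific fields: an identity lane select for the access size,
 * the byte offset (or buffer descriptor source), and the address source.
 */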
static uint64_t
va_pack_load(const bi_instr *I, bool buffer_descriptor)
{
   const uint8_t load_lane_identity[8] = {
      VA_LOAD_LANE_8_BIT_B0,
      VA_LOAD_LANE_16_BIT_H0,
      VA_LOAD_LANE_24_BIT_IDENTITY,
      VA_LOAD_LANE_32_BIT_W0,
      VA_LOAD_LANE_48_BIT_IDENTITY,
      VA_LOAD_LANE_64_BIT_IDENTITY,
      VA_LOAD_LANE_96_BIT_IDENTITY,
      VA_LOAD_LANE_128_BIT_IDENTITY,
   };

   unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
   uint64_t hex = (uint64_t) load_lane_identity[memory_size] << 36;

   // unsigned
   hex |= (1ull << 39);

   if (!buffer_descriptor)
      hex |= va_pack_byte_offset(I);

   hex |= (uint64_t) va_pack_src(I, 0) << 0;

   if (buffer_descriptor)
      hex |= (uint64_t) va_pack_src(I, 1) << 8;

   return hex;
}

static uint64_t
va_pack_memory_access(const bi_instr *I)
{
   switch (I->seg) {
   case BI_SEG_TL: return VA_MEMORY_ACCESS_FORCE;
   case BI_SEG_POS: return VA_MEMORY_ACCESS_ISTREAM;
   case BI_SEG_VARY: return VA_MEMORY_ACCESS_ESTREAM;
   default: return VA_MEMORY_ACCESS_NONE;
   }
}

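/*
 * Pack the store-specific fields: the memory access mode, the 64-bit address
 * register pair, and the byte offset.
 */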
static uint64_t
va_pack_store(const bi_instr *I)
{
   uint64_t hex = va_pack_memory_access(I) << 24;

   va_validate_register_pair(I, 1);
   hex |= (uint64_t) va_pack_src(I, 1) << 0;

   hex |= va_pack_byte_offset(I);

   return hex;
}

static enum va_lod_mode
va_pack_lod_mode(const bi_instr *I)
{
   switch (I->va_lod_mode) {
   case BI_VA_LOD_MODE_ZERO_LOD: return VA_LOD_MODE_ZERO;
   case BI_VA_LOD_MODE_COMPUTED_LOD: return VA_LOD_MODE_COMPUTED;
   case BI_VA_LOD_MODE_EXPLICIT: return VA_LOD_MODE_EXPLICIT;
   case BI_VA_LOD_MODE_COMPUTED_BIAS: return VA_LOD_MODE_COMPUTED_BIAS;
   case BI_VA_LOD_MODE_GRDESC: return VA_LOD_MODE_GRDESC;
   }

   invalid_instruction(I, "LOD mode");
}

static enum va_register_type
va_pack_register_type(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_F16:
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_TYPE_F;

   case BI_REGISTER_FORMAT_U16:
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_TYPE_U;

   case BI_REGISTER_FORMAT_S16:
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_TYPE_S;

   default:
      invalid_instruction(I, "register type");
   }
}

static enum va_register_format
va_pack_register_format(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_AUTO: return VA_REGISTER_FORMAT_AUTO;
   case BI_REGISTER_FORMAT_F32: return VA_REGISTER_FORMAT_F32;
   case BI_REGISTER_FORMAT_F16: return VA_REGISTER_FORMAT_F16;
   case BI_REGISTER_FORMAT_S32: return VA_REGISTER_FORMAT_S32;
   case BI_REGISTER_FORMAT_S16: return VA_REGISTER_FORMAT_S16;
   case BI_REGISTER_FORMAT_U32: return VA_REGISTER_FORMAT_U32;
   case BI_REGISTER_FORMAT_U16: return VA_REGISTER_FORMAT_U16;
   default: invalid_instruction(I, "register format");
   }
}

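/*
 * Pack a single Valhall instruction into its 64-bit binary encoding: the
 * common fields (flow control, FAU page, slot, staging registers, vector
 * size, register format) plus the per-opcode payload.
 */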
uint64_t
va_pack_instr(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];

   uint64_t hex = info.exact | (((uint64_t) I->flow) << 59);
   hex |= ((uint64_t) va_select_fau_page(I)) << 57;

   if (info.slot)
      hex |= ((uint64_t) I->slot << 30);

   if (info.sr_count) {
      bool read = bi_opcode_props[I->op].sr_read;
      bi_index sr = read ? I->src[0] : I->dest[0];

      unsigned count = read ?
         bi_count_read_registers(I, 0) :
         bi_count_write_registers(I, 0);

      hex |= ((uint64_t) count << 33);
      hex |= (uint64_t) va_pack_reg(I, sr) << 40;
      hex |= ((uint64_t) info.sr_control << 46);
   }

   if (info.sr_write_count) {
      hex |= ((uint64_t) bi_count_write_registers(I, 0) - 1) << 36;
      hex |= ((uint64_t) va_pack_reg(I, I->dest[0])) << 16;
   }

   if (info.vecsize)
      hex |= ((uint64_t) I->vecsize << 28);

   if (info.register_format)
      hex |= ((uint64_t) va_pack_register_format(I)) << 24;

   switch (I->op) {
   case BI_OPCODE_LOAD_I8:
   case BI_OPCODE_LOAD_I16:
   case BI_OPCODE_LOAD_I24:
   case BI_OPCODE_LOAD_I32:
   case BI_OPCODE_LOAD_I48:
   case BI_OPCODE_LOAD_I64:
   case BI_OPCODE_LOAD_I96:
   case BI_OPCODE_LOAD_I128:
      hex |= va_pack_load(I, false);
      break;

   case BI_OPCODE_LD_BUFFER_I8:
   case BI_OPCODE_LD_BUFFER_I16:
   case BI_OPCODE_LD_BUFFER_I24:
   case BI_OPCODE_LD_BUFFER_I32:
   case BI_OPCODE_LD_BUFFER_I48:
   case BI_OPCODE_LD_BUFFER_I64:
   case BI_OPCODE_LD_BUFFER_I96:
   case BI_OPCODE_LD_BUFFER_I128:
      hex |= va_pack_load(I, true);
      break;

   case BI_OPCODE_STORE_I8:
   case BI_OPCODE_STORE_I16:
   case BI_OPCODE_STORE_I24:
   case BI_OPCODE_STORE_I32:
   case BI_OPCODE_STORE_I48:
   case BI_OPCODE_STORE_I64:
   case BI_OPCODE_STORE_I96:
   case BI_OPCODE_STORE_I128:
      hex |= va_pack_store(I);
      break;

   case BI_OPCODE_ATOM1_RETURN_I32:
      /* Permit omitting the destination for plain ATOM1 */
      if (!bi_count_write_registers(I, 0)) {
         hex |= (0x40ull << 40); // fake read
      }

      /* 64-bit source */
      va_validate_register_pair(I, 0);
      hex |= (uint64_t) va_pack_src(I, 0) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t) va_pack_atom_opc_1(I)) << 22;
      break;

   case BI_OPCODE_ATOM_I32:
   case BI_OPCODE_ATOM_RETURN_I32:
      /* 64-bit source */
      va_validate_register_pair(I, 1);
      hex |= (uint64_t) va_pack_src(I, 1) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t) va_pack_atom_opc(I)) << 22;

      if (I->op == BI_OPCODE_ATOM_RETURN_I32)
         hex |= (0xc0ull << 40); // flags

      if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
         hex |= (1 << 26); /* .compare */

      break;

   case BI_OPCODE_ST_CVT:
      /* Staging read */
      hex |= va_pack_store(I);

      /* Conversion descriptor */
      hex |= (uint64_t) va_pack_src(I, 3) << 16;
      break;

   case BI_OPCODE_BLEND:
   {
      /* Source 0 - Blend descriptor (64-bit) */
      hex |= ((uint64_t) va_pack_src(I, 2)) << 0;
      va_validate_register_pair(I, 2);

      /* Target */
      if (I->branch_offset & 0x7) invalid_instruction(I, "unaligned branch");
      hex |= ((I->branch_offset >> 3) << 8);

      /* Source 2 - coverage mask */
      hex |= ((uint64_t) va_pack_reg(I, I->src[1])) << 16;

      /* Vector size */
      unsigned vecsize = 4;
      hex |= ((uint64_t) (vecsize - 1) << 28);

      break;
   }

   case BI_OPCODE_TEX_SINGLE:
   case BI_OPCODE_TEX_FETCH:
   case BI_OPCODE_TEX_GATHER:
   {
      /* Image to read from */
      hex |= ((uint64_t) va_pack_src(I, 1)) << 0;

      if (I->op == BI_OPCODE_TEX_FETCH && I->shadow)
         invalid_instruction(I, "TEX_FETCH does not support .shadow");

      if (I->array_enable) hex |= (1ull << 10);
      if (I->texel_offset) hex |= (1ull << 11);
      if (I->shadow) hex |= (1ull << 12);
      if (I->skip) hex |= (1ull << 39);
      if (!bi_is_regfmt_16(I->register_format)) hex |= (1ull << 46);

      if (I->op == BI_OPCODE_TEX_SINGLE)
         hex |= ((uint64_t) va_pack_lod_mode(I)) << 13;

      if (I->op == BI_OPCODE_TEX_GATHER) {
         if (I->integer_coordinates) hex |= (1 << 13);
         hex |= ((uint64_t) I->fetch_component) << 14;
      }

      hex |= (VA_WRITE_MASK_RGBA << 22);
      hex |= ((uint64_t) va_pack_register_type(I)) << 26;
      hex |= ((uint64_t) I->dimension) << 28;

      break;
   }

   default:
      if (!info.exact && I->op != BI_OPCODE_NOP)
         invalid_instruction(I, "opcode");

      hex |= va_pack_alu(I);
      break;
   }

   return hex;
}

static unsigned
va_instructions_in_block(bi_block *block)
{
   unsigned offset = 0;

   bi_foreach_instr_in_block(block, _) {
      offset++;
   }

   return offset;
}

/* Calculate branch_offset from a branch_target for a direct relative branch */

static void
va_lower_branch_target(bi_context *ctx, bi_block *start, bi_instr *I)
{
   /* Precondition: unlowered relative branch */
   bi_block *target = I->branch_target;
   assert(target != NULL);

   /* Signed since we might jump backwards */
   signed offset = 0;

   /* Determine if the target block is strictly greater in source order */
   bool forwards = target->index > start->index;

   if (forwards) {
      /* We have to jump through this block */
      bi_foreach_instr_in_block_from(start, _, I) {
         offset++;
      }

      /* We then need to jump over every following block until the target */
      bi_foreach_block_from(ctx, start, blk) {
         /* End just before the target */
         if (blk == target)
            break;

         /* Count other blocks */
         if (blk != start)
            offset += va_instructions_in_block(blk);
      }
   } else {
      /* Jump through the beginning of this block */
      bi_foreach_instr_in_block_from_rev(start, ins, I) {
         if (ins != I)
            offset--;
      }

      /* Jump over preceding blocks up to and including the target to get to
       * the beginning of the target */
      bi_foreach_block_from_rev(ctx, start, blk) {
         if (blk == start)
            continue;

         offset -= va_instructions_in_block(blk);

         /* End just after the target */
         if (blk == target)
            break;
      }
   }

   /* Offset is relative to the next instruction, so bias */
   offset--;

   /* Update the instruction */
   I->branch_offset = offset;
}

/*
 * Late lowering to insert blend shader calls after BLEND instructions. Required
 * to support blend shaders, so this pass may be omitted if it is known that
 * blend shaders are never used.
 *
 * This lowering runs late because it introduces control flow changes without
 * modifying the control flow graph. It hardcodes registers, meaning running
 * after RA makes sense. Finally, it hardcodes a manually sized instruction
 * sequence, requiring it to run after scheduling.
 *
 * As it is Valhall specific, running it as a pre-pack lowering is sensible.
 */
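/*
 * Sketch of the inserted sequence (two instructions, matching prolog_length):
 * the link register r48 is loaded with the address of the instruction after
 * the prologue (or zero when the BLEND ends the shader), then BRANCHZI jumps
 * to the blend shader. The BLEND instruction's own branch offset is pointed
 * past this prologue so fixed-function blending can skip it.
 */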
static void
va_lower_blend(bi_context *ctx)
{
   /* Link register (ABI between fragment and blend shaders) */
   bi_index lr = bi_register(48);

   /* Program counter for *next* instruction */
   bi_index pc = bi_fau(BIR_FAU_PROGRAM_COUNTER, false);

   bi_foreach_instr_global_safe(ctx, I) {
      if (I->op != BI_OPCODE_BLEND)
         continue;

      bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

      unsigned prolog_length = 2 * 8;

      if (I->flow == VA_FLOW_END)
         bi_iadd_imm_i32_to(&b, lr, va_zero_lut(), 0);
      else
         bi_iadd_imm_i32_to(&b, lr, pc, prolog_length - 8);

      bi_branchzi(&b, va_zero_lut(), I->src[3], BI_CMPF_EQ);

      /* For fixed function: skip the prologue, or return */
      if (I->flow != VA_FLOW_END)
         I->branch_offset = prolog_length;
   }
}

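/*
 * Pack an entire program: validate the IR, run the pre-pack lowerings, emit
 * each instruction as a 64-bit word, and pad out the binary.
 */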
void
bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission)
{
   unsigned orig_size = emission->size;

   va_validate(stderr, ctx);

   /* Late lowering */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
      va_lower_blend(ctx);

   bi_foreach_block(ctx, block) {
      bi_foreach_instr_in_block(block, I) {
         if (I->op == BI_OPCODE_BRANCHZ_I16)
            va_lower_branch_target(ctx, block, I);

         uint64_t hex = va_pack_instr(I);
         util_dynarray_append(emission, uint64_t, hex);
      }
   }

   /* Pad with zeroes, but keep empty programs empty so they may be omitted
    * altogether. Failing to do this would result in a program containing only
    * zeroes, which is invalid and will raise an encoding fault.
    *
    * Pad an extra 16 bytes (one instruction) to separate primary and secondary
    * shader disassemblies. This is not strictly necessary, but it is good
    * practice. 128 bytes is the optimal program alignment on Trym, so pad
    * secondary shaders up to 128 bytes. This may help the instruction cache.
    */
   if (orig_size != emission->size) {
      unsigned aligned = ALIGN_POT(emission->size + 16, 128);
      unsigned count = aligned - emission->size;

      memset(util_dynarray_grow(emission, uint8_t, count), 0, count);
   }
}