/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "va_compiler.h"
#include "valhall.h"
#include "valhall_enums.h"
#include "bi_builder.h"

/* This file contains the final passes of the compiler. Running after
 * scheduling and RA, the IR is finalized, so we need to emit it as actual
 * bits on the wire (as well as fix up branches).
 */

/*
 * Unreachable for encoding failures, when hitting an invalid instruction.
 * Prints the (first) failing instruction to aid debugging.
 */
NORETURN static void PRINTFLIKE(2, 3)
invalid_instruction(const bi_instr *I, const char *cause, ...)
{
   fputs("\nInvalid ", stderr);

   va_list ap;
   va_start(ap, cause);
   vfprintf(stderr, cause, ap);
   va_end(ap);

   fputs(":\n\t", stderr);
   bi_print_instr(I, stderr);
   fprintf(stderr, "\n");

   unreachable("Invalid instruction");
}

/*
 * Like assert, but prints the instruction if the assertion fails, to aid
 * debugging invalid inputs to the packing module. Wrapped in do/while so it
 * behaves as a single statement in any context.
 */
#define pack_assert(I, cond) \
   do { if (!(cond)) invalid_instruction(I, "invariant " #cond); } while (0)
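
/* For instance, a packer can guard an encoding invariant like so (a usage
 * sketch; va_pack_reg below does exactly this for register indices):
 *
 *    pack_assert(I, idx.value < 64);
 *
 * On failure this prints "Invalid invariant idx.value < 64" together with the
 * offending instruction before hitting unreachable().
 */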

/*
 * Validate that two adjacent 32-bit sources form an aligned 64-bit register
 * pair. This is a compiler invariant, required on Valhall but not on Bifrost.
 */
static void
va_validate_register_pair(const bi_instr *I, unsigned s)
{
   ASSERTED bi_index lo = I->src[s], hi = I->src[s + 1];

   pack_assert(I, lo.type == hi.type);

   if (lo.type == BI_INDEX_REGISTER) {
      pack_assert(I, hi.value & 1);
      pack_assert(I, hi.value == lo.value + 1);
   } else if (lo.type == BI_INDEX_FAU && lo.value & BIR_FAU_IMMEDIATE) {
      /* Small constants are zero extended, so the top word encodes zero */
      pack_assert(I, hi.value == (BIR_FAU_IMMEDIATE | 0));
   } else {
      pack_assert(I, hi.offset & 1);
      pack_assert(I, hi.offset == lo.offset + 1);
   }
}
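
/* Concretely: the register pair (r4, r5) validates, since r5 is odd and
 * immediately follows r4, whereas (r5, r6) or (r4, r6) would trip the asserts
 * above. The same even/odd adjacency check applies to the word offsets of
 * non-immediate FAU pairs.
 */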

static unsigned
va_pack_reg(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_REGISTER);
   pack_assert(I, idx.value < 64);

   return idx.value;
}

static unsigned
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
{
   switch (fau) {
   case BIR_FAU_ATEST_PARAM:      return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM;
   case BIR_FAU_TLS_PTR:          return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER;
   case BIR_FAU_WLS_PTR:          return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER;
   case BIR_FAU_LANE_ID:          return VA_FAU_SPECIAL_PAGE_3_LANE_ID;
   case BIR_FAU_PROGRAM_COUNTER:  return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER;
   case BIR_FAU_SAMPLE_POS_ARRAY: return VA_FAU_SPECIAL_PAGE_0_SAMPLE;

   case BIR_FAU_BLEND_0 ... (BIR_FAU_BLEND_0 + 7):
      return VA_FAU_SPECIAL_PAGE_0_BLEND_DESCRIPTOR_0 + (fau - BIR_FAU_BLEND_0);

   default:
      invalid_instruction(I, "FAU");
   }
}

/*
 * Encode a 64-bit FAU source. The offset is ignored, so this function can be
 * used to encode a 32-bit FAU source by or'ing in the appropriate offset.
 */
static unsigned
va_pack_fau_64(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_FAU);

   unsigned val = (idx.value & BITFIELD_MASK(5));

   if (idx.value & BIR_FAU_IMMEDIATE)
      return (0x3 << 6) | (val << 1);
   else if (idx.value & BIR_FAU_UNIFORM)
      return (0x2 << 6) | (val << 1);
   else
      return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
}
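
/* Reading off the code above, the resulting 8-bit FAU source byte is
 * (a summary for reference, not authoritative documentation):
 *
 *    immediate: 1 1 v v v v v o    (low 5 bits of the LUT index)
 *    uniform:   1 0 v v v v v o    (uniform slot)
 *    special:   1 1 1 s s s s o    (va_pack_fau_special index)
 *
 * where 'o' is the 32-bit word select that va_pack_src below ORs in.
 */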

static unsigned
va_pack_src(const bi_instr *I, unsigned s)
{
   bi_index idx = I->src[s];

   if (idx.type == BI_INDEX_REGISTER) {
      unsigned value = va_pack_reg(I, idx);
      if (idx.discard) value |= (1 << 6);
      return value;
   } else if (idx.type == BI_INDEX_FAU) {
      pack_assert(I, idx.offset <= 1);
      return va_pack_fau_64(I, idx) | idx.offset;
   }

   invalid_instruction(I, "type of source %u", s);
}

static unsigned
va_pack_wrmask(const bi_instr *I)
{
   switch (I->dest[0].swizzle) {
   case BI_SWIZZLE_H00: return 0x1;
   case BI_SWIZZLE_H11: return 0x2;
   case BI_SWIZZLE_H01: return 0x3;
   default: invalid_instruction(I, "write mask");
   }
}

static enum va_atomic_operation
va_pack_atom_opc(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AADD:  return VA_ATOMIC_OPERATION_AADD;
   case BI_ATOM_OPC_ASMIN: return VA_ATOMIC_OPERATION_ASMIN;
   case BI_ATOM_OPC_ASMAX: return VA_ATOMIC_OPERATION_ASMAX;
   case BI_ATOM_OPC_AUMIN: return VA_ATOMIC_OPERATION_AUMIN;
   case BI_ATOM_OPC_AUMAX: return VA_ATOMIC_OPERATION_AUMAX;
   case BI_ATOM_OPC_AAND:  return VA_ATOMIC_OPERATION_AAND;
   case BI_ATOM_OPC_AOR:   return VA_ATOMIC_OPERATION_AOR;
   case BI_ATOM_OPC_AXOR:  return VA_ATOMIC_OPERATION_AXOR;
   case BI_ATOM_OPC_ACMPXCHG:
   case BI_ATOM_OPC_AXCHG: return VA_ATOMIC_OPERATION_AXCHG;
   default: invalid_instruction(I, "atomic opcode");
   }
}

static enum va_atomic_operation_with_1
va_pack_atom_opc_1(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AINC:   return VA_ATOMIC_OPERATION_WITH_1_AINC;
   case BI_ATOM_OPC_ADEC:   return VA_ATOMIC_OPERATION_WITH_1_ADEC;
   case BI_ATOM_OPC_AUMAX1: return VA_ATOMIC_OPERATION_WITH_1_AUMAX1;
   case BI_ATOM_OPC_ASMAX1: return VA_ATOMIC_OPERATION_WITH_1_ASMAX1;
   case BI_ATOM_OPC_AOR1:   return VA_ATOMIC_OPERATION_WITH_1_AOR1;
   default: invalid_instruction(I, "atomic opcode with implicit 1");
   }
}

static unsigned
va_pack_dest(const bi_instr *I)
{
   return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
}
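
/* Worked example from the helpers above: a full 32-bit write to r10 uses
 * swizzle H01 (write mask 0x3), so the destination byte packs as
 * (0x3 << 6) | 10 = 0xCA.
 */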

static enum va_widen
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01: return VA_WIDEN_NONE;
   case BI_SWIZZLE_H00: return VA_WIDEN_H0;
   case BI_SWIZZLE_H11: return VA_WIDEN_H1;
   default: invalid_instruction(I, "widen");
   }
}

static enum va_swizzles_16_bit
va_pack_swizzle_f16(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H00: return VA_SWIZZLES_16_BIT_H00;
   case BI_SWIZZLE_H10: return VA_SWIZZLES_16_BIT_H10;
   case BI_SWIZZLE_H01: return VA_SWIZZLES_16_BIT_H01;
   case BI_SWIZZLE_H11: return VA_SWIZZLES_16_BIT_H11;
   default: invalid_instruction(I, "16-bit swizzle");
   }
}

static unsigned
va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
{
   if (size == VA_SIZE_8) {
      switch (swz) {
      case BI_SWIZZLE_H01:   return VA_SWIZZLES_8_BIT_B0123;
      case BI_SWIZZLE_H00:   return VA_SWIZZLES_8_BIT_B0101;
      case BI_SWIZZLE_H11:   return VA_SWIZZLES_8_BIT_B2323;
      case BI_SWIZZLE_B0000: return VA_SWIZZLES_8_BIT_B0000;
      case BI_SWIZZLE_B1111: return VA_SWIZZLES_8_BIT_B1111;
      case BI_SWIZZLE_B2222: return VA_SWIZZLES_8_BIT_B2222;
      case BI_SWIZZLE_B3333: return VA_SWIZZLES_8_BIT_B3333;
      default: invalid_instruction(I, "8-bit widen");
      }
   } else if (size == VA_SIZE_16) {
      switch (swz) {
      case BI_SWIZZLE_H00:   return VA_SWIZZLES_16_BIT_H00;
      case BI_SWIZZLE_H10:   return VA_SWIZZLES_16_BIT_H10;
      case BI_SWIZZLE_H01:   return VA_SWIZZLES_16_BIT_H01;
      case BI_SWIZZLE_H11:   return VA_SWIZZLES_16_BIT_H11;
      case BI_SWIZZLE_B0000: return VA_SWIZZLES_16_BIT_B00;
      case BI_SWIZZLE_B1111: return VA_SWIZZLES_16_BIT_B11;
      case BI_SWIZZLE_B2222: return VA_SWIZZLES_16_BIT_B22;
      case BI_SWIZZLE_B3333: return VA_SWIZZLES_16_BIT_B33;
      default: invalid_instruction(I, "16-bit widen");
      }
   } else if (size == VA_SIZE_32) {
      switch (swz) {
      case BI_SWIZZLE_H01:   return VA_SWIZZLES_32_BIT_NONE;
      case BI_SWIZZLE_H00:   return VA_SWIZZLES_32_BIT_H0;
      case BI_SWIZZLE_H11:   return VA_SWIZZLES_32_BIT_H1;
      case BI_SWIZZLE_B0000: return VA_SWIZZLES_32_BIT_B0;
      case BI_SWIZZLE_B1111: return VA_SWIZZLES_32_BIT_B1;
      case BI_SWIZZLE_B2222: return VA_SWIZZLES_32_BIT_B2;
      case BI_SWIZZLE_B3333: return VA_SWIZZLES_32_BIT_B3;
      default: invalid_instruction(I, "32-bit widen");
      }
   } else {
      invalid_instruction(I, "type size for widen");
   }
}

static enum va_half_swizzles_8_bit
va_pack_halfswizzle(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_B0000: return VA_HALF_SWIZZLES_8_BIT_B00;
   case BI_SWIZZLE_B1111: return VA_HALF_SWIZZLES_8_BIT_B11;
   case BI_SWIZZLE_B2222: return VA_HALF_SWIZZLES_8_BIT_B22;
   case BI_SWIZZLE_B3333: return VA_HALF_SWIZZLES_8_BIT_B33;
   case BI_SWIZZLE_B0011: return VA_HALF_SWIZZLES_8_BIT_B01;
   case BI_SWIZZLE_B2233: return VA_HALF_SWIZZLES_8_BIT_B23;
   case BI_SWIZZLE_B0022: return VA_HALF_SWIZZLES_8_BIT_B02;
   default: invalid_instruction(I, "v2u8 swizzle");
   }
}

static enum va_lanes_8_bit
va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:   return VA_LANES_8_BIT_B02;
   case BI_SWIZZLE_B0000: return VA_LANES_8_BIT_B00;
   case BI_SWIZZLE_B1111: return VA_LANES_8_BIT_B11;
   case BI_SWIZZLE_B2222: return VA_LANES_8_BIT_B22;
   case BI_SWIZZLE_B3333: return VA_LANES_8_BIT_B33;
   default: invalid_instruction(I, "lane shift");
   }
}

static enum va_combine
va_pack_combine(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01: return VA_COMBINE_NONE;
   case BI_SWIZZLE_H00: return VA_COMBINE_H0;
   case BI_SWIZZLE_H11: return VA_COMBINE_H1;
   default: invalid_instruction(I, "branch lane");
   }
}

static enum va_source_format
va_pack_source_format(const bi_instr *I)
{
   switch (I->source_format) {
   case BI_SOURCE_FORMAT_FLAT32: return VA_SOURCE_FORMAT_SRC_FLAT32;
   case BI_SOURCE_FORMAT_FLAT16: return VA_SOURCE_FORMAT_SRC_FLAT16;
   case BI_SOURCE_FORMAT_F32:    return VA_SOURCE_FORMAT_SRC_F32;
   case BI_SOURCE_FORMAT_F16:    return VA_SOURCE_FORMAT_SRC_F16;
   }

   invalid_instruction(I, "source format");
}

static uint64_t
va_pack_alu(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];
   uint64_t hex = 0;

   switch (I->op) {
   /* Add FREXP flags */
   case BI_OPCODE_FREXPE_F32:
   case BI_OPCODE_FREXPE_V2F16:
   case BI_OPCODE_FREXPM_F32:
   case BI_OPCODE_FREXPM_V2F16:
      if (I->sqrt) hex |= 1ull << 24;
      if (I->log) hex |= 1ull << 25;
      break;

   /* Add mux type */
   case BI_OPCODE_MUX_I32:
   case BI_OPCODE_MUX_V2I16:
   case BI_OPCODE_MUX_V4I8:
      hex |= (uint64_t) I->mux << 32;
      break;

   /* Add .eq flag */
   case BI_OPCODE_BRANCHZ_I16:
   case BI_OPCODE_BRANCHZI:
      pack_assert(I, I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE);

      if (I->cmpf == BI_CMPF_EQ) hex |= (1ull << 36);

      if (I->op == BI_OPCODE_BRANCHZI)
         hex |= (0x1ull << 40); /* Absolute */
      else
         hex |= ((uint64_t) I->branch_offset & BITFIELD_MASK(27)) << 8;

      break;

   /* Add arithmetic flag */
   case BI_OPCODE_RSHIFT_AND_I32:
   case BI_OPCODE_RSHIFT_AND_V2I16:
   case BI_OPCODE_RSHIFT_AND_V4I8:
   case BI_OPCODE_RSHIFT_OR_I32:
   case BI_OPCODE_RSHIFT_OR_V2I16:
   case BI_OPCODE_RSHIFT_OR_V4I8:
   case BI_OPCODE_RSHIFT_XOR_I32:
   case BI_OPCODE_RSHIFT_XOR_V2I16:
   case BI_OPCODE_RSHIFT_XOR_V4I8:
      hex |= (uint64_t) I->arithmetic << 34;
      break;

   case BI_OPCODE_LEA_BUF_IMM:
      /* Buffer table index */
      hex |= 0xD << 8;
      break;

   case BI_OPCODE_LEA_ATTR_IMM:
      hex |= ((uint64_t) I->table) << 16;
      hex |= ((uint64_t) I->attribute_index) << 20;
      break;

   case BI_OPCODE_IADD_IMM_I32:
   case BI_OPCODE_IADD_IMM_V2I16:
   case BI_OPCODE_IADD_IMM_V4I8:
   case BI_OPCODE_FADD_IMM_F32:
   case BI_OPCODE_FADD_IMM_V2F16:
      hex |= ((uint64_t) I->index) << 8;
      break;

   case BI_OPCODE_CLPER_I32:
      hex |= ((uint64_t) I->inactive_result) << 22;
      hex |= ((uint64_t) I->lane_op) << 32;
      hex |= ((uint64_t) I->subgroup) << 36;
      break;

   case BI_OPCODE_LD_VAR:
   case BI_OPCODE_LD_VAR_FLAT:
   case BI_OPCODE_LD_VAR_IMM:
   case BI_OPCODE_LD_VAR_FLAT_IMM:
   case BI_OPCODE_LD_VAR_BUF_F16:
   case BI_OPCODE_LD_VAR_BUF_F32:
   case BI_OPCODE_LD_VAR_BUF_IMM_F16:
   case BI_OPCODE_LD_VAR_BUF_IMM_F32:
   case BI_OPCODE_LD_VAR_SPECIAL:
      if (I->op == BI_OPCODE_LD_VAR_SPECIAL)
         hex |= ((uint64_t) I->varying_name) << 12; /* instead of index */
      else if (I->op == BI_OPCODE_LD_VAR_BUF_IMM_F16 ||
               I->op == BI_OPCODE_LD_VAR_BUF_IMM_F32) {
         hex |= ((uint64_t) I->index) << 16;
      } else if (I->op == BI_OPCODE_LD_VAR_IMM ||
                 I->op == BI_OPCODE_LD_VAR_FLAT_IMM) {
         hex |= ((uint64_t) I->table) << 8;
         hex |= ((uint64_t) I->index) << 12;
      }

      hex |= ((uint64_t) va_pack_source_format(I)) << 24;
      hex |= ((uint64_t) I->update) << 36;
      hex |= ((uint64_t) I->sample) << 38;
      break;

   case BI_OPCODE_LD_ATTR_IMM:
      hex |= ((uint64_t) I->table) << 16;
      hex |= ((uint64_t) I->attribute_index) << 20;
      break;

   case BI_OPCODE_LD_TEX_IMM:
   case BI_OPCODE_LEA_TEX_IMM:
      hex |= ((uint64_t) I->table) << 16;
      hex |= ((uint64_t) I->texture_index) << 20;
      break;

   case BI_OPCODE_ZS_EMIT:
      if (I->stencil) hex |= (1 << 24);
      if (I->z) hex |= (1 << 25);
      break;

   default:
      break;
   }

   /* FMA_RSCALE.f32 special modes treated as extra opcodes */
   if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
      pack_assert(I, I->special < 4);
      hex |= ((uint64_t) I->special) << 48;
   }

   /* Add the normal destination or a placeholder. Staging destinations are
    * added elsewhere, as they require special handling for control fields.
    */
   if (info.has_dest && info.nr_staging_dests == 0) {
      hex |= (uint64_t) va_pack_dest(I) << 40;
   } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
      pack_assert(I, bi_is_null(I->dest[0]));
      hex |= 0xC0ull << 40; /* Placeholder */
   }

   bool swap12 = va_swap_12(I->op);

   /* First src is staging if we read, skip it when packing sources */
   unsigned src_offset = bi_opcode_props[I->op].sr_read ? 1 : 0;

   for (unsigned i = 0; i < info.nr_srcs; ++i) {
      unsigned logical_i = (swap12 && i == 1) ? 2 : (swap12 && i == 2) ? 1 : i;

      struct va_src_info src_info = info.srcs[i];
      enum va_size size = src_info.size;

      bi_index src = I->src[logical_i + src_offset];
      hex |= (uint64_t) va_pack_src(I, logical_i + src_offset) << (8 * i);

      if (src_info.notted) {
         if (src.neg) hex |= (1ull << 35);
      } else if (src_info.absneg) {
         unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
         unsigned abs_offs = 33 + 2 + ((2 - i) * 2);

         if (src.neg) hex |= 1ull << neg_offs;
         if (src.abs) hex |= 1ull << abs_offs;
      } else {
         if (src.neg) invalid_instruction(I, "negate");
         if (src.abs) invalid_instruction(I, "absolute value");
      }

      if (src_info.swizzle) {
         unsigned offs = 24 + ((2 - i) * 2);
         unsigned S = src.swizzle;
         pack_assert(I, size == VA_SIZE_16 || size == VA_SIZE_32);

         uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S)
                                          : va_pack_swizzle_f16(I, S));
         hex |= v << offs;
      } else if (src_info.widen) {
         unsigned offs = (i == 1) ? 26 : 36;
         hex |= (uint64_t) va_pack_widen(I, src.swizzle, src_info.size) << offs;
      } else if (src_info.lane) {
         unsigned offs = (I->op == BI_OPCODE_MKVEC_V2I8) ?
                         ((i == 0) ? 38 : 36) :
                         28;

         if (src_info.size == VA_SIZE_16) {
            hex |= (src.swizzle == BI_SWIZZLE_H11 ? 1 : 0) << offs;
         } else if (I->op == BI_OPCODE_BRANCHZ_I16) {
            hex |= ((uint64_t) va_pack_combine(I, src.swizzle) << 37);
         } else {
            pack_assert(I, src_info.size == VA_SIZE_8);
            unsigned comp = src.swizzle - BI_SWIZZLE_B0000;
            pack_assert(I, comp < 4);
            hex |= (uint64_t) comp << offs;
         }
      } else if (src_info.lanes) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 1);
         hex |= (uint64_t) va_pack_shift_lanes(I, src.swizzle) << 26;
      } else if (src_info.combine) {
         /* Treat as swizzle, subgroup ops not yet supported */
         pack_assert(I, src_info.size == VA_SIZE_32);
         pack_assert(I, i == 0);
         hex |= (uint64_t) va_pack_widen_f32(I, src.swizzle) << 37;
      } else if (src.swizzle != BI_SWIZZLE_H01) {
         invalid_instruction(I, "swizzle");
      }
   }

   if (info.clamp) hex |= (uint64_t) I->clamp << 32;
   if (info.round_mode) hex |= (uint64_t) I->round << 30;
   if (info.condition) hex |= (uint64_t) I->cmpf << 32;
   if (info.result_type) hex |= (uint64_t) I->result_type << 30;

   return hex;
}

static uint64_t
va_pack_byte_offset(const bi_instr *I)
{
   int16_t offset = I->byte_offset;
   if (offset != I->byte_offset) invalid_instruction(I, "byte offset");

   uint16_t offset_as_u16 = offset;
   return ((uint64_t) offset_as_u16) << 8;
}

static uint64_t
va_pack_byte_offset_8(const bi_instr *I)
{
   uint8_t offset = I->byte_offset;
   if (offset != I->byte_offset) invalid_instruction(I, "byte offset");

   return ((uint64_t) offset) << 8;
}
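
/* Both helpers above use the same range-check idiom: narrow the offset, then
 * compare against the original; any value that does not round-trip is out of
 * range for the encoding. For example, a byte_offset of 42 packs in either
 * width, while 300 survives the int16_t path but truncates to 44 as a
 * uint8_t, so va_pack_byte_offset_8 rejects it.
 */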

static uint64_t
va_pack_load(const bi_instr *I, bool buffer_descriptor)
{
   const uint8_t load_lane_identity[8] = {
      VA_LOAD_LANE_8_BIT_B0,
      VA_LOAD_LANE_16_BIT_H0,
      VA_LOAD_LANE_24_BIT_IDENTITY,
      VA_LOAD_LANE_32_BIT_W0,
      VA_LOAD_LANE_48_BIT_IDENTITY,
      VA_LOAD_LANE_64_BIT_IDENTITY,
      VA_LOAD_LANE_96_BIT_IDENTITY,
      VA_LOAD_LANE_128_BIT_IDENTITY,
   };

   unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
   uint64_t hex = (uint64_t) load_lane_identity[memory_size] << 36;

   /* Unsigned extension */
   hex |= (1ull << 39);

   if (!buffer_descriptor)
      hex |= va_pack_byte_offset(I);

   hex |= (uint64_t) va_pack_src(I, 0) << 0;

   if (buffer_descriptor)
      hex |= (uint64_t) va_pack_src(I, 1) << 8;

   return hex;
}

static uint64_t
va_pack_memory_access(const bi_instr *I)
{
   switch (I->seg) {
   case BI_SEG_TL:   return VA_MEMORY_ACCESS_FORCE;
   case BI_SEG_POS:  return VA_MEMORY_ACCESS_ISTREAM;
   case BI_SEG_VARY: return VA_MEMORY_ACCESS_ESTREAM;
   default:          return VA_MEMORY_ACCESS_NONE;
   }
}

static uint64_t
va_pack_store(const bi_instr *I)
{
   uint64_t hex = va_pack_memory_access(I) << 24;

   va_validate_register_pair(I, 1);
   hex |= (uint64_t) va_pack_src(I, 1) << 0;

   hex |= va_pack_byte_offset(I);

   return hex;
}

static enum va_lod_mode
va_pack_lod_mode(const bi_instr *I)
{
   switch (I->va_lod_mode) {
   case BI_VA_LOD_MODE_ZERO_LOD:      return VA_LOD_MODE_ZERO;
   case BI_VA_LOD_MODE_COMPUTED_LOD:  return VA_LOD_MODE_COMPUTED;
   case BI_VA_LOD_MODE_EXPLICIT:      return VA_LOD_MODE_EXPLICIT;
   case BI_VA_LOD_MODE_COMPUTED_BIAS: return VA_LOD_MODE_COMPUTED_BIAS;
   case BI_VA_LOD_MODE_GRDESC:        return VA_LOD_MODE_GRDESC;
   }

   invalid_instruction(I, "LOD mode");
}

static enum va_register_type
va_pack_register_type(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_F16:
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_TYPE_F;

   case BI_REGISTER_FORMAT_U16:
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_TYPE_U;

   case BI_REGISTER_FORMAT_S16:
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_TYPE_S;

   default:
      invalid_instruction(I, "register type");
   }
}

static enum va_register_format
va_pack_register_format(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_AUTO: return VA_REGISTER_FORMAT_AUTO;
   case BI_REGISTER_FORMAT_F32:  return VA_REGISTER_FORMAT_F32;
   case BI_REGISTER_FORMAT_F16:  return VA_REGISTER_FORMAT_F16;
   case BI_REGISTER_FORMAT_S32:  return VA_REGISTER_FORMAT_S32;
   case BI_REGISTER_FORMAT_S16:  return VA_REGISTER_FORMAT_S16;
   case BI_REGISTER_FORMAT_U32:  return VA_REGISTER_FORMAT_U32;
   case BI_REGISTER_FORMAT_U16:  return VA_REGISTER_FORMAT_U16;
   default: invalid_instruction(I, "register format");
   }
}

uint64_t
va_pack_instr(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];

   uint64_t hex = info.exact | (((uint64_t) I->flow) << 59);
   hex |= ((uint64_t) va_select_fau_page(I)) << 57;

   if (info.slot)
      hex |= ((uint64_t) I->slot << 30);

   if (info.sr_count) {
      bool read = bi_opcode_props[I->op].sr_read;
      bi_index sr = read ? I->src[0] : I->dest[0];

      unsigned count = read ?
         bi_count_read_registers(I, 0) :
         bi_count_write_registers(I, 0);

      hex |= ((uint64_t) count << 33);
      hex |= (uint64_t) va_pack_reg(I, sr) << 40;
      hex |= ((uint64_t) info.sr_control << 46);
   }

   if (info.sr_write_count) {
      hex |= ((uint64_t) bi_count_write_registers(I, 0) - 1) << 36;
      hex |= ((uint64_t) va_pack_reg(I, I->dest[0])) << 16;
   }

   if (info.vecsize)
      hex |= ((uint64_t) I->vecsize << 28);

   if (info.register_format)
      hex |= ((uint64_t) va_pack_register_format(I)) << 24;

   switch (I->op) {
   case BI_OPCODE_LOAD_I8:
   case BI_OPCODE_LOAD_I16:
   case BI_OPCODE_LOAD_I24:
   case BI_OPCODE_LOAD_I32:
   case BI_OPCODE_LOAD_I48:
   case BI_OPCODE_LOAD_I64:
   case BI_OPCODE_LOAD_I96:
   case BI_OPCODE_LOAD_I128:
      hex |= va_pack_load(I, false);
      break;

   case BI_OPCODE_LD_BUFFER_I8:
   case BI_OPCODE_LD_BUFFER_I16:
   case BI_OPCODE_LD_BUFFER_I24:
   case BI_OPCODE_LD_BUFFER_I32:
   case BI_OPCODE_LD_BUFFER_I48:
   case BI_OPCODE_LD_BUFFER_I64:
   case BI_OPCODE_LD_BUFFER_I96:
   case BI_OPCODE_LD_BUFFER_I128:
      hex |= va_pack_load(I, true);
      break;

   case BI_OPCODE_STORE_I8:
   case BI_OPCODE_STORE_I16:
   case BI_OPCODE_STORE_I24:
   case BI_OPCODE_STORE_I32:
   case BI_OPCODE_STORE_I48:
   case BI_OPCODE_STORE_I64:
   case BI_OPCODE_STORE_I96:
   case BI_OPCODE_STORE_I128:
      hex |= va_pack_store(I);
      break;

   case BI_OPCODE_ATOM1_RETURN_I32:
      /* Permit omitting the destination for plain ATOM1 */
      if (!bi_count_write_registers(I, 0)) {
         hex |= (0x40ull << 40); /* Fake read */
      }

      /* 64-bit source */
      va_validate_register_pair(I, 0);
      hex |= (uint64_t) va_pack_src(I, 0) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t) va_pack_atom_opc_1(I)) << 22;
      break;

   case BI_OPCODE_ATOM_I32:
   case BI_OPCODE_ATOM_RETURN_I32:
      /* 64-bit source */
      va_validate_register_pair(I, 1);
      hex |= (uint64_t) va_pack_src(I, 1) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t) va_pack_atom_opc(I)) << 22;

      if (I->op == BI_OPCODE_ATOM_RETURN_I32)
         hex |= (0xc0ull << 40); /* Flags */

      if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
         hex |= (1 << 26); /* .compare */

      break;

   case BI_OPCODE_ST_CVT:
      /* Staging read */
      hex |= va_pack_store(I);

      /* Conversion descriptor */
      hex |= (uint64_t) va_pack_src(I, 3) << 16;
      break;

   case BI_OPCODE_BLEND:
   {
      /* Source 0 - Blend descriptor (64-bit) */
      hex |= ((uint64_t) va_pack_src(I, 2)) << 0;
      va_validate_register_pair(I, 2);

      /* Target */
      if (I->branch_offset & 0x7) invalid_instruction(I, "unaligned branch");
      hex |= ((I->branch_offset >> 3) << 8);

      /* Source 2 - coverage mask */
      hex |= ((uint64_t) va_pack_reg(I, I->src[1])) << 16;

      /* Vector size */
      unsigned vecsize = 4;
      hex |= ((uint64_t) (vecsize - 1) << 28);

      break;
   }

   case BI_OPCODE_TEX_SINGLE:
   case BI_OPCODE_TEX_FETCH:
   case BI_OPCODE_TEX_GATHER:
   {
      /* Image to read from */
      hex |= ((uint64_t) va_pack_src(I, 1)) << 0;

      if (I->op == BI_OPCODE_TEX_FETCH && I->shadow)
         invalid_instruction(I, "TEX_FETCH does not support .shadow");

      if (I->array_enable) hex |= (1ull << 10);
      if (I->texel_offset) hex |= (1ull << 11);
      if (I->shadow) hex |= (1ull << 12);
      if (I->skip) hex |= (1ull << 39);
      if (!bi_is_regfmt_16(I->register_format)) hex |= (1ull << 46);

      if (I->op == BI_OPCODE_TEX_SINGLE)
         hex |= ((uint64_t) va_pack_lod_mode(I)) << 13;

      if (I->op == BI_OPCODE_TEX_GATHER) {
         if (I->integer_coordinates) hex |= (1 << 13);
         hex |= ((uint64_t) I->fetch_component) << 14;
      }

      hex |= (VA_WRITE_MASK_RGBA << 22);
      hex |= ((uint64_t) va_pack_register_type(I)) << 26;
      hex |= ((uint64_t) I->dimension) << 28;

      break;
   }

   default:
      if (!info.exact && I->op != BI_OPCODE_NOP)
         invalid_instruction(I, "opcode");

      hex |= va_pack_alu(I);
      break;
   }

   return hex;
}
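
/* Reading off the first lines of va_pack_instr above, for reference: every
 * packed word layers the flow-control field (shifted to bit 59) and the FAU
 * page selector (bits 57-58) over the exact opcode bits from the opcode
 * table, before any per-opcode fields are ORed in.
 */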

static unsigned
va_instructions_in_block(bi_block *block)
{
   unsigned offset = 0;

   bi_foreach_instr_in_block(block, _) {
      offset++;
   }

   return offset;
}

/* Calculate branch_offset from a branch_target for a direct relative branch */

static void
va_lower_branch_target(bi_context *ctx, bi_block *start, bi_instr *I)
{
   /* Precondition: unlowered relative branch */
   bi_block *target = I->branch_target;
   assert(target != NULL);

   /* Signed since we might jump backwards */
   signed offset = 0;

   /* Determine if the target block is strictly greater in source order */
   bool forwards = target->index > start->index;

   if (forwards) {
      /* We have to jump through this block */
      bi_foreach_instr_in_block_from(start, _, I) {
         offset++;
      }

      /* We then need to jump over every following block until the target */
      bi_foreach_block_from(ctx, start, blk) {
         /* End just before the target */
         if (blk == target)
            break;

         /* Count other blocks */
         if (blk != start)
            offset += va_instructions_in_block(blk);
      }
   } else {
      /* Jump through the beginning of this block */
      bi_foreach_instr_in_block_from_rev(start, ins, I) {
         if (ins != I)
            offset--;
      }

      /* Jump over preceding blocks up to and including the target to get to
       * the beginning of the target */
      bi_foreach_block_from_rev(ctx, start, blk) {
         if (blk == start)
            continue;

         offset -= va_instructions_in_block(blk);

         /* End just after the target */
         if (blk == target)
            break;
      }
   }

   /* Offset is relative to the next instruction, so bias */
   offset--;

   /* Update the instruction */
   I->branch_offset = offset;
}
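
/* A worked example with a hypothetical layout: if the branch is the 2nd of 4
 * instructions in its block and targets the block after next, with a 3
 * instruction block in between, the forwards path counts the branch plus the
 * 2 instructions after it (3), adds the 3 skipped instructions (6), then
 * biases by -1, giving branch_offset = 5: exactly the number of instructions
 * between the next instruction and the target.
 */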

/*
 * Late lowering to insert blend shader calls after BLEND instructions. Required
 * to support blend shaders, so this pass may be omitted if it is known that
 * blend shaders are never used.
 *
 * This lowering runs late because it introduces control flow changes without
 * modifying the control flow graph. It hardcodes registers, meaning running
 * after RA makes sense. Finally, it hardcodes a manually sized instruction
 * sequence, requiring it to run after scheduling.
 *
 * As it is Valhall specific, running it as a pre-pack lowering is sensible.
 */
static void
va_lower_blend(bi_context *ctx)
{
   /* Link register (ABI between fragment and blend shaders) */
   bi_index lr = bi_register(48);

   /* Program counter for *next* instruction */
   bi_index pc = bi_fau(BIR_FAU_PROGRAM_COUNTER, false);

   bi_foreach_instr_global_safe(ctx, I) {
      if (I->op != BI_OPCODE_BLEND)
         continue;

      bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

      unsigned prolog_length = 2 * 8;

      if (I->flow == VA_FLOW_END)
         bi_iadd_imm_i32_to(&b, lr, va_zero_lut(), 0);
      else
         bi_iadd_imm_i32_to(&b, lr, pc, prolog_length - 8);

      bi_branchzi(&b, va_zero_lut(), I->src[3], BI_CMPF_EQ);

      /* For fixed function: skip the prologue, or return */
      if (I->flow != VA_FLOW_END)
         I->branch_offset = prolog_length;
   }
}
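
/* The sequence emitted after each BLEND is thus two 8-byte instructions
 * (a sketch of the builder calls above, not a verified disassembly):
 *
 *    IADD_IMM.i32 r48, pc, #8    <- link register := address past the branch
 *                                   (or 0 when the fragment shader is ending)
 *    BRANCHZI 0, blend_shader    <- effectively unconditional indirect jump
 *                                   to the blend shader address in src[3]
 *
 * and the BLEND instruction's own target is set to skip this prologue
 * (branch_offset = 16 bytes) when fixed-function blending is used instead.
 */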

void
bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission)
{
   unsigned orig_size = emission->size;

   va_validate(stderr, ctx);

   /* Late lowering */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
      va_lower_blend(ctx);

   bi_foreach_block(ctx, block) {
      bi_foreach_instr_in_block(block, I) {
         if (I->op == BI_OPCODE_BRANCHZ_I16)
            va_lower_branch_target(ctx, block, I);

         uint64_t hex = va_pack_instr(I);
         util_dynarray_append(emission, uint64_t, hex);
      }
   }

   /* Pad with zeroes, but keep empty programs empty so they may be omitted
    * altogether. Failing to do this would result in a program containing only
    * zeroes, which is invalid and will raise an encoding fault.
    *
    * Pad an extra 16 bytes (one instruction) to separate primary and secondary
    * shader disassemblies. This is not strictly necessary, but it's a good
    * practice. 128 bytes is the optimal program alignment on Trym, so pad
    * secondary shaders up to 128 bytes. This may help the instruction cache.
    */
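   /* E.g. a 104-byte program: ALIGN_POT(104 + 16, 128) = 128, so 24 bytes of
    * zero padding are appended; an empty program stays empty and appends
    * nothing.
    */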
   if (orig_size != emission->size) {
      unsigned aligned = ALIGN_POT(emission->size + 16, 128);
      unsigned count = aligned - emission->size;

      memset(util_dynarray_grow(emission, uint8_t, count), 0, count);
   }
}