/*
 * Copyright (C) 2021 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bi_builder.h"
#include "va_compiler.h"
#include "valhall.h"
#include "valhall_enums.h"

/* This file contains the final passes of the compiler. Running after
 * scheduling and RA, the IR is now finalized, so we need to emit it as actual
 * bits on the wire (as well as fix up branches).
 */

/*
 * Unreachable for encoding failures, when hitting an invalid instruction.
 * Prints the (first) failing instruction to aid debugging.
 */
NORETURN static void PRINTFLIKE(2, 3)
invalid_instruction(const bi_instr *I, const char *cause, ...)
{
   fputs("\nInvalid ", stderr);

   va_list ap;
   va_start(ap, cause);
   vfprintf(stderr, cause, ap);
   va_end(ap);

   fputs(":\n\t", stderr);
   bi_print_instr(I, stderr);
   fprintf(stderr, "\n");

   unreachable("Invalid instruction");
}

/*
 * Like assert, but prints the instruction if the assertion fails to aid
 * debugging invalid inputs to the packing module.
 */
#define pack_assert(I, cond)                                                   \
   if (!(cond))                                                                \
      invalid_instruction(I, "invariant " #cond);

/*
 * Validate that two adjacent 32-bit sources form an aligned 64-bit register
 * pair. This is a compiler invariant, required on Valhall but not on Bifrost.
 */
static void
va_validate_register_pair(const bi_instr *I, unsigned s)
{
   ASSERTED bi_index lo = I->src[s], hi = I->src[s + 1];

   pack_assert(I, lo.type == hi.type);

   if (lo.type == BI_INDEX_REGISTER) {
      pack_assert(I, hi.value & 1);
      pack_assert(I, hi.value == lo.value + 1);
   } else if (lo.type == BI_INDEX_FAU && lo.value & BIR_FAU_IMMEDIATE) {
      /* Small constants are zero extended, so the top word encodes zero */
      pack_assert(I, hi.value == (BIR_FAU_IMMEDIATE | 0));
   } else {
      pack_assert(I, hi.offset & 1);
      pack_assert(I, hi.offset == lo.offset + 1);
   }
}

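/* Encode a general-purpose register index. Valhall addresses 64 registers, so
 * the index fits in the low 6 bits of an operand field.
 */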
static unsigned
va_pack_reg(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_REGISTER);
   pack_assert(I, idx.value < 64);

   return idx.value;
}

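/* Map the IR's special FAU (Fast Access Uniform) slots onto the hardware's
 * special FAU pages; per the enum names, blend descriptors occupy eight
 * consecutive slots on page 0.
 */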
static unsigned
va_pack_fau_special(const bi_instr *I, enum bir_fau fau)
{
   switch (fau) {
   case BIR_FAU_ATEST_PARAM:
      return VA_FAU_SPECIAL_PAGE_0_ATEST_DATUM;
   case BIR_FAU_TLS_PTR:
      return VA_FAU_SPECIAL_PAGE_1_THREAD_LOCAL_POINTER;
   case BIR_FAU_WLS_PTR:
      return VA_FAU_SPECIAL_PAGE_1_WORKGROUP_LOCAL_POINTER;
   case BIR_FAU_LANE_ID:
      return VA_FAU_SPECIAL_PAGE_3_LANE_ID;
   case BIR_FAU_PROGRAM_COUNTER:
      return VA_FAU_SPECIAL_PAGE_3_PROGRAM_COUNTER;
   case BIR_FAU_SAMPLE_POS_ARRAY:
      return VA_FAU_SPECIAL_PAGE_0_SAMPLE;

   case BIR_FAU_BLEND_0 ...(BIR_FAU_BLEND_0 + 7):
      return VA_FAU_SPECIAL_PAGE_0_BLEND_DESCRIPTOR_0 + (fau - BIR_FAU_BLEND_0);

   default:
      invalid_instruction(I, "FAU");
   }
}

/*
 * Encode a 64-bit FAU source. The offset is ignored, so this function can be
 * used to encode a 32-bit FAU source by or'ing in the appropriate offset.
 */
static unsigned
va_pack_fau_64(const bi_instr *I, bi_index idx)
{
   pack_assert(I, idx.type == BI_INDEX_FAU);

   unsigned val = (idx.value & BITFIELD_MASK(5));

   if (idx.value & BIR_FAU_IMMEDIATE)
      return (0x3 << 6) | (val << 1);
   else if (idx.value & BIR_FAU_UNIFORM)
      return (0x2 << 6) | (val << 1);
   else
      return (0x7 << 5) | (va_pack_fau_special(I, idx.value) << 1);
}

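/* Encode a general source operand. Registers use the low 6 bits with the
 * discard bit at bit 6; FAU sources reuse the 64-bit FAU encoding, with the
 * low bit selecting which 32-bit half is read.
 */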
static unsigned
va_pack_src(const bi_instr *I, unsigned s)
{
   bi_index idx = I->src[s];

   if (idx.type == BI_INDEX_REGISTER) {
      unsigned value = va_pack_reg(I, idx);
      if (idx.discard)
         value |= (1 << 6);
      return value;
   } else if (idx.type == BI_INDEX_FAU) {
      pack_assert(I, idx.offset <= 1);
      return va_pack_fau_64(I, idx) | idx.offset;
   }

   invalid_instruction(I, "type of source %u", s);
}

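/* Derive the 2-bit halfword write mask from the destination swizzle: H00
 * writes the low half, H11 the high half, and H01 the full 32-bit word.
 */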
static unsigned
va_pack_wrmask(const bi_instr *I)
{
   switch (I->dest[0].swizzle) {
   case BI_SWIZZLE_H00:
      return 0x1;
   case BI_SWIZZLE_H11:
      return 0x2;
   case BI_SWIZZLE_H01:
      return 0x3;
   default:
      invalid_instruction(I, "write mask");
   }
}

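/* Translate IR atomic opcodes to the hardware enumeration. ACMPXCHG shares
 * the AXCHG encoding; the .compare bit is set separately when packing.
 */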
static enum va_atomic_operation
va_pack_atom_opc(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AADD:
      return VA_ATOMIC_OPERATION_AADD;
   case BI_ATOM_OPC_ASMIN:
      return VA_ATOMIC_OPERATION_ASMIN;
   case BI_ATOM_OPC_ASMAX:
      return VA_ATOMIC_OPERATION_ASMAX;
   case BI_ATOM_OPC_AUMIN:
      return VA_ATOMIC_OPERATION_AUMIN;
   case BI_ATOM_OPC_AUMAX:
      return VA_ATOMIC_OPERATION_AUMAX;
   case BI_ATOM_OPC_AAND:
      return VA_ATOMIC_OPERATION_AAND;
   case BI_ATOM_OPC_AOR:
      return VA_ATOMIC_OPERATION_AOR;
   case BI_ATOM_OPC_AXOR:
      return VA_ATOMIC_OPERATION_AXOR;
   case BI_ATOM_OPC_ACMPXCHG:
   case BI_ATOM_OPC_AXCHG:
      return VA_ATOMIC_OPERATION_AXCHG;
   default:
      invalid_instruction(I, "atomic opcode");
   }
}

static enum va_atomic_operation_with_1
va_pack_atom_opc_1(const bi_instr *I)
{
   switch (I->atom_opc) {
   case BI_ATOM_OPC_AINC:
      return VA_ATOMIC_OPERATION_WITH_1_AINC;
   case BI_ATOM_OPC_ADEC:
      return VA_ATOMIC_OPERATION_WITH_1_ADEC;
   case BI_ATOM_OPC_AUMAX1:
      return VA_ATOMIC_OPERATION_WITH_1_AUMAX1;
   case BI_ATOM_OPC_ASMAX1:
      return VA_ATOMIC_OPERATION_WITH_1_ASMAX1;
   case BI_ATOM_OPC_AOR1:
      return VA_ATOMIC_OPERATION_WITH_1_AOR1;
   default:
      invalid_instruction(I, "atomic opcode with implicit 1");
   }
}

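/* Encode the (non-staging) destination: register index in the low 6 bits with
 * the halfword write mask above it.
 */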
static unsigned
va_pack_dest(const bi_instr *I)
{
   assert(I->nr_dests);
   return va_pack_reg(I, I->dest[0]) | (va_pack_wrmask(I) << 6);
}

static enum va_widen
va_pack_widen_f32(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:
      return VA_WIDEN_NONE;
   case BI_SWIZZLE_H00:
      return VA_WIDEN_H0;
   case BI_SWIZZLE_H11:
      return VA_WIDEN_H1;
   default:
      invalid_instruction(I, "widen");
   }
}

static enum va_swizzles_16_bit
va_pack_swizzle_f16(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H00:
      return VA_SWIZZLES_16_BIT_H00;
   case BI_SWIZZLE_H10:
      return VA_SWIZZLES_16_BIT_H10;
   case BI_SWIZZLE_H01:
      return VA_SWIZZLES_16_BIT_H01;
   case BI_SWIZZLE_H11:
      return VA_SWIZZLES_16_BIT_H11;
   default:
      invalid_instruction(I, "16-bit swizzle");
   }
}

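/* Encode a widening swizzle for an integer source. The set of legal swizzles
 * and their encodings depend on the operand size (8-, 16-, or 32-bit).
 */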
static unsigned
va_pack_widen(const bi_instr *I, enum bi_swizzle swz, enum va_size size)
{
   if (size == VA_SIZE_8) {
      switch (swz) {
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_8_BIT_B0123;
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_8_BIT_B0101;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_8_BIT_B2323;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_8_BIT_B0000;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_8_BIT_B1111;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_8_BIT_B2222;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_8_BIT_B3333;
      default:
         invalid_instruction(I, "8-bit widen");
      }
   } else if (size == VA_SIZE_16) {
      switch (swz) {
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_16_BIT_H00;
      case BI_SWIZZLE_H10:
         return VA_SWIZZLES_16_BIT_H10;
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_16_BIT_H01;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_16_BIT_H11;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_16_BIT_B00;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_16_BIT_B11;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_16_BIT_B22;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_16_BIT_B33;
      default:
         invalid_instruction(I, "16-bit widen");
      }
   } else if (size == VA_SIZE_32) {
      switch (swz) {
      case BI_SWIZZLE_H01:
         return VA_SWIZZLES_32_BIT_NONE;
      case BI_SWIZZLE_H00:
         return VA_SWIZZLES_32_BIT_H0;
      case BI_SWIZZLE_H11:
         return VA_SWIZZLES_32_BIT_H1;
      case BI_SWIZZLE_B0000:
         return VA_SWIZZLES_32_BIT_B0;
      case BI_SWIZZLE_B1111:
         return VA_SWIZZLES_32_BIT_B1;
      case BI_SWIZZLE_B2222:
         return VA_SWIZZLES_32_BIT_B2;
      case BI_SWIZZLE_B3333:
         return VA_SWIZZLES_32_BIT_B3;
      default:
         invalid_instruction(I, "32-bit widen");
      }
   } else {
      invalid_instruction(I, "type size for widen");
   }
}

static enum va_half_swizzles_8_bit
va_pack_halfswizzle(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_B0000:
      return VA_HALF_SWIZZLES_8_BIT_B00;
   case BI_SWIZZLE_B1111:
      return VA_HALF_SWIZZLES_8_BIT_B11;
   case BI_SWIZZLE_B2222:
      return VA_HALF_SWIZZLES_8_BIT_B22;
   case BI_SWIZZLE_B3333:
      return VA_HALF_SWIZZLES_8_BIT_B33;
   case BI_SWIZZLE_B0011:
      return VA_HALF_SWIZZLES_8_BIT_B01;
   case BI_SWIZZLE_B2233:
      return VA_HALF_SWIZZLES_8_BIT_B23;
   case BI_SWIZZLE_B0022:
      return VA_HALF_SWIZZLES_8_BIT_B02;
   default:
      invalid_instruction(I, "v2u8 swizzle");
   }
}

static enum va_lanes_8_bit
va_pack_shift_lanes(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_B0000:
      return VA_LANES_8_BIT_B00;
   case BI_SWIZZLE_B1111:
      return VA_LANES_8_BIT_B11;
   case BI_SWIZZLE_B2222:
      return VA_LANES_8_BIT_B22;
   case BI_SWIZZLE_B3333:
      return VA_LANES_8_BIT_B33;
   default:
      invalid_instruction(I, "lane shift");
   }
}

static enum va_combine
va_pack_combine(const bi_instr *I, enum bi_swizzle swz)
{
   switch (swz) {
   case BI_SWIZZLE_H01:
      return VA_COMBINE_NONE;
   case BI_SWIZZLE_H00:
      return VA_COMBINE_H0;
   case BI_SWIZZLE_H11:
      return VA_COMBINE_H1;
   default:
      invalid_instruction(I, "branch lane");
   }
}

static enum va_source_format
va_pack_source_format(const bi_instr *I)
{
   switch (I->source_format) {
   case BI_SOURCE_FORMAT_FLAT32:
      return VA_SOURCE_FORMAT_SRC_FLAT32;
   case BI_SOURCE_FORMAT_FLAT16:
      return VA_SOURCE_FORMAT_SRC_FLAT16;
   case BI_SOURCE_FORMAT_F32:
      return VA_SOURCE_FORMAT_SRC_F32;
   case BI_SOURCE_FORMAT_F16:
      return VA_SOURCE_FORMAT_SRC_F16;
   }

   invalid_instruction(I, "source format");
}

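/* HADD and RHADD share an encoding, distinguished by bit 30. The rounding
 * mode selects the variant: round-to-negative for hadd, round-to-positive
 * for rhadd.
 */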
static uint64_t
va_pack_rhadd(const bi_instr *I)
{
   switch (I->round) {
   case BI_ROUND_RTN:
      return 0; /* hadd */
   case BI_ROUND_RTP:
      return BITFIELD_BIT(30); /* rhadd */
   default:
      unreachable("Invalid round for HADD");
   }
}

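/* Pack the opcode-specific immediates, modifiers, and (non-staging) operands
 * of an ALU-style instruction: the regular destination, up to three sources,
 * and their swizzle/abs/neg bits. Staging operands are handled by the caller.
 */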
static uint64_t
va_pack_alu(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];
   uint64_t hex = 0;

   switch (I->op) {
   /* Add FREXP flags */
   case BI_OPCODE_FREXPE_F32:
   case BI_OPCODE_FREXPE_V2F16:
   case BI_OPCODE_FREXPM_F32:
   case BI_OPCODE_FREXPM_V2F16:
      if (I->sqrt)
         hex |= 1ull << 24;
      if (I->log)
         hex |= 1ull << 25;
      break;

   case BI_OPCODE_FLUSH_F32:
   case BI_OPCODE_FLUSH_V2F16:
      hex |= I->nan_mode << 8;
      if (I->ftz)
         hex |= 1ull << 10;
      if (I->flush_inf)
         hex |= 1ull << 11;
      break;

   /* Add mux type */
   case BI_OPCODE_MUX_I32:
   case BI_OPCODE_MUX_V2I16:
   case BI_OPCODE_MUX_V4I8:
      hex |= (uint64_t)I->mux << 32;
      break;

   /* Add .eq flag */
   case BI_OPCODE_BRANCHZ_I16:
   case BI_OPCODE_BRANCHZI:
      pack_assert(I, I->cmpf == BI_CMPF_EQ || I->cmpf == BI_CMPF_NE);

      if (I->cmpf == BI_CMPF_EQ)
         hex |= (1ull << 36);

      if (I->op == BI_OPCODE_BRANCHZI)
         hex |= (0x1ull << 40); /* Absolute */
      else
         hex |= ((uint64_t)I->branch_offset & BITFIELD_MASK(27)) << 8;

      break;

   /* Add arithmetic flag */
   case BI_OPCODE_RSHIFT_AND_I32:
   case BI_OPCODE_RSHIFT_AND_V2I16:
   case BI_OPCODE_RSHIFT_AND_V4I8:
   case BI_OPCODE_RSHIFT_OR_I32:
   case BI_OPCODE_RSHIFT_OR_V2I16:
   case BI_OPCODE_RSHIFT_OR_V4I8:
   case BI_OPCODE_RSHIFT_XOR_I32:
   case BI_OPCODE_RSHIFT_XOR_V2I16:
   case BI_OPCODE_RSHIFT_XOR_V4I8:
      hex |= (uint64_t)I->arithmetic << 34;
      break;

   case BI_OPCODE_LEA_BUF_IMM:
      hex |= ((uint64_t)I->table) << 8;
      hex |= ((uint64_t)I->index) << 12;
      break;

   case BI_OPCODE_LEA_ATTR_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->attribute_index) << 20;
      break;

   case BI_OPCODE_IADD_IMM_I32:
   case BI_OPCODE_IADD_IMM_V2I16:
   case BI_OPCODE_IADD_IMM_V4I8:
   case BI_OPCODE_FADD_IMM_F32:
   case BI_OPCODE_FADD_IMM_V2F16:
      hex |= ((uint64_t)I->index) << 8;
      break;

   case BI_OPCODE_CLPER_I32:
      hex |= ((uint64_t)I->inactive_result) << 22;
      hex |= ((uint64_t)I->lane_op) << 32;
      hex |= ((uint64_t)I->subgroup) << 36;
      break;

   case BI_OPCODE_LD_VAR:
   case BI_OPCODE_LD_VAR_FLAT:
   case BI_OPCODE_LD_VAR_IMM:
   case BI_OPCODE_LD_VAR_FLAT_IMM:
   case BI_OPCODE_LD_VAR_BUF_F16:
   case BI_OPCODE_LD_VAR_BUF_F32:
   case BI_OPCODE_LD_VAR_BUF_IMM_F16:
   case BI_OPCODE_LD_VAR_BUF_IMM_F32:
   case BI_OPCODE_LD_VAR_SPECIAL:
      if (I->op == BI_OPCODE_LD_VAR_SPECIAL)
         hex |= ((uint64_t)I->varying_name) << 12; /* instead of index */
      else if (I->op == BI_OPCODE_LD_VAR_BUF_IMM_F16 ||
               I->op == BI_OPCODE_LD_VAR_BUF_IMM_F32) {
         hex |= ((uint64_t)I->index) << 16;
      } else if (I->op == BI_OPCODE_LD_VAR_IMM ||
                 I->op == BI_OPCODE_LD_VAR_FLAT_IMM) {
         hex |= ((uint64_t)I->table) << 8;
         hex |= ((uint64_t)I->index) << 12;
      }

      hex |= ((uint64_t)va_pack_source_format(I)) << 24;
      hex |= ((uint64_t)I->update) << 36;
      hex |= ((uint64_t)I->sample) << 38;
      break;

   case BI_OPCODE_LD_ATTR_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->attribute_index) << 20;
      break;

   case BI_OPCODE_LD_TEX_IMM:
   case BI_OPCODE_LEA_TEX_IMM:
      hex |= ((uint64_t)I->table) << 16;
      hex |= ((uint64_t)I->texture_index) << 20;
      break;

   case BI_OPCODE_WMASK:
      hex |= ((uint64_t)I->subgroup) << 36;
      break;

   case BI_OPCODE_ZS_EMIT:
      if (I->stencil)
         hex |= (1 << 24);
      if (I->z)
         hex |= (1 << 25);
      break;

   default:
      break;
   }

   /* FMA_RSCALE.f32 special modes treated as extra opcodes */
   if (I->op == BI_OPCODE_FMA_RSCALE_F32) {
      pack_assert(I, I->special < 4);
      hex |= ((uint64_t)I->special) << 48;
   }

   /* Add the normal destination or a placeholder.  Staging destinations are
    * added elsewhere, as they require special handling for control fields.
    */
   if (info.has_dest && info.nr_staging_dests == 0) {
      hex |= (uint64_t)va_pack_dest(I) << 40;
   } else if (info.nr_staging_dests == 0 && info.nr_staging_srcs == 0) {
      pack_assert(I, I->nr_dests == 0);
      hex |= 0xC0ull << 40; /* Placeholder */
   }

   bool swap12 = va_swap_12(I->op);

   /* First src is staging if we read, skip it when packing sources */
   unsigned src_offset = bi_opcode_props[I->op].sr_read ? 1 : 0;

   for (unsigned i = 0; i < info.nr_srcs; ++i) {
      unsigned logical_i = (swap12 && i == 1) ? 2 : (swap12 && i == 2) ? 1 : i;

      struct va_src_info src_info = info.srcs[i];
      enum va_size size = src_info.size;

      bi_index src = I->src[logical_i + src_offset];
      hex |= (uint64_t)va_pack_src(I, logical_i + src_offset) << (8 * i);

      if (src_info.notted) {
         if (src.neg)
            hex |= (1ull << 35);
      } else if (src_info.absneg) {
         unsigned neg_offs = 32 + 2 + ((2 - i) * 2);
         unsigned abs_offs = 33 + 2 + ((2 - i) * 2);

         if (src.neg)
            hex |= 1ull << neg_offs;
         if (src.abs)
            hex |= 1ull << abs_offs;
      } else {
         if (src.neg)
            invalid_instruction(I, "negate");
         if (src.abs)
            invalid_instruction(I, "absolute value");
      }

      if (src_info.swizzle) {
         unsigned offs = 24 + ((2 - i) * 2);
         unsigned S = src.swizzle;
         pack_assert(I, size == VA_SIZE_16 || size == VA_SIZE_32);

         uint64_t v = (size == VA_SIZE_32 ? va_pack_widen_f32(I, S)
                                          : va_pack_swizzle_f16(I, S));
         hex |= v << offs;
      } else if (src_info.widen) {
         unsigned offs = (i == 1) ? 26 : 36;
         hex |= (uint64_t)va_pack_widen(I, src.swizzle, src_info.size) << offs;
      } else if (src_info.lane) {
         unsigned offs =
            (I->op == BI_OPCODE_MKVEC_V2I8) ? ((i == 0) ? 38 : 36) : 28;

         if (src_info.size == VA_SIZE_16) {
            hex |= (src.swizzle == BI_SWIZZLE_H11 ? 1 : 0) << offs;
         } else if (I->op == BI_OPCODE_BRANCHZ_I16) {
            hex |= ((uint64_t)va_pack_combine(I, src.swizzle) << 37);
         } else {
            pack_assert(I, src_info.size == VA_SIZE_8);
            unsigned comp = src.swizzle - BI_SWIZZLE_B0000;
            pack_assert(I, comp < 4);
            hex |= (uint64_t)comp << offs;
         }
      } else if (src_info.lanes) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 1);
         hex |= (uint64_t)va_pack_shift_lanes(I, src.swizzle) << 26;
      } else if (src_info.combine) {
         /* Treat as swizzle, subgroup ops not yet supported */
         pack_assert(I, src_info.size == VA_SIZE_32);
         pack_assert(I, i == 0);
         hex |= (uint64_t)va_pack_widen_f32(I, src.swizzle) << 37;
      } else if (src_info.halfswizzle) {
         pack_assert(I, src_info.size == VA_SIZE_8);
         pack_assert(I, i == 0);
         hex |= (uint64_t)va_pack_halfswizzle(I, src.swizzle) << 36;
      } else if (src.swizzle != BI_SWIZZLE_H01) {
         invalid_instruction(I, "swizzle");
      }
   }

   if (info.saturate)
      hex |= (uint64_t)I->saturate << 30;
   if (info.rhadd)
      hex |= va_pack_rhadd(I);
   if (info.clamp)
      hex |= (uint64_t)I->clamp << 32;
   if (info.round_mode)
      hex |= (uint64_t)I->round << 30;
   if (info.condition)
      hex |= (uint64_t)I->cmpf << 32;
   if (info.result_type)
      hex |= (uint64_t)I->result_type << 30;

   return hex;
}

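/* Encode the signed 16-bit byte offset used by memory instructions. Offsets
 * that do not fit are rejected rather than silently truncated.
 */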
static uint64_t
va_pack_byte_offset(const bi_instr *I)
{
   int16_t offset = I->byte_offset;
   if (offset != I->byte_offset)
      invalid_instruction(I, "byte offset");

   uint16_t offset_as_u16 = offset;
   return ((uint64_t)offset_as_u16) << 8;
}

static uint64_t
va_pack_byte_offset_8(const bi_instr *I)
{
   uint8_t offset = I->byte_offset;
   if (offset != I->byte_offset)
      invalid_instruction(I, "byte offset");

   return ((uint64_t)offset) << 8;
}

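/* Pack the common fields of LOAD/LD_BUFFER: the identity lane select for the
 * access size (recovered from bits 27-29 of the exact opcode), the bit marked
 * "unsigned" (bit 39), the byte offset, and the address source(s).
 */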
static uint64_t
va_pack_load(const bi_instr *I, bool buffer_descriptor)
{
   const uint8_t load_lane_identity[8] = {
      VA_LOAD_LANE_8_BIT_B0,        VA_LOAD_LANE_16_BIT_H0,
      VA_LOAD_LANE_24_BIT_IDENTITY, VA_LOAD_LANE_32_BIT_W0,
      VA_LOAD_LANE_48_BIT_IDENTITY, VA_LOAD_LANE_64_BIT_IDENTITY,
      VA_LOAD_LANE_96_BIT_IDENTITY, VA_LOAD_LANE_128_BIT_IDENTITY,
   };

   unsigned memory_size = (valhall_opcodes[I->op].exact >> 27) & 0x7;
   uint64_t hex = (uint64_t)load_lane_identity[memory_size] << 36;

   // unsigned
   hex |= (1ull << 39);

   if (!buffer_descriptor)
      hex |= va_pack_byte_offset(I);

   hex |= (uint64_t)va_pack_src(I, 0) << 0;

   if (buffer_descriptor)
      hex |= (uint64_t)va_pack_src(I, 1) << 8;

   return hex;
}

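/* Map the IR memory segment to a hardware memory access mode: thread-local
 * storage uses FORCE, position outputs ISTREAM, varyings ESTREAM, everything
 * else NONE.
 */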
static uint64_t
va_pack_memory_access(const bi_instr *I)
{
   switch (I->seg) {
   case BI_SEG_TL:
      return VA_MEMORY_ACCESS_FORCE;
   case BI_SEG_POS:
      return VA_MEMORY_ACCESS_ISTREAM;
   case BI_SEG_VARY:
      return VA_MEMORY_ACCESS_ESTREAM;
   default:
      return VA_MEMORY_ACCESS_NONE;
   }
}

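/* Pack the common fields of STORE: the memory access mode, the 64-bit address
 * register pair, and the byte offset.
 */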
static uint64_t
va_pack_store(const bi_instr *I)
{
   uint64_t hex = va_pack_memory_access(I) << 24;

   va_validate_register_pair(I, 1);
   hex |= (uint64_t)va_pack_src(I, 1) << 0;

   hex |= va_pack_byte_offset(I);

   return hex;
}

static enum va_lod_mode
va_pack_lod_mode(const bi_instr *I)
{
   switch (I->va_lod_mode) {
   case BI_VA_LOD_MODE_ZERO_LOD:
      return VA_LOD_MODE_ZERO;
   case BI_VA_LOD_MODE_COMPUTED_LOD:
      return VA_LOD_MODE_COMPUTED;
   case BI_VA_LOD_MODE_EXPLICIT:
      return VA_LOD_MODE_EXPLICIT;
   case BI_VA_LOD_MODE_COMPUTED_BIAS:
      return VA_LOD_MODE_COMPUTED_BIAS;
   case BI_VA_LOD_MODE_GRDESC:
      return VA_LOD_MODE_GRDESC;
   }

   invalid_instruction(I, "LOD mode");
}

static enum va_register_type
va_pack_register_type(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_F16:
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_TYPE_F;

   case BI_REGISTER_FORMAT_U16:
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_TYPE_U;

   case BI_REGISTER_FORMAT_S16:
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_TYPE_S;

   default:
      invalid_instruction(I, "register type");
   }
}

static enum va_register_format
va_pack_register_format(const bi_instr *I)
{
   switch (I->register_format) {
   case BI_REGISTER_FORMAT_AUTO:
      return VA_REGISTER_FORMAT_AUTO;
   case BI_REGISTER_FORMAT_F32:
      return VA_REGISTER_FORMAT_F32;
   case BI_REGISTER_FORMAT_F16:
      return VA_REGISTER_FORMAT_F16;
   case BI_REGISTER_FORMAT_S32:
      return VA_REGISTER_FORMAT_S32;
   case BI_REGISTER_FORMAT_S16:
      return VA_REGISTER_FORMAT_S16;
   case BI_REGISTER_FORMAT_U32:
      return VA_REGISTER_FORMAT_U32;
   case BI_REGISTER_FORMAT_U16:
      return VA_REGISTER_FORMAT_U16;
   default:
      invalid_instruction(I, "register format");
   }
}

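/* Pack a single instruction into its 64-bit Valhall encoding: the exact
 * opcode bits, flow control, FAU page, and staging register controls, plus
 * opcode-specific fields. Memory, atomic, blend, and texture instructions are
 * special-cased; everything else goes through va_pack_alu.
 */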
uint64_t
va_pack_instr(const bi_instr *I)
{
   struct va_opcode_info info = valhall_opcodes[I->op];

   uint64_t hex = info.exact | (((uint64_t)I->flow) << 59);
   hex |= ((uint64_t)va_select_fau_page(I)) << 57;

   if (info.slot)
      hex |= ((uint64_t)I->slot << 30);

   if (info.sr_count) {
      bool read = bi_opcode_props[I->op].sr_read;
      bi_index sr = read ? I->src[0] : I->dest[0];

      unsigned count =
         read ? bi_count_read_registers(I, 0) : bi_count_write_registers(I, 0);

      hex |= ((uint64_t)count << 33);
      hex |= (uint64_t)va_pack_reg(I, sr) << 40;
      hex |= ((uint64_t)info.sr_control << 46);
   }

   if (info.sr_write_count) {
      hex |= ((uint64_t)bi_count_write_registers(I, 0) - 1) << 36;
      hex |= ((uint64_t)va_pack_reg(I, I->dest[0])) << 16;
   }

   if (info.vecsize)
      hex |= ((uint64_t)I->vecsize << 28);

   if (info.register_format)
      hex |= ((uint64_t)va_pack_register_format(I)) << 24;

   switch (I->op) {
   case BI_OPCODE_LOAD_I8:
   case BI_OPCODE_LOAD_I16:
   case BI_OPCODE_LOAD_I24:
   case BI_OPCODE_LOAD_I32:
   case BI_OPCODE_LOAD_I48:
   case BI_OPCODE_LOAD_I64:
   case BI_OPCODE_LOAD_I96:
   case BI_OPCODE_LOAD_I128:
      hex |= va_pack_load(I, false);
      break;

   case BI_OPCODE_LD_BUFFER_I8:
   case BI_OPCODE_LD_BUFFER_I16:
   case BI_OPCODE_LD_BUFFER_I24:
   case BI_OPCODE_LD_BUFFER_I32:
   case BI_OPCODE_LD_BUFFER_I48:
   case BI_OPCODE_LD_BUFFER_I64:
   case BI_OPCODE_LD_BUFFER_I96:
   case BI_OPCODE_LD_BUFFER_I128:
      hex |= va_pack_load(I, true);
      break;

   case BI_OPCODE_STORE_I8:
   case BI_OPCODE_STORE_I16:
   case BI_OPCODE_STORE_I24:
   case BI_OPCODE_STORE_I32:
   case BI_OPCODE_STORE_I48:
   case BI_OPCODE_STORE_I64:
   case BI_OPCODE_STORE_I96:
   case BI_OPCODE_STORE_I128:
      hex |= va_pack_store(I);
      break;

   case BI_OPCODE_ATOM1_RETURN_I32:
      /* Permit omitting the destination for plain ATOM1 */
      if (!bi_count_write_registers(I, 0)) {
         hex |= (0x40ull << 40); // fake read
      }

      /* 64-bit source */
      va_validate_register_pair(I, 0);
      hex |= (uint64_t)va_pack_src(I, 0) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t)va_pack_atom_opc_1(I)) << 22;
      break;

   case BI_OPCODE_ATOM_I32:
   case BI_OPCODE_ATOM_RETURN_I32:
      /* 64-bit source */
      va_validate_register_pair(I, 1);
      hex |= (uint64_t)va_pack_src(I, 1) << 0;
      hex |= va_pack_byte_offset_8(I);
      hex |= ((uint64_t)va_pack_atom_opc(I)) << 22;

      if (I->op == BI_OPCODE_ATOM_RETURN_I32)
         hex |= (0xc0ull << 40); // flags

      if (I->atom_opc == BI_ATOM_OPC_ACMPXCHG)
         hex |= (1 << 26); /* .compare */

      break;

   case BI_OPCODE_ST_CVT:
      /* Staging read */
      hex |= va_pack_store(I);

      /* Conversion descriptor */
      hex |= (uint64_t)va_pack_src(I, 3) << 16;
      break;

   case BI_OPCODE_BLEND: {
      /* Source 0 - Blend descriptor (64-bit) */
      hex |= ((uint64_t)va_pack_src(I, 2)) << 0;
      va_validate_register_pair(I, 2);

      /* Target */
      if (I->branch_offset & 0x7)
         invalid_instruction(I, "unaligned branch");
      hex |= ((I->branch_offset >> 3) << 8);

      /* Source 2 - coverage mask */
      hex |= ((uint64_t)va_pack_reg(I, I->src[1])) << 16;

      /* Vector size */
      unsigned vecsize = 4;
      hex |= ((uint64_t)(vecsize - 1) << 28);

      break;
   }

   case BI_OPCODE_TEX_GRADIENT:
   case BI_OPCODE_TEX_SINGLE:
   case BI_OPCODE_TEX_FETCH:
   case BI_OPCODE_TEX_GATHER: {
      /* Image to read from */
      hex |= ((uint64_t)va_pack_src(I, 1)) << 0;

      if ((I->op == BI_OPCODE_TEX_FETCH || I->op == BI_OPCODE_TEX_GRADIENT) &&
          I->shadow)
         invalid_instruction(I, "texture instruction does not support .shadow");

      if (I->wide_indices)
         hex |= (1ull << 8);
      if (I->array_enable)
         hex |= (1ull << 10);
      if (I->texel_offset)
         hex |= (1ull << 11);
      if (I->shadow)
         hex |= (1ull << 12);
      if (I->skip)
         hex |= (1ull << 39);
      if (!bi_is_regfmt_16(I->register_format))
         hex |= (1ull << 46);

      if (I->op == BI_OPCODE_TEX_GRADIENT) {
         if (I->force_delta_enable)
            hex |= (1ull << 12);
         if (I->lod_bias_disable)
            hex |= (1ull << 13);
         if (I->lod_clamp_disable)
            hex |= (1ull << 14);
         if (I->derivative_enable)
            hex |= (1ull << 15);
      }

      if (I->op == BI_OPCODE_TEX_SINGLE)
         hex |= ((uint64_t)va_pack_lod_mode(I)) << 13;

      if (I->op == BI_OPCODE_TEX_GATHER) {
         if (I->integer_coordinates)
            hex |= (1 << 13);
         hex |= ((uint64_t)I->fetch_component) << 14;
      }

      hex |= (I->write_mask << 22);
      hex |= ((uint64_t)I->dimension) << 28;

      break;
   }

   default:
      if (!info.exact && I->op != BI_OPCODE_NOP)
         invalid_instruction(I, "opcode");

      hex |= va_pack_alu(I);
      break;
   }

   return hex;
}

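/* Count the instructions in a block, used when computing relative branch
 * offsets.
 */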
static unsigned
va_instructions_in_block(bi_block *block)
{
   unsigned offset = 0;

   bi_foreach_instr_in_block(block, _) {
      offset++;
   }

   return offset;
}

/* Calculate branch_offset from a branch_target for a direct relative branch */

static void
va_lower_branch_target(bi_context *ctx, bi_block *start, bi_instr *I)
{
   /* Precondition: unlowered relative branch */
   bi_block *target = I->branch_target;
   assert(target != NULL);

   /* Signed since we might jump backwards */
   signed offset = 0;

   /* Determine if the target block is strictly greater in source order */
   bool forwards = target->index > start->index;

   if (forwards) {
      /* We have to jump through this block */
      bi_foreach_instr_in_block_from(start, _, I) {
         offset++;
      }

      /* We then need to jump over every following block until the target */
      bi_foreach_block_from(ctx, start, blk) {
         /* End just before the target */
         if (blk == target)
            break;

         /* Count other blocks */
         if (blk != start)
            offset += va_instructions_in_block(blk);
      }
   } else {
      /* Jump through the beginning of this block */
      bi_foreach_instr_in_block_from_rev(start, ins, I) {
         if (ins != I)
            offset--;
      }

      /* Jump over preceding blocks up to and including the target to get to
       * the beginning of the target */
      bi_foreach_block_from_rev(ctx, start, blk) {
         if (blk == start)
            continue;

         offset -= va_instructions_in_block(blk);

         /* End just after the target */
         if (blk == target)
            break;
      }
   }

   /* Offset is relative to the next instruction, so bias */
   offset--;

   /* Update the instruction */
   I->branch_offset = offset;
}

/*
 * Late lowering to insert blend shader calls after BLEND instructions. This is
 * required to support blend shaders, so the pass may be omitted if it is known
 * that blend shaders are never used.
 *
 * This lowering runs late because it introduces control flow changes without
 * modifying the control flow graph. It hardcodes registers, so running after
 * RA makes sense. Finally, it hardcodes a manually sized instruction sequence,
 * requiring it to run after scheduling.
 *
 * As it is Valhall specific, running it as a pre-pack lowering is sensible.
 */
static void
va_lower_blend(bi_context *ctx)
{
   /* Program counter for *next* instruction */
   bi_index pc = bi_fau(BIR_FAU_PROGRAM_COUNTER, false);

   bi_foreach_instr_global_safe(ctx, I) {
      if (I->op != BI_OPCODE_BLEND)
         continue;

      bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

      unsigned prolog_length = 2 * 8;

      /* By ABI, r48 is the link register shared with blend shaders */
      assert(bi_is_equiv(I->dest[0], bi_register(48)));

      if (I->flow == VA_FLOW_END)
         bi_iadd_imm_i32_to(&b, I->dest[0], va_zero_lut(), 0);
      else
         bi_iadd_imm_i32_to(&b, I->dest[0], pc, prolog_length - 8);

      bi_branchzi(&b, va_zero_lut(), I->src[3], BI_CMPF_EQ);

      /* For fixed function: skip the prologue, or return */
      if (I->flow != VA_FLOW_END)
         I->branch_offset = prolog_length;
   }
}

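/* Pack an entire shader: validate, run the pre-pack lowerings, then emit each
 * instruction as a 64-bit word, padding the end of the program as required.
 */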
void
bi_pack_valhall(bi_context *ctx, struct util_dynarray *emission)
{
   unsigned orig_size = emission->size;

   va_validate(stderr, ctx);

   /* Late lowering */
   if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend)
      va_lower_blend(ctx);

   bi_foreach_block(ctx, block) {
      bi_foreach_instr_in_block(block, I) {
         if (I->op == BI_OPCODE_BRANCHZ_I16)
            va_lower_branch_target(ctx, block, I);

         uint64_t hex = va_pack_instr(I);
         util_dynarray_append(emission, uint64_t, hex);
      }
   }

   /* Pad with zeroes, but keep empty programs empty so they may be omitted
    * altogether. Failing to do this would result in a program containing only
    * zeroes, which is invalid and will raise an encoding fault.
    *
    * Pad an extra 16 bytes (one instruction) to separate primary and secondary
    * shader disassemblies. This is not strictly necessary, but it is good
    * practice. 128 bytes is the optimal program alignment on Trym, so pad
    * secondary shaders up to 128 bytes. This may help the instruction cache.
    */
   if (orig_size != emission->size) {
      unsigned aligned = ALIGN_POT(emission->size + 16, 128);
      unsigned count = aligned - emission->size;

      memset(util_dynarray_grow(emission, uint8_t, count), 0, count);
   }
}