• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Copyright 2021 Alyssa Rosenzweig
3# SPDX-License-Identifier: MIT
4"""
5
# Global registries, filled in as a side effect of op(), immediate() and enum().
opcodes = dict()
immediates = dict()
enums = dict()

# Sentinel passed as a source/destination count to mean "variable".
VARIABLE = ~0
11
class Opcode(object):
    """Record describing a single opcode.

    A source or destination count equal to VARIABLE is stored as 0, and the
    corresponding variable_srcs / variable_dests flag is set instead.
    """

    def __init__(self, name, dests, srcs, imms, is_float, can_eliminate,
                 can_reorder, schedule_class, encoding_16, encoding_32):
        has_variable_dests = (dests == VARIABLE)
        has_variable_srcs = (srcs == VARIABLE)

        self.name = name
        self.dests = 0 if has_variable_dests else dests
        self.srcs = 0 if has_variable_srcs else srcs
        self.variable_srcs = has_variable_srcs
        self.variable_dests = has_variable_dests
        self.imms = imms
        self.is_float = is_float
        self.can_eliminate = can_eliminate
        self.can_reorder = can_reorder
        self.schedule_class = schedule_class
        self.encoding_16 = encoding_16
        self.encoding_32 = encoding_32
27
class Immediate(object):
    """A named immediate together with the C type string associated with it."""

    def __init__(self, name, ctype):
        self.name, self.ctype = name, ctype
32
class Encoding(object):
    """Encoding unpacked from an (exact, mask, length_short, length_long) tuple.

    A long length of None is treated as equal to the short length. When the
    two lengths differ the encoding is marked extensible, and the long length
    must exceed the short one by exactly 2 (short length <= 8) or 4 (> 8).
    """

    def __init__(self, description):
        exact, mask, short, long_ = description

        # Convenience: None means there is no distinct long form
        if long_ is None:
            long_ = short

        self.exact = exact
        self.mask = mask
        self.length_short = short
        self.extensible = (short != long_)

        if self.extensible:
            growth = 4 if short > 8 else 2
            assert long_ == short + growth
48
def op(name, encoding_32, dests=1, srcs=0, imms=None, is_float=False,
       can_eliminate=True, can_reorder=True, encoding_16=None,
       schedule_class="none"):
    """Build an Opcode and register it in the global `opcodes` table.

    name           -- key under which the opcode is stored in `opcodes`
    encoding_32    -- 4-tuple accepted by Encoding, or None for no encoding
    dests, srcs    -- counts, or VARIABLE
    imms           -- list of immediates; omitted or None means a new empty list
    encoding_16    -- optional 4-tuple accepted by Encoding
    The remaining arguments are stored on the Opcode unchanged.
    """
    # A literal `[]` default would be a single list object shared by every
    # opcode registered without immediates; hand each Opcode its own list.
    if imms is None:
        imms = []

    if encoding_16 is not None:
        encoding_16 = Encoding(encoding_16)
    if encoding_32 is not None:
        encoding_32 = Encoding(encoding_32)

    opcodes[name] = Opcode(name, dests, srcs, imms, is_float, can_eliminate,
                           can_reorder, schedule_class, encoding_16,
                           encoding_32)
57
def immediate(name, ctype="uint32_t"):
    """Create an Immediate, store it in `immediates` under `name`, return it."""
    immediates[name] = Immediate(name, ctype)
    return immediates[name]
62
def enum(name, value_dict):
    """Store `value_dict` in `enums` and return a matching immediate.

    The immediate is registered under the same `name`, with the C type
    string "enum agx_" followed by `name`.
    """
    enums[name] = value_dict
    ctype = "enum agx_" + name
    return immediate(name, ctype)
66
# Bit 15, OR'd into many of the exact/mask encoding values in this file.
L = 1 << 15

# Shorthand for None, used for absent encodings and absent long lengths.
_ = None

FORMAT = immediate("format", ctype="enum agx_format")
IMM = immediate("imm", ctype="uint64_t")
WRITEOUT = immediate("writeout")
INDEX = immediate("index")
COMPONENT = immediate("component")
CHANNELS = immediate("channels")
TRUTH_TABLE = immediate("truth_table")
ROUND = immediate("round", ctype="enum agx_round")
SHIFT = immediate("shift")
MASK = immediate("mask")
BFI_MASK = immediate("bfi_mask")
LOD_MODE = immediate("lod_mode", ctype="enum agx_lod_mode")
PIXEL_OFFSET = immediate("pixel_offset")
STACK_SIZE = immediate("stack_size", ctype="int16_t")
84
# Values of the "dim" enum.
DIM = enum("dim", {
    0: "1d",
    1: "1d_array",
    2: "2d",
    3: "2d_array",
    4: "2d_ms",
    5: "3d",
    6: "cube",
    7: "cube_array",
    8: "2d_ms_array",
})

# Values of the "gather" enum.
GATHER = enum("gather", {
    0b000: "none",
    0b001: "r",
    0b011: "g",
    0b101: "b",
    0b111: "a",
})
104
# Remaining plain immediates. Each name is registered exactly once; "nest"
# was previously registered twice with identical arguments.
OFFSET = immediate("offset", "bool")
SHADOW = immediate("shadow", "bool")
QUERY_LOD = immediate("query_lod", "bool")
SCOREBOARD = immediate("scoreboard")
ICOND = immediate("icond", "enum agx_icond")
FCOND = immediate("fcond", "enum agx_fcond")
NEST = immediate("nest")
INVERT_COND = immediate("invert_cond")
TARGET = immediate("target", "agx_block *")
ZS = immediate("zs")
# Values of the "sr" enum (used as the immediate of the get_sr* opcodes).
SR = enum("sr", {
    0: "threadgroup_position_in_grid.x",
    1: "threadgroup_position_in_grid.y",
    2: "threadgroup_position_in_grid.z",
    4: "threads_per_threadgroup.x",
    5: "threads_per_threadgroup.y",
    6: "threads_per_threadgroup.z",
    8: "dispatch_threads_per_threadgroup.x",
    9: "dispatch_threads_per_threadgroup.y",
    10: "dispatch_threads_per_threadgroup.z",
    20: "core_id",
    21: "vm_slot",
    48: "thread_position_in_threadgroup.x",
    49: "thread_position_in_threadgroup.y",
    50: "thread_position_in_threadgroup.z",
    51: "thread_index_in_threadgroup",
    52: "thread_index_in_subgroup",
    53: "subgroup_index_in_threadgroup",
    56: "active_thread_index_in_quad",
    58: "active_thread_index_in_subgroup",
    60: "coverage_mask",
    62: "backfacing",
    63: "is_active_thread",
    80: "thread_position_in_grid.x",
    81: "thread_position_in_grid.y",
    82: "thread_position_in_grid.z",
    124: "input_sample_mask",
    144: "helper_op",
    146: "helper_arg_l",
    147: "helper_arg_h",
})
147
# Values of the "atomic_opc" enum.
ATOMIC_OPC = enum("atomic_opc", {
    0: "add",
    1: "sub",
    2: "xchg",
    3: "cmpxchg",
    4: "umin",
    5: "imin",
    6: "umax",
    7: "imax",
    8: "and",
    9: "or",
    10: "xor",
})
161
# Values of the "interpolation" enum.
INTERPOLATION = enum("interpolation", {
    0: "center",
    1: "sample",
    2: "centroid",
    # We translate sample -> sample_register at pack time for simplicity
    3: "sample_register",
})
169
def FUNOP(x):
    """Return `x` shifted left by 28 bits."""
    return x << 28


# FUNOP applied to an all-ones 14-bit value.
FUNOP_MASK = FUNOP((1 << 14) - 1)
172
def funop(name, opcode, schedule_class="none"):
    """Register a one-source float opcode whose `opcode` bits sit at bit 28.

    The exact value is 0x0A with L and the shifted opcode OR'd in; the mask
    is 0x3F with L and a 14-bit field at bit 28. The length is 6 with no
    long form.
    """
    exact = 0x0A | L | FUNOP(opcode)
    mask = 0x3F | L | FUNOP_MASK
    op(name, (exact, mask, 6, _), srcs=1, is_float=True,
       schedule_class=schedule_class)
177
def iunop(name, opcode):
    """Register a one-source opcode whose `opcode` bits sit at bit 26.

    `opcode` is asserted to be below 4. The length is 6 with no long form.
    """
    assert opcode < 4

    exact = 0x3E | (opcode << 26)
    mask = 0x7F | L | (((1 << 14) - 1) << 26)
    op(name, (exact, mask, 6, _), srcs=1)
184
# Listing of opcodes

# One-source float ops registered through funop()
funop("floor", 0b000000)
funop("srsqrt", 0b000001)
funop("dfdx", 0b000100, schedule_class="coverage")
funop("dfdy", 0b000110, schedule_class="coverage")
funop("rcp", 0b001000)
funop("rsqrt", 0b001001)
funop("sin_pt_1", 0b001010)
funop("log2", 0b001100)
funop("exp2", 0b001101)
funop("sin_pt_2", 0b001110)
funop("ceil", 0b010000)
funop("trunc", 0b100000)
funop("roundeven", 0b110000)

# One-source ops registered through iunop()
iunop("bitrev", 0b01)
iunop("popcount", 0b10)
iunop("ffs", 0b11)
203
# Float arithmetic, each with both a 16-bit and a 32-bit encoding
op("fadd",
   encoding_16=(0x26, 0x3F, 4, 6),
   encoding_32=(0x2A, 0x3F, 4, 6),
   srcs=2, is_float=True)

op("fma",
   encoding_16=(0x36, 0x3F, 6, 8),
   encoding_32=(0x3A, 0x3F, 6, 8),
   srcs=3, is_float=True)

op("fmul",
   encoding_16=(0x16, 0x3F, 4, 6),
   encoding_32=(0x1A, 0x3F, 4, 6),
   srcs=2, is_float=True)

op("mov_imm",
   encoding_16=(0x62, 0xFF, 4, 6),
   encoding_32=(0x62, 0xFF, 6, 8),
   imms=[IMM])

# Integer arithmetic with a SHIFT immediate
op("iadd",
   encoding_32=(0x0E, 0x3F | L, 8, _),
   srcs=2, imms=[SHIFT])

op("imad",
   encoding_32=(0x1E, 0x3F | L, 8, _),
   srcs=3, imms=[SHIFT])

# Ops sharing the 0x2E base value, told apart by L and the bits at 26
op("bfi",
   encoding_32=(0x2E, 0x7F | (0x3 << 26), 8, _),
   srcs=3, imms=[BFI_MASK])

op("bfeil",
   encoding_32=(0x2E | L, 0x7F | L | (0x3 << 26), 8, _),
   srcs=3, imms=[BFI_MASK])

op("extr",
   encoding_32=(0x2E | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
   srcs=3, imms=[BFI_MASK])

op("asr",
   encoding_32=(0x2E | L | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
   srcs=2)
247
def subgroup_op(name, op1, op2):
    """Register a one-source opcode with `op1` at bit 47 and `op2` at bit 26.

    The mask covers the low byte, L, one bit at 47 and a 16-bit field at 26.
    The length is 6 with no long form.
    """
    fixed_bits = 0b01101111 | L
    exact = fixed_bits | (op1 << 47) | (op2 << 26)
    exact_mask = 0b11111111 | L | (1 << 47) | (0xFFFF << 26)

    op(name, encoding_32=(exact, exact_mask, 6, _), srcs=1)
253
# Ops registered through subgroup_op()
subgroup_op("simd_prefix_iadd", 1, 0b0000000000011000)
subgroup_op("simd_iadd", 1, 0b0000000000001000)

op("simd_shuffle",
   encoding_32=(
       0b01101111 | (1 << 26),
       0xFF | L | (1 << 47) | (3 << 38) | (3 << 26),
       6, _),
   srcs=2)
261
# Registers "quad_ballot" / "ballot" plus the f/i compare-and-ballot variants.
for window, w_bit in (("quad_", 0), ("", 1)):
    # Pseudo-instruction balloting a boolean
    op(f"{window}ballot", _, srcs=1)

    for T, T_bit, cond in (("f", 0, FCOND), ("i", 1, ICOND)):
        op(f"{T}cmp_{window}ballot",
           encoding_32=(0b0100010 | (T_bit << 4) | (w_bit << 48), 0, 8, _),
           srcs=2,
           imms=[cond, INVERT_COND])
270
# Compare-and-select with four sources
op("icmpsel", encoding_32=(0x12, 0x7F, 8, 10), srcs=4, imms=[ICOND])
op("fcmpsel", encoding_32=(0x02, 0x7F, 8, 10), srcs=4, imms=[FCOND])

# Pseudo-instructions for compares returning 1/0
op("icmp", _, srcs=2, imms=[ICOND, INVERT_COND])
op("fcmp", _, srcs=2, imms=[FCOND, INVERT_COND])
282
# sources are coordinates, LOD, texture bindless base (zero for texture state
# registers), texture, sampler, shadow/offset
# TODO: anything else?
op("texture_sample",
   encoding_32=(0x31, 0x7F, 8, 10),  # XXX WRONG SIZE
   srcs=6,
   imms=[DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET, SHADOW, QUERY_LOD, GATHER])

# texture_load may be reordered; image_load may not and is scheduled as a load
for memory, can_reorder in (("texture", True), ("image", False)):
    op(f"{memory}_load",
       encoding_32=(0x71, 0x7F, 8, 10),  # XXX WRONG SIZE
       srcs=6,
       imms=[DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET],
       can_reorder=can_reorder,
       schedule_class="none" if can_reorder else "load")
295
# sources are base, index
op("device_load",
   encoding_32=(0x05, 0x7F, 6, 8),
   srcs=2,
   imms=[FORMAT, MASK, SHIFT, SCOREBOARD],
   can_reorder=False,
   schedule_class="load")

# sources are base (relative to workgroup memory), index
op("local_load",
   encoding_32=(0b1101001, 0, 6, 8),
   srcs=2,
   imms=[FORMAT, MASK],
   can_reorder=False,
   schedule_class="load")

# sources are value, base, index
# TODO: Consider permitting the short form
op("device_store",
   encoding_32=(0x45 | (1 << 47), 0, 8, _),
   dests=0, srcs=3,
   imms=[FORMAT, MASK, SHIFT, SCOREBOARD],
   can_eliminate=False,
   schedule_class="store")

# sources are value, base, index
op("local_store",
   encoding_32=(0b0101001, 0, 6, 8),
   dests=0, srcs=3,
   imms=[FORMAT, MASK],
   can_eliminate=False,
   schedule_class="store")

# sources are value, index
# TODO: Consider permitting the short form
op("uniform_store",
   encoding_32=((0b111 << 27) | 0b1000101 | (1 << 47), 0, 8, _),
   dests=0, srcs=2,
   can_eliminate=False)

# sources are value, base, index
op("atomic",
   encoding_32=(
       0x15 | (1 << 26) | (1 << 31) | (5 << 44),
       0x3F | (1 << 26) | (1 << 31) | (5 << 44),
       8, _),
   dests=1, srcs=3,
   imms=[ATOMIC_OPC, SCOREBOARD],
   can_eliminate=False,
   schedule_class="atomic")

# XXX: stop hardcoding the long form
op("local_atomic",
   encoding_32=(
       0x19 | (1 << 15) | (1 << 36) | (1 << 47),
       0x3F | (1 << 36) | (1 << 47),
       10, _),
   dests=1, srcs=3,
   imms=[ATOMIC_OPC],
   can_eliminate=False,
   schedule_class="atomic")
338
op("wait", (0x38, 0xFF, 2, _),
   dests=0, imms=[SCOREBOARD],
   can_eliminate=False, schedule_class="invalid")

# Three get_sr variants differing only in schedule class; only the plain one
# may be reordered.
for suffix, schedule_class in (("", "none"),
                               ("_coverage", "coverage"),
                               ("_barrier", "barrier")):
    op(f"get_sr{suffix}", (0x72, 0x7F | L, 4, _),
       dests=1, imms=[SR],
       schedule_class=schedule_class,
       can_reorder=(schedule_class == "none"))

op("sample_mask", (0x7fc1, 0xffff, 6, _),
   dests=0, srcs=2,
   can_eliminate=False, schedule_class="coverage")

# Sources: sample mask, combined depth/stencil
op("zs_emit", (0x41, 0xFF | L, 4, _),
   dests=0, srcs=2, imms=[ZS],
   can_eliminate=False, schedule_class="coverage")

# Essentially same encoding. Last source is the sample mask
op("ld_tile", (0x49, 0x7F, 8, _),
   dests=1, srcs=1,
   imms=[FORMAT, MASK, PIXEL_OFFSET],
   can_reorder=False, schedule_class="coverage")

op("st_tile", (0x09, 0x7F, 8, _),
   dests=0, srcs=2,
   imms=[FORMAT, MASK, PIXEL_OFFSET],
   can_eliminate=False, schedule_class="coverage")

for name, exact in (("any", 0xC000), ("none", 0xC020), ("none_after", 0xC020)):
    op(f"jmp_exec_{name}", (exact, (1 << 16) - 1, 6, _),
       dests=0, srcs=0, imms=[TARGET],
       can_eliminate=False, schedule_class="invalid")

# TODO: model implicit r0l destinations
op("pop_exec",
   (0x52 | (0x3 << 9), ((1 << 48) - 1) ^ (0x3 << 7) ^ (0x3 << 11), 6, _),
   dests=0, srcs=0, imms=[NEST],
   can_eliminate=False, schedule_class="invalid")
370
# Registers {if,else,while}_{i,f}cmp. The integer variants add 0x10 to the
# exact value and extend the mask with the bits at 26 and 38.
for is_float in (False, True):
    if is_float:
        mod_mask = 0
    else:
        mod_mask = (0x3 << 26) | (0x3 << 38)

    for cf, cf_op in (("if", 0), ("else", 1), ("while", 2)):
        name = f"{cf}_{'f' if is_float else 'i'}cmp"
        exact = 0x42 | (0x0 if is_float else 0x10) | (cf_op << 9)
        mask = 0x7F | (0x3 << 9) | mod_mask | (0x3 << 44)
        imms = [NEST, FCOND if is_float else ICOND, INVERT_COND, TARGET]

        op(name, (exact, mask, 6, _),
           dests=0, srcs=2, imms=imms,
           is_float=is_float,
           can_eliminate=False,
           schedule_class="preload" if cf == "else" else "invalid")
383
op("bitop", (0x7E, 0x7F, 6, _), srcs=2, imms=[TRUTH_TABLE])
op("intl", (0x3E, 0x7F, 6, _), srcs=2, imms=[])
op("convert", (0x3E | L, 0x7F | L | (0x3 << 38), 6, _),
   srcs=2, imms=[ROUND])

# Sources are the coefficient register and the sample index (if applicable)
op("iter", (0x21, 0xBF, 8, _), srcs=2, imms=[CHANNELS, INTERPOLATION])

# Sources are the coefficient register for the varying, the coefficient
# register for W, and the sample index (if applicable)
op("iterproj", (0x21, 0xBF, 8, _), srcs=3, imms=[CHANNELS, INTERPOLATION])

op("ldcf", (0xA1, 0xBF, 8, _), srcs=1, imms=[CHANNELS])
op("st_vary", None, dests=0, srcs=2, can_eliminate=False)
op("no_varyings", (0x80000051, 0xFFFFFFFF, 4, _),
   dests=0, can_eliminate=False)
op("stop", (0x88, 0xFFFF, 2, _),
   dests=0, can_eliminate=False, schedule_class="invalid")
op("trap", (0x08, 0xFFFF, 2, _),
   dests=0, can_eliminate=False, schedule_class="invalid")
op("wait_pix", (0x48, 0xFF, 4, _),
   dests=0, imms=[WRITEOUT],
   can_eliminate=False, schedule_class="coverage")
op("signal_pix", (0x58, 0xFF, 4, _),
   dests=0, imms=[WRITEOUT],
   can_eliminate=False, schedule_class="coverage")
406
# Sources are the data vector, the coordinate vector, the LOD, the bindless
# table if present (zero for texture state registers), and texture index.
op("image_write", (0xF1 | (1 << 23) | (9 << 43), 0xFF, 6, 8),
   dests=0, srcs=5, imms=[DIM],
   can_eliminate=False, schedule_class="store")

# Sources are the image, the offset within shared memory, and the layer.
# TODO: Do we need the short encoding?
op("block_image_store", (0xB1, 0xFF, 10, _),
   dests=0, srcs=3, imms=[FORMAT, DIM],
   can_eliminate=False, schedule_class="store")

# Barriers
op("threadgroup_barrier", (0x0068, 0xFFFF, 2, _),
   dests=0, srcs=0,
   can_eliminate=False, schedule_class="barrier")
420
def memory_barrier(name, a, b, c):
    """Register a barrier opcode with no sources or destinations.

    The exact value is 0xF5 with `a` at bit 10, `b` at bit 8 and `c` at
    bit 12; the mask is 0xFFFF and the length is 2 with no long form.
    """
    exact = 0xF5 | (a << 10) | (b << 8) | (c << 12)
    op(name, (exact, 0xFFFF, 2, _), dests=0, srcs=0,
       can_eliminate=False, schedule_class="barrier")
424
# Arguments after the name are (a, b, c) as taken by memory_barrier()
memory_barrier("memory_barrier", 1, 2, 9)

# TODO: Not clear what these individually are. Some might be cache flushes?
memory_barrier("image_barrier_1", 2, 2, 10)
memory_barrier("image_barrier_2", 3, 2, 10)
memory_barrier("image_barrier_3", 2, 1, 10)
memory_barrier("image_barrier_4", 3, 1, 10)

memory_barrier("flush_memory_to_texture", 0, 0, 4)

memory_barrier("memory_barrier_2", 2, 2, 9)
memory_barrier("memory_barrier_3", 2, 1, 9)
memory_barrier("unknown_barrier_1", 0, 3, 3)
memory_barrier("unknown_barrier_2", 0, 3, 0)
439
op("doorbell", (0x60020 | (0x28 << 32), (1 << 48) - 1, 6, _),
   dests=0, imms=[IMM],
   can_eliminate=False, can_reorder=False)

op("stack_unmap", (0x00075, (1 << 24) - 1, 8, _),
   dests=1, srcs=0, imms=[IMM],
   can_eliminate=False, can_reorder=False)
op("stack_map", (0x10075, (1 << 24) - 1, 8, _),
   dests=0, srcs=1, imms=[IMM],
   can_eliminate=False, can_reorder=False)

op("stack_adjust",
   encoding_32=(0x10100b5, (1 << 26) - 1, 8, _),
   dests=0, srcs=0, imms=[STACK_SIZE],
   can_eliminate=False, can_reorder=False,
   schedule_class="store")

# source is offset
op("stack_load",
   encoding_32=(0x35, (1 << 20) - 1, 6, 8),
   srcs=1, imms=[FORMAT, MASK, SCOREBOARD],
   can_reorder=False,
   schedule_class="load")

# sources are value and offset
op("stack_store",
   encoding_32=(0xb5, (1 << 20) - 1, 6, 8),
   dests=0, srcs=2, imms=[FORMAT, MASK, SCOREBOARD],
   can_eliminate=False,
   schedule_class="store")
462
# Convenient aliases.
op("mov", _, srcs=1)
op("not", _, srcs=1)

# Ops with a variable number of sources or destinations
op("collect", _, srcs=VARIABLE)
op("split", _, srcs=1, dests=VARIABLE)
op("phi", _, srcs=VARIABLE, schedule_class="preload")

op("unit_test", _, dests=0, srcs=1, can_eliminate=False)

# Like mov, but takes a register and can only appear at the start. Guaranteed
# to be coalesced during RA, rather than lowered to a real move.
op("preload", _, srcs=1, schedule_class="preload")

# Pseudo-instructions to set the nesting counter. Lowers to r0l writes after RA.
op("begin_cf", _, dests=0, can_eliminate=False)
op("break", _,
   dests=0, imms=[NEST, TARGET],
   can_eliminate=False, schedule_class="invalid")

for name, is_float in (("break_if_icmp", False), ("break_if_fcmp", True)):
    op(name, _,
       dests=0, srcs=2,
       imms=[NEST, INVERT_COND, FCOND if is_float else ICOND, TARGET],
       can_eliminate=False, schedule_class="invalid")
486