• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""
2Copyright 2021 Alyssa Rosenzweig
3# SPDX-License-Identifier: MIT
4"""
5
# Registries keyed by name, filled in by op(), immediate() and enum()
# respectively.
opcodes = {}
immediates = {}
enums = {}

# Sentinel accepted for the dests/srcs arguments of op(): the count is variable
# rather than fixed. Opcode stores such a count as 0 and sets the matching
# variable_dests/variable_srcs flag.
VARIABLE = ~0
11
class Opcode:
   """Record describing one instruction.

   dests and srcs are counts. Passing VARIABLE for either stores a count of 0
   and sets variable_dests / variable_srcs instead. All other arguments are
   stored unchanged under attributes of the same name.
   """

   def __init__(self, name, dests, srcs, imms, is_float, can_eliminate,
                can_reorder, schedule_class, encoding):
      has_variable_dests = (dests == VARIABLE)
      has_variable_srcs = (srcs == VARIABLE)

      self.name = name
      self.dests = 0 if has_variable_dests else dests
      self.srcs = 0 if has_variable_srcs else srcs
      self.variable_srcs = has_variable_srcs
      self.variable_dests = has_variable_dests
      self.imms = imms
      self.is_float = is_float
      self.can_eliminate = can_eliminate
      self.can_reorder = can_reorder
      self.schedule_class = schedule_class
      self.encoding = encoding
26
class Immediate:
   """A named immediate together with the C type string used for it."""

   def __init__(self, name, ctype):
      self.name, self.ctype = name, ctype
31
class Encoding:
   """Encoding of an instruction, built from a 4-tuple description.

   description is (exact, mask, length_short, length_long). A length_long of
   None means there is no distinct long form and is treated as equal to
   length_short.

   Attributes:
      exact, mask: stored as given.
      length_short: the short length, as given.
      extensible: True when the long length differs from the short one.

   Raises:
      ValueError: if the encoding is extensible but the long length is not
         length_short + 2 (short lengths up to 8) or length_short + 4 (short
         lengths above 8).
   """

   def __init__(self, description):
      (exact, mask, length_short, length_long) = description

      # Convenience
      if length_long is None:
         length_long = length_short

      self.exact = exact
      self.mask = mask
      self.length_short = length_short
      self.extensible = length_short != length_long

      if self.extensible:
         # Raise explicitly instead of using assert, so that the check still
         # runs when Python is started with -O.
         expected_long = length_short + (4 if length_short > 8 else 2)
         if length_long != expected_long:
            raise ValueError(
               "long length {} does not match short length {} (expected {})"
               .format(length_long, length_short, expected_long))
47
def op(name, encoding, dests = 1, srcs = 0, imms = None, is_float = False,
        can_eliminate = True, can_reorder = True, schedule_class = "none"):
   """Register an opcode in the global `opcodes` table under `name`.

   encoding is either None (no encoding) or a 4-tuple accepted by Encoding.
   dests and srcs are counts, or VARIABLE. imms is a list of immediates; when
   omitted, the opcode gets its own empty list. The remaining arguments are
   forwarded to Opcode unchanged.
   """
   # Use a fresh list per call: a `[]` default would be a single list object
   # shared by every opcode registered without immediates.
   if imms is None:
      imms = []

   encoding = Encoding(encoding) if encoding is not None else None

   opcodes[name] = Opcode(name, dests, srcs, imms, is_float, can_eliminate,
                          can_reorder, schedule_class, encoding)
54
def immediate(name, ctype = "uint32_t"):
   """Create an Immediate, record it in `immediates` under `name`, return it."""
   immediates[name] = Immediate(name, ctype)
   return immediates[name]
59
def enum(name, value_dict):
   """Record value_dict in `enums` and declare the matching immediate.

   The immediate is registered under the same name with the C type
   "enum agx_<name>" and is returned.
   """
   enums[name] = value_dict
   ctype = "enum agx_" + name
   return immediate(name, ctype)
63
# Bit 15, OR'd into several exact/mask values below.
# NOTE(review): presumably the bit that selects the long encoding -- confirm.
L = (1 << 15)
# Shorthand for None: "no encoding" when passed to op(), and "no distinct long
# length" in the last slot of an encoding tuple.
_ = None

# Immediates. The C type is uint32_t unless given explicitly.
FORMAT = immediate("format", "enum agx_format")
IMM = immediate("imm", "uint64_t")
WRITEOUT = immediate("writeout")
INDEX = immediate("index")
COMPONENT = immediate("component")
CHANNELS = immediate("channels")
TRUTH_TABLE = immediate("truth_table")
ROUND = immediate("round", "enum agx_round")
SHIFT = immediate("shift")
MASK = immediate("mask")
BFI_MASK = immediate("bfi_mask")
LOD_MODE = immediate("lod_mode", "enum agx_lod_mode")
PIXEL_OFFSET = immediate("pixel_offset")
STACK_SIZE = immediate("stack_size", 'int16_t')
EXPLICIT_COORDS = immediate("explicit_coords", "bool")
82
# Values of the "dim" immediate, mapping numeric value to name.
DIM = enum("dim", {
    0: '1d',
    1: '1d_array',
    2: '2d',
    3: '2d_array',
    4: '2d_ms',
    5: '3d',
    6: 'cube',
    7: 'cube_array',
    8: '2d_ms_array',
})

# Values of the "gather" immediate. Only 0 and the odd values are named.
GATHER = enum("gather", {
	0b000: "none",
	0b001: "r",
	0b011: "g",
	0b101: "b",
	0b111: "a",
})
102
# More immediates. The C type is uint32_t unless given explicitly. Each name is
# declared exactly once.
OFFSET = immediate("offset", "bool")
SHADOW = immediate("shadow", "bool")
QUERY_LOD = immediate("query_lod", "bool")
COHERENT = immediate("coherent", "bool")
SCOREBOARD = immediate("scoreboard")
ICOND = immediate("icond", "enum agx_icond")
FCOND = immediate("fcond", "enum agx_fcond")
NEST = immediate("nest")
INVERT_COND = immediate("invert_cond")
TARGET = immediate("target", "agx_block *")
ZS = immediate("zs")
# Values of the "sr" immediate (taken by the get_sr* opcodes), mapping numeric
# value to name. The numbering is sparse.
SR = enum("sr", {
   0:  'threadgroup_position_in_grid.x',
   1:  'threadgroup_position_in_grid.y',
   2:  'threadgroup_position_in_grid.z',
   4:  'threads_per_threadgroup.x',
   5:  'threads_per_threadgroup.y',
   6:  'threads_per_threadgroup.z',
   8:  'dispatch_threads_per_threadgroup.x',
   9:  'dispatch_threads_per_threadgroup.y',
   10: 'dispatch_threads_per_threadgroup.z',
   14: 'samples_log2',
   20: 'core_id',
   21: 'vm_slot',
   48: 'thread_position_in_threadgroup.x',
   49: 'thread_position_in_threadgroup.y',
   50: 'thread_position_in_threadgroup.z',
   51: 'thread_index_in_threadgroup',
   52: 'thread_index_in_subgroup',
   53: 'subgroup_index_in_threadgroup',
   56: 'active_thread_index_in_quad',
   57: 'total_active_threads_in_quad',
   58: 'active_thread_index_in_subgroup',
   59: 'total_active_threads_in_subgroup',
   60: 'coverage_mask',
   62: 'backfacing',
   63: 'is_active_thread',
   80: 'thread_position_in_grid.x',
   81: 'thread_position_in_grid.y',
   82: 'thread_position_in_grid.z',
   124: 'input_sample_mask',
   144: 'helper_op',
   146: 'helper_arg_l',
   147: 'helper_arg_h',
})
149
# Values of the "atomic_opc" immediate (taken by atomic and local_atomic).
ATOMIC_OPC = enum("atomic_opc", {
	0: 'add',
	1: 'sub',
	2: 'xchg',
	3: 'cmpxchg',
	4: 'umin',
	5: 'imin',
	6: 'umax',
	7: 'imax',
	8: 'and',
	9: 'or',
	10: 'xor',
})

# Values of the "interpolation" immediate (taken by iter and iterproj).
INTERPOLATION = enum("interpolation", {
    0: 'center',
    1: 'sample',
    2: 'centroid',
    # We translate sample -> sample_register at pack time for simplicity
    3: 'sample_register',
})

# Values of the "simd_op" immediate (taken by the quad/simd reduce and prefix
# opcodes).
SIMD_OP = enum("simd_op", {
    0b00000: 'and',
    0b00001: 'fadd',
    0b00010: 'or',
    0b00011: 'fmul',
    0b00100: 'xor',
    0b00101: 'fmin',
    0b00111: 'fmax',
    0b10000: 'iadd',
    0b10100: 'smin',
    0b10110: 'smax',
    0b11100: 'umin',
    0b11110: 'umax',
})
186
def FUNOP(x):
   """Return x shifted left by 28 bits."""
   return x << 28

# FUNOP applied to a 14-bit all-ones value.
FUNOP_MASK = FUNOP(0x3FFF)
189
def funop(name, opcode, schedule_class = "none"):
   """Register a one-source float opcode.

   The encoding has 0x0A in the low bits (mask 0x3F) and `opcode` in a 14-bit
   field starting at bit 28. Lengths are 4 (short) and 6 (long).
   """
   field_mask = (1 << 14) - 1
   exact = 0x0A | (opcode << 28)
   mask = 0x3F | (field_mask << 28)

   op(name, (exact, mask, 4, 6), srcs = 1, is_float = True,
      schedule_class = schedule_class)
194
def iunop(name, opcode):
    """Register a one-source opcode on base 0x3E with `opcode` at bit 26.

    opcode must be in the range 0..3. The encoding has a single length of 6.

    Raises:
        ValueError: if opcode is outside 0..3.
    """
    # Raise explicitly instead of using assert, so that the check still runs
    # under -O; also reject negative values, which would corrupt the encoding.
    if not 0 <= opcode < 4:
        raise ValueError("iunop opcode out of range: {!r}".format(opcode))

    op(name, (0x3E | (opcode << 26),
              0x7F | L | (((1 << 14) - 1) << 26),
              6, _),
       srcs = 1)
201
# Listing of opcodes

# One-source float opcodes; the second argument is the opcode field value
# passed to funop().
funop("floor",     0b000000)
funop("srsqrt",    0b000001)
funop("dfdx",      0b000100, schedule_class = "coverage")
funop("dfdy",      0b000110, schedule_class = "coverage")
funop("rcp",       0b001000)
funop("rsqrt",     0b001001)
funop("sin_pt_1",  0b001010)
funop("log2",      0b001100)
funop("exp2",      0b001101)
funop("sin_pt_2",  0b001110)
funop("ceil",      0b010000)
funop("trunc",     0b100000)
funop("roundeven", 0b110000)

# One-source opcodes registered through iunop().
iunop("bitrev",    0b01)
iunop("popcount",  0b10)
iunop("ffs",       0b11)
220
# Float arithmetic. Encoding tuples are (exact, mask, short length, long
# length).
op("fadd",
      encoding = (0x2A, 0x3F, 4, 6),
      srcs = 2, is_float = True)

op("ffma",
      encoding = (0x3A, 0x3F, 6, 8),
      srcs = 3, is_float = True)

op("fmul",
      encoding = (0x1A, 0x3F, 4, 6),
      srcs = 2, is_float = True)

op("hadd",
      encoding = (0x26, 0x3F, 4, 6),
      srcs = 2, is_float = True)

op("hfma",
      encoding = (0x36, 0x3F, 6, 8),
      srcs = 3, is_float = True)

op("hmul",
      encoding = (0x16, 0x3F, 4, 6),
      srcs = 2, is_float = True)

op("mov_imm",
      encoding = (0x62, 0xFF, 6, 8),
      imms = [IMM])

# Integer arithmetic and bitfield operations. These have a single length (no
# long form).
op("iadd",
      encoding = (0x0E, 0x3F | L, 8, _),
      srcs = 2, imms = [SHIFT])

op("imad",
      encoding = (0x1E, 0x3F | L, 8, _),
      srcs = 3, imms = [SHIFT])

op("bfi",
      encoding = (0x2E, 0x7F | (0x3 << 26), 8, _),
      srcs = 3, imms = [BFI_MASK])

op("bfeil",
      encoding = (0x2E | L, 0x7F | L | (0x3 << 26), 8, _),
      srcs = 3, imms = [BFI_MASK])

op("extr",
      encoding = (0x2E | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
      srcs = 3, imms = [BFI_MASK])

op("asr",
      encoding = (0x2E | L | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
      srcs = 2)
272
def subgroup_op(name, opc):
    """Register a one-source opcode taking a SIMD_OP immediate.

    `opc` is placed in a 2-bit field at bit 29, on top of the base pattern
    0b01101111 with L set. The encoding has a single length of 6.
    """
    base = 0b01101111 | L
    encoding = (base | (opc << 29), 0b11111111 | L | (0x3 << 29), 6, _)

    op(name, encoding = encoding, srcs = 1, imms = [SIMD_OP])
278
# Reductions and prefix operations, per quad and per SIMD-group.
subgroup_op("quad_reduce", 0x0)
subgroup_op("simd_reduce", 0x1)
subgroup_op("quad_prefix", 0x2)
subgroup_op("simd_prefix", 0x3)

# Each opcode below is registered twice: with a "quad_" name prefix (w_bit 0)
# and without a prefix (w_bit 1).
for window, w_bit in [('quad_', 0), ('', 1)]:
    # s, the index into the suffix list, is placed at bit 38 of the encoding.
    for s, shuffle in enumerate(['', '_xor', '_up', '_down']):
        op(f"{window}shuffle{shuffle}",
            encoding = (0b01101111 | (w_bit << 26) | (s << 38),
                           0xFF | L | (1 << 47) | (3 << 38) | (3 << 26), 6, _),
            srcs = 2)

    # Pseudo-instruction balloting a boolean
    op(f"{window}ballot", _, srcs = 1)

    # Float and integer compare-and-ballot variants; T_bit is placed at bit 4.
    for T, T_bit, cond in [('f', 0, FCOND), ('i', 1, ICOND)]:
        op(f"{T}cmp_{window}ballot",
           encoding = (0b0100010 | (T_bit << 4) | (w_bit << 48), 0, 8, _),
           srcs = 2, imms = [cond, INVERT_COND])
298
# Compare-and-select, integer and float.
op("icmpsel",
      encoding = (0x12, 0x7F, 8, 10),
      srcs = 4, imms = [ICOND])

op("fcmpsel",
      encoding = (0x02, 0x7F, 8, 10),
      srcs = 4, imms = [FCOND])

# Pseudo-instructions for compares returning 1/0
op("icmp", _, srcs = 2, imms = [ICOND, INVERT_COND])
op("fcmp", _, srcs = 2, imms = [FCOND, INVERT_COND])

# sources are coordinates, LOD, texture bindless base (zero for texture state
# registers), texture, sampler, shadow/offset
# TODO: anything else?
op("texture_sample",
      encoding = (0x31, 0x7F, 8, 10), # XXX WRONG SIZE
      srcs = 6, imms = [DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET, SHADOW,
                        QUERY_LOD, GATHER])
# texture_load and image_load share an encoding. Only image_load is
# non-reorderable; it additionally takes COHERENT and is scheduled as a load.
for memory, can_reorder in [("texture", True), ("image", False)]:
    coherency = [COHERENT] if not can_reorder else []
    op(f"{memory}_load", encoding = (0x71, 0x7F, 8, 10), # XXX WRONG SIZE
       srcs = 6, imms = [DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET] + coherency,
       can_reorder = can_reorder,
       schedule_class = "none" if can_reorder else "load")
324
# Memory loads, stores and atomics.

# sources are base, index
op("device_load",
      encoding = (0x05, 0x7F, 6, 8),
      srcs = 2, imms = [FORMAT, MASK, SHIFT, SCOREBOARD, COHERENT], can_reorder = False,
      schedule_class = "load")

# sources are base (relative to workgroup memory), index
op("local_load",
      encoding = (0b1101001, 0, 6, 8),
      srcs = 2, imms = [FORMAT, MASK], can_reorder = False,
      schedule_class = "load")

# sources are value, base, index
# TODO: Consider permitting the short form
op("device_store",
      encoding = (0x45 | (1 << 47), 0, 8, _),
      dests = 0, srcs = 3, imms = [FORMAT, MASK, SHIFT, SCOREBOARD, COHERENT], can_eliminate = False,
      schedule_class = "store")

# sources are value, base, index
op("local_store",
      encoding = (0b0101001, 0, 6, 8),
      dests = 0, srcs = 3, imms = [FORMAT, MASK],
      can_eliminate=False, schedule_class = "store")

# sources are value, index
# TODO: Consider permitting the short form
op("uniform_store",
      encoding = ((0b111 << 27) | 0b1000101 | (1 << 47), 0, 8, _),
      dests = 0, srcs = 2, imms = [MASK], can_eliminate = False)

# sources are value, base, index
op("atomic",
      encoding = (0x15 | (1 << 26) | (1 << 31) | (5 << 44), 0x3F | (1 << 26) | (1 << 31) | (5 << 44), 8, _),
      dests = 1, srcs = 3, imms = [ATOMIC_OPC, SCOREBOARD],
      can_eliminate = False, schedule_class = "atomic")

# XXX: stop hardcoding the long form
# NOTE(review): the exact value sets bit 15 but the mask does not cover it --
# confirm this is intended.
op("local_atomic",
      encoding = (0x19 | (1 << 15) | (1 << 36) | (1 << 47), 0x3F | (1 << 36) | (1 << 47), 10, _),
      dests = 1, srcs = 3, imms = [ATOMIC_OPC], schedule_class = "atomic",
      can_eliminate = False)
367
op("wait", (0x38, 0xFF, 2, _), dests = 0,
      can_eliminate = False, imms = [SCOREBOARD], schedule_class = "invalid")

# get_sr comes in three variants differing only in schedule class; only the
# plain one (class "none") may be reordered.
for (suffix, schedule_class) in [("", "none"), ("_coverage", "coverage"), ("_barrier", "barrier")]:
    op(f"get_sr{suffix}", (0x72, 0x7F | L, 4, _), dests = 1, imms = [SR],
       schedule_class = schedule_class, can_reorder = schedule_class == "none")

op("sample_mask", (0x7fc1, 0xffff, 6, _), dests = 0, srcs = 2,
    can_eliminate = False, schedule_class = "coverage")

# Sources: sample mask, combined depth/stencil
op("zs_emit", (0x41, 0xFF | L, 4, _), dests = 0, srcs = 2,
              can_eliminate = False, imms = [ZS], schedule_class = "coverage")

# Sources: sample mask, explicit coords (if present)
op("ld_tile", (0x49, 0x7F, 8, _), dests = 1, srcs = 2,
        imms = [FORMAT, MASK, PIXEL_OFFSET, EXPLICIT_COORDS], can_reorder = False,
        schedule_class = "coverage")

# Sources: value, sample mask, explicit coords (if present)
op("st_tile", (0x09, 0x7F, 8, _), dests = 0, srcs = 3,
      can_eliminate = False, imms = [FORMAT, MASK, PIXEL_OFFSET, EXPLICIT_COORDS],
      schedule_class = "coverage")

# NOTE(review): "none" and "none_after" are registered with the same exact
# value (0xC020) -- confirm this is intended.
for (name, exact) in [("any", 0xC000), ("none", 0xC020), ("none_after", 0xC020)]:
   op("jmp_exec_" + name, (exact, (1 << 16) - 1, 6, _), dests = 0, srcs = 0,
         can_eliminate = False, schedule_class = "invalid", imms = [TARGET])

# TODO: model implicit r0l destinations
op("pop_exec", (0x52 | (0x3 << 9), ((1 << 48) - 1) ^ (0x3 << 7) ^ (0x3 << 11), 6, _),
      dests = 0, srcs = 0, can_eliminate = False, schedule_class = "invalid",
      imms = [NEST])

# Registers {if,else,while}_{i,f}cmp. The integer forms set 0x10 in the exact
# value and add two extra 2-bit fields (bits 26 and 38) to the mask.
for is_float in [False, True]:
   mod_mask = 0 if is_float else (0x3 << 26) | (0x3 << 38)

   for (cf, cf_op) in [("if", 0), ("else", 1), ("while", 2)]:
      name = "{}_{}cmp".format(cf, "f" if is_float else "i")
      exact = 0x42 | (0x0 if is_float else 0x10) | (cf_op << 9)
      mask = 0x7F | (0x3 << 9) | mod_mask | (0x3 << 44)
      imms = [NEST, FCOND if is_float else ICOND, INVERT_COND, TARGET]

      op(name, (exact, mask, 6, _), dests = 0, srcs = 2, can_eliminate = False,
            imms = imms, is_float = is_float,
            schedule_class = "preload" if cf == "else" else "invalid")
413
op("bitop", (0x7E, 0x7F, 6, _), srcs = 2, imms = [TRUTH_TABLE])
op("intl", (0x3E, 0x7F, 6, _), srcs = 2, imms = [])
op("convert", (0x3E | L, 0x7F | L | (0x3 << 38), 6, _), srcs = 2, imms = [ROUND])

# Sources are the coefficient register and the sample index (if applicable)
op("iter", (0x21, 0xBF, 8, _), srcs = 2, imms = [CHANNELS, INTERPOLATION])

# Sources are the coefficient register for the varying, the coefficient register
# for W, and the sample index (if applicable)
# NOTE(review): registered with the same exact/mask pair as iter -- confirm
# this is intended.
op("iterproj", (0x21, 0xBF, 8, _), srcs = 3, imms = [CHANNELS, INTERPOLATION])

op("ldcf", (0xA1, 0xBF, 8, _), srcs = 1, imms = [CHANNELS])
op("st_vary", None, dests = 0, srcs = 2, can_eliminate = False)
op("no_varyings", (0x80000051, 0xFFFFFFFF, 4, _), dests = 0, can_eliminate = False)
op("stop", (0x88, 0xFFFF, 2, _), dests = 0, can_eliminate = False,
   schedule_class = "invalid")
op("trap", (0x08, 0xFFFF, 2, _), dests = 0, can_eliminate = False,
   schedule_class = "invalid")

# These are modelled as total barriers since they can guard global memory
# access too, and even need to be properly ordered with loads.
op("wait_pix", (0x48, 0xFF, 4, _), dests = 0, imms = [WRITEOUT],
   can_eliminate = False, schedule_class = "barrier")
op("signal_pix", (0x58, 0xFF, 4, _), dests = 0, imms = [WRITEOUT],
   can_eliminate = False, schedule_class = "barrier")

# Sources are the data vector, the coordinate vector, the LOD, the bindless
# table if present (zero for texture state registers), and texture index.
op("image_write", (0xF1 | (1 << 23), 0xFF, 6, 8), dests = 0, srcs = 5, imms
   = [DIM, COHERENT], can_eliminate = False, schedule_class = "store")

# Sources are the image base, image index, the offset within shared memory, and
# the coordinates (or just the layer if implicit).
# TODO: Do we need the short encoding?
op("block_image_store", (0xB1, 0xFF, 10, _), dests = 0, srcs = 4,
   imms = [FORMAT, DIM, EXPLICIT_COORDS], can_eliminate = False, schedule_class = "store")

# Barriers
op("threadgroup_barrier", (0x0068, 0xFFFF, 2, _), dests = 0, srcs = 0,
   can_eliminate = False, schedule_class = "barrier")
454
def memory_barrier(name, a, b, c):
    """Register a barrier opcode with no sources or destinations.

    The encoding is 0xF5 with `b` at bit 8, `a` at bit 10 and `c` at bit 12,
    matched against the low 16 bits, with a single length of 2.
    """
    exact = 0xF5 | (a << 10) | (b << 8) | (c << 12)

    op(name, (exact, 0xFFFF, 2, _), dests = 0, srcs = 0,
       can_eliminate = False, schedule_class = "barrier")
458
# Barrier variants; the three numbers are the a, b, c fields of
# memory_barrier().
memory_barrier("memory_barrier", 1, 2, 9)

# TODO: Not clear what these individually are. Some might be cache flushes?
memory_barrier("image_barrier_1", 2, 2, 10)
memory_barrier("image_barrier_2", 3, 2, 10)
memory_barrier("image_barrier_3", 2, 1, 10)
memory_barrier("image_barrier_4", 3, 1, 10)

memory_barrier("flush_memory_to_texture", 0, 0, 4)

memory_barrier("memory_barrier_2", 2, 2, 9)
memory_barrier("memory_barrier_3", 2, 1, 9)
memory_barrier("unknown_barrier_1", 0, 3, 3)
memory_barrier("unknown_barrier_2", 0, 3, 0)

# Seen with device-scope memory barriers. Again not clear what's what.
memory_barrier("device_barrier_1", 3, 1, 9)
memory_barrier("device_barrier_2", 3, 2, 9)
477
op("doorbell", (0x60020 | 0x28 << 32, (1 << 48) - 1, 6, _), dests = 0,
      can_eliminate = False, can_reorder = False, imms = [IMM])

# Stack mapping: stack_unmap has one destination and no sources, stack_map has
# one source and no destinations. They differ only in bit 16 of the exact
# value.
op("stack_unmap", (0x00075, (1 << 24) - 1, 8, _), dests = 1, srcs = 0, can_eliminate = False, can_reorder = False, imms = [IMM])
op("stack_map",   (0x10075, (1 << 24) - 1, 8, _), dests = 0, srcs = 1, can_eliminate = False, can_reorder = False, imms = [IMM])

op("stack_adjust",
      encoding = (0x10100b5, (1 << 26) - 1, 8, _),
      dests = 0, srcs = 0, can_eliminate = False, can_reorder = False,
      imms = [STACK_SIZE], schedule_class = "store")

# source is offset
op("stack_load",
      encoding = (0x35, (1 << 20) - 1, 6, 8),
      srcs = 1, imms = [FORMAT, MASK, SCOREBOARD], can_reorder = False,
      schedule_class = "load")

# sources are value and offset
op("stack_store",
      encoding = (0xb5, (1 << 20) - 1, 6, 8),
      dests = 0, srcs = 2, imms = [FORMAT, MASK, SCOREBOARD],
      can_eliminate=False, schedule_class = "store")
500
# Everything from here on is registered without an encoding (the encoding
# argument is None).

# Convenient aliases.
op("mov", _, srcs = 1)
op("not", _, srcs = 1)
op("signext", _, srcs = 1)

# VARIABLE marks a non-fixed number of sources or destinations.
op("collect", _, srcs = VARIABLE)
op("split", _, srcs = 1, dests = VARIABLE)
op("phi", _, srcs = VARIABLE, schedule_class = "preload")

# The srcs double as destinations. Only deals in registers. This is generated by
# parallel copy lowering and lowered soon after. We need this as a dedicated
# instruction only for RA validation.
op("swap", _, dests = 0, srcs = 2)

op("unit_test", _, dests = 0, srcs = 1, can_eliminate = False)

# Like mov, but takes a register and can only appear at the start. Guaranteed
# to be coalesced during RA, rather than lowered to a real move.
op("preload", _, srcs = 1, schedule_class = "preload")

# Opposite of preload. Exports a scalar value to a particular register at the
# end of the shader part. Must only appear after the logical end of the exit
# block, this avoids special casing the source's liveness. Logically all exports
# happen in parallel at the end of the shader part.
op("export", _, dests = 0, srcs = 1, imms = [IMM], can_eliminate = False,
   schedule_class = "invalid")

# Pseudo-instructions to set the nesting counter. Lowers to r0l writes after RA.
op("begin_cf", _, dests = 0, can_eliminate = False)
op("break", _, dests = 0, imms = [NEST, TARGET], can_eliminate = False,
   schedule_class = "invalid")

# Conditional breaks, one per compare type; the condition immediate is ICOND
# for the integer form and FCOND for the float form.
for (name, is_float) in [("break_if_icmp", False), ("break_if_fcmp", True)]:
    op(name, _, dests = 0, srcs = 2,
       imms = [NEST, INVERT_COND, FCOND if is_float else ICOND, TARGET],
       can_eliminate = False, schedule_class = "invalid")
537