"""
Copyright 2021 Alyssa Rosenzweig
# SPDX-License-Identifier: MIT
"""

# Registries filled in by op(), immediate() and enum(), keyed by name.
opcodes = {}
immediates = {}
enums = {}

# Sentinel passed as `dests` or `srcs` to request a variable operand count.
VARIABLE = ~0

class Opcode(object):
    """Record describing one opcode, as registered through op()."""

    def __init__(self, name, dests, srcs, imms, is_float, can_eliminate,
                 can_reorder, schedule_class, encoding):
        self.name = name
        # A VARIABLE count is stored as 0; the variable_* flags record that
        # the real count is not fixed.
        self.dests = dests if dests != VARIABLE else 0
        self.srcs = srcs if srcs != VARIABLE else 0
        self.variable_srcs = (srcs == VARIABLE)
        self.variable_dests = (dests == VARIABLE)
        self.imms = imms
        self.is_float = is_float
        self.can_eliminate = can_eliminate
        self.can_reorder = can_reorder
        self.schedule_class = schedule_class
        self.encoding = encoding

class Immediate(object):
    """A named immediate together with the type string given for it."""

    def __init__(self, name, ctype):
        self.name = name
        self.ctype = ctype

class Encoding(object):
    """Encoding of an opcode, built from a 4-tuple description.

    The description is (exact, mask, length_short, length_long). A
    length_long of None means there is only one length, equal to
    length_short.
    """

    def __init__(self, description):
        (exact, mask, length_short, length_long) = description

        # Convenience
        if length_long is None:
            length_long = length_short

        self.exact = exact
        self.mask = mask
        self.length_short = length_short
        self.extensible = length_short != length_long

        # When two lengths are given, the long one must exceed the short one
        # by 4 if the short length is above 8, and by 2 otherwise.
        if self.extensible:
            assert(length_long == length_short + (4 if length_short > 8 else 2))

def op(name, encoding, dests = 1, srcs = 0, imms = None, is_float = False,
       can_eliminate = True, can_reorder = True, schedule_class = "none"):
    """Register an opcode in the `opcodes` table under `name`.

    `encoding` is either None (no encoding) or a 4-tuple accepted by
    Encoding. `dests` and `srcs` are operand counts, or VARIABLE. `imms` is
    the list of Immediate objects the opcode takes; when omitted, the opcode
    gets its own empty list. The remaining arguments are stored on the
    Opcode unchanged.
    """
    # Allocate a new list per call. A literal [] default would be created
    # once and shared by every opcode registered without immediates.
    if imms is None:
        imms = []

    encoding = Encoding(encoding) if encoding is not None else None

    opcodes[name] = Opcode(name, dests, srcs, imms, is_float, can_eliminate,
                           can_reorder, schedule_class, encoding)

def immediate(name, ctype = "uint32_t"):
    """Create an Immediate, record it in `immediates`, and return it."""
    imm = Immediate(name, ctype)
    immediates[name] = imm
    return imm

def enum(name, value_dict):
    """Record `value_dict` in `enums` and return an immediate for it.

    The returned immediate has the type string "enum agx_<name>".
    """
    enums[name] = value_dict
    return immediate(name, "enum agx_" + name)

# Bit 15, OR'd into the exact/mask values of encodings.
# NOTE(review): presumably the bit selecting the long form -- confirm.
L = (1 << 15)

# Shorthand for None, used for absent encodings and absent long lengths.
_ = None

FORMAT = immediate("format", "enum agx_format")
IMM = immediate("imm", "uint64_t")
WRITEOUT = immediate("writeout")
INDEX = immediate("index")
COMPONENT = immediate("component")
CHANNELS = immediate("channels")
TRUTH_TABLE = immediate("truth_table")
ROUND = immediate("round", "enum agx_round")
SHIFT = immediate("shift")
MASK = immediate("mask")
BFI_MASK = immediate("bfi_mask")
LOD_MODE = immediate("lod_mode", "enum agx_lod_mode")
PIXEL_OFFSET = immediate("pixel_offset")
STACK_SIZE = immediate("stack_size", 'int16_t')
EXPLICIT_COORDS = immediate("explicit_coords", "bool")

DIM = enum("dim", {
    0: '1d',
    1: '1d_array',
    2: '2d',
    3: '2d_array',
    4: '2d_ms',
    5: '3d',
    6: 'cube',
    7: 'cube_array',
    8: '2d_ms_array',
})

GATHER = enum("gather", {
    0b000: "none",
    0b001: "r",
    0b011: "g",
    0b101: "b",
    0b111: "a",
})

OFFSET = immediate("offset", "bool")
SHADOW = immediate("shadow", "bool")
QUERY_LOD = immediate("query_lod", "bool")
COHERENT = immediate("coherent", "bool")
SCOREBOARD = immediate("scoreboard")
ICOND = immediate("icond", "enum agx_icond")
FCOND = immediate("fcond", "enum agx_fcond")
NEST = immediate("nest")
INVERT_COND = immediate("invert_cond")
TARGET = immediate("target", "agx_block *")
ZS = immediate("zs")
SR = enum("sr", {
    0: 'threadgroup_position_in_grid.x',
    1: 'threadgroup_position_in_grid.y',
    2: 'threadgroup_position_in_grid.z',
    4: 'threads_per_threadgroup.x',
    5: 'threads_per_threadgroup.y',
    6: 'threads_per_threadgroup.z',
    8: 'dispatch_threads_per_threadgroup.x',
    9: 'dispatch_threads_per_threadgroup.y',
    10: 'dispatch_threads_per_threadgroup.z',
    14: 'samples_log2',
    20: 'core_id',
    21: 'vm_slot',
    48: 'thread_position_in_threadgroup.x',
    49: 'thread_position_in_threadgroup.y',
    50: 'thread_position_in_threadgroup.z',
    51: 'thread_index_in_threadgroup',
    52: 'thread_index_in_subgroup',
    53: 'subgroup_index_in_threadgroup',
    56: 'active_thread_index_in_quad',
    57: 'total_active_threads_in_quad',
    58: 'active_thread_index_in_subgroup',
    59: 'total_active_threads_in_subgroup',
    60: 'coverage_mask',
    62: 'backfacing',
    63: 'is_active_thread',
    80: 'thread_position_in_grid.x',
    81: 'thread_position_in_grid.y',
    82: 'thread_position_in_grid.z',
    124: 'input_sample_mask',
    144: 'helper_op',
    146: 'helper_arg_l',
    147: 'helper_arg_h',
})

ATOMIC_OPC = enum("atomic_opc", {
    0: 'add',
    1: 'sub',
    2: 'xchg',
    3: 'cmpxchg',
    4: 'umin',
    5: 'imin',
    6: 'umax',
    7: 'imax',
    8: 'and',
    9: 'or',
    10: 'xor',
})

INTERPOLATION = enum("interpolation", {
    0: 'center',
    1: 'sample',
    2: 'centroid',
    # We translate sample -> sample_register at pack time for simplicity
    3: 'sample_register',
})

SIMD_OP = enum("simd_op", {
    0b00000: 'and',
    0b00001: 'fadd',
    0b00010: 'or',
    0b00011: 'fmul',
    0b00100: 'xor',
    0b00101: 'fmin',
    0b00111: 'fmax',
    0b10000: 'iadd',
    0b10100: 'smin',
    0b10110: 'smax',
    0b11100: 'umin',
    0b11110: 'umax',
})

# Places a float unary opcode number at bit 28 of the encoding.
FUNOP = lambda x: (x << 28)
# Mask covering the 14 bits starting at bit 28.
FUNOP_MASK = FUNOP((1 << 14) - 1)

def funop(name, opcode, schedule_class = "none"):
    """Register a one-source float op whose `opcode` is placed at bit 28.

    The exact value is 0x0A combined with the shifted opcode, and the mask is
    0x3F combined with FUNOP_MASK. The lengths given are 4 and 6.
    """
    op(name, (0x0A | FUNOP(opcode),
              0x3F | FUNOP_MASK, 4, 6),
       srcs = 1, is_float = True, schedule_class = schedule_class)

def iunop(name, opcode):
    """Register a one-source op whose `opcode` (< 4) is placed at bit 26."""
    assert(opcode < 4)
    op(name, (0x3E | (opcode << 26),
              0x7F | L | (((1 << 14) - 1) << 26),
              6, _),
       srcs = 1)

# Listing of opcodes
funop("floor", 0b000000)
funop("srsqrt", 0b000001)
funop("dfdx", 0b000100, schedule_class = "coverage")
funop("dfdy", 0b000110, schedule_class = "coverage")
funop("rcp", 0b001000)
funop("rsqrt", 0b001001)
funop("sin_pt_1", 0b001010)
funop("log2", 0b001100)
funop("exp2", 0b001101)
funop("sin_pt_2", 0b001110)
funop("ceil", 0b010000)
funop("trunc", 0b100000)
funop("roundeven", 0b110000)

iunop("bitrev", 0b01)
iunop("popcount", 0b10)
iunop("ffs", 0b11)

op("fadd",
   encoding = (0x2A, 0x3F, 4, 6),
   srcs = 2, is_float = True)

op("ffma",
   encoding = (0x3A, 0x3F, 6, 8),
   srcs = 3, is_float = True)

op("fmul",
   encoding = (0x1A, 0x3F, 4, 6),
   srcs = 2, is_float = True)

op("hadd",
   encoding = (0x26, 0x3F, 4, 6),
   srcs = 2, is_float = True)

op("hfma",
   encoding = (0x36, 0x3F, 6, 8),
   srcs = 3, is_float = True)

op("hmul",
   encoding = (0x16, 0x3F, 4, 6),
   srcs = 2, is_float = True)

op("mov_imm",
   encoding = (0x62, 0xFF, 6, 8),
   imms = [IMM])

op("iadd",
   encoding = (0x0E, 0x3F | L, 8, _),
   srcs = 2, imms = [SHIFT])

op("imad",
   encoding = (0x1E, 0x3F | L, 8, _),
   srcs = 3, imms = [SHIFT])

op("bfi",
   encoding = (0x2E, 0x7F | (0x3 << 26), 8, _),
   srcs = 3, imms = [BFI_MASK])

op("bfeil",
   encoding = (0x2E | L, 0x7F | L | (0x3 << 26), 8, _),
   srcs = 3, imms = [BFI_MASK])

op("extr",
   encoding = (0x2E | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
   srcs = 3, imms = [BFI_MASK])

op("asr",
   encoding = (0x2E | L | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
   srcs = 2)

# Registers a one-source op taking a SIMD_OP immediate; `opc` is placed at
# bit 29 of the encoding.
def subgroup_op(name, opc):
    exact = 0b01101111 | L | (opc << 29)
    exact_mask = 0b11111111 | L | (0x3 << 29)

    op(name, encoding = (exact, exact_mask, 6, _), srcs = 1, imms = [SIMD_OP])

subgroup_op("quad_reduce", 0x0)
subgroup_op("simd_reduce", 0x1)
subgroup_op("quad_prefix", 0x2)
subgroup_op("simd_prefix", 0x3)

# Each op below is registered twice: with a "quad_" prefix (w_bit = 0) and
# with no prefix (w_bit = 1).
for window, w_bit in [('quad_', 0), ('', 1)]:
    for s, shuffle in enumerate(['', '_xor', '_up', '_down']):
        op(f"{window}shuffle{shuffle}",
           encoding = (0b01101111 | (w_bit << 26) | (s << 38),
                       0xFF | L | (1 << 47) | (3 << 38) | (3 << 26), 6, _),
           srcs = 2)

    # Pseudo-instruction balloting a boolean
    op(f"{window}ballot", _, srcs = 1)

    for T, T_bit, cond in [('f', 0, FCOND), ('i', 1, ICOND)]:
        op(f"{T}cmp_{window}ballot",
           encoding = (0b0100010 | (T_bit << 4) | (w_bit << 48), 0, 8, _),
           srcs = 2, imms = [cond, INVERT_COND])

op("icmpsel",
   encoding = (0x12, 0x7F, 8, 10),
   srcs = 4, imms = [ICOND])

op("fcmpsel",
   encoding = (0x02, 0x7F, 8, 10),
   srcs = 4, imms = [FCOND])

# Pseudo-instructions for compares returning 1/0
op("icmp", _, srcs = 2, imms = [ICOND, INVERT_COND])
op("fcmp", _, srcs = 2, imms = [FCOND, INVERT_COND])

# sources are coordinates, LOD, texture bindless base (zero for texture state
# registers), texture, sampler, shadow/offset
# TODO: anything else?
op("texture_sample",
   encoding = (0x31, 0x7F, 8, 10), # XXX WRONG SIZE
   srcs = 6, imms = [DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET, SHADOW,
                     QUERY_LOD, GATHER])
# texture_load may be reordered; image_load may not, and additionally takes
# the COHERENT immediate.
for memory, can_reorder in [("texture", True), ("image", False)]:
    coherency = [COHERENT] if not can_reorder else []
    op(f"{memory}_load", encoding = (0x71, 0x7F, 8, 10), # XXX WRONG SIZE
       srcs = 6, imms = [DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET] + coherency,
       can_reorder = can_reorder,
       schedule_class = "none" if can_reorder else "load")

# sources are base, index
op("device_load",
   encoding = (0x05, 0x7F, 6, 8),
   srcs = 2, imms = [FORMAT, MASK, SHIFT, SCOREBOARD, COHERENT], can_reorder = False,
   schedule_class = "load")

# sources are base (relative to workgroup memory), index
op("local_load",
   encoding = (0b1101001, 0, 6, 8),
   srcs = 2, imms = [FORMAT, MASK], can_reorder = False,
   schedule_class = "load")

# sources are value, base, index
# TODO: Consider permitting the short form
op("device_store",
   encoding = (0x45 | (1 << 47), 0, 8, _),
   dests = 0, srcs = 3, imms = [FORMAT, MASK, SHIFT, SCOREBOARD, COHERENT], can_eliminate = False,
   schedule_class = "store")

# sources are value, base, index
op("local_store",
   encoding = (0b0101001, 0, 6, 8),
   dests = 0, srcs = 3, imms = [FORMAT, MASK],
   can_eliminate=False, schedule_class = "store")

# sources are value, index
# TODO: Consider permitting the short form
op("uniform_store",
   encoding = ((0b111 << 27) | 0b1000101 | (1 << 47), 0, 8, _),
   dests = 0, srcs = 2, imms = [MASK], can_eliminate = False)

# sources are value, base, index
op("atomic",
   encoding = (0x15 | (1 << 26) | (1 << 31) | (5 << 44), 0x3F | (1 << 26) | (1 << 31) | (5 << 44), 8, _),
   dests = 1, srcs = 3, imms = [ATOMIC_OPC, SCOREBOARD],
   can_eliminate = False, schedule_class = "atomic")

# XXX: stop hardcoding the long form
op("local_atomic",
   encoding = (0x19 | (1 << 15) | (1 << 36) | (1 << 47), 0x3F | (1 << 36) | (1 << 47), 10, _),
   dests = 1, srcs = 3, imms = [ATOMIC_OPC], schedule_class = "atomic",
   can_eliminate = False)

op("wait", (0x38, 0xFF, 2, _), dests = 0,
   can_eliminate = False, imms = [SCOREBOARD], schedule_class = "invalid")

# Three variants of get_sr sharing one encoding; only the plain one (schedule
# class "none") may be reordered.
for (suffix, schedule_class) in [("", "none"), ("_coverage", "coverage"), ("_barrier", "barrier")]:
    op(f"get_sr{suffix}", (0x72, 0x7F | L, 4, _), dests = 1, imms = [SR],
       schedule_class = schedule_class, can_reorder = schedule_class == "none")

op("sample_mask", (0x7fc1, 0xffff, 6, _), dests = 0, srcs = 2,
   can_eliminate = False, schedule_class = "coverage")

# Sources: sample mask, combined depth/stencil
op("zs_emit", (0x41, 0xFF | L, 4, _), dests = 0, srcs = 2,
   can_eliminate = False, imms = [ZS], schedule_class = "coverage")

# Sources: sample mask, explicit coords (if present)
op("ld_tile", (0x49, 0x7F, 8, _), dests = 1, srcs = 2,
   imms = [FORMAT, MASK, PIXEL_OFFSET, EXPLICIT_COORDS], can_reorder = False,
   schedule_class = "coverage")

# Sources: value, sample mask, explicit coords (if present)
op("st_tile", (0x09, 0x7F, 8, _), dests = 0, srcs = 3,
   can_eliminate = False, imms = [FORMAT, MASK, PIXEL_OFFSET, EXPLICIT_COORDS],
   schedule_class = "coverage")

# NOTE(review): "none" and "none_after" are given the same exact value
# (0xC020) -- confirm this is intentional.
for (name, exact) in [("any", 0xC000), ("none", 0xC020), ("none_after", 0xC020)]:
    op("jmp_exec_" + name, (exact, (1 << 16) - 1, 6, _), dests = 0, srcs = 0,
       can_eliminate = False, schedule_class = "invalid", imms = [TARGET])

# TODO: model implicit r0l destinations
op("pop_exec", (0x52 | (0x3 << 9), ((1 << 48) - 1) ^ (0x3 << 7) ^ (0x3 << 11), 6, _),
   dests = 0, srcs = 0, can_eliminate = False, schedule_class = "invalid",
   imms = [NEST])

# Registers {if,else,while}_{i,f}cmp. The integer variants set bit 4 (0x10)
# in the exact value and add the bits at 26 and 38 to the mask.
for is_float in [False, True]:
    mod_mask = 0 if is_float else (0x3 << 26) | (0x3 << 38)

    for (cf, cf_op) in [("if", 0), ("else", 1), ("while", 2)]:
        name = "{}_{}cmp".format(cf, "f" if is_float else "i")
        exact = 0x42 | (0x0 if is_float else 0x10) | (cf_op << 9)
        mask = 0x7F | (0x3 << 9) | mod_mask | (0x3 << 44)
        imms = [NEST, FCOND if is_float else ICOND, INVERT_COND, TARGET]

        op(name, (exact, mask, 6, _), dests = 0, srcs = 2, can_eliminate = False,
           imms = imms, is_float = is_float,
           schedule_class = "preload" if cf == "else" else "invalid")

op("bitop", (0x7E, 0x7F, 6, _), srcs = 2, imms = [TRUTH_TABLE])
op("intl", (0x3E, 0x7F, 6, _), srcs = 2, imms = [])
op("convert", (0x3E | L, 0x7F | L | (0x3 << 38), 6, _), srcs = 2, imms = [ROUND])

# Sources are the coefficient register and the sample index (if applicable)
op("iter", (0x21, 0xBF, 8, _), srcs = 2, imms = [CHANNELS, INTERPOLATION])

# Sources are the coefficient register for the varying, the coefficient register
# for W, and the sample index (if applicable)
op("iterproj", (0x21, 0xBF, 8, _), srcs = 3, imms = [CHANNELS, INTERPOLATION])

op("ldcf", (0xA1, 0xBF, 8, _), srcs = 1, imms = [CHANNELS])
op("st_vary", None, dests = 0, srcs = 2, can_eliminate = False)
op("no_varyings", (0x80000051, 0xFFFFFFFF, 4, _), dests = 0, can_eliminate = False)
op("stop", (0x88, 0xFFFF, 2, _), dests = 0, can_eliminate = False,
   schedule_class = "invalid")
op("trap", (0x08, 0xFFFF, 2, _), dests = 0, can_eliminate = False,
   schedule_class = "invalid")

# These are modelled as total barriers since they can guard global memory
# access too, and even need to be properly ordered with loads.
op("wait_pix", (0x48, 0xFF, 4, _), dests = 0, imms = [WRITEOUT],
   can_eliminate = False, schedule_class = "barrier")
op("signal_pix", (0x58, 0xFF, 4, _), dests = 0, imms = [WRITEOUT],
   can_eliminate = False, schedule_class = "barrier")

# Sources are the data vector, the coordinate vector, the LOD, the bindless
# table if present (zero for texture state registers), and texture index.
op("image_write", (0xF1 | (1 << 23), 0xFF, 6, 8), dests = 0, srcs = 5, imms
   = [DIM, COHERENT], can_eliminate = False, schedule_class = "store")

# Sources are the image base, image index, the offset within shared memory, and
# the coordinates (or just the layer if implicit).
# TODO: Do we need the short encoding?
op("block_image_store", (0xB1, 0xFF, 10, _), dests = 0, srcs = 4,
   imms = [FORMAT, DIM, EXPLICIT_COORDS], can_eliminate = False, schedule_class = "store")

# Barriers
op("threadgroup_barrier", (0x0068, 0xFFFF, 2, _), dests = 0, srcs = 0,
   can_eliminate = False, schedule_class = "barrier")

# Registers a barrier op with no sources or destinations. The three fields
# are placed at bits 10 (a), 8 (b) and 12 (c) of the exact value, on top of
# the base value 0xF5.
def memory_barrier(name, a, b, c):
    op(name, (0xF5 | (a << 10) | (b << 8) | (c << 12), 0xFFFF, 2, _), dests = 0, srcs = 0,
       can_eliminate = False, schedule_class = "barrier")

memory_barrier("memory_barrier", 1, 2, 9)

# TODO: Not clear what these individually are. Some might be cache flushes?
memory_barrier("image_barrier_1", 2, 2, 10)
memory_barrier("image_barrier_2", 3, 2, 10)
memory_barrier("image_barrier_3", 2, 1, 10)
memory_barrier("image_barrier_4", 3, 1, 10)

memory_barrier("flush_memory_to_texture", 0, 0, 4)

memory_barrier("memory_barrier_2", 2, 2, 9)
memory_barrier("memory_barrier_3", 2, 1, 9)
memory_barrier("unknown_barrier_1", 0, 3, 3)
memory_barrier("unknown_barrier_2", 0, 3, 0)

# Seen with device-scope memory barriers. Again not clear what's what.
memory_barrier("device_barrier_1", 3, 1, 9)
memory_barrier("device_barrier_2", 3, 2, 9)

op("doorbell", (0x60020 | 0x28 << 32, (1 << 48) - 1, 6, _), dests = 0,
   can_eliminate = False, can_reorder = False, imms = [IMM])

op("stack_unmap", (0x00075, (1 << 24) - 1, 8, _), dests = 1, srcs = 0, can_eliminate = False, can_reorder = False, imms = [IMM])
op("stack_map", (0x10075, (1 << 24) - 1, 8, _), dests = 0, srcs = 1, can_eliminate = False, can_reorder = False, imms = [IMM])

op("stack_adjust",
   encoding = (0x10100b5, (1 << 26) - 1, 8, _),
   dests = 0, srcs = 0, can_eliminate = False, can_reorder = False,
   imms = [STACK_SIZE], schedule_class = "store")

# source is offset
op("stack_load",
   encoding = (0x35, (1 << 20) - 1, 6, 8),
   srcs = 1, imms = [FORMAT, MASK, SCOREBOARD], can_reorder = False,
   schedule_class = "load")

# sources are value and offset
op("stack_store",
   encoding = (0xb5, (1 << 20) - 1, 6, 8),
   dests = 0, srcs = 2, imms = [FORMAT, MASK, SCOREBOARD],
   can_eliminate=False, schedule_class = "store")

# Convenient aliases. These have no encoding of their own.
op("mov", _, srcs = 1)
op("not", _, srcs = 1)
op("signext", _, srcs = 1)

# Ops with a variable number of sources or destinations.
op("collect", _, srcs = VARIABLE)
op("split", _, srcs = 1, dests = VARIABLE)
op("phi", _, srcs = VARIABLE, schedule_class = "preload")

# The srcs double as destinations. Only deals in registers. This is generated by
# parallel copy lowering and lowered soon after. We need this as a dedicated
# instruction only for RA validation.
op("swap", _, dests = 0, srcs = 2)

op("unit_test", _, dests = 0, srcs = 1, can_eliminate = False)

# Like mov, but takes a register and can only appear at the start. Guaranteed
# to be coalesced during RA, rather than lowered to a real move.
op("preload", _, srcs = 1, schedule_class = "preload")

# Opposite of preload. Exports a scalar value to a particular register at the
# end of the shader part. Must only appear after the logical end of the exit
# block; this avoids special casing the source's liveness. Logically all exports
# happen in parallel at the end of the shader part.
op("export", _, dests = 0, srcs = 1, imms = [IMM], can_eliminate = False,
   schedule_class = "invalid")

# Pseudo-instructions to set the nesting counter. Lowers to r0l writes after RA.
op("begin_cf", _, dests = 0, can_eliminate = False)
op("break", _, dests = 0, imms = [NEST, TARGET], can_eliminate = False,
   schedule_class = "invalid")

# Conditional breaks: the integer variant takes ICOND, the float variant FCOND.
for (name, is_float) in [("break_if_icmp", False), ("break_if_fcmp", True)]:
    op(name, _, dests = 0, srcs = 2,
       imms = [NEST, INVERT_COND, FCOND if is_float else ICOND, TARGET],
       can_eliminate = False, schedule_class = "invalid")