"""
Copyright 2021 Alyssa Rosenzweig
# SPDX-License-Identifier: MIT
"""

# Registries populated as a side effect of the op(), immediate() and enum()
# calls below. Each is keyed by name.
opcodes = {}
immediates = {}
enums = {}

# Sentinel passed as a source/destination count when the count is not fixed.
VARIABLE = ~0


class Opcode(object):
    """Static description of one instruction.

    A `dests` or `srcs` count equal to VARIABLE is stored as 0, with the
    matching `variable_dests` / `variable_srcs` flag set instead. Every other
    argument is stored unchanged under the attribute of the same name.
    """

    def __init__(self, name, dests, srcs, imms, is_float, can_eliminate,
                 can_reorder, schedule_class, encoding_16, encoding_32):
        self.name = name
        self.dests = dests if dests != VARIABLE else 0
        self.srcs = srcs if srcs != VARIABLE else 0
        self.variable_srcs = (srcs == VARIABLE)
        self.variable_dests = (dests == VARIABLE)
        self.imms = imms
        self.is_float = is_float
        self.can_eliminate = can_eliminate
        self.can_reorder = can_reorder
        self.schedule_class = schedule_class
        self.encoding_16 = encoding_16
        self.encoding_32 = encoding_32


class Immediate(object):
    """A named immediate operand.

    NOTE(review): `ctype` looks like the C type used for the immediate in
    generated code -- confirm against the consumers of this module.
    """

    def __init__(self, name, ctype):
        self.name = name
        self.ctype = ctype


class Encoding(object):
    """Encoding of an instruction, built from a 4-tuple description.

    `description` is `(exact, mask, length_short, length_long)`. A
    `length_long` of None means "same as length_short". The encoding is
    `extensible` when the two lengths differ, in which case the long form
    must be exactly 2 longer than the short form (4 longer when the short
    form is longer than 8).
    """

    def __init__(self, description):
        (exact, mask, length_short, length_long) = description

        # Convenience: a missing long length means a single fixed length.
        if length_long is None:
            length_long = length_short

        self.exact = exact
        self.mask = mask
        self.length_short = length_short
        self.extensible = length_short != length_long

        if self.extensible:
            assert(length_long == length_short + (4 if length_short > 8 else 2))


def op(name, encoding_32, dests=1, srcs=0, imms=None, is_float=False,
       can_eliminate=True, can_reorder=True, encoding_16=None,
       schedule_class="none"):
    """Declare an instruction and register it in `opcodes` under `name`.

    `encoding_32` and `encoding_16` are either None or a description tuple
    accepted by Encoding. `imms` is the list of Immediate objects the
    instruction carries; omitting it (or passing None) means no immediates.
    The remaining arguments are forwarded to Opcode unchanged.
    """
    # Build a fresh list per call. A literal `[]` default would be a single
    # object shared by every opcode declared without immediates.
    if imms is None:
        imms = []

    encoding_16 = Encoding(encoding_16) if encoding_16 is not None else None
    encoding_32 = Encoding(encoding_32) if encoding_32 is not None else None

    opcodes[name] = Opcode(name, dests, srcs, imms, is_float, can_eliminate,
                           can_reorder, schedule_class, encoding_16,
                           encoding_32)


def immediate(name, ctype="uint32_t"):
    """Create an Immediate, register it in `immediates`, and return it."""
    imm = Immediate(name, ctype)
    immediates[name] = imm
    return imm


def enum(name, value_dict):
    """Register `value_dict` in `enums` and return a matching immediate.

    The returned immediate has the C type "enum agx_<name>".
    """
    enums[name] = value_dict
    return immediate(name, "enum agx_" + name)


# Bit 15, OR'd into many of the encodings below.
# NOTE(review): presumably the flag selecting the long form of an
# instruction -- confirm against the packer.
L = (1 << 15)

# Placeholder for "no value" in encoding tuples and for absent encodings.
_ = None

FORMAT = immediate("format", "enum agx_format")
IMM = immediate("imm", "uint64_t")
WRITEOUT = immediate("writeout")
INDEX = immediate("index")
COMPONENT = immediate("component")
CHANNELS = immediate("channels")
TRUTH_TABLE = immediate("truth_table")
ROUND = immediate("round", "enum agx_round")
SHIFT = immediate("shift")
MASK = immediate("mask")
BFI_MASK = immediate("bfi_mask")
LOD_MODE = immediate("lod_mode", "enum agx_lod_mode")
PIXEL_OFFSET = immediate("pixel_offset")
STACK_SIZE = immediate("stack_size", 'int16_t')

DIM = enum("dim", {
    0: '1d',
    1: '1d_array',
    2: '2d',
    3: '2d_array',
    4: '2d_ms',
    5: '3d',
    6: 'cube',
    7: 'cube_array',
    8: '2d_ms_array',
})

GATHER = enum("gather", {
    0b000: "none",
    0b001: "r",
    0b011: "g",
    0b101: "b",
    0b111: "a",
})

OFFSET = immediate("offset", "bool")
SHADOW = immediate("shadow", "bool")
QUERY_LOD = immediate("query_lod", "bool")
SCOREBOARD = immediate("scoreboard")
ICOND = immediate("icond", "enum agx_icond")
FCOND = immediate("fcond", "enum agx_fcond")
NEST = immediate("nest")
INVERT_COND = immediate("invert_cond")
TARGET = immediate("target", "agx_block *")
ZS = immediate("zs")
SR = enum("sr", {
    0: 'threadgroup_position_in_grid.x',
    1: 'threadgroup_position_in_grid.y',
    2: 'threadgroup_position_in_grid.z',
    4: 'threads_per_threadgroup.x',
    5: 'threads_per_threadgroup.y',
    6: 'threads_per_threadgroup.z',
    8: 'dispatch_threads_per_threadgroup.x',
    9: 'dispatch_threads_per_threadgroup.y',
    10: 'dispatch_threads_per_threadgroup.z',
    20: 'core_id',
    21: 'vm_slot',
    48: 'thread_position_in_threadgroup.x',
    49: 'thread_position_in_threadgroup.y',
    50: 'thread_position_in_threadgroup.z',
    51: 'thread_index_in_threadgroup',
    52: 'thread_index_in_subgroup',
    53: 'subgroup_index_in_threadgroup',
    56: 'active_thread_index_in_quad',
    58: 'active_thread_index_in_subgroup',
    60: 'coverage_mask',
    62: 'backfacing',
    63: 'is_active_thread',
    80: 'thread_position_in_grid.x',
    81: 'thread_position_in_grid.y',
    82: 'thread_position_in_grid.z',
    124: 'input_sample_mask',
    144: 'helper_op',
    146: 'helper_arg_l',
    147: 'helper_arg_h',
})

ATOMIC_OPC = enum("atomic_opc", {
    0: 'add',
    1: 'sub',
    2: 'xchg',
    3: 'cmpxchg',
    4: 'umin',
    5: 'imin',
    6: 'umax',
    7: 'imax',
    8: 'and',
    9: 'or',
    10: 'xor',
})

INTERPOLATION = enum("interpolation", {
    0: 'center',
    1: 'sample',
    2: 'centroid',
    # We translate sample -> sample_register at pack time for simplicity
    3: 'sample_register',
})

# Position a float-unary sub-opcode at bit 28, and the mask covering the
# 14-bit field it occupies.
FUNOP = lambda x: (x << 28)
FUNOP_MASK = FUNOP((1 << 14) - 1)


def funop(name, opcode, schedule_class="none"):
    """Declare a one-source float op whose sub-opcode sits at bit 28."""
    op(name, (0x0A | L | FUNOP(opcode), 0x3F | L | FUNOP_MASK, 6, _),
       srcs=1, is_float=True, schedule_class=schedule_class)


def iunop(name, opcode):
    """Declare a one-source op whose sub-opcode (< 4) sits at bit 26."""
    assert(opcode < 4)
    op(name, (0x3E | (opcode << 26),
              0x7F | L | (((1 << 14) - 1) << 26),
              6, _),
       srcs=1)


# Listing of opcodes
funop("floor", 0b000000)
funop("srsqrt", 0b000001)
funop("dfdx", 0b000100, schedule_class="coverage")
funop("dfdy", 0b000110, schedule_class="coverage")
funop("rcp", 0b001000)
funop("rsqrt", 0b001001)
funop("sin_pt_1", 0b001010)
funop("log2", 0b001100)
funop("exp2", 0b001101)
funop("sin_pt_2", 0b001110)
funop("ceil", 0b010000)
funop("trunc", 0b100000)
funop("roundeven", 0b110000)

iunop("bitrev", 0b01)
iunop("popcount", 0b10)
iunop("ffs", 0b11)

op("fadd",
   encoding_16=(0x26, 0x3F, 4, 6),
   encoding_32=(0x2A, 0x3F, 4, 6),
   srcs=2, is_float=True)

op("fma",
   encoding_16=(0x36, 0x3F, 6, 8),
   encoding_32=(0x3A, 0x3F, 6, 8),
   srcs=3, is_float=True)

op("fmul",
   encoding_16=(0x16, 0x3F, 4, 6),
   encoding_32=(0x1A, 0x3F, 4, 6),
   srcs=2, is_float=True)

op("mov_imm",
   encoding_32=(0x62, 0xFF, 6, 8),
   encoding_16=(0x62, 0xFF, 4, 6),
   imms=[IMM])

op("iadd",
   encoding_32=(0x0E, 0x3F | L, 8, _),
   srcs=2, imms=[SHIFT])

op("imad",
   encoding_32=(0x1E, 0x3F | L, 8, _),
   srcs=3, imms=[SHIFT])

op("bfi",
   encoding_32=(0x2E, 0x7F | (0x3 << 26), 8, _),
   srcs=3, imms=[BFI_MASK])

op("bfeil",
   encoding_32=(0x2E | L, 0x7F | L | (0x3 << 26), 8, _),
   srcs=3, imms=[BFI_MASK])

op("extr",
   encoding_32=(0x2E | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
   srcs=3, imms=[BFI_MASK])

op("asr",
   encoding_32=(0x2E | L | (0x1 << 26), 0x7F | L | (0x3 << 26), 8, _),
   srcs=2)


def subgroup_op(name, op1, op2):
    """Declare a one-source subgroup op; op1 goes at bit 47, op2 at bit 26."""
    exact = 0b01101111 | L | (op1 << 47) | (op2 << 26)
    exact_mask = 0b11111111 | L | (1 << 47) | (0xFFFF << 26)

    op(name, encoding_32=(exact, exact_mask, 6, _), srcs=1)


subgroup_op("simd_prefix_iadd", 1, 0b0000000000011000)
subgroup_op("simd_iadd", 1, 0b0000000000001000)

op("simd_shuffle",
   encoding_32=(0b01101111 | (1 << 26),
                0xFF | L | (1 << 47) | (3 << 38) | (3 << 26), 6, _),
   srcs=2)

for window, w_bit in [('quad_', 0), ('', 1)]:
    # Pseudo-instruction balloting a boolean
    op(f"{window}ballot", _, srcs=1)

    for T, T_bit, cond in [('f', 0, FCOND), ('i', 1, ICOND)]:
        op(f"{T}cmp_{window}ballot",
           encoding_32=(0b0100010 | (T_bit << 4) | (w_bit << 48), 0, 8, _),
           srcs=2, imms=[cond, INVERT_COND])

op("icmpsel",
   encoding_32=(0x12, 0x7F, 8, 10),
   srcs=4, imms=[ICOND])

op("fcmpsel",
   encoding_32=(0x02, 0x7F, 8, 10),
   srcs=4, imms=[FCOND])

# Pseudo-instructions for compares returning 1/0
op("icmp", _, srcs=2, imms=[ICOND, INVERT_COND])
op("fcmp", _, srcs=2, imms=[FCOND, INVERT_COND])

# Sources are coordinates, LOD, texture bindless base (zero for texture state
# registers), texture, sampler, shadow/offset
# TODO: anything else?
op(
    "texture_sample",
    encoding_32=(0x31, 0x7F, 8, 10),  # XXX WRONG SIZE
    srcs=6,
    imms=[DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET, SHADOW, QUERY_LOD, GATHER],
)

# Texture loads may be reordered; image loads may not and are scheduled as
# loads.
for memory, can_reorder in [("texture", True), ("image", False)]:
    op(
        f"{memory}_load",
        encoding_32=(0x71, 0x7F, 8, 10),  # XXX WRONG SIZE
        srcs=6,
        imms=[DIM, LOD_MODE, MASK, SCOREBOARD, OFFSET],
        can_reorder=can_reorder,
        schedule_class="none" if can_reorder else "load",
    )

# Sources are base, index
op(
    "device_load",
    encoding_32=(0x05, 0x7F, 6, 8),
    srcs=2,
    imms=[FORMAT, MASK, SHIFT, SCOREBOARD],
    can_reorder=False,
    schedule_class="load",
)

# Sources are base (relative to workgroup memory), index
op(
    "local_load",
    encoding_32=(0b1101001, 0, 6, 8),
    srcs=2,
    imms=[FORMAT, MASK],
    can_reorder=False,
    schedule_class="load",
)

# Sources are value, base, index
# TODO: Consider permitting the short form
op(
    "device_store",
    encoding_32=(0x45 | (1 << 47), 0, 8, _),
    dests=0,
    srcs=3,
    imms=[FORMAT, MASK, SHIFT, SCOREBOARD],
    can_eliminate=False,
    schedule_class="store",
)

# Sources are value, base, index
op(
    "local_store",
    encoding_32=(0b0101001, 0, 6, 8),
    dests=0,
    srcs=3,
    imms=[FORMAT, MASK],
    can_eliminate=False,
    schedule_class="store",
)

# Sources are value, index
# TODO: Consider permitting the short form
op(
    "uniform_store",
    encoding_32=((0b111 << 27) | 0b1000101 | (1 << 47), 0, 8, _),
    dests=0,
    srcs=2,
    can_eliminate=False,
)

# Sources are value, base, index
op(
    "atomic",
    encoding_32=(
        0x15 | (1 << 26) | (1 << 31) | (5 << 44),
        0x3F | (1 << 26) | (1 << 31) | (5 << 44),
        8,
        _,
    ),
    dests=1,
    srcs=3,
    imms=[ATOMIC_OPC, SCOREBOARD],
    can_eliminate=False,
    schedule_class="atomic",
)

# XXX: stop hardcoding the long form
op(
    "local_atomic",
    encoding_32=(
        0x19 | (1 << 15) | (1 << 36) | (1 << 47),
        0x3F | (1 << 36) | (1 << 47),
        10,
        _,
    ),
    dests=1,
    srcs=3,
    imms=[ATOMIC_OPC],
    schedule_class="atomic",
    can_eliminate=False,
)

op(
    "wait",
    (0x38, 0xFF, 2, _),
    dests=0,
    can_eliminate=False,
    imms=[SCOREBOARD],
    schedule_class="invalid",
)

# Only the plain variant (schedule class "none") may be reordered.
for (suffix, schedule_class) in [
    ("", "none"),
    ("_coverage", "coverage"),
    ("_barrier", "barrier"),
]:
    op(
        f"get_sr{suffix}",
        (0x72, 0x7F | L, 4, _),
        dests=1,
        imms=[SR],
        schedule_class=schedule_class,
        can_reorder=(schedule_class == "none"),
    )

op(
    "sample_mask",
    (0x7fc1, 0xffff, 6, _),
    dests=0,
    srcs=2,
    can_eliminate=False,
    schedule_class="coverage",
)

# Sources: sample mask, combined depth/stencil
op(
    "zs_emit",
    (0x41, 0xFF | L, 4, _),
    dests=0,
    srcs=2,
    can_eliminate=False,
    imms=[ZS],
    schedule_class="coverage",
)

# Essentially same encoding. Last source is the sample mask
op(
    "ld_tile",
    (0x49, 0x7F, 8, _),
    dests=1,
    srcs=1,
    imms=[FORMAT, MASK, PIXEL_OFFSET],
    can_reorder=False,
    schedule_class="coverage",
)

op(
    "st_tile",
    (0x09, 0x7F, 8, _),
    dests=0,
    srcs=2,
    can_eliminate=False,
    imms=[FORMAT, MASK, PIXEL_OFFSET],
    schedule_class="coverage",
)

# "none" and "none_after" are declared with the same exact bits.
for (name, exact) in [("any", 0xC000), ("none", 0xC020), ("none_after", 0xC020)]:
    op(
        f"jmp_exec_{name}",
        (exact, 0xFFFF, 6, _),
        dests=0,
        srcs=0,
        can_eliminate=False,
        schedule_class="invalid",
        imms=[TARGET],
    )

# TODO: model implicit r0l destinations
op(
    "pop_exec",
    (0x52 | (0x3 << 9), ((1 << 48) - 1) ^ (0x3 << 7) ^ (0x3 << 11), 6, _),
    dests=0,
    srcs=0,
    can_eliminate=False,
    schedule_class="invalid",
    imms=[NEST],
)

# {if,else,while}_{i,f}cmp: integer variants first, then float variants.
for is_float in [False, True]:
    if is_float:
        mod_mask = 0
    else:
        mod_mask = (0x3 << 26) | (0x3 << 38)

    for (cf, cf_op) in [("if", 0), ("else", 1), ("while", 2)]:
        name = f"{cf}_{'f' if is_float else 'i'}cmp"
        exact = 0x42 | (0x0 if is_float else 0x10) | (cf_op << 9)
        mask = 0x7F | (0x3 << 9) | mod_mask | (0x3 << 44)
        imms = [NEST, FCOND if is_float else ICOND, INVERT_COND, TARGET]

        op(
            name,
            (exact, mask, 6, _),
            dests=0,
            srcs=2,
            can_eliminate=False,
            imms=imms,
            is_float=is_float,
            schedule_class="preload" if cf == "else" else "invalid",
        )

op("bitop", (0x7E, 0x7F, 6, _), srcs=2, imms=[TRUTH_TABLE])
op("intl", (0x3E, 0x7F, 6, _), srcs=2, imms=[])
op("convert", (0x3E | L, 0x7F | L | (0x3 << 38), 6, _), srcs=2, imms=[ROUND])

# Sources are the coefficient register and the sample index (if applicable)
op("iter", (0x21, 0xBF, 8, _), srcs=2, imms=[CHANNELS, INTERPOLATION])

# Sources are the coefficient register for the varying, the coefficient
# register for W, and the sample index (if applicable)
op("iterproj", (0x21, 0xBF, 8, _), srcs=3, imms=[CHANNELS, INTERPOLATION])

op("ldcf", (0xA1, 0xBF, 8, _), srcs=1, imms=[CHANNELS])
op("st_vary", None, dests=0, srcs=2, can_eliminate=False)
op(
    "no_varyings",
    (0x80000051, 0xFFFFFFFF, 4, _),
    dests=0,
    can_eliminate=False,
)
op(
    "stop",
    (0x88, 0xFFFF, 2, _),
    dests=0,
    can_eliminate=False,
    schedule_class="invalid",
)
op(
    "trap",
    (0x08, 0xFFFF, 2, _),
    dests=0,
    can_eliminate=False,
    schedule_class="invalid",
)
op(
    "wait_pix",
    (0x48, 0xFF, 4, _),
    dests=0,
    imms=[WRITEOUT],
    can_eliminate=False,
    schedule_class="coverage",
)
op(
    "signal_pix",
    (0x58, 0xFF, 4, _),
    dests=0,
    imms=[WRITEOUT],
    can_eliminate=False,
    schedule_class="coverage",
)

# Sources are the data vector, the coordinate vector, the LOD, the bindless
# table if present (zero for texture state registers), and texture index.
op(
    "image_write",
    (0xF1 | (1 << 23) | (9 << 43), 0xFF, 6, 8),
    dests=0,
    srcs=5,
    imms=[DIM],
    can_eliminate=False,
    schedule_class="store",
)

# Sources are the image, the offset within shared memory, and the layer.
# TODO: Do we need the short encoding?
op(
    "block_image_store",
    (0xB1, 0xFF, 10, _),
    dests=0,
    srcs=3,
    imms=[FORMAT, DIM],
    can_eliminate=False,
    schedule_class="store",
)

# Barriers
op(
    "threadgroup_barrier",
    (0x0068, 0xFFFF, 2, _),
    dests=0,
    srcs=0,
    can_eliminate=False,
    schedule_class="barrier",
)


def memory_barrier(name, a, b, c):
    """Declare a barrier op with no sources or destinations.

    The three fields are packed into the exact bits as `a` at bit 10, `b` at
    bit 8 and `c` at bit 12, on top of the base value 0xF5.
    """
    exact = 0xF5 | (a << 10) | (b << 8) | (c << 12)
    op(
        name,
        (exact, 0xFFFF, 2, _),
        dests=0,
        srcs=0,
        can_eliminate=False,
        schedule_class="barrier",
    )


memory_barrier("memory_barrier", 1, 2, 9)

# TODO: Not clear what these individually are. Some might be cache flushes?
memory_barrier("image_barrier_1", a=2, b=2, c=10)
memory_barrier("image_barrier_2", a=3, b=2, c=10)
memory_barrier("image_barrier_3", a=2, b=1, c=10)
memory_barrier("image_barrier_4", a=3, b=1, c=10)

memory_barrier("flush_memory_to_texture", a=0, b=0, c=4)

memory_barrier("memory_barrier_2", a=2, b=2, c=9)
memory_barrier("memory_barrier_3", a=2, b=1, c=9)
memory_barrier("unknown_barrier_1", a=0, b=3, c=3)
memory_barrier("unknown_barrier_2", a=0, b=3, c=0)

op(
    "doorbell",
    (0x60020 | (0x28 << 32), (1 << 48) - 1, 6, _),
    dests=0,
    can_eliminate=False,
    can_reorder=False,
    imms=[IMM],
)

op(
    "stack_unmap",
    (0x00075, (1 << 24) - 1, 8, _),
    dests=1,
    srcs=0,
    can_eliminate=False,
    can_reorder=False,
    imms=[IMM],
)
op(
    "stack_map",
    (0x10075, (1 << 24) - 1, 8, _),
    dests=0,
    srcs=1,
    can_eliminate=False,
    can_reorder=False,
    imms=[IMM],
)

op(
    "stack_adjust",
    encoding_32=(0x10100b5, (1 << 26) - 1, 8, _),
    dests=0,
    srcs=0,
    can_eliminate=False,
    can_reorder=False,
    imms=[STACK_SIZE],
    schedule_class="store",
)

# Source is offset
op(
    "stack_load",
    encoding_32=(0x35, (1 << 20) - 1, 6, 8),
    srcs=1,
    imms=[FORMAT, MASK, SCOREBOARD],
    can_reorder=False,
    schedule_class="load",
)

# Sources are value and offset
op(
    "stack_store",
    encoding_32=(0xb5, (1 << 20) - 1, 6, 8),
    dests=0,
    srcs=2,
    imms=[FORMAT, MASK, SCOREBOARD],
    can_eliminate=False,
    schedule_class="store",
)

# Convenient aliases.
op("mov", _, srcs=1)
op("not", _, srcs=1)

# Ops declared with a variable number of sources or destinations.
op("collect", _, srcs=VARIABLE)
op("split", _, srcs=1, dests=VARIABLE)
op("phi", _, srcs=VARIABLE, schedule_class="preload")

op("unit_test", _, dests=0, srcs=1, can_eliminate=False)

# Like mov, but takes a register and can only appear at the start. Guaranteed
# to be coalesced during RA, rather than lowered to a real move.
op("preload", encoding_32=None, srcs=1, schedule_class="preload")

# Pseudo-instructions to set the nesting counter. Lowers to r0l writes after RA.
op("begin_cf", encoding_32=None, dests=0, can_eliminate=False)
op(
    "break",
    encoding_32=None,
    dests=0,
    imms=[NEST, TARGET],
    can_eliminate=False,
    schedule_class="invalid",
)

# Conditional breaks: the integer variant carries ICOND, the float one FCOND.
for (name, is_float) in [("break_if_icmp", False), ("break_if_fcmp", True)]:
    op(
        name,
        encoding_32=None,
        dests=0,
        srcs=2,
        imms=[NEST, INVERT_COND, FCOND if is_float else ICOND, TARGET],
        can_eliminate=False,
        schedule_class="invalid",
    )