• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (c) 2018 Valve Corporation
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23
24# Class that represents all the information we have about the opcode
25# NOTE: this must be kept in sync with aco_op_info
26
27import sys
28from enum import Enum, IntEnum, auto
29
class InstrClass(Enum):
   """Instruction classes that an opcode can be tagged with.

   `opcode()` accepts one of these members as its `cls` argument (defaulting
   to `Other`), and `default_class()` appends one to table entries that do not
   name a class themselves.  Every member's value is a lowercase string
   identifier.
   """
   Valu32 = "valu32"
   ValuConvert32 = "valu_convert32"
   Valu64 = "valu64"
   ValuQuarterRate32 = "valu_quarter_rate32"
   ValuFma = "valu_fma"
   ValuTranscendental32 = "valu_transcendental32"
   ValuDouble = "valu_double"
   ValuDoubleAdd = "valu_double_add"
   ValuDoubleConvert = "valu_double_convert"
   ValuDoubleTranscendental = "valu_double_transcendental"
   WMMA = "wmma"
   Salu = "salu"
   SMem = "smem"
   Barrier = "barrier"
   Branch = "branch"
   Sendmsg = "sendmsg"
   DS = "ds"
   # NOTE: the string value is "exp", not "export".
   Export = "exp"
   VMem = "vmem"
   Waitcnt = "waitcnt"
   Other = "other"
52
53# Representation of the instruction's microcode encoding format
54# Note: Some Vector ALU Formats can be combined, such that:
55# - VOP2* | VOP3 represents a VOP2 instruction in VOP3 encoding
56# - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
57# - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
58#
59# (*) The same is applicable for VOP1 and VOPC instructions.
class Format(IntEnum):
   """Microcode encoding format of an instruction.

   Members up to and including VOPD are sequential identifiers.  VOP1 and the
   members after it are distinct single-bit values, so they can be OR-ed
   together to describe a combined encoding (e.g. VOP2 | VOP3 or VOP2 | SDWA).
   """
   # Pseudo Instruction Formats
   PSEUDO = 0
   PSEUDO_BRANCH = auto()
   PSEUDO_BARRIER = auto()
   PSEUDO_REDUCTION = auto()
   # Scalar ALU & Control Formats
   SOP1 = auto()
   SOP2 = auto()
   SOPK = auto()
   SOPP = auto()
   SOPC = auto()
   # Scalar Memory Format
   SMEM = auto()
   # LDS/GDS Format
   DS = auto()
   LDSDIR = auto()
   # Vector Memory Buffer Formats
   MTBUF = auto()
   MUBUF = auto()
   # Vector Memory Image Format
   MIMG = auto()
   # Export Format
   EXP = auto()
   # Flat Formats
   FLAT = auto()
   GLOBAL = auto()
   SCRATCH = auto()
   # Vector Parameter Interpolation Formats
   VINTRP = auto()
   # Vector ALU Formats
   VINTERP_INREG = auto()
   VOPD = auto()
   # Single-bit values from here on, so that they can be combined.
   VOP1 = 1 << 7
   VOP2 = 1 << 8
   VOPC = 1 << 9
   VOP3 = 1 << 10
   VOP3P = 1 << 11
   SDWA = 1 << 12
   DPP16 = 1 << 13
   DPP8 = 1 << 14

   def get_builder_fields(self):
      """Return the format-specific builder fields of this format.

      Each field is a tuple `(type, name, default[, dest])`:
      - `type` and `name` are the strings used for the declaration,
      - `default` is the default value, or None when the field has none,
      - `dest`, when present, is the name reported by
        get_builder_field_dests() instead of `name`.

      Formats without extra fields yield an empty list.
      """
      if self == Format.SOPK:
         return [('uint16_t', 'imm', None)]
      elif self == Format.SOPP:
         return [('uint32_t', 'block', '-1'),
                 ('uint32_t', 'imm', '0')]
      elif self == Format.SMEM:
         return [('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'nv', 'false')]
      elif self == Format.DS:
         return [('uint16_t', 'offset0', '0'),
                 ('uint8_t', 'offset1', '0'),
                 ('bool', 'gds', 'false')]
      elif self == Format.LDSDIR:
         return [('uint8_t', 'attr', 0),
                 ('uint8_t', 'attr_chan', 0),
                 ('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('uint8_t', 'wait_vdst', 15)]
      elif self == Format.MTBUF:
         return [('unsigned', 'dfmt', None),
                 ('unsigned', 'nfmt', None),
                 ('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false')]
      elif self == Format.MUBUF:
         return [('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'swizzled', 'false'),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'addr64', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lds', 'false')]
      elif self == Format.MIMG:
         # A second, unreachable `return` (a copy of the VINTRP field list)
         # used to follow this one; it has been removed.
         return [('unsigned', 'dmask', '0xF'),
                 ('bool', 'da', 'false'),
                 ('bool', 'unrm', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lwe', 'false'),
                 ('bool', 'r128', 'false'),
                 ('bool', 'a16', 'false'),
                 ('bool', 'd16', 'false')]
      elif self == Format.EXP:
         return [('unsigned', 'enabled_mask', None),
                 ('unsigned', 'dest', None),
                 ('bool', 'compr', 'false', 'compressed'),
                 ('bool', 'done', 'false'),
                 ('bool', 'vm', 'false', 'valid_mask')]
      elif self == Format.PSEUDO_BRANCH:
         return [('uint32_t', 'target0', '0', 'target[0]'),
                 ('uint32_t', 'target1', '0', 'target[1]')]
      elif self == Format.PSEUDO_REDUCTION:
         return [('ReduceOp', 'op', None, 'reduce_op'),
                 ('unsigned', 'cluster_size', '0')]
      elif self == Format.PSEUDO_BARRIER:
         return [('memory_sync_info', 'sync', None),
                 ('sync_scope', 'exec_scope', 'scope_invocation')]
      elif self == Format.VINTRP:
         return [('unsigned', 'attribute', None),
                 ('unsigned', 'component', None)]
      elif self == Format.DPP16:
         return [('uint16_t', 'dpp_ctrl', None),
                 ('uint8_t', 'row_mask', '0xF'),
                 ('uint8_t', 'bank_mask', '0xF'),
                 ('bool', 'bound_ctrl', 'true'),
                 ('bool', 'fetch_inactive', 'true')]
      elif self == Format.DPP8:
         return [('uint32_t', 'lane_sel', 0),
                 ('bool', 'fetch_inactive', 'true')]
      elif self == Format.VOP3P:
         return [('uint8_t', 'opsel_lo', None),
                 ('uint8_t', 'opsel_hi', None)]
      elif self == Format.VOPD:
         return [('aco_opcode', 'opy', None)]
      elif self == Format.VINTERP_INREG:
         return [('unsigned', 'wait_exp', 7),
                 ('uint8_t', 'opsel', 0)]
      elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
         return [('int16_t', 'offset', 0),
                 ('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'lds', 'false'),
                 ('bool', 'nv', 'false')]
      else:
         return []

   def get_builder_field_names(self):
      """Return the `name` entry of every builder field, in order."""
      return [field[1] for field in self.get_builder_fields()]

   def get_builder_field_dests(self):
      """Return, per builder field, its `dest` entry or else its `name`."""
      return [(field[3] if len(field) >= 4 else field[1])
              for field in self.get_builder_fields()]

   def get_builder_field_decls(self):
      """Return one declaration string per builder field.

      The result is 'type name=default' for fields with a default and
      'type name' for fields whose default is None.
      """
      decls = []
      for field in self.get_builder_fields():
         ctype, name, default = field[:3]
         # Identity test: a default of 0 is a real default and must be kept.
         if default is not None:
            decls.append('%s %s=%s' % (ctype, name, default))
         else:
            decls.append('%s %s' % (ctype, name))
      return decls

   def get_builder_initialization(self, num_operands):
      """Return extra initialization statements for this format as a string.

      SDWA emits one `sel` assignment for each of at most the first two
      operands plus a `dst_sel` assignment; DPP16 and DPP8 emit a statement
      masking `fetch_inactive`; every other format yields ''.
      """
      res = ''
      if self == Format.SDWA:
         for i in range(min(num_operands, 2)):
            # No trailing newline here, unlike the other statements; kept as
            # is so the emitted text does not change.
            res += 'instr->sel[{0}] = SubdwordSel(op{0}.op.bytes(), 0, false);'.format(i)
         res += 'instr->dst_sel = SubdwordSel(def0.bytes(), 0, false);\n'
      elif self in [Format.DPP16, Format.DPP8]:
         res += 'instr->fetch_inactive &= program->gfx_level >= GFX10;\n'
      return res
223
224
class Opcode(object):
   """Everything recorded about a single opcode.

   NOTE: this must be kept in sync with aco_op_info
   """
   def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, opcode_gfx11, format, input_mod, output_mod, is_atomic, cls, definitions, operands):
      """Store the opcode description and derive `operand_size` from `name`.

      The boolean flags are stored as the strings "1" / "0".  `operand_size`
      is looked up from the last '_'-separated part of the name (after
      dropping any '_e64'), with a few name-based exceptions applied last.
      """
      assert isinstance(name, str)
      for encoding in (opcode_gfx7, opcode_gfx9, opcode_gfx10, opcode_gfx11):
         assert isinstance(encoding, int)
      assert isinstance(format, Format)
      for modifier in (input_mod, output_mod):
         assert isinstance(modifier, bool)
      for packed in (definitions, operands):
         assert isinstance(packed, int)

      self.name = name
      self.format = format
      self.cls = cls
      self.opcode_gfx7 = opcode_gfx7
      self.opcode_gfx9 = opcode_gfx9
      self.opcode_gfx10 = opcode_gfx10
      self.opcode_gfx11 = opcode_gfx11
      self.definitions = definitions
      self.operands = operands

      # Flags are kept as "1"/"0" strings rather than booleans.
      self.input_mod = str(int(input_mod))
      self.output_mod = str(int(output_mod))
      self.is_atomic = str(int(bool(is_atomic)))

      # The data type suffix is whatever follows the last underscore.
      op_dtype = name.replace('_e64', '').rpartition('_')[2]

      op_dtype_sizes = {}
      for prefix in 'biuf':
         for size in (64, 32, 24, 16):
            op_dtype_sizes[prefix + str(size)] = size
      # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841
      for narrow in ('b16', 'i16', 'u16'):
         op_dtype_sizes[narrow] = 32

      # Unknown suffixes fall back to 32.
      operand_size = op_dtype_sizes.get(op_dtype, 32)

      # Name-based exceptions; the order of these checks matters.
      if 'qsad_' in name:
         operand_size = 0
      elif 'sad_' in name:
         operand_size = 32
      elif name in ('v_mad_u64_u32', 'v_mad_i64_i32'):
         operand_size = 0
      elif (operand_size == 24
            or op_dtype in ('u8', 'i8')
            or name in ('v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1',
                        'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3')):
         operand_size = 32

      self.operand_size = operand_size
280
281
# Matches PhysReg
# Some of these are passed to dst()/src() in the opcode tables below,
# e.g. dst(1, SCC) or src(2, EXEC).
VCC = 106
M0 = 124
EXEC_LO = 126
EXEC = 127 # Some instructions only write lo, so use exec_hi encoding here
SCC = 253
288
def src(op1 = 0, op2 = 0, op3 = 0, op4 = 0):
   """Pack up to four operand descriptors into one integer.

   op1 lands in bits 0-7 and each following argument is shifted a further
   8 bits to the left before being OR-ed in.
   """
   packed = 0
   for slot, value in enumerate((op1, op2, op3, op4)):
      packed |= value << (8 * slot)
   return packed
291
def dst(def1 = 0, def2 = 0, def3 = 0, def4 = 0):
   """Pack up to four definition descriptors into one integer.

   def1 lands in bits 0-7; def2, def3 and def4 are OR-ed in after being
   shifted left by 8, 16 and 24 bits respectively.
   """
   result = def1
   shift = 8
   for descriptor in (def2, def3, def4):
      result |= descriptor << shift
      shift += 8
   return result
294
# global dictionary of opcodes, keyed by opcode name
opcodes = {}

def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, opcode_gfx11 = -1, format = Format.PSEUDO, cls = InstrClass.Other, input_mod = False, output_mod = False, is_atomic = False, definitions = 0, operands = 0):
   """Build an Opcode from the arguments and register it in `opcodes`.

   Each name may be registered only once; a repeated name trips the assert.
   """
   assert name not in opcodes
   entry = Opcode(name = name,
                  opcode_gfx7 = opcode_gfx7,
                  opcode_gfx9 = opcode_gfx9,
                  opcode_gfx10 = opcode_gfx10,
                  opcode_gfx11 = opcode_gfx11,
                  format = format,
                  input_mod = input_mod,
                  output_mod = output_mod,
                  is_atomic = is_atomic,
                  cls = cls,
                  definitions = definitions,
                  operands = operands)
   opcodes[name] = entry
301
def default_class(opcodes, cls):
   """Yield every tuple of `opcodes` with an InstrClass as its last element.

   Tuples that already end in an InstrClass are passed through unchanged;
   all others get `cls` appended.
   """
   for entry in opcodes:
      has_class = isinstance(entry[-1], InstrClass)
      yield entry if has_class else entry + (cls,)
308
# The export instruction is registered with opcode number 0 for every
# generation argument.
opcode("exp", 0, 0, 0, 0, format = Format.EXP, cls = InstrClass.Export)

# Pseudo instructions. Unless a format is given they use opcode()'s defaults:
# Format.PSEUDO, InstrClass.Other and -1 for every opcode number.
opcode("p_parallelcopy")
opcode("p_startpgm")
opcode("p_return")
opcode("p_phi")
opcode("p_linear_phi")
opcode("p_as_uniform")
opcode("p_unit_test")

opcode("p_create_vector")
opcode("p_extract_vector")
opcode("p_split_vector")

# start/end the parts where we can use exec based instructions
# implicitly
opcode("p_logical_start")
opcode("p_logical_end")

# e.g. subgroupMin() in SPIR-V
opcode("p_reduce", format=Format.PSEUDO_REDUCTION)
# e.g. subgroupInclusiveMin()
opcode("p_inclusive_scan", format=Format.PSEUDO_REDUCTION)
# e.g. subgroupExclusiveMin()
opcode("p_exclusive_scan", format=Format.PSEUDO_REDUCTION)

opcode("p_branch", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_z", format=Format.PSEUDO_BRANCH)
opcode("p_cbranch_nz", format=Format.PSEUDO_BRANCH)

opcode("p_barrier", format=Format.PSEUDO_BARRIER)

# Primitive Ordered Pixel Shading pseudo-instructions.

# For querying whether the current wave can enter the ordered section on GFX9-10.3, doing
# s_add_i32(pops_exiting_wave_id, op0), but in a way that it's different from a usual SALU
# instruction so that it's easier to maintain the volatility of pops_exiting_wave_id and to handle
# the polling specially in scheduling.
# Definitions:
# - Result SGPR;
# - Clobbered SCC.
# Operands:
# - s1 value to add, usually -(current_wave_ID + 1) (or ~current_wave_ID) to remap the exiting wave
#   ID from wrapping [0, 0x3FF] to monotonic [0, 0xFFFFFFFF].
opcode("p_pops_gfx9_add_exiting_wave_id")

# Indicates that the wait for the completion of the ordered section in overlapped waves has been
# finished on GFX9-10.3. Not lowered to any hardware instructions.
opcode("p_pops_gfx9_overlapped_wave_wait_done")

# Indicates that a POPS ordered section has ended, hints that overlapping waves can possibly
# continue execution. The overlapping waves may actually be resumed by this instruction or anywhere
# later, however, especially taking into account the fact that there can be multiple ordered
# sections in a wave (for instance, if one is chosen in divergent control flow in the source
# shader), thus multiple p_pops_gfx9_ordered_section_done instructions. At least one must be present
# in the program if POPS is used, however, otherwise the location of the end of the ordered section
# will be undefined. Only needed on GFX9-10.3 (GFX11+ ordered section is until the last export,
# can't be exited early). Not lowered to any hardware instructions.
opcode("p_pops_gfx9_ordered_section_done")

opcode("p_spill")
opcode("p_reload")

# Start/end linear vgprs. p_start_linear_vgpr can take an operand to copy from, into the linear vgpr
opcode("p_start_linear_vgpr")
opcode("p_end_linear_vgpr")

opcode("p_end_wqm")
opcode("p_discard_if")
opcode("p_demote_to_helper")
opcode("p_is_helper")
opcode("p_exit_early_if")

# simulates proper bpermute behavior using v_readlane_b32
# definitions: result VGPR, temp EXEC, clobbered VCC
# operands: index, input data
opcode("p_bpermute_readlane")

# simulates proper wave64 bpermute behavior using shared vgprs (for GFX10/10.3)
# definitions: result VGPR, temp EXEC, clobbered SCC
# operands: index * 4, input data, same half (bool)
opcode("p_bpermute_shared_vgpr")

# simulates proper wave64 bpermute behavior using v_permlane64_b32 (for GFX11+)
# definitions: result VGPR, temp EXEC, clobbered SCC
# operands: linear VGPR, index * 4, input data, same half (bool)
opcode("p_bpermute_permlane")

# creates a lane mask where only the first active lane is selected
opcode("p_elect")

opcode("p_constaddr")
opcode("p_resume_shader_address")

# These don't have to be pseudo-ops, but it makes optimization easier to only
# have to consider two instructions.
# (src0 >> (index * bits)) & ((1 << bits) - 1) with optional sign extension
opcode("p_extract") # src1=index, src2=bits, src3=signext
# (src0 & ((1 << bits) - 1)) << (index * bits)
opcode("p_insert") # src1=index, src2=bits

opcode("p_init_scratch")

# jumps to a shader epilog
opcode("p_jump_to_epilog")

# loads and interpolates a fragment shader input with a correct exec mask
# Two operand layouts are listed for this instruction:
# dst0=result, src0=linear_vgpr, src1=attribute, src2=component, src3=coord1, src4=coord2, src5=m0
# dst0=result, src0=linear_vgpr, src1=attribute, src2=component, src3=dpp_ctrl, src4=m0
opcode("p_interp_gfx11")

# performs dual source MRTs swizzling and emits exports on GFX11
opcode("p_dual_src_export_gfx11")

# Lets a shader end with specific registers set to the wanted values; used by
# multi-part shaders to pass arguments to the next part.
opcode("p_end_with_regs")
426
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
# Each entry is (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11 opcode numbers, name,
# dst(...), src(...)[, InstrClass]). Entries without a trailing InstrClass
# get InstrClass.Salu from default_class().
# NOTE(review): -1 presumably marks a generation that lacks the instruction
# (it is also opcode()'s default opcode number) - confirm.
SOP2 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "s_add_u32", dst(1, SCC), src(1, 1)),
   (0x01, 0x01, 0x01, 0x01, 0x01, 0x01, "s_sub_u32", dst(1, SCC), src(1, 1)),
   (0x02, 0x02, 0x02, 0x02, 0x02, 0x02, "s_add_i32", dst(1, SCC), src(1, 1)),
   (0x03, 0x03, 0x03, 0x03, 0x03, 0x03, "s_sub_i32", dst(1, SCC), src(1, 1)),
   (0x04, 0x04, 0x04, 0x04, 0x04, 0x04, "s_addc_u32", dst(1, SCC), src(1, 1, SCC)),
   (0x05, 0x05, 0x05, 0x05, 0x05, 0x05, "s_subb_u32", dst(1, SCC), src(1, 1, SCC)),
   (0x06, 0x06, 0x06, 0x06, 0x06, 0x12, "s_min_i32", dst(1, SCC), src(1, 1)),
   (0x07, 0x07, 0x07, 0x07, 0x07, 0x13, "s_min_u32", dst(1, SCC), src(1, 1)),
   (0x08, 0x08, 0x08, 0x08, 0x08, 0x14, "s_max_i32", dst(1, SCC), src(1, 1)),
   (0x09, 0x09, 0x09, 0x09, 0x09, 0x15, "s_max_u32", dst(1, SCC), src(1, 1)),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x30, "s_cselect_b32", dst(1), src(1, 1, SCC)),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x31, "s_cselect_b64", dst(2), src(2, 2, SCC)),
   (0x0e, 0x0e, 0x0c, 0x0c, 0x0e, 0x16, "s_and_b32", dst(1, SCC), src(1, 1)),
   (0x0f, 0x0f, 0x0d, 0x0d, 0x0f, 0x17, "s_and_b64", dst(2, SCC), src(2, 2)),
   (0x10, 0x10, 0x0e, 0x0e, 0x10, 0x18, "s_or_b32", dst(1, SCC), src(1, 1)),
   (0x11, 0x11, 0x0f, 0x0f, 0x11, 0x19, "s_or_b64", dst(2, SCC), src(2, 2)),
   (0x12, 0x12, 0x10, 0x10, 0x12, 0x1a, "s_xor_b32", dst(1, SCC), src(1, 1)),
   (0x13, 0x13, 0x11, 0x11, 0x13, 0x1b, "s_xor_b64", dst(2, SCC), src(2, 2)),
   (0x14, 0x14, 0x12, 0x12, 0x14, 0x22, "s_andn2_b32", dst(1, SCC), src(1, 1)), #s_and_not1_b32 in GFX11
   (0x15, 0x15, 0x13, 0x13, 0x15, 0x23, "s_andn2_b64", dst(2, SCC), src(2, 2)), #s_and_not1_b64 in GFX11
   (0x16, 0x16, 0x14, 0x14, 0x16, 0x24, "s_orn2_b32", dst(1, SCC), src(1, 1)), #s_or_not1_b32 in GFX11
   (0x17, 0x17, 0x15, 0x15, 0x17, 0x25, "s_orn2_b64", dst(2, SCC), src(2, 2)), #s_or_not1_b64 in GFX11
   (0x18, 0x18, 0x16, 0x16, 0x18, 0x1c, "s_nand_b32", dst(1, SCC), src(1, 1)),
   (0x19, 0x19, 0x17, 0x17, 0x19, 0x1d, "s_nand_b64", dst(2, SCC), src(2, 2)),
   (0x1a, 0x1a, 0x18, 0x18, 0x1a, 0x1e, "s_nor_b32", dst(1, SCC), src(1, 1)),
   (0x1b, 0x1b, 0x19, 0x19, 0x1b, 0x1f, "s_nor_b64", dst(2, SCC), src(2, 2)),
   (0x1c, 0x1c, 0x1a, 0x1a, 0x1c, 0x20, "s_xnor_b32", dst(1, SCC), src(1, 1)),
   (0x1d, 0x1d, 0x1b, 0x1b, 0x1d, 0x21, "s_xnor_b64", dst(2, SCC), src(2, 2)),
   (0x1e, 0x1e, 0x1c, 0x1c, 0x1e, 0x08, "s_lshl_b32", dst(1, SCC), src(1, 1)),
   (0x1f, 0x1f, 0x1d, 0x1d, 0x1f, 0x09, "s_lshl_b64", dst(2, SCC), src(2, 1)),
   (0x20, 0x20, 0x1e, 0x1e, 0x20, 0x0a, "s_lshr_b32", dst(1, SCC), src(1, 1)),
   (0x21, 0x21, 0x1f, 0x1f, 0x21, 0x0b, "s_lshr_b64", dst(2, SCC), src(2, 1)),
   (0x22, 0x22, 0x20, 0x20, 0x22, 0x0c, "s_ashr_i32", dst(1, SCC), src(1, 1)),
   (0x23, 0x23, 0x21, 0x21, 0x23, 0x0d, "s_ashr_i64", dst(2, SCC), src(2, 1)),
   (0x24, 0x24, 0x22, 0x22, 0x24, 0x2a, "s_bfm_b32", dst(1), src(1, 1)),
   (0x25, 0x25, 0x23, 0x23, 0x25, 0x2b, "s_bfm_b64", dst(2), src(1, 1)),
   (0x26, 0x26, 0x24, 0x24, 0x26, 0x2c, "s_mul_i32", dst(1), src(1, 1)),
   (0x27, 0x27, 0x25, 0x25, 0x27, 0x26, "s_bfe_u32", dst(1, SCC), src(1, 1)),
   (0x28, 0x28, 0x26, 0x26, 0x28, 0x27, "s_bfe_i32", dst(1, SCC), src(1, 1)),
   (0x29, 0x29, 0x27, 0x27, 0x29, 0x28, "s_bfe_u64", dst(2, SCC), src(2, 1)),
   (0x2a, 0x2a, 0x28, 0x28, 0x2a, 0x29, "s_bfe_i64", dst(2, SCC), src(2, 1)),
   (0x2b, 0x2b, 0x29, 0x29,   -1,   -1, "s_cbranch_g_fork", dst(), src(), InstrClass.Branch),
   (0x2c, 0x2c, 0x2a, 0x2a, 0x2c, 0x06, "s_absdiff_i32", dst(1, SCC), src(1, 1)),
   (  -1,   -1, 0x2b, 0x2b,   -1,   -1, "s_rfe_restore_b64", dst(), src(), InstrClass.Branch),
   (  -1,   -1,   -1, 0x2e, 0x2e, 0x0e, "s_lshl1_add_u32", dst(1, SCC), src(1, 1)),
   (  -1,   -1,   -1, 0x2f, 0x2f, 0x0f, "s_lshl2_add_u32", dst(1, SCC), src(1, 1)),
   (  -1,   -1,   -1, 0x30, 0x30, 0x10, "s_lshl3_add_u32", dst(1, SCC), src(1, 1)),
   (  -1,   -1,   -1, 0x31, 0x31, 0x11, "s_lshl4_add_u32", dst(1, SCC), src(1, 1)),
   (  -1,   -1,   -1, 0x32, 0x32, 0x32, "s_pack_ll_b32_b16", dst(1), src(1, 1)),
   (  -1,   -1,   -1, 0x33, 0x33, 0x33, "s_pack_lh_b32_b16", dst(1), src(1, 1)),
   (  -1,   -1,   -1, 0x34, 0x34, 0x34, "s_pack_hh_b32_b16", dst(1), src(1, 1)),
   (  -1,   -1,   -1,   -1,   -1, 0x35, "s_pack_hl_b32_b16", dst(1), src(1, 1)),
   (  -1,   -1,   -1, 0x2c, 0x35, 0x2d, "s_mul_hi_u32", dst(1), src(1, 1)),
   (  -1,   -1,   -1, 0x2d, 0x36, 0x2e, "s_mul_hi_i32", dst(1), src(1, 1)),
   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP2.
   (  -1,   -1,   -1,   -1,   -1,   -1, "p_constaddr_addlo", dst(1, SCC), src(1, 1, 1)),
   (  -1,   -1,   -1,   -1,   -1,   -1, "p_resumeaddr_addlo", dst(1, SCC), src(1, 1, 1)),
}
# Register every entry as Format.SOP2. Only the gfx7, gfx9, gfx10 and gfx11
# columns are passed to opcode(); gfx6 and gfx8 are unpacked but not used.
for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, defs, ops, cls) in default_class(SOP2, InstrClass.Salu):
    opcode(name, gfx7, gfx9, gfx10, gfx11, Format.SOP2, cls, definitions = defs, operands = ops)
490
491
# SOPK instructions: 0 input (+ imm), 1 output + optional scc
# Each entry is (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11 opcode numbers, name,
# dst(...), src(...)[, InstrClass]). Entries without a trailing InstrClass
# get InstrClass.Salu from default_class().
SOPK = {
  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "s_movk_i32", dst(1), src()),
   (  -1,   -1,   -1,   -1, 0x01, 0x01, "s_version", dst(), src()),
   (0x02, 0x02, 0x01, 0x01, 0x02, 0x02, "s_cmovk_i32", dst(1), src(1, SCC)),
   (0x03, 0x03, 0x02, 0x02, 0x03, 0x03, "s_cmpk_eq_i32", dst(SCC), src(1)),
   (0x04, 0x04, 0x03, 0x03, 0x04, 0x04, "s_cmpk_lg_i32", dst(SCC), src(1)),
   (0x05, 0x05, 0x04, 0x04, 0x05, 0x05, "s_cmpk_gt_i32", dst(SCC), src(1)),
   (0x06, 0x06, 0x05, 0x05, 0x06, 0x06, "s_cmpk_ge_i32", dst(SCC), src(1)),
   (0x07, 0x07, 0x06, 0x06, 0x07, 0x07, "s_cmpk_lt_i32", dst(SCC), src(1)),
   (0x08, 0x08, 0x07, 0x07, 0x08, 0x08, "s_cmpk_le_i32", dst(SCC), src(1)),
   (0x09, 0x09, 0x08, 0x08, 0x09, 0x09, "s_cmpk_eq_u32", dst(SCC), src(1)),
   (0x0a, 0x0a, 0x09, 0x09, 0x0a, 0x0a, "s_cmpk_lg_u32", dst(SCC), src(1)),
   (0x0b, 0x0b, 0x0a, 0x0a, 0x0b, 0x0b, "s_cmpk_gt_u32", dst(SCC), src(1)),
   (0x0c, 0x0c, 0x0b, 0x0b, 0x0c, 0x0c, "s_cmpk_ge_u32", dst(SCC), src(1)),
   (0x0d, 0x0d, 0x0c, 0x0c, 0x0d, 0x0d, "s_cmpk_lt_u32", dst(SCC), src(1)),
   (0x0e, 0x0e, 0x0d, 0x0d, 0x0e, 0x0e, "s_cmpk_le_u32", dst(SCC), src(1)),
   (0x0f, 0x0f, 0x0e, 0x0e, 0x0f, 0x0f, "s_addk_i32", dst(1, SCC), src(1)),
   (0x10, 0x10, 0x0f, 0x0f, 0x10, 0x10, "s_mulk_i32", dst(1), src(1)),
   (0x11, 0x11, 0x10, 0x10,   -1,   -1, "s_cbranch_i_fork", dst(), src(), InstrClass.Branch),
   (0x12, 0x12, 0x11, 0x11, 0x12, 0x11, "s_getreg_b32", dst(1), src()),
   (0x13, 0x13, 0x12, 0x12, 0x13, 0x12, "s_setreg_b32", dst(), src(1)),
   (0x15, 0x15, 0x14, 0x14, 0x15, 0x13, "s_setreg_imm32_b32", dst(), src(1)), # requires 32bit literal
   (  -1,   -1, 0x15, 0x15, 0x16, 0x14, "s_call_b64", dst(2), src(), InstrClass.Branch),
   (  -1,   -1,   -1,   -1, 0x17, 0x18, "s_waitcnt_vscnt", dst(), src(1), InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x18, 0x19, "s_waitcnt_vmcnt", dst(), src(1), InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x19, 0x1a, "s_waitcnt_expcnt", dst(), src(1), InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x1a, 0x1b, "s_waitcnt_lgkmcnt", dst(), src(1), InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x1b, 0x16, "s_subvector_loop_begin", dst(), src(), InstrClass.Branch),
   (  -1,   -1,   -1,   -1, 0x1c, 0x17, "s_subvector_loop_end", dst(), src(), InstrClass.Branch),
}
# Register every entry as Format.SOPK. Only the gfx7, gfx9, gfx10 and gfx11
# columns are passed to opcode(); gfx6 and gfx8 are unpacked but not used.
for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, defs, ops, cls) in default_class(SOPK, InstrClass.Salu):
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.SOPK, cls, definitions = defs, operands = ops)
526
527
528# SOP1 instructions: 1 input, 1 output (+optional SCC)
529SOP1 = {
530  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
531   (0x03, 0x03, 0x00, 0x00, 0x03, 0x00, "s_mov_b32", dst(1), src(1)),
532   (0x04, 0x04, 0x01, 0x01, 0x04, 0x01, "s_mov_b64", dst(2), src(2)),
533   (0x05, 0x05, 0x02, 0x02, 0x05, 0x02, "s_cmov_b32", dst(1), src(1, 1, SCC)),
534   (0x06, 0x06, 0x03, 0x03, 0x06, 0x03, "s_cmov_b64", dst(2), src(2, 2, SCC)),
535   (0x07, 0x07, 0x04, 0x04, 0x07, 0x1e, "s_not_b32", dst(1, SCC), src(1)),
536   (0x08, 0x08, 0x05, 0x05, 0x08, 0x1f, "s_not_b64", dst(2, SCC), src(2)),
537   (0x09, 0x09, 0x06, 0x06, 0x09, 0x1c, "s_wqm_b32", dst(1, SCC), src(1)),
538   (0x0a, 0x0a, 0x07, 0x07, 0x0a, 0x1d, "s_wqm_b64", dst(2, SCC), src(2)),
539   (0x0b, 0x0b, 0x08, 0x08, 0x0b, 0x04, "s_brev_b32", dst(1), src(1)),
540   (0x0c, 0x0c, 0x09, 0x09, 0x0c, 0x05, "s_brev_b64", dst(2), src(2)),
541   (0x0d, 0x0d, 0x0a, 0x0a, 0x0d, 0x16, "s_bcnt0_i32_b32", dst(1, SCC), src(1)),
542   (0x0e, 0x0e, 0x0b, 0x0b, 0x0e, 0x17, "s_bcnt0_i32_b64", dst(1, SCC), src(2)),
543   (0x0f, 0x0f, 0x0c, 0x0c, 0x0f, 0x18, "s_bcnt1_i32_b32", dst(1, SCC), src(1)),
544   (0x10, 0x10, 0x0d, 0x0d, 0x10, 0x19, "s_bcnt1_i32_b64", dst(1, SCC), src(2)),
545   (0x11, 0x11, 0x0e, 0x0e, 0x11,   -1, "s_ff0_i32_b32", dst(1), src(1)),
546   (0x12, 0x12, 0x0f, 0x0f, 0x12,   -1, "s_ff0_i32_b64", dst(1), src(2)),
547   (0x13, 0x13, 0x10, 0x10, 0x13, 0x08, "s_ff1_i32_b32", dst(1), src(1)), #s_ctz_i32_b32 in GFX11
548   (0x14, 0x14, 0x11, 0x11, 0x14, 0x09, "s_ff1_i32_b64", dst(1), src(2)), #s_ctz_i32_b64 in GFX11
549   (0x15, 0x15, 0x12, 0x12, 0x15, 0x0a, "s_flbit_i32_b32", dst(1), src(1)), #s_clz_i32_u32 in GFX11
550   (0x16, 0x16, 0x13, 0x13, 0x16, 0x0b, "s_flbit_i32_b64", dst(1), src(2)), #s_clz_i32_u64 in GFX11
551   (0x17, 0x17, 0x14, 0x14, 0x17, 0x0c, "s_flbit_i32", dst(1), src(1)), #s_cls_i32 in GFX11
552   (0x18, 0x18, 0x15, 0x15, 0x18, 0x0d, "s_flbit_i32_i64", dst(1), src(2)), #s_cls_i32_i64 in GFX11
553   (0x19, 0x19, 0x16, 0x16, 0x19, 0x0e, "s_sext_i32_i8", dst(1), src(1)),
554   (0x1a, 0x1a, 0x17, 0x17, 0x1a, 0x0f, "s_sext_i32_i16", dst(1), src(1)),
555   (0x1b, 0x1b, 0x18, 0x18, 0x1b, 0x10, "s_bitset0_b32", dst(1), src(1, 1)),
556   (0x1c, 0x1c, 0x19, 0x19, 0x1c, 0x11, "s_bitset0_b64", dst(2), src(1, 2)),
557   (0x1d, 0x1d, 0x1a, 0x1a, 0x1d, 0x12, "s_bitset1_b32", dst(1), src(1, 1)),
558   (0x1e, 0x1e, 0x1b, 0x1b, 0x1e, 0x13, "s_bitset1_b64", dst(2), src(1, 2)),
559   (0x1f, 0x1f, 0x1c, 0x1c, 0x1f, 0x47, "s_getpc_b64", dst(2), src()),
560   (0x20, 0x20, 0x1d, 0x1d, 0x20, 0x48, "s_setpc_b64", dst(), src(2), InstrClass.Branch),
561   (0x21, 0x21, 0x1e, 0x1e, 0x21, 0x49, "s_swappc_b64", dst(2), src(2), InstrClass.Branch),
562   (0x22, 0x22, 0x1f, 0x1f, 0x22, 0x4a, "s_rfe_b64", dst(), src(2), InstrClass.Branch),
563   (0x24, 0x24, 0x20, 0x20, 0x24, 0x21, "s_and_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)),
564   (0x25, 0x25, 0x21, 0x21, 0x25, 0x23, "s_or_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)),
565   (0x26, 0x26, 0x22, 0x22, 0x26, 0x25, "s_xor_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)),
566   (0x27, 0x27, 0x23, 0x23, 0x27, 0x31, "s_andn2_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)), #s_and_not1_saveexec_b64 in GFX11
567   (0x28, 0x28, 0x24, 0x24, 0x28, 0x33, "s_orn2_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)), #s_or_not1_saveexec_b64 in GFX11
568   (0x29, 0x29, 0x25, 0x25, 0x29, 0x27, "s_nand_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)),
569   (0x2a, 0x2a, 0x26, 0x26, 0x2a, 0x29, "s_nor_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)),
570   (0x2b, 0x2b, 0x27, 0x27, 0x2b, 0x2b, "s_xnor_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)),
571   (0x2c, 0x2c, 0x28, 0x28, 0x2c, 0x1a, "s_quadmask_b32", dst(1, SCC), src(1)),
572   (0x2d, 0x2d, 0x29, 0x29, 0x2d, 0x1b, "s_quadmask_b64", dst(2, SCC), src(2)), # Always writes 0 to the second SGPR
573   (0x2e, 0x2e, 0x2a, 0x2a, 0x2e, 0x40, "s_movrels_b32", dst(1), src(1, M0)),
574   (0x2f, 0x2f, 0x2b, 0x2b, 0x2f, 0x41, "s_movrels_b64", dst(2), src(2, M0)),
575   (0x30, 0x30, 0x2c, 0x2c, 0x30, 0x42, "s_movreld_b32", dst(1), src(1, M0)),
576   (0x31, 0x31, 0x2d, 0x2d, 0x31, 0x43, "s_movreld_b64", dst(2), src(2, M0)),
577   (0x32, 0x32, 0x2e, 0x2e,   -1,   -1, "s_cbranch_join", dst(), src(), InstrClass.Branch),
578   (0x34, 0x34, 0x30, 0x30, 0x34, 0x15, "s_abs_i32", dst(1, SCC), src(1)),
579   (0x35, 0x35,   -1,   -1, 0x35,   -1, "s_mov_fed_b32", dst(), src()),
580   (  -1,   -1, 0x32, 0x32,   -1,   -1, "s_set_gpr_idx_idx", dst(M0), src(1, M0)),
581   (  -1,   -1,   -1, 0x33, 0x37, 0x2d, "s_andn1_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)), #s_and_not0_savexec_b64 in GFX11
582   (  -1,   -1,   -1, 0x34, 0x38, 0x2f, "s_orn1_saveexec_b64", dst(2, SCC, EXEC), src(2, EXEC)), #s_or_not0_savexec_b64 in GFX11
583   (  -1,   -1,   -1, 0x35, 0x39, 0x35, "s_andn1_wrexec_b64", dst(2, SCC, EXEC), src(2, EXEC)), #s_and_not0_wrexec_b64 in GFX11
584   (  -1,   -1,   -1, 0x36, 0x3a, 0x37, "s_andn2_wrexec_b64", dst(2, SCC, EXEC), src(2, EXEC)), #s_and_not1_wrexec_b64 in GFX11
585   (  -1,   -1,   -1, 0x37, 0x3b, 0x14, "s_bitreplicate_b64_b32", dst(2), src(1)),
586   (  -1,   -1,   -1,   -1, 0x3c, 0x20, "s_and_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)),
587   (  -1,   -1,   -1,   -1, 0x3d, 0x22, "s_or_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)),
588   (  -1,   -1,   -1,   -1, 0x3e, 0x24, "s_xor_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)),
589   (  -1,   -1,   -1,   -1, 0x3f, 0x30, "s_andn2_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)), #s_and_not1_saveexec_b32 in GFX11
590   (  -1,   -1,   -1,   -1, 0x40, 0x32, "s_orn2_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)), #s_or_not1_saveexec_b32 in GFX11
591   (  -1,   -1,   -1,   -1, 0x41, 0x26, "s_nand_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)),
592   (  -1,   -1,   -1,   -1, 0x42, 0x28, "s_nor_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)),
593   (  -1,   -1,   -1,   -1, 0x43, 0x2a, "s_xnor_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)),
594   (  -1,   -1,   -1,   -1, 0x44, 0x2c, "s_andn1_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)), #s_and_not0_savexec_b32 in GFX11
595   (  -1,   -1,   -1,   -1, 0x45, 0x2e, "s_orn1_saveexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)), #s_or_not0_savexec_b32 in GFX11
596   (  -1,   -1,   -1,   -1, 0x46, 0x34, "s_andn1_wrexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)), #s_and_not0_wrexec_b32 in GFX11
597   (  -1,   -1,   -1,   -1, 0x47, 0x36, "s_andn2_wrexec_b32", dst(1, SCC, EXEC_LO), src(1, EXEC_LO)), #s_and_not1_wrexec_b32 in GFX11
598   (  -1,   -1,   -1,   -1, 0x49, 0x44, "s_movrelsd_2_b32", dst(1), src(1, M0)),
599   (  -1,   -1,   -1,   -1,   -1, 0x4c, "s_sendmsg_rtn_b32", dst(1), src(1)),
600   (  -1,   -1,   -1,   -1,   -1, 0x4d, "s_sendmsg_rtn_b64", dst(2), src(1)),
601   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1.
602   (  -1,   -1,   -1,   -1,   -1,   -1, "p_constaddr_getpc", dst(2), src(1)),
603   (  -1,   -1,   -1,   -1,   -1,   -1, "p_resumeaddr_getpc", dst(2), src(1)),
604   (  -1,   -1,   -1,   -1,   -1,   -1, "p_load_symbol", dst(1), src(1)),
605}
# Register every SOP1 row. default_class() yields the row with an instruction
# class as the tenth element; only the GFX7/GFX9/GFX10/GFX11 opcode numbers
# are forwarded to opcode() (gfx6 and gfx8 are unpacked but unused here).
for gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, defs, ops, cls in default_class(SOP1, InstrClass.Salu):
   opcode(
      name, gfx7, gfx9, gfx10, gfx11, Format.SOP1, cls,
      definitions = defs,
      operands = ops,
   )
608
609
# SOPC instructions: 2 inputs and 0 outputs (+SCC)
#
# Each row is (GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, name, definitions, operands):
# one opcode number per hardware generation, then the instruction name and
# its dst(...)/src(...) descriptions.
# -1 presumably marks a generation where the instruction is unavailable
# (TODO confirm against opcode()).
# NOTE: this is a set literal, so rows are not iterated in the order written.
SOPC = {
  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "s_cmp_eq_i32", dst(SCC), src(1, 1)),
   (0x01, 0x01, 0x01, 0x01, 0x01, 0x01, "s_cmp_lg_i32", dst(SCC), src(1, 1)),
   (0x02, 0x02, 0x02, 0x02, 0x02, 0x02, "s_cmp_gt_i32", dst(SCC), src(1, 1)),
   (0x03, 0x03, 0x03, 0x03, 0x03, 0x03, "s_cmp_ge_i32", dst(SCC), src(1, 1)),
   (0x04, 0x04, 0x04, 0x04, 0x04, 0x04, "s_cmp_lt_i32", dst(SCC), src(1, 1)),
   (0x05, 0x05, 0x05, 0x05, 0x05, 0x05, "s_cmp_le_i32", dst(SCC), src(1, 1)),
   (0x06, 0x06, 0x06, 0x06, 0x06, 0x06, "s_cmp_eq_u32", dst(SCC), src(1, 1)),
   (0x07, 0x07, 0x07, 0x07, 0x07, 0x07, "s_cmp_lg_u32", dst(SCC), src(1, 1)),
   (0x08, 0x08, 0x08, 0x08, 0x08, 0x08, "s_cmp_gt_u32", dst(SCC), src(1, 1)),
   (0x09, 0x09, 0x09, 0x09, 0x09, 0x09, "s_cmp_ge_u32", dst(SCC), src(1, 1)),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cmp_lt_u32", dst(SCC), src(1, 1)),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cmp_le_u32", dst(SCC), src(1, 1)),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_bitcmp0_b32", dst(SCC), src(1, 1)),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_bitcmp1_b32", dst(SCC), src(1, 1)),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_bitcmp0_b64", dst(SCC), src(2, 1)),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_bitcmp1_b64", dst(SCC), src(2, 1)),
   (0x10, 0x10, 0x10, 0x10,   -1,   -1, "s_setvskip", dst(), src(1, 1)),
   (  -1,   -1, 0x11, 0x11,   -1,   -1, "s_set_gpr_idx_on", dst(M0), src(1, 1, M0)),
   (  -1,   -1, 0x12, 0x12, 0x12, 0x10, "s_cmp_eq_u64", dst(SCC), src(2, 2)),
   (  -1,   -1, 0x13, 0x13, 0x13, 0x11, "s_cmp_lg_u64", dst(SCC), src(2, 2)),
}
# Register every SOPC row as a scalar-ALU opcode. Only the GFX7/GFX9/GFX10/
# GFX11 opcode numbers are forwarded to opcode() (gfx6 and gfx8 are unpacked
# but unused here).
for gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, defs, ops in SOPC:
   opcode(
      name, gfx7, gfx9, gfx10, gfx11, Format.SOPC, InstrClass.Salu,
      definitions = defs,
      operands = ops,
   )
636
637
# SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs
#
# Each row is (GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, name, definitions, operands
# [, instruction class]): one opcode number per hardware generation, then the
# instruction name and its dst(...)/src(...) descriptions.
# Rows with no trailing InstrClass are presumably given the default class by
# default_class() in the registration loop (defined elsewhere; TODO confirm).
# -1 presumably marks a generation where the instruction is unavailable
# (TODO confirm against opcode()).
# NOTE: this is a set literal, so rows are not iterated in the order written.
SOPP = {
  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "s_nop", dst(), src()),
   (0x01, 0x01, 0x01, 0x01, 0x01, 0x30, "s_endpgm", dst(), src()),
   (0x02, 0x02, 0x02, 0x02, 0x02, 0x20, "s_branch", dst(), src(), InstrClass.Branch),
   (  -1,   -1, 0x03, 0x03, 0x03, 0x34, "s_wakeup", dst(), src()),
   (0x04, 0x04, 0x04, 0x04, 0x04, 0x21, "s_cbranch_scc0", dst(), src(), InstrClass.Branch),
   (0x05, 0x05, 0x05, 0x05, 0x05, 0x22, "s_cbranch_scc1", dst(), src(), InstrClass.Branch),
   (0x06, 0x06, 0x06, 0x06, 0x06, 0x23, "s_cbranch_vccz", dst(), src(), InstrClass.Branch),
   (0x07, 0x07, 0x07, 0x07, 0x07, 0x24, "s_cbranch_vccnz", dst(), src(), InstrClass.Branch),
   (0x08, 0x08, 0x08, 0x08, 0x08, 0x25, "s_cbranch_execz", dst(), src(), InstrClass.Branch),
   (0x09, 0x09, 0x09, 0x09, 0x09, 0x26, "s_cbranch_execnz", dst(), src(), InstrClass.Branch),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x3d, "s_barrier", dst(), src(), InstrClass.Barrier),
   (  -1, 0x0b, 0x0b, 0x0b, 0x0b, 0x01, "s_setkill", dst(), src()),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x09, "s_waitcnt", dst(), src(), InstrClass.Waitcnt),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x02, "s_sethalt", dst(), src()),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x03, "s_sleep", dst(), src()),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x35, "s_setprio", dst(), src()),
   (0x10, 0x10, 0x10, 0x10, 0x10, 0x36, "s_sendmsg", dst(), src(), InstrClass.Sendmsg),
   (0x11, 0x11, 0x11, 0x11, 0x11, 0x37, "s_sendmsghalt", dst(), src(), InstrClass.Sendmsg),
   (0x12, 0x12, 0x12, 0x12, 0x12, 0x10, "s_trap", dst(), src(), InstrClass.Branch),
   (0x13, 0x13, 0x13, 0x13, 0x13, 0x3c, "s_icache_inv", dst(), src()),
   (0x14, 0x14, 0x14, 0x14, 0x14, 0x38, "s_incperflevel", dst(), src()),
   (0x15, 0x15, 0x15, 0x15, 0x15, 0x39, "s_decperflevel", dst(), src()),
   (0x16, 0x16, 0x16, 0x16, 0x16, 0x3a, "s_ttracedata", dst(), src(M0)),
   (  -1, 0x17, 0x17, 0x17, 0x17, 0x27, "s_cbranch_cdbgsys", dst(), src(), InstrClass.Branch),
   (  -1, 0x18, 0x18, 0x18, 0x18, 0x28, "s_cbranch_cdbguser", dst(), src(), InstrClass.Branch),
   (  -1, 0x19, 0x19, 0x19, 0x19, 0x29, "s_cbranch_cdbgsys_or_user", dst(), src(), InstrClass.Branch),
   (  -1, 0x1a, 0x1a, 0x1a, 0x1a, 0x2a, "s_cbranch_cdbgsys_and_user", dst(), src(), InstrClass.Branch),
   (  -1,   -1, 0x1b, 0x1b, 0x1b, 0x31, "s_endpgm_saved", dst(), src()),
   (  -1,   -1, 0x1c, 0x1c,   -1,   -1, "s_set_gpr_idx_off", dst(), src()),
   (  -1,   -1, 0x1d, 0x1d,   -1,   -1, "s_set_gpr_idx_mode", dst(M0), src(M0)),
   (  -1,   -1,   -1, 0x1e, 0x1e,   -1, "s_endpgm_ordered_ps_done", dst(), src()),
   (  -1,   -1,   -1,   -1, 0x1f, 0x1f, "s_code_end", dst(), src()),
   (  -1,   -1,   -1,   -1, 0x20, 0x04, "s_inst_prefetch", dst(), src()), #s_set_inst_prefetch_distance in GFX11
   (  -1,   -1,   -1,   -1, 0x21, 0x05, "s_clause", dst(), src()),
   (  -1,   -1,   -1,   -1, 0x22, 0x0a, "s_wait_idle", dst(), src(), InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x23, 0x08, "s_waitcnt_depctr", dst(), src(), InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1, 0x24, 0x11, "s_round_mode", dst(), src()),
   (  -1,   -1,   -1,   -1, 0x25, 0x12, "s_denorm_mode", dst(), src()),
   (  -1,   -1,   -1,   -1, 0x26, 0x3b, "s_ttracedata_imm", dst(), src()),
   (  -1,   -1,   -1,   -1,   -1, 0x07, "s_delay_alu", dst(), src(), InstrClass.Waitcnt),
   (  -1,   -1,   -1,   -1,   -1, 0x0b, "s_wait_event", dst(), src()),
}
# Register every SOPP row. default_class() yields the row with an instruction
# class as the tenth element; only the GFX7/GFX9/GFX10/GFX11 opcode numbers
# are forwarded to opcode() (gfx6 and gfx8 are unpacked but unused here).
for gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, defs, ops, cls in default_class(SOPP, InstrClass.Salu):
   opcode(
      name, gfx7, gfx9, gfx10, gfx11, Format.SOPP, cls,
      definitions = defs,
      operands = ops,
   )
685
686
# SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output
# Unlike GFX10, GFX10.3 does not have SMEM store, atomic or scratch instructions
#
# Each row is (GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, name): one opcode number
# per hardware generation followed by the instruction name. Unlike the other
# tables, rows carry no definitions/operands.
# -1 presumably marks a generation where the instruction is unavailable
# (TODO confirm against opcode()).
# The registration loop flags a row as atomic when its name contains "atomic".
# NOTE: this is a set literal, so rows are not iterated in the order written.
SMEM = {
  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "s_load_dword"), #s_load_b32 in GFX11
   (0x01, 0x01, 0x01, 0x01, 0x01, 0x01, "s_load_dwordx2"), #s_load_b64 in GFX11
   (0x02, 0x02, 0x02, 0x02, 0x02, 0x02, "s_load_dwordx4"), #s_load_b128 in GFX11
   (0x03, 0x03, 0x03, 0x03, 0x03, 0x03, "s_load_dwordx8"), #s_load_b256 in GFX11
   (0x04, 0x04, 0x04, 0x04, 0x04, 0x04, "s_load_dwordx16"), #s_load_b512 in GFX11
   (  -1,   -1,   -1, 0x05, 0x05,   -1, "s_scratch_load_dword"),
   (  -1,   -1,   -1, 0x06, 0x06,   -1, "s_scratch_load_dwordx2"),
   (  -1,   -1,   -1, 0x07, 0x07,   -1, "s_scratch_load_dwordx4"),
   (0x08, 0x08, 0x08, 0x08, 0x08, 0x08, "s_buffer_load_dword"), #s_buffer_load_b32 in GFX11
   (0x09, 0x09, 0x09, 0x09, 0x09, 0x09, "s_buffer_load_dwordx2"), #s_buffer_load_b64 in GFX11
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_buffer_load_dwordx4"), #s_buffer_load_b128 in GFX11
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_buffer_load_dwordx8"), #s_buffer_load_b256 in GFX11
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_buffer_load_dwordx16"), #s_buffer_load_b512 in GFX11
   (  -1,   -1, 0x10, 0x10, 0x10,   -1, "s_store_dword"),
   (  -1,   -1, 0x11, 0x11, 0x11,   -1, "s_store_dwordx2"),
   (  -1,   -1, 0x12, 0x12, 0x12,   -1, "s_store_dwordx4"),
   (  -1,   -1,   -1, 0x15, 0x15,   -1, "s_scratch_store_dword"),
   (  -1,   -1,   -1, 0x16, 0x16,   -1, "s_scratch_store_dwordx2"),
   (  -1,   -1,   -1, 0x17, 0x17,   -1, "s_scratch_store_dwordx4"),
   (  -1,   -1, 0x18, 0x18, 0x18,   -1, "s_buffer_store_dword"),
   (  -1,   -1, 0x19, 0x19, 0x19,   -1, "s_buffer_store_dwordx2"),
   (  -1,   -1, 0x1a, 0x1a, 0x1a,   -1, "s_buffer_store_dwordx4"),
   (  -1,   -1, 0x1f, 0x1f, 0x1f, 0x20, "s_gl1_inv"),
   (0x1f, 0x1f, 0x20, 0x20, 0x20, 0x21, "s_dcache_inv"),
   (  -1,   -1, 0x21, 0x21, 0x21,   -1, "s_dcache_wb"),
   (  -1, 0x1d, 0x22, 0x22,   -1,   -1, "s_dcache_inv_vol"),
   (  -1,   -1, 0x23, 0x23,   -1,   -1, "s_dcache_wb_vol"),
   (0x1e, 0x1e, 0x24, 0x24, 0x24,   -1, "s_memtime"), #GFX6-GFX10
   (  -1,   -1, 0x25, 0x25, 0x25,   -1, "s_memrealtime"),
   (  -1,   -1, 0x26, 0x26, 0x26, 0x22, "s_atc_probe"),
   (  -1,   -1, 0x27, 0x27, 0x27, 0x23, "s_atc_probe_buffer"),
   (  -1,   -1,   -1, 0x28, 0x28,   -1, "s_dcache_discard"),
   (  -1,   -1,   -1, 0x29, 0x29,   -1, "s_dcache_discard_x2"),
   (  -1,   -1,   -1,   -1, 0x2a,   -1, "s_get_waveid_in_workgroup"),
   (  -1,   -1,   -1, 0x40, 0x40,   -1, "s_buffer_atomic_swap"),
   (  -1,   -1,   -1, 0x41, 0x41,   -1, "s_buffer_atomic_cmpswap"),
   (  -1,   -1,   -1, 0x42, 0x42,   -1, "s_buffer_atomic_add"),
   (  -1,   -1,   -1, 0x43, 0x43,   -1, "s_buffer_atomic_sub"),
   (  -1,   -1,   -1, 0x44, 0x44,   -1, "s_buffer_atomic_smin"),
   (  -1,   -1,   -1, 0x45, 0x45,   -1, "s_buffer_atomic_umin"),
   (  -1,   -1,   -1, 0x46, 0x46,   -1, "s_buffer_atomic_smax"),
   (  -1,   -1,   -1, 0x47, 0x47,   -1, "s_buffer_atomic_umax"),
   (  -1,   -1,   -1, 0x48, 0x48,   -1, "s_buffer_atomic_and"),
   (  -1,   -1,   -1, 0x49, 0x49,   -1, "s_buffer_atomic_or"),
   (  -1,   -1,   -1, 0x4a, 0x4a,   -1, "s_buffer_atomic_xor"),
   (  -1,   -1,   -1, 0x4b, 0x4b,   -1, "s_buffer_atomic_inc"),
   (  -1,   -1,   -1, 0x4c, 0x4c,   -1, "s_buffer_atomic_dec"),
   (  -1,   -1,   -1, 0x60, 0x60,   -1, "s_buffer_atomic_swap_x2"),
   (  -1,   -1,   -1, 0x61, 0x61,   -1, "s_buffer_atomic_cmpswap_x2"),
   (  -1,   -1,   -1, 0x62, 0x62,   -1, "s_buffer_atomic_add_x2"),
   (  -1,   -1,   -1, 0x63, 0x63,   -1, "s_buffer_atomic_sub_x2"),
   (  -1,   -1,   -1, 0x64, 0x64,   -1, "s_buffer_atomic_smin_x2"),
   (  -1,   -1,   -1, 0x65, 0x65,   -1, "s_buffer_atomic_umin_x2"),
   (  -1,   -1,   -1, 0x66, 0x66,   -1, "s_buffer_atomic_smax_x2"),
   (  -1,   -1,   -1, 0x67, 0x67,   -1, "s_buffer_atomic_umax_x2"),
   (  -1,   -1,   -1, 0x68, 0x68,   -1, "s_buffer_atomic_and_x2"),
   (  -1,   -1,   -1, 0x69, 0x69,   -1, "s_buffer_atomic_or_x2"),
   (  -1,   -1,   -1, 0x6a, 0x6a,   -1, "s_buffer_atomic_xor_x2"),
   (  -1,   -1,   -1, 0x6b, 0x6b,   -1, "s_buffer_atomic_inc_x2"),
   (  -1,   -1,   -1, 0x6c, 0x6c,   -1, "s_buffer_atomic_dec_x2"),
   (  -1,   -1,   -1, 0x80, 0x80,   -1, "s_atomic_swap"),
   (  -1,   -1,   -1, 0x81, 0x81,   -1, "s_atomic_cmpswap"),
   (  -1,   -1,   -1, 0x82, 0x82,   -1, "s_atomic_add"),
   (  -1,   -1,   -1, 0x83, 0x83,   -1, "s_atomic_sub"),
   (  -1,   -1,   -1, 0x84, 0x84,   -1, "s_atomic_smin"),
   (  -1,   -1,   -1, 0x85, 0x85,   -1, "s_atomic_umin"),
   (  -1,   -1,   -1, 0x86, 0x86,   -1, "s_atomic_smax"),
   (  -1,   -1,   -1, 0x87, 0x87,   -1, "s_atomic_umax"),
   (  -1,   -1,   -1, 0x88, 0x88,   -1, "s_atomic_and"),
   (  -1,   -1,   -1, 0x89, 0x89,   -1, "s_atomic_or"),
   (  -1,   -1,   -1, 0x8a, 0x8a,   -1, "s_atomic_xor"),
   (  -1,   -1,   -1, 0x8b, 0x8b,   -1, "s_atomic_inc"),
   (  -1,   -1,   -1, 0x8c, 0x8c,   -1, "s_atomic_dec"),
   (  -1,   -1,   -1, 0xa0, 0xa0,   -1, "s_atomic_swap_x2"),
   (  -1,   -1,   -1, 0xa1, 0xa1,   -1, "s_atomic_cmpswap_x2"),
   (  -1,   -1,   -1, 0xa2, 0xa2,   -1, "s_atomic_add_x2"),
   (  -1,   -1,   -1, 0xa3, 0xa3,   -1, "s_atomic_sub_x2"),
   (  -1,   -1,   -1, 0xa4, 0xa4,   -1, "s_atomic_smin_x2"),
   (  -1,   -1,   -1, 0xa5, 0xa5,   -1, "s_atomic_umin_x2"),
   (  -1,   -1,   -1, 0xa6, 0xa6,   -1, "s_atomic_smax_x2"),
   (  -1,   -1,   -1, 0xa7, 0xa7,   -1, "s_atomic_umax_x2"),
   (  -1,   -1,   -1, 0xa8, 0xa8,   -1, "s_atomic_and_x2"),
   (  -1,   -1,   -1, 0xa9, 0xa9,   -1, "s_atomic_or_x2"),
   (  -1,   -1,   -1, 0xaa, 0xaa,   -1, "s_atomic_xor_x2"),
   (  -1,   -1,   -1, 0xab, 0xab,   -1, "s_atomic_inc_x2"),
   (  -1,   -1,   -1, 0xac, 0xac,   -1, "s_atomic_dec_x2"),
}
# Register every SMEM row. A row counts as atomic when its name contains the
# substring "atomic". Only the GFX7/GFX9/GFX10/GFX11 opcode numbers are
# forwarded to opcode() (gfx6 and gfx8 are unpacked but unused here).
for gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name in SMEM:
   opcode(
      name, gfx7, gfx9, gfx10, gfx11, Format.SMEM, InstrClass.SMem,
      is_atomic = "atomic" in name,
   )
780
781
# VOP2 instructions: 2 inputs, 1 output (+ optional vcc)
# TODO: misses some GFX6_7 opcodes which were shifted to VOP3 in GFX8
#
# Each row is (GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, name, input modifiers,
# output modifiers, definitions, operands): one opcode number per hardware
# generation, the instruction name, two booleans that the registration loop
# forwards to opcode() as in_mod/out_mod, and the dst(...)/src(...)
# descriptions.
# -1 presumably marks a generation where the instruction is unavailable
# (TODO confirm against opcode()).
# NOTE: this is a set literal, so rows are not iterated in the order written.
VOP2 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name, input modifiers, output modifiers
   (0x00, 0x00, 0x00, 0x00, 0x01, 0x01, "v_cndmask_b32", True, False, dst(1), src(1, 1, VCC)),
   (0x01, 0x01,   -1,   -1,   -1,   -1, "v_readlane_b32", False, False, dst(1), src(1, 1)),
   (0x02, 0x02,   -1,   -1,   -1,   -1, "v_writelane_b32", False, False, dst(1), src(1, 1, 1)),
   (0x03, 0x03, 0x01, 0x01, 0x03, 0x03, "v_add_f32", True, True, dst(1), src(1, 1)),
   (0x04, 0x04, 0x02, 0x02, 0x04, 0x04, "v_sub_f32", True, True, dst(1), src(1, 1)),
   (0x05, 0x05, 0x03, 0x03, 0x05, 0x05, "v_subrev_f32", True, True, dst(1), src(1, 1)),
   (0x06, 0x06,   -1,   -1, 0x06,   -1, "v_mac_legacy_f32", True, True, dst(1), src(1, 1, 1)), #GFX6,7,10
   (  -1,   -1,   -1,   -1, 0x06, 0x06, "v_fmac_legacy_f32", True, True, dst(1), src(1, 1, 1)), #GFX10.3+, v_fmac_dx9_zero_f32 in GFX11
   (0x07, 0x07, 0x04, 0x04, 0x07, 0x07, "v_mul_legacy_f32", True, True, dst(1), src(1, 1)), #v_mul_dx9_zero_f32 in GFX11
   (0x08, 0x08, 0x05, 0x05, 0x08, 0x08, "v_mul_f32", True, True, dst(1), src(1, 1)),
   (0x09, 0x09, 0x06, 0x06, 0x09, 0x09, "v_mul_i32_i24", False, False, dst(1), src(1, 1)),
   (0x0a, 0x0a, 0x07, 0x07, 0x0a, 0x0a, "v_mul_hi_i32_i24", False, False, dst(1), src(1, 1)),
   (0x0b, 0x0b, 0x08, 0x08, 0x0b, 0x0b, "v_mul_u32_u24", False, False, dst(1), src(1, 1)),
   (0x0c, 0x0c, 0x09, 0x09, 0x0c, 0x0c, "v_mul_hi_u32_u24", False, False, dst(1), src(1, 1)),
   (  -1,   -1,   -1, 0x39, 0x0d,   -1, "v_dot4c_i32_i8", False, False, dst(1), src(1, 1, 1)),
   (0x0d, 0x0d,   -1,   -1,   -1,   -1, "v_min_legacy_f32", True, True, dst(1), src(1, 1)),
   (0x0e, 0x0e,   -1,   -1,   -1,   -1, "v_max_legacy_f32", True, True, dst(1), src(1, 1)),
   (0x0f, 0x0f, 0x0a, 0x0a, 0x0f, 0x0f, "v_min_f32", True, True, dst(1), src(1, 1)),
   (0x10, 0x10, 0x0b, 0x0b, 0x10, 0x10, "v_max_f32", True, True, dst(1), src(1, 1)),
   (0x11, 0x11, 0x0c, 0x0c, 0x11, 0x11, "v_min_i32", False, False, dst(1), src(1, 1)),
   (0x12, 0x12, 0x0d, 0x0d, 0x12, 0x12, "v_max_i32", False, False, dst(1), src(1, 1)),
   (0x13, 0x13, 0x0e, 0x0e, 0x13, 0x13, "v_min_u32", False, False, dst(1), src(1, 1)),
   (0x14, 0x14, 0x0f, 0x0f, 0x14, 0x14, "v_max_u32", False, False, dst(1), src(1, 1)),
   (0x15, 0x15,   -1,   -1,   -1,   -1, "v_lshr_b32", False, False, dst(1), src(1, 1)),
   (0x16, 0x16, 0x10, 0x10, 0x16, 0x19, "v_lshrrev_b32", False, False, dst(1), src(1, 1)),
   (0x17, 0x17,   -1,   -1,   -1,   -1, "v_ashr_i32", False, False, dst(1), src(1, 1)),
   (0x18, 0x18, 0x11, 0x11, 0x18, 0x1a, "v_ashrrev_i32", False, False, dst(1), src(1, 1)),
   (0x19, 0x19,   -1,   -1,   -1,   -1, "v_lshl_b32", False, False, dst(1), src(1, 1)),
   (0x1a, 0x1a, 0x12, 0x12, 0x1a, 0x18, "v_lshlrev_b32", False, False, dst(1), src(1, 1)),
   (0x1b, 0x1b, 0x13, 0x13, 0x1b, 0x1b, "v_and_b32", False, False, dst(1), src(1, 1)),
   (0x1c, 0x1c, 0x14, 0x14, 0x1c, 0x1c, "v_or_b32", False, False, dst(1), src(1, 1)),
   (0x1d, 0x1d, 0x15, 0x15, 0x1d, 0x1d, "v_xor_b32", False, False, dst(1), src(1, 1)),
   (  -1,   -1,   -1,   -1, 0x1e, 0x1e, "v_xnor_b32", False, False, dst(1), src(1, 1)),
   (0x1f, 0x1f, 0x16, 0x16, 0x1f,   -1, "v_mac_f32", True, True, dst(1), src(1, 1, 1)),
   (0x20, 0x20, 0x17, 0x17, 0x20,   -1, "v_madmk_f32", False, False, dst(1), src(1, 1, 1)),
   (0x21, 0x21, 0x18, 0x18, 0x21,   -1, "v_madak_f32", False, False, dst(1), src(1, 1, 1)),
   (0x24, 0x24,   -1,   -1,   -1,   -1, "v_mbcnt_hi_u32_b32", False, False, dst(1), src(1, 1)),
   (0x25, 0x25, 0x19, 0x19,   -1,   -1, "v_add_co_u32", False, False, dst(1, VCC), src(1, 1)), # VOP3B only in RDNA
   (0x26, 0x26, 0x1a, 0x1a,   -1,   -1, "v_sub_co_u32", False, False, dst(1, VCC), src(1, 1)), # VOP3B only in RDNA
   (0x27, 0x27, 0x1b, 0x1b,   -1,   -1, "v_subrev_co_u32", False, False, dst(1, VCC), src(1, 1)), # VOP3B only in RDNA
   (0x28, 0x28, 0x1c, 0x1c, 0x28, 0x20, "v_addc_co_u32", False, False, dst(1, VCC), src(1, 1, VCC)), # v_add_co_ci_u32 in RDNA
   (0x29, 0x29, 0x1d, 0x1d, 0x29, 0x21, "v_subb_co_u32", False, False, dst(1, VCC), src(1, 1, VCC)), # v_sub_co_ci_u32 in RDNA
   (0x2a, 0x2a, 0x1e, 0x1e, 0x2a, 0x22, "v_subbrev_co_u32", False, False, dst(1, VCC), src(1, 1, VCC)), # v_subrev_co_ci_u32 in RDNA
   (  -1,   -1,   -1,   -1, 0x2b, 0x2b, "v_fmac_f32", True, True, dst(1), src(1, 1, 1)),
   (  -1,   -1,   -1,   -1, 0x2c, 0x2c, "v_fmamk_f32", False, False, dst(1), src(1, 1, 1)),
   (  -1,   -1,   -1,   -1, 0x2d, 0x2d, "v_fmaak_f32", False, False, dst(1), src(1, 1, 1)),
   (0x2f, 0x2f,   -1,   -1, 0x2f, 0x2f, "v_cvt_pkrtz_f16_f32", True, False, dst(1), src(1, 1)), #v_cvt_pk_rtz_f16_f32 in GFX11
   (  -1,   -1, 0x1f, 0x1f, 0x32, 0x32, "v_add_f16", True, True, dst(1), src(1, 1)),
   (  -1,   -1, 0x20, 0x20, 0x33, 0x33, "v_sub_f16", True, True, dst(1), src(1, 1)),
   (  -1,   -1, 0x21, 0x21, 0x34, 0x34, "v_subrev_f16", True, True, dst(1), src(1, 1)),
   (  -1,   -1, 0x22, 0x22, 0x35, 0x35, "v_mul_f16", True, True, dst(1), src(1, 1)),
   (  -1,   -1, 0x23, 0x23,   -1,   -1, "v_mac_f16", True, True, dst(1), src(1, 1, 1)),
   (  -1,   -1, 0x24, 0x24,   -1,   -1, "v_madmk_f16", False, False, dst(1), src(1, 1, 1)),
   (  -1,   -1, 0x25, 0x25,   -1,   -1, "v_madak_f16", False, False, dst(1), src(1, 1, 1)),
   (  -1,   -1, 0x26, 0x26,   -1,   -1, "v_add_u16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x27, 0x27,   -1,   -1, "v_sub_u16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x28, 0x28,   -1,   -1, "v_subrev_u16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x29, 0x29,   -1,   -1, "v_mul_lo_u16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x2a, 0x2a,   -1,   -1, "v_lshlrev_b16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x2b, 0x2b,   -1,   -1, "v_lshrrev_b16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x2c, 0x2c,   -1,   -1, "v_ashrrev_i16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x2d, 0x2d, 0x39, 0x39, "v_max_f16", True, True, dst(1), src(1, 1)),
   (  -1,   -1, 0x2e, 0x2e, 0x3a, 0x3a, "v_min_f16", True, True, dst(1), src(1, 1)),
   (  -1,   -1, 0x2f, 0x2f,   -1,   -1, "v_max_u16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x30, 0x30,   -1,   -1, "v_max_i16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x31, 0x31,   -1,   -1, "v_min_u16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x32, 0x32,   -1,   -1, "v_min_i16", False, False, dst(1), src(1, 1)),
   (  -1,   -1, 0x33, 0x33, 0x3b, 0x3b, "v_ldexp_f16", False, True, dst(1), src(1, 1)),
   (  -1,   -1,   -1, 0x34, 0x25, 0x25, "v_add_u32", False, False, dst(1), src(1, 1)), # called v_add_nc_u32 in RDNA
   (  -1,   -1,   -1, 0x35, 0x26, 0x26, "v_sub_u32", False, False, dst(1), src(1, 1)), # called v_sub_nc_u32 in RDNA
   (  -1,   -1,   -1, 0x36, 0x27, 0x27, "v_subrev_u32", False, False, dst(1), src(1, 1)), # called v_subrev_nc_u32 in RDNA
   (  -1,   -1,   -1,   -1, 0x36, 0x36, "v_fmac_f16", True, True, dst(1), src(1, 1, 1)),
   (  -1,   -1,   -1,   -1, 0x37, 0x37, "v_fmamk_f16", False, False, dst(1), src(1, 1, 1)),
   (  -1,   -1,   -1,   -1, 0x38, 0x38, "v_fmaak_f16", False, False, dst(1), src(1, 1, 1)),
   (  -1,   -1,   -1,   -1, 0x3c, 0x3c, "v_pk_fmac_f16", False, False, dst(1), src(1, 1, 1)),
   (  -1,   -1,   -1, 0x37, 0x02, 0x02, "v_dot2c_f32_f16", False, False, dst(1), src(1, 1, 1)), #v_dot2acc_f32_f16 in GFX11
}
# Register every VOP2 row with the Valu32 instruction class, forwarding the
# row's input/output modifier flags positionally. Only the GFX7/GFX9/GFX10/
# GFX11 opcode numbers are forwarded to opcode() (gfx6 and gfx8 are unpacked
# but unused here).
for gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, defs, ops in VOP2:
   opcode(
      name, gfx7, gfx9, gfx10, gfx11, Format.VOP2, InstrClass.Valu32, in_mod, out_mod,
      definitions = defs,
      operands = ops,
   )
865
866
# VOP1 instructions: instructions with 1 input and 1 output
#
# Each row is (GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, name, input modifiers,
# output modifiers, definitions, operands[, instruction class]): one opcode
# number per hardware generation, the instruction name, two booleans that the
# registration loop forwards to opcode() as in_mod/out_mod, and the
# dst(...)/src(...) descriptions.
# Rows with no trailing InstrClass are presumably given the default class by
# default_class() in the registration loop (defined elsewhere; TODO confirm).
# -1 presumably marks a generation where the instruction is unavailable
# (TODO confirm against opcode()).
# NOTE: this is a set literal, so rows are not iterated in the order written.
VOP1 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10,GFX11,name, input_modifiers, output_modifiers
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "v_nop", False, False, dst(), src()),
   (0x01, 0x01, 0x01, 0x01, 0x01, 0x01, "v_mov_b32", False, False, dst(1), src(1)),
   (0x02, 0x02, 0x02, 0x02, 0x02, 0x02, "v_readfirstlane_b32", False, False, dst(1), src(1)),
   (0x03, 0x03, 0x03, 0x03, 0x03, 0x03, "v_cvt_i32_f64", True, False, dst(1), src(2), InstrClass.ValuDoubleConvert),
   (0x04, 0x04, 0x04, 0x04, 0x04, 0x04, "v_cvt_f64_i32", False, True, dst(2), src(1), InstrClass.ValuDoubleConvert),
   (0x05, 0x05, 0x05, 0x05, 0x05, 0x05, "v_cvt_f32_i32", False, True, dst(1), src(1)),
   (0x06, 0x06, 0x06, 0x06, 0x06, 0x06, "v_cvt_f32_u32", False, True, dst(1), src(1)),
   (0x07, 0x07, 0x07, 0x07, 0x07, 0x07, "v_cvt_u32_f32", True, False, dst(1), src(1)),
   (0x08, 0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False, dst(1), src(1)),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True, dst(1), src(1)),
   (  -1,   -1,   -1,   -1,   -1,   -1, "p_cvt_f16_f32_rtne", True, True, dst(1), src(1)),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True, dst(1), src(1)),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False, dst(1), src(1)), #v_cvt_nearest_i32_f32 in GFX11
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False, dst(1), src(1)),#v_cvt_floor_i32_f32 in GFX11
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "v_cvt_off_f32_i4", False, True, dst(1), src(1)),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "v_cvt_f32_f64", True, True, dst(1), src(2), InstrClass.ValuDoubleConvert),
   (0x10, 0x10, 0x10, 0x10, 0x10, 0x10, "v_cvt_f64_f32", True, True, dst(2), src(1), InstrClass.ValuDoubleConvert),
   (0x11, 0x11, 0x11, 0x11, 0x11, 0x11, "v_cvt_f32_ubyte0", False, True, dst(1), src(1)),
   (0x12, 0x12, 0x12, 0x12, 0x12, 0x12, "v_cvt_f32_ubyte1", False, True, dst(1), src(1)),
   (0x13, 0x13, 0x13, 0x13, 0x13, 0x13, "v_cvt_f32_ubyte2", False, True, dst(1), src(1)),
   (0x14, 0x14, 0x14, 0x14, 0x14, 0x14, "v_cvt_f32_ubyte3", False, True, dst(1), src(1)),
   (0x15, 0x15, 0x15, 0x15, 0x15, 0x15, "v_cvt_u32_f64", True, False, dst(1), src(2), InstrClass.ValuDoubleConvert),
   (0x16, 0x16, 0x16, 0x16, 0x16, 0x16, "v_cvt_f64_u32", False, True, dst(2), src(1), InstrClass.ValuDoubleConvert),
   (  -1, 0x17, 0x17, 0x17, 0x17, 0x17, "v_trunc_f64", True, True, dst(2), src(2), InstrClass.ValuDouble),
   (  -1, 0x18, 0x18, 0x18, 0x18, 0x18, "v_ceil_f64", True, True, dst(2), src(2), InstrClass.ValuDouble),
   (  -1, 0x19, 0x19, 0x19, 0x19, 0x19, "v_rndne_f64", True, True, dst(2), src(2), InstrClass.ValuDouble),
   (  -1, 0x1a, 0x1a, 0x1a, 0x1a, 0x1a, "v_floor_f64", True, True, dst(2), src(2), InstrClass.ValuDouble),
   (  -1,   -1,   -1,   -1, 0x1b, 0x1b, "v_pipeflush", False, False, dst(), src()),
   (0x20, 0x20, 0x1b, 0x1b, 0x20, 0x20, "v_fract_f32", True, True, dst(1), src(1)),
   (0x21, 0x21, 0x1c, 0x1c, 0x21, 0x21, "v_trunc_f32", True, True, dst(1), src(1)),
   (0x22, 0x22, 0x1d, 0x1d, 0x22, 0x22, "v_ceil_f32", True, True, dst(1), src(1)),
   (0x23, 0x23, 0x1e, 0x1e, 0x23, 0x23, "v_rndne_f32", True, True, dst(1), src(1)),
   (0x24, 0x24, 0x1f, 0x1f, 0x24, 0x24, "v_floor_f32", True, True, dst(1), src(1)),
   (0x25, 0x25, 0x20, 0x20, 0x25, 0x25, "v_exp_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x26, 0x26,   -1,   -1,   -1,   -1, "v_log_clamp_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x27, 0x27, 0x21, 0x21, 0x27, 0x27, "v_log_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x28, 0x28,   -1,   -1,   -1,   -1, "v_rcp_clamp_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x29, 0x29,   -1,   -1,   -1,   -1, "v_rcp_legacy_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x2a, 0x2a, 0x22, 0x22, 0x2a, 0x2a, "v_rcp_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x2b, 0x2b, 0x23, 0x23, 0x2b, 0x2b, "v_rcp_iflag_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x2c, 0x2c,   -1,   -1,   -1,   -1, "v_rsq_clamp_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x2d, 0x2d,   -1,   -1,   -1,   -1, "v_rsq_legacy_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x2e, 0x2e, 0x24, 0x24, 0x2e, 0x2e, "v_rsq_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x2f, 0x2f, 0x25, 0x25, 0x2f, 0x2f, "v_rcp_f64", True, True, dst(2), src(2), InstrClass.ValuDoubleTranscendental),
   (0x30, 0x30,   -1,   -1,   -1,   -1, "v_rcp_clamp_f64", True, True, dst(2), src(2), InstrClass.ValuDoubleTranscendental),
   (0x31, 0x31, 0x26, 0x26, 0x31, 0x31, "v_rsq_f64", True, True, dst(2), src(2), InstrClass.ValuDoubleTranscendental),
   (0x32, 0x32,   -1,   -1,   -1,   -1, "v_rsq_clamp_f64", True, True, dst(2), src(2), InstrClass.ValuDoubleTranscendental),
   (0x33, 0x33, 0x27, 0x27, 0x33, 0x33, "v_sqrt_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x34, 0x34, 0x28, 0x28, 0x34, 0x34, "v_sqrt_f64", True, True, dst(2), src(2), InstrClass.ValuDoubleTranscendental),
   (0x35, 0x35, 0x29, 0x29, 0x35, 0x35, "v_sin_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x36, 0x36, 0x2a, 0x2a, 0x36, 0x36, "v_cos_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (0x37, 0x37, 0x2b, 0x2b, 0x37, 0x37, "v_not_b32", False, False, dst(1), src(1)),
   (0x38, 0x38, 0x2c, 0x2c, 0x38, 0x38, "v_bfrev_b32", False, False, dst(1), src(1)),
   (0x39, 0x39, 0x2d, 0x2d, 0x39, 0x39, "v_ffbh_u32", False, False, dst(1), src(1)), #v_clz_i32_u32 in GFX11
   (0x3a, 0x3a, 0x2e, 0x2e, 0x3a, 0x3a, "v_ffbl_b32", False, False, dst(1), src(1)), #v_ctz_i32_b32 in GFX11
   (0x3b, 0x3b, 0x2f, 0x2f, 0x3b, 0x3b, "v_ffbh_i32", False, False, dst(1), src(1)), #v_cls_i32 in GFX11
   (0x3c, 0x3c, 0x30, 0x30, 0x3c, 0x3c, "v_frexp_exp_i32_f64", True, False, dst(1), src(2), InstrClass.ValuDouble),
   (0x3d, 0x3d, 0x31, 0x31, 0x3d, 0x3d, "v_frexp_mant_f64", True, False, dst(2), src(2), InstrClass.ValuDouble),
   (0x3e, 0x3e, 0x32, 0x32, 0x3e, 0x3e, "v_fract_f64", True, True, dst(2), src(2), InstrClass.ValuDouble),
   (0x3f, 0x3f, 0x33, 0x33, 0x3f, 0x3f, "v_frexp_exp_i32_f32", True, False, dst(1), src(1)),
   (0x40, 0x40, 0x34, 0x34, 0x40, 0x40, "v_frexp_mant_f32", True, False, dst(1), src(1)),
   (0x41, 0x41, 0x35, 0x35, 0x41,   -1, "v_clrexcp", False, False, dst(), src()),
   (0x42, 0x42, 0x36,   -1, 0x42, 0x42, "v_movreld_b32", False, False, dst(1), src(1, M0)),
   (0x43, 0x43, 0x37,   -1, 0x43, 0x43, "v_movrels_b32", False, False, dst(1), src(1, M0)),
   (0x44, 0x44, 0x38,   -1, 0x44, 0x44, "v_movrelsd_b32", False, False, dst(1), src(1, M0)),
   (  -1,   -1,   -1,   -1, 0x48, 0x48, "v_movrelsd_2_b32", False, False, dst(1), src(1, M0)),
   (  -1,   -1,   -1, 0x37,   -1,   -1, "v_screen_partition_4se_b32", False, False, dst(1), src(1)),
   (  -1,   -1, 0x39, 0x39, 0x50, 0x50, "v_cvt_f16_u16", False, True, dst(1), src(1)),
   (  -1,   -1, 0x3a, 0x3a, 0x51, 0x51, "v_cvt_f16_i16", False, True, dst(1), src(1)),
   (  -1,   -1, 0x3b, 0x3b, 0x52, 0x52, "v_cvt_u16_f16", True, False, dst(1), src(1)),
   (  -1,   -1, 0x3c, 0x3c, 0x53, 0x53, "v_cvt_i16_f16", True, False, dst(1), src(1)),
   (  -1,   -1, 0x3d, 0x3d, 0x54, 0x54, "v_rcp_f16", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x3e, 0x3e, 0x55, 0x55, "v_sqrt_f16", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x3f, 0x3f, 0x56, 0x56, "v_rsq_f16", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x40, 0x40, 0x57, 0x57, "v_log_f16", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x41, 0x41, 0x58, 0x58, "v_exp_f16", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x42, 0x42, 0x59, 0x59, "v_frexp_mant_f16", True, False, dst(1), src(1)),
   (  -1,   -1, 0x43, 0x43, 0x5a, 0x5a, "v_frexp_exp_i16_f16", True, False, dst(1), src(1)),
   (  -1,   -1, 0x44, 0x44, 0x5b, 0x5b, "v_floor_f16", True, True, dst(1), src(1)),
   (  -1,   -1, 0x45, 0x45, 0x5c, 0x5c, "v_ceil_f16", True, True, dst(1), src(1)),
   (  -1,   -1, 0x46, 0x46, 0x5d, 0x5d, "v_trunc_f16", True, True, dst(1), src(1)),
   (  -1,   -1, 0x47, 0x47, 0x5e, 0x5e, "v_rndne_f16", True, True, dst(1), src(1)),
   (  -1,   -1, 0x48, 0x48, 0x5f, 0x5f, "v_fract_f16", True, True, dst(1), src(1)),
   (  -1,   -1, 0x49, 0x49, 0x60, 0x60, "v_sin_f16", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1,   -1, 0x4a, 0x4a, 0x61, 0x61, "v_cos_f16", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1, 0x46, 0x4b, 0x4b,   -1,   -1, "v_exp_legacy_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1, 0x45, 0x4c, 0x4c,   -1,   -1, "v_log_legacy_f32", True, True, dst(1), src(1), InstrClass.ValuTranscendental32),
   (  -1,   -1,   -1, 0x4f, 0x62, 0x62, "v_sat_pk_u8_i16", False, False, dst(1), src(1)),
   (  -1,   -1,   -1, 0x4d, 0x63, 0x63, "v_cvt_norm_i16_f16", True, False, dst(1), src(1)),
   (  -1,   -1,   -1, 0x4e, 0x64, 0x64, "v_cvt_norm_u16_f16", True, False, dst(1), src(1)),
   (  -1,   -1,   -1, 0x51, 0x65, 0x65, "v_swap_b32", False, False, dst(1, 1), src(1, 1)),
   (  -1,   -1,   -1,   -1, 0x68, 0x68, "v_swaprel_b32", False, False, dst(1, 1), src(1, 1, M0)),
   (  -1,   -1,   -1,   -1,   -1, 0x67, "v_permlane64_b32", False, False, dst(1), src(1)), #cannot use VOP3
   (  -1,   -1,   -1,   -1,   -1, 0x69, "v_not_b16", False, False, dst(1), src(1)),
   (  -1,   -1,   -1,   -1,   -1, 0x6a, "v_cvt_i32_i16", False, False, dst(1), src(1)),
   (  -1,   -1,   -1,   -1,   -1, 0x6b, "v_cvt_u32_u16", False, False, dst(1), src(1)),
   (  -1,   -1,   -1,   -1,   -1, 0x1c, "v_mov_b16", True, False, dst(1), src(1)),
}
# Register every VOP1 row. default_class() yields the row with an instruction
# class as the twelfth element; the row's input/output modifier flags are
# forwarded positionally. Only the GFX7/GFX9/GFX10/GFX11 opcode numbers are
# forwarded to opcode() (gfx6 and gfx8 are unpacked but unused here).
for gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, defs, ops, cls in default_class(VOP1, InstrClass.Valu32):
   opcode(
      name, gfx7, gfx9, gfx10, gfx11, Format.VOP1, cls, in_mod, out_mod,
      definitions = defs,
      operands = ops,
   )
970
971
# VOPC instructions:

# v_cmp_class / v_cmpx_class variants.
# Each row is (GFX6, GFX7, GFX8, GFX9, GFX10, GFX11, name, definitions, operands
# [, instruction class]). In this table the v_cmp_* rows define VCC and the
# v_cmpx_* rows define EXEC.
# Rows with no trailing InstrClass are presumably given the default class by
# default_class() in the registration loop (defined elsewhere; TODO confirm).
# -1 presumably marks a generation where the instruction is unavailable
# (TODO confirm against opcode()).
# NOTE: this is a set literal, so rows are not iterated in the order written.
VOPC_CLASS = {
   (0x88, 0x88, 0x10, 0x10, 0x88, 0x7e, "v_cmp_class_f32", dst(VCC), src(1, 1)),
   (  -1,   -1, 0x14, 0x14, 0x8f, 0x7d, "v_cmp_class_f16", dst(VCC), src(1, 1)),
   (0x98, 0x98, 0x11, 0x11, 0x98, 0xfe, "v_cmpx_class_f32", dst(EXEC), src(1, 1)),
   (  -1,   -1, 0x15, 0x15, 0x9f, 0xfd, "v_cmpx_class_f16", dst(EXEC), src(1, 1)),
   (0xa8, 0xa8, 0x12, 0x12, 0xa8, 0x7f, "v_cmp_class_f64", dst(VCC), src(2, 1), InstrClass.ValuDouble),
   (0xb8, 0xb8, 0x13, 0x13, 0xb8, 0xff, "v_cmpx_class_f64", dst(EXEC), src(2, 1), InstrClass.ValuDouble),
}
982for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, defs, ops, cls) in default_class(VOPC_CLASS, InstrClass.Valu32):
983    opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, cls, True, False, definitions = defs, operands = ops)
984
# Float compare condition mnemonics. The list index is the offset that the
# loops below add to each per-generation base opcode.
COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"]

# f16 compares: the gfx6/gfx7 slots are -1. The sixteen conditions are
# registered as two halves of eight (COMPF[i] and COMPF[i+8]) because each half
# is given its own base opcode. v_cmp_* defines VCC, v_cmpx_* defines EXEC.
for i in range(8):
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0x20+i, 0x20+i, 0xc8+i, 0x00+i, "v_cmp_"+COMPF[i]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, True, False, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0x30+i, 0x30+i, 0xd8+i, 0x80+i, "v_cmpx_"+COMPF[i]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, True, False, definitions = dst(EXEC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0x28+i, 0x28+i, 0xe8+i, 0x08+i, "v_cmp_"+COMPF[i+8]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, True, False, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0x38+i, 0x38+i, 0xf8+i, 0x88+i, "v_cmpx_"+COMPF[i+8]+"_f16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, True, False, definitions = dst(EXEC), operands = src(1, 1))

# f32 and f64 compares: all sixteen conditions share one base opcode per
# generation. The f64 variants take two 2-dword operands and use
# InstrClass.ValuDouble.
for i in range(16):
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x00+i, 0x00+i, 0x40+i, 0x40+i, 0x00+i, 0x10+i, "v_cmp_"+COMPF[i]+"_f32")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, True, False, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x10+i, 0x10+i, 0x50+i, 0x50+i, 0x10+i, 0x90+i, "v_cmpx_"+COMPF[i]+"_f32")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, True, False, definitions = dst(EXEC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x20+i, 0x20+i, 0x60+i, 0x60+i, 0x20+i, 0x20+i, "v_cmp_"+COMPF[i]+"_f64")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.ValuDouble, True, False, definitions = dst(VCC), operands = src(2, 2))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x30+i, 0x30+i, 0x70+i, 0x70+i, 0x30+i, 0xa0+i, "v_cmpx_"+COMPF[i]+"_f64")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.ValuDouble, True, False, definitions = dst(EXEC), operands = src(2, 2))
   # GFX_6_7
   # NOTE(review): no opcode() call follows the four assignments below, so the
   # v_cmps*/v_cmpsx* names are never registered; the tuples only record the
   # GFX6/GFX7 encodings. Confirm that leaving them unregistered is intended.
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x40+i, 0x40+i, -1, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32")
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x50+i, 0x50+i, -1, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32")
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x60+i, 0x60+i, -1, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f64")
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x70+i, 0x70+i, -1, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f64")
1011
# Integer compare condition mnemonics. The list index is the offset that the
# loops below add to each per-generation base opcode.
COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"]

# GFX_8_9
# The "f" (index 0) and "tru" (index 7) 16-bit compares carry only gfx8/gfx9
# encodings; the gfx10 and gfx11 slots are -1.
for i in [0, 7]: # only 0 and 7
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xa0+i, 0xa0+i, -1, -1, "v_cmp_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xb0+i, 0xb0+i, -1, -1, "v_cmpx_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(EXEC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xa8+i, 0xa8+i, -1, -1, "v_cmp_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xb8+i, 0xb8+i, -1, -1, "v_cmpx_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(EXEC), operands = src(1, 1))

# The remaining 16-bit compares (indices 1..6) have encodings from gfx8 through
# gfx11.
for i in range(1, 7): # [1..6]
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xa0+i, 0xa0+i, 0x88+i, 0x30+i, "v_cmp_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xb0+i, 0xb0+i, 0x98+i, 0xb0+i, "v_cmpx_"+COMPI[i]+"_i16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(EXEC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xa8+i, 0xa8+i, 0xa8+i, 0x38+i, "v_cmp_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, 0xb8+i, 0xb8+i, 0xb8+i, 0xb8+i, "v_cmpx_"+COMPI[i]+"_u16")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(EXEC), operands = src(1, 1))

# 32-bit and 64-bit integer compares: all eight conditions on every generation.
# The 64-bit variants take two 2-dword operands and use InstrClass.Valu64.
for i in range(8):
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x80+i, 0x80+i, 0xc0+i, 0xc0+i, 0x80+i, 0x40+i, "v_cmp_"+COMPI[i]+"_i32")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0x90+i, 0x90+i, 0xd0+i, 0xd0+i, 0x90+i, 0xc0+i, "v_cmpx_"+COMPI[i]+"_i32")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(EXEC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0xa0+i, 0xa0+i, 0xe0+i, 0xe0+i, 0xa0+i, 0x50+i, "v_cmp_"+COMPI[i]+"_i64")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu64, definitions = dst(VCC), operands = src(2, 2))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0xb0+i, 0xb0+i, 0xf0+i, 0xf0+i, 0xb0+i, 0xd0+i, "v_cmpx_"+COMPI[i]+"_i64")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu64, definitions = dst(EXEC), operands = src(2, 2))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0xc0+i, 0xc0+i, 0xc8+i, 0xc8+i, 0xc0+i, 0x48+i, "v_cmp_"+COMPI[i]+"_u32")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(VCC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0xd0+i, 0xd0+i, 0xd8+i, 0xd8+i, 0xd0+i, 0xc8+i, "v_cmpx_"+COMPI[i]+"_u32")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu32, definitions = dst(EXEC), operands = src(1, 1))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0xe0+i, 0xe0+i, 0xe8+i, 0xe8+i, 0xe0+i, 0x58+i, "v_cmp_"+COMPI[i]+"_u64")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu64, definitions = dst(VCC), operands = src(2, 2))
   (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (0xf0+i, 0xf0+i, 0xf8+i, 0xf8+i, 0xf0+i, 0xd8+i, "v_cmpx_"+COMPI[i]+"_u64")
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOPC, InstrClass.Valu64, definitions = dst(EXEC), operands = src(2, 2))
1052
1053
# VOPP instructions: packed 16bit instructions - 2 or 3 inputs and 1 output
# Row layout: (opcode, name, modifiers, definitions, operands). The single
# opcode is used for gfx9, gfx10 and gfx11 alike, and the single modifiers flag
# is passed to opcode() in both modifier slots.
VOPP = {
   # opcode, name, input/output modifiers
   (0x00, "v_pk_mad_i16", False, dst(1), src(1, 1, 1)),
   (0x01, "v_pk_mul_lo_u16", False, dst(1), src(1, 1)),
   (0x02, "v_pk_add_i16", False, dst(1), src(1, 1)),
   (0x03, "v_pk_sub_i16", False, dst(1), src(1, 1)),
   (0x04, "v_pk_lshlrev_b16", False, dst(1), src(1, 1)),
   (0x05, "v_pk_lshrrev_b16", False, dst(1), src(1, 1)),
   (0x06, "v_pk_ashrrev_i16", False, dst(1), src(1, 1)),
   (0x07, "v_pk_max_i16", False, dst(1), src(1, 1)),
   (0x08, "v_pk_min_i16", False, dst(1), src(1, 1)),
   (0x09, "v_pk_mad_u16", False, dst(1), src(1, 1, 1)),
   (0x0a, "v_pk_add_u16", False, dst(1), src(1, 1)),
   (0x0b, "v_pk_sub_u16", False, dst(1), src(1, 1)),
   (0x0c, "v_pk_max_u16", False, dst(1), src(1, 1)),
   (0x0d, "v_pk_min_u16", False, dst(1), src(1, 1)),
   (0x0e, "v_pk_fma_f16", True, dst(1), src(1, 1, 1)),
   (0x0f, "v_pk_add_f16", True, dst(1), src(1, 1)),
   (0x10, "v_pk_mul_f16", True, dst(1), src(1, 1)),
   (0x11, "v_pk_min_f16", True, dst(1), src(1, 1)),
   (0x12, "v_pk_max_f16", True, dst(1), src(1, 1)),
   (0x20, "v_fma_mix_f32", True, dst(1), src(1, 1, 1)), # v_mad_mix_f32 in VEGA ISA, v_fma_mix_f32 in RDNA ISA
   (0x21, "v_fma_mixlo_f16", True, dst(1), src(1, 1, 1)), # v_mad_mixlo_f16 in VEGA ISA, v_fma_mixlo_f16 in RDNA ISA
   (0x22, "v_fma_mixhi_f16", True, dst(1), src(1, 1, 1)), # v_mad_mixhi_f16 in VEGA ISA, v_fma_mixhi_f16 in RDNA ISA
}
# Note that these are only supported on gfx9+, so we'll need to distinguish
# between gfx8 and gfx9 here.
# (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, -1, code, code, code, name)
for (code, name, modifiers, defs, ops) in VOPP:
   opcode(name, -1, code, code, code, Format.VOP3P, InstrClass.Valu32, modifiers, modifiers, definitions = defs, operands = ops)

# Dot-product and WMMA instructions also use Format.VOP3P but have encodings
# that differ per generation, so each is registered individually. The four
# positional integers are the gfx7, gfx9, gfx10 and gfx11 opcodes, in that order.
opcode("v_dot2_i32_i16", -1, 0x26, 0x14, -1, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot2_u32_u16", -1, 0x27, 0x15, -1, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot4_i32_iu8", -1, -1, -1, 0x16, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot4_i32_i8", -1, 0x28, 0x16, -1, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot4_u32_u8", -1, 0x29, 0x17, 0x17, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot8_i32_iu4", -1, -1, -1, 0x18, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot8_u32_u4", -1, 0x2b, 0x19, 0x19, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot2_f32_f16", -1, 0x23, 0x13, 0x13, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
opcode("v_dot2_f32_bf16", -1, -1, -1, 0x1a, Format.VOP3P, InstrClass.Valu32, definitions = dst(1), operands = src(1, 1, 1))
# The WMMA registrations carry only a gfx11 opcode and pass no
# definitions/operands, so opcode()'s defaults apply for those.
opcode("v_wmma_f32_16x16x16_f16", -1, -1, -1, 0x40, Format.VOP3P, InstrClass.WMMA, False, False)
opcode("v_wmma_f32_16x16x16_bf16", -1, -1, -1, 0x41, Format.VOP3P, InstrClass.WMMA, False, False)
opcode("v_wmma_f16_16x16x16_f16", -1, -1, -1, 0x42, Format.VOP3P, InstrClass.WMMA, False, False)
opcode("v_wmma_bf16_16x16x16_bf16", -1, -1, -1, 0x43, Format.VOP3P, InstrClass.WMMA, False, False)
opcode("v_wmma_i32_16x16x16_iu8", -1, -1, -1, 0x44, Format.VOP3P, InstrClass.WMMA, False, False)
opcode("v_wmma_i32_16x16x16_iu4", -1, -1, -1, 0x45, Format.VOP3P, InstrClass.WMMA, False, False)
1099
1100
# VINTRP (GFX6 - GFX10.3) instructions:
# Row layout: (opcode, name, definitions, operands). The same opcode is used on
# every generation that has these instructions.
VINTRP = {
   (0x00, "v_interp_p1_f32", dst(1), src(1, M0)),
   (0x01, "v_interp_p2_f32", dst(1), src(1, M0, 1)),
   (0x02, "v_interp_mov_f32", dst(1), src(1, M0)),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (code, code, code, code, code, -1, name)
# The gfx11 slot is -1: opcode() receives (gfx7, gfx9, gfx10, gfx11) = (code, code, code, -1).
for (code, name, defs, ops) in VINTRP:
   opcode(name, code, code, code, -1, Format.VINTRP, InstrClass.Valu32, definitions = defs, operands = ops)
1110
1111
# VINTERP (GFX11+) instructions:
# Row layout: (gfx11 opcode, name). Every row is registered with one definition
# and three operands; the gfx7, gfx9 and gfx10 opcode slots are -1.
VINTERP = {
   (0x00, "v_interp_p10_f32_inreg"),
   (0x01, "v_interp_p2_f32_inreg"),
   (0x02, "v_interp_p10_f16_f32_inreg"),
   (0x03, "v_interp_p2_f16_f32_inreg"),
   (0x04, "v_interp_p10_rtz_f16_f32_inreg"),
   (0x05, "v_interp_p2_rtz_f16_f32_inreg"),
}
# False, True fill the slots where the VOP1/VOP3 loops pass in_mod, out_mod.
for (code, name) in VINTERP:
   opcode(name, -1, -1, -1, code, Format.VINTERP_INREG, InstrClass.Valu32, False, True, definitions = dst(1), operands = src(1, 1, 1))
1123
1124
# VOP3 instructions: 3 inputs, 1 output
# VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out
#
# Row layout:
#    (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, definitions, operands[, class])
# -1 is used in an encoding column where the table lists no opcode for that
# generation. Rows that omit the trailing class are expected to receive
# InstrClass.Valu32 from default_class() (the loop below unpacks twelve fields).
VOP3 = {
   (0x140, 0x140, 0x1c0, 0x1c0, 0x140,    -1, "v_mad_legacy_f32", True, True, dst(1), src(1, 1, 1)), # GFX6-GFX10
   (0x141, 0x141, 0x1c1, 0x1c1, 0x141,    -1, "v_mad_f32", True, True, dst(1), src(1, 1, 1)),
   (0x142, 0x142, 0x1c2, 0x1c2, 0x142, 0x20a, "v_mad_i32_i24", False, False, dst(1), src(1, 1, 1)),
   (0x143, 0x143, 0x1c3, 0x1c3, 0x143, 0x20b, "v_mad_u32_u24", False, False, dst(1), src(1, 1, 1)),
   (0x144, 0x144, 0x1c4, 0x1c4, 0x144, 0x20c, "v_cubeid_f32", True, True, dst(1), src(1, 1, 1)),
   (0x145, 0x145, 0x1c5, 0x1c5, 0x145, 0x20d, "v_cubesc_f32", True, True, dst(1), src(1, 1, 1)),
   (0x146, 0x146, 0x1c6, 0x1c6, 0x146, 0x20e, "v_cubetc_f32", True, True, dst(1), src(1, 1, 1)),
   (0x147, 0x147, 0x1c7, 0x1c7, 0x147, 0x20f, "v_cubema_f32", True, True, dst(1), src(1, 1, 1)),
   (0x148, 0x148, 0x1c8, 0x1c8, 0x148, 0x210, "v_bfe_u32", False, False, dst(1), src(1, 1, 1)),
   (0x149, 0x149, 0x1c9, 0x1c9, 0x149, 0x211, "v_bfe_i32", False, False, dst(1), src(1, 1, 1)),
   (0x14a, 0x14a, 0x1ca, 0x1ca, 0x14a, 0x212, "v_bfi_b32", False, False, dst(1), src(1, 1, 1)),
   (0x14b, 0x14b, 0x1cb, 0x1cb, 0x14b, 0x213, "v_fma_f32", True, True, dst(1), src(1, 1, 1), InstrClass.ValuFma),
   (0x14c, 0x14c, 0x1cc, 0x1cc, 0x14c, 0x214, "v_fma_f64", True, True, dst(2), src(2, 2, 2), InstrClass.ValuDouble),
   (0x14d, 0x14d, 0x1cd, 0x1cd, 0x14d, 0x215, "v_lerp_u8", False, False, dst(1), src(1, 1, 1)),
   (0x14e, 0x14e, 0x1ce, 0x1ce, 0x14e, 0x216, "v_alignbit_b32", False, False, dst(1), src(1, 1, 1)),
   (0x14f, 0x14f, 0x1cf, 0x1cf, 0x14f, 0x217, "v_alignbyte_b32", False, False, dst(1), src(1, 1, 1)),
   (0x150, 0x150,    -1,    -1, 0x150, 0x218, "v_mullit_f32", True, True, dst(1), src(1, 1, 1)),
   (0x151, 0x151, 0x1d0, 0x1d0, 0x151, 0x219, "v_min3_f32", True, True, dst(1), src(1, 1, 1)),
   (0x152, 0x152, 0x1d1, 0x1d1, 0x152, 0x21a, "v_min3_i32", False, False, dst(1), src(1, 1, 1)),
   (0x153, 0x153, 0x1d2, 0x1d2, 0x153, 0x21b, "v_min3_u32", False, False, dst(1), src(1, 1, 1)),
   (0x154, 0x154, 0x1d3, 0x1d3, 0x154, 0x21c, "v_max3_f32", True, True, dst(1), src(1, 1, 1)),
   (0x155, 0x155, 0x1d4, 0x1d4, 0x155, 0x21d, "v_max3_i32", False, False, dst(1), src(1, 1, 1)),
   (0x156, 0x156, 0x1d5, 0x1d5, 0x156, 0x21e, "v_max3_u32", False, False, dst(1), src(1, 1, 1)),
   (0x157, 0x157, 0x1d6, 0x1d6, 0x157, 0x21f, "v_med3_f32", True, True, dst(1), src(1, 1, 1)),
   (0x158, 0x158, 0x1d7, 0x1d7, 0x158, 0x220, "v_med3_i32", False, False, dst(1), src(1, 1, 1)),
   (0x159, 0x159, 0x1d8, 0x1d8, 0x159, 0x221, "v_med3_u32", False, False, dst(1), src(1, 1, 1)),
   (0x15a, 0x15a, 0x1d9, 0x1d9, 0x15a, 0x222, "v_sad_u8", False, False, dst(1), src(1, 1, 1)),
   (0x15b, 0x15b, 0x1da, 0x1da, 0x15b, 0x223, "v_sad_hi_u8", False, False, dst(1), src(1, 1, 1)),
   (0x15c, 0x15c, 0x1db, 0x1db, 0x15c, 0x224, "v_sad_u16", False, False, dst(1), src(1, 1, 1)),
   (0x15d, 0x15d, 0x1dc, 0x1dc, 0x15d, 0x225, "v_sad_u32", False, False, dst(1), src(1, 1, 1)),
   (0x15e, 0x15e, 0x1dd, 0x1dd, 0x15e, 0x226, "v_cvt_pk_u8_f32", True, False, dst(1), src(1, 1, 1)),
   (0x15f, 0x15f, 0x1de, 0x1de, 0x15f, 0x227, "v_div_fixup_f32", True, True, dst(1), src(1, 1, 1)),
   (0x160, 0x160, 0x1df, 0x1df, 0x160, 0x228, "v_div_fixup_f64", True, True, dst(2), src(2, 2, 2)),
   (0x161, 0x161,    -1,    -1,    -1,    -1, "v_lshl_b64", False, False, dst(2), src(2, 1), InstrClass.Valu64),
   (0x162, 0x162,    -1,    -1,    -1,    -1, "v_lshr_b64", False, False, dst(2), src(2, 1), InstrClass.Valu64),
   (0x163, 0x163,    -1,    -1,    -1,    -1, "v_ashr_i64", False, False, dst(2), src(2, 1), InstrClass.Valu64),
   (0x164, 0x164, 0x280, 0x280, 0x164, 0x327, "v_add_f64", True, True, dst(2), src(2, 2), InstrClass.ValuDoubleAdd),
   (0x165, 0x165, 0x281, 0x281, 0x165, 0x328, "v_mul_f64", True, True, dst(2), src(2, 2), InstrClass.ValuDouble),
   (0x166, 0x166, 0x282, 0x282, 0x166, 0x329, "v_min_f64", True, True, dst(2), src(2, 2), InstrClass.ValuDouble),
   (0x167, 0x167, 0x283, 0x283, 0x167, 0x32a, "v_max_f64", True, True, dst(2), src(2, 2), InstrClass.ValuDouble),
   (0x168, 0x168, 0x284, 0x284, 0x168, 0x32b, "v_ldexp_f64", False, True, dst(2), src(2, 1), InstrClass.ValuDouble), # src1 can take input modifiers
   (0x169, 0x169, 0x285, 0x285, 0x169, 0x32c, "v_mul_lo_u32", False, False, dst(1), src(1, 1), InstrClass.ValuQuarterRate32),
   (0x16a, 0x16a, 0x286, 0x286, 0x16a, 0x32d, "v_mul_hi_u32", False, False, dst(1), src(1, 1), InstrClass.ValuQuarterRate32),
   (0x16b, 0x16b, 0x285, 0x285, 0x16b, 0x32c, "v_mul_lo_i32", False, False, dst(1), src(1, 1), InstrClass.ValuQuarterRate32), # identical to v_mul_lo_u32
   (0x16c, 0x16c, 0x287, 0x287, 0x16c, 0x32e, "v_mul_hi_i32", False, False, dst(1), src(1, 1), InstrClass.ValuQuarterRate32),
   (0x16d, 0x16d, 0x1e0, 0x1e0, 0x16d, 0x2fc, "v_div_scale_f32", True, True, dst(1, VCC), src(1, 1, 1)),
   (0x16e, 0x16e, 0x1e1, 0x1e1, 0x16e, 0x2fd, "v_div_scale_f64", True, True, dst(2, VCC), src(2, 2, 2), InstrClass.ValuDouble),
   (0x16f, 0x16f, 0x1e2, 0x1e2, 0x16f, 0x237, "v_div_fmas_f32", True, True, dst(1), src(1, 1, 1, VCC)),
   (0x170, 0x170, 0x1e3, 0x1e3, 0x170, 0x238, "v_div_fmas_f64", True, True, dst(2), src(2, 2, 2, VCC), InstrClass.ValuDouble),
   (0x171, 0x171, 0x1e4, 0x1e4, 0x171, 0x239, "v_msad_u8", False, False, dst(1), src(1, 1, 1)),
   (0x172, 0x172, 0x1e5, 0x1e5, 0x172, 0x23a, "v_qsad_pk_u16_u8", False, False, dst(2), src(2, 1, 2)),
   (0x173, 0x173, 0x1e6, 0x1e6, 0x173, 0x23b, "v_mqsad_pk_u16_u8", False, False, dst(2), src(2, 1, 2)),
   (0x174, 0x174, 0x292, 0x292, 0x174, 0x32f, "v_trig_preop_f64", False, False, dst(2), src(2, 2), InstrClass.ValuDouble),
   (   -1, 0x175, 0x1e7, 0x1e7, 0x175, 0x23d, "v_mqsad_u32_u8", False, False, dst(4), src(2, 1, 4)),
   (   -1, 0x176, 0x1e8, 0x1e8, 0x176, 0x2fe, "v_mad_u64_u32", False, False, dst(2, VCC), src(1, 1, 2), InstrClass.Valu64),
   (   -1, 0x177, 0x1e9, 0x1e9, 0x177, 0x2ff, "v_mad_i64_i32", False, False, dst(2, VCC), src(1, 1, 2), InstrClass.Valu64),
   (   -1,    -1, 0x1ea, 0x1ea,    -1,    -1, "v_mad_legacy_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1, 0x1eb, 0x1eb,    -1,    -1, "v_mad_legacy_u16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1, 0x1ec, 0x1ec,    -1,    -1, "v_mad_legacy_i16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1, 0x1ed, 0x1ed, 0x344, 0x244, "v_perm_b32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1, 0x1ee, 0x1ee,    -1,    -1, "v_fma_legacy_f16", True, True, dst(1), src(1, 1, 1), InstrClass.ValuFma),
   (   -1,    -1, 0x1ef, 0x1ef,    -1,    -1, "v_div_fixup_legacy_f16", True, True, dst(1), src(1, 1, 1)),
   (0x12c, 0x12c, 0x1f0, 0x1f0,    -1,    -1, "v_cvt_pkaccum_u8_f32", True, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f1, 0x373, 0x259, "v_mad_u32_u16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f2, 0x375, 0x25a, "v_mad_i32_i16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f3, 0x345, 0x245, "v_xad_u32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f4, 0x351, 0x249, "v_min3_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f5, 0x352, 0x24a, "v_min3_i16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f6, 0x353, 0x24b, "v_min3_u16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f7, 0x354, 0x24c, "v_max3_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f8, 0x355, 0x24d, "v_max3_i16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1f9, 0x356, 0x24e, "v_max3_u16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1fa, 0x357, 0x24f, "v_med3_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1fb, 0x358, 0x250, "v_med3_i16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1fc, 0x359, 0x251, "v_med3_u16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1fd, 0x346, 0x246, "v_lshl_add_u32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1fe, 0x347, 0x247, "v_add_lshl_u32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x1ff, 0x36d, 0x255, "v_add3_u32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x200, 0x36f, 0x256, "v_lshl_or_b32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x201, 0x371, 0x257, "v_and_or_b32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x202, 0x372, 0x258, "v_or3_b32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x203,    -1,    -1, "v_mad_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x204, 0x340, 0x241, "v_mad_u16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x205, 0x35e, 0x253, "v_mad_i16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x206, 0x34b, 0x248, "v_fma_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1, 0x207, 0x35f, 0x254, "v_div_fixup_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1, 0x274, 0x274, 0x342,    -1, "v_interp_p1ll_f16", True, True, dst(1), src(1, M0)),
   (   -1,    -1, 0x275, 0x275, 0x343,    -1, "v_interp_p1lv_f16", True, True, dst(1), src(1, M0, 1)),
   (   -1,    -1, 0x276, 0x276,    -1,    -1, "v_interp_p2_legacy_f16", True, True, dst(1), src(1, M0, 1)),
   (   -1,    -1,    -1, 0x277, 0x35a,    -1, "v_interp_p2_f16", True, True, dst(1), src(1, M0, 1)),
   (0x12b, 0x12b, 0x288, 0x288, 0x362, 0x31c, "v_ldexp_f32", False, True, dst(1), src(1, 1)),
   (   -1,    -1, 0x289, 0x289, 0x360, 0x360, "v_readlane_b32_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1, 0x28a, 0x28a, 0x361, 0x361, "v_writelane_b32_e64", False, False, dst(1), src(1, 1, 1)),
   (0x122, 0x122, 0x28b, 0x28b, 0x364, 0x31e, "v_bcnt_u32_b32", False, False, dst(1), src(1, 1)),
   (0x123, 0x123, 0x28c, 0x28c, 0x365, 0x31f, "v_mbcnt_lo_u32_b32", False, False, dst(1), src(1, 1)),
   (   -1,    -1, 0x28d, 0x28d, 0x366, 0x320, "v_mbcnt_hi_u32_b32_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1, 0x28f, 0x28f, 0x2ff, 0x33c, "v_lshlrev_b64", False, False, dst(2), src(1, 2), InstrClass.Valu64),
   (   -1,    -1, 0x290, 0x290, 0x300, 0x33d, "v_lshrrev_b64", False, False, dst(2), src(1, 2), InstrClass.Valu64),
   (   -1,    -1, 0x291, 0x291, 0x301, 0x33e, "v_ashrrev_i64", False, False, dst(2), src(1, 2), InstrClass.Valu64),
   (0x11e, 0x11e, 0x293, 0x293, 0x363, 0x31d, "v_bfm_b32", False, False, dst(1), src(1, 1)),
   (0x12d, 0x12d, 0x294, 0x294, 0x368, 0x321, "v_cvt_pknorm_i16_f32", True, False, dst(1), src(1, 1)),
   (0x12e, 0x12e, 0x295, 0x295, 0x369, 0x322, "v_cvt_pknorm_u16_f32", True, False, dst(1), src(1, 1)),
   (   -1,    -1, 0x296, 0x296,    -1,    -1, "v_cvt_pkrtz_f16_f32_e64", True, False, dst(1), src(1, 1)),
   (0x130, 0x130, 0x297, 0x297, 0x36a, 0x323, "v_cvt_pk_u16_u32", False, False, dst(1), src(1, 1)),
   (0x131, 0x131, 0x298, 0x298, 0x36b, 0x324, "v_cvt_pk_i16_i32", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1, 0x299, 0x312, 0x312, "v_cvt_pknorm_i16_f16", True, False, dst(1), src(1, 1)), #v_cvt_pk_norm_i16_f32 in GFX11
   (   -1,    -1,    -1, 0x29a, 0x313, 0x313, "v_cvt_pknorm_u16_f16", True, False, dst(1), src(1, 1)), #v_cvt_pk_norm_u16_f32 in GFX11
   (   -1,    -1,    -1, 0x29c, 0x37f, 0x326, "v_add_i32", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1, 0x29d, 0x376, 0x325, "v_sub_i32", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1, 0x29e, 0x30d, 0x30d, "v_add_i16", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1, 0x29f, 0x30e, 0x30e, "v_sub_i16", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1, 0x2a0, 0x311, 0x311, "v_pack_b32_f16", True, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x178, 0x240, "v_xor3_b32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1, 0x377, 0x25b, "v_permlane16_b32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1, 0x378, 0x25c, "v_permlanex16_b32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1, 0x30f, 0x300, "v_add_co_u32_e64", False, False, dst(1, VCC), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x310, 0x301, "v_sub_co_u32_e64", False, False, dst(1, VCC), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x319, 0x302, "v_subrev_co_u32_e64", False, False, dst(1, VCC), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x303, 0x303, "v_add_u16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x304, 0x304, "v_sub_u16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x305, 0x305, "v_mul_lo_u16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x309, 0x309, "v_max_u16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x30a, 0x30a, "v_max_i16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x30b, 0x30b, "v_min_u16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x30c, 0x30c, "v_min_i16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x307, 0x339, "v_lshrrev_b16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x308, 0x33a, "v_ashrrev_i16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x314, 0x338, "v_lshlrev_b16_e64", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1, 0x140, 0x209, "v_fma_legacy_f32", True, True, dst(1), src(1, 1, 1), InstrClass.ValuFma), #GFX10.3+, v_fma_dx9_zero_f32 in GFX11
   (   -1,    -1,    -1,    -1,    -1, 0x25e, "v_maxmin_f32", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x25f, "v_minmax_f32", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x260, "v_maxmin_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x261, "v_minmax_f16", True, True, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x262, "v_maxmin_u32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x263, "v_minmax_u32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x264, "v_maxmin_i32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x265, "v_minmax_i32", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x266, "v_dot2_f16_f16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x267, "v_dot2_bf16_bf16", False, False, dst(1), src(1, 1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x306, "v_cvt_pk_i16_f32", True, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x307, "v_cvt_pk_u16_f32", True, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x362, "v_and_b16", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x363, "v_or_b16", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x364, "v_xor_b16", False, False, dst(1), src(1, 1)),
   (   -1,    -1,    -1,    -1,    -1, 0x25d, "v_cndmask_b16", True, False, dst(1), src(1, 1, VCC)),
}
# gfx6 and gfx8 are unpacked for readability of the table only: opcode() is
# given the gfx7, gfx9, gfx10 and gfx11 encodings.
for (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name, in_mod, out_mod, defs, ops, cls) in default_class(VOP3, InstrClass.Valu32):
   opcode(name, gfx7, gfx9, gfx10, gfx11, Format.VOP3, cls, in_mod, out_mod, definitions = defs, operands = ops)
1276
1277
# VOPD instructions.
# Row layout: (gfx11 opcode, name). Only the gfx11 opcode slot is filled; the
# gfx7, gfx9 and gfx10 slots are -1 and no definitions/operands are passed.
VOPD = {
   (0x00, "v_dual_fmac_f32"),
   (0x01, "v_dual_fmaak_f32"),
   (0x02, "v_dual_fmamk_f32"),
   (0x03, "v_dual_mul_f32"),
   (0x04, "v_dual_add_f32"),
   (0x05, "v_dual_sub_f32"),
   (0x06, "v_dual_subrev_f32"),
   (0x07, "v_dual_mul_dx9_zero_f32"),
   (0x08, "v_dual_mov_b32"),
   (0x09, "v_dual_cndmask_b32"),
   (0x0a, "v_dual_max_f32"),
   (0x0b, "v_dual_min_f32"),
   (0x0c, "v_dual_dot2acc_f32_f16"),
   (0x0d, "v_dual_dot2acc_f32_bf16"),
   (0x10, "v_dual_add_nc_u32"),
   (0x11, "v_dual_lshlrev_b32"),
   (0x12, "v_dual_and_b32"),
}
for gfx11, name in VOPD:
   opcode(name, -1, -1, -1, gfx11, format = Format.VOPD, cls = InstrClass.Valu32)
1299
1300
1301# DS instructions: 3 inputs (1 addr, 2 data), 1 output
1302DS = {
1303   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"),
1304   (0x01, 0x01, 0x01, 0x01, 0x01, 0x01, "ds_sub_u32"),
1305   (0x02, 0x02, 0x02, 0x02, 0x02, 0x02, "ds_rsub_u32"),
1306   (0x03, 0x03, 0x03, 0x03, 0x03, 0x03, "ds_inc_u32"),
1307   (0x04, 0x04, 0x04, 0x04, 0x04, 0x04, "ds_dec_u32"),
1308   (0x05, 0x05, 0x05, 0x05, 0x05, 0x05, "ds_min_i32"),
1309   (0x06, 0x06, 0x06, 0x06, 0x06, 0x06, "ds_max_i32"),
1310   (0x07, 0x07, 0x07, 0x07, 0x07, 0x07, "ds_min_u32"),
1311   (0x08, 0x08, 0x08, 0x08, 0x08, 0x08, "ds_max_u32"),
1312   (0x09, 0x09, 0x09, 0x09, 0x09, 0x09, "ds_and_b32"),
1313   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "ds_or_b32"),
1314   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "ds_xor_b32"),
1315   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "ds_mskor_b32"),
1316   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "ds_write_b32"), #ds_store_b32 in GFX11
1317   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "ds_write2_b32"), #ds_store_2addr_b32 in GFX11
1318   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "ds_write2st64_b32"), #ds_store_2addr_stride64_b32 in GFX11
1319   (0x10, 0x10, 0x10, 0x10, 0x10, 0x10, "ds_cmpst_b32"), #ds_cmpstore_b32 in GFX11
1320   (0x11, 0x11, 0x11, 0x11, 0x11, 0x11, "ds_cmpst_f32"), #ds_cmpstore_f32 in GFX11
1321   (0x12, 0x12, 0x12, 0x12, 0x12, 0x12, "ds_min_f32"),
1322   (0x13, 0x13, 0x13, 0x13, 0x13, 0x13, "ds_max_f32"),
1323   (  -1, 0x14, 0x14, 0x14, 0x14, 0x14, "ds_nop"),
1324   (  -1,   -1, 0x15, 0x15, 0x15, 0x15, "ds_add_f32"),
1325   (  -1,   -1, 0x1d, 0x1d, 0xb0, 0xb0, "ds_write_addtid_b32"), #ds_store_addtid_b32 in GFX11
1326   (0x1e, 0x1e, 0x1e, 0x1e, 0x1e, 0x1e, "ds_write_b8"), #ds_store_b8 in GFX11
1327   (0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, "ds_write_b16"), #ds_store_b16 in GFX11
1328   (0x20, 0x20, 0x20, 0x20, 0x20, 0x20, "ds_add_rtn_u32"),
1329   (0x21, 0x21, 0x21, 0x21, 0x21, 0x21, "ds_sub_rtn_u32"),
1330   (0x22, 0x22, 0x22, 0x22, 0x22, 0x22, "ds_rsub_rtn_u32"),
1331   (0x23, 0x23, 0x23, 0x23, 0x23, 0x23, "ds_inc_rtn_u32"),
1332   (0x24, 0x24, 0x24, 0x24, 0x24, 0x24, "ds_dec_rtn_u32"),
1333   (0x25, 0x25, 0x25, 0x25, 0x25, 0x25, "ds_min_rtn_i32"),
1334   (0x26, 0x26, 0x26, 0x26, 0x26, 0x26, "ds_max_rtn_i32"),
1335   (0x27, 0x27, 0x27, 0x27, 0x27, 0x27, "ds_min_rtn_u32"),
1336   (0x28, 0x28, 0x28, 0x28, 0x28, 0x28, "ds_max_rtn_u32"),
1337   (0x29, 0x29, 0x29, 0x29, 0x29, 0x29, "ds_and_rtn_b32"),
1338   (0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, "ds_or_rtn_b32"),
1339   (0x2b, 0x2b, 0x2b, 0x2b, 0x2b, 0x2b, "ds_xor_rtn_b32"),
1340   (0x2c, 0x2c, 0x2c, 0x2c, 0x2c, 0x2c, "ds_mskor_rtn_b32"),
1341   (0x2d, 0x2d, 0x2d, 0x2d, 0x2d, 0x2d, "ds_wrxchg_rtn_b32"), #ds_storexchg_rtn_b32 in GFX11
1342   (0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, "ds_wrxchg2_rtn_b32"), #ds_storexchg_2addr_rtn_b32 in GFX11
1343   (0x2f, 0x2f, 0x2f, 0x2f, 0x2f, 0x2f, "ds_wrxchg2st64_rtn_b32"), #ds_storexchg_2addr_stride64_rtn_b32 in GFX11
1344   (0x30, 0x30, 0x30, 0x30, 0x30, 0x30, "ds_cmpst_rtn_b32"), #ds_cmpstore_rtn_b32 in GFX11
1345   (0x31, 0x31, 0x31, 0x31, 0x31, 0x31, "ds_cmpst_rtn_f32"), #ds_cmpstore_rtn_f32 in GFX11
1346   (0x32, 0x32, 0x32, 0x32, 0x32, 0x32, "ds_min_rtn_f32"),
1347   (0x33, 0x33, 0x33, 0x33, 0x33, 0x33, "ds_max_rtn_f32"),
1348   (  -1, 0x34, 0x34, 0x34, 0x34, 0x34, "ds_wrap_rtn_b32"),
1349   (  -1,   -1, 0x35, 0x35, 0x55, 0x79, "ds_add_rtn_f32"),
1350   (0x36, 0x36, 0x36, 0x36, 0x36, 0x36, "ds_read_b32"), #ds_load_b32 in GFX11
1351   (0x37, 0x37, 0x37, 0x37, 0x37, 0x37, "ds_read2_b32"), #ds_load_2addr_b32 in GFX11
1352   (0x38, 0x38, 0x38, 0x38, 0x38, 0x38, "ds_read2st64_b32"), #ds_load_2addr_stride64_b32 in GFX11
1353   (0x39, 0x39, 0x39, 0x39, 0x39, 0x39, "ds_read_i8"), #ds_load_i8 in GFX11
1354   (0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, "ds_read_u8"), #ds_load_u8 in GFX11
1355   (0x3b, 0x3b, 0x3b, 0x3b, 0x3b, 0x3b, "ds_read_i16"), #ds_load_i16 in GFX11
1356   (0x3c, 0x3c, 0x3c, 0x3c, 0x3c, 0x3c, "ds_read_u16"), #ds_load_u16 in GFX11
1357   (0x35, 0x35, 0x3d, 0x3d, 0x35, 0x35, "ds_swizzle_b32"), #data1 & offset, no addr/data2
1358   (  -1,   -1, 0x3e, 0x3e, 0xb2, 0xb2, "ds_permute_b32"),
1359   (  -1,   -1, 0x3f, 0x3f, 0xb3, 0xb3, "ds_bpermute_b32"),
1360   (0x40, 0x40, 0x40, 0x40, 0x40, 0x40, "ds_add_u64"),
1361   (0x41, 0x41, 0x41, 0x41, 0x41, 0x41, "ds_sub_u64"),
1362   (0x42, 0x42, 0x42, 0x42, 0x42, 0x42, "ds_rsub_u64"),
1363   (0x43, 0x43, 0x43, 0x43, 0x43, 0x43, "ds_inc_u64"),
1364   (0x44, 0x44, 0x44, 0x44, 0x44, 0x44, "ds_dec_u64"),
1365   (0x45, 0x45, 0x45, 0x45, 0x45, 0x45, "ds_min_i64"),
1366   (0x46, 0x46, 0x46, 0x46, 0x46, 0x46, "ds_max_i64"),
1367   (0x47, 0x47, 0x47, 0x47, 0x47, 0x47, "ds_min_u64"),
1368   (0x48, 0x48, 0x48, 0x48, 0x48, 0x48, "ds_max_u64"),
1369   (0x49, 0x49, 0x49, 0x49, 0x49, 0x49, "ds_and_b64"),
1370   (0x4a, 0x4a, 0x4a, 0x4a, 0x4a, 0x4a, "ds_or_b64"),
1371   (0x4b, 0x4b, 0x4b, 0x4b, 0x4b, 0x4b, "ds_xor_b64"),
1372   (0x4c, 0x4c, 0x4c, 0x4c, 0x4c, 0x4c, "ds_mskor_b64"),
1373   (0x4d, 0x4d, 0x4d, 0x4d, 0x4d, 0x4d, "ds_write_b64"), #ds_store_b64 in GFX11
1374   (0x4e, 0x4e, 0x4e, 0x4e, 0x4e, 0x4e, "ds_write2_b64"), #ds_store_2addr_b64 in GFX11
1375   (0x4f, 0x4f, 0x4f, 0x4f, 0x4f, 0x4f, "ds_write2st64_b64"), #ds_store_2addr_stride64_b64 in GFX11
1376   (0x50, 0x50, 0x50, 0x50, 0x50, 0x50, "ds_cmpst_b64"), #ds_cmpstore_b64 in GFX11
1377   (0x51, 0x51, 0x51, 0x51, 0x51, 0x51, "ds_cmpst_f64"), #ds_cmpstore_f64 in GFX11
1378   (0x52, 0x52, 0x52, 0x52, 0x52, 0x52, "ds_min_f64"),
1379   (0x53, 0x53, 0x53, 0x53, 0x53, 0x53, "ds_max_f64"),
1380   (  -1,   -1,   -1, 0x54, 0xa0, 0xa0, "ds_write_b8_d16_hi"), #ds_store_b8_d16_hi in GFX11
1381   (  -1,   -1,   -1, 0x55, 0xa1, 0xa1, "ds_write_b16_d16_hi"), #ds_store_b16_d16_hi in GFX11
1382   (  -1,   -1,   -1, 0x56, 0xa2, 0xa2, "ds_read_u8_d16"), #ds_load_u8_d16 in GFX11
1383   (  -1,   -1,   -1, 0x57, 0xa3, 0xa3, "ds_read_u8_d16_hi"), #ds_load_u8_d16_hi in GFX11
1384   (  -1,   -1,   -1, 0x58, 0xa4, 0xa4, "ds_read_i8_d16"), #ds_load_i8_d16 in GFX11
1385   (  -1,   -1,   -1, 0x59, 0xa5, 0xa5, "ds_read_i8_d16_hi"), #ds_load_i8_d16_hi in GFX11
1386   (  -1,   -1,   -1, 0x5a, 0xa6, 0xa6, "ds_read_u16_d16"), #ds_load_u16_d16 in GFX11
1387   (  -1,   -1,   -1, 0x5b, 0xa7, 0xa7, "ds_read_u16_d16_hi"), #ds_load_u16_d16_hi in GFX11
1388   (0x60, 0x60, 0x60, 0x60, 0x60, 0x60, "ds_add_rtn_u64"),
1389   (0x61, 0x61, 0x61, 0x61, 0x61, 0x61, "ds_sub_rtn_u64"),
1390   (0x62, 0x62, 0x62, 0x62, 0x62, 0x62, "ds_rsub_rtn_u64"),
1391   (0x63, 0x63, 0x63, 0x63, 0x63, 0x63, "ds_inc_rtn_u64"),
1392   (0x64, 0x64, 0x64, 0x64, 0x64, 0x64, "ds_dec_rtn_u64"),
1393   (0x65, 0x65, 0x65, 0x65, 0x65, 0x65, "ds_min_rtn_i64"),
1394   (0x66, 0x66, 0x66, 0x66, 0x66, 0x66, "ds_max_rtn_i64"),
1395   (0x67, 0x67, 0x67, 0x67, 0x67, 0x67, "ds_min_rtn_u64"),
1396   (0x68, 0x68, 0x68, 0x68, 0x68, 0x68, "ds_max_rtn_u64"),
1397   (0x69, 0x69, 0x69, 0x69, 0x69, 0x69, "ds_and_rtn_b64"),
1398   (0x6a, 0x6a, 0x6a, 0x6a, 0x6a, 0x6a, "ds_or_rtn_b64"),
1399   (0x6b, 0x6b, 0x6b, 0x6b, 0x6b, 0x6b, "ds_xor_rtn_b64"),
1400   (0x6c, 0x6c, 0x6c, 0x6c, 0x6c, 0x6c, "ds_mskor_rtn_b64"),
1401   (0x6d, 0x6d, 0x6d, 0x6d, 0x6d, 0x6d, "ds_wrxchg_rtn_b64"), #ds_storexchg_rtn_b64 in GFX11
1402   (0x6e, 0x6e, 0x6e, 0x6e, 0x6e, 0x6e, "ds_wrxchg2_rtn_b64"), #ds_storexchg_2addr_rtn_b64 in GFX11
1403   (0x6f, 0x6f, 0x6f, 0x6f, 0x6f, 0x6f, "ds_wrxchg2st64_rtn_b64"), #ds_storexchg_2addr_stride64_rtn_b64 in GFX11
1404   (0x70, 0x70, 0x70, 0x70, 0x70, 0x70, "ds_cmpst_rtn_b64"), #ds_cmpstore_rtn_b64 in GFX11
1405   (0x71, 0x71, 0x71, 0x71, 0x71, 0x71, "ds_cmpst_rtn_f64"), #ds_cmpstore_rtn_f64 in GFX11
1406   (0x72, 0x72, 0x72, 0x72, 0x72, 0x72, "ds_min_rtn_f64"),
1407   (0x73, 0x73, 0x73, 0x73, 0x73, 0x73, "ds_max_rtn_f64"),
1408   (0x76, 0x76, 0x76, 0x76, 0x76, 0x76, "ds_read_b64"), #ds_load_b64 in GFX11
1409   (0x77, 0x77, 0x77, 0x77, 0x77, 0x77, "ds_read2_b64"), #ds_load_2addr_b64 in GFX11
1410   (0x78, 0x78, 0x78, 0x78, 0x78, 0x78, "ds_read2st64_b64"), #ds_load_2addr_stride64_b64 in GFX11
1411   (  -1, 0x7e, 0x7e, 0x7e, 0x7e, 0x7e, "ds_condxchg32_rtn_b64"),
1412   (0x80, 0x80, 0x80, 0x80, 0x80,   -1, "ds_add_src2_u32"),
1413   (0x81, 0x81, 0x81, 0x81, 0x81,   -1, "ds_sub_src2_u32"),
1414   (0x82, 0x82, 0x82, 0x82, 0x82,   -1, "ds_rsub_src2_u32"),
1415   (0x83, 0x83, 0x83, 0x83, 0x83,   -1, "ds_inc_src2_u32"),
1416   (0x84, 0x84, 0x84, 0x84, 0x84,   -1, "ds_dec_src2_u32"),
1417   (0x85, 0x85, 0x85, 0x85, 0x85,   -1, "ds_min_src2_i32"),
1418   (0x86, 0x86, 0x86, 0x86, 0x86,   -1, "ds_max_src2_i32"),
1419   (0x87, 0x87, 0x87, 0x87, 0x87,   -1, "ds_min_src2_u32"),
1420   (0x88, 0x88, 0x88, 0x88, 0x88,   -1, "ds_max_src2_u32"),
1421   (0x89, 0x89, 0x89, 0x89, 0x89,   -1, "ds_and_src2_b32"),
1422   (0x8a, 0x8a, 0x8a, 0x8a, 0x8a,   -1, "ds_or_src2_b32"),
1423   (0x8b, 0x8b, 0x8b, 0x8b, 0x8b,   -1, "ds_xor_src2_b32"),
1424   (0x8d, 0x8d, 0x8d, 0x8d, 0x8d,   -1, "ds_write_src2_b32"),
1425   (0x92, 0x92, 0x92, 0x92, 0x92,   -1, "ds_min_src2_f32"),
1426   (0x93, 0x93, 0x93, 0x93, 0x93,   -1, "ds_max_src2_f32"),
1427   (  -1,   -1, 0x95, 0x95, 0x95,   -1, "ds_add_src2_f32"),
1428   (  -1, 0x18, 0x98, 0x98, 0x18, 0x18, "ds_gws_sema_release_all"),
1429   (0x19, 0x19, 0x99, 0x99, 0x19, 0x19, "ds_gws_init"),
1430   (0x1a, 0x1a, 0x9a, 0x9a, 0x1a, 0x1a, "ds_gws_sema_v"),
1431   (0x1b, 0x1b, 0x9b, 0x9b, 0x1b, 0x1b, "ds_gws_sema_br"),
1432   (0x1c, 0x1c, 0x9c, 0x9c, 0x1c, 0x1c, "ds_gws_sema_p"),
1433   (0x1d, 0x1d, 0x9d, 0x9d, 0x1d, 0x1d, "ds_gws_barrier"),
1434   (  -1,   -1, 0xb6, 0xb6, 0xb1, 0xb1, "ds_read_addtid_b32"), #ds_load_addtid_b32 in GFX11
1435   (0x3d, 0x3d, 0xbd, 0xbd, 0x3d, 0x3d, "ds_consume"),
1436   (0x3e, 0x3e, 0xbe, 0xbe, 0x3e, 0x3e, "ds_append"),
1437   (0x3f, 0x3f, 0xbf, 0xbf, 0x3f, 0x3f, "ds_ordered_count"),
1438   (0xc0, 0xc0, 0xc0, 0xc0, 0xc0,   -1, "ds_add_src2_u64"),
1439   (0xc1, 0xc1, 0xc1, 0xc1, 0xc1,   -1, "ds_sub_src2_u64"),
1440   (0xc2, 0xc2, 0xc2, 0xc2, 0xc2,   -1, "ds_rsub_src2_u64"),
1441   (0xc3, 0xc3, 0xc3, 0xc3, 0xc3,   -1, "ds_inc_src2_u64"),
1442   (0xc4, 0xc4, 0xc4, 0xc4, 0xc4,   -1, "ds_dec_src2_u64"),
1443   (0xc5, 0xc5, 0xc5, 0xc5, 0xc5,   -1, "ds_min_src2_i64"),
1444   (0xc6, 0xc6, 0xc6, 0xc6, 0xc6,   -1, "ds_max_src2_i64"),
1445   (0xc7, 0xc7, 0xc7, 0xc7, 0xc7,   -1, "ds_min_src2_u64"),
1446   (0xc8, 0xc8, 0xc8, 0xc8, 0xc8,   -1, "ds_max_src2_u64"),
1447   (0xc9, 0xc9, 0xc9, 0xc9, 0xc9,   -1, "ds_and_src2_b64"),
1448   (0xca, 0xca, 0xca, 0xca, 0xca,   -1, "ds_or_src2_b64"),
1449   (0xcb, 0xcb, 0xcb, 0xcb, 0xcb,   -1, "ds_xor_src2_b64"),
1450   (0xcd, 0xcd, 0xcd, 0xcd, 0xcd,   -1, "ds_write_src2_b64"),
1451   (0xd2, 0xd2, 0xd2, 0xd2, 0xd2,   -1, "ds_min_src2_f64"),
1452   (0xd3, 0xd3, 0xd3, 0xd3, 0xd3,   -1, "ds_max_src2_f64"),
1453   (  -1, 0xde, 0xde, 0xde, 0xde, 0xde, "ds_write_b96"), #ds_store_b96 in GFX11
1454   (  -1, 0xdf, 0xdf, 0xdf, 0xdf, 0xdf, "ds_write_b128"), #ds_store_b128 in GFX11
1455   (  -1, 0xfd, 0xfd,   -1,   -1,   -1, "ds_condxchg32_rtn_b128"),
1456   (  -1, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, "ds_read_b96"), #ds_load_b96 in GFX11
1457   (  -1, 0xff, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"), #ds_load_b128 in GFX11
1458   (  -1,   -1,   -1,   -1,   -1, 0x7a, "ds_add_gs_reg_rtn"),
1459   (  -1,   -1,   -1,   -1,   -1, 0x7b, "ds_sub_gs_reg_rtn"),
1460}
# Register every DS row. The table carries gfx6 and gfx8 columns as well, but
# only the gfx7, gfx9, gfx10 and gfx11 numbers are handed to opcode().
for _gfx6, gfx7, _gfx8, gfx9, gfx10, gfx11, mnemonic in DS:
    opcode(mnemonic, gfx7, gfx9, gfx10, gfx11, Format.DS, InstrClass.DS)
1463
1464
# LDSDIR instructions:
# Rows are (code, name). The loop below passes -1 for the first three opcode
# arguments and `code` for the fourth, i.e. the slot the other tables fill from
# their gfx11 column.
LDSDIR = {
   (0x00, "lds_param_load"),
   (0x01, "lds_direct_load"),
}
for ldsdir_code, mnemonic in LDSDIR:
    opcode(mnemonic, -1, -1, -1, ldsdir_code, Format.LDSDIR, InstrClass.DS)
1472
# MUBUF instructions:
# Columns: gfx6, gfx7, gfx8, gfx9, gfx10, gfx11 opcode numbers, then the mnemonic.
# The registration loop that follows this table forwards only the gfx7, gfx9,
# gfx10 and gfx11 columns to opcode(); the gfx6 and gfx8 columns are not used
# there. -1 stands in for "no opcode number": the duplicate-opcode check at the
# end of this file skips such entries.
# NOTE: this is a set literal, so rows are not guaranteed to be visited in the
# order they are written.
MUBUF = {
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "buffer_load_format_x"),
   (0x01, 0x01, 0x01, 0x01, 0x01, 0x01, "buffer_load_format_xy"),
   (0x02, 0x02, 0x02, 0x02, 0x02, 0x02, "buffer_load_format_xyz"),
   (0x03, 0x03, 0x03, 0x03, 0x03, 0x03, "buffer_load_format_xyzw"),
   (0x04, 0x04, 0x04, 0x04, 0x04, 0x04, "buffer_store_format_x"),
   (0x05, 0x05, 0x05, 0x05, 0x05, 0x05, "buffer_store_format_xy"),
   (0x06, 0x06, 0x06, 0x06, 0x06, 0x06, "buffer_store_format_xyz"),
   (0x07, 0x07, 0x07, 0x07, 0x07, 0x07, "buffer_store_format_xyzw"),
   (  -1,   -1, 0x08, 0x08, 0x80, 0x08, "buffer_load_format_d16_x"),
   (  -1,   -1, 0x09, 0x09, 0x81, 0x09, "buffer_load_format_d16_xy"),
   (  -1,   -1, 0x0a, 0x0a, 0x82, 0x0a, "buffer_load_format_d16_xyz"),
   (  -1,   -1, 0x0b, 0x0b, 0x83, 0x0b, "buffer_load_format_d16_xyzw"),
   (  -1,   -1, 0x0c, 0x0c, 0x84, 0x0c, "buffer_store_format_d16_x"),
   (  -1,   -1, 0x0d, 0x0d, 0x85, 0x0d, "buffer_store_format_d16_xy"),
   (  -1,   -1, 0x0e, 0x0e, 0x86, 0x0e, "buffer_store_format_d16_xyz"),
   (  -1,   -1, 0x0f, 0x0f, 0x87, 0x0f, "buffer_store_format_d16_xyzw"),
   (0x08, 0x08, 0x10, 0x10, 0x08, 0x10, "buffer_load_ubyte"),
   (0x09, 0x09, 0x11, 0x11, 0x09, 0x11, "buffer_load_sbyte"),
   (0x0a, 0x0a, 0x12, 0x12, 0x0a, 0x12, "buffer_load_ushort"),
   (0x0b, 0x0b, 0x13, 0x13, 0x0b, 0x13, "buffer_load_sshort"),
   (0x0c, 0x0c, 0x14, 0x14, 0x0c, 0x14, "buffer_load_dword"),
   (0x0d, 0x0d, 0x15, 0x15, 0x0d, 0x15, "buffer_load_dwordx2"),
   (  -1, 0x0f, 0x16, 0x16, 0x0f, 0x16, "buffer_load_dwordx3"),
   (0x0f, 0x0e, 0x17, 0x17, 0x0e, 0x17, "buffer_load_dwordx4"),
   (0x18, 0x18, 0x18, 0x18, 0x18, 0x18, "buffer_store_byte"),
   (  -1,   -1,   -1, 0x19, 0x19, 0x24, "buffer_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x1a, 0x1a, 0x1a, 0x19, "buffer_store_short"),
   (  -1,   -1,   -1, 0x1b, 0x1b, 0x25, "buffer_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1c, 0x1c, 0x1c, 0x1a, "buffer_store_dword"),
   (0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1b, "buffer_store_dwordx2"),
   (  -1, 0x1f, 0x1e, 0x1e, 0x1f, 0x1c, "buffer_store_dwordx3"),
   (0x1e, 0x1e, 0x1f, 0x1f, 0x1e, 0x1d, "buffer_store_dwordx4"),
   (  -1,   -1,   -1, 0x20, 0x20, 0x1e, "buffer_load_ubyte_d16"),
   (  -1,   -1,   -1, 0x21, 0x21, 0x21, "buffer_load_ubyte_d16_hi"),
   (  -1,   -1,   -1, 0x22, 0x22, 0x1f, "buffer_load_sbyte_d16"),
   (  -1,   -1,   -1, 0x23, 0x23, 0x22, "buffer_load_sbyte_d16_hi"),
   (  -1,   -1,   -1, 0x24, 0x24, 0x20, "buffer_load_short_d16"),
   (  -1,   -1,   -1, 0x25, 0x25, 0x23, "buffer_load_short_d16_hi"),
   (  -1,   -1,   -1, 0x26, 0x26, 0x26, "buffer_load_format_d16_hi_x"),
   (  -1,   -1,   -1, 0x27, 0x27, 0x27, "buffer_store_format_d16_hi_x"),
   (  -1,   -1, 0x3d, 0x3d,   -1,   -1, "buffer_store_lds_dword"),
   (0x71, 0x71, 0x3e, 0x3e,   -1,   -1, "buffer_wbinvl1"),
   (0x70, 0x70, 0x3f, 0x3f,   -1,   -1, "buffer_wbinvl1_vol"),
   (0x30, 0x30, 0x40, 0x40, 0x30, 0x33, "buffer_atomic_swap"),
   (0x31, 0x31, 0x41, 0x41, 0x31, 0x34, "buffer_atomic_cmpswap"),
   (0x32, 0x32, 0x42, 0x42, 0x32, 0x35, "buffer_atomic_add"),
   (0x33, 0x33, 0x43, 0x43, 0x33, 0x36, "buffer_atomic_sub"),
   (0x34,   -1,   -1,   -1,   -1,   -1, "buffer_atomic_rsub"),
   (0x35, 0x35, 0x44, 0x44, 0x35, 0x38, "buffer_atomic_smin"),
   (0x36, 0x36, 0x45, 0x45, 0x36, 0x39, "buffer_atomic_umin"),
   (0x37, 0x37, 0x46, 0x46, 0x37, 0x3a, "buffer_atomic_smax"),
   (0x38, 0x38, 0x47, 0x47, 0x38, 0x3b, "buffer_atomic_umax"),
   (0x39, 0x39, 0x48, 0x48, 0x39, 0x3c, "buffer_atomic_and"),
   (0x3a, 0x3a, 0x49, 0x49, 0x3a, 0x3d, "buffer_atomic_or"),
   (0x3b, 0x3b, 0x4a, 0x4a, 0x3b, 0x3e, "buffer_atomic_xor"),
   (0x3c, 0x3c, 0x4b, 0x4b, 0x3c, 0x3f, "buffer_atomic_inc"),
   (0x3d, 0x3d, 0x4c, 0x4c, 0x3d, 0x40, "buffer_atomic_dec"),
   (0x3e, 0x3e,   -1,   -1, 0x3e, 0x50, "buffer_atomic_fcmpswap"),
   (0x3f, 0x3f,   -1,   -1, 0x3f, 0x51, "buffer_atomic_fmin"),
   (0x40, 0x40,   -1,   -1, 0x40, 0x52, "buffer_atomic_fmax"),
   (0x50, 0x50, 0x60, 0x60, 0x50, 0x41, "buffer_atomic_swap_x2"),
   (0x51, 0x51, 0x61, 0x61, 0x51, 0x42, "buffer_atomic_cmpswap_x2"),
   (0x52, 0x52, 0x62, 0x62, 0x52, 0x43, "buffer_atomic_add_x2"),
   (0x53, 0x53, 0x63, 0x63, 0x53, 0x44, "buffer_atomic_sub_x2"),
   (0x54,   -1,   -1,   -1,   -1,   -1, "buffer_atomic_rsub_x2"),
   (0x55, 0x55, 0x64, 0x64, 0x55, 0x45, "buffer_atomic_smin_x2"),
   (0x56, 0x56, 0x65, 0x65, 0x56, 0x46, "buffer_atomic_umin_x2"),
   (0x57, 0x57, 0x66, 0x66, 0x57, 0x47, "buffer_atomic_smax_x2"),
   (0x58, 0x58, 0x67, 0x67, 0x58, 0x48, "buffer_atomic_umax_x2"),
   (0x59, 0x59, 0x68, 0x68, 0x59, 0x49, "buffer_atomic_and_x2"),
   (0x5a, 0x5a, 0x69, 0x69, 0x5a, 0x4a, "buffer_atomic_or_x2"),
   (0x5b, 0x5b, 0x6a, 0x6a, 0x5b, 0x4b, "buffer_atomic_xor_x2"),
   (0x5c, 0x5c, 0x6b, 0x6b, 0x5c, 0x4c, "buffer_atomic_inc_x2"),
   (0x5d, 0x5d, 0x6c, 0x6c, 0x5d, 0x4d, "buffer_atomic_dec_x2"),
   (0x5e, 0x5e,   -1,   -1, 0x5e,   -1, "buffer_atomic_fcmpswap_x2"),
   (0x5f, 0x5f,   -1,   -1, 0x5f,   -1, "buffer_atomic_fmin_x2"),
   (0x60, 0x60,   -1,   -1, 0x60,   -1, "buffer_atomic_fmax_x2"),
   (  -1,   -1,   -1,   -1, 0x71, 0x2b, "buffer_gl0_inv"),
   (  -1,   -1,   -1,   -1, 0x72, 0x2c, "buffer_gl1_inv"),
   (  -1,   -1,   -1,   -1, 0x34, 0x37, "buffer_atomic_csub"), #GFX10.3+. seems glc must be set. buffer_atomic_csub_u32 in GFX11
   (  -1,   -1,   -1,   -1,   -1, 0x31, "buffer_load_lds_b32"),
   (  -1,   -1,   -1,   -1,   -1, 0x32, "buffer_load_lds_format_x"),
   (  -1,   -1,   -1,   -1,   -1, 0x2e, "buffer_load_lds_i8"),
   (  -1,   -1,   -1,   -1,   -1, 0x30, "buffer_load_lds_i16"),
   (  -1,   -1,   -1,   -1,   -1, 0x2d, "buffer_load_lds_u8"),
   (  -1,   -1,   -1,   -1,   -1, 0x2f, "buffer_load_lds_u16"),
   (  -1,   -1,   -1,   -1,   -1, 0x56, "buffer_atomic_add_f32"),
}
# Register every MUBUF row. Only the gfx7, gfx9, gfx10 and gfx11 numbers are
# handed to opcode(); a row is flagged atomic when its mnemonic contains "atomic".
for _gfx6, gfx7, _gfx8, gfx9, gfx10, gfx11, mnemonic in MUBUF:
    mentions_atomic = "atomic" in mnemonic
    opcode(mnemonic, gfx7, gfx9, gfx10, gfx11, Format.MUBUF, InstrClass.VMem,
           is_atomic = mentions_atomic)
1565
# MTBUF instructions:
# Columns: gfx6, gfx7, gfx8, gfx9, gfx10, gfx11 opcode numbers, then the mnemonic.
# The registration loop that follows forwards only the gfx7, gfx9, gfx10 and
# gfx11 columns to opcode(). -1 stands in for "no opcode number" (the
# duplicate-opcode check at the end of this file skips such entries).
MTBUF = {
   (0x00, 0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"),
   (0x01, 0x01, 0x01, 0x01, 0x01, 0x01, "tbuffer_load_format_xy"),
   (0x02, 0x02, 0x02, 0x02, 0x02, 0x02, "tbuffer_load_format_xyz"),
   (0x03, 0x03, 0x03, 0x03, 0x03, 0x03, "tbuffer_load_format_xyzw"),
   (0x04, 0x04, 0x04, 0x04, 0x04, 0x04, "tbuffer_store_format_x"),
   (0x05, 0x05, 0x05, 0x05, 0x05, 0x05, "tbuffer_store_format_xy"),
   (0x06, 0x06, 0x06, 0x06, 0x06, 0x06, "tbuffer_store_format_xyz"),
   (0x07, 0x07, 0x07, 0x07, 0x07, 0x07, "tbuffer_store_format_xyzw"),
   (  -1,   -1, 0x08, 0x08, 0x08, 0x08, "tbuffer_load_format_d16_x"),
   (  -1,   -1, 0x09, 0x09, 0x09, 0x09, "tbuffer_load_format_d16_xy"),
   (  -1,   -1, 0x0a, 0x0a, 0x0a, 0x0a, "tbuffer_load_format_d16_xyz"),
   (  -1,   -1, 0x0b, 0x0b, 0x0b, 0x0b, "tbuffer_load_format_d16_xyzw"),
   (  -1,   -1, 0x0c, 0x0c, 0x0c, 0x0c, "tbuffer_store_format_d16_x"),
   (  -1,   -1, 0x0d, 0x0d, 0x0d, 0x0d, "tbuffer_store_format_d16_xy"),
   (  -1,   -1, 0x0e, 0x0e, 0x0e, 0x0e, "tbuffer_store_format_d16_xyz"),
   (  -1,   -1, 0x0f, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"),
}
# Register every MTBUF row; the gfx6 and gfx8 columns are not handed to opcode().
for _gfx6, gfx7, _gfx8, gfx9, gfx10, gfx11, mnemonic in MTBUF:
    opcode(mnemonic, gfx7, gfx9, gfx10, gfx11, Format.MTBUF, InstrClass.VMem)
1586
1587
# MIMG load/store/query instructions.
# Columns: code, gfx11, mnemonic. The loop that follows passes `code` for the
# first three opcode arguments of opcode() and gfx11 as the fourth.
IMAGE = {
   (0x00, 0x00, "image_load"),
   (0x01, 0x01, "image_load_mip"),
   (0x02, 0x02, "image_load_pck"),
   (0x03, 0x03, "image_load_pck_sgn"),
   (0x04, 0x04, "image_load_mip_pck"),
   (0x05, 0x05, "image_load_mip_pck_sgn"),
   (0x08, 0x06, "image_store"),
   (0x09, 0x07, "image_store_mip"),
   (0x0a, 0x08, "image_store_pck"),
   (0x0b, 0x09, "image_store_mip_pck"),
   (0x0e, 0x17, "image_get_resinfo"),
   (0x60, 0x38, "image_get_lod"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (code, code, code, code, code, gfx11, name)
# i.e. one shared number fills the first three opcode arguments; gfx11 has its own column.
for shared_code, gfx11, mnemonic in IMAGE:
   opcode(mnemonic, shared_code, shared_code, shared_code, gfx11, Format.MIMG, InstrClass.VMem)

opcode("image_msaa_load", -1, -1, 0x80, 0x18, Format.MIMG, InstrClass.VMem) #GFX10.3+
1607
# MIMG atomic instructions.
# Columns: gfx6, gfx7, gfx89, gfx11, mnemonic. There is no separate gfx10 column:
# the loop that follows passes the gfx7 number for it as well. -1 stands in for
# "no opcode number".
IMAGE_ATOMIC = {
   (0x0f, 0x0f, 0x10, 0x0a, "image_atomic_swap"),
   (0x10, 0x10, 0x11, 0x0b, "image_atomic_cmpswap"),
   (0x11, 0x11, 0x12, 0x0c, "image_atomic_add"),
   (0x12, 0x12, 0x13, 0x0d, "image_atomic_sub"),
   (0x13,   -1,   -1,   -1, "image_atomic_rsub"),
   (0x14, 0x14, 0x14, 0x0e, "image_atomic_smin"),
   (0x15, 0x15, 0x15, 0x0f, "image_atomic_umin"),
   (0x16, 0x16, 0x16, 0x10, "image_atomic_smax"),
   (0x17, 0x17, 0x17, 0x11, "image_atomic_umax"),
   (0x18, 0x18, 0x18, 0x12, "image_atomic_and"),
   (0x19, 0x19, 0x19, 0x13, "image_atomic_or"),
   (0x1a, 0x1a, 0x1a, 0x14, "image_atomic_xor"),
   (0x1b, 0x1b, 0x1b, 0x15, "image_atomic_inc"),
   (0x1c, 0x1c, 0x1c, 0x16, "image_atomic_dec"),
   (0x1d, 0x1d,   -1,   -1, "image_atomic_fcmpswap"),
   (0x1e, 0x1e,   -1,   -1, "image_atomic_fmin"),
   (0x1f, 0x1f,   -1,   -1, "image_atomic_fmax"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (gfx6, gfx7, gfx89, gfx89, ???, gfx11, name)
# The table has no gfx10 column; gfx7 and gfx10 opcodes are the same here, so the
# gfx7 number is reused for it. Every row is registered as an atomic.
for _gfx6, gfx7, gfx89, gfx11, mnemonic in IMAGE_ATOMIC:
   gfx10 = gfx7
   opcode(mnemonic, gfx7, gfx89, gfx10, gfx11, Format.MIMG, InstrClass.VMem, is_atomic = True)
1631
# MIMG sample instructions.
# Columns: code, gfx11, mnemonic. The loop that follows passes `code` for the
# first three opcode arguments of opcode() and gfx11 as the fourth. -1 stands in
# for "no opcode number".
IMAGE_SAMPLE = {
   (0x20, 0x1b, "image_sample"),
   (0x21, 0x40, "image_sample_cl"),
   (0x22, 0x1c, "image_sample_d"),
   (0x23, 0x41, "image_sample_d_cl"),
   (0x24, 0x1d, "image_sample_l"),
   (0x25, 0x1e, "image_sample_b"),
   (0x26, 0x42, "image_sample_b_cl"),
   (0x27, 0x1f, "image_sample_lz"),
   (0x28, 0x20, "image_sample_c"),
   (0x29, 0x43, "image_sample_c_cl"),
   (0x2a, 0x21, "image_sample_c_d"),
   (0x2b, 0x44, "image_sample_c_d_cl"),
   (0x2c, 0x22, "image_sample_c_l"),
   (0x2d, 0x23, "image_sample_c_b"),
   (0x2e, 0x45, "image_sample_c_b_cl"),
   (0x2f, 0x24, "image_sample_c_lz"),
   (0x30, 0x25, "image_sample_o"),
   (0x31, 0x46, "image_sample_cl_o"),
   (0x32, 0x26, "image_sample_d_o"),
   (0x33, 0x47, "image_sample_d_cl_o"),
   (0x34, 0x27, "image_sample_l_o"),
   (0x35, 0x28, "image_sample_b_o"),
   (0x36, 0x48, "image_sample_b_cl_o"),
   (0x37, 0x29, "image_sample_lz_o"),
   (0x38, 0x2a, "image_sample_c_o"),
   (0x39, 0x49, "image_sample_c_cl_o"),
   (0x3a, 0x2b, "image_sample_c_d_o"),
   (0x3b, 0x4a, "image_sample_c_d_cl_o"),
   (0x3c, 0x2c, "image_sample_c_l_o"),
   (0x3d, 0x2d, "image_sample_c_b_o"),
   (0x3e, 0x4b, "image_sample_c_b_cl_o"),
   (0x3f, 0x2e, "image_sample_c_lz_o"),
   (0x68,   -1, "image_sample_cd"),
   (0x69,   -1, "image_sample_cd_cl"),
   (0x6a,   -1, "image_sample_c_cd"),
   (0x6b,   -1, "image_sample_c_cd_cl"),
   (0x6c,   -1, "image_sample_cd_o"),
   (0x6d,   -1, "image_sample_cd_cl_o"),
   (0x6e,   -1, "image_sample_c_cd_o"),
   (0x6f,   -1, "image_sample_c_cd_cl_o"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (code, code, code, code, code, gfx11, name)
# One shared number fills the first three opcode arguments; gfx11 has its own column.
for shared_code, gfx11, mnemonic in IMAGE_SAMPLE:
   opcode(mnemonic, shared_code, shared_code, shared_code, gfx11, Format.MIMG, InstrClass.VMem)
1677
# MIMG sample instructions with the _g16 suffix.
# Columns: code, gfx11, mnemonic. The loop that follows passes -1 for the first
# two opcode arguments of opcode(), `code` as the third and gfx11 as the fourth.
IMAGE_SAMPLE_G16 = {
   (0xa2, 0x39, "image_sample_d_g16"),
   (0xa3, 0x5f, "image_sample_d_cl_g16"),
   (0xaa, 0x3a, "image_sample_c_d_g16"),
   (0xab, 0x54, "image_sample_c_d_cl_g16"),
   (0xb2, 0x3b, "image_sample_d_o_g16"),
   (0xb3, 0x55, "image_sample_d_cl_o_g16"),
   (0xba, 0x3c, "image_sample_c_d_o_g16"),
   (0xbb, 0x56, "image_sample_c_d_cl_o_g16"),
}
1688
# (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (-1, -1, -1, -1, code, gfx11, name)
# Only the last two opcode slots receive a number; the first two are passed as -1.
for gfx10_code, gfx11, mnemonic in IMAGE_SAMPLE_G16:
   opcode(mnemonic, -1, -1, gfx10_code, gfx11, Format.MIMG, InstrClass.VMem)
1692
# MIMG gather4 instructions.
# Columns: code, gfx11, mnemonic. The loop that follows passes `code` for the
# first three opcode arguments of opcode() and gfx11 as the fourth. -1 stands in
# for "no opcode number". The commented-out rows are kept from the original
# table and are not registered.
IMAGE_GATHER4 = {
   (0x40, 0x2f, "image_gather4"),
   (0x41, 0x60, "image_gather4_cl"),
   #(0x42, "image_gather4h"), VEGA only?
   (0x44, 0x30, "image_gather4_l"), # following instructions have different opcodes according to ISA sheet.
   (0x45, 0x31, "image_gather4_b"),
   (0x46, 0x61, "image_gather4_b_cl"),
   (0x47, 0x32, "image_gather4_lz"),
   (0x48, 0x33, "image_gather4_c"),
   (0x49, 0x62, "image_gather4_c_cl"), # previous instructions have different opcodes according to ISA sheet.
   #(0x4a, "image_gather4h_pck"), VEGA only?
   #(0x4b, "image_gather8h_pck"), VEGA only?
   (0x4c, 0x63, "image_gather4_c_l"),
   (0x4d, 0x64, "image_gather4_c_b"),
   (0x4e, 0x65, "image_gather4_c_b_cl"),
   (0x4f, 0x34, "image_gather4_c_lz"),
   (0x50, 0x35, "image_gather4_o"),
   (0x51,   -1, "image_gather4_cl_o"),
   (0x54,   -1, "image_gather4_l_o"),
   (0x55,   -1, "image_gather4_b_o"),
   (0x56,   -1, "image_gather4_b_cl_o"),
   (0x57, 0x36, "image_gather4_lz_o"),
   (0x58,   -1, "image_gather4_c_o"),
   (0x59,   -1, "image_gather4_c_cl_o"),
   (0x5c,   -1, "image_gather4_c_l_o"),
   (0x5d,   -1, "image_gather4_c_b_o"),
   (0x5e,   -1, "image_gather4_c_b_cl_o"),
   (0x5f, 0x37, "image_gather4_c_lz_o"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, gfx11, name) = (code, code, code, code, code, gfx11, name)
# One shared number fills the first three opcode arguments; gfx11 has its own column.
for shared_code, gfx11, mnemonic in IMAGE_GATHER4:
   opcode(mnemonic, shared_code, shared_code, shared_code, gfx11, Format.MIMG, InstrClass.VMem)

# The two BVH ray-intersection opcodes are registered individually, with -1 in
# the first two opcode slots.
opcode("image_bvh_intersect_ray", -1, -1, 0xe6, 0x19, Format.MIMG, InstrClass.VMem)
opcode("image_bvh64_intersect_ray", -1, -1, 0xe7, 0x1a, Format.MIMG, InstrClass.VMem)
1728
# FLAT instructions.
# Columns are listed in the comment on the first line of the table, followed by
# the mnemonic; the loop that follows forwards all four numbers to opcode() in
# that order. -1 stands in for "no opcode number" (the duplicate-opcode check at
# the end of this file skips such entries).
FLAT = {
   #GFX7, GFX89,GFX10,GFX11
   (0x08, 0x10, 0x08, 0x10, "flat_load_ubyte"),
   (0x09, 0x11, 0x09, 0x11, "flat_load_sbyte"),
   (0x0a, 0x12, 0x0a, 0x12, "flat_load_ushort"),
   (0x0b, 0x13, 0x0b, 0x13, "flat_load_sshort"),
   (0x0c, 0x14, 0x0c, 0x14, "flat_load_dword"),
   (0x0d, 0x15, 0x0d, 0x15, "flat_load_dwordx2"),
   (0x0f, 0x16, 0x0f, 0x16, "flat_load_dwordx3"),
   (0x0e, 0x17, 0x0e, 0x17, "flat_load_dwordx4"),
   (0x18, 0x18, 0x18, 0x18, "flat_store_byte"),
   (  -1, 0x19, 0x19, 0x24, "flat_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x1a, 0x19, "flat_store_short"),
   (  -1, 0x1b, 0x1b, 0x25, "flat_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1c, 0x1a, "flat_store_dword"),
   (0x1d, 0x1d, 0x1d, 0x1b, "flat_store_dwordx2"),
   (0x1f, 0x1e, 0x1f, 0x1c, "flat_store_dwordx3"),
   (0x1e, 0x1f, 0x1e, 0x1d, "flat_store_dwordx4"),
   (  -1, 0x20, 0x20, 0x1e, "flat_load_ubyte_d16"),
   (  -1, 0x21, 0x21, 0x21, "flat_load_ubyte_d16_hi"),
   (  -1, 0x22, 0x22, 0x1f, "flat_load_sbyte_d16"),
   (  -1, 0x23, 0x23, 0x22, "flat_load_sbyte_d16_hi"),
   (  -1, 0x24, 0x24, 0x20, "flat_load_short_d16"),
   (  -1, 0x25, 0x25, 0x23, "flat_load_short_d16_hi"),
   (0x30, 0x40, 0x30, 0x33, "flat_atomic_swap"),
   (0x31, 0x41, 0x31, 0x34, "flat_atomic_cmpswap"),
   (0x32, 0x42, 0x32, 0x35, "flat_atomic_add"),
   (0x33, 0x43, 0x33, 0x36, "flat_atomic_sub"),
   (0x35, 0x44, 0x35, 0x38, "flat_atomic_smin"),
   (0x36, 0x45, 0x36, 0x39, "flat_atomic_umin"),
   (0x37, 0x46, 0x37, 0x3a, "flat_atomic_smax"),
   (0x38, 0x47, 0x38, 0x3b, "flat_atomic_umax"),
   (0x39, 0x48, 0x39, 0x3c, "flat_atomic_and"),
   (0x3a, 0x49, 0x3a, 0x3d, "flat_atomic_or"),
   (0x3b, 0x4a, 0x3b, 0x3e, "flat_atomic_xor"),
   (0x3c, 0x4b, 0x3c, 0x3f, "flat_atomic_inc"),
   (0x3d, 0x4c, 0x3d, 0x40, "flat_atomic_dec"),
   (0x3e,   -1, 0x3e, 0x50, "flat_atomic_fcmpswap"),
   (0x3f,   -1, 0x3f, 0x51, "flat_atomic_fmin"),
   (0x40,   -1, 0x40, 0x52, "flat_atomic_fmax"),
   (0x50, 0x60, 0x50, 0x41, "flat_atomic_swap_x2"),
   (0x51, 0x61, 0x51, 0x42, "flat_atomic_cmpswap_x2"),
   (0x52, 0x62, 0x52, 0x43, "flat_atomic_add_x2"),
   (0x53, 0x63, 0x53, 0x44, "flat_atomic_sub_x2"),
   (0x55, 0x64, 0x55, 0x45, "flat_atomic_smin_x2"),
   (0x56, 0x65, 0x56, 0x46, "flat_atomic_umin_x2"),
   (0x57, 0x66, 0x57, 0x47, "flat_atomic_smax_x2"),
   (0x58, 0x67, 0x58, 0x48, "flat_atomic_umax_x2"),
   (0x59, 0x68, 0x59, 0x49, "flat_atomic_and_x2"),
   (0x5a, 0x69, 0x5a, 0x4a, "flat_atomic_or_x2"),
   (0x5b, 0x6a, 0x5b, 0x4b, "flat_atomic_xor_x2"),
   (0x5c, 0x6b, 0x5c, 0x4c, "flat_atomic_inc_x2"),
   (0x5d, 0x6c, 0x5d, 0x4d, "flat_atomic_dec_x2"),
   (0x5e,   -1, 0x5e,   -1, "flat_atomic_fcmpswap_x2"),
   (0x5f,   -1, 0x5f,   -1, "flat_atomic_fmin_x2"),
   (0x60,   -1, 0x60,   -1, "flat_atomic_fmax_x2"),
   (  -1,   -1,   -1, 0x56, "flat_atomic_add_f32"),
}
# Register every FLAT row; a row is flagged atomic when its mnemonic contains
# "atomic". The second column holds the number the table header labels GFX89.
for gfx7, gfx89, gfx10, gfx11, mnemonic in FLAT:
    mentions_atomic = "atomic" in mnemonic
    opcode(mnemonic, gfx7, gfx89, gfx10, gfx11, Format.FLAT, InstrClass.VMem,
           is_atomic = mentions_atomic) #TODO: also LDS?
1789
# GLOBAL instructions.
# Columns are listed in the comment on the first line of the table, followed by
# the mnemonic. The loop that follows passes -1 as the first opcode argument of
# opcode() and then these three numbers in order. -1 stands in for "no opcode
# number" (the duplicate-opcode check at the end of this file skips such entries).
GLOBAL = {
   #GFX89,GFX10,GFX11
   (0x10, 0x08, 0x10, "global_load_ubyte"),
   (0x11, 0x09, 0x11, "global_load_sbyte"),
   (0x12, 0x0a, 0x12, "global_load_ushort"),
   (0x13, 0x0b, 0x13, "global_load_sshort"),
   (0x14, 0x0c, 0x14, "global_load_dword"),
   (0x15, 0x0d, 0x15, "global_load_dwordx2"),
   (0x16, 0x0f, 0x16, "global_load_dwordx3"),
   (0x17, 0x0e, 0x17, "global_load_dwordx4"),
   (0x18, 0x18, 0x18, "global_store_byte"),
   (0x19, 0x19, 0x24, "global_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x19, "global_store_short"),
   (0x1b, 0x1b, 0x25, "global_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1a, "global_store_dword"),
   (0x1d, 0x1d, 0x1b, "global_store_dwordx2"),
   (0x1e, 0x1f, 0x1c, "global_store_dwordx3"),
   (0x1f, 0x1e, 0x1d, "global_store_dwordx4"),
   (0x20, 0x20, 0x1e, "global_load_ubyte_d16"),
   (0x21, 0x21, 0x21, "global_load_ubyte_d16_hi"),
   (0x22, 0x22, 0x1f, "global_load_sbyte_d16"),
   (0x23, 0x23, 0x22, "global_load_sbyte_d16_hi"),
   (0x24, 0x24, 0x20, "global_load_short_d16"),
   (0x25, 0x25, 0x23, "global_load_short_d16_hi"),
   (0x40, 0x30, 0x33, "global_atomic_swap"),
   (0x41, 0x31, 0x34, "global_atomic_cmpswap"),
   (0x42, 0x32, 0x35, "global_atomic_add"),
   (0x43, 0x33, 0x36, "global_atomic_sub"),
   (0x44, 0x35, 0x38, "global_atomic_smin"),
   (0x45, 0x36, 0x39, "global_atomic_umin"),
   (0x46, 0x37, 0x3a, "global_atomic_smax"),
   (0x47, 0x38, 0x3b, "global_atomic_umax"),
   (0x48, 0x39, 0x3c, "global_atomic_and"),
   (0x49, 0x3a, 0x3d, "global_atomic_or"),
   (0x4a, 0x3b, 0x3e, "global_atomic_xor"),
   (0x4b, 0x3c, 0x3f, "global_atomic_inc"),
   (0x4c, 0x3d, 0x40, "global_atomic_dec"),
   (  -1, 0x3e, 0x50, "global_atomic_fcmpswap"),
   (  -1, 0x3f, 0x51, "global_atomic_fmin"),
   (  -1, 0x40, 0x52, "global_atomic_fmax"),
   (0x60, 0x50, 0x41, "global_atomic_swap_x2"),
   (0x61, 0x51, 0x42, "global_atomic_cmpswap_x2"),
   (0x62, 0x52, 0x43, "global_atomic_add_x2"),
   (0x63, 0x53, 0x44, "global_atomic_sub_x2"),
   (0x64, 0x55, 0x45, "global_atomic_smin_x2"),
   (0x65, 0x56, 0x46, "global_atomic_umin_x2"),
   (0x66, 0x57, 0x47, "global_atomic_smax_x2"),
   (0x67, 0x58, 0x48, "global_atomic_umax_x2"),
   (0x68, 0x59, 0x49, "global_atomic_and_x2"),
   (0x69, 0x5a, 0x4a, "global_atomic_or_x2"),
   (0x6a, 0x5b, 0x4b, "global_atomic_xor_x2"),
   (0x6b, 0x5c, 0x4c, "global_atomic_inc_x2"),
   (0x6c, 0x5d, 0x4d, "global_atomic_dec_x2"),
   (  -1, 0x5e,   -1, "global_atomic_fcmpswap_x2"),
   (  -1, 0x5f,   -1, "global_atomic_fmin_x2"),
   (  -1, 0x60,   -1, "global_atomic_fmax_x2"),
   (  -1, 0x16, 0x28, "global_load_dword_addtid"), #GFX10.3+
   (  -1, 0x17, 0x29, "global_store_dword_addtid"), #GFX10.3+
   (  -1, 0x34, 0x37, "global_atomic_csub"), #GFX10.3+. seems glc must be set
   (  -1,   -1, 0x56, "global_atomic_add_f32"),
}
# Register every GLOBAL row. The first opcode slot is always -1; a row is flagged
# atomic when its mnemonic contains "atomic".
for gfx89, gfx10, gfx11, mnemonic in GLOBAL:
    mentions_atomic = "atomic" in mnemonic
    opcode(mnemonic, -1, gfx89, gfx10, gfx11, Format.GLOBAL, InstrClass.VMem,
           is_atomic = mentions_atomic)
1853
# SCRATCH instructions.
# Columns are listed in the comment on the first line of the table, followed by
# the mnemonic. The loop that follows passes -1 as the first opcode argument of
# opcode() and then these three numbers in order.
SCRATCH = {
   #GFX89,GFX10,GFX11
   (0x10, 0x08, 0x10, "scratch_load_ubyte"),
   (0x11, 0x09, 0x11, "scratch_load_sbyte"),
   (0x12, 0x0a, 0x12, "scratch_load_ushort"),
   (0x13, 0x0b, 0x13, "scratch_load_sshort"),
   (0x14, 0x0c, 0x14, "scratch_load_dword"),
   (0x15, 0x0d, 0x15, "scratch_load_dwordx2"),
   (0x16, 0x0f, 0x16, "scratch_load_dwordx3"),
   (0x17, 0x0e, 0x17, "scratch_load_dwordx4"),
   (0x18, 0x18, 0x18, "scratch_store_byte"),
   (0x19, 0x19, 0x24, "scratch_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x19, "scratch_store_short"),
   (0x1b, 0x1b, 0x25, "scratch_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1a, "scratch_store_dword"),
   (0x1d, 0x1d, 0x1b, "scratch_store_dwordx2"),
   (0x1e, 0x1f, 0x1c, "scratch_store_dwordx3"),
   (0x1f, 0x1e, 0x1d, "scratch_store_dwordx4"),
   (0x20, 0x20, 0x1e, "scratch_load_ubyte_d16"),
   (0x21, 0x21, 0x21, "scratch_load_ubyte_d16_hi"),
   (0x22, 0x22, 0x1f, "scratch_load_sbyte_d16"),
   (0x23, 0x23, 0x22, "scratch_load_sbyte_d16_hi"),
   (0x24, 0x24, 0x20, "scratch_load_short_d16"),
   (0x25, 0x25, 0x23, "scratch_load_short_d16_hi"),
}
# Register every SCRATCH row; the first opcode slot is always -1.
for gfx89, gfx10, gfx11, mnemonic in SCRATCH:
    opcode(mnemonic, -1, gfx89, gfx10, gfx11, Format.SCRATCH, InstrClass.VMem)
1881
# check for duplicate opcode numbers
#
# For each checked generation, a (format, opcode number) pair may belong to only
# one registered instruction. Instructions in a pseudo format are ignored, and so
# are instructions whose number for that generation is -1. A clash that is not
# one of the known aliases listed below aborts the script with exit status 1.
_PSEUDO_FORMATS = (Format.PSEUDO, Format.PSEUDO_BRANCH, Format.PSEUDO_BARRIER, Format.PSEUDO_REDUCTION)
for ver in ['gfx9', 'gfx10', 'gfx11']:
    op_to_name = {}
    for op in opcodes.values():
        if op.format in _PSEUDO_FORMATS:
            continue

        num = getattr(op, 'opcode_' + ver)
        if num == -1:
            continue

        key = (op.format, num)

        # First instruction seen with this (format, number): remember it.
        if key not in op_to_name:
            op_to_name[key] = op.name
            continue

        # exceptions
        names = {op_to_name[key], op.name}
        # NOTE(review): 'gfx8' cannot match while the outer loop only visits
        # gfx9/gfx10/gfx11; it is kept unchanged from the original condition.
        if ver in ['gfx8', 'gfx9', 'gfx11'] and names == {'v_mul_lo_i32', 'v_mul_lo_u32'}:
            continue
        # v_mad_legacy_f32 is replaced with v_fma_legacy_f32 on GFX10.3
        if ver == 'gfx10' and names == {'v_mad_legacy_f32', 'v_fma_legacy_f32'}:
            continue
        # v_mac_legacy_f32 is replaced with v_fmac_legacy_f32 on GFX10.3
        if ver == 'gfx10' and names == {'v_mac_legacy_f32', 'v_fmac_legacy_f32'}:
            continue

        # Passing the message to sys.exit() prints it on stderr and exits with
        # status 1, so the diagnostic is not mixed into whatever is written to
        # stdout.
        sys.exit('%s and %s share the same opcode number (%s)' % (op_to_name[key], op.name, ver))
1911