• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (c) 2018 Valve Corporation
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Daniel Schuermann (daniel.schuermann@campus.tu-berlin.de)
25
26
27# Class that represents all the information we have about the opcode
28# NOTE: this must be kept in sync with aco_op_info
29
30import sys
31from enum import Enum
32
class Format(Enum):
   """Instruction formats an opcode can be registered with.

   Besides naming the format, every member can report the extra,
   format-specific builder fields that belong to it (see
   get_builder_fields()) and the initialization statements that go with it
   (see get_builder_initialization()).
   """
   PSEUDO = 0
   SOP1 = 1
   SOP2 = 2
   SOPK = 3
   SOPP = 4
   SOPC = 5
   SMEM = 6
   DS = 8
   MTBUF = 9
   MUBUF = 10
   MIMG = 11
   EXP = 12
   FLAT = 13
   GLOBAL = 14
   SCRATCH = 15
   PSEUDO_BRANCH = 16
   PSEUDO_BARRIER = 17
   PSEUDO_REDUCTION = 18
   VOP3P = 19
   VOP1 = 1 << 8
   VOP2 = 1 << 9
   VOPC = 1 << 10
   VOP3A = 1 << 11
   # Same value as VOP3A, so Enum treats this name as an alias:
   # Format.VOP3B is Format.VOP3A.
   VOP3B = 1 << 11
   VINTRP = 1 << 12
   DPP = 1 << 13
   SDWA = 1 << 14

   def get_builder_fields(self):
      """Return the list of format-specific builder fields.

      Every entry is a tuple (type, name, default[, dest]):

      - type and name are strings,
      - default is the text of the default value, or None when the field has
        no default,
      - dest is optional; when present it is the name reported by
        get_builder_field_dests() instead of name.

      Formats without extra fields yield an empty list.
      """
      if self == Format.SOPK:
         return [('uint16_t', 'imm', None)]
      elif self == Format.SOPP:
         return [('uint32_t', 'block', '-1'),
                 ('uint32_t', 'imm', '0')]
      elif self == Format.SMEM:
         return [('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'nv', 'false')]
      elif self == Format.DS:
         return [('int16_t', 'offset0', '0'),
                 ('int8_t', 'offset1', '0'),
                 ('bool', 'gds', 'false')]
      elif self == Format.MTBUF:
         return [('unsigned', 'dfmt', None),
                 ('unsigned', 'nfmt', None),
                 ('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false')]
      elif self == Format.MUBUF:
         return [('unsigned', 'offset', None),
                 ('bool', 'offen', None),
                 ('bool', 'swizzled', 'false'),
                 ('bool', 'idxen', 'false'),
                 ('bool', 'addr64', 'false'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lds', 'false')]
      elif self == Format.MIMG:
         # An unreachable second return (a copy of the VINTRP fields) used to
         # follow this one; it has been dropped.
         return [('unsigned', 'dmask', '0xF'),
                 ('bool', 'da', 'false'),
                 ('bool', 'unrm', 'true'),
                 ('bool', 'disable_wqm', 'false'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'dlc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'tfe', 'false'),
                 ('bool', 'lwe', 'false'),
                 ('bool', 'r128_a16', 'false', 'r128'),
                 ('bool', 'd16', 'false')]
      elif self == Format.EXP:
         return [('unsigned', 'enabled_mask', None),
                 ('unsigned', 'dest', None),
                 ('bool', 'compr', 'false', 'compressed'),
                 ('bool', 'done', 'false'),
                 ('bool', 'vm', 'false', 'valid_mask')]
      elif self == Format.PSEUDO_BRANCH:
         return [('uint32_t', 'target0', '0', 'target[0]'),
                 ('uint32_t', 'target1', '0', 'target[1]')]
      elif self == Format.PSEUDO_REDUCTION:
         return [('ReduceOp', 'op', None, 'reduce_op'),
                 ('unsigned', 'cluster_size', '0')]
      elif self == Format.PSEUDO_BARRIER:
         return [('memory_sync_info', 'sync', None),
                 ('sync_scope', 'exec_scope', 'scope_invocation')]
      elif self == Format.VINTRP:
         return [('unsigned', 'attribute', None),
                 ('unsigned', 'component', None)]
      elif self == Format.DPP:
         return [('uint16_t', 'dpp_ctrl', None),
                 ('uint8_t', 'row_mask', '0xF'),
                 ('uint8_t', 'bank_mask', '0xF'),
                 ('bool', 'bound_ctrl', 'true')]
      elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
         # The offset default is the integer 0 rather than a string; it is
         # still rendered as "0" by get_builder_field_decls().
         return [('uint16_t', 'offset', 0),
                 ('memory_sync_info', 'sync', 'memory_sync_info()'),
                 ('bool', 'glc', 'false'),
                 ('bool', 'slc', 'false'),
                 ('bool', 'lds', 'false'),
                 ('bool', 'nv', 'false')]
      else:
         return []

   def get_builder_field_names(self):
      """Return the name (second tuple element) of every builder field."""
      return [f[1] for f in self.get_builder_fields()]

   def get_builder_field_dests(self):
      """Return, per builder field, its dest element if given, else its name."""
      return [(f[3] if len(f) >= 4 else f[1]) for f in self.get_builder_fields()]

   def get_builder_field_decls(self):
      """Return one declaration string per builder field.

      The string is "type name=default" when the field has a default and
      "type name" otherwise. The check is against None explicitly so that a
      falsy default such as 0 still counts as a default.
      """
      return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] is not None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()]

   def get_builder_initialization(self, num_operands):
      """Return the initialization statements for this format as one string.

      Only SDWA produces any text: one "instr->sel[i]" assignment for each of
      the first min(num_operands, 2) operands, followed by the dst_sel and
      dst_preserve assignments. Every other format yields ''.
      """
      res = ''
      if self == Format.SDWA:
         for i in range(min(num_operands, 2)):
            res += 'instr->sel[{0}] = op{0}.op.bytes() == 2 ? sdwa_uword : (op{0}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'.format(i)
         res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'
         res += 'instr->dst_preserve = true;'
      return res
164
165
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with aco_op_info
   """
   def __init__(self, name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic):
      """Parameters:

      - name is the opcode name (a str). Besides identifying the opcode it
        is parsed to guess operand_size and definition_size, see below.
      - opcode_gfx7, opcode_gfx9 and opcode_gfx10 are the integer opcode
        numbers stored for the respective hardware generation.
      - format is the Format member of the instruction.
      - input_mod and output_mod are bools; they are stored as the strings
        "1" / "0".
      - is_atomic is any truthy/falsy value, stored as "1" / "0" as well.

      operand_size and definition_size (in bits) are derived from the last
      two '_'-separated parts of the name, e.g. "..._f16_f32" reads as a
      16-bit definition and 32-bit operands, and are then adjusted by the
      list of exceptions at the end of this method.
      """
      assert isinstance(name, str)
      assert isinstance(opcode_gfx7, int)
      assert isinstance(opcode_gfx9, int)
      assert isinstance(opcode_gfx10, int)
      assert isinstance(format, Format)
      assert isinstance(input_mod, bool)
      assert isinstance(output_mod, bool)

      self.name = name
      self.opcode_gfx7 = opcode_gfx7
      self.opcode_gfx9 = opcode_gfx9
      self.opcode_gfx10 = opcode_gfx10
      self.input_mod = "1" if input_mod else "0"
      self.output_mod = "1" if output_mod else "0"
      self.is_atomic = "1" if is_atomic else "0"
      self.format = format

      # The trailing name parts are taken as "<definition type>_<operand type>";
      # a name with a single part uses that part for both.
      parts = name.replace('_e64', '').rsplit('_', 2)
      op_dtype = parts[-1]
      def_dtype = parts[-2] if len(parts) > 1 else parts[-1]

      def_dtype_sizes = {'{}{}'.format(prefix, size) : size for prefix in 'biuf' for size in [64, 32, 24, 16]}
      op_dtype_sizes = dict(def_dtype_sizes)
      # inline constants are 32-bit for 16-bit integer/typeless instructions: https://reviews.llvm.org/D81841
      op_dtype_sizes['b16'] = 32
      op_dtype_sizes['i16'] = 32
      op_dtype_sizes['u16'] = 32

      # If we can't tell the definition size and the operand size, default to
      # 32. Some opcodes can have a larger definition size, but
      # get_subdword_definition_info() handles that.
      self.operand_size = op_dtype_sizes.get(op_dtype, 32)
      self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size)

      # exceptions
      # (A leading branch "operand_size == 16 and op_dtype != 'f16'" used to
      # sit here. 'f16' is the only type that maps to 16 in op_dtype_sizes,
      # so it could never be taken, and it only re-assigned 16 anyway.)
      if self.operand_size == 24:
         self.operand_size = 32
      elif name in ['s_sext_i32_i8', 's_sext_i32_i16', 'v_msad_u8', 'v_cvt_pk_u16_u32', 'v_cvt_pk_i16_i32']:
         self.operand_size = 32
      elif name in ['v_qsad_pk_u16_u8', 'v_mqsad_pk_u16_u8', 'v_mqsad_u32_u8']:
         self.definition_size = 0
         self.operand_size = 0
      elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']:
         self.operand_size = 0
      # Match '_pk_' with the trailing underscore: a bare '_pk' is also a
      # substring of '_pknorm_' and would make the branch below unreachable.
      elif '_pk_' in name or name in ['v_lerp_u8', 'v_sad_u8', 'v_sad_u16',
                                      'v_cvt_f32_ubyte0', 'v_cvt_f32_ubyte1',
                                      'v_cvt_f32_ubyte2', 'v_cvt_f32_ubyte3']:
         self.operand_size = 32
         self.definition_size = 32
      elif '_pknorm_' in name:
         # Only the definition is widened; the operand size parsed from the
         # name is kept.
         self.definition_size = 32
234
235
# Module-wide registry: maps an opcode name to the Opcode object created for it.
opcodes = {}

def opcode(name, opcode_gfx7 = -1, opcode_gfx9 = -1, opcode_gfx10 = -1, format = Format.PSEUDO, input_mod = False, output_mod = False, is_atomic = False):
   """Create an Opcode from the arguments and store it in `opcodes`.

   All arguments are passed through to Opcode unchanged. Registering the
   same name a second time trips the assertion below.
   """
   assert name not in opcodes
   info = Opcode(name, opcode_gfx7, opcode_gfx9, opcode_gfx10, format, input_mod, output_mod, is_atomic)
   opcodes[name] = info
242
# "exp" is the only opcode in this group that is registered with explicit
# opcode numbers (0 for each generation).
opcode("exp", 0, 0, 0, format = Format.EXP)

# Pseudo opcodes, written as (format, names) groups. opcode() is called once
# per name, in the order written, with the opcode numbers left at their
# defaults.
_pseudo_groups = [
   (Format.PSEUDO, ["p_parallelcopy",
                    "p_startpgm",
                    "p_phi",
                    "p_linear_phi",
                    "p_as_uniform",
                    "p_unit_test",
                    "p_create_vector",
                    "p_extract_vector",
                    "p_split_vector",
                    # start/end the parts where we can use exec based
                    # instructions implicitly
                    "p_logical_start",
                    "p_logical_end"]),
   (Format.PSEUDO_REDUCTION, ["p_reduce",           # e.g. subgroupMin() in SPIR-V
                              "p_inclusive_scan",   # e.g. subgroupInclusiveMin()
                              "p_exclusive_scan"]), # e.g. subgroupExclusiveMin()
   (Format.PSEUDO_BRANCH, ["p_branch",
                           "p_cbranch",
                           "p_cbranch_z",
                           "p_cbranch_nz"]),
   (Format.PSEUDO_BARRIER, ["p_barrier"]),
   (Format.PSEUDO, ["p_spill",
                    "p_reload",
                    # start/end linear vgprs
                    "p_start_linear_vgpr",
                    "p_end_linear_vgpr",
                    "p_wqm",
                    "p_discard_if",
                    "p_load_helper",
                    "p_demote_to_helper",
                    "p_is_helper",
                    "p_exit_early_if"]),
   (Format.SMEM, ["p_fs_buffer_store_smem"]),
   # simulates proper bpermute behavior when it's unsupported, eg. GFX10 wave64
   (Format.PSEUDO, ["p_bpermute"]),
]
for _pseudo_format, _pseudo_names in _pseudo_groups:
   for _pseudo_name in _pseudo_names:
      opcode(_pseudo_name, format = _pseudo_format)
# Do not leave the loop helpers behind in the module namespace.
del _pseudo_groups, _pseudo_format, _pseudo_names, _pseudo_name
292
# SOP2 instructions: 2 scalar inputs, 1 scalar output (+optional scc)
# One row per instruction: an opcode number per generation, then the name.
# NOTE: this is a set literal, so the loop below visits the rows in no
# guaranteed order.
SOP2 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_add_u32"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_sub_u32"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_add_i32"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "s_sub_i32"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_addc_u32"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "s_subb_u32"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "s_min_i32"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "s_min_u32"),
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_max_i32"),
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_max_u32"),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cselect_b32"),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cselect_b64"),
   (0x0e, 0x0e, 0x0c, 0x0c, 0x0e, "s_and_b32"),
   (0x0f, 0x0f, 0x0d, 0x0d, 0x0f, "s_and_b64"),
   (0x10, 0x10, 0x0e, 0x0e, 0x10, "s_or_b32"),
   (0x11, 0x11, 0x0f, 0x0f, 0x11, "s_or_b64"),
   (0x12, 0x12, 0x10, 0x10, 0x12, "s_xor_b32"),
   (0x13, 0x13, 0x11, 0x11, 0x13, "s_xor_b64"),
   (0x14, 0x14, 0x12, 0x12, 0x14, "s_andn2_b32"),
   (0x15, 0x15, 0x13, 0x13, 0x15, "s_andn2_b64"),
   (0x16, 0x16, 0x14, 0x14, 0x16, "s_orn2_b32"),
   (0x17, 0x17, 0x15, 0x15, 0x17, "s_orn2_b64"),
   (0x18, 0x18, 0x16, 0x16, 0x18, "s_nand_b32"),
   (0x19, 0x19, 0x17, 0x17, 0x19, "s_nand_b64"),
   (0x1a, 0x1a, 0x18, 0x18, 0x1a, "s_nor_b32"),
   (0x1b, 0x1b, 0x19, 0x19, 0x1b, "s_nor_b64"),
   (0x1c, 0x1c, 0x1a, 0x1a, 0x1c, "s_xnor_b32"),
   (0x1d, 0x1d, 0x1b, 0x1b, 0x1d, "s_xnor_b64"),
   (0x1e, 0x1e, 0x1c, 0x1c, 0x1e, "s_lshl_b32"),
   (0x1f, 0x1f, 0x1d, 0x1d, 0x1f, "s_lshl_b64"),
   (0x20, 0x20, 0x1e, 0x1e, 0x20, "s_lshr_b32"),
   (0x21, 0x21, 0x1f, 0x1f, 0x21, "s_lshr_b64"),
   (0x22, 0x22, 0x20, 0x20, 0x22, "s_ashr_i32"),
   (0x23, 0x23, 0x21, 0x21, 0x23, "s_ashr_i64"),
   (0x24, 0x24, 0x22, 0x22, 0x24, "s_bfm_b32"),
   (0x25, 0x25, 0x23, 0x23, 0x25, "s_bfm_b64"),
   (0x26, 0x26, 0x24, 0x24, 0x26, "s_mul_i32"),
   (0x27, 0x27, 0x25, 0x25, 0x27, "s_bfe_u32"),
   (0x28, 0x28, 0x26, 0x26, 0x28, "s_bfe_i32"),
   (0x29, 0x29, 0x27, 0x27, 0x29, "s_bfe_u64"),
   (0x2a, 0x2a, 0x28, 0x28, 0x2a, "s_bfe_i64"),
   (0x2b, 0x2b, 0x29, 0x29,   -1, "s_cbranch_g_fork"),
   (0x2c, 0x2c, 0x2a, 0x2a, 0x2c, "s_absdiff_i32"),
   (  -1,   -1, 0x2b, 0x2b,   -1, "s_rfe_restore_b64"),
   (  -1,   -1,   -1, 0x2e, 0x2e, "s_lshl1_add_u32"),
   (  -1,   -1,   -1, 0x2f, 0x2f, "s_lshl2_add_u32"),
   (  -1,   -1,   -1, 0x30, 0x30, "s_lshl3_add_u32"),
   (  -1,   -1,   -1, 0x31, 0x31, "s_lshl4_add_u32"),
   (  -1,   -1,   -1, 0x32, 0x32, "s_pack_ll_b32_b16"),
   (  -1,   -1,   -1, 0x33, 0x33, "s_pack_lh_b32_b16"),
   (  -1,   -1,   -1, 0x34, 0x34, "s_pack_hh_b32_b16"),
   (  -1,   -1,   -1, 0x2c, 0x35, "s_mul_hi_u32"),
   (  -1,   -1,   -1, 0x2d, 0x36, "s_mul_hi_i32"),
}
# Only the GFX7, GFX9 and GFX10 columns are handed to opcode(); the GFX6 and
# GFX8 columns are unpacked but not used here.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP2:
   opcode(name, opcode_gfx7 = gfx7, opcode_gfx9 = gfx9, opcode_gfx10 = gfx10, format = Format.SOP2)
352
353
# SOPK instructions: 0 input (+ imm), 1 output + optional scc
# One row per instruction: an opcode number per generation, then the name.
# NOTE: this is a set literal, so the loop below visits the rows in no
# guaranteed order.
SOPK = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_movk_i32"),
   (  -1,   -1,   -1,   -1, 0x01, "s_version"), # GFX10+
   (0x02, 0x02, 0x01, 0x01, 0x02, "s_cmovk_i32"), # GFX8_GFX9
   (0x03, 0x03, 0x02, 0x02, 0x03, "s_cmpk_eq_i32"),
   (0x04, 0x04, 0x03, 0x03, 0x04, "s_cmpk_lg_i32"),
   (0x05, 0x05, 0x04, 0x04, 0x05, "s_cmpk_gt_i32"),
   (0x06, 0x06, 0x05, 0x05, 0x06, "s_cmpk_ge_i32"),
   (0x07, 0x07, 0x06, 0x06, 0x07, "s_cmpk_lt_i32"),
   (0x08, 0x08, 0x07, 0x07, 0x08, "s_cmpk_le_i32"),
   (0x09, 0x09, 0x08, 0x08, 0x09, "s_cmpk_eq_u32"),
   (0x0a, 0x0a, 0x09, 0x09, 0x0a, "s_cmpk_lg_u32"),
   (0x0b, 0x0b, 0x0a, 0x0a, 0x0b, "s_cmpk_gt_u32"),
   (0x0c, 0x0c, 0x0b, 0x0b, 0x0c, "s_cmpk_ge_u32"),
   (0x0d, 0x0d, 0x0c, 0x0c, 0x0d, "s_cmpk_lt_u32"),
   (0x0e, 0x0e, 0x0d, 0x0d, 0x0e, "s_cmpk_le_u32"),
   (0x0f, 0x0f, 0x0e, 0x0e, 0x0f, "s_addk_i32"),
   (0x10, 0x10, 0x0f, 0x0f, 0x10, "s_mulk_i32"),
   (0x11, 0x11, 0x10, 0x10,   -1, "s_cbranch_i_fork"),
   (0x12, 0x12, 0x11, 0x11, 0x12, "s_getreg_b32"),
   (0x13, 0x13, 0x12, 0x12, 0x13, "s_setreg_b32"),
   (0x15, 0x15, 0x14, 0x14, 0x15, "s_setreg_imm32_b32"), # requires 32bit literal
   (  -1,   -1, 0x15, 0x15, 0x16, "s_call_b64"),
   (  -1,   -1,   -1,   -1, 0x17, "s_waitcnt_vscnt"),
   (  -1,   -1,   -1,   -1, 0x18, "s_waitcnt_vmcnt"),
   (  -1,   -1,   -1,   -1, 0x19, "s_waitcnt_expcnt"),
   (  -1,   -1,   -1,   -1, 0x1a, "s_waitcnt_lgkmcnt"),
   (  -1,   -1,   -1,   -1, 0x1b, "s_subvector_loop_begin"),
   (  -1,   -1,   -1,   -1, 0x1c, "s_subvector_loop_end"),
}
# Only the GFX7, GFX9 and GFX10 columns are handed to opcode(); the GFX6 and
# GFX8 columns are unpacked but not used here.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPK:
   opcode(name, opcode_gfx7 = gfx7, opcode_gfx9 = gfx9, opcode_gfx10 = gfx10, format = Format.SOPK)
388
389
# SOP1 instructions: 1 input, 1 output (+optional SCC)
# One row per instruction: an opcode number per generation, then the name.
# NOTE: this is a set literal, so the loop below visits the rows in no
# guaranteed order.
SOP1 = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x03, 0x03, 0x00, 0x00, 0x03, "s_mov_b32"),
   (0x04, 0x04, 0x01, 0x01, 0x04, "s_mov_b64"),
   (0x05, 0x05, 0x02, 0x02, 0x05, "s_cmov_b32"),
   (0x06, 0x06, 0x03, 0x03, 0x06, "s_cmov_b64"),
   (0x07, 0x07, 0x04, 0x04, 0x07, "s_not_b32"),
   (0x08, 0x08, 0x05, 0x05, 0x08, "s_not_b64"),
   (0x09, 0x09, 0x06, 0x06, 0x09, "s_wqm_b32"),
   (0x0a, 0x0a, 0x07, 0x07, 0x0a, "s_wqm_b64"),
   (0x0b, 0x0b, 0x08, 0x08, 0x0b, "s_brev_b32"),
   (0x0c, 0x0c, 0x09, 0x09, 0x0c, "s_brev_b64"),
   (0x0d, 0x0d, 0x0a, 0x0a, 0x0d, "s_bcnt0_i32_b32"),
   (0x0e, 0x0e, 0x0b, 0x0b, 0x0e, "s_bcnt0_i32_b64"),
   (0x0f, 0x0f, 0x0c, 0x0c, 0x0f, "s_bcnt1_i32_b32"),
   (0x10, 0x10, 0x0d, 0x0d, 0x10, "s_bcnt1_i32_b64"),
   (0x11, 0x11, 0x0e, 0x0e, 0x11, "s_ff0_i32_b32"),
   (0x12, 0x12, 0x0f, 0x0f, 0x12, "s_ff0_i32_b64"),
   (0x13, 0x13, 0x10, 0x10, 0x13, "s_ff1_i32_b32"),
   (0x14, 0x14, 0x11, 0x11, 0x14, "s_ff1_i32_b64"),
   (0x15, 0x15, 0x12, 0x12, 0x15, "s_flbit_i32_b32"),
   (0x16, 0x16, 0x13, 0x13, 0x16, "s_flbit_i32_b64"),
   (0x17, 0x17, 0x14, 0x14, 0x17, "s_flbit_i32"),
   (0x18, 0x18, 0x15, 0x15, 0x18, "s_flbit_i32_i64"),
   (0x19, 0x19, 0x16, 0x16, 0x19, "s_sext_i32_i8"),
   (0x1a, 0x1a, 0x17, 0x17, 0x1a, "s_sext_i32_i16"),
   (0x1b, 0x1b, 0x18, 0x18, 0x1b, "s_bitset0_b32"),
   (0x1c, 0x1c, 0x19, 0x19, 0x1c, "s_bitset0_b64"),
   (0x1d, 0x1d, 0x1a, 0x1a, 0x1d, "s_bitset1_b32"),
   (0x1e, 0x1e, 0x1b, 0x1b, 0x1e, "s_bitset1_b64"),
   (0x1f, 0x1f, 0x1c, 0x1c, 0x1f, "s_getpc_b64"),
   (0x20, 0x20, 0x1d, 0x1d, 0x20, "s_setpc_b64"),
   (0x21, 0x21, 0x1e, 0x1e, 0x21, "s_swappc_b64"),
   (0x22, 0x22, 0x1f, 0x1f, 0x22, "s_rfe_b64"),
   (0x24, 0x24, 0x20, 0x20, 0x24, "s_and_saveexec_b64"),
   (0x25, 0x25, 0x21, 0x21, 0x25, "s_or_saveexec_b64"),
   (0x26, 0x26, 0x22, 0x22, 0x26, "s_xor_saveexec_b64"),
   (0x27, 0x27, 0x23, 0x23, 0x27, "s_andn2_saveexec_b64"),
   (0x28, 0x28, 0x24, 0x24, 0x28, "s_orn2_saveexec_b64"),
   (0x29, 0x29, 0x25, 0x25, 0x29, "s_nand_saveexec_b64"),
   (0x2a, 0x2a, 0x26, 0x26, 0x2a, "s_nor_saveexec_b64"),
   (0x2b, 0x2b, 0x27, 0x27, 0x2b, "s_xnor_saveexec_b64"),
   (0x2c, 0x2c, 0x28, 0x28, 0x2c, "s_quadmask_b32"),
   (0x2d, 0x2d, 0x29, 0x29, 0x2d, "s_quadmask_b64"),
   (0x2e, 0x2e, 0x2a, 0x2a, 0x2e, "s_movrels_b32"),
   (0x2f, 0x2f, 0x2b, 0x2b, 0x2f, "s_movrels_b64"),
   (0x30, 0x30, 0x2c, 0x2c, 0x30, "s_movreld_b32"),
   (0x31, 0x31, 0x2d, 0x2d, 0x31, "s_movreld_b64"),
   (0x32, 0x32, 0x2e, 0x2e,   -1, "s_cbranch_join"),
   (0x34, 0x34, 0x30, 0x30, 0x34, "s_abs_i32"),
   (0x35, 0x35,   -1,   -1, 0x35, "s_mov_fed_b32"),
   (  -1,   -1, 0x32, 0x32,   -1, "s_set_gpr_idx_idx"),
   (  -1,   -1,   -1, 0x33, 0x37, "s_andn1_saveexec_b64"),
   (  -1,   -1,   -1, 0x34, 0x38, "s_orn1_saveexec_b64"),
   (  -1,   -1,   -1, 0x35, 0x39, "s_andn1_wrexec_b64"),
   (  -1,   -1,   -1, 0x36, 0x3a, "s_andn2_wrexec_b64"),
   (  -1,   -1,   -1, 0x37, 0x3b, "s_bitreplicate_b64_b32"),
   (  -1,   -1,   -1,   -1, 0x3c, "s_and_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x3d, "s_or_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x3e, "s_xor_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x3f, "s_andn2_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x40, "s_orn2_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x41, "s_nand_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x42, "s_nor_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x43, "s_xnor_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x44, "s_andn1_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x45, "s_orn1_saveexec_b32"),
   (  -1,   -1,   -1,   -1, 0x46, "s_andn1_wrexec_b32"),
   (  -1,   -1,   -1,   -1, 0x47, "s_andn2_wrexec_b32"),
   (  -1,   -1,   -1,   -1, 0x49, "s_movrelsd_2_b32"),
   # actually a pseudo-instruction. it's lowered to SALU during assembly though, so it's useful to identify it as a SOP1.
   (  -1,   -1,   -1,   -1,   -1, "p_constaddr"),
}
# Only the GFX7, GFX9 and GFX10 columns are handed to opcode(); the GFX6 and
# GFX8 columns are unpacked but not used here.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOP1:
   opcode(name, opcode_gfx7 = gfx7, opcode_gfx9 = gfx9, opcode_gfx10 = gfx10, format = Format.SOP1)
466
467
# SOPC instructions: 2 inputs and 0 outputs (+SCC)
# One row per instruction: an opcode number per generation, then the name.
# NOTE: this is a set literal, so the loop below visits the rows in no
# guaranteed order.
SOPC = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_cmp_eq_i32"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_cmp_lg_i32"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_cmp_gt_i32"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "s_cmp_ge_i32"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_cmp_lt_i32"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "s_cmp_le_i32"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "s_cmp_eq_u32"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "s_cmp_lg_u32"),
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_cmp_gt_u32"),
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_cmp_ge_u32"),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_cmp_lt_u32"),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_cmp_le_u32"),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_bitcmp0_b32"),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_bitcmp1_b32"),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_bitcmp0_b64"),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_bitcmp1_b64"),
   (0x10, 0x10, 0x10, 0x10,   -1, "s_setvskip"),
   (  -1,   -1, 0x11, 0x11,   -1, "s_set_gpr_idx_on"),
   (  -1,   -1, 0x12, 0x12, 0x12, "s_cmp_eq_u64"),
   (  -1,   -1, 0x13, 0x13, 0x13, "s_cmp_lg_u64"),
}
# Only the GFX7, GFX9 and GFX10 columns are handed to opcode(); the GFX6 and
# GFX8 columns are unpacked but not used here.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPC:
   opcode(name, opcode_gfx7 = gfx7, opcode_gfx9 = gfx9, opcode_gfx10 = gfx10, format = Format.SOPC)
494
495
# SOPP instructions: 0 inputs (+optional scc/vcc), 0 outputs
# One row per instruction: an opcode number per generation, then the name.
# NOTE: this is a set literal, so the loop below visits the rows in no
# guaranteed order.
SOPP = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_nop"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_endpgm"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_branch"),
   (  -1,   -1, 0x03, 0x03, 0x03, "s_wakeup"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_cbranch_scc0"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "s_cbranch_scc1"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "s_cbranch_vccz"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "s_cbranch_vccnz"),
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_cbranch_execz"),
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_cbranch_execnz"),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_barrier"),
   (  -1, 0x0b, 0x0b, 0x0b, 0x0b, "s_setkill"),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_waitcnt"),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "s_sethalt"),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "s_sleep"),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "s_setprio"),
   (0x10, 0x10, 0x10, 0x10, 0x10, "s_sendmsg"),
   (0x11, 0x11, 0x11, 0x11, 0x11, "s_sendmsghalt"),
   (0x12, 0x12, 0x12, 0x12, 0x12, "s_trap"),
   (0x13, 0x13, 0x13, 0x13, 0x13, "s_icache_inv"),
   (0x14, 0x14, 0x14, 0x14, 0x14, "s_incperflevel"),
   (0x15, 0x15, 0x15, 0x15, 0x15, "s_decperflevel"),
   (0x16, 0x16, 0x16, 0x16, 0x16, "s_ttracedata"),
   (  -1, 0x17, 0x17, 0x17, 0x17, "s_cbranch_cdbgsys"),
   (  -1, 0x18, 0x18, 0x18, 0x18, "s_cbranch_cdbguser"),
   (  -1, 0x19, 0x19, 0x19, 0x19, "s_cbranch_cdbgsys_or_user"),
   (  -1, 0x1a, 0x1a, 0x1a, 0x1a, "s_cbranch_cdbgsys_and_user"),
   (  -1,   -1, 0x1b, 0x1b, 0x1b, "s_endpgm_saved"),
   (  -1,   -1, 0x1c, 0x1c,   -1, "s_set_gpr_idx_off"),
   (  -1,   -1, 0x1d, 0x1d,   -1, "s_set_gpr_idx_mode"),
   (  -1,   -1,   -1, 0x1e, 0x1e, "s_endpgm_ordered_ps_done"),
   (  -1,   -1,   -1,   -1, 0x1f, "s_code_end"),
   (  -1,   -1,   -1,   -1, 0x20, "s_inst_prefetch"),
   (  -1,   -1,   -1,   -1, 0x21, "s_clause"),
   (  -1,   -1,   -1,   -1, 0x22, "s_wait_idle"),
   (  -1,   -1,   -1,   -1, 0x23, "s_waitcnt_depctr"),
   (  -1,   -1,   -1,   -1, 0x24, "s_round_mode"),
   (  -1,   -1,   -1,   -1, 0x25, "s_denorm_mode"),
   (  -1,   -1,   -1,   -1, 0x26, "s_ttracedata_imm"),
}
# Only the GFX7, GFX9 and GFX10 columns are handed to opcode(); the GFX6 and
# GFX8 columns are unpacked but not used here.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SOPP:
   opcode(name, opcode_gfx7 = gfx7, opcode_gfx9 = gfx9, opcode_gfx10 = gfx10, format = Format.SOPP)
541
542
# SMEM instructions: sbase input (2 sgpr), potentially 2 offset inputs, 1 sdata input/output
# Unlike GFX10, GFX10.3 does not have SMEM store, atomic or scratch instructions
# One row per instruction: an opcode number per generation, then the name.
# NOTE: this is a set literal, so the loop below visits the rows in no
# guaranteed order.
SMEM = {
  # GFX6, GFX7, GFX8, GFX9, GFX10, name
   (0x00, 0x00, 0x00, 0x00, 0x00, "s_load_dword"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "s_load_dwordx2"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "s_load_dwordx4"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "s_load_dwordx8"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "s_load_dwordx16"),
   (  -1,   -1,   -1, 0x05, 0x05, "s_scratch_load_dword"),
   (  -1,   -1,   -1, 0x06, 0x06, "s_scratch_load_dwordx2"),
   (  -1,   -1,   -1, 0x07, 0x07, "s_scratch_load_dwordx4"),
   (0x08, 0x08, 0x08, 0x08, 0x08, "s_buffer_load_dword"),
   (0x09, 0x09, 0x09, 0x09, 0x09, "s_buffer_load_dwordx2"),
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "s_buffer_load_dwordx4"),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "s_buffer_load_dwordx8"),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "s_buffer_load_dwordx16"),
   (  -1,   -1, 0x10, 0x10, 0x10, "s_store_dword"),
   (  -1,   -1, 0x11, 0x11, 0x11, "s_store_dwordx2"),
   (  -1,   -1, 0x12, 0x12, 0x12, "s_store_dwordx4"),
   (  -1,   -1,   -1, 0x15, 0x15, "s_scratch_store_dword"),
   (  -1,   -1,   -1, 0x16, 0x16, "s_scratch_store_dwordx2"),
   (  -1,   -1,   -1, 0x17, 0x17, "s_scratch_store_dwordx4"),
   (  -1,   -1, 0x18, 0x18, 0x18, "s_buffer_store_dword"),
   (  -1,   -1, 0x19, 0x19, 0x19, "s_buffer_store_dwordx2"),
   (  -1,   -1, 0x1a, 0x1a, 0x1a, "s_buffer_store_dwordx4"),
   # GFX10 only. This row used to carry 0x1f for GFX8/GFX9 as well.
   # NOTE(review): changed on the basis of the AMD ISA opcode tables, which
   # list no SMEM opcode 0x1f before GFX10 - please confirm.
   (  -1,   -1,   -1,   -1, 0x1f, "s_gl1_inv"),
   (0x1f, 0x1f, 0x20, 0x20, 0x20, "s_dcache_inv"),
   (  -1,   -1, 0x21, 0x21, 0x21, "s_dcache_wb"),
   (  -1, 0x1d, 0x22, 0x22,   -1, "s_dcache_inv_vol"),
   (  -1,   -1, 0x23, 0x23,   -1, "s_dcache_wb_vol"),
   (0x1e, 0x1e, 0x24, 0x24, 0x24, "s_memtime"), #GFX6-GFX10
   (  -1,   -1, 0x25, 0x25, 0x25, "s_memrealtime"),
   (  -1,   -1, 0x26, 0x26, 0x26, "s_atc_probe"),
   (  -1,   -1, 0x27, 0x27, 0x27, "s_atc_probe_buffer"),
   (  -1,   -1,   -1, 0x28, 0x28, "s_dcache_discard"),
   (  -1,   -1,   -1, 0x29, 0x29, "s_dcache_discard_x2"),
   (  -1,   -1,   -1,   -1, 0x2a, "s_get_waveid_in_workgroup"),
   (  -1,   -1,   -1, 0x40, 0x40, "s_buffer_atomic_swap"),
   (  -1,   -1,   -1, 0x41, 0x41, "s_buffer_atomic_cmpswap"),
   (  -1,   -1,   -1, 0x42, 0x42, "s_buffer_atomic_add"),
   (  -1,   -1,   -1, 0x43, 0x43, "s_buffer_atomic_sub"),
   (  -1,   -1,   -1, 0x44, 0x44, "s_buffer_atomic_smin"),
   (  -1,   -1,   -1, 0x45, 0x45, "s_buffer_atomic_umin"),
   (  -1,   -1,   -1, 0x46, 0x46, "s_buffer_atomic_smax"),
   (  -1,   -1,   -1, 0x47, 0x47, "s_buffer_atomic_umax"),
   (  -1,   -1,   -1, 0x48, 0x48, "s_buffer_atomic_and"),
   (  -1,   -1,   -1, 0x49, 0x49, "s_buffer_atomic_or"),
   (  -1,   -1,   -1, 0x4a, 0x4a, "s_buffer_atomic_xor"),
   (  -1,   -1,   -1, 0x4b, 0x4b, "s_buffer_atomic_inc"),
   (  -1,   -1,   -1, 0x4c, 0x4c, "s_buffer_atomic_dec"),
   (  -1,   -1,   -1, 0x60, 0x60, "s_buffer_atomic_swap_x2"),
   (  -1,   -1,   -1, 0x61, 0x61, "s_buffer_atomic_cmpswap_x2"),
   (  -1,   -1,   -1, 0x62, 0x62, "s_buffer_atomic_add_x2"),
   (  -1,   -1,   -1, 0x63, 0x63, "s_buffer_atomic_sub_x2"),
   (  -1,   -1,   -1, 0x64, 0x64, "s_buffer_atomic_smin_x2"),
   (  -1,   -1,   -1, 0x65, 0x65, "s_buffer_atomic_umin_x2"),
   (  -1,   -1,   -1, 0x66, 0x66, "s_buffer_atomic_smax_x2"),
   (  -1,   -1,   -1, 0x67, 0x67, "s_buffer_atomic_umax_x2"),
   (  -1,   -1,   -1, 0x68, 0x68, "s_buffer_atomic_and_x2"),
   (  -1,   -1,   -1, 0x69, 0x69, "s_buffer_atomic_or_x2"),
   (  -1,   -1,   -1, 0x6a, 0x6a, "s_buffer_atomic_xor_x2"),
   (  -1,   -1,   -1, 0x6b, 0x6b, "s_buffer_atomic_inc_x2"),
   (  -1,   -1,   -1, 0x6c, 0x6c, "s_buffer_atomic_dec_x2"),
   (  -1,   -1,   -1, 0x80, 0x80, "s_atomic_swap"),
   (  -1,   -1,   -1, 0x81, 0x81, "s_atomic_cmpswap"),
   (  -1,   -1,   -1, 0x82, 0x82, "s_atomic_add"),
   (  -1,   -1,   -1, 0x83, 0x83, "s_atomic_sub"),
   (  -1,   -1,   -1, 0x84, 0x84, "s_atomic_smin"),
   (  -1,   -1,   -1, 0x85, 0x85, "s_atomic_umin"),
   (  -1,   -1,   -1, 0x86, 0x86, "s_atomic_smax"),
   (  -1,   -1,   -1, 0x87, 0x87, "s_atomic_umax"),
   (  -1,   -1,   -1, 0x88, 0x88, "s_atomic_and"),
   (  -1,   -1,   -1, 0x89, 0x89, "s_atomic_or"),
   (  -1,   -1,   -1, 0x8a, 0x8a, "s_atomic_xor"),
   (  -1,   -1,   -1, 0x8b, 0x8b, "s_atomic_inc"),
   (  -1,   -1,   -1, 0x8c, 0x8c, "s_atomic_dec"),
   (  -1,   -1,   -1, 0xa0, 0xa0, "s_atomic_swap_x2"),
   (  -1,   -1,   -1, 0xa1, 0xa1, "s_atomic_cmpswap_x2"),
   (  -1,   -1,   -1, 0xa2, 0xa2, "s_atomic_add_x2"),
   (  -1,   -1,   -1, 0xa3, 0xa3, "s_atomic_sub_x2"),
   (  -1,   -1,   -1, 0xa4, 0xa4, "s_atomic_smin_x2"),
   (  -1,   -1,   -1, 0xa5, 0xa5, "s_atomic_umin_x2"),
   (  -1,   -1,   -1, 0xa6, 0xa6, "s_atomic_smax_x2"),
   (  -1,   -1,   -1, 0xa7, 0xa7, "s_atomic_umax_x2"),
   (  -1,   -1,   -1, 0xa8, 0xa8, "s_atomic_and_x2"),
   (  -1,   -1,   -1, 0xa9, 0xa9, "s_atomic_or_x2"),
   (  -1,   -1,   -1, 0xaa, 0xaa, "s_atomic_xor_x2"),
   (  -1,   -1,   -1, 0xab, 0xab, "s_atomic_inc_x2"),
   (  -1,   -1,   -1, 0xac, 0xac, "s_atomic_dec_x2"),
}
# Only the GFX7, GFX9 and GFX10 columns are handed to opcode(); the GFX6 and
# GFX8 columns are unpacked but not used here. An opcode is flagged atomic
# when its name contains "atomic".
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in SMEM:
   opcode(name, gfx7, gfx9, gfx10, Format.SMEM, is_atomic = "atomic" in name)
636
637
# VOP2 instructions: 2 inputs, 1 output (+ optional vcc)
# TODO: some GFX6_7 opcodes which were shifted to VOP3 in GFX8 are still missing
#
# Row layout: (GFX6, GFX7, GFX8, GFX9, GFX10, name, modifiers). The single
# modifiers flag is passed to opcode() for both input and output modifiers.
# -1 marks a column without an opcode number (presumably: the instruction has
# no VOP2 encoding on that generation -- confirm against opcode()).
# NOTE: this is a set literal, so rows are not visited in the order written.
VOP2 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10, name, input/output modifiers
   (0x01, 0x01,   -1,   -1,   -1, "v_readlane_b32", False),
   (0x02, 0x02,   -1,   -1,   -1, "v_writelane_b32", False),
   (0x03, 0x03, 0x01, 0x01, 0x03, "v_add_f32", True),
   (0x04, 0x04, 0x02, 0x02, 0x04, "v_sub_f32", True),
   (0x05, 0x05, 0x03, 0x03, 0x05, "v_subrev_f32", True),
   (0x06, 0x06,   -1,   -1, 0x06, "v_mac_legacy_f32", True),
   (0x07, 0x07, 0x04, 0x04, 0x07, "v_mul_legacy_f32", True),
   (0x08, 0x08, 0x05, 0x05, 0x08, "v_mul_f32", True),
   (0x09, 0x09, 0x06, 0x06, 0x09, "v_mul_i32_i24", False),
   (0x0a, 0x0a, 0x07, 0x07, 0x0a, "v_mul_hi_i32_i24", False),
   (0x0b, 0x0b, 0x08, 0x08, 0x0b, "v_mul_u32_u24", False),
   (0x0c, 0x0c, 0x09, 0x09, 0x0c, "v_mul_hi_u32_u24", False),
   (0x0d, 0x0d,   -1,   -1,   -1, "v_min_legacy_f32", True),
   (0x0e, 0x0e,   -1,   -1,   -1, "v_max_legacy_f32", True),
   (0x0f, 0x0f, 0x0a, 0x0a, 0x0f, "v_min_f32", True),
   (0x10, 0x10, 0x0b, 0x0b, 0x10, "v_max_f32", True),
   (0x11, 0x11, 0x0c, 0x0c, 0x11, "v_min_i32", False),
   (0x12, 0x12, 0x0d, 0x0d, 0x12, "v_max_i32", False),
   (0x13, 0x13, 0x0e, 0x0e, 0x13, "v_min_u32", False),
   (0x14, 0x14, 0x0f, 0x0f, 0x14, "v_max_u32", False),
   (0x15, 0x15,   -1,   -1,   -1, "v_lshr_b32", False),
   (0x16, 0x16, 0x10, 0x10, 0x16, "v_lshrrev_b32", False),
   (0x17, 0x17,   -1,   -1,   -1, "v_ashr_i32", False),
   (0x18, 0x18, 0x11, 0x11, 0x18, "v_ashrrev_i32", False),
   (0x19, 0x19,   -1,   -1,   -1, "v_lshl_b32", False),
   (0x1a, 0x1a, 0x12, 0x12, 0x1a, "v_lshlrev_b32", False),
   (0x1b, 0x1b, 0x13, 0x13, 0x1b, "v_and_b32", False),
   (0x1c, 0x1c, 0x14, 0x14, 0x1c, "v_or_b32", False),
   (0x1d, 0x1d, 0x15, 0x15, 0x1d, "v_xor_b32", False),
   (  -1,   -1,   -1,   -1, 0x1e, "v_xnor_b32", False),
   (0x1f, 0x1f, 0x16, 0x16, 0x1f, "v_mac_f32", True),
   (0x20, 0x20, 0x17, 0x17, 0x20, "v_madmk_f32", False),
   (0x21, 0x21, 0x18, 0x18, 0x21, "v_madak_f32", False),
   (0x24, 0x24,   -1,   -1,   -1, "v_mbcnt_hi_u32_b32", False),
   (0x25, 0x25, 0x19, 0x19,   -1, "v_add_co_u32", False), # VOP3B only in RDNA
   (0x26, 0x26, 0x1a, 0x1a,   -1, "v_sub_co_u32", False), # VOP3B only in RDNA
   (0x27, 0x27, 0x1b, 0x1b,   -1, "v_subrev_co_u32", False), # VOP3B only in RDNA
   (0x28, 0x28, 0x1c, 0x1c, 0x28, "v_addc_co_u32", False), # v_add_co_ci_u32 in RDNA
   (0x29, 0x29, 0x1d, 0x1d, 0x29, "v_subb_co_u32", False), # v_sub_co_ci_u32 in RDNA
   (0x2a, 0x2a, 0x1e, 0x1e, 0x2a, "v_subbrev_co_u32", False), # v_subrev_co_ci_u32 in RDNA
   (  -1,   -1,   -1,   -1, 0x2b, "v_fmac_f32", True),
   (  -1,   -1,   -1,   -1, 0x2c, "v_fmamk_f32", True),
   (  -1,   -1,   -1,   -1, 0x2d, "v_fmaak_f32", True),
   (0x2f, 0x2f,   -1,   -1, 0x2f, "v_cvt_pkrtz_f16_f32", True),
   (  -1,   -1, 0x1f, 0x1f, 0x32, "v_add_f16", True),
   (  -1,   -1, 0x20, 0x20, 0x33, "v_sub_f16", True),
   (  -1,   -1, 0x21, 0x21, 0x34, "v_subrev_f16", True),
   (  -1,   -1, 0x22, 0x22, 0x35, "v_mul_f16", True),
   (  -1,   -1, 0x23, 0x23,   -1, "v_mac_f16", True),
   (  -1,   -1, 0x24, 0x24,   -1, "v_madmk_f16", False),
   (  -1,   -1, 0x25, 0x25,   -1, "v_madak_f16", False),
   (  -1,   -1, 0x26, 0x26,   -1, "v_add_u16", False),
   (  -1,   -1, 0x27, 0x27,   -1, "v_sub_u16", False),
   (  -1,   -1, 0x28, 0x28,   -1, "v_subrev_u16", False),
   (  -1,   -1, 0x29, 0x29,   -1, "v_mul_lo_u16", False),
   (  -1,   -1, 0x2a, 0x2a,   -1, "v_lshlrev_b16", False),
   (  -1,   -1, 0x2b, 0x2b,   -1, "v_lshrrev_b16", False),
   (  -1,   -1, 0x2c, 0x2c,   -1, "v_ashrrev_i16", False),
   (  -1,   -1, 0x2d, 0x2d, 0x39, "v_max_f16", True),
   (  -1,   -1, 0x2e, 0x2e, 0x3a, "v_min_f16", True),
   (  -1,   -1, 0x2f, 0x2f,   -1, "v_max_u16", False),
   (  -1,   -1, 0x30, 0x30,   -1, "v_max_i16", False),
   (  -1,   -1, 0x31, 0x31,   -1, "v_min_u16", False),
   (  -1,   -1, 0x32, 0x32,   -1, "v_min_i16", False),
   (  -1,   -1, 0x33, 0x33, 0x3b, "v_ldexp_f16", False),
   (  -1,   -1, 0x34, 0x34, 0x25, "v_add_u32", False), # v_add_nc_u32 in RDNA
   (  -1,   -1, 0x35, 0x35, 0x26, "v_sub_u32", False), # v_sub_nc_u32 in RDNA
   (  -1,   -1, 0x36, 0x36, 0x27, "v_subrev_u32", False), # v_subrev_nc_u32 in RDNA
   (  -1,   -1,   -1,   -1, 0x36, "v_fmac_f16", False),
   (  -1,   -1,   -1,   -1, 0x37, "v_fmamk_f16", False),
   (  -1,   -1,   -1,   -1, 0x38, "v_fmaak_f16", False),
   (  -1,   -1,   -1,   -1, 0x3c, "v_pk_fmac_f16", False),
}
for vop2_row in VOP2:
   (gfx6, gfx7, gfx8, gfx9, gfx10, name, modifiers) = vop2_row
   # Only the GFX7, GFX9 and GFX10 columns are forwarded.
   opcode(name, gfx7, gfx9, gfx10, Format.VOP2, modifiers, modifiers)
717
# v_cndmask_b32 can use input modifiers but not output modifiers, so it is
# registered on its own instead of through the VOP2 table, which carries a
# single flag for both. Its opcode number is 0x00 on GFX6-GFX9 and 0x01 on GFX10.
opcode("v_cndmask_b32", 0x00, 0x00, 0x01, Format.VOP2, True, False)
722
723
# VOP1 instructions: instructions with 1 input and 1 output
#
# Row layout: (GFX6, GFX7, GFX8, GFX9, GFX10, name, input_modifiers,
# output_modifiers). -1 marks a column without an opcode number.
# NOTE: this is a set literal, so rows are not visited in the order written.
VOP1 = {
   # GFX6, GFX7, GFX8, GFX9, GFX10, name, input_modifiers, output_modifiers
   (0x00, 0x00, 0x00, 0x00, 0x00, "v_nop", False, False),
   (0x01, 0x01, 0x01, 0x01, 0x01, "v_mov_b32", False, False),
   (0x02, 0x02, 0x02, 0x02, 0x02, "v_readfirstlane_b32", False, False),
   (0x03, 0x03, 0x03, 0x03, 0x03, "v_cvt_i32_f64", True, False),
   (0x04, 0x04, 0x04, 0x04, 0x04, "v_cvt_f64_i32", False, True),
   (0x05, 0x05, 0x05, 0x05, 0x05, "v_cvt_f32_i32", False, True),
   (0x06, 0x06, 0x06, 0x06, 0x06, "v_cvt_f32_u32", False, True),
   (0x07, 0x07, 0x07, 0x07, 0x07, "v_cvt_u32_f32", True, False),
   (0x08, 0x08, 0x08, 0x08, 0x08, "v_cvt_i32_f32", True, False),
   (0x09, 0x09,   -1,   -1, 0x09, "v_mov_fed_b32", True, False), # LLVM mentions it for GFX8_9
   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "v_cvt_f16_f32", True, True),
   (  -1,   -1,   -1,   -1,   -1, "p_cvt_f16_f32_rtne", True, True),
   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "v_cvt_f32_f16", True, True),
   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "v_cvt_rpi_i32_f32", True, False),
   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "v_cvt_flr_i32_f32", True, False),
   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "v_cvt_off_f32_i4", False, True),
   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "v_cvt_f32_f64", True, True),
   (0x10, 0x10, 0x10, 0x10, 0x10, "v_cvt_f64_f32", True, True),
   (0x11, 0x11, 0x11, 0x11, 0x11, "v_cvt_f32_ubyte0", False, True),
   (0x12, 0x12, 0x12, 0x12, 0x12, "v_cvt_f32_ubyte1", False, True),
   (0x13, 0x13, 0x13, 0x13, 0x13, "v_cvt_f32_ubyte2", False, True),
   (0x14, 0x14, 0x14, 0x14, 0x14, "v_cvt_f32_ubyte3", False, True),
   (0x15, 0x15, 0x15, 0x15, 0x15, "v_cvt_u32_f64", True, False),
   (0x16, 0x16, 0x16, 0x16, 0x16, "v_cvt_f64_u32", False, True),
   (  -1, 0x17, 0x17, 0x17, 0x17, "v_trunc_f64", True, True),
   (  -1, 0x18, 0x18, 0x18, 0x18, "v_ceil_f64", True, True),
   (  -1, 0x19, 0x19, 0x19, 0x19, "v_rndne_f64", True, True),
   (  -1, 0x1a, 0x1a, 0x1a, 0x1a, "v_floor_f64", True, True),
   (  -1,   -1,   -1,   -1, 0x1b, "v_pipeflush", False, False),
   (0x20, 0x20, 0x1b, 0x1b, 0x20, "v_fract_f32", True, True),
   (0x21, 0x21, 0x1c, 0x1c, 0x21, "v_trunc_f32", True, True),
   (0x22, 0x22, 0x1d, 0x1d, 0x22, "v_ceil_f32", True, True),
   (0x23, 0x23, 0x1e, 0x1e, 0x23, "v_rndne_f32", True, True),
   (0x24, 0x24, 0x1f, 0x1f, 0x24, "v_floor_f32", True, True),
   (0x25, 0x25, 0x20, 0x20, 0x25, "v_exp_f32", True, True),
   (0x26, 0x26,   -1,   -1,   -1, "v_log_clamp_f32", True, True),
   (0x27, 0x27, 0x21, 0x21, 0x27, "v_log_f32", True, True),
   (0x28, 0x28,   -1,   -1,   -1, "v_rcp_clamp_f32", True, True),
   (0x29, 0x29,   -1,   -1,   -1, "v_rcp_legacy_f32", True, True),
   (0x2a, 0x2a, 0x22, 0x22, 0x2a, "v_rcp_f32", True, True),
   (0x2b, 0x2b, 0x23, 0x23, 0x2b, "v_rcp_iflag_f32", True, True),
   (0x2c, 0x2c,   -1,   -1,   -1, "v_rsq_clamp_f32", True, True),
   (0x2d, 0x2d,   -1,   -1,   -1, "v_rsq_legacy_f32", True, True),
   (0x2e, 0x2e, 0x24, 0x24, 0x2e, "v_rsq_f32", True, True),
   (0x2f, 0x2f, 0x25, 0x25, 0x2f, "v_rcp_f64", True, True),
   (0x30, 0x30,   -1,   -1,   -1, "v_rcp_clamp_f64", True, True),
   (0x31, 0x31, 0x26, 0x26, 0x31, "v_rsq_f64", True, True),
   (0x32, 0x32,   -1,   -1,   -1, "v_rsq_clamp_f64", True, True),
   (0x33, 0x33, 0x27, 0x27, 0x33, "v_sqrt_f32", True, True),
   (0x34, 0x34, 0x28, 0x28, 0x34, "v_sqrt_f64", True, True),
   (0x35, 0x35, 0x29, 0x29, 0x35, "v_sin_f32", True, True),
   (0x36, 0x36, 0x2a, 0x2a, 0x36, "v_cos_f32", True, True),
   (0x37, 0x37, 0x2b, 0x2b, 0x37, "v_not_b32", False, False),
   (0x38, 0x38, 0x2c, 0x2c, 0x38, "v_bfrev_b32", False, False),
   (0x39, 0x39, 0x2d, 0x2d, 0x39, "v_ffbh_u32", False, False),
   (0x3a, 0x3a, 0x2e, 0x2e, 0x3a, "v_ffbl_b32", False, False),
   (0x3b, 0x3b, 0x2f, 0x2f, 0x3b, "v_ffbh_i32", False, False),
   (0x3c, 0x3c, 0x30, 0x30, 0x3c, "v_frexp_exp_i32_f64", True, False),
   (0x3d, 0x3d, 0x31, 0x31, 0x3d, "v_frexp_mant_f64", True, False),
   (0x3e, 0x3e, 0x32, 0x32, 0x3e, "v_fract_f64", True, True),
   (0x3f, 0x3f, 0x33, 0x33, 0x3f, "v_frexp_exp_i32_f32", True, False),
   (0x40, 0x40, 0x34, 0x34, 0x40, "v_frexp_mant_f32", True, False),
   (0x41, 0x41, 0x35, 0x35, 0x41, "v_clrexcp", False, False),
   (0x42, 0x42, 0x36,   -1, 0x42, "v_movreld_b32", False, False),
   (0x43, 0x43, 0x37,   -1, 0x43, "v_movrels_b32", False, False),
   (0x44, 0x44, 0x38,   -1, 0x44, "v_movrelsd_b32", False, False),
   (  -1,   -1,   -1,   -1, 0x48, "v_movrelsd_2_b32", False, False),
   (  -1,   -1,   -1, 0x37,   -1, "v_screen_partition_4se_b32", False, False),
   (  -1,   -1, 0x39, 0x39, 0x50, "v_cvt_f16_u16", False, True),
   (  -1,   -1, 0x3a, 0x3a, 0x51, "v_cvt_f16_i16", False, True),
   (  -1,   -1, 0x3b, 0x3b, 0x52, "v_cvt_u16_f16", True, False),
   (  -1,   -1, 0x3c, 0x3c, 0x53, "v_cvt_i16_f16", True, False),
   (  -1,   -1, 0x3d, 0x3d, 0x54, "v_rcp_f16", True, True),
   (  -1,   -1, 0x3e, 0x3e, 0x55, "v_sqrt_f16", True, True),
   (  -1,   -1, 0x3f, 0x3f, 0x56, "v_rsq_f16", True, True),
   (  -1,   -1, 0x40, 0x40, 0x57, "v_log_f16", True, True),
   (  -1,   -1, 0x41, 0x41, 0x58, "v_exp_f16", True, True),
   (  -1,   -1, 0x42, 0x42, 0x59, "v_frexp_mant_f16", True, False),
   (  -1,   -1, 0x43, 0x43, 0x5a, "v_frexp_exp_i16_f16", True, False),
   (  -1,   -1, 0x44, 0x44, 0x5b, "v_floor_f16", True, True),
   (  -1,   -1, 0x45, 0x45, 0x5c, "v_ceil_f16", True, True),
   (  -1,   -1, 0x46, 0x46, 0x5d, "v_trunc_f16", True, True),
   (  -1,   -1, 0x47, 0x47, 0x5e, "v_rndne_f16", True, True),
   (  -1,   -1, 0x48, 0x48, 0x5f, "v_fract_f16", True, True),
   (  -1,   -1, 0x49, 0x49, 0x60, "v_sin_f16", True, True),
   (  -1,   -1, 0x4a, 0x4a, 0x61, "v_cos_f16", True, True),
   (  -1, 0x46, 0x4b, 0x4b,   -1, "v_exp_legacy_f32", True, True),
   (  -1, 0x45, 0x4c, 0x4c,   -1, "v_log_legacy_f32", True, True),
   (  -1,   -1,   -1, 0x4f, 0x62, "v_sat_pk_u8_i16", False, False),
   (  -1,   -1,   -1, 0x4d, 0x63, "v_cvt_norm_i16_f16", True, False),
   (  -1,   -1,   -1, 0x4e, 0x64, "v_cvt_norm_u16_f16", True, False),
   (  -1,   -1,   -1, 0x51, 0x65, "v_swap_b32", False, False),
   (  -1,   -1,   -1,   -1, 0x68, "v_swaprel_b32", False, False),
}
for vop1_row in VOP1:
   (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) = vop1_row
   # Only the GFX7, GFX9 and GFX10 columns are forwarded.
   opcode(name, gfx7, gfx9, gfx10, Format.VOP1, in_mod, out_mod)
823
824
# VOPC instructions:

# v_cmp_class_* / v_cmpx_class_* rows: (GFX6, GFX7, GFX8, GFX9, GFX10, name).
# -1 marks a column without an opcode number.
VOPC_CLASS = {
   (0x88, 0x88, 0x10, 0x10, 0x88, "v_cmp_class_f32"),
   (  -1,   -1, 0x14, 0x14, 0x8f, "v_cmp_class_f16"),
   (0x98, 0x98, 0x11, 0x11, 0x98, "v_cmpx_class_f32"),
   (  -1,   -1, 0x15, 0x15, 0x9f, "v_cmpx_class_f16"),
   (0xa8, 0xa8, 0x12, 0x12, 0xa8, "v_cmp_class_f64"),
   (0xb8, 0xb8, 0x13, 0x13, 0xb8, "v_cmpx_class_f64"),
}
for vopc_class_row in VOPC_CLASS:
   (gfx6, gfx7, gfx8, gfx9, gfx10, name) = vopc_class_row
   # Registered with input modifiers enabled and output modifiers disabled.
   opcode(name, gfx7, gfx9, gfx10, Format.VOPC, True, False)
837
# Float compare condition names used to build the v_cmp_*/v_cmpx_* opcode names.
COMPF = ["f", "lt", "eq", "le", "gt", "lg", "ge", "o", "u", "nge", "nlg", "ngt", "nle", "neq", "nlt", "tru"]

# 16-bit float compares (no GFX6/GFX7 opcode numbers). The 16 conditions are
# split into two halves of 8 because each half has its own opcode base.
# Columns: (GFX8/9 base, GFX10 base, name prefix, offset into COMPF).
for i in range(8):
   for (gfx9_base, gfx10_base, mnemonic, cond_offset) in (
         (0x20, 0xc8, "v_cmp_", 0),
         (0x30, 0xd8, "v_cmpx_", 0),
         (0x28, 0xe8, "v_cmp_", 8),
         (0x38, 0xf8, "v_cmpx_", 8)):
      name = mnemonic + COMPF[i + cond_offset] + "_f16"
      opcode(name, -1, gfx9_base + i, gfx10_base + i, Format.VOPC, True, False)
849
# 32-bit and 64-bit float compares: v_cmp/v_cmpx for each of the 16 COMPF
# conditions, registered with input modifiers enabled and output modifiers
# disabled.
# Columns: (GFX6/7 base, GFX8/9 base, GFX10 base, name prefix, name suffix);
# the condition index i is added to each base.
#
# GFX_6_7: the original loop also built the rows below but never passed them
# to opcode(), so they were dead stores. They are kept as reference only.
# NOTE(review): register them explicitly if they are ever needed.
#   (0x40+i, 0x40+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f32")
#   (0x50+i, 0x50+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f32")
#   (0x60+i, 0x60+i, -1, -1, -1, "v_cmps_"+COMPF[i]+"_f64")
#   (0x70+i, 0x70+i, -1, -1, -1, "v_cmpsx_"+COMPF[i]+"_f64")
for i in range(16):
   for (gfx7_base, gfx9_base, gfx10_base, mnemonic, suffix) in (
         (0x00, 0x40, 0x00, "v_cmp_", "_f32"),
         (0x10, 0x50, 0x10, "v_cmpx_", "_f32"),
         (0x20, 0x60, 0x20, "v_cmp_", "_f64"),
         (0x30, 0x70, 0x30, "v_cmpx_", "_f64")):
      name = mnemonic + COMPF[i] + suffix
      opcode(name, gfx7_base + i, gfx9_base + i, gfx10_base + i, Format.VOPC, True, False)
864
# Integer compare condition names used to build the v_cmp_*/v_cmpx_* opcode names.
COMPI = ["f", "lt", "eq", "le", "gt", "lg", "ge", "tru"]

# GFX_8_9
# 16-bit integer compares for conditions 0 ("f") and 7 ("tru") only: these are
# registered with a GFX8/9 opcode number and -1 for both GFX7 and GFX10.
# Columns: (GFX8/9 base, name prefix, name suffix).
for i in (0, 7):
   for (gfx9_base, mnemonic, suffix) in (
         (0xa0, "v_cmp_", "_i16"),
         (0xb0, "v_cmpx_", "_i16"),
         (0xa8, "v_cmp_", "_u16"),
         (0xb8, "v_cmpx_", "_u16")):
      name = mnemonic + COMPI[i] + suffix
      opcode(name, -1, gfx9_base + i, -1, Format.VOPC)
877
# 16-bit integer compares for conditions 1..6 ("lt" .. "ge"): registered with
# GFX8/9 and GFX10 opcode numbers and -1 for GFX7.
# Columns: (GFX8/9 base, GFX10 base, name prefix, name suffix).
for i in range(1, 7):
   for (gfx9_base, gfx10_base, mnemonic, suffix) in (
         (0xa0, 0x88, "v_cmp_", "_i16"),
         (0xb0, 0x98, "v_cmpx_", "_i16"),
         (0xa8, 0xa8, "v_cmp_", "_u16"),
         (0xb8, 0xb8, "v_cmpx_", "_u16")):
      name = mnemonic + COMPI[i] + suffix
      opcode(name, -1, gfx9_base + i, gfx10_base + i, Format.VOPC)
887
# 32-bit and 64-bit signed/unsigned integer compares, all eight COMPI conditions.
# Columns: (GFX6/7 base, GFX8/9 base, GFX10 base, name prefix, name suffix);
# the condition index i is added to each base.
for i in range(8):
   for (gfx7_base, gfx9_base, gfx10_base, mnemonic, suffix) in (
         (0x80, 0xc0, 0x80, "v_cmp_", "_i32"),
         (0x90, 0xd0, 0x90, "v_cmpx_", "_i32"),
         (0xa0, 0xe0, 0xa0, "v_cmp_", "_i64"),
         (0xb0, 0xf0, 0xb0, "v_cmpx_", "_i64"),
         (0xc0, 0xc8, 0xc0, "v_cmp_", "_u32"),
         (0xd0, 0xd8, 0xd0, "v_cmpx_", "_u32"),
         (0xe0, 0xe8, 0xe0, "v_cmp_", "_u64"),
         (0xf0, 0xf8, 0xf0, "v_cmpx_", "_u64")):
      name = mnemonic + COMPI[i] + suffix
      opcode(name, gfx7_base + i, gfx9_base + i, gfx10_base + i, Format.VOPC)
905
906
# VOPP instructions: packed 16bit instructions - 1 or 2 inputs and 1 output
#
# Row layout: (opcode number, name). The same number is registered for GFX9
# and GFX10; GFX7 gets -1.
VOPP = {
   (0x00, "v_pk_mad_i16"),
   (0x01, "v_pk_mul_lo_u16"),
   (0x02, "v_pk_add_i16"),
   (0x03, "v_pk_sub_i16"),
   (0x04, "v_pk_lshlrev_b16"),
   (0x05, "v_pk_lshrrev_b16"),
   (0x06, "v_pk_ashrrev_i16"),
   (0x07, "v_pk_max_i16"),
   (0x08, "v_pk_min_i16"),
   (0x09, "v_pk_mad_u16"),
   (0x0a, "v_pk_add_u16"),
   (0x0b, "v_pk_sub_u16"),
   (0x0c, "v_pk_max_u16"),
   (0x0d, "v_pk_min_u16"),
   (0x0e, "v_pk_fma_f16"),
   (0x0f, "v_pk_add_f16"),
   (0x10, "v_pk_mul_f16"),
   (0x11, "v_pk_min_f16"),
   (0x12, "v_pk_max_f16"),
   (0x20, "v_pk_fma_mix_f32"), # v_mad_mix_f32 in VEGA ISA, v_fma_mix_f32 in RDNA ISA
   (0x21, "v_pk_fma_mixlo_f16"), # v_mad_mixlo_f16 in VEGA ISA, v_fma_mixlo_f16 in RDNA ISA
   (0x22, "v_pk_fma_mixhi_f16"), # v_mad_mixhi_f16 in VEGA ISA, v_fma_mixhi_f16 in RDNA ISA
}
# Note: these are only supported on gfx9+, so gfx8 and gfx9 will need to be
# distinguished here.
# Equivalent full row: (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (-1, -1, -1, code, code, name)
for vopp_row in VOPP:
   (code, name) = vopp_row
   opcode(name, -1, code, code, Format.VOP3P)
936
937
# VINTERP instructions:
#
# Row layout: (opcode number, name). The same number is registered for GFX7,
# GFX9 and GFX10.
VINTRP = {
   (0x00, "v_interp_p1_f32"),
   (0x01, "v_interp_p2_f32"),
   (0x02, "v_interp_mov_f32"),
}
# Equivalent full row: (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
for vintrp_row in VINTRP:
   (code, name) = vintrp_row
   opcode(name, code, code, code, Format.VINTRP)
947
# VOP3 instructions: 3 inputs, 1 output
# VOP3b instructions: have a unique scalar output, e.g. VOP2 with vcc out
#
# Row layout: (GFX6, GFX7, GFX8, GFX9, GFX10, name, input_modifiers,
# output_modifiers). -1 marks a column without an opcode number.
# Every row is registered as Format.VOP3A.
# NOTE: this is a set literal, so rows are not visited in the order written.
VOP3 = {
   (0x140, 0x140, 0x1c0, 0x1c0, 0x140, "v_mad_legacy_f32", True, True), # GFX6-GFX10
   (0x141, 0x141, 0x1c1, 0x1c1, 0x141, "v_mad_f32", True, True),
   (0x142, 0x142, 0x1c2, 0x1c2, 0x142, "v_mad_i32_i24", False, False),
   (0x143, 0x143, 0x1c3, 0x1c3, 0x143, "v_mad_u32_u24", False, False),
   (0x144, 0x144, 0x1c4, 0x1c4, 0x144, "v_cubeid_f32", True, True),
   (0x145, 0x145, 0x1c5, 0x1c5, 0x145, "v_cubesc_f32", True, True),
   (0x146, 0x146, 0x1c6, 0x1c6, 0x146, "v_cubetc_f32", True, True),
   (0x147, 0x147, 0x1c7, 0x1c7, 0x147, "v_cubema_f32", True, True),
   (0x148, 0x148, 0x1c8, 0x1c8, 0x148, "v_bfe_u32", False, False),
   (0x149, 0x149, 0x1c9, 0x1c9, 0x149, "v_bfe_i32", False, False),
   (0x14a, 0x14a, 0x1ca, 0x1ca, 0x14a, "v_bfi_b32", False, False),
   (0x14b, 0x14b, 0x1cb, 0x1cb, 0x14b, "v_fma_f32", True, True),
   (0x14c, 0x14c, 0x1cc, 0x1cc, 0x14c, "v_fma_f64", True, True),
   (0x14d, 0x14d, 0x1cd, 0x1cd, 0x14d, "v_lerp_u8", False, False),
   (0x14e, 0x14e, 0x1ce, 0x1ce, 0x14e, "v_alignbit_b32", False, False),
   (0x14f, 0x14f, 0x1cf, 0x1cf, 0x14f, "v_alignbyte_b32", False, False),
   (0x150, 0x150,    -1,    -1, 0x150, "v_mullit_f32", True, True),
   (0x151, 0x151, 0x1d0, 0x1d0, 0x151, "v_min3_f32", True, True),
   (0x152, 0x152, 0x1d1, 0x1d1, 0x152, "v_min3_i32", False, False),
   (0x153, 0x153, 0x1d2, 0x1d2, 0x153, "v_min3_u32", False, False),
   (0x154, 0x154, 0x1d3, 0x1d3, 0x154, "v_max3_f32", True, True),
   (0x155, 0x155, 0x1d4, 0x1d4, 0x155, "v_max3_i32", False, False),
   (0x156, 0x156, 0x1d5, 0x1d5, 0x156, "v_max3_u32", False, False),
   (0x157, 0x157, 0x1d6, 0x1d6, 0x157, "v_med3_f32", True, True),
   (0x158, 0x158, 0x1d7, 0x1d7, 0x158, "v_med3_i32", False, False),
   (0x159, 0x159, 0x1d8, 0x1d8, 0x159, "v_med3_u32", False, False),
   (0x15a, 0x15a, 0x1d9, 0x1d9, 0x15a, "v_sad_u8", False, False),
   (0x15b, 0x15b, 0x1da, 0x1da, 0x15b, "v_sad_hi_u8", False, False),
   (0x15c, 0x15c, 0x1db, 0x1db, 0x15c, "v_sad_u16", False, False),
   (0x15d, 0x15d, 0x1dc, 0x1dc, 0x15d, "v_sad_u32", False, False),
   (0x15e, 0x15e, 0x1dd, 0x1dd, 0x15e, "v_cvt_pk_u8_f32", True, False),
   (0x15f, 0x15f, 0x1de, 0x1de, 0x15f, "v_div_fixup_f32", True, True),
   (0x160, 0x160, 0x1df, 0x1df, 0x160, "v_div_fixup_f64", True, True),
   (0x161, 0x161,    -1,    -1,    -1, "v_lshl_b64", False, False),
   (0x162, 0x162,    -1,    -1,    -1, "v_lshr_b64", False, False),
   (0x163, 0x163,    -1,    -1,    -1, "v_ashr_i64", False, False),
   (0x164, 0x164, 0x280, 0x280, 0x164, "v_add_f64", True, True),
   (0x165, 0x165, 0x281, 0x281, 0x165, "v_mul_f64", True, True),
   (0x166, 0x166, 0x282, 0x282, 0x166, "v_min_f64", True, True),
   (0x167, 0x167, 0x283, 0x283, 0x167, "v_max_f64", True, True),
   (0x168, 0x168, 0x284, 0x284, 0x168, "v_ldexp_f64", False, True), # src1 can take input modifiers
   (0x169, 0x169, 0x285, 0x285, 0x169, "v_mul_lo_u32", False, False),
   (0x16a, 0x16a, 0x286, 0x286, 0x16a, "v_mul_hi_u32", False, False),
   (0x16b, 0x16b, 0x285, 0x285, 0x16b, "v_mul_lo_i32", False, False), # identical to v_mul_lo_u32
   (0x16c, 0x16c, 0x287, 0x287, 0x16c, "v_mul_hi_i32", False, False),
   (0x16d, 0x16d, 0x1e0, 0x1e0, 0x16d, "v_div_scale_f32", True, True), # writes to VCC
   (0x16e, 0x16e, 0x1e1, 0x1e1, 0x16e, "v_div_scale_f64", True, True), # writes to VCC
   (0x16f, 0x16f, 0x1e2, 0x1e2, 0x16f, "v_div_fmas_f32", True, True), # takes VCC input
   (0x170, 0x170, 0x1e3, 0x1e3, 0x170, "v_div_fmas_f64", True, True), # takes VCC input
   (0x171, 0x171, 0x1e4, 0x1e4, 0x171, "v_msad_u8", False, False),
   (0x172, 0x172, 0x1e5, 0x1e5, 0x172, "v_qsad_pk_u16_u8", False, False),
   (0x172,    -1,    -1,    -1,    -1, "v_qsad_u8", False, False), # what's the difference?
   (0x173, 0x173, 0x1e6, 0x1e6, 0x173, "v_mqsad_pk_u16_u8", False, False),
   (0x173,    -1,    -1,    -1,    -1, "v_mqsad_u8", False, False), # what's the difference?
   (0x174, 0x174, 0x292, 0x292, 0x174, "v_trig_preop_f64", False, False),
   (   -1, 0x175, 0x1e7, 0x1e7, 0x175, "v_mqsad_u32_u8", False, False),
   (   -1, 0x176, 0x1e8, 0x1e8, 0x176, "v_mad_u64_u32", False, False),
   (   -1, 0x177, 0x1e9, 0x1e9, 0x177, "v_mad_i64_i32", False, False),
   (   -1,    -1, 0x1ea, 0x1ea,    -1, "v_mad_legacy_f16", True, True),
   (   -1,    -1, 0x1eb, 0x1eb,    -1, "v_mad_legacy_u16", False, False),
   (   -1,    -1, 0x1ec, 0x1ec,    -1, "v_mad_legacy_i16", False, False),
   (   -1,    -1, 0x1ed, 0x1ed, 0x344, "v_perm_b32", False, False),
   (   -1,    -1, 0x1ee, 0x1ee,    -1, "v_fma_legacy_f16", True, True),
   (   -1,    -1, 0x1ef, 0x1ef,    -1, "v_div_fixup_legacy_f16", True, True),
   (0x12c, 0x12c, 0x1f0, 0x1f0,    -1, "v_cvt_pkaccum_u8_f32", True, False),
   (   -1,    -1,    -1, 0x1f1, 0x373, "v_mad_u32_u16", False, False),
   (   -1,    -1,    -1, 0x1f2, 0x375, "v_mad_i32_i16", False, False),
   (   -1,    -1,    -1, 0x1f3, 0x345, "v_xad_u32", False, False),
   (   -1,    -1,    -1, 0x1f4, 0x351, "v_min3_f16", True, True),
   (   -1,    -1,    -1, 0x1f5, 0x352, "v_min3_i16", False, False),
   (   -1,    -1,    -1, 0x1f6, 0x353, "v_min3_u16", False, False),
   (   -1,    -1,    -1, 0x1f7, 0x354, "v_max3_f16", True, True),
   (   -1,    -1,    -1, 0x1f8, 0x355, "v_max3_i16", False, False),
   (   -1,    -1,    -1, 0x1f9, 0x356, "v_max3_u16", False, False),
   (   -1,    -1,    -1, 0x1fa, 0x357, "v_med3_f16", True, True),
   (   -1,    -1,    -1, 0x1fb, 0x358, "v_med3_i16", False, False),
   (   -1,    -1,    -1, 0x1fc, 0x359, "v_med3_u16", False, False),
   (   -1,    -1,    -1, 0x1fd, 0x346, "v_lshl_add_u32", False, False),
   (   -1,    -1,    -1, 0x1fe, 0x347, "v_add_lshl_u32", False, False),
   (   -1,    -1,    -1, 0x1ff, 0x36d, "v_add3_u32", False, False),
   (   -1,    -1,    -1, 0x200, 0x36f, "v_lshl_or_b32", False, False),
   (   -1,    -1,    -1, 0x201, 0x371, "v_and_or_b32", False, False),
   (   -1,    -1,    -1, 0x202, 0x372, "v_or3_b32", False, False),
   (   -1,    -1,    -1, 0x203,    -1, "v_mad_f16", True, True),
   (   -1,    -1,    -1, 0x204, 0x340, "v_mad_u16", False, False),
   (   -1,    -1,    -1, 0x205, 0x35e, "v_mad_i16", False, False),
   (   -1,    -1,    -1, 0x206, 0x34b, "v_fma_f16", True, True),
   (   -1,    -1,    -1, 0x207, 0x35f, "v_div_fixup_f16", True, True),
   (   -1,    -1, 0x274, 0x274, 0x342, "v_interp_p1ll_f16", True, True),
   (   -1,    -1, 0x275, 0x275, 0x343, "v_interp_p1lv_f16", True, True),
   (   -1,    -1, 0x276, 0x276,    -1, "v_interp_p2_legacy_f16", True, True),
   (   -1,    -1,    -1, 0x277, 0x35a, "v_interp_p2_f16", True, True),
   (0x12b, 0x12b, 0x288, 0x288, 0x362, "v_ldexp_f32", False, True),
   (   -1,    -1, 0x289, 0x289, 0x360, "v_readlane_b32_e64", False, False),
   (   -1,    -1, 0x28a, 0x28a, 0x361, "v_writelane_b32_e64", False, False),
   (0x122, 0x122, 0x28b, 0x28b, 0x364, "v_bcnt_u32_b32", False, False),
   (0x123, 0x123, 0x28c, 0x28c, 0x365, "v_mbcnt_lo_u32_b32", False, False),
   (   -1,    -1, 0x28d, 0x28d, 0x366, "v_mbcnt_hi_u32_b32_e64", False, False),
   (   -1,    -1, 0x28f, 0x28f, 0x2ff, "v_lshlrev_b64", False, False),
   (   -1,    -1, 0x290, 0x290, 0x300, "v_lshrrev_b64", False, False),
   (   -1,    -1, 0x291, 0x291, 0x301, "v_ashrrev_i64", False, False),
   (0x11e, 0x11e, 0x293, 0x293, 0x363, "v_bfm_b32", False, False),
   (0x12d, 0x12d, 0x294, 0x294, 0x368, "v_cvt_pknorm_i16_f32", True, False),
   (0x12e, 0x12e, 0x295, 0x295, 0x369, "v_cvt_pknorm_u16_f32", True, False),
   (0x12f, 0x12f, 0x296, 0x296, 0x12f, "v_cvt_pkrtz_f16_f32_e64", True, False), # GFX6_7_10 is VOP2 with opcode 0x02f
   (0x130, 0x130, 0x297, 0x297, 0x36a, "v_cvt_pk_u16_u32", False, False),
   (0x131, 0x131, 0x298, 0x298, 0x36b, "v_cvt_pk_i16_i32", False, False),
   (   -1,    -1,    -1, 0x299, 0x312, "v_cvt_pknorm_i16_f16", True, False),
   (   -1,    -1,    -1, 0x29a, 0x313, "v_cvt_pknorm_u16_f16", True, False),
   (   -1,    -1,    -1, 0x29c, 0x37f, "v_add_i32", False, False),
   (   -1,    -1,    -1, 0x29d, 0x376, "v_sub_i32", False, False),
   (   -1,    -1,    -1, 0x29e, 0x30d, "v_add_i16", False, False),
   (   -1,    -1,    -1, 0x29f, 0x30e, "v_sub_i16", False, False),
   (   -1,    -1,    -1, 0x2a0, 0x311, "v_pack_b32_f16", True, False),
   (   -1,    -1,    -1,    -1, 0x178, "v_xor3_b32", False, False),
   (   -1,    -1,    -1,    -1, 0x377, "v_permlane16_b32", False, False),
   (   -1,    -1,    -1,    -1, 0x378, "v_permlanex16_b32", False, False),
   (   -1,    -1,    -1,    -1, 0x30f, "v_add_co_u32_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x310, "v_sub_co_u32_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x319, "v_subrev_co_u32_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x303, "v_add_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x304, "v_sub_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x305, "v_mul_lo_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x309, "v_max_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x30a, "v_max_i16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x30b, "v_min_u16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x30c, "v_min_i16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x307, "v_lshrrev_b16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x308, "v_ashrrev_i16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x314, "v_lshlrev_b16_e64", False, False),
   (   -1,    -1,    -1,    -1, 0x140, "v_fma_legacy_f32", True, True), #GFX10.3+
}
for vop3_row in VOP3:
   (gfx6, gfx7, gfx8, gfx9, gfx10, name, in_mod, out_mod) = vop3_row
   # Only the GFX7, GFX9 and GFX10 columns are forwarded.
   opcode(name, gfx7, gfx9, gfx10, Format.VOP3A, in_mod, out_mod)
1085
1086
1087# DS instructions: 3 inputs (1 addr, 2 data), 1 output
1088DS = {
1089   (0x00, 0x00, 0x00, 0x00, 0x00, "ds_add_u32"),
1090   (0x01, 0x01, 0x01, 0x01, 0x01, "ds_sub_u32"),
1091   (0x02, 0x02, 0x02, 0x02, 0x02, "ds_rsub_u32"),
1092   (0x03, 0x03, 0x03, 0x03, 0x03, "ds_inc_u32"),
1093   (0x04, 0x04, 0x04, 0x04, 0x04, "ds_dec_u32"),
1094   (0x05, 0x05, 0x05, 0x05, 0x05, "ds_min_i32"),
1095   (0x06, 0x06, 0x06, 0x06, 0x06, "ds_max_i32"),
1096   (0x07, 0x07, 0x07, 0x07, 0x07, "ds_min_u32"),
1097   (0x08, 0x08, 0x08, 0x08, 0x08, "ds_max_u32"),
1098   (0x09, 0x09, 0x09, 0x09, 0x09, "ds_and_b32"),
1099   (0x0a, 0x0a, 0x0a, 0x0a, 0x0a, "ds_or_b32"),
1100   (0x0b, 0x0b, 0x0b, 0x0b, 0x0b, "ds_xor_b32"),
1101   (0x0c, 0x0c, 0x0c, 0x0c, 0x0c, "ds_mskor_b32"),
1102   (0x0d, 0x0d, 0x0d, 0x0d, 0x0d, "ds_write_b32"),
1103   (0x0e, 0x0e, 0x0e, 0x0e, 0x0e, "ds_write2_b32"),
1104   (0x0f, 0x0f, 0x0f, 0x0f, 0x0f, "ds_write2st64_b32"),
1105   (0x10, 0x10, 0x10, 0x10, 0x10, "ds_cmpst_b32"),
1106   (0x11, 0x11, 0x11, 0x11, 0x11, "ds_cmpst_f32"),
1107   (0x12, 0x12, 0x12, 0x12, 0x12, "ds_min_f32"),
1108   (0x13, 0x13, 0x13, 0x13, 0x13, "ds_max_f32"),
1109   (  -1, 0x14, 0x14, 0x14, 0x14, "ds_nop"),
1110   (  -1,   -1, 0x15, 0x15, 0x15, "ds_add_f32"),
1111   (  -1,   -1, 0x1d, 0x1d, 0xb0, "ds_write_addtid_b32"),
1112   (0x1e, 0x1e, 0x1e, 0x1e, 0x1e, "ds_write_b8"),
1113   (0x1f, 0x1f, 0x1f, 0x1f, 0x1f, "ds_write_b16"),
1114   (0x20, 0x20, 0x20, 0x20, 0x20, "ds_add_rtn_u32"),
1115   (0x21, 0x21, 0x21, 0x21, 0x21, "ds_sub_rtn_u32"),
1116   (0x22, 0x22, 0x22, 0x22, 0x22, "ds_rsub_rtn_u32"),
1117   (0x23, 0x23, 0x23, 0x23, 0x23, "ds_inc_rtn_u32"),
1118   (0x24, 0x24, 0x24, 0x24, 0x24, "ds_dec_rtn_u32"),
1119   (0x25, 0x25, 0x25, 0x25, 0x25, "ds_min_rtn_i32"),
1120   (0x26, 0x26, 0x26, 0x26, 0x26, "ds_max_rtn_i32"),
1121   (0x27, 0x27, 0x27, 0x27, 0x27, "ds_min_rtn_u32"),
1122   (0x28, 0x28, 0x28, 0x28, 0x28, "ds_max_rtn_u32"),
1123   (0x29, 0x29, 0x29, 0x29, 0x29, "ds_and_rtn_b32"),
1124   (0x2a, 0x2a, 0x2a, 0x2a, 0x2a, "ds_or_rtn_b32"),
1125   (0x2b, 0x2b, 0x2b, 0x2b, 0x2b, "ds_xor_rtn_b32"),
1126   (0x2c, 0x2c, 0x2c, 0x2c, 0x2c, "ds_mskor_rtn_b32"),
1127   (0x2d, 0x2d, 0x2d, 0x2d, 0x2d, "ds_wrxchg_rtn_b32"),
1128   (0x2e, 0x2e, 0x2e, 0x2e, 0x2e, "ds_wrxchg2_rtn_b32"),
1129   (0x2f, 0x2f, 0x2f, 0x2f, 0x2f, "ds_wrxchg2st64_rtn_b32"),
1130   (0x30, 0x30, 0x30, 0x30, 0x30, "ds_cmpst_rtn_b32"),
1131   (0x31, 0x31, 0x31, 0x31, 0x31, "ds_cmpst_rtn_f32"),
1132   (0x32, 0x32, 0x32, 0x32, 0x32, "ds_min_rtn_f32"),
1133   (0x33, 0x33, 0x33, 0x33, 0x33, "ds_max_rtn_f32"),
1134   (  -1, 0x34, 0x34, 0x34, 0x34, "ds_wrap_rtn_b32"),
1135   (  -1,   -1, 0x35, 0x35, 0x55, "ds_add_rtn_f32"),
1136   (0x36, 0x36, 0x36, 0x36, 0x36, "ds_read_b32"),
1137   (0x37, 0x37, 0x37, 0x37, 0x37, "ds_read2_b32"),
1138   (0x38, 0x38, 0x38, 0x38, 0x38, "ds_read2st64_b32"),
1139   (0x39, 0x39, 0x39, 0x39, 0x39, "ds_read_i8"),
1140   (0x3a, 0x3a, 0x3a, 0x3a, 0x3a, "ds_read_u8"),
1141   (0x3b, 0x3b, 0x3b, 0x3b, 0x3b, "ds_read_i16"),
1142   (0x3c, 0x3c, 0x3c, 0x3c, 0x3c, "ds_read_u16"),
1143   (0x35, 0x35, 0x3d, 0x3d, 0x35, "ds_swizzle_b32"), #data1 & offset, no addr/data2
1144   (  -1,   -1, 0x3e, 0x3e, 0xb2, "ds_permute_b32"),
1145   (  -1,   -1, 0x3f, 0x3f, 0xb3, "ds_bpermute_b32"),
1146   (0x40, 0x40, 0x40, 0x40, 0x40, "ds_add_u64"),
1147   (0x41, 0x41, 0x41, 0x41, 0x41, "ds_sub_u64"),
1148   (0x42, 0x42, 0x42, 0x42, 0x42, "ds_rsub_u64"),
1149   (0x43, 0x43, 0x43, 0x43, 0x43, "ds_inc_u64"),
1150   (0x44, 0x44, 0x44, 0x44, 0x44, "ds_dec_u64"),
1151   (0x45, 0x45, 0x45, 0x45, 0x45, "ds_min_i64"),
1152   (0x46, 0x46, 0x46, 0x46, 0x46, "ds_max_i64"),
1153   (0x47, 0x47, 0x47, 0x47, 0x47, "ds_min_u64"),
1154   (0x48, 0x48, 0x48, 0x48, 0x48, "ds_max_u64"),
1155   (0x49, 0x49, 0x49, 0x49, 0x49, "ds_and_b64"),
1156   (0x4a, 0x4a, 0x4a, 0x4a, 0x4a, "ds_or_b64"),
1157   (0x4b, 0x4b, 0x4b, 0x4b, 0x4b, "ds_xor_b64"),
1158   (0x4c, 0x4c, 0x4c, 0x4c, 0x4c, "ds_mskor_b64"),
1159   (0x4d, 0x4d, 0x4d, 0x4d, 0x4d, "ds_write_b64"),
1160   (0x4e, 0x4e, 0x4e, 0x4e, 0x4e, "ds_write2_b64"),
1161   (0x4f, 0x4f, 0x4f, 0x4f, 0x4f, "ds_write2st64_b64"),
1162   (0x50, 0x50, 0x50, 0x50, 0x50, "ds_cmpst_b64"),
1163   (0x51, 0x51, 0x51, 0x51, 0x51, "ds_cmpst_f64"),
1164   (0x52, 0x52, 0x52, 0x52, 0x52, "ds_min_f64"),
1165   (0x53, 0x53, 0x53, 0x53, 0x53, "ds_max_f64"),
1166   (  -1,   -1, 0x54, 0x54, 0xa0, "ds_write_b8_d16_hi"),
1167   (  -1,   -1, 0x55, 0x55, 0xa1, "ds_write_b16_d16_hi"),
1168   (  -1,   -1, 0x56, 0x56, 0xa2, "ds_read_u8_d16"),
1169   (  -1,   -1, 0x57, 0x57, 0xa3, "ds_read_u8_d16_hi"),
1170   (  -1,   -1, 0x58, 0x58, 0xa4, "ds_read_i8_d16"),
1171   (  -1,   -1, 0x59, 0x59, 0xa5, "ds_read_i8_d16_hi"),
1172   (  -1,   -1, 0x5a, 0x5a, 0xa6, "ds_read_u16_d16"),
1173   (  -1,   -1, 0x5b, 0x5b, 0xa7, "ds_read_u16_d16_hi"),
1174   (0x60, 0x60, 0x60, 0x60, 0x60, "ds_add_rtn_u64"),
1175   (0x61, 0x61, 0x61, 0x61, 0x61, "ds_sub_rtn_u64"),
1176   (0x62, 0x62, 0x62, 0x62, 0x62, "ds_rsub_rtn_u64"),
1177   (0x63, 0x63, 0x63, 0x63, 0x63, "ds_inc_rtn_u64"),
1178   (0x64, 0x64, 0x64, 0x64, 0x64, "ds_dec_rtn_u64"),
1179   (0x65, 0x65, 0x65, 0x65, 0x65, "ds_min_rtn_i64"),
1180   (0x66, 0x66, 0x66, 0x66, 0x66, "ds_max_rtn_i64"),
1181   (0x67, 0x67, 0x67, 0x67, 0x67, "ds_min_rtn_u64"),
1182   (0x68, 0x68, 0x68, 0x68, 0x68, "ds_max_rtn_u64"),
1183   (0x69, 0x69, 0x69, 0x69, 0x69, "ds_and_rtn_b64"),
1184   (0x6a, 0x6a, 0x6a, 0x6a, 0x6a, "ds_or_rtn_b64"),
1185   (0x6b, 0x6b, 0x6b, 0x6b, 0x6b, "ds_xor_rtn_b64"),
1186   (0x6c, 0x6c, 0x6c, 0x6c, 0x6c, "ds_mskor_rtn_b64"),
1187   (0x6d, 0x6d, 0x6d, 0x6d, 0x6d, "ds_wrxchg_rtn_b64"),
1188   (0x6e, 0x6e, 0x6e, 0x6e, 0x6e, "ds_wrxchg2_rtn_b64"),
1189   (0x6f, 0x6f, 0x6f, 0x6f, 0x6f, "ds_wrxchg2st64_rtn_b64"),
1190   (0x70, 0x70, 0x70, 0x70, 0x70, "ds_cmpst_rtn_b64"),
1191   (0x71, 0x71, 0x71, 0x71, 0x71, "ds_cmpst_rtn_f64"),
1192   (0x72, 0x72, 0x72, 0x72, 0x72, "ds_min_rtn_f64"),
1193   (0x73, 0x73, 0x73, 0x73, 0x73, "ds_max_rtn_f64"),
1194   (0x76, 0x76, 0x76, 0x76, 0x76, "ds_read_b64"),
1195   (0x77, 0x77, 0x77, 0x77, 0x77, "ds_read2_b64"),
1196   (0x78, 0x78, 0x78, 0x78, 0x78, "ds_read2st64_b64"),
1197   (  -1, 0x7e, 0x7e, 0x7e, 0x7e, "ds_condxchg32_rtn_b64"),
1198   (0x80, 0x80, 0x80, 0x80, 0x80, "ds_add_src2_u32"),
1199   (0x81, 0x81, 0x81, 0x81, 0x81, "ds_sub_src2_u32"),
1200   (0x82, 0x82, 0x82, 0x82, 0x82, "ds_rsub_src2_u32"),
1201   (0x83, 0x83, 0x83, 0x83, 0x83, "ds_inc_src2_u32"),
1202   (0x84, 0x84, 0x84, 0x84, 0x84, "ds_dec_src2_u32"),
1203   (0x85, 0x85, 0x85, 0x85, 0x85, "ds_min_src2_i32"),
1204   (0x86, 0x86, 0x86, 0x86, 0x86, "ds_max_src2_i32"),
1205   (0x87, 0x87, 0x87, 0x87, 0x87, "ds_min_src2_u32"),
1206   (0x88, 0x88, 0x88, 0x88, 0x88, "ds_max_src2_u32"),
1207   (0x89, 0x89, 0x89, 0x89, 0x89, "ds_and_src2_b32"),
1208   (0x8a, 0x8a, 0x8a, 0x8a, 0x8a, "ds_or_src2_b32"),
1209   (0x8b, 0x8b, 0x8b, 0x8b, 0x8b, "ds_xor_src2_b32"),
1210   (0x8d, 0x8d, 0x8d, 0x8d, 0x8d, "ds_write_src2_b32"),
1211   (0x92, 0x92, 0x92, 0x92, 0x92, "ds_min_src2_f32"),
1212   (0x93, 0x93, 0x93, 0x93, 0x93, "ds_max_src2_f32"),
1213   (  -1,   -1, 0x95, 0x95, 0x95, "ds_add_src2_f32"),
1214   (  -1, 0x18, 0x98, 0x98, 0x18, "ds_gws_sema_release_all"),
1215   (0x19, 0x19, 0x99, 0x99, 0x19, "ds_gws_init"),
1216   (0x1a, 0x1a, 0x9a, 0x9a, 0x1a, "ds_gws_sema_v"),
1217   (0x1b, 0x1b, 0x9b, 0x9b, 0x1b, "ds_gws_sema_br"),
1218   (0x1c, 0x1c, 0x9c, 0x9c, 0x1c, "ds_gws_sema_p"),
1219   (0x1d, 0x1d, 0x9d, 0x9d, 0x1d, "ds_gws_barrier"),
1220   (  -1,   -1, 0xb6, 0xb6, 0xb1, "ds_read_addtid_b32"),
1221   (0x3d, 0x3d, 0xbd, 0xbd, 0x3d, "ds_consume"),
1222   (0x3e, 0x3e, 0xbe, 0xbe, 0x3e, "ds_append"),
1223   (0x3f, 0x3f, 0xbf, 0xbf, 0x3f, "ds_ordered_count"),
1224   (0xc0, 0xc0, 0xc0, 0xc0, 0xc0, "ds_add_src2_u64"),
1225   (0xc1, 0xc1, 0xc1, 0xc1, 0xc1, "ds_sub_src2_u64"),
1226   (0xc2, 0xc2, 0xc2, 0xc2, 0xc2, "ds_rsub_src2_u64"),
1227   (0xc3, 0xc3, 0xc3, 0xc3, 0xc3, "ds_inc_src2_u64"),
1228   (0xc4, 0xc4, 0xc4, 0xc4, 0xc4, "ds_dec_src2_u64"),
1229   (0xc5, 0xc5, 0xc5, 0xc5, 0xc5, "ds_min_src2_i64"),
1230   (0xc6, 0xc6, 0xc6, 0xc6, 0xc6, "ds_max_src2_i64"),
1231   (0xc7, 0xc7, 0xc7, 0xc7, 0xc7, "ds_min_src2_u64"),
1232   (0xc8, 0xc8, 0xc8, 0xc8, 0xc8, "ds_max_src2_u64"),
1233   (0xc9, 0xc9, 0xc9, 0xc9, 0xc9, "ds_and_src2_b64"),
1234   (0xca, 0xca, 0xca, 0xca, 0xca, "ds_or_src2_b64"),
1235   (0xcb, 0xcb, 0xcb, 0xcb, 0xcb, "ds_xor_src2_b64"),
1236   (0xcd, 0xcd, 0xcd, 0xcd, 0xcd, "ds_write_src2_b64"),
1237   (0xd2, 0xd2, 0xd2, 0xd2, 0xd2, "ds_min_src2_f64"),
1238   (0xd3, 0xd3, 0xd3, 0xd3, 0xd3, "ds_max_src2_f64"),
1239   (  -1, 0xde, 0xde, 0xde, 0xde, "ds_write_b96"),
1240   (  -1, 0xdf, 0xdf, 0xdf, 0xdf, "ds_write_b128"),
1241   (  -1, 0xfd, 0xfd,   -1,   -1, "ds_condxchg32_rtn_b128"),
1242   (  -1, 0xfe, 0xfe, 0xfe, 0xfe, "ds_read_b96"),
1243   (  -1, 0xff, 0xff, 0xff, 0xff, "ds_read_b128"),
1244}
# Register every DS opcode.
# DS is a set, so plain iteration order is arbitrary and may differ between
# interpreter runs (string hashes are randomized). Iterating over sorted(DS)
# makes the registration order reproducible.
# Only the gfx7, gfx9 and gfx10 columns are forwarded to opcode(); gfx6 and
# gfx8 are unpacked but not used.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in sorted(DS):
    opcode(name, gfx7, gfx9, gfx10, Format.DS)
1247
# MUBUF instructions:
# Row layout: (gfx6, gfx7, gfx8, gfx9, gfx10, name).
# -1 is written where no opcode number is listed for that generation.
MUBUF = {
   (0x00, 0x00, 0x00, 0x00, 0x00, "buffer_load_format_x"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "buffer_load_format_xy"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "buffer_load_format_xyz"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "buffer_load_format_xyzw"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "buffer_store_format_x"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "buffer_store_format_xy"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "buffer_store_format_xyz"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "buffer_store_format_xyzw"),
   (  -1,   -1, 0x08, 0x08, 0x80, "buffer_load_format_d16_x"),
   (  -1,   -1, 0x09, 0x09, 0x81, "buffer_load_format_d16_xy"),
   (  -1,   -1, 0x0a, 0x0a, 0x82, "buffer_load_format_d16_xyz"),
   (  -1,   -1, 0x0b, 0x0b, 0x83, "buffer_load_format_d16_xyzw"),
   (  -1,   -1, 0x0c, 0x0c, 0x84, "buffer_store_format_d16_x"),
   (  -1,   -1, 0x0d, 0x0d, 0x85, "buffer_store_format_d16_xy"),
   (  -1,   -1, 0x0e, 0x0e, 0x86, "buffer_store_format_d16_xyz"),
   (  -1,   -1, 0x0f, 0x0f, 0x87, "buffer_store_format_d16_xyzw"),
   (0x08, 0x08, 0x10, 0x10, 0x08, "buffer_load_ubyte"),
   (0x09, 0x09, 0x11, 0x11, 0x09, "buffer_load_sbyte"),
   (0x0a, 0x0a, 0x12, 0x12, 0x0a, "buffer_load_ushort"),
   (0x0b, 0x0b, 0x13, 0x13, 0x0b, "buffer_load_sshort"),
   (0x0c, 0x0c, 0x14, 0x14, 0x0c, "buffer_load_dword"),
   (0x0d, 0x0d, 0x15, 0x15, 0x0d, "buffer_load_dwordx2"),
   (  -1, 0x0f, 0x16, 0x16, 0x0f, "buffer_load_dwordx3"),
   (0x0f, 0x0e, 0x17, 0x17, 0x0e, "buffer_load_dwordx4"),
   (0x18, 0x18, 0x18, 0x18, 0x18, "buffer_store_byte"),
   (  -1,   -1,   -1, 0x19, 0x19, "buffer_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x1a, 0x1a, 0x1a, "buffer_store_short"),
   (  -1,   -1,   -1, 0x1b, 0x1b, "buffer_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1c, 0x1c, 0x1c, "buffer_store_dword"),
   (0x1d, 0x1d, 0x1d, 0x1d, 0x1d, "buffer_store_dwordx2"),
   (  -1, 0x1f, 0x1e, 0x1e, 0x1f, "buffer_store_dwordx3"),
   (0x1e, 0x1e, 0x1f, 0x1f, 0x1e, "buffer_store_dwordx4"),
   (  -1,   -1,   -1, 0x20, 0x20, "buffer_load_ubyte_d16"),
   (  -1,   -1,   -1, 0x21, 0x21, "buffer_load_ubyte_d16_hi"),
   (  -1,   -1,   -1, 0x22, 0x22, "buffer_load_sbyte_d16"),
   (  -1,   -1,   -1, 0x23, 0x23, "buffer_load_sbyte_d16_hi"),
   (  -1,   -1,   -1, 0x24, 0x24, "buffer_load_short_d16"),
   (  -1,   -1,   -1, 0x25, 0x25, "buffer_load_short_d16_hi"),
   (  -1,   -1,   -1, 0x26, 0x26, "buffer_load_format_d16_hi_x"),
   (  -1,   -1,   -1, 0x27, 0x27, "buffer_store_format_d16_hi_x"),
   (  -1,   -1, 0x3d, 0x3d,   -1, "buffer_store_lds_dword"),
   (0x71, 0x71, 0x3e, 0x3e,   -1, "buffer_wbinvl1"),
   (0x70, 0x70, 0x3f, 0x3f,   -1, "buffer_wbinvl1_vol"),
   (0x30, 0x30, 0x40, 0x40, 0x30, "buffer_atomic_swap"),
   (0x31, 0x31, 0x41, 0x41, 0x31, "buffer_atomic_cmpswap"),
   (0x32, 0x32, 0x42, 0x42, 0x32, "buffer_atomic_add"),
   (0x33, 0x33, 0x43, 0x43, 0x33, "buffer_atomic_sub"),
   (0x34,   -1,   -1,   -1,   -1, "buffer_atomic_rsub"),
   (0x35, 0x35, 0x44, 0x44, 0x35, "buffer_atomic_smin"),
   (0x36, 0x36, 0x45, 0x45, 0x36, "buffer_atomic_umin"),
   (0x37, 0x37, 0x46, 0x46, 0x37, "buffer_atomic_smax"),
   (0x38, 0x38, 0x47, 0x47, 0x38, "buffer_atomic_umax"),
   (0x39, 0x39, 0x48, 0x48, 0x39, "buffer_atomic_and"),
   (0x3a, 0x3a, 0x49, 0x49, 0x3a, "buffer_atomic_or"),
   (0x3b, 0x3b, 0x4a, 0x4a, 0x3b, "buffer_atomic_xor"),
   (0x3c, 0x3c, 0x4b, 0x4b, 0x3c, "buffer_atomic_inc"),
   (0x3d, 0x3d, 0x4c, 0x4c, 0x3d, "buffer_atomic_dec"),
   (0x3e, 0x3e,   -1,   -1, 0x3e, "buffer_atomic_fcmpswap"),
   (0x3f, 0x3f,   -1,   -1, 0x3f, "buffer_atomic_fmin"),
   (0x40, 0x40,   -1,   -1, 0x40, "buffer_atomic_fmax"),
   (0x50, 0x50, 0x60, 0x60, 0x50, "buffer_atomic_swap_x2"),
   (0x51, 0x51, 0x61, 0x61, 0x51, "buffer_atomic_cmpswap_x2"),
   (0x52, 0x52, 0x62, 0x62, 0x52, "buffer_atomic_add_x2"),
   (0x53, 0x53, 0x63, 0x63, 0x53, "buffer_atomic_sub_x2"),
   (0x54,   -1,   -1,   -1,   -1, "buffer_atomic_rsub_x2"),
   (0x55, 0x55, 0x64, 0x64, 0x55, "buffer_atomic_smin_x2"),
   (0x56, 0x56, 0x65, 0x65, 0x56, "buffer_atomic_umin_x2"),
   (0x57, 0x57, 0x66, 0x66, 0x57, "buffer_atomic_smax_x2"),
   (0x58, 0x58, 0x67, 0x67, 0x58, "buffer_atomic_umax_x2"),
   (0x59, 0x59, 0x68, 0x68, 0x59, "buffer_atomic_and_x2"),
   (0x5a, 0x5a, 0x69, 0x69, 0x5a, "buffer_atomic_or_x2"),
   (0x5b, 0x5b, 0x6a, 0x6a, 0x5b, "buffer_atomic_xor_x2"),
   (0x5c, 0x5c, 0x6b, 0x6b, 0x5c, "buffer_atomic_inc_x2"),
   (0x5d, 0x5d, 0x6c, 0x6c, 0x5d, "buffer_atomic_dec_x2"),
   (0x5e, 0x5e,   -1,   -1, 0x5e, "buffer_atomic_fcmpswap_x2"),
   (0x5f, 0x5f,   -1,   -1, 0x5f, "buffer_atomic_fmin_x2"),
   (0x60, 0x60,   -1,   -1, 0x60, "buffer_atomic_fmax_x2"),
   (  -1,   -1,   -1,   -1, 0x71, "buffer_gl0_inv"),
   (  -1,   -1,   -1,   -1, 0x72, "buffer_gl1_inv"),
   (  -1,   -1,   -1,   -1, 0x34, "buffer_atomic_csub"), #GFX10.3+. seems glc must be set
}
# MUBUF is a set, so plain iteration order is arbitrary and may differ between
# interpreter runs (string hashes are randomized). Iterating over sorted(MUBUF)
# makes the registration order reproducible.
# Only the gfx7, gfx9 and gfx10 columns are forwarded to opcode(); gfx6 and
# gfx8 are unpacked but not used. An instruction is flagged atomic when its
# name contains "atomic".
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in sorted(MUBUF):
    opcode(name, gfx7, gfx9, gfx10, Format.MUBUF, is_atomic = "atomic" in name)
1333
# MTBUF instructions:
# Row layout: (gfx6, gfx7, gfx8, gfx9, gfx10, name).
# -1 is written where no opcode number is listed for that generation.
MTBUF = {
   (0x00, 0x00, 0x00, 0x00, 0x00, "tbuffer_load_format_x"),
   (0x01, 0x01, 0x01, 0x01, 0x01, "tbuffer_load_format_xy"),
   (0x02, 0x02, 0x02, 0x02, 0x02, "tbuffer_load_format_xyz"),
   (0x03, 0x03, 0x03, 0x03, 0x03, "tbuffer_load_format_xyzw"),
   (0x04, 0x04, 0x04, 0x04, 0x04, "tbuffer_store_format_x"),
   (0x05, 0x05, 0x05, 0x05, 0x05, "tbuffer_store_format_xy"),
   (0x06, 0x06, 0x06, 0x06, 0x06, "tbuffer_store_format_xyz"),
   (0x07, 0x07, 0x07, 0x07, 0x07, "tbuffer_store_format_xyzw"),
   (  -1,   -1, 0x08, 0x08, 0x08, "tbuffer_load_format_d16_x"),
   (  -1,   -1, 0x09, 0x09, 0x09, "tbuffer_load_format_d16_xy"),
   (  -1,   -1, 0x0a, 0x0a, 0x0a, "tbuffer_load_format_d16_xyz"),
   (  -1,   -1, 0x0b, 0x0b, 0x0b, "tbuffer_load_format_d16_xyzw"),
   (  -1,   -1, 0x0c, 0x0c, 0x0c, "tbuffer_store_format_d16_x"),
   (  -1,   -1, 0x0d, 0x0d, 0x0d, "tbuffer_store_format_d16_xy"),
   (  -1,   -1, 0x0e, 0x0e, 0x0e, "tbuffer_store_format_d16_xyz"),
   (  -1,   -1, 0x0f, 0x0f, 0x0f, "tbuffer_store_format_d16_xyzw"),
}
# MTBUF is a set, so plain iteration order is arbitrary and may differ between
# interpreter runs (string hashes are randomized). Iterating over sorted(MTBUF)
# makes the registration order reproducible.
# Only the gfx7, gfx9 and gfx10 columns are forwarded to opcode(); gfx6 and
# gfx8 are unpacked but not used.
for (gfx6, gfx7, gfx8, gfx9, gfx10, name) in sorted(MTBUF):
    opcode(name, gfx7, gfx9, gfx10, Format.MTBUF)
1354
1355
# MIMG load/store/query instructions.
# Row layout: (code, name). The same code is registered for all three
# opcode columns that opcode() takes.
IMAGE = {
   (0x00, "image_load"),
   (0x01, "image_load_mip"),
   (0x02, "image_load_pck"),
   (0x03, "image_load_pck_sgn"),
   (0x04, "image_load_mip_pck"),
   (0x05, "image_load_mip_pck_sgn"),
   (0x08, "image_store"),
   (0x09, "image_store_mip"),
   (0x0a, "image_store_pck"),
   (0x0b, "image_store_mip_pck"),
   (0x0e, "image_get_resinfo"),
   (0x60, "image_get_lod"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
# IMAGE is a set, so plain iteration order is arbitrary and may differ between
# interpreter runs (string hashes are randomized). Iterating over sorted(IMAGE)
# makes the registration order reproducible.
for (code, name) in sorted(IMAGE):
   opcode(name, code, code, code, Format.MIMG)

# Registered separately because it only has an opcode in the last (gfx10) column.
opcode("image_msaa_load", -1, -1, 0x80, Format.MIMG) #GFX10.3+
1375
# MIMG atomic instructions.
# Row layout: (gfx6, gfx7, gfx89, name), where gfx89 is the opcode shared by
# gfx8 and gfx9. -1 is written where no opcode number is listed.
IMAGE_ATOMIC = {
   (0x0f, 0x0f, 0x10, "image_atomic_swap"),
   (0x10, 0x10, 0x11, "image_atomic_cmpswap"),
   (0x11, 0x11, 0x12, "image_atomic_add"),
   (0x12, 0x12, 0x13, "image_atomic_sub"),
   (0x13,   -1,   -1, "image_atomic_rsub"),
   (0x14, 0x14, 0x14, "image_atomic_smin"),
   (0x15, 0x15, 0x15, "image_atomic_umin"),
   (0x16, 0x16, 0x16, "image_atomic_smax"),
   (0x17, 0x17, 0x17, "image_atomic_umax"),
   (0x18, 0x18, 0x18, "image_atomic_and"),
   (0x19, 0x19, 0x19, "image_atomic_or"),
   (0x1a, 0x1a, 0x1a, "image_atomic_xor"),
   (0x1b, 0x1b, 0x1b, "image_atomic_inc"),
   (0x1c, 0x1c, 0x1c, "image_atomic_dec"),
   (0x1d, 0x1d,   -1, "image_atomic_fcmpswap"),
   (0x1e, 0x1e,   -1, "image_atomic_fmin"),
   (0x1f, 0x1f,   -1, "image_atomic_fmax"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (gfx6, gfx7, gfx89, gfx89, gfx7, name)
# gfx7 and gfx10 opcodes are the same here, so gfx7 is passed for both.
# IMAGE_ATOMIC is a set, so plain iteration order is arbitrary and may differ
# between interpreter runs (string hashes are randomized). Iterating over
# sorted(IMAGE_ATOMIC) makes the registration order reproducible.
# gfx6 is unpacked but not used.
for (gfx6, gfx7, gfx89, name) in sorted(IMAGE_ATOMIC):
   opcode(name, gfx7, gfx89, gfx7, Format.MIMG, is_atomic = True)
1399
# MIMG sample instructions.
# Row layout: (code, name). The same code is registered for all three
# opcode columns that opcode() takes.
IMAGE_SAMPLE = {
   (0x20, "image_sample"),
   (0x21, "image_sample_cl"),
   (0x22, "image_sample_d"),
   (0x23, "image_sample_d_cl"),
   (0x24, "image_sample_l"),
   (0x25, "image_sample_b"),
   (0x26, "image_sample_b_cl"),
   (0x27, "image_sample_lz"),
   (0x28, "image_sample_c"),
   (0x29, "image_sample_c_cl"),
   (0x2a, "image_sample_c_d"),
   (0x2b, "image_sample_c_d_cl"),
   (0x2c, "image_sample_c_l"),
   (0x2d, "image_sample_c_b"),
   (0x2e, "image_sample_c_b_cl"),
   (0x2f, "image_sample_c_lz"),
   (0x30, "image_sample_o"),
   (0x31, "image_sample_cl_o"),
   (0x32, "image_sample_d_o"),
   (0x33, "image_sample_d_cl_o"),
   (0x34, "image_sample_l_o"),
   (0x35, "image_sample_b_o"),
   (0x36, "image_sample_b_cl_o"),
   (0x37, "image_sample_lz_o"),
   (0x38, "image_sample_c_o"),
   (0x39, "image_sample_c_cl_o"),
   (0x3a, "image_sample_c_d_o"),
   (0x3b, "image_sample_c_d_cl_o"),
   (0x3c, "image_sample_c_l_o"),
   (0x3d, "image_sample_c_b_o"),
   (0x3e, "image_sample_c_b_cl_o"),
   (0x3f, "image_sample_c_lz_o"),
   (0x68, "image_sample_cd"),
   (0x69, "image_sample_cd_cl"),
   (0x6a, "image_sample_c_cd"),
   (0x6b, "image_sample_c_cd_cl"),
   (0x6c, "image_sample_cd_o"),
   (0x6d, "image_sample_cd_cl_o"),
   (0x6e, "image_sample_c_cd_o"),
   (0x6f, "image_sample_c_cd_cl_o"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
# IMAGE_SAMPLE is a set, so plain iteration order is arbitrary and may differ
# between interpreter runs (string hashes are randomized). Iterating over
# sorted(IMAGE_SAMPLE) makes the registration order reproducible.
for (code, name) in sorted(IMAGE_SAMPLE):
   opcode(name, code, code, code, Format.MIMG)
1445
# MIMG gather4 instructions.
# Row layout: (code, name). The same code is registered for all three
# opcode columns that opcode() takes. Commented-out rows are kept as notes
# on encodings that are intentionally not registered.
IMAGE_GATHER4 = {
   (0x40, "image_gather4"),
   (0x41, "image_gather4_cl"),
   #(0x42, "image_gather4h"), VEGA only?
   (0x44, "image_gather4_l"), # following instructions have different opcodes according to ISA sheet.
   (0x45, "image_gather4_b"),
   (0x46, "image_gather4_b_cl"),
   (0x47, "image_gather4_lz"),
   (0x48, "image_gather4_c"),
   (0x49, "image_gather4_c_cl"), # previous instructions have different opcodes according to ISA sheet.
   #(0x4a, "image_gather4h_pck"), VEGA only?
   #(0x4b, "image_gather8h_pck"), VEGA only?
   (0x4c, "image_gather4_c_l"),
   (0x4d, "image_gather4_c_b"),
   (0x4e, "image_gather4_c_b_cl"),
   (0x4f, "image_gather4_c_lz"),
   (0x50, "image_gather4_o"),
   (0x51, "image_gather4_cl_o"),
   (0x54, "image_gather4_l_o"),
   (0x55, "image_gather4_b_o"),
   (0x56, "image_gather4_b_cl_o"),
   (0x57, "image_gather4_lz_o"),
   (0x58, "image_gather4_c_o"),
   (0x59, "image_gather4_c_cl_o"),
   (0x5c, "image_gather4_c_l_o"),
   (0x5d, "image_gather4_c_b_o"),
   (0x5e, "image_gather4_c_b_cl_o"),
   (0x5f, "image_gather4_c_lz_o"),
}
# (gfx6, gfx7, gfx8, gfx9, gfx10, name) = (code, code, code, code, code, name)
# IMAGE_GATHER4 is a set, so plain iteration order is arbitrary and may differ
# between interpreter runs (string hashes are randomized). Iterating over
# sorted(IMAGE_GATHER4) makes the registration order reproducible.
for (code, name) in sorted(IMAGE_GATHER4):
   opcode(name, code, code, code, Format.MIMG)
1478
1479
# FLAT instructions.
# Row layout: (gfx7, gfx8_9, gfx10, name), where gfx8_9 is the opcode shared
# by gfx8 and gfx9. -1 is written where no opcode number is listed.
FLAT = {
   #GFX7, GFX8_9, GFX10
   (0x08, 0x10, 0x08, "flat_load_ubyte"),
   (0x09, 0x11, 0x09, "flat_load_sbyte"),
   (0x0a, 0x12, 0x0a, "flat_load_ushort"),
   (0x0b, 0x13, 0x0b, "flat_load_sshort"),
   (0x0c, 0x14, 0x0c, "flat_load_dword"),
   (0x0d, 0x15, 0x0d, "flat_load_dwordx2"),
   (0x0f, 0x16, 0x0f, "flat_load_dwordx3"),
   (0x0e, 0x17, 0x0e, "flat_load_dwordx4"),
   (0x18, 0x18, 0x18, "flat_store_byte"),
   (  -1, 0x19, 0x19, "flat_store_byte_d16_hi"),
   (0x1a, 0x1a, 0x1a, "flat_store_short"),
   (  -1, 0x1b, 0x1b, "flat_store_short_d16_hi"),
   (0x1c, 0x1c, 0x1c, "flat_store_dword"),
   (0x1d, 0x1d, 0x1d, "flat_store_dwordx2"),
   (0x1f, 0x1e, 0x1f, "flat_store_dwordx3"),
   (0x1e, 0x1f, 0x1e, "flat_store_dwordx4"),
   (  -1, 0x20, 0x20, "flat_load_ubyte_d16"),
   (  -1, 0x21, 0x21, "flat_load_ubyte_d16_hi"),
   (  -1, 0x22, 0x22, "flat_load_sbyte_d16"),
   (  -1, 0x23, 0x23, "flat_load_sbyte_d16_hi"),
   (  -1, 0x24, 0x24, "flat_load_short_d16"),
   (  -1, 0x25, 0x25, "flat_load_short_d16_hi"),
   (0x30, 0x40, 0x30, "flat_atomic_swap"),
   (0x31, 0x41, 0x31, "flat_atomic_cmpswap"),
   (0x32, 0x42, 0x32, "flat_atomic_add"),
   (0x33, 0x43, 0x33, "flat_atomic_sub"),
   (0x35, 0x44, 0x35, "flat_atomic_smin"),
   (0x36, 0x45, 0x36, "flat_atomic_umin"),
   (0x37, 0x46, 0x37, "flat_atomic_smax"),
   (0x38, 0x47, 0x38, "flat_atomic_umax"),
   (0x39, 0x48, 0x39, "flat_atomic_and"),
   (0x3a, 0x49, 0x3a, "flat_atomic_or"),
   (0x3b, 0x4a, 0x3b, "flat_atomic_xor"),
   (0x3c, 0x4b, 0x3c, "flat_atomic_inc"),
   (0x3d, 0x4c, 0x3d, "flat_atomic_dec"),
   (0x3e,   -1, 0x3e, "flat_atomic_fcmpswap"),
   (0x3f,   -1, 0x3f, "flat_atomic_fmin"),
   (0x40,   -1, 0x40, "flat_atomic_fmax"),
   (0x50, 0x60, 0x50, "flat_atomic_swap_x2"),
   (0x51, 0x61, 0x51, "flat_atomic_cmpswap_x2"),
   (0x52, 0x62, 0x52, "flat_atomic_add_x2"),
   (0x53, 0x63, 0x53, "flat_atomic_sub_x2"),
   (0x55, 0x64, 0x55, "flat_atomic_smin_x2"),
   (0x56, 0x65, 0x56, "flat_atomic_umin_x2"),
   (0x57, 0x66, 0x57, "flat_atomic_smax_x2"),
   (0x58, 0x67, 0x58, "flat_atomic_umax_x2"),
   (0x59, 0x68, 0x59, "flat_atomic_and_x2"),
   (0x5a, 0x69, 0x5a, "flat_atomic_or_x2"),
   (0x5b, 0x6a, 0x5b, "flat_atomic_xor_x2"),
   (0x5c, 0x6b, 0x5c, "flat_atomic_inc_x2"),
   (0x5d, 0x6c, 0x5d, "flat_atomic_dec_x2"),
   (0x5e,   -1, 0x5e, "flat_atomic_fcmpswap_x2"),
   (0x5f,   -1, 0x5f, "flat_atomic_fmin_x2"),
   (0x60,   -1, 0x60, "flat_atomic_fmax_x2"),
}
# FLAT is a set, so plain iteration order is arbitrary and may differ between
# interpreter runs (string hashes are randomized). Iterating over sorted(FLAT)
# makes the registration order reproducible.
# The middle column (gfx8 here) is the shared GFX8_9 opcode. An instruction is
# flagged atomic when its name contains "atomic".
for (gfx7, gfx8, gfx10, name) in sorted(FLAT):
    opcode(name, gfx7, gfx8, gfx10, Format.FLAT, is_atomic = "atomic" in name)
1539
# GLOBAL instructions.
# Row layout: (gfx8_9, gfx10, name), where gfx8_9 is the opcode shared by
# gfx8 and gfx9. -1 is written where no opcode number is listed.
GLOBAL = {
   #GFX8_9, GFX10
   (0x10, 0x08, "global_load_ubyte"),
   (0x11, 0x09, "global_load_sbyte"),
   (0x12, 0x0a, "global_load_ushort"),
   (0x13, 0x0b, "global_load_sshort"),
   (0x14, 0x0c, "global_load_dword"),
   (0x15, 0x0d, "global_load_dwordx2"),
   (0x16, 0x0f, "global_load_dwordx3"),
   (0x17, 0x0e, "global_load_dwordx4"),
   (0x18, 0x18, "global_store_byte"),
   (0x19, 0x19, "global_store_byte_d16_hi"),
   (0x1a, 0x1a, "global_store_short"),
   (0x1b, 0x1b, "global_store_short_d16_hi"),
   (0x1c, 0x1c, "global_store_dword"),
   (0x1d, 0x1d, "global_store_dwordx2"),
   (0x1e, 0x1f, "global_store_dwordx3"),
   (0x1f, 0x1e, "global_store_dwordx4"),
   (0x20, 0x20, "global_load_ubyte_d16"),
   (0x21, 0x21, "global_load_ubyte_d16_hi"),
   (0x22, 0x22, "global_load_sbyte_d16"),
   (0x23, 0x23, "global_load_sbyte_d16_hi"),
   (0x24, 0x24, "global_load_short_d16"),
   (0x25, 0x25, "global_load_short_d16_hi"),
   (0x40, 0x30, "global_atomic_swap"),
   (0x41, 0x31, "global_atomic_cmpswap"),
   (0x42, 0x32, "global_atomic_add"),
   (0x43, 0x33, "global_atomic_sub"),
   (0x44, 0x35, "global_atomic_smin"),
   (0x45, 0x36, "global_atomic_umin"),
   (0x46, 0x37, "global_atomic_smax"),
   (0x47, 0x38, "global_atomic_umax"),
   (0x48, 0x39, "global_atomic_and"),
   (0x49, 0x3a, "global_atomic_or"),
   (0x4a, 0x3b, "global_atomic_xor"),
   (0x4b, 0x3c, "global_atomic_inc"),
   (0x4c, 0x3d, "global_atomic_dec"),
   (  -1, 0x3e, "global_atomic_fcmpswap"),
   (  -1, 0x3f, "global_atomic_fmin"),
   (  -1, 0x40, "global_atomic_fmax"),
   (0x60, 0x50, "global_atomic_swap_x2"),
   (0x61, 0x51, "global_atomic_cmpswap_x2"),
   (0x62, 0x52, "global_atomic_add_x2"),
   (0x63, 0x53, "global_atomic_sub_x2"),
   (0x64, 0x55, "global_atomic_smin_x2"),
   (0x65, 0x56, "global_atomic_umin_x2"),
   (0x66, 0x57, "global_atomic_smax_x2"),
   (0x67, 0x58, "global_atomic_umax_x2"),
   (0x68, 0x59, "global_atomic_and_x2"),
   (0x69, 0x5a, "global_atomic_or_x2"),
   (0x6a, 0x5b, "global_atomic_xor_x2"),
   (0x6b, 0x5c, "global_atomic_inc_x2"),
   (0x6c, 0x5d, "global_atomic_dec_x2"),
   (  -1, 0x5e, "global_atomic_fcmpswap_x2"),
   (  -1, 0x5f, "global_atomic_fmin_x2"),
   (  -1, 0x60, "global_atomic_fmax_x2"),
   (  -1, 0x16, "global_load_dword_addtid"), #GFX10.3+
   (  -1, 0x17, "global_store_dword_addtid"), #GFX10.3+
   (  -1, 0x34, "global_atomic_csub"), #GFX10.3+. seems glc must be set
}
# GLOBAL is a set, so plain iteration order is arbitrary and may differ between
# interpreter runs (string hashes are randomized). Iterating over
# sorted(GLOBAL) makes the registration order reproducible.
# The first opcode column passed to opcode() is always -1 for these
# instructions. An instruction is flagged atomic when its name contains
# "atomic".
for (gfx8, gfx10, name) in sorted(GLOBAL):
    opcode(name, -1, gfx8, gfx10, Format.GLOBAL, is_atomic = "atomic" in name)
1602
# SCRATCH instructions.
# Row layout: (gfx8_9, gfx10, name), where gfx8_9 is the opcode shared by
# gfx8 and gfx9.
SCRATCH = {
   #GFX8_9, GFX10
   (0x10, 0x08, "scratch_load_ubyte"),
   (0x11, 0x09, "scratch_load_sbyte"),
   (0x12, 0x0a, "scratch_load_ushort"),
   (0x13, 0x0b, "scratch_load_sshort"),
   (0x14, 0x0c, "scratch_load_dword"),
   (0x15, 0x0d, "scratch_load_dwordx2"),
   (0x16, 0x0f, "scratch_load_dwordx3"),
   (0x17, 0x0e, "scratch_load_dwordx4"),
   (0x18, 0x18, "scratch_store_byte"),
   (0x19, 0x19, "scratch_store_byte_d16_hi"),
   (0x1a, 0x1a, "scratch_store_short"),
   (0x1b, 0x1b, "scratch_store_short_d16_hi"),
   (0x1c, 0x1c, "scratch_store_dword"),
   (0x1d, 0x1d, "scratch_store_dwordx2"),
   (0x1e, 0x1f, "scratch_store_dwordx3"),
   (0x1f, 0x1e, "scratch_store_dwordx4"),
   (0x20, 0x20, "scratch_load_ubyte_d16"),
   (0x21, 0x21, "scratch_load_ubyte_d16_hi"),
   (0x22, 0x22, "scratch_load_sbyte_d16"),
   (0x23, 0x23, "scratch_load_sbyte_d16_hi"),
   (0x24, 0x24, "scratch_load_short_d16"),
   (0x25, 0x25, "scratch_load_short_d16_hi"),
}
# SCRATCH is a set, so plain iteration order is arbitrary and may differ
# between interpreter runs (string hashes are randomized). Iterating over
# sorted(SCRATCH) makes the registration order reproducible.
# The first opcode column passed to opcode() is always -1 for these
# instructions.
for (gfx8, gfx10, name) in sorted(SCRATCH):
    opcode(name, -1, gfx8, gfx10, Format.SCRATCH)
1630
# Sanity check: within one instruction format, two registered opcodes must not
# share the same opcode number on the same hardware generation. On a clash the
# offending pair is printed and the script exits with status 1.
for ver in ['gfx9', 'gfx10']:
    # Maps (format, opcode number) -> name of the first opcode seen with it.
    op_to_name = {}
    for op in opcodes.values():
        # Pseudo instructions are excluded from the check.
        if op.format in (Format.PSEUDO, Format.PSEUDO_BRANCH,
                         Format.PSEUDO_BARRIER, Format.PSEUDO_REDUCTION):
            continue

        # Opcodes whose number for this generation is -1 are skipped.
        num = getattr(op, 'opcode_' + ver)
        if num == -1:
            continue

        key = (op.format, num)
        if key not in op_to_name:
            # First opcode with this (format, number): remember it and move on.
            op_to_name[key] = op.name
            continue

        # A second opcode maps to the same (format, number). Two known pairs
        # are tolerated:
        #  - v_mul_lo_i32 / v_mul_lo_u32 on gfx8/gfx9
        #  - v_mad_legacy_f32 is replaced with v_fma_legacy_f32 on GFX10.3
        names = {op_to_name[key], op.name}
        if ((ver in ('gfx8', 'gfx9') and names == {'v_mul_lo_i32', 'v_mul_lo_u32'}) or
                (ver == 'gfx10' and names == {'v_mad_legacy_f32', 'v_fma_legacy_f32'})):
            continue

        print('%s and %s share the same opcode number (%s)' % (op_to_name[key], op.name, ver))
        sys.exit(1)
1657