• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Connor Abbott (cwabbott0@gmail.com)
25
26import re
27
28# Class that represents all the information we have about the opcode
29# NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr,
                description = ""):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size and every entry of input_sizes must be 0-5, 8 or 16; when
        output_size is non-zero, no input size may be 0
      - all types are strings that get nir_type_ prepended to them
      - input_sizes and input_types are non-empty lists of equal length, one
        entry per source
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.
      - description is an optional description of the opcode for
        documentation; it defaults to the empty string.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      Raises AssertionError when any of the constraints above is violated.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_types, list)
      assert len(input_sizes) > 0, "an opcode needs at least one input"
      # Check every element, not just the first one, so that a stray
      # non-string type or non-integer size cannot slip into the table.
      assert all(isinstance(size, int) for size in input_sizes)
      assert all(isinstance(ty, str) for ty in input_types)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 5 or (output_size == 8) or (output_size == 16)
      for size in input_sizes:
         assert 0 <= size <= 5 or (size == 8) or (size == 16)
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
      self.description = description
95
# helper variables for strings
#
# Each constant holds a NIR type string.  The names without a trailing
# number ("float", "int", "bool", "uint") carry no bit size; type_sizes()
# expands those to a list of bit sizes.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
115
# Splits a NIR type string into its base type and an optional bit size.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def _split_type(type_):
    """Return the (base type, bit-size digits or None) pair of a type string.

    Asserts that the string starts with one of the known base types.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed.group('type'), parsed.group('bits')

def type_has_size(type_):
    """Tell whether the type string carries an explicit bit size."""
    _, bits = _split_type(type_)
    return bits is not None

def type_size(type_):
    """Return the bit size of a sized type string as an int.

    Asserts that the string is a valid type and that it has a bit size.
    """
    _, bits = _split_type(type_)
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)

def type_sizes(type_):
    """Return the list of bit sizes a type string stands for.

    A sized type yields a one-element list.  Unsized 'bool' and 'float'
    have their own size lists; any other unsized type gets the integer list.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    # Built on every call so each caller receives a fresh list.
    unsized = {
        'bool': [1, 8, 16, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])

def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool') of a type string."""
    base, _ = _split_type(type_)
    return base
144
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Every property string ends with a space so that several of them can be
# joined with "+" into one space-separated algebraic_properties string.
_2src_commutative = "2src_commutative "
associative = "associative "
selection = "selection "
153
# global dictionary of opcodes, keyed by opcode name
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr, description = ""):
   """Build an Opcode from the arguments and store it in ``opcodes``.

   Asserts that no opcode with the same name has been registered before.
   """
   assert name not in opcodes
   info = Opcode(name=name,
                 output_size=output_size,
                 output_type=output_type,
                 input_sizes=input_sizes,
                 input_types=input_types,
                 is_conversion=is_conversion,
                 algebraic_properties=algebraic_properties,
                 const_expr=const_expr,
                 description=description)
   opcodes[name] = info
163
def unop_convert(name, out_type, in_type, const_expr, description = ""):
   """Register a one-source opcode with separate output and input types.

   Output and input sizes are both 0, no algebraic properties are set and
   the opcode is not flagged as a conversion.
   """
   opcode(name,
          output_size=0,
          output_type=out_type,
          input_sizes=[0],
          input_types=[in_type],
          is_conversion=False,
          algebraic_properties="",
          const_expr=const_expr,
          description=description)
166
def unop(name, ty, const_expr, description = "", algebraic_properties = ""):
   """Register a one-source opcode whose source and result share type ``ty``.

   Output and input sizes are both 0 and the opcode is not flagged as a
   conversion.
   """
   opcode(name,
          output_size=0,
          output_type=ty,
          input_sizes=[0],
          input_types=[ty],
          is_conversion=False,
          algebraic_properties=algebraic_properties,
          const_expr=const_expr,
          description=description)
170
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr, description = ""):
   """Register a one-source opcode with explicit output and input sizes.

   No algebraic properties are set and the opcode is not flagged as a
   conversion.
   """
   opcode(name,
          output_size=output_size,
          output_type=output_type,
          input_sizes=[input_size],
          input_types=[input_type],
          is_conversion=False,
          algebraic_properties="",
          const_expr=const_expr,
          description=description)
175
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr, description = ""):
   """Register the 2-, 3- and 4-component variants of a reducing opcode.

   - prereduce_expr is formatted with ``src`` for each component of src0 and
     wrapped in parentheses
   - reduce_expr is formatted with ``src0`` and ``src1`` to combine two
     already-built expressions
   - final_expr is formatted with ``src`` set to the parenthesised result

   The registered names are ``name`` followed by "2", "3" and "4".
   """
   terms = ["(" + prereduce_expr.format(src="src0." + component) + ")"
            for component in "xyzw"]

   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def emit(width, reduced):
      # Parenthesise the reduction before handing it to final_expr.
      finished = final_expr.format(src="(" + reduced + ")")
      unop_horiz(name + str(width), output_size, output_type, width,
                 input_type, finished, description)

   emit(2, combine(terms[0], terms[1]))
   emit(3, combine(combine(terms[0], terms[1]), terms[2]))
   emit(4, combine(combine(terms[0], terms[1]), combine(terms[2], terms[3])))
195
def unop_numeric_convert(name, out_type, in_type, const_expr, description = ""):
   """Register a one-source opcode that is flagged as a type conversion.

   Output and input sizes are both 0 and no algebraic properties are set.
   """
   opcode(name,
          output_size=0,
          output_type=out_type,
          input_sizes=[0],
          input_types=[in_type],
          is_conversion=True,
          algebraic_properties="",
          const_expr=const_expr,
          description=description)
198
# Basic unary opcodes.  unop() registers an opcode whose single source and
# result share the same type.
unop("mov", tuint, "src0")

# The value equal to u_intN_min(bit_size) is returned unchanged instead of
# being negated (presumably because the most negative integer has no
# representable negation).
unop("ineg", tint, "src0 == u_intN_min(bit_size) ? src0 : -src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0", description = "Invert every bit of the integer")

# The two branches are identical apart from using double literals for 64-bit
# sources and float literals otherwise.  NaN folds to zero and a zero source
# is returned as-is, which keeps its sign.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "(isnan(src0) ? 0.0  : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0  : -1.0 )) : " +
                       "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"),
     description = """
Roughly implements the OpenGL / Vulkan rules for ``sign(float)``.
The ``GLSL.std.450 FSign`` instruction is defined as:

    Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.

If the source is equal to zero, there is a preference for the result to have
the same sign, but this is not required (it is required by OpenCL).  If the
source is not a number, there is a preference for the result to be +0.0, but
this is not required (it is required by OpenCL).  If the source is not a
number, and the result is not +0.0, the result should definitely **not** be
NaN.

The values returned for constant folding match the behavior required by
OpenCL.
     """)

unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# fsat clamps the source to the range [0, 1].
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
# The next three pick the double or the float C function/literal depending
# on bit_size.
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# fexp2 and flog2 always call the single-precision C functions, whatever the
# bit size.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
234
# Generate all of the numeric conversion opcodes
#
# Opcode names have the form <src initial>2<dst initial><dst bit size>, for
# example "f2i32".  Conversions from float to float16 additionally get the
# "_rtne" and "_rtz" suffixed variants with a fixed rounding mode.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that are generated for each source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16: one opcode per rounding mode.  The variant
              # with the empty suffix picks the rounding mode from
              # execution_mode when the expression is evaluated.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 32) {
                         dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                      } else if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 32) {
                         dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                      } else if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = """
                      if (bit_size > 32) {
                         if (nir_is_rounding_mode_rtz(execution_mode, 16))
                            dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                         else
                            dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                      } else if (bit_size > 16) {
                         if (nir_is_rounding_mode_rtz(execution_mode, 16))
                            dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                         else
                            dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32: only a source wider than 32 bits under
              # round-to-zero needs explicit rounding.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Everything else is a plain assignment, except that conversion
              # to bool tests the source against zero.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
310
def unop_numeric_convert_mp(base, src_t, dst_t):
    """Register ``<base>mp``, reusing the constant expression of ``<base>16``.

    Beware of the parameter names: ``src_t`` is forwarded as the *output*
    type of the new opcode and ``dst_t`` as its *input* type.

    Raises KeyError when ``<base>16`` has not been registered yet.
    """
    reference_op = base + "16"
    doc = """
Special opcode that is the same as :nir:alu-op:`{}` except that it is safe to
remove it if the result is immediately converted back to 32 bits again. This is
generated as part of the precision lowering pass. ``mp`` stands for medium
precision.
                         """.format(reference_op)
    unop_numeric_convert(base + "mp", src_t, dst_t,
                         opcodes[reference_op].const_expr,
                         description = doc)
320
# Medium-precision conversions.  The arguments are the base name, the output
# type and the input type of the new "<base>mp" opcode.
unop_numeric_convert_mp("f2f", tfloat16, tfloat32)
unop_numeric_convert_mp("i2i", tint16, tint32)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert_mp("f2i", tint16, tfloat32)
unop_numeric_convert_mp("f2u", tuint16, tfloat32)
unop_numeric_convert_mp("i2f", tfloat16, tint32)
unop_numeric_convert_mp("u2f", tfloat16, tuint32)
328
# Unary floating-point rounding operations.
#
# Each expression picks the double or the float variant of the C function
# depending on bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Sources whose magnitude is below 2^-14 become a zero that carries the sign
# of the source; everything else is converted to half precision and back.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
#
# frexp_exp keeps only the exponent that frexp() writes through its pointer
# argument; frexp_sig keeps only the value frexp() returns.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
349
350# Floating point pack and unpack operations.
351
def pack_2x16(fmt, in_type):
   """Register ``pack_<fmt>_2x16``: two ``in_type`` components into one uint32.

   The x component is packed into the low 16 bits and y into the high 16
   bits, each through ``pack_<fmt>_1x16``.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
   # Every "fmt" placeholder in the template becomes the format name.
   unop_horiz(name="pack_" + fmt + "_2x16",
              output_size=1,
              output_type=tuint32,
              input_size=2,
              input_type=in_type,
              const_expr=template.replace("fmt", fmt))
357
def pack_4x8(fmt):
   """Register ``pack_<fmt>_4x8``: four float32 components into one uint32.

   Components x, y, z and w are packed through ``pack_<fmt>_1x8`` into
   successive bytes, starting with the lowest one.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
   # Every "fmt" placeholder in the template becomes the format name.
   unop_horiz(name="pack_" + fmt + "_4x8",
              output_size=1,
              output_type=tuint32,
              input_size=4,
              input_type=tfloat32,
              const_expr=template.replace("fmt", fmt))
365
def unpack_2x16(fmt):
   """Register ``unpack_<fmt>_2x16``: one uint32 into two float32 components.

   dst.x is unpacked from the low 16 bits of the source and dst.y from the
   high 16 bits, each through ``unpack_<fmt>_1x16``.  This mirrors
   pack_2x16(), which stores y in the high half.
   """
   # The high half must be shifted *down* before the cast to uint16_t; a left
   # shift by 16 would leave the low 16 bits zero, so dst.y would always be
   # unpacked from 0.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
371
def unpack_4x8(fmt):
   """Register ``unpack_<fmt>_4x8``: one uint32 into four float32 components.

   Components x, y, z and w are unpacked through ``unpack_<fmt>_1x8`` from
   successive bytes of the source, starting with the lowest one.
   """
   template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
   # Every "fmt" placeholder in the template becomes the format name.
   unop_horiz(name="unpack_" + fmt + "_4x8",
              output_size=4,
              output_type=tfloat32,
              input_size=1,
              input_type=tuint32,
              const_expr=template.replace("fmt", fmt))
379
380
# Instantiate the pack/unpack helpers for the snorm, unorm and half formats.
# Note that only the pack direction is generated for "half" here;
# unpack_half_2x16 is registered separately with its own expression.
pack_2x16("snorm", tfloat)
pack_4x8("snorm")
pack_2x16("unorm", tfloat)
pack_4x8("unorm")
pack_2x16("half", tfloat32)
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
390
# Integer packing with clamping: each source component is passed through
# _mesa_unsigned_to_unsigned / _mesa_signed_to_signed with a width of 16
# before being placed in its half of the result.
unop_horiz("pack_uint_2x16", 1, tuint32, 2, tuint32, """
dst.x = _mesa_unsigned_to_unsigned(src0.x, 16);
dst.x |= _mesa_unsigned_to_unsigned(src0.y, 16) << 16;
""", description = """
Convert two unsigned integers into a packed unsigned short (clamp is applied).
""")

unop_horiz("pack_sint_2x16", 1, tint32, 2, tint32, """
dst.x = _mesa_signed_to_signed(src0.x, 16) & 0xffff;
dst.x |= _mesa_signed_to_signed(src0.y, 16) << 16;
""", description = """
Convert two signed integers into a packed signed short (clamp is applied).
""")

# Packing without clamping: the x component is masked to 16 bits, y is only
# shifted.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# The components are shifted into place without masking.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Bit-exact packing of narrow integers into a wider one; x always ends up in
# the lowest bits.  The casts widen each component before it is shifted.
unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Bit-exact unpacking: the expressions contain no explicit masks, each
# component is just the source shifted right and assigned to the narrower
# destination.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")

# x comes from the low 16 bits, y from the high 16 bits.  The second argument
# of unpack_half_1x16 is the 16-bit denorm-flush-to-zero setting taken from
# execution_mode.
unop_horiz("unpack_half_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16((uint16_t)(src0.x & 0xffff), nir_is_denorm_flush_to_zero(execution_mode, 16));
dst.y = unpack_half_1x16((uint16_t)(src0.x >> 16), nir_is_denorm_flush_to_zero(execution_mode, 16));
""")
444
# Lowered floating point unpacking operations.
#
# The "_split_x" opcodes read the low half of the source and the "_split_y"
# opcodes read the high half.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff), nir_is_denorm_flush_to_zero(execution_mode, 16))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16), nir_is_denorm_flush_to_zero(execution_mode, 16))")


unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
458
# Bit operations, part of ARB_gpu_shader5.
#
# All of them are written as simple bit-by-bit loops.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, counted from the least significant bit;
# -1 when no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Like ufind_msb, but the result is the shift count that brings the highest
# set bit to bit 31; -1 when no bit is set.
# NOTE(review): the loop bound uses bit_size while the mask is the fixed
# 32-bit value 0x80000000 - confirm the intended behavior for other sizes.
unop_convert("ufind_msb_rev", tint32, tuint, """
dst = -1;
for (int bit = 0; bit < bit_size; bit++) {
   if ((src0 << bit) & 0x80000000) {
      dst = bit;
      break;
   }
}
""")

# Count of leading zero bits.  When no bit is set the loop ends with
# bit == -1, so the result is bit_size.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(bit_size - bit - 1);
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb_rev", tint32, """
dst = -1;
/* We are looking for the highest bit that's not the same as the sign bit. */
uint32_t sign = src0 & 0x80000000u;
for (int bit = 0; bit < 32; bit++) {
   if (((src0 << bit) & 0x80000000u) != sign) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit; -1 when no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Registers fsum2, fsum3 and fsum4: the components are used as they are,
# combined with "+", and the sum is returned without a final transformation.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}",
            description = "Sum of vector components")
543
def binop_convert(name, out_type, in_type1, alg_props, const_expr, description="", in_type2=None):
   """Register a two-source opcode with separate output and input types.

   The second source uses ``in_type1`` unless ``in_type2`` is given.  All
   sizes are 0 and the opcode is not flagged as a conversion.
   """
   second_type = in_type1 if in_type2 is None else in_type2
   opcode(name,
          output_size=0,
          output_type=out_type,
          input_sizes=[0, 0],
          input_types=[in_type1, second_type],
          is_conversion=False,
          algebraic_properties=alg_props,
          const_expr=const_expr,
          description=description)
549
def binop(name, ty, alg_props, const_expr, description = ""):
   """Register a two-source opcode whose sources and result all have type ``ty``."""
   binop_convert(name,
                 out_type=ty,
                 in_type1=ty,
                 alg_props=alg_props,
                 const_expr=const_expr,
                 description=description)
552
def binop_compare(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a ``bool1`` result.

   The first source has type ``ty``; ``ty2`` is forwarded as the optional
   type of the second source.
   """
   binop_convert(name,
                 out_type=tbool1,
                 in_type1=ty,
                 alg_props=alg_props,
                 const_expr=const_expr,
                 description=description,
                 in_type2=ty2)
555
def binop_compare8(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a ``bool8`` result.

   The first source has type ``ty``; ``ty2`` is forwarded as the optional
   type of the second source.
   """
   binop_convert(name,
                 out_type=tbool8,
                 in_type1=ty,
                 alg_props=alg_props,
                 const_expr=const_expr,
                 description=description,
                 in_type2=ty2)
558
def binop_compare16(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a ``bool16`` result.

   The first source has type ``ty``; ``ty2`` is forwarded as the optional
   type of the second source.
   """
   binop_convert(name,
                 out_type=tbool16,
                 in_type1=ty,
                 alg_props=alg_props,
                 const_expr=const_expr,
                 description=description,
                 in_type2=ty2)
561
def binop_compare32(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a ``bool32`` result.

   The first source has type ``ty``; ``ty2`` is forwarded as the optional
   type of the second source.
   """
   binop_convert(name,
                 out_type=tbool32,
                 in_type1=ty,
                 alg_props=alg_props,
                 const_expr=const_expr,
                 description=description,
                 in_type2=ty2)
564
def binop_compare_all_sizes(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a comparison once for each boolean result size.

   The 1-bit variant keeps ``name``; the 8-, 16- and 32-bit variants append
   "8", "16" and "32" to it.
   """
   variants = (
      ("", binop_compare),
      ("8", binop_compare8),
      ("16", binop_compare16),
      ("32", binop_compare32),
   )
   for size_suffix, register in variants:
      register(name + size_suffix, ty, alg_props, const_expr, description, ty2)
570
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr, description = ""):
   """Register a two-source opcode with explicit output and input sizes.

   No algebraic properties are set and the opcode is not flagged as a
   conversion.
   """
   opcode(name,
          output_size=out_size,
          output_type=out_type,
          input_sizes=[src1_size, src2_size],
          input_types=[src1_type, src2_type],
          is_conversion=False,
          algebraic_properties="",
          const_expr=const_expr,
          description=description)
575
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix="", description = ""):
   """Register the reducing two-source opcode for every vector width.

   - prereduce_expr is formatted with ``src0``/``src1`` set to matching
     components of the two sources and wrapped in parentheses
   - reduce_expr is formatted with ``src0``/``src1`` to combine two
     already-built expressions
   - final_expr is formatted with ``src`` set to the parenthesised result

   Opcodes are registered for widths 2, 4, 8 and 16, then 3, then 5.  Each
   is named ``name + <width> + suffix`` and marked as commutative in its
   two sources.
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   # One pre-reduced term per component, for up to 16 components.
   terms = []
   for component in "xyzwefghijklmnop":
      per_component = prereduce_expr.format(src0="src0." + component,
                                            src1="src1." + component)
      terms.append("(" + per_component + ")")

   def fold(first, count):
      # Balanced reduction over terms[first:first + count]; the upper half
      # becomes the first operand.
      if count == 1:
         return terms[first]
      half = count // 2
      upper = fold(first + half, half)
      lower = fold(first, half)
      return combine(upper, lower)

   def emit(width, reduced):
      finished = final_expr.format(src="(" + reduced + ")")
      opcode(name + str(width) + suffix, output_size, output_type,
             [width, width], [src_type, src_type], False, _2src_commutative,
             finished, description)

   for width in (2, 4, 8, 16):
      emit(width, fold(0, width))
   emit(3, combine(combine(terms[2], terms[1]), terms[0]))
   emit(5, combine(terms[4], combine(combine(terms[3], terms[2]),
                                     combine(terms[1], terms[0]))))
601
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr, description = ""):
   """Register a reducing comparison once for each boolean result size.

   The bool1 variant keeps ``name``.  For the bool8, bool16 and bool32
   variants the first character of ``name`` is replaced by "b8", "b16" and
   "b32" respectively.  The remaining arguments are forwarded to
   binop_reduce() unchanged.
   """
   # description has to be passed by keyword: the positional slot after
   # final_expr in binop_reduce() is "suffix", which is appended to every
   # opcode name.
   binop_reduce(name, output_size, tbool1, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
612
# Addition and subtraction.
#
# fadd/fsub: under round-to-zero, 64-bit sources use a dedicated rtz helper;
# narrower sources are computed in double and then converted with
# _mesa_double_to_float_rtz.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
# The addition is carried out on uint64_t casts of the sources.
binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
# Saturating variants: overflow is detected by comparing the wrapped result
# with src0, and the result is replaced by the limit for the bit size.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
         (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
         (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference: the smaller source is always subtracted from the
# larger one.  uabs_isub takes signed sources, produces an unsigned result
# and subtracts on uint64_t casts.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
654
# Multiplication.
#
# fmul follows the same round-to-zero scheme as fadd: a dedicated helper for
# 64-bit sources, a double computation followed by
# _mesa_double_to_float_rtz otherwise.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")

# fmulz is 32-bit only and checks for a zero operand before multiplying.
binop("fmulz", tfloat32, _2src_commutative + associative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
else
   dst = src0 * src1;
""", description = """
Unlike :nir:alu-op:`fmul`, anything (even infinity or NaN) multiplied by zero is
always zero. ``fmulz(0.0, inf)`` and ``fmulz(0.0, nan)`` must be +/-0.0, even
if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If ``SIGNED_ZERO_PRESERVE`` is
used, then the result must be a positive zero if either operand is zero.
""")


binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""", description = "Low 32-bits of signed/unsigned integer multiply")

# Widening multiplies: both 32-bit sources are cast to 64 bits first.
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1",
              description = "Multiply signed 32-bit integers, 64-bit result")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1",
              description = "Multiply unsigned 32-bit integers, 64-bit result")

# High half of the product.  For 64-bit sources the operands are split into
# 32-bit words and multiplied with ubm_mul_u32arr; the result is assembled
# from words 2 and 3 of the product.  Narrower sources are multiplied in
# 64 bits and shifted right by bit_size.
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
    * potential overflow of signed multiply */
   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
}
""", description = "High 32-bits of signed integer multiply")

binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""", description = "High 32-bits of unsigned integer multiply")

# Only the low bit_size / 2 bits of each source take part in the product.
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""", description = "Low 32-bits of unsigned integer multiply")

# The second source is truncated to 16 bits by the cast before multiplying.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with sign extension")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with zero extension")
743
# Division.  The integer variants constant-fold a zero divisor to 0 rather
# than evaluating the division.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
747
# Carry: the sum is smaller than src0 exactly when the unsigned addition
# wrapped around.
binop_convert("uadd_carry", tuint, tuint, _2src_commutative,
              "src0 + src1 < src0",
              description = """
Return an integer (1 or 0) representing the carry resulting from the
addition of the two unsigned arguments.
              """)

# Borrow: set when src0 is smaller than src1.
binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1", description = """
Return an integer (1 or 0) representing the borrow resulting from the
subtraction of the two unsigned arguments.
              """)
759
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
#
# The signed and unsigned variants share the same expression text and differ
# only in the declared type (tint vs. tuint).
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
772
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)      >> 1)
#
# The signed and unsigned variants share the same expression text and differ
# only in the declared type (tint vs. tuint).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
785
# Unsigned modulus; a zero divisor constant-folds to 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

# Both integer variants constant-fold a zero divisor to 0.  imod adds src1 to
# the C remainder when that remainder is non-zero and the operand signs differ.
binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# fmod rounds the quotient with floorf, frem with truncf.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
802
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

# Float comparisons.  fltu/fgeu/fequ/funord explicitly evaluate true when
# either source is NaN (isnan test); fneo/ford explicitly require that neither
# source is NaN.
binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("fltu", tfloat, "", "isnan(src0) || isnan(src1) || src0 < src1")
binop_compare_all_sizes("fgeu", tfloat, "", "isnan(src0) || isnan(src1) || src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("fequ", tfloat, _2src_commutative, "isnan(src0) || isnan(src1) || src0 == src1")
binop_compare_all_sizes("fneo", tfloat, _2src_commutative, "!isnan(src0) && !isnan(src1) && src0 != src1")
binop_compare_all_sizes("funord", tfloat, _2src_commutative, "isnan(src0) || isnan(src1)")
binop_compare_all_sizes("ford", tfloat, _2src_commutative, "!isnan(src0) && !isnan(src1)")
# Signed integer comparisons.
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
# Unsigned integer comparisons.
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
826
# Single-bit tests: src0 is shifted right by src1 masked with (bit_size - 1)
# and the lowest bit is compared against 1 (bitnz) or 0 (bitz).
binop_compare_all_sizes("bitnz", tuint, "", "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x1",
   "only uses the least significant bits like SM5 shifts", tuint32)

binop_compare_all_sizes("bitz", tuint, "", "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x0",
   "only uses the least significant bits like SM5 shifts", tuint32)
832
# integer-aware GLSL-style comparisons that compare floats and ints

# The "all" forms combine per-component equality with &&, the "any" forms
# combine per-component inequality with ||.
binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

# Same reductions as above, with the final expression mapping the reduced
# truth value to 1.0f / 0.0f.
binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
850
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
858
# Text appended to the description of each of the three shift opcodes below.
shift_note = """
SPIRV shifts are undefined for shift-operands >= bitsize,
but SM5 shifts are defined to use only the least significant bits.
The NIR definition is according to the SM5 specification.
"""

# Each shift masks its count (src1, always tuint32) with
# sizeof(src0) * 8 - 1 before shifting.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))",
       description = "Left shift." + shift_note)
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
       description = "Signed right-shift." + shift_note)
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
       description = "Unsigned right-shift." + shift_note)
874
# Constant-folds as a plain right shift by two.
opcode("udiv_aligned_4", 0, tuint, [0], [tuint], False, "",
       "src0 >> 2", description = "Divide a multiple of 4 by 4")

# Rotates: the count is masked with sizeof(src0) * 8 - 1, and the
# complementary shift uses the negated count under the same mask.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")

# shfr: builds a 64-bit value with src0 in the high half and src1 in the low
# half, then shifts it right by src2 masked with rotate_mask.
opcode("shfr", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   uint64_t src = src1 | ((uint64_t)src0 << 32);
   dst = src >> (src2 & rotate_mask);
""")
894
# Description template shared by iand/ior/ixor; {0} is filled with the
# operation name.
bitwise_description = """
Bitwise {0}, also used as a boolean {0} for hardware supporting integers.
"""

binop("iand", tuint, _2src_commutative + associative, "src0 & src1",
      description = bitwise_description.format("AND"))
binop("ior", tuint, _2src_commutative + associative, "src0 | src1",
      description = bitwise_description.format("OR"))
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1",
      description = bitwise_description.format("XOR"))
905
906
# Dot products expressed as a multiply-then-add reduction.  The
# "_replicated" variant is registered with output size 0 instead of 1.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot", 0, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}",
             suffix="_replicated")

# fdph: dot product of a 3-component src0 with the xyz of a 4-component src1,
# plus src1.w.
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 0, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
918
# C leaves the result of fmin/fmax on signed zeroes implementation-defined,
# whereas SPIR-V mandates:
#
#   fmin(-0, +0) = -0
#   fmax(+0, -0) = +0
#
# NIR follows SPIR-V and, because the opcodes are commutative, additionally
# needs:
#
#   fmin(+0, -0) = -0
#   fmax(-0, +0) = +0
#
# Constant folding therefore takes the min/max of the raw bit patterns when
# both sources compare equal (which orders the two zeroes and leaves every
# other value untouched) and defers to the C library function otherwise.
for op, macro in (("fmin", "MIN2"), ("fmax", "MAX2")):
   # 64-bit arm: compare the double bit patterns as int64_t.
   _fold_f64 = ("(src0 == src1 ? "
                "uid({macro}((int64_t)dui(src0), (int64_t)dui(src1))) : "
                "{op}(src0, src1))").format(macro=macro, op=op)
   # Arm for every other bit size: compare the float bit patterns as int32_t.
   _fold_f32 = ("(src0 == src1 ? "
                "uif({macro}((int32_t)fui(src0), (int32_t)fui(src1))) : "
                "{op}f(src0, src1))").format(macro=macro, op=op)
   binop(op, tfloat, _2src_commutative + associative,
         "bit_size == 64 ? " + _fold_f64 + " :" + _fold_f32)
939
# Integer min/max fold through the MIN2/MAX2 macros; signedness comes from
# the declared type.
binop("imin", tint, _2src_commutative + associative, "MIN2(src0, src1)")
binop("umin", tuint, _2src_commutative + associative, "MIN2(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "MAX2(src0, src1)")
binop("umax", tuint, _2src_commutative + associative, "MAX2(src0, src1)")

# Uses pow() for 64-bit operands and powf() otherwise.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
946
# Packs two 32-bit float sources into one 32-bit word via the pack_half_1x16
# helper (defined outside this section): src0 lands in the low 16 bits and
# src1 in the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | ((uint32_t)(pack_half_1x16(src1.x)) << 16)")
949
# Same packing as pack_half_2x16_split but through pack_half_1x16_rtz.  The
# helper result is widened to uint32_t *before* the shift: casting after the
# shift would leave the shift to happen on a promoted int, which can overflow
# into the sign bit when bit 15 of the half is set.
binop_horiz("pack_half_2x16_rtz_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16_rtz(src0.x) | ((uint32_t)(pack_half_1x16_rtz(src1.x)) << 16)")
952
# Split packs: src0 supplies the lowest bits, each following source is
# widened to the destination type and shifted into the next higher lane.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
       False, "",
       "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")
962
# bfm: builds a mask of `bits` one-bits shifted left by `offset`; both values
# are taken from the low five bits of their source.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""", description = """
Implements the behavior of the first operation of the SM5 "bfi" assembly
and that of the "bfi1" i965 instruction. That is, the bits and offset values
are from the low five bits of src0 and src1, respectively.
""")
972
# ldexp: calls ldexp() for 64-bit operands and ldexpf() otherwise, then
# replaces any non-normal result with a zero carrying the sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaNs, so those
# results are replaced as well -- confirm this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
979
# vec2: two single-component sources in, one 2-component result out.
binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""", description = """
Combines the first component of each input to make a 2-component vector.
""")
986
# Byte extraction
# src1 selects the byte: src0 is shifted right by src1 * 8 and narrowed to
# 8 bits (unsigned or signed).
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# Same scheme with 16-bit words: shift by src1 * 16, narrow to 16 bits.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")

# Byte/word insertion
# The low byte/word of src0 is masked out and shifted up to position src1.
binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")
998
999
def triop(name, ty, alg_props, const_expr, description = ""):
   """Register a three-source opcode whose result and sources all have type ty.

   Output size and all three input sizes are passed to opcode() as 0, and the
   sixth opcode() argument is False.  alg_props, const_expr and description
   are forwarded unchanged.
   """
   source_sizes = [0] * 3
   source_types = [ty] * 3
   opcode(name, 0, ty, source_sizes, source_types, False, alg_props,
          const_expr, description)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr,
                description = ""):
   """Register a three-source tuint opcode with explicit per-source sizes.

   The result and all three sources use tuint; output_size and the three
   source sizes are passed through to opcode() together with an empty
   algebraic-properties string.
   """
   source_sizes = [src1_size, src2_size, src3_size]
   source_types = [tuint] * len(source_sizes)
   opcode(name, output_size, tuint, source_sizes, source_types, False, "",
          const_expr, description)
1008
# ffma: fused multiply-add.  When the execution mode requests round-to-zero
# the _mesa_*_fma_rtz helpers are used (other bit sizes go through the double
# helper and are then converted); otherwise fmaf() handles 32-bit and fma()
# everything else.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# ffmaz: 32-bit only.  A zero in either multiplicand short-circuits the
# result to 0.0 + src2 before any fma is evaluated.
triop("ffmaz", tfloat32, _2src_commutative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0 + src2;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_float_fma_rtz(src0, src1, src2);
else
   dst = fmaf(src0, src1, src2);
""", description = """
Floating-point multiply-add with modified zero handling.

Unlike :nir:alu-op:`ffma`, anything (even infinity or NaN) multiplied by zero is
always zero. ``ffmaz(0.0, inf, src2)`` and ``ffmaz(0.0, nan, src2)`` must be
``+/-0.0 + src2``, even if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If
``SIGNED_ZERO_PRESERVE`` is used, then the result must be a positive
zero plus src2 if either src0 or src1 is zero.
""")
1041
# flrp: linear interpolation between src0 and src1 with weight src2.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

triop("iadd3", tint, _2src_commutative + associative, "src0 + src1 + src2",
      description = "Ternary addition")

triop("imad", tint, _2src_commutative + associative, "src0 * src1 + src2",
      description = "Integer multiply-add")
1049
# Description template for the select opcodes; the two {} slots receive the
# kind of boolean used as the condition and its false/true encoding.
csel_description = """
A vector conditional select instruction (like ?:, but operating per-
component on vectors). The condition is {} bool ({}).
"""

# Selects on a boolean condition: src1 when the condition (src0) is true,
# src2 otherwise.  The variants differ only in the declared type of src0.
triop("fcsel", tfloat32, selection, "(src0 != 0.0f) ? src1 : src2",
      description = csel_description.format("a floating point", "0.0 vs 1.0"))
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 1-bit", "0 vs 1"))
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("an 8-bit", "0 vs ~0"))
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 16-bit", "0 vs ~0"))
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 32-bit", "0 vs ~0"))

# Selects that compare src0 against zero inside the opcode itself.
triop("icsel_eqz", tint, selection, "(src0 == 0) ? src1 : src2")

triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2")
triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2")

triop("fcsel_gt", tfloat32, selection, "(src0 > 0.0f) ? src1 : src2")
triop("fcsel_ge", tfloat32, selection, "(src0 >= 0.0f) ? src1 : src2")
1077
# bfi: src0 is the mask, src1 the value to insert, src2 the base.  The insert
# value is shifted left until it lines up with the lowest set bit of the
# mask; a zero mask returns the base unchanged (and avoids an endless loop).
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""", description = "SM5 bfi assembly")


# Per-bit select: bits of src1 where src0 is set, bits of src2 elsewhere.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
1094
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
# A zero width yields 0.  When offset + bits stays below 32 the field is moved
# to the top of the word and shifted back down; otherwise the base is simply
# shifted right by offset.  ubfe works on an unsigned base, ibfe on an int.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
1122
# GLSL bitfieldExtract()
# Offset and bits are signed here.  A zero width, a negative offset/width or
# a field reaching past bit 32 all fold to 0.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1148
# Both opcodes defer to the msad() helper, which is defined outside this
# section.
triop("msad_4x8", tuint32, "", """
dst = msad(src0, src1, src2);
""", description = """
Masked sum of absolute differences with accumulation. Equivalent to AMD's v_msad_u8
instruction and DXIL's MSAD.

The first two sources contain packed 8-bit unsigned integers, the instruction
will calculate the absolute difference of integers when src0's is non-zero, and
then add them together. There is also a third source which is a 32-bit unsigned
integer and added to the result.
""")

# mqsad_4x8: four msad() results.  src1.x/src1.y are joined into one 64-bit
# value that is shifted right by 0, 8, 16 and 24 bits for the four
# components; each component accumulates into its own src2 component.
opcode("mqsad_4x8", 4, tuint32, [1, 2, 4], [tuint32, tuint32, tuint32], False, "", """
uint64_t src = src1.x | ((uint64_t)src1.y << 32);
dst.x = msad(src0.x, src, src2.x);
dst.y = msad(src0.x, src >> 8, src2.y);
dst.z = msad(src0.x, src >> 16, src2.z);
dst.w = msad(src0.x, src >> 24, src2.w);
""")
1168
# Combines the first component of each input to make a 3-component vector.
# All three sources are registered with size 1.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1176
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source tuint opcode with explicit per-source sizes.

   The result and all four sources use tuint.  No description is passed to
   opcode(), and the algebraic-properties string is empty.
   """
   source_sizes = [src1_size, src2_size, src3_size, src4_size]
   source_types = [tuint] * len(source_sizes)
   opcode(name, output_size, tuint, source_sizes, source_types, False, "",
          const_expr)
1183
# bitfield_insert: src0 is the base, src1 the value to insert, src2 the
# offset and src3 the width.  A zero width returns the base; a negative
# offset/width or a field reaching past bit 32 folds to 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1197
# Vector constructors: every source is a single component and is copied into
# the destination component of the same position.  Components past w are
# named e, f, g, ... p in the constant expressions.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

opcode("vec5", 5, tuint,
       [1] * 5, [tuint] * 5,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
""")

opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1248
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
#
# The constant expression itself is a plain multiply.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1260
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. al * bh << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0x0000ffff) << 16) * (src1 & 0xffff0000)) >> 16) + src2;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
#
# The "<< 8 >> 8" pair on int32_t sign-extends the low 24 bits of each
# multiplicand.
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
1274
def triop_shift_ir3(name, shift_op, bit_op):
   """Register an ir3 shift-then-combine opcode.

   shift_op is the C shift operator applied to src0, with the shift count
   src1 masked to sizeof(src0) * 8 - 1.  bit_op is the C bitwise operator
   that combines the shifted value with src2.
   """
   expr_template = "(src0 {0} (src1 & (sizeof(src0) * 8 - 1))) {1} src2"
   const_expr = expr_template.format(shift_op, bit_op)
   opcode(name, 0, tuint, [0, 0, 0], [tuint, tuint32, tuint], False, "",
          const_expr)
1278
# Shift right/left, then AND ("m" variants) or OR ("g" variants) with src2.
triop_shift_ir3("shrm_ir3", ">>", "&")
triop_shift_ir3("shlm_ir3", "<<", "&")
triop_shift_ir3("shrg_ir3", ">>", "|")
triop_shift_ir3("shlg_ir3", "<<", "|")
# AND of the first two sources, then OR with src2.
triop("andg_ir3", tuint, _2src_commutative, "(src0 & src1) | src2")
1284
# r600/gcn specific instruction that evaluates unnormalized cube texture coordinates
# and face index
# The actual texture coordinates are evaluated from this according to
#    dst.yx / abs(dst.z) + 1.5
#
# In the constant expression dst.z receives twice the component with the
# largest magnitude, dst.x/dst.y the two remaining components (with
# face-dependent signs) and dst.w the face index 0..5.  The conditions use
# >=, so on a tie more than one branch runs and the later assignment wins.
unop_horiz("cube_amd", 4, tfloat32, 3, tfloat32, """
   dst.x = dst.y = dst.z = 0.0;
   float absX = fabsf(src0.x);
   float absY = fabsf(src0.y);
   float absZ = fabsf(src0.z);

   if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
   if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
   if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }

   if (src0.x >= 0 && absX >= absY && absX >= absZ) {
      dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
   }
   if (src0.x < 0 && absX >= absY && absX >= absZ) {
      dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
   }
   if (src0.y >= 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = src0.z; dst.w = 2;
   }
   if (src0.y < 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
   }
   if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
      dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
   }
   if (src0.z < 0 && absZ >= absX && absZ >= absY) {
      dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
   }
""")
1318
# r600/gcn specific sin and cos
# these trigonometric functions need some lowering because the supported
# input values are expected to be normalized by dividing by (2 * pi)
# The constant expressions therefore multiply the source by 2 * pi again.
unop("fsin_amd", tfloat, "sinf(6.2831853 * src0)")
unop("fcos_amd", tfloat, "cosf(6.2831853 * src0)")
1324
# alignbyte_amd: builds a 64-bit value with src0 in the high half and src1 in
# the low half, then shifts it right by (src2 & 3) whole bytes.
opcode("alignbyte_amd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
   uint64_t src = src1 | ((uint64_t)src0 << 32);
   dst = src >> ((src2 & 0x3) * 8);
""")
1329
# Midgard specific sin and cos
# These expect their inputs to be divided by pi.
# The constant expressions multiply the source by pi again.
unop("fsin_mdg", tfloat, "sinf(3.141592653589793 * src0)")
unop("fcos_mdg", tfloat, "cosf(3.141592653589793 * src0)")

# AGX specific sin with input expressed in quadrants. Used in the lowering for
# fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
# the angle is further decomposed by quadrant, sinc is computed, and the angle
# is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
# additional ALU that NIR may be able to optimize.
# The constant expression scales the source by a quarter turn (2 * pi / 4).
unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")
1341
# AGX specific bitfield extraction from a pair of 32bit registers.
# src0,src1: the two registers
# src2: bit position of the LSB of the bitfield
# src3: number of bits in the bitfield if src3 > 0
#       src3 = 0 is equivalent to src3 = 32
# NOTE: src3 is a nir constant by contract
#
# The mask is built from an unsigned one (1u): with a plain int literal,
# src3 == 31 would shift into the sign bit and then subtract from INT_MIN,
# both of which are signed overflow.  Shift amounts of 64 or more (after
# masking src2 to 7 bits) fold to 0.
opcode("extr_agx", 0, tuint32,
       [0, 0, 0, 0], [tuint32, tuint32, tuint32, tuint32], False, "", """
    uint32_t mask = 0xFFFFFFFF;
    uint8_t shift = src2 & 0x7F;
    if (src3 != 0) {
       mask = (1u << src3) - 1;
    }
    if (shift >= 64) {
        dst = 0;
    } else {
        dst = (((((uint64_t) src1) << 32) | (uint64_t) src0) >> shift) & mask;
    }
""")
1361
# AGX multiply-shift-add. Corresponds to iadd/isub/imad/imsub instructions.
# The shift must be <= 4 (domain restriction). For performance, it should be
# constant.
#
# The constant expressions are plain string literals: they contain no
# placeholders, so no f-string prefix is needed.
opcode("imadshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", "(src0 * src1) + (src2 << src3)")
opcode("imsubshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", "(src0 * src1) - (src2 << src3)")

# Address arithmetic instructions: extend, shift, and add
# Shift must be a small constant.
# ilea sign-extends src1 to 64 bits before shifting, ulea zero-extends it.
opcode("ilea_agx", 0, tuint64, [0, 0, 0], [tuint64, tint32, tuint32], False,
       "", "src0 + (((int64_t)src1) << src2)")
opcode("ulea_agx", 0, tuint64, [0, 0, 0], [tuint64, tuint32, tuint32], False,
       "", "src0 + (((uint64_t)src1) << src2)")
1376
# Bounds check instruction.
#
# Sources: <data, end offset, bounds>
#
# Yields the data (src0) when the end offset (src1) does not exceed the
# bounds (src2), and 0 otherwise.
opcode("bounds_agx", 0, tint, [0, 0, 0],
       [tint, tint, tint], False,
       "", "src1 <= src2 ? src0 : 0")
1383
# interleave_agx: bit i of src0 goes to bit 2*i of the result and bit i of
# src1 to bit 2*i + 1.  Each selected bit is widened to uint32_t before being
# shifted: the 16-bit sources would otherwise be promoted to int, and moving
# bit 15 of src1 up by 16 positions would overflow into the sign bit.
binop_convert("interleave_agx", tuint32, tuint16, "", """
      dst = 0;
      for (unsigned bit = 0; bit < 16; bit++) {
          dst |= (uint32_t)(src0 & (1 << bit)) << bit;
          dst |= (uint32_t)(src1 & (1 << bit)) << (bit + 1);
      }""", description="""
      Interleave bits of 16-bit integers to calculate a 32-bit integer. This can
      be used as-is for Morton encoding.
      """)
1393
# These are like fmin/fmax, but do not flush denorms on the output which is why
# they're modeled as conversions. AGX flushes fp32 denorms but preserves fp16
# denorms, so fp16 fmin/fmax work without lowering.
#
# In the constant expressions src0 is returned when it compares
# smaller/greater than src1 or when src1 is NaN; otherwise src1 is returned.
binop_convert("fmin_agx", tuint32, tfloat32, _2src_commutative + associative,
              "(src0 < src1 || isnan(src1)) ? src0 : src1")
binop_convert("fmax_agx", tuint32, tfloat32, _2src_commutative + associative,
              "(src0 > src1 || isnan(src1)) ? src0 : src1")
1401
# NVIDIA PRMT
#
# src0 holds four 4-bit selectors, one per result byte.  The low three bits
# of a selector pick a byte from src1 (values 0-3) or src2 (values 4-7); when
# bit 3 is set the picked byte is replaced by its sign bit replicated across
# all eight bits.
opcode("prmt_nv", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, "", """
    dst = 0;
    for (unsigned i = 0; i < 4; i++) {
        uint8_t byte = (src0 >> (i * 4)) & 0x7;
        uint8_t x = byte < 4 ? (src1 >> (byte * 8))
                             : (src2 >> ((byte - 4) * 8));
        if ((src0 >> (i * 4)) & 0x8)
            x = ((int8_t)x) >> 7;
        dst |= ((uint32_t)x) << i * 8;
    }""")
1414
# 24b multiply into 32b result (with sign extension)
# The "<< 8 >> 8" pair on int32_t sign-extends the low 24 bits of each source.
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
# The "<< 8 >> 8" pair on uint32_t clears the top 8 bits of each multiplicand.
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
# NOTE(review): registered as tint32 although the expression is unsigned and
# umad24/umul24_relaxed use tuint32 -- confirm whether tuint32 was intended.
binop("umul24", tint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")

# relaxed versions of the above, which assume input is in the 24bit range (no clamping)
binop("imul24_relaxed", tint32, _2src_commutative + associative, "src0 * src1")
triop("umad24_relaxed", tuint32, _2src_commutative, "src0 * src1 + src2")
binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")
1431
# Float classification through the C isnormal()/isfinite() macros.
# fisfinite32 differs from fisfinite only in its tbool32 result type.
unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
unop_convert("fisfinite32", tbool32, tfloat, "isfinite(src0)")
1435
# vc4-specific opcodes
#
# Each opcode below walks the four 8-bit lanes of its 32-bit sources
# (i = 0, 8, 16, 24), combines the two lane values and ORs the result back
# into the same lane of dst.

# Saturated vector add for 4 8bit ints.
# Each lane sum is clamped to 0xff.
binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A lane stays 0 unless the src0 lane is larger than the src1 lane.
binop("ussub_4x8_vc4", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8_vc4", tuint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   uint32_t src0_chan = (src0 >> i) & 0xff;
   uint32_t src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
1482
1483# v3d-specific opcodes
1484
1485# v3d-specific (v71) instruction that packs bits of 2 2x16 floating point into
1486# r11g11b10 bits, rounding to nearest even, so
1487#  dst[10:0]  = float16_to_float11 (src0[15:0])
1488#  dst[21:11] = float16_to_float11 (src0[31:16])
1489#  dst[31:22] = float16_to_float10 (src1[15:0])
1490binop_convert("pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
1491              "pack_32_to_r11g11b10_v3d(src0, src1)")
1492
1493# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
1494# difference with pack_32_2x16_split is that the sources are 32bit too. So it
1495# receives 2 32-bit integer, and packs the lower halfword as 2x16 on a 32-bit
1496# integer.
1497binop_horiz("pack_2x32_to_2x16_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
1498            "(src0.x & 0xffff) | (src1.x << 16)")
1499
1500# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
1501# r10g10b10a2:
1502#   dst[9:0]   = src0[9:0]
1503#   dst[19:10] = src0[25:16]
1504#   dst[29:20] = src1[9:0]
1505#   dst[31:30] = src1[17:16]
1506binop_convert("pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
1507              "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
1508
# v3d-specific (v71) instruction.  Two sources of 2x16-bit integers are
# narrowed to 4x8 bits by keeping the low byte of every halfword:
#   dst[7:0]   = src0[7:0]
#   dst[15:8]  = src0[23:16]
#   dst[23:16] = src1[7:0]
#   dst[31:24] = src1[23:16]
opcode(
   "pack_4x16_to_4x8_v3d", 0, tuint32,
   [0, 0], [tuint32, tuint32],
   False, "",
   "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8",
)
1517
# v3d-specific (v71) instructions converting 2x16-bit floating point values to
# 2x8-bit unorm/snorm.  The result for the low half of src0 starts at bit 0 and
# the result for the high half starts at bit 16.
unop(
   "pack_2x16_to_unorm_2x8_v3d",
   tuint32,
   "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)",
)
# NOTE(review): only the high-half result is cast to uint32_t before being
# combined; confirm that a negative low-half result cannot leak sign bits into
# the upper part of dst.
unop(
   "pack_2x16_to_snorm_2x8_v3d",
   tuint32,
   "_mesa_half_to_snorm(src0 & 0xffff, 8) | ((uint32_t)(_mesa_half_to_snorm(src0 >> 16, 8)) << 16)",
)
1523
# v3d-specific (v71) instructions converting a 32-bit floating point value to a
# 16-bit unorm/snorm.
unop(
   "f2unorm_16_v3d",
   tuint32,
   "_mesa_float_to_unorm16(src0)",
)
unop(
   "f2snorm_16_v3d",
   tuint32,
   "_mesa_float_to_snorm16(src0)",
)
1527
# v3d-specific (v71) instruction converting 2x16-bit floating point values to
# 2x10-bit unorm.
unop(
   "pack_2x16_to_unorm_2x10_v3d",
   tuint32,
   "pack_2x16_to_unorm_2x10(src0)",
)

# v3d-specific (v71) instruction converting 2x16-bit floating point values to
# one 2-bit and one 10-bit unorm.
unop(
   "pack_2x16_to_unorm_10_2_v3d",
   tuint32,
   "pack_2x16_to_unorm_10_2(src0)",
)
1534
# These opcodes are used by Mali and V3D.
# fsat_signed clamps src0 to [-1.0, 1.0]; fclamp_pos clamps src0 below at 0.0.
unop(
   "fsat_signed",
   tfloat,
   "fmin(fmax(src0, -1.0), 1.0)",
)
unop(
   "fclamp_pos",
   tfloat,
   "fmax(src0, 0.0)",
)
1538
# Midgard-specific select: a 32-bit boolean condition (src0) picks between two
# floating-point sources (src1 when true, src2 otherwise).
opcode(
   "b32fcsel_mdg", 0, tuint,
   [0, 0, 0], [tbool32, tfloat, tfloat],
   False, selection,
   "src0 ? src1 : src2",
   description = csel_description.format("a 32-bit", "0 vs ~0") + """
       This Midgard-specific variant takes floating-point sources, rather than
       integer sources. That includes support for floating point modifiers in
       the backend.
       """,
)
1546
# DXIL-specific double [un]pack.
#
# DXIL has no generic [un]pack instructions, so those are lowered to bit ops.
# HLSL cannot express 64-bit bitcasts to/from double, only [un]pack.  DXIL
# technically can, but since HLSL never generates them we want to match what
# DXC would produce.  These opcodes are essentially the standard [un]pack,
# except that they are not lowered, so the backend can handle them and emit
# MakeDouble/SplitDouble.
unop_horiz(
   "pack_double_2x32_dxil",
   1, tuint64,
   2, tuint32,
   "dst.x = src0.x | ((uint64_t)src0.y << 32);",
)
unop_horiz(
   "unpack_double_2x32_dxil",
   2, tuint32,
   1, tuint64,
   "dst.x = src0.x; dst.y = src0.x >> 32;",
)
1558
# Signed 4x8 dot-product with accumulate.  src0 and src1 each hold an i8vec4
# packed in an int32 and src2 is an int32.  Every int8 component is
# sign-extended to 32 bits, the two vectors are dotted, and src2 is added to
# the dot-product.
opcode(
   "sdot_4x8_iadd", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, _2src_commutative,
   """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const int32_t v1x = (int8_t)(src1      );
   const int32_t v1y = (int8_t)(src1 >>  8);
   const int32_t v1z = (int8_t)(src1 >> 16);
   const int32_t v1w = (int8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""",
)
1575
# Unsigned counterpart of sdot_4x8_iadd: the 8-bit components of src0 and src1
# are zero-extended to 32 bits before the dot-product, and src2 is added.
opcode(
   "udot_4x8_uadd", 0, tuint32,
   [0, 0, 0], [tuint32, tuint32, tuint32],
   False, _2src_commutative,
   """
   const uint32_t v0x = (uint8_t)(src0      );
   const uint32_t v0y = (uint8_t)(src0 >>  8);
   const uint32_t v0z = (uint8_t)(src0 >> 16);
   const uint32_t v0w = (uint8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""",
)
1590
# Mixed-sign 4x8 dot-product with accumulate.  src0 is an i8vec4 packed in an
# int32, src1 is a u8vec4 packed in an int32, and src2 is an int32.  The 8-bit
# components are extended to 32 bits (src0 sign-extended, src1 zero-extended),
# the two vectors are dotted, and src2 is added to the dot-product.
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0 and
# source 1 mean that this opcode is not 2-source commutative.
opcode(
   "sudot_4x8_iadd", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, "",
   """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""",
)
1611
# Saturating form of sdot_4x8_iadd: the sum is evaluated in 64 bits and then
# clamped to the range [-0x80000000, 0x7fffffff].
opcode(
   "sdot_4x8_iadd_sat", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, _2src_commutative,
   """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const int64_t v1x = (int8_t)(src1      );
   const int64_t v1y = (int8_t)(src1 >>  8);
   const int64_t v1z = (int8_t)(src1 >> 16);
   const int64_t v1w = (int8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""",
)
1628
# Like udot_4x8_uadd, but the result is clamped to the range [0, 0xffffffff].
#
# The destination and the accumulator (src2) are declared unsigned, matching
# udot_4x8_uadd.  If src2 were a signed 32-bit value, an accumulator with bit
# 31 set would be sign-extended when added to the 64-bit unsigned sum below,
# pushing tmp far above UINT32_MAX and forcing the result to saturate no
# matter what the dot-product is.
opcode("udot_4x8_uadd_sat", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint8_t)(src0      );
   const uint64_t v0y = (uint8_t)(src0 >>  8);
   const uint64_t v0z = (uint8_t)(src0 >> 16);
   const uint64_t v0w = (uint8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
1645
# Saturating form of sudot_4x8_iadd: the sum is evaluated in 64 bits and then
# clamped to the range [-0x80000000, 0x7fffffff].
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0 and
# source 1 mean that this opcode is not 2-source commutative.
opcode(
   "sudot_4x8_iadd_sat", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, "",
   """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""",
)
1665
# Signed 2x16 dot-product with accumulate.  src0 and src1 each hold an i16vec2
# packed in an int32 and src2 is an int32.  Every int16 component is
# sign-extended to 32 bits, the two vectors are dotted, and src2 is added to
# the dot-product.
opcode(
   "sdot_2x16_iadd", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, _2src_commutative,
   """
   const int32_t v0x = (int16_t)(src0      );
   const int32_t v0y = (int16_t)(src0 >> 16);
   const int32_t v1x = (int16_t)(src1      );
   const int32_t v1y = (int16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""",
)
1678
# Unsigned counterpart of sdot_2x16_iadd: the 16-bit components of src0 and
# src1 are zero-extended to 32 bits before the dot-product, and src2 is added.
opcode(
   "udot_2x16_uadd", 0, tuint32,
   [0, 0, 0], [tuint32, tuint32, tuint32],
   False, _2src_commutative,
   """
   const uint32_t v0x = (uint16_t)(src0      );
   const uint32_t v0y = (uint16_t)(src0 >> 16);
   const uint32_t v1x = (uint16_t)(src1      );
   const uint32_t v1y = (uint16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""",
)
1689
# Saturating form of sdot_2x16_iadd: the sum is evaluated in 64 bits and then
# clamped to the range [-0x80000000, 0x7fffffff].
opcode(
   "sdot_2x16_iadd_sat", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, _2src_commutative,
   """
   const int64_t v0x = (int16_t)(src0      );
   const int64_t v0y = (int16_t)(src0 >> 16);
   const int64_t v1x = (int16_t)(src1      );
   const int64_t v1y = (int16_t)(src1 >> 16);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""",
)
1702
# Like udot_2x16_uadd, but the result is clamped to the range [0, 0xffffffff].
#
# The destination and the accumulator (src2) are declared unsigned, matching
# udot_2x16_uadd.  If src2 were a signed 32-bit value, an accumulator with bit
# 31 set would be sign-extended when added to the 64-bit unsigned sum below,
# pushing tmp far above UINT32_MAX and forcing the result to saturate no
# matter what the dot-product is.
opcode("udot_2x16_uadd_sat", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint16_t)(src0      );
   const uint64_t v0y = (uint16_t)(src0 >> 16);
   const uint64_t v1x = (uint16_t)(src1      );
   const uint64_t v1y = (uint16_t)(src1 >> 16);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
1715