• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Connor Abbott (cwabbott0@gmail.com)
25
26import re
27
28# Class that represents all the information we have about the opcode
29# NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr,
                description):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size and every entry of input_sizes must be an integer in
        0..5, or 8, or 16; when output_size is non-zero no input size may
        be zero
      - all types are strings that get nir_type_ prepended to them
      - input_sizes and input_types are non-empty lists of equal length,
        with one entry per source
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.
      - Optional description of the opcode for documentation.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_types, list)
      # At least one source is required (the check used to index element 0).
      assert len(input_sizes) > 0
      assert len(input_sizes) == len(input_types)
      # Validate every entry, not just the first one, so that a bad size or
      # type in a later source is caught here rather than in the generators.
      assert all(isinstance(size, int) for size in input_sizes)
      assert all(isinstance(type_, str) for type_ in input_types)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert 0 <= output_size <= 5 or (output_size == 8) or (output_size == 16)
      for size in input_sizes:
         assert 0 <= size <= 5 or (size == 8) or (size == 16)
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
      self.description = description
95
# Shorthand names for the NIR type strings used by the opcode definitions.

# Base types without a bit size.
tint = "int"
tuint = "uint"
tfloat = "float"
tbool = "bool"

# Sized booleans.
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"

# Sized signed integers.
tint16 = "int16"
tint32 = "int32"
tint64 = "int64"

# Sized unsigned integers.
tuint8 = "uint8"
tuint16 = "uint16"
tuint32 = "uint32"
tuint64 = "uint64"

# Sized floats.
tfloat16 = "float16"
tfloat32 = "float32"
tfloat64 = "float64"
115
# Splits a NIR type string into its base type and optional bit-size suffix.
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size.

    Asserts that the string starts with a recognised base type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed['bits']
    return bits is not None
122
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string, as an int.

    Asserts that the string is a valid NIR type and that it has a bit-size
    suffix.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed['bits']
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
129
def type_sizes(type_):
    """Return the list of bit sizes a NIR type string can stand for.

    A sized type yields a single-element list with its own size.  An unsized
    type yields every size listed for its base type below; anything that is
    not 'bool' or 'float' gets the integer list.
    """
    if type_has_size(type_):
        return [type_size(type_)]

    # Built on every call so each caller receives a fresh list.
    unsized = {
        'bool': [1, 8, 16, 32],
        'float': [16, 32, 64],
    }
    return unsized.get(type_, [1, 8, 16, 32, 64])
139
def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool') of a NIR type
    string, dropping any bit-size suffix.

    Asserts that the string is a valid NIR type.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return parsed['type']
144
# Algebraic property flags.  Each string ends with a space so that several
# flags can be joined with "+" into the space-separated string that Opcode
# stores in algebraic_properties.

# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = "2src_commutative "
associative = "associative "
selection = "selection "
derivative = "derivative "
154
# global dictionary of opcodes, keyed by opcode name
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr, description = ""):
   """Build an Opcode from the arguments and register it in the global
   opcodes dictionary under its name.

   Asserts that no opcode with the same name has been registered before.
   """
   assert name not in opcodes
   info = Opcode(name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties,
                 const_expr, description)
   opcodes[name] = info
164
def unop_convert(name, out_type, in_type, const_expr, description = ""):
   """Register a one-source opcode whose output type may differ from its
   input type.

   Both sizes are 0 and is_conversion is False; no algebraic properties are
   set.
   """
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0], input_types=[in_type],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr, description=description)
167
def unop(name, ty, const_expr, description = "", algebraic_properties = ""):
   """Register a one-source opcode whose input and output share the type ty.

   Both sizes are 0 and is_conversion is False.
   """
   opcode(name, output_size=0, output_type=ty,
          input_sizes=[0], input_types=[ty],
          is_conversion=False, algebraic_properties=algebraic_properties,
          const_expr=const_expr, description=description)
171
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr, description = ""):
   """Register a one-source opcode with explicitly given output and input
   sizes.

   is_conversion is False and no algebraic properties are set.
   """
   opcode(name, output_size=output_size, output_type=output_type,
          input_sizes=[input_size], input_types=[input_type],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr, description=description)
176
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr, description = ""):
   """Register the 2-, 3- and 4-component variants of a reduction opcode.

   The opcodes are named name + "2", name + "3" and name + "4".  Each
   constant expression is built from three format templates:

   - prereduce_expr is applied to every component ({src}),
   - reduce_expr combines two partial results ({src0}, {src1}),
   - final_expr is applied to the fully reduced value ({src}).
   """
   # Parenthesised per-component terms for src0.x .. src0.w.
   x, y, z, w = ["(" + prereduce_expr.format(src="src0." + comp) + ")"
                 for comp in "xyzw"]

   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   def register(count, expr):
      unop_horiz(name + count, output_size, output_type, int(count),
                 input_type, finish(expr), description)

   xy = combine(x, y)
   register("2", xy)
   register("3", combine(xy, z))
   # The 4-wide form reduces the two halves first and then joins them.
   register("4", combine(xy, combine(z, w)))
196
def unop_numeric_convert(name, out_type, in_type, const_expr, description = ""):
   """Register a one-source opcode flagged as a type conversion.

   Identical to unop_convert except that is_conversion is True.
   """
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0], input_types=[in_type],
          is_conversion=True, algebraic_properties="",
          const_expr=const_expr, description=description)
199
# Plain copy: the constant expression is just the source.
unop("mov", tuint, "src0")

# Negation and bitwise NOT.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0", description = "Invert every bit of the integer")

# The expression is written twice: with double literals for bit_size == 64
# and with float literals otherwise.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "(isnan(src0) ? 0.0  : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0  : -1.0 )) : " +
                       "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"),
     description = """
Roughly implements the OpenGL / Vulkan rules for ``sign(float)``.
The ``GLSL.std.450 FSign`` instruction is defined as:

    Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.

If the source is equal to zero, there is a preference for the result to have
the same sign, but this is not required (it is required by OpenCL).  If the
source is not a number, there is a preference for the result to be +0.0, but
this is not required (it is required by OpenCL).  If the source is not a
number, and the result is not +0.0, the result should definitely **not** be
NaN.

The values returned for constant folding match the behavior required by
OpenCL.
     """)

# Sign, absolute value, saturate, reciprocal, square root, exp2 and log2.
# Several of these pick the double or the float C function by bit_size.
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
235
# Generate all of the numeric conversion opcodes
#
# Opcode names have the form <src initial>2<dst initial><dst bit size>, e.g.
# "f2i32", with an optional rounding-mode suffix for float -> float16.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that are generated for each source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets three opcodes: explicit round-to-
              # nearest-even, explicit round-toward-zero, and one (empty
              # suffix) that picks the mode from execution_mode.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 32) {
                         dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                      } else if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 32) {
                         dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                      } else if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = """
                      if (bit_size > 32) {
                         if (nir_is_rounding_mode_rtz(execution_mode, 16))
                            dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                         else
                            dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                      } else if (bit_size > 16) {
                         if (nir_is_rounding_mode_rtz(execution_mode, 16))
                            dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                         else
                            dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32 only needs special handling when narrowing
              # from a wider source under round-toward-zero.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Everything else is a plain value copy, except that converting
              # to bool tests the source against zero.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
311
def unop_numeric_convert_mp(base, src_t, dst_t):
    """Register the "<base>mp" conversion opcode, reusing the constant
    expression of the already registered "<base>16" opcode.

    NOTE: despite the parameter names, src_t is forwarded as the output type
    and dst_t as the input type of the new opcode.

    Raises KeyError if "<base>16" has not been registered yet.
    """
    reference_op = base + "16"
    mp_name = base + "mp"
    reference_expr = opcodes[reference_op].const_expr
    mp_description = """
Special opcode that is the same as :nir:alu-op:`{}` except that it is safe to
remove it if the result is immediately converted back to 32 bits again. This is
generated as part of the precision lowering pass. ``mp`` stands for medium
precision.
                         """.format(reference_op)
    unop_numeric_convert(mp_name, src_t, dst_t, reference_expr,
                         description = mp_description)
321
# Medium-precision conversion opcodes.  The first type argument ends up as
# the opcode's output type and the second as its input type.
unop_numeric_convert_mp("f2f", tfloat16, tfloat32)
unop_numeric_convert_mp("i2i", tint16, tint32)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert_mp("f2i", tint16, tfloat32)
unop_numeric_convert_mp("f2u", tuint16, tfloat32)
unop_numeric_convert_mp("i2f", tfloat16, tint32)
unop_numeric_convert_mp("u2f", tfloat16, tuint32)
329
# Unary floating-point rounding operations.
#
# Each expression selects the double or the float C function by bit_size.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Values with magnitude below 2^-14 become a zero with the source's sign;
# everything else is round-tripped through a half float.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
# frexp_exp stores the exponent written by frexp(); frexp_sig stores the
# value frexp() returns and discards the exponent.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
350
# Partial derivatives.
#
# deriv_template is the description shared by all six opcodes; its two
# placeholders are the derivative mode and the axis name.
deriv_template = """
Calculate the screen-space partial derivative using {} derivatives of the input
with respect to the {}-axis. The constant folding is trivial as the derivative
of a constant is 0 if the constant is not Inf or NaN.
"""

# Registers fddx, fddy, fddx_fine, fddy_fine, fddx_coarse and fddy_coarse.
for mode, suffix in [("either fine or coarse", ""), ("fine", "_fine"), ("coarse", "_coarse")]:
    for axis in ["x", "y"]:
        unop(f"fdd{axis}{suffix}", tfloat, "isfinite(src0) ? 0.0 : NAN",
             algebraic_properties = derivative,
             description = deriv_template.format(mode, axis.upper()))
363
364# Floating point pack and unpack operations.
365
def pack_2x16(fmt, in_type):
   """Register "pack_<fmt>_2x16": two in_type components packed into one
   uint32.

   Every occurrence of "fmt" in the C template is replaced by the format
   name, so the expression calls pack_<fmt>_1x16().  src0.x lands in the low
   16 bits and src0.y in the high 16 bits.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
   op_name = "pack_" + fmt + "_2x16"
   unop_horiz(op_name, 1, tuint32, 2, in_type, template.replace("fmt", fmt))
371
def pack_4x8(fmt):
   """Register "pack_<fmt>_4x8": four float32 components packed into one
   uint32.

   Every occurrence of "fmt" in the C template is replaced by the format
   name, so the expression calls pack_<fmt>_1x8().  Components x, y, z and w
   occupy bits 0-7, 8-15, 16-23 and 24-31 respectively.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
   op_name = "pack_" + fmt + "_4x8"
   unop_horiz(op_name, 1, tuint32, 4, tfloat32, template.replace("fmt", fmt))
379
def unpack_2x16(fmt):
   """Register "unpack_<fmt>_2x16": one uint32 unpacked into two float32
   components.

   Every occurrence of "fmt" in the C template is replaced by the format
   name, so the expression calls unpack_<fmt>_1x16().  dst.x comes from the
   low 16 bits of the source and dst.y from the high 16 bits.
   """
   # The high half must be shifted DOWN before the uint16_t cast.  Shifting
   # left by 16 clears the low 16 bits, so the cast would always yield 0.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
385
def unpack_4x8(fmt):
   """Register "unpack_<fmt>_4x8": one uint32 unpacked into four float32
   components.

   Every occurrence of "fmt" in the C template is replaced by the format
   name, so the expression calls unpack_<fmt>_1x8().  Components x, y, z and
   w are taken from bits 0-7, 8-15, 16-23 and 24-31 respectively.
   """
   template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
   op_name = "unpack_" + fmt + "_4x8"
   unop_horiz(op_name, 4, tfloat32, 1, tuint32, template.replace("fmt", fmt))
393
394
# Instantiate the pack/unpack helpers for the snorm, unorm and half formats.
# There is no 4x8 variant for "half".
pack_2x16("snorm", tfloat)
pack_4x8("snorm")
pack_2x16("unorm", tfloat)
pack_4x8("unorm")
pack_2x16("half", tfloat32)
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
405
# Clamping integer packs: each component goes through a _mesa_*_to_* helper
# with a width of 16 before being placed in its half of the result.
unop_horiz("pack_uint_2x16", 1, tuint32, 2, tuint32, """
dst.x = _mesa_unsigned_to_unsigned(src0.x, 16);
dst.x |= _mesa_unsigned_to_unsigned(src0.y, 16) << 16;
""", description = """
Convert two unsigned integers into a packed unsigned short (clamp is applied).
""")

unop_horiz("pack_sint_2x16", 1, tint32, 2, tint32, """
dst.x = _mesa_signed_to_signed(src0.x, 16) & 0xffff;
dst.x |= _mesa_signed_to_signed(src0.y, 16) << 16;
""", description = """
Convert two signed integers into a packed signed short (clamp is applied).
""")

# Non-clamping packs of 32-bit components.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# Bit-exact packs of narrow components into a wider integer.  The components
# after the first are cast to the result width before shifting.
unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Bit-exact unpacks of a wide integer into narrower components.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
454
# dst.x comes from the low 16 bits of the source and dst.y from the high 16
# bits.  The high half has to be shifted DOWN before the uint16_t cast: a
# left shift by 16 clears the low 16 bits, so the cast would always yield 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
459
# Lowered floating point unpacking operations.
#
# The _x variants read the low half of the source and the _y variants the
# high half (source shifted right by the half width).

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
477
# Bit operations, part of ARB_gpu_shader5.


# Mirrors the 32 bits of the source: bit i moves to bit 31 - i.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit counted from bit 0, or -1 if no bit is set.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Smallest left shift that brings a set bit to bit 31, or -1 if none does.
# NOTE(review): the 0x80000000 mask looks like it assumes a 32-bit source
# although the input type is the unsized "uint" - confirm.
unop_convert("ufind_msb_rev", tint32, tuint, """
dst = -1;
for (int bit = 0; bit < bit_size; bit++) {
   if ((src0 << bit) & 0x80000000) {
      dst = bit;
      break;
   }
}
""")

# Number of zero bits above the highest set bit; bit_size when the source
# is 0 (the loop then ends with bit == -1).
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(bit_size - bit - 1);
""")
523
# Signed find-MSB: index (from bit 0) of the highest bit that differs from
# the sign, or -1 when there is none.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Same search, but the result is the left-shift count that brings the first
# differing bit to bit 31.
unop("ifind_msb_rev", tint32, """
dst = -1;
/* We are looking for the highest bit that's not the same as the sign bit. */
uint32_t sign = src0 & 0x80000000u;
for (int bit = 0; bit < 32; bit++) {
   if (((src0 << bit) & 0x80000000u) != sign) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, or -1 if no bit is set.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Registers fsum2, fsum3 and fsum4: the components are passed through
# unchanged and added together.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}",
            description = "Sum of vector components")
562
def binop_convert(name, out_type, in_type1, alg_props, const_expr, description="", in_type2=None):
   """Register a two-source opcode whose output type may differ from its
   input types.

   in_type2 defaults to in_type1 when it is None.  All sizes are 0 and
   is_conversion is False.
   """
   second_type = in_type1 if in_type2 is None else in_type2
   opcode(name, output_size=0, output_type=out_type,
          input_sizes=[0, 0], input_types=[in_type1, second_type],
          is_conversion=False, algebraic_properties=alg_props,
          const_expr=const_expr, description=description)
568
def binop(name, ty, alg_props, const_expr, description = ""):
   """Register a two-source opcode whose sources and result all share the
   type ty.
   """
   binop_convert(name, out_type=ty, in_type1=ty, alg_props=alg_props,
                 const_expr=const_expr, description=description)
571
def binop_compare(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a bool1 result.

   The first source has type ty; the second has type ty2, or ty when ty2 is
   None.
   """
   binop_convert(name, out_type=tbool1, in_type1=ty, alg_props=alg_props,
                 const_expr=const_expr, description=description,
                 in_type2=ty2)
574
def binop_compare8(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a bool8 result.

   The first source has type ty; the second has type ty2, or ty when ty2 is
   None.
   """
   binop_convert(name, out_type=tbool8, in_type1=ty, alg_props=alg_props,
                 const_expr=const_expr, description=description,
                 in_type2=ty2)
577
def binop_compare16(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a bool16 result.

   The first source has type ty; the second has type ty2, or ty when ty2 is
   None.
   """
   binop_convert(name, out_type=tbool16, in_type1=ty, alg_props=alg_props,
                 const_expr=const_expr, description=description,
                 in_type2=ty2)
580
def binop_compare32(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register a two-source opcode that produces a bool32 result.

   The first source has type ty; the second has type ty2, or ty when ty2 is
   None.
   """
   binop_convert(name, out_type=tbool32, in_type1=ty, alg_props=alg_props,
                 const_expr=const_expr, description=description,
                 in_type2=ty2)
583
def binop_compare_all_sizes(name, ty, alg_props, const_expr, description = "", ty2=None):
   """Register one comparison opcode per boolean result size.

   The bool1 variant is called name; the bool8, bool16 and bool32 variants
   are called name + "8", name + "16" and name + "32".
   """
   variants = (
      ("", binop_compare),
      ("8", binop_compare8),
      ("16", binop_compare16),
      ("32", binop_compare32),
   )
   for size_suffix, register in variants:
      register(name + size_suffix, ty, alg_props, const_expr, description, ty2)
589
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr, description = ""):
   """Register a two-source opcode with explicitly given output and source
   sizes.

   is_conversion is False and no algebraic properties are set.
   """
   opcode(name, output_size=out_size, output_type=out_type,
          input_sizes=[src1_size, src2_size],
          input_types=[src1_type, src2_type],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr, description=description)
594
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix="", description = ""):
   """Register the 2-, 4-, 8-, 16-, 3- and 5-component variants (in that
   order) of a two-source reduction opcode.

   Each opcode is named name + <component count> + suffix and is marked
   2src_commutative.  The constant expression is built from three format
   templates:

   - prereduce_expr combines matching components of both sources
     ({src0}, {src1}),
   - reduce_expr combines two partial results ({src0}, {src1}),
   - final_expr is applied to the fully reduced value ({src}).
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src= "(" + expr + ")")

   # One parenthesised term per component, for up to 16 components.
   terms = ["(" + prereduce_expr.format(src0="src0." + comp,
                                        src1="src1." + comp) + ")"
            for comp in "xyzwefghijklmnop"]

   def tree(first, count):
      # Balanced reduction of terms[first:first + count]; the upper half is
      # always the left operand.
      if count == 1:
         return terms[first]
      half = count // 2
      return combine(tree(first + half, half), tree(first, half))

   def register(width, expr):
      opcode(name + str(width) + suffix, output_size, output_type,
             [width, width], [src_type, src_type], False, _2src_commutative,
             finish(expr), description)

   for width in [2, 4, 8, 16]:
      register(width, tree(0, width))
   # 3 and 5 are not powers of two and get hand-written reduction orders.
   register(3, combine(combine(terms[2], terms[1]), terms[0]))
   register(5, combine(terms[4], tree(0, 4)))
620
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr, description = ""):
   """Register a two-source reduction for every boolean result size.

   The bool1 variant keeps name as its base name.  The bool8, bool16 and
   bool32 variants replace the first character of name with "b8", "b16" and
   "b32" respectively.  See binop_reduce for the meaning of the expression
   templates.
   """
   # description must be passed by keyword: the eighth positional parameter
   # of binop_reduce is suffix, so passing it positionally would append the
   # description to every opcode name instead of recording it.
   binop_reduce(name, output_size, tbool1, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
   binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                prereduce_expr, reduce_expr, final_expr,
                description=description)
631
# Floating-point add.  Under round-toward-zero the 64-bit case uses a
# dedicated helper, while narrower sizes add in double and then narrow the
# result with round-toward-zero.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
# The sources are cast to uint64_t before the addition.
binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
# Saturating add/subtract: on wrap-around the result is replaced by the
# maximum or minimum value for bit_size.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
         (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
         (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Floating-point subtract; same rounding-mode handling as fadd.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference: the smaller source is subtracted from the larger one.
# uabs_isub takes signed sources and produces an unsigned result.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
673
# Floating-point multiply; same rounding-mode handling as fadd.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")

# 32-bit only multiply that returns 0.0 whenever either source is zero.
binop("fmulz", tfloat32, _2src_commutative + associative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
else
   dst = src0 * src1;
""", description = """
Unlike :nir:alu-op:`fmul`, anything (even infinity or NaN) multiplied by zero is
always zero. ``fmulz(0.0, inf)`` and ``fmulz(0.0, nan)`` must be +/-0.0, even
if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If ``SIGNED_ZERO_PRESERVE`` is
used, then the result must be a positive zero if either operand is zero.
""")


binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""", description = "Low 32-bits of signed/unsigned integer multiply")

# Widening multiplies: 32-bit sources are cast to 64 bits before multiplying.
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1",
              description = "Multiply signed 32-bit integers, 64-bit result")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1",
              description = "Multiply unsigned 32-bit integers, 64-bit result")
711
712binop("imul_high", tint, _2src_commutative, """
713if (bit_size == 64) {
714   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
715    * extension to work properly.  The casts are kind-of annoying but needed
716    * to prevent compiler warnings.
717    */
718   uint32_t src0_u32[4] = {
719      src0,
720      (int64_t)src0 >> 32,
721      (int64_t)src0 >> 63,
722      (int64_t)src0 >> 63,
723   };
724   uint32_t src1_u32[4] = {
725      src1,
726      (int64_t)src1 >> 32,
727      (int64_t)src1 >> 63,
728      (int64_t)src1 >> 63,
729   };
730   uint32_t prod_u32[4];
731   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
732   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
733} else {
734   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
735    * potential overflow of signed multiply */
736   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
737}
738""", description = "High 32-bits of signed integer multiply")
739
# High half of the unsigned double-width product.  The 64-bit case splits
# each source into two 32-bit limbs and takes limbs 2 and 3 of the product
# computed by ubm_mul_u32arr(); narrower bit sizes multiply in 64 bits and
# shift right by bit_size.
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""", description = "High 32-bits of unsigned integer multiply")
752
# Product of the low halves (bit_size / 2 bits each) of the two sources.
binop(
   "umul_low", tuint32, _2src_commutative,
   """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""",
   description = "Low 32-bits of unsigned integer multiply",
)

# 32-bit by 16-bit multiplies: src1 is narrowed to a signed or unsigned
# 16-bit value before the multiplication.
binop(
   "imul_32x16", tint32, "",
   "src0 * (int16_t) src1",
   description = "Multiply 32-bits with low 16-bits, with sign extension",
)
binop(
   "umul_32x16", tuint32, "",
   "src0 * (uint16_t) src1",
   description = "Multiply 32-bits with low 16-bits, with zero extension",
)
762
# Division.  The integer constant expressions yield 0 for a zero divisor
# instead of evaluating the division; fdiv divides unconditionally.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
766
# Carry out of an unsigned addition: the wrapped sum is smaller than an
# operand exactly when the addition overflowed.
binop_convert(
   "uadd_carry", tuint, tuint, _2src_commutative,
   "src0 + src1 < src0",
   description = """
Return an integer (1 or 0) representing the carry resulting from the
addition of the two unsigned arguments.
              """,
)

# Borrow out of an unsigned subtraction (not commutative).
binop_convert(
   "usub_borrow", tuint, tuint, "",
   "src0 < src1",
   description = """
Return an integer (1 or 0) representing the borrow resulting from the
subtraction of the two unsigned arguments.
              """,
)
778
# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)       >> 1)
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
804
# Unsigned modulus; a zero divisor yields 0.
binop(
   "umod", tuint, "",
   "src1 == 0 ? 0 : src0 % src1",
)

# Signed integers admit several definitions of "modulus"/"remainder".  The
# conventions of LLVM and SPIR-V are followed here:
#
#  - irem is the C/C++ signed "%" operator.
#  - imod is the mathematical modulus: when the C remainder is non-zero and
#    the operands have different signs, src1 is added to it.
#
# Both yield 0 for a zero divisor.  For background on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop(
   "irem", tint, "",
   "src1 == 0 ? 0 : src0 % src1",
)
binop(
   "imod", tint, "",
   "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
   "                 src0 % src1 : src0 % src1 + src1)",
)
# Floating-point modulus (quotient rounded with floor) and remainder (quotient
# rounded with trunc).
#
# The rounding function is selected by bit_size: floorf()/truncf() take a
# float argument, so using them on 64-bit sources would first narrow the
# quotient to float.  The branches are written as separate statements rather
# than as a ?: inside the expression so that the non-64-bit case still
# evaluates exactly the original expression.
binop("fmod", tfloat, "", """
if (bit_size == 64)
   dst = src0 - src1 * floor(src0 / src1);
else
   dst = src0 - src1 * floorf(src0 / src1);
""")
binop("frem", tfloat, "", """
if (bit_size == 64)
   dst = src0 - src1 * trunc(src0 / src1);
else
   dst = src0 - src1 * truncf(src0 / src1);
""")
821
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

# Float comparisons.  Only the symmetric tests (feq/fneu) are flagged as
# 2-source commutative.  fneu uses C's "!=", which is also true when either
# operand is NaN.
binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
# Signed integer comparisons.
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
# Unsigned integer comparisons.
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
839
# Single-bit tests: shift src0 right by (src1 & (bit_size - 1)) and compare
# the lowest bit with one (bitnz) or zero (bitz).
binop_compare_all_sizes(
   "bitnz", tuint, "",
   "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x1",
   "only uses the least significant bits like SM5 shifts",
   tuint32,
)

binop_compare_all_sizes(
   "bitz", tuint, "",
   "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x0",
   "only uses the least significant bits like SM5 shifts",
   tuint32,
)
845
# Integer-aware GLSL-style vector comparisons for floats and ints.  Each
# pair of components is compared, then the per-component results are folded
# together with && ("all") or || ("any").

binop_reduce_all_sizes(
   "ball_fequal", 1, tfloat,
   "{src0} == {src1}", "{src0} && {src1}", "{src}",
)
binop_reduce_all_sizes(
   "bany_fnequal", 1, tfloat,
   "{src0} != {src1}", "{src0} || {src1}", "{src}",
)
binop_reduce_all_sizes(
   "ball_iequal", 1, tint,
   "{src0} == {src1}", "{src0} && {src1}", "{src}",
)
binop_reduce_all_sizes(
   "bany_inequal", 1, tint,
   "{src0} != {src1}", "{src0} || {src1}", "{src}",
)

# Non-integer-aware GLSL-style variants: the folded result is turned into
# 1.0f or 0.0f by the final expression.

binop_reduce(
   "fall_equal", 1, tfloat32, tfloat32,
   "{src0} == {src1}", "{src0} && {src1}", "{src} ? 1.0f : 0.0f",
)
binop_reduce(
   "fany_nequal", 1, tfloat32, tfloat32,
   "{src0} != {src1}", "{src0} || {src1}", "{src} ? 1.0f : 0.0f",
)
863
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively.  Only the symmetric tests (seq/sne) are flagged as
# 2-source commutative.

binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
871
# Text appended to the description of each shift opcode below.
shift_note = """
SPIRV shifts are undefined for shift-operands >= bitsize,
but SM5 shifts are defined to use only the least significant bits.
The NIR definition is according to the SM5 specification.
"""

# Shifts.  The shift count (src1) is always a 32-bit unsigned value and is
# masked with (bit width of src0) - 1 before use.
opcode(
   "ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
   "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))",
   description = f"Left shift.{shift_note}",
)
opcode(
   "ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
   "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
   description = f"Signed right-shift.{shift_note}",
)
opcode(
   "ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
   "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
   description = f"Unsigned right-shift.{shift_note}",
)

# Rotates.  The two halves are shifted by src1 and by -src1, both masked to
# the bit width of src0, and OR-ed together.
opcode(
   "urol", 0, tuint, [0, 0], [tuint, tuint32], False, "",
   """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""",
)
opcode(
   "uror", 0, tuint, [0, 0], [tuint, tuint32], False, "",
   """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""",
)
898
# Description template shared by the bitwise opcodes; {0} is the operation
# name.
bitwise_description = """
Bitwise {0}, also used as a boolean {0} for hardware supporting integers.
"""

binop(
   "iand", tuint, _2src_commutative + associative,
   "src0 & src1",
   description = bitwise_description.format("AND"),
)
binop(
   "ior", tuint, _2src_commutative + associative,
   "src0 | src1",
   description = bitwise_description.format("OR"),
)
binop(
   "ixor", tuint, _2src_commutative + associative,
   "src0 ^ src1",
   description = bitwise_description.format("XOR"),
)
909
910
# Dot product: per-component products folded with addition.  Registered once
# with output size 1 and once, under the "_replicated" suffix, with output
# size 0.
binop_reduce(
   "fdot", 1, tfloat, tfloat,
   "{src0} * {src1}", "{src0} + {src1}", "{src}",
)

binop_reduce(
   "fdot", 0, tfloat, tfloat,
   "{src0} * {src1}", "{src0} + {src1}", "{src}",
   suffix="_replicated",
)

# Homogeneous dot product: a 3-component src0 against a 4-component src1,
# with src1.w added to the sum.
opcode(
   "fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
   "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w",
)
opcode(
   "fdph_replicated", 0, tfloat, [3, 4], [tfloat, tfloat], False, "",
   "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w",
)
922
# Minimum / maximum.  The float forms call the C library fmin()/fmax(); the
# integer forms select with a single "src1 > src0" comparison.
binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")

# Power: pow() for 64-bit sources, powf() for every other bit size.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
931
# "Split" packs: each source supplies one piece of the result; the first
# source lands in the low bits and later sources are shifted above it.

# Two floats converted to half with pack_half_1x16().
binop_horiz(
   "pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
   "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)",
)

# Same layout, converting with pack_half_1x16_rtz().
binop_horiz(
   "pack_half_2x16_rtz_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
   "pack_half_1x16_rtz(src0.x) | (pack_half_1x16_rtz(src1.x) << 16)",
)

binop_convert(
   "pack_64_2x32_split", tuint64, tuint32, "",
   "src0 | ((uint64_t)src1 << 32)",
)

binop_convert(
   "pack_32_2x16_split", tuint32, tuint16, "",
   "src0 | ((uint32_t)src1 << 16)",
)

# Four 8-bit sources packed into one 32-bit value.
opcode(
   "pack_32_4x8_split", 0, tuint32,
   [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
   False, "",
   "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)",
)
947
# Bitfield mask: a run of `bits` ones shifted left by `offset`, both taken
# from the low five bits of the sources.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""", description = """
Implements the behavior of the first operation of the SM5 "bfi" assembly
and that of the "bfi1" i965 instruction. That is, the bits and offset values
are from the low five bits of src0 and src1, respectively.
""")

# ldexp: float value scaled by a 32-bit integer exponent.  Any result for
# which isnormal() is false is replaced by a zero carrying the sign of src0.
# NOTE(review): isnormal() is also false for infinities and NaN, so those
# results are replaced as well -- confirm that this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
964
# Builds a 2-component vector from the .x component of each source.
binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""", description = """
Combines the first component of each input to make a 2-component vector.
""")

# Byte extraction: src1 selects which byte of src0; the cast picks zero
# (u8) or sign (i8) extension.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: same as above with 16-bit units.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")

# Byte/word insertion: the low byte/word of src0 is moved to the position
# selected by src1; all other result bits are zero.
binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")
983
984
def triop(name, ty, alg_props, const_expr, description = ""):
   """Register a three-source opcode that uses ``ty`` for every operand.

   The output and all three inputs share the type ``ty``; the output size and
   every input size are passed as 0 and the opcode is not a conversion.
   ``alg_props``, ``const_expr`` and ``description`` are forwarded unchanged
   to ``opcode``.
   """
   src_sizes = [0] * 3
   src_types = [ty] * 3
   opcode(name, 0, ty, src_sizes, src_types, False, alg_props, const_expr,
          description)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr,
                description = ""):
   """Register a three-source opcode with explicit operand sizes.

   The output and every input are typed ``tuint``; the caller supplies the
   output size and the size of each source.  No algebraic properties are
   set and the opcode is not a conversion.
   """
   input_sizes = [src1_size, src2_size, src3_size]
   input_types = [tuint] * len(input_sizes)
   opcode(name, output_size, tuint, input_sizes, input_types, False, "",
          const_expr, description)
993
# Fused multiply-add.  When round-toward-zero is selected for this bit size
# the _mesa_*_fma_rtz helpers are used (bit sizes other than 64 and 32 go
# through the double helper and _mesa_double_to_float_rtz); otherwise the C
# library fmaf() is used for 32-bit and fma() for everything else.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# 32-bit only variant of ffma in which a zero multiplicand always yields
# 0.0 + src2; see the description string for the exact contract.
triop("ffmaz", tfloat32, _2src_commutative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0 + src2;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_float_fma_rtz(src0, src1, src2);
else
   dst = fmaf(src0, src1, src2);
""", description = """
Floating-point multiply-add with modified zero handling.

Unlike :nir:alu-op:`ffma`, anything (even infinity or NaN) multiplied by zero is
always zero. ``ffmaz(0.0, inf, src2)`` and ``ffmaz(0.0, nan, src2)`` must be
``+/-0.0 + src2``, even if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If
``SIGNED_ZERO_PRESERVE`` is used, then the result must be a positive
zero plus src2 if either src0 or src1 is zero.
""")
1026
# Linear interpolation between src0 and src1 with weight src2.
triop(
   "flrp", tfloat, "",
   "src0 * (1 - src2) + src1 * src2",
)

triop(
   "iadd3", tint, _2src_commutative + associative,
   "src0 + src1 + src2",
   description = "Ternary addition",
)

triop(
   "imad", tint, _2src_commutative + associative,
   "src0 * src1 + src2",
   description = "Integer multiply-add",
)
1034
# Description template for the conditional selects.  The two placeholders
# are the kind of boolean used as condition and its false/true encoding.
csel_description = """
A vector conditional select instruction (like ?:, but operating per-
component on vectors). The condition is {} bool ({}).
"""

# Select on a float condition: anything other than 0.0 picks src1.
triop(
   "fcsel", tfloat32, selection,
   "(src0 != 0.0f) ? src1 : src2",
   description = csel_description.format("a floating point", "0.0 vs 1.0"),
)

# Selects on a boolean condition of 1, 8, 16 or 32 bits.  Only the type of
# the condition source differs between them.
opcode(
   "bcsel", 0, tuint,
   [0, 0, 0], [tbool1, tuint, tuint],
   False, selection,
   "src0 ? src1 : src2",
   description = csel_description.format("a 1-bit", "0 vs 1"),
)
opcode(
   "b8csel", 0, tuint,
   [0, 0, 0], [tbool8, tuint, tuint],
   False, selection,
   "src0 ? src1 : src2",
   description = csel_description.format("an 8-bit", "0 vs ~0"),
)
opcode(
   "b16csel", 0, tuint,
   [0, 0, 0], [tbool16, tuint, tuint],
   False, selection,
   "src0 ? src1 : src2",
   description = csel_description.format("a 16-bit", "0 vs ~0"),
)
opcode(
   "b32csel", 0, tuint,
   [0, 0, 0], [tbool32, tuint, tuint],
   False, selection,
   "src0 ? src1 : src2",
   description = csel_description.format("a 32-bit", "0 vs ~0"),
)

# Selects whose condition is a comparison of src0 against zero.
triop(
   "i32csel_gt", tint32, selection,
   "(src0 > 0) ? src1 : src2",
)
triop(
   "i32csel_ge", tint32, selection,
   "(src0 >= 0) ? src1 : src2",
)

triop(
   "fcsel_gt", tfloat32, selection,
   "(src0 > 0.0f) ? src1 : src2",
)
triop(
   "fcsel_ge", tfloat32, selection,
   "(src0 >= 0.0f) ? src1 : src2",
)
1060
# Bitfield insert driven by a mask (src0): `insert` (src1) is shifted left
# until it lines up with the lowest set bit of the mask, then merged into
# `base` (src2) under the mask.  A zero mask returns base unchanged.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""", description = "SM5 bfi assembly")


# Per-bit select: bits of src1 where src0 is set, bits of src2 elsewhere.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
1077
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
# A width of 0 yields 0.  When offset + bits reaches 32 the field is simply
# base >> offset; otherwise it is isolated with a left shift followed by a
# right shift.  ubfe works on an unsigned base and ibfe on a signed one.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
1105
# GLSL bitfieldExtract()
# offset and bits are signed here.  A width of 0 yields 0, and so does any
# negative offset/width or a field reaching past bit 31 (the constant
# expression picks 0 for those cases).
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed variant: the field is moved to the top of the word and shifted
# back down so that its highest bit is replicated.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1131
# Masked sum of absolute differences; the constant expression defers to the
# msad() helper.
triop("msad_4x8", tuint32, "", """
dst = msad(src0, src1, src2);
""", description = """
Masked sum of absolute differences with accumulation. Equivalent to AMD's v_msad_u8
instruction and DXIL's MSAD.

The first two sources contain packed 8-bit unsigned integers, the instruction
will calculate the absolute difference of integers when src0's is non-zero, and
then add them together. There is also a third source which is a 32-bit unsigned
integer and added to the result.
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1151
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr, description = ""):
   """Register a four-source opcode with explicit operand sizes.

   The output and every input are typed ``tuint``; the caller supplies the
   output size and the size of each source.  No algebraic properties are
   set and the opcode is not a conversion.

   ``description`` is optional and defaults to the empty string, mirroring
   ``triop_horiz``, so existing callers are unaffected.
   """
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size, src4_size],
          [tuint, tuint, tuint, tuint],
          False, "", const_expr, description)
1158
# Bitfield insert with explicit offset/width: the low `bits` bits of
# `insert` replace bits [offset, offset + bits) of `base`.  A width of 0
# returns base; a negative offset/width or a field reaching past bit 31
# yields 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Builds a 4-component vector from the .x component of each source.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
1179
# Wider vector constructors.  Each takes N single-component tuint sources
# and copies the .x of source i into component i of the result.
opcode("vec5", 5, tuint,
       [1] * 5, [tuint] * 5,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
""")

opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1223
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")

# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int.
# The "<< 8 >> 8" pairs sign-extend each factor from 24 bits.
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
1249
# r600/gcn specific instruction that evaluates unnormalized cube texture coordinates
# and face index
# The actual texture coordinates are evaluated from this according to
#    dst.yx / abs(dst.z) + 1.5
#
# dst.z receives twice the source component with the largest magnitude and
# dst.w the face index 0..5 (+X, -X, +Y, -Y, +Z, -Z).  The tests run in
# X, Y, Z order and each later match overwrites the earlier one, so on a tie
# the last matching axis wins.
unop_horiz("cube_amd", 4, tfloat32, 3, tfloat32, """
   dst.x = dst.y = dst.z = 0.0;
   float absX = fabsf(src0.x);
   float absY = fabsf(src0.y);
   float absZ = fabsf(src0.z);

   if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
   if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
   if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }

   if (src0.x >= 0 && absX >= absY && absX >= absZ) {
      dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
   }
   if (src0.x < 0 && absX >= absY && absX >= absZ) {
      dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
   }
   if (src0.y >= 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = src0.z; dst.w = 2;
   }
   if (src0.y < 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
   }
   if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
      dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
   }
   if (src0.z < 0 && absZ >= absX && absZ >= absY) {
      dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
   }
""")
1283
# r600/gcn specific sin and cos
# these trigonometric functions need some lowering because the supported
# input values are expected to be normalized by dividing by (2 * pi)
unop("fsin_amd", tfloat, "sinf(6.2831853 * src0)")
unop("fcos_amd", tfloat, "cosf(6.2831853 * src0)")

# Midgard specific sin and cos
# These expect their inputs to be divided by pi.
unop("fsin_mdg", tfloat, "sinf(3.141592653589793 * src0)")
unop("fcos_mdg", tfloat, "cosf(3.141592653589793 * src0)")

# AGX specific sin with input expressed in quadrants. Used in the lowering for
# fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
# the angle is further decomposed by quadrant, sinc is computed, and the angle
# is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
# additional ALU that NIR may be able to optimize.
unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")
1301
# AGX specific bitfield extraction from a pair of 32bit registers.
# src0,src1: the two registers (src0 supplies the low 32 bits of the 64-bit
#            value that is shifted, src1 the high 32 bits)
# src2: bit position of the LSB of the bitfield (only the low 7 bits are
#       used; positions >= 64 yield 0)
# src3: number of bits in the bitfield if src3 > 0
#       src3 = 0 is equivalent to src3 = 32
# NOTE: src3 is a nir constant by contract
#
# The mask is built from an unsigned 1: with a plain int, src3 == 31 would
# shift into the sign bit, which is undefined behaviour in C.
opcode("extr_agx", 0, tuint32,
       [0, 0, 0, 0], [tuint32, tuint32, tuint32, tuint32], False, "", """
    uint32_t mask = 0xFFFFFFFF;
    uint8_t shift = src2 & 0x7F;
    if (src3 != 0) {
       mask = (1u << src3) - 1;
    }
    if (shift >= 64) {
        dst = 0;
    } else {
        dst = (((((uint64_t) src1) << 32) | (uint64_t) src0) >> shift) & mask;
    }
""")
1321
# AGX multiply-shift-add. Corresponds to iadd/isub/imad/imsub instructions.
# The shift must be <= 4 (domain restriction). For performance, it should be
# constant.
#
# The constant expressions are plain string literals: they contain no
# placeholders, so an f-string prefix would serve no purpose.
opcode("imadshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", "(src0 * src1) + (src2 << src3)")
opcode("imsubshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", "(src0 * src1) - (src2 << src3)")
1329
# Bit interleave of two 16-bit sources: bit i of src0 lands in result bit
# 2 * i and bit i of src1 in result bit 2 * i + 1.
#
# The sources are widened to uint32_t before shifting.  Left as 16-bit
# values they would be promoted to signed int, and moving bit 15 of src1 up
# to bit 31 would overflow it, which is undefined behaviour in C.
binop_convert("interleave_agx", tuint32, tuint16, "", """
      dst = 0;
      for (unsigned bit = 0; bit < 16; bit++) {
          dst |= ((uint32_t)src0 & (1u << bit)) << bit;
          dst |= ((uint32_t)src1 & (1u << bit)) << (bit + 1);
      }""", description="""
      Interleave bits of 16-bit integers to calculate a 32-bit integer. This can
      be used as-is for Morton encoding.
      """)
1339
# 24-bit multiplies.  Each factor is reduced to 24 bits with a "<< 8 >> 8"
# pair, which sign-extends for the int32_t casts and zero-extends for the
# uint32_t casts.

# Signed 24b multiply into a 32b result.
binop(
   "imul24", tint32, _2src_commutative + associative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)",
)

# Unsigned 24b multiply into a 32b result, plus a 32b addend.
triop(
   "umad24", tuint32, _2src_commutative,
   "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2",
)

# Unsigned 24b multiply into a 32b result.
# NOTE(review): registered with tint32 although the expression and the
# _relaxed variant are unsigned -- confirm whether tuint32 was intended.
binop(
   "umul24", tint32, _2src_commutative + associative,
   "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)",
)

# Relaxed versions of the above: the inputs are assumed to be within the
# 24-bit range already, so no clamping is applied.
binop(
   "imul24_relaxed", tint32, _2src_commutative + associative,
   "src0 * src1",
)
triop(
   "umad24_relaxed", tuint32, _2src_commutative,
   "src0 * src1 + src2",
)
binop(
   "umul24_relaxed", tuint32, _2src_commutative + associative,
   "src0 * src1",
)

# Float classification, evaluated with the C isnormal()/isfinite() macros.
# fisfinite32 differs from fisfinite only in producing a 32-bit boolean.
unop_convert(
   "fisnormal", tbool1, tfloat,
   "isnormal(src0)",
)
unop_convert(
   "fisfinite", tbool1, tfloat,
   "isfinite(src0)",
)
unop_convert(
   "fisfinite32", tbool32, tfloat,
   "isfinite(src0)",
)
1360
# vc4-specific opcodes
#
# Each of these treats its 32-bit sources as four packed 8-bit channels and
# loops over the channels in steps of 8 bits.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# A channel is left at 0 unless src0's channel is larger than src1's.
binop("ussub_4x8_vc4", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
1407
# v3d-specific opcodes

# v3d-specific (v71) instruction that packs bits of 2 2x16 floating point into
# r11g11b10 bits, rounding to nearest even, so
#  dst[10:0]  = float16_to_float11 (src0[15:0])
#  dst[21:11] = float16_to_float11 (src0[31:16])
#  dst[31:22] = float16_to_float10 (src1[15:0])
binop_convert("pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
              "pack_32_to_r11g11b10_v3d(src0, src1)")

# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
# difference with pack_32_2x16_split is that the sources are 32bit too. So it
# receives two 32-bit integers, and packs the lower halfword of each as 2x16
# in a 32-bit integer.
binop_horiz("pack_2x32_to_2x16_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
            "(src0.x & 0xffff) | (src1.x << 16)")

# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
# r10g10b10a2:
#   dst[9:0]   = src0[9:0]
#   dst[19:10] = src0[25:16]
#   dst[29:20] = src1[9:0]
#   dst[31:30] = src1[17:16]
# The last term masks with 0x3ff as well; shifted left by 30, only its two
# lowest bits remain inside a 32-bit result.
binop_convert("pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
              "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")

# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
#   dst[7:0]   = src0[7:0]
#   dst[15:8]  = src0[23:16]
#   dst[23:16] = src1[7:0]
#   dst[31:24] = src1[23:16]
opcode("pack_4x16_to_4x8_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
       False, "",
       "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")

# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
# The two converted values are placed at bits 0 and 16 of the result.
unop("pack_2x16_to_unorm_2x8_v3d", tuint32,
     "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
unop("pack_2x16_to_snorm_2x8_v3d", tuint32,
     "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")

# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
unop("f2unorm_16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
unop("f2snorm_16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")

# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
unop("pack_2x16_to_unorm_2x10_v3d", tuint32, "pack_2x16_to_unorm_2x10(src0)")

# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
# and one 10 bit unorm
unop("pack_2x16_to_unorm_10_2_v3d", tuint32, "pack_2x16_to_unorm_10_2(src0)")
1459
# Mali-specific opcodes

# Clamp to [-1.0, 1.0] and clamp below at 0.0, respectively.
unop(
   "fsat_signed_mali", tfloat,
   "fmin(fmax(src0, -1.0), 1.0)",
)
unop(
   "fclamp_pos_mali", tfloat,
   "fmax(src0, 0.0)",
)

# 32-bit boolean select whose value sources are typed as float; see the
# description string for the reason.
opcode(
   "b32fcsel_mdg", 0, tuint,
   [0, 0, 0], [tbool32, tfloat, tfloat],
   False, selection,
   "src0 ? src1 : src2",
   description = csel_description.format("a 32-bit", "0 vs ~0") + """
       This Midgard-specific variant takes floating-point sources, rather than
       integer sources. That includes support for floating point modifiers in
       the backend.
       """,
)

# Magnitude equal to fddx/y, sign undefined. Derivative of a constant is
# zero, hence the constant expression "0.0".
unop(
   "fddx_must_abs_mali", tfloat, "0.0",
   algebraic_properties = "derivative",
)
unop(
   "fddy_must_abs_mali", tfloat, "0.0",
   algebraic_properties = "derivative",
)
1475
# DXIL specific double [un]pack
#
# DXIL has no generic [un]pack instructions, so those are normally lowered to
# bit operations.  HLSL offers no 64-bit bitcast to/from double, only
# [un]pack; DXIL technically does, but since HLSL cannot generate it we want
# to match what DXC would emit.  These opcodes are the standard [un]pack,
# except that they are not lowered, so the backend can turn them into
# MakeDouble/SplitDouble.
unop_horiz(
   "pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
   "dst.x = src0.x | ((uint64_t)src0.y << 32);",
)
unop_horiz(
   "unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
   "dst.x = src0.x; dst.y = src0.x >> 32;",
)
1487
# src0 and src1 are i8vec4 packed in an int32, and src2 is an int32.  The int8
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors.  src2 is added to the result of the dot-product.
#
# The sum of the four products always fits in an int32_t (each product is at
# most 128 * 128 in magnitude), but adding src2 to it can overflow.  The final
# accumulate is therefore performed on uint32_t so that it wraps around
# instead of invoking signed-overflow undefined behavior in C.
opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const int32_t v1x = (int8_t)(src1      );
   const int32_t v1y = (int8_t)(src1 >>  8);
   const int32_t v1z = (int8_t)(src1 >> 16);
   const int32_t v1w = (int8_t)(src1 >> 24);

   const int32_t dot = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w);

   dst = (uint32_t)dot + (uint32_t)src2;
""")
1504
# Like sdot_4x8_iadd, but unsigned: src0 and src1 are u8vec4 packed in a
# uint32, the 8-bit components are zero-extended to 32-bits, and the
# accumulator src2 and the result are declared tuint32.
opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint32_t v0x = (uint8_t)(src0      );
   const uint32_t v0y = (uint8_t)(src0 >>  8);
   const uint32_t v0z = (uint8_t)(src0 >> 16);
   const uint32_t v0w = (uint8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")
1519
# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and
# src2 is an int32.  The 8-bit components are extended to 32-bits (src0
# components are sign-extended, src1 components are zero-extended), and a
# dot-product is performed on the resulting vectors.  src2 is added to the
# result of the dot-product.
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative, so no
# algebraic properties are passed.
opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, "", """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")
1540
# Like sdot_4x8_iadd, but the result is clamped to the range
# [-0x80000000, 0x7fffffff].  The components and the intermediate sum are
# held in int64_t, so the sum is computed without wrapping before it is
# clamped.
opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const int64_t v1x = (int8_t)(src1      );
   const int64_t v1y = (int8_t)(src1 >>  8);
   const int64_t v1z = (int8_t)(src1 >> 16);
   const int64_t v1w = (int8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
1557
# Like udot_4x8_uadd, but the result is clamped to the range [0, 0xffffffff].
#
# src2 is declared tint32 here, so it is explicitly converted to uint32_t
# before being widened to 64 bits.  Without the conversion, an accumulator
# with its top bit set would be sign-extended into the 64-bit sum, and the
# result would wrap around instead of saturating (e.g. a dot-product of 5
# plus an accumulator of 0xfffffffe would yield 3 rather than 0xffffffff).
# The declared opcode types are left untouched so users of the opcode
# metadata are unaffected.
opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint8_t)(src0      );
   const uint64_t v0y = (uint8_t)(src0 >>  8);
   const uint64_t v0z = (uint8_t)(src0 >> 16);
   const uint64_t v0w = (uint8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) +
                        (uint32_t)src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
1574
# Like sudot_4x8_iadd, but the result is clamped to the range
# [-0x80000000, 0x7fffffff].
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative, so no
# algebraic properties are passed.
#
# NOTE(review): the products mix int64_t and uint64_t operands, so C
# evaluates them as uint64_t and the sum is converted back to int64_t when
# it is stored in tmp.  This presumably relies on two's-complement
# conversion -- confirm for the supported compilers.
opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, "", """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
1594
# src0 and src1 are i16vec2 packed in an int32, and src2 is an int32.  The int16
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors.  src2 is added to the result of the dot-product.
#
# Each individual product fits in an int32_t (at most 32768 * 32768 = 2^30 in
# magnitude), but the sum of the two products can already reach 2^31, and
# adding src2 can overflow as well.  The additions are therefore performed on
# uint32_t so that they wrap around instead of invoking signed-overflow
# undefined behavior in C.
opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int32_t v0x = (int16_t)(src0      );
   const int32_t v0y = (int16_t)(src0 >> 16);
   const int32_t v1x = (int16_t)(src1      );
   const int32_t v1y = (int16_t)(src1 >> 16);

   dst = (uint32_t)(v0x * v1x) + (uint32_t)(v0y * v1y) + (uint32_t)src2;
""")
1607
# Like sdot_2x16_iadd, but unsigned: src0 and src1 are u16vec2 packed in a
# uint32, the 16-bit components are zero-extended to 32-bits, and the
# accumulator src2 and the result are declared tuint32.
opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint32_t v0x = (uint16_t)(src0      );
   const uint32_t v0y = (uint16_t)(src0 >> 16);
   const uint32_t v1x = (uint16_t)(src1      );
   const uint32_t v1y = (uint16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""")
1618
# Like sdot_2x16_iadd, but the result is clamped to the range
# [-0x80000000, 0x7fffffff].  The components and the intermediate sum are
# held in int64_t, so the sum is computed without wrapping before it is
# clamped.
opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int64_t v0x = (int16_t)(src0      );
   const int64_t v0y = (int16_t)(src0 >> 16);
   const int64_t v1x = (int16_t)(src1      );
   const int64_t v1y = (int16_t)(src1 >> 16);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")
1631
# Like udot_2x16_uadd, but the result is clamped to the range [0, 0xffffffff].
#
# src2 is declared tint32 here, so it is explicitly converted to uint32_t
# before being widened to 64 bits.  Without the conversion, an accumulator
# with its top bit set would be sign-extended into the 64-bit sum, and the
# result would wrap around instead of saturating.  The declared opcode types
# are left untouched so users of the opcode metadata are unaffected.
opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint16_t)(src0      );
   const uint64_t v0y = (uint16_t)(src0 >> 16);
   const uint64_t v1x = (uint16_t)(src1      );
   const uint64_t v1y = (uint16_t)(src1 >> 16);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (uint32_t)src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
1644