• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Connor Abbott (cwabbott0@gmail.com)
25
26import re
27
28# Class that represents all the information we have about the opcode
29# NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size is an int; accepted values are 0 through 5, 8 and 16
      - input_sizes is a list of ints with the same accepted values as
        output_size, one entry per source.  When output_size is non-zero,
        no input size may be zero.
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types, the same length as input_sizes
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      The number of sources is stored in num_inputs; every other argument is
      stored unchanged in the attribute of the same name.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      # Check every entry rather than only the first one, so a malformed
      # later entry is reported here and an empty list does not raise
      # IndexError.
      assert all(isinstance(size, int) for size in input_sizes)
      assert isinstance(input_types, list)
      assert all(isinstance(type_, str) for type_ in input_types)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 5 or (output_size == 8) or (output_size == 16)
      for size in input_sizes:
         assert 0 <= size <= 5 or (size == 8) or (size == 16)
         # A non-zero output size may not be combined with a zero-sized input.
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
92
# helper variables for strings
#
# Each constant is shorthand for the NIR type string of the same spelling, so
# opcode definitions can write tfloat32 instead of "float32".  Names without
# a trailing bit size (tfloat, tint, tbool, tuint) are the unsized forms;
# type_sizes() lists the bit sizes each of them stands for.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"
112
113_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
114
def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size.

    Asserts that type_ is matched by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
119
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string as an int.

    Asserts that type_ is matched by _TYPE_SPLIT_RE and that it has a
    bit-size suffix.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
126
def type_sizes(type_):
    """Return the list of bit sizes a NIR type string can take.

    A sized type yields a one-element list holding its own size.  The unsized
    'bool' and 'float' types have their own size lists; any other unsized
    type gets [1, 8, 16, 32, 64].
    """
    if type_has_size(type_):
        return [type_size(type_)]

    # Size lists for the unsized types that differ from the default.
    special_cases = {
        'bool': [1, 8, 16, 32],
        'float': [16, 32, 64],
    }
    return special_cases.get(type_, [1, 8, 16, 32, 64])
136
def type_base_type(type_):
    """Return the base type name (the 'type' group of _TYPE_SPLIT_RE) of a
    NIR type string, i.e. the string without any bit-size suffix.

    Asserts that type_ is matched by _TYPE_SPLIT_RE.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
141
142# Operation where the first two sources are commutative.
143#
# For 2-source operations, this is just mathematical commutativity.  Some
145# 3-source operations, like ffma, are only commutative in the first two
146# sources.
147_2src_commutative = "2src_commutative "
148associative = "associative "
149selection = "selection "
150
151# global dictionary of opcodes
152opcodes = {}
153
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
   """Create an Opcode from the arguments and record it in the global
   opcodes dictionary, keyed by name.

   A name may only be registered once; a duplicate trips the assertion.
   The arguments are passed through to the Opcode constructor unchanged.
   """
   assert name not in opcodes
   info = Opcode(name=name,
                 output_size=output_size,
                 output_type=output_type,
                 input_sizes=input_sizes,
                 input_types=input_types,
                 is_conversion=is_conversion,
                 algebraic_properties=algebraic_properties,
                 const_expr=const_expr)
   opcodes[name] = info
160
def unop_convert(name, out_type, in_type, const_expr):
   """Register a one-source opcode whose destination type differs from its
   source type.

   Both sizes are passed as 0, no algebraic properties are set and the
   opcode is not flagged as a conversion.
   """
   opcode(name=name,
          output_size=0,
          output_type=out_type,
          input_sizes=[0],
          input_types=[in_type],
          is_conversion=False,
          algebraic_properties="",
          const_expr=const_expr)
163
def unop(name, ty, const_expr):
   """Register a one-source opcode whose source and destination share the
   type ty.

   Both sizes are passed as 0, no algebraic properties are set and the
   opcode is not flagged as a conversion.
   """
   opcode(name=name,
          output_size=0,
          output_type=ty,
          input_sizes=[0],
          input_types=[ty],
          is_conversion=False,
          algebraic_properties="",
          const_expr=const_expr)
166
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-source opcode with explicitly given output and input
   sizes and types.

   No algebraic properties are set and the opcode is not flagged as a
   conversion.
   """
   opcode(name=name,
          output_size=output_size,
          output_type=output_type,
          input_sizes=[input_size],
          input_types=[input_type],
          is_conversion=False,
          algebraic_properties="",
          const_expr=const_expr)
171
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register the 2-, 3- and 4-component variants of a one-source reduction.

   The three opcodes are named name + "2", name + "3" and name + "4".  Their
   constant expressions are assembled from format strings:

   - prereduce_expr is formatted with {src} for each of src0.x .. src0.w and
     the result is wrapped in parentheses
   - reduce_expr is formatted with {src0} and {src1} to combine two terms
   - final_expr is formatted with {src} set to the parenthesized reduction
   """
   # Pre-reduced, parenthesized terms for the four components.
   term_x, term_y, term_z, term_w = [
      "(" + prereduce_expr.format(src="src0." + component) + ")"
      for component in "xyzw"
   ]

   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def register(num_components, reduced):
      unop_horiz(name + str(num_components), output_size, output_type,
                 num_components, input_type,
                 final_expr.format(src="(" + reduced + ")"))

   register(2, combine(term_x, term_y))
   register(3, combine(combine(term_x, term_y), term_z))
   # The 4-wide form reduces the two halves separately and then combines them.
   register(4, combine(combine(term_x, term_y), combine(term_z, term_w)))
190
def unop_numeric_convert(name, out_type, in_type, const_expr):
   """Register a one-source opcode that is flagged as a type conversion
   (is_conversion is True).

   Both sizes are passed as 0 and no algebraic properties are set.
   """
   opcode(name=name,
          output_size=0,
          output_type=out_type,
          input_sizes=[0],
          input_types=[in_type],
          is_conversion=True,
          algebraic_properties="",
          const_expr=const_expr)
193
194unop("mov", tuint, "src0")
195
196unop("ineg", tint, "-src0")
197unop("fneg", tfloat, "-src0")
198unop("inot", tint, "~src0") # invert every bit of the integer
199
200# nir_op_fsign roughly implements the OpenGL / Vulkan rules for sign(float).
201# The GLSL.std.450 FSign instruction is defined as:
202#
203#    Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.
204#
205# If the source is equal to zero, there is a preference for the result to have
206# the same sign, but this is not required (it is required by OpenCL).  If the
207# source is not a number, there is a preference for the result to be +0.0, but
208# this is not required (it is required by OpenCL).  If the source is not a
209# number, and the result is not +0.0, the result should definitely **not** be
210# NaN.
211#
212# The values returned for constant folding match the behavior required by
213# OpenCL.
214unop("fsign", tfloat, ("bit_size == 64 ? " +
215                       "(isnan(src0) ? 0.0  : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0  : -1.0 )) : " +
216                       "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"))
217unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
218unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
219unop("fabs", tfloat, "fabs(src0)")
220unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
221unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
222unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
223unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
224unop("fexp2", tfloat, "exp2f(src0)")
225unop("flog2", tfloat, "log2f(src0)")
226
# Generate all of the numeric conversion opcodes
#
# Opcode names are built as "<s>2<d><bits>", where <s> and <d> are the first
# letters of the source and destination base types (e.g. f2i32, u2f16, b2b1).
# The source type is left unsized; the destination type is always sized.
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that are generated for each source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 gets three opcodes: f2f16_rtne, f2f16_rtz
              # and plain f2f16 (empty rounding-mode suffix).
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32: a source wider than 32 bits is converted
              # with the rtz helper when the execution mode asks for
              # round-toward-zero at 32 bits.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Everything else: a bool destination tests the source against
              # zero; any other destination just assigns the source.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
284
285# Special opcode that is the same as f2f16, i2i16, u2u16 except that it is safe
286# to remove it if the result is immediately converted back to 32 bits again.
287# This is generated as part of the precision lowering pass. mp stands for medium
288# precision.
289unop_numeric_convert("f2fmp", tfloat16, tfloat32, opcodes["f2f16"].const_expr)
290unop_numeric_convert("i2imp", tint16, tint32, opcodes["i2i16"].const_expr)
291# u2ump isn't defined, because the behavior is equal to i2imp
292unop_numeric_convert("f2imp", tint16, tfloat32, opcodes["f2i16"].const_expr)
293unop_numeric_convert("f2ump", tuint16, tfloat32, opcodes["f2u16"].const_expr)
294unop_numeric_convert("i2fmp", tfloat16, tint32, opcodes["i2f16"].const_expr)
295unop_numeric_convert("u2fmp", tfloat16, tuint32, opcodes["u2f16"].const_expr)
296
297# Unary floating-point rounding operations.
298
299
300unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
301unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
302unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
303unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
304unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
305
306unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
307
308# Trigonometric operations.
309
310
311unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
312unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
313
314# dfrexp
315unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
316unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
317
318# Partial derivatives.
319
320
321unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
322unop("fddy", tfloat, "0.0")
323unop("fddx_fine", tfloat, "0.0")
324unop("fddy_fine", tfloat, "0.0")
325unop("fddx_coarse", tfloat, "0.0")
326unop("fddy_coarse", tfloat, "0.0")
327
328
329# Floating point pack and unpack operations.
330
def pack_2x16(fmt):
   """Register the pack_<fmt>_2x16 opcode.

   The opcode takes a 2-component float32 source and produces a 1-component
   uint32: src0.x is packed with the C helper pack_<fmt>_1x16 into the low
   16 bits and src0.y into the high 16 bits.
   """
   opcode_name = "pack_" + fmt + "_2x16"
   # Every "fmt" in the template is substituted with the requested format.
   template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
   unop_horiz(opcode_name, 1, tuint32, 2, tfloat32,
              template.replace("fmt", fmt))
336
def pack_4x8(fmt):
   """Register the pack_<fmt>_4x8 opcode.

   The opcode takes a 4-component float32 source and produces a 1-component
   uint32: each component is packed with the C helper pack_<fmt>_1x8 and
   placed at bit offsets 0, 8, 16 and 24 for x, y, z and w respectively.
   """
   opcode_name = "pack_" + fmt + "_4x8"
   # Every "fmt" in the template is substituted with the requested format.
   template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
   unop_horiz(opcode_name, 1, tuint32, 4, tfloat32,
              template.replace("fmt", fmt))
344
def unpack_2x16(fmt):
   """Register the unpack_<fmt>_2x16 opcode.

   The opcode takes a 1-component uint32 source and produces a 2-component
   float32 result: dst.x is unpacked from the low 16 bits of src0.x and
   dst.y from the high 16 bits, both with the C helper unpack_<fmt>_1x16.
   """
   # The high half has to be shifted *down* before the truncation to
   # uint16_t.  A left shift by 16 always leaves the low 16 bits zero, so
   # dst.y would be the unpacking of 0 regardless of the input.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
350
def unpack_4x8(fmt):
   """Register the unpack_<fmt>_4x8 opcode.

   The opcode takes a 1-component uint32 source and produces a 4-component
   float32 result: the bytes of src0.x at bit offsets 0, 8, 16 and 24 are
   unpacked with the C helper unpack_<fmt>_1x8 into x, y, z and w.
   """
   opcode_name = "unpack_" + fmt + "_4x8"
   # Every "fmt" in the template is substituted with the requested format.
   template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
   unop_horiz(opcode_name, 4, tfloat32, 1, tuint32,
              template.replace("fmt", fmt))
358
359
360pack_2x16("snorm")
361pack_4x8("snorm")
362pack_2x16("unorm")
363pack_4x8("unorm")
364pack_2x16("half")
365unpack_2x16("snorm")
366unpack_4x8("snorm")
367unpack_2x16("unorm")
368unpack_4x8("unorm")
369unpack_2x16("half")
370
371# Convert two unsigned integers into a packed unsigned short (clamp is applied).
372unop_horiz("pack_uint_2x16", 1, tuint32, 2, tuint32, """
373dst.x = _mesa_unsigned_to_unsigned(src0.x, 16);
374dst.x |= _mesa_unsigned_to_unsigned(src0.y, 16) << 16;
375""")
376
377# Convert two signed integers into a packed signed short (clamp is applied).
378unop_horiz("pack_sint_2x16", 1, tint32, 2, tint32, """
379dst.x = _mesa_signed_to_signed(src0.x, 16) & 0xffff;
380dst.x |= _mesa_signed_to_signed(src0.y, 16) << 16;
381""")
382
383unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
384dst.x = (src0.x & 0xffff) | (src0.y << 16);
385""")
386
387unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
388dst.x = (src0.x <<  0) |
389        (src0.y <<  8) |
390        (src0.z << 16) |
391        (src0.w << 24);
392""")
393
394unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
395           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")
396
397unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
398           "dst.x = src0.x | ((uint32_t)src0.y << 16);")
399
400unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
401           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
402
403unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
404           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
405
406unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
407           "dst.x = src0.x; dst.y = src0.x >> 32;")
408
# Split a 64-bit value into four 16-bit lanes.  The source is a
# single-component vector, so every lane (including dst.w) is taken from
# src0.x; there is no src0.w to read.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
411
412unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
413           "dst.x = src0.x; dst.y = src0.x >> 16;")
414
415unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
416           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
417
# dst.x comes from the low 16 bits of src0.x and dst.y from the high 16 bits.
# The high half must be shifted down before truncating to uint16_t; a left
# shift by 16 would always leave the low 16 bits zero.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
422
423# Lowered floating point unpacking operations.
424
425unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
426             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
427unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
428             "unpack_half_1x16((uint16_t)(src0 >> 16))")
429
430unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
431             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
432unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
433             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
434
435unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
436unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
437
438unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
439unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
440
441# Bit operations, part of ARB_gpu_shader5.
442
443
444unop("bitfield_reverse", tuint32, """
445/* we're not winning any awards for speed here, but that's ok */
446dst = 0;
447for (unsigned bit = 0; bit < 32; bit++)
448   dst |= ((src0 >> bit) & 1) << (31 - bit);
449""")
450unop_convert("bit_count", tuint32, tuint, """
451dst = 0;
452for (unsigned bit = 0; bit < bit_size; bit++) {
453   if ((src0 >> bit) & 1)
454      dst++;
455}
456""")
457
458unop_convert("ufind_msb", tint32, tuint, """
459dst = -1;
460for (int bit = bit_size - 1; bit >= 0; bit--) {
461   if ((src0 >> bit) & 1) {
462      dst = bit;
463      break;
464   }
465}
466""")
467
468unop_convert("ufind_msb_rev", tint32, tuint, """
469dst = -1;
470for (int bit = 0; bit < bit_size; bit++) {
471   if ((src0 << bit) & 0x80000000) {
472      dst = bit;
473      break;
474   }
475}
476""")
477
478unop("uclz", tuint32, """
479int bit;
480for (bit = bit_size - 1; bit >= 0; bit--) {
481   if ((src0 & (1u << bit)) != 0)
482      break;
483}
484dst = (unsigned)(bit_size - bit - 1);
485""")
486
487unop("ifind_msb", tint32, """
488dst = -1;
489for (int bit = bit_size - 1; bit >= 0; bit--) {
490   /* If src0 < 0, we're looking for the first 0 bit.
491    * if src0 >= 0, we're looking for the first 1 bit.
492    */
493   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
494      (!((src0 >> bit) & 1) && (src0 < 0))) {
495      dst = bit;
496      break;
497   }
498}
499""")
500
501unop_convert("ifind_msb_rev", tint32, tint, """
502dst = -1;
503/* We are looking for the highest bit that's not the same as the sign bit. */
504uint32_t sign = src0 & 0x80000000u;
505for (int bit = 0; bit < 32; bit++) {
506   if (((src0 << bit) & 0x80000000u) != sign) {
507      dst = bit;
508      break;
509   }
510}
511""")
512
513unop_convert("find_lsb", tint32, tint, """
514dst = -1;
515for (unsigned bit = 0; bit < bit_size; bit++) {
516   if ((src0 >> bit) & 1) {
517      dst = bit;
518      break;
519   }
520}
521""")
522
523# AMD_gcn_shader extended instructions
524unop_horiz("cube_face_coord_amd", 2, tfloat32, 3, tfloat32, """
525dst.x = dst.y = 0.0;
526float absX = fabsf(src0.x);
527float absY = fabsf(src0.y);
528float absZ = fabsf(src0.z);
529
530float ma = 0.0;
531if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
532if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
533if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
534
535if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
536if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
537if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
538if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
539if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
540if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
541
542dst.x = dst.x * (1.0f / ma) + 0.5f;
543dst.y = dst.y * (1.0f / ma) + 0.5f;
544""")
545
546unop_horiz("cube_face_index_amd", 1, tfloat32, 3, tfloat32, """
547dst.x = 0.0;
548float absX = fabsf(src0.x);
549float absY = fabsf(src0.y);
550float absZ = fabsf(src0.z);
551if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
552if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
553if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
554if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
555if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
556if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
557""")
558
559# Sum of vector components
560unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
561
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a two-source opcode whose sources both have type in_type and
   whose destination has type out_type.

   All sizes are passed as 0, alg_props is used as the algebraic properties
   string and the opcode is not flagged as a conversion.
   """
   opcode(name=name,
          output_size=0,
          output_type=out_type,
          input_sizes=[0, 0],
          input_types=[in_type, in_type],
          is_conversion=False,
          algebraic_properties=alg_props,
          const_expr=const_expr)
565
def binop(name, ty, alg_props, const_expr):
   """Register a two-source opcode whose sources and destination all share
   the type ty."""
   binop_convert(name=name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
568
def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-source opcode with sources of type ty and a bool1
   destination."""
   binop_convert(name=name, out_type=tbool1, in_type=ty,
                 alg_props=alg_props, const_expr=const_expr)
571
def binop_compare8(name, ty, alg_props, const_expr):
   """Register a two-source opcode with sources of type ty and a bool8
   destination."""
   binop_convert(name=name, out_type=tbool8, in_type=ty,
                 alg_props=alg_props, const_expr=const_expr)
574
def binop_compare16(name, ty, alg_props, const_expr):
   """Register a two-source opcode with sources of type ty and a bool16
   destination."""
   binop_convert(name=name, out_type=tbool16, in_type=ty,
                 alg_props=alg_props, const_expr=const_expr)
577
def binop_compare32(name, ty, alg_props, const_expr):
   """Register a two-source opcode with sources of type ty and a bool32
   destination."""
   binop_convert(name=name, out_type=tbool32, in_type=ty,
                 alg_props=alg_props, const_expr=const_expr)
580
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
   """Register the bool1, bool8, bool16 and bool32 flavours of a comparison.

   The bool1 opcode keeps the plain name; the others get the suffixes "8",
   "16" and "32".  All four share the same source type, algebraic properties
   and constant expression.
   """
   variants = (
      ("", binop_compare),
      ("8", binop_compare8),
      ("16", binop_compare16),
      ("32", binop_compare32),
   )
   for suffix, register in variants:
      register(name + suffix, ty, alg_props, const_expr)
586
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-source opcode with explicitly given sizes and types for
   the destination and for each source.

   No algebraic properties are set and the opcode is not flagged as a
   conversion.
   """
   opcode(name=name,
          output_size=out_size,
          output_type=out_type,
          input_sizes=[src1_size, src2_size],
          input_types=[src1_type, src2_type],
          is_conversion=False,
          algebraic_properties="",
          const_expr=const_expr)
591
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix=""):
   """Register the 2-, 4-, 8-, 16-, 3- and 5-component variants of a
   two-source reduction, in that order.

   Each opcode is named name + <component count> + suffix, takes two sources
   of src_type with that many components and is marked 2src_commutative.
   The constant expression is assembled from format strings:

   - prereduce_expr is formatted with {src0} and {src1} set to the matching
     component of each source, and the result is wrapped in parentheses
   - reduce_expr is formatted with {src0} and {src1} to combine two terms
   - final_expr is formatted with {src} set to the parenthesized reduction
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   # One pre-reduced, parenthesized term per component letter.
   terms = []
   for letter in "xyzwefghijklmnop":
      per_component = prereduce_expr.format(src0="src0." + letter,
                                            src1="src1." + letter)
      terms.append("(" + per_component + ")")

   def fold(first, count):
      # Balanced reduction of terms[first:first + count]; the upper half is
      # placed on the left-hand side of the combine.
      if count == 1:
         return terms[first]
      half = count // 2
      return combine(fold(first + half, half), fold(first, half))

   def register(width, reduced):
      opcode(name + str(width) + suffix, output_size, output_type,
             [width, width], [src_type, src_type], False, _2src_commutative,
             finish(reduced))

   for width in (2, 4, 8, 16):
      register(width, fold(0, width))

   # The non-power-of-two widths are spelled out by hand.
   register(3, combine(combine(terms[2], terms[1]), terms[0]))
   register(5, combine(terms[4],
                       combine(combine(terms[3], terms[2]),
                               combine(terms[1], terms[0]))))
615
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
   """Register a two-source reduction once per boolean result size.

   The bool1 flavour keeps the given name.  For bool8, bool16 and bool32 the
   first character of name is replaced by "b8", "b16" and "b32" respectively.
   All remaining arguments are forwarded to binop_reduce unchanged.
   """
   tail = name[1:]
   flavours = (
      (name, tbool1),
      ("b8" + tail, tbool8),
      ("b16" + tail, tbool16),
      ("b32" + tail, tbool32),
   )
   for op_name, bool_type in flavours:
      binop_reduce(op_name, output_size, bool_type, src_type,
                   prereduce_expr, reduce_expr, final_expr)
626
627binop("fadd", tfloat, _2src_commutative + associative,"""
628if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
629   if (bit_size == 64)
630      dst = _mesa_double_add_rtz(src0, src1);
631   else
632      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
633} else {
634   dst = src0 + src1;
635}
636""")
637binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
638binop("iadd_sat", tint, _2src_commutative, """
639      src1 > 0 ?
640         (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
641         (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
642""")
643binop("uadd_sat", tuint, _2src_commutative,
644      "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
645binop("isub_sat", tint, "", """
646      src1 < 0 ?
647         (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
648         (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
649""")
650binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
651
652binop("fsub", tfloat, "", """
653if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
654   if (bit_size == 64)
655      dst = _mesa_double_sub_rtz(src0, src1);
656   else
657      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
658} else {
659   dst = src0 - src1;
660}
661""")
662binop("isub", tint, "", "src0 - src1")
663binop_convert("uabs_isub", tuint, tint, "", """
664              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
665                          : (uint64_t) src0 - (uint64_t) src1
666""")
667binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
668
669binop("fmul", tfloat, _2src_commutative + associative, """
670if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
671   if (bit_size == 64)
672      dst = _mesa_double_mul_rtz(src0, src1);
673   else
674      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
675} else {
676   dst = src0 * src1;
677}
678""")
679
680# Unlike fmul, anything (even infinity or NaN) multiplied by zero is always zero.
681# fmulz(0.0, inf) and fmulz(0.0, nan) must be +/-0.0, even if
682# SIGNED_ZERO_INF_NAN_PRESERVE is not used. If SIGNED_ZERO_INF_NAN_PRESERVE is used, then
683# the result must be a positive zero if either operand is zero.
684binop("fmulz", tfloat32, _2src_commutative + associative, """
685if (src0 == 0.0 || src1 == 0.0)
686   dst = 0.0;
687else if (nir_is_rounding_mode_rtz(execution_mode, 32))
688   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
689else
690   dst = src0 * src1;
691""")
692
693# low 32-bits of signed/unsigned integer multiply
694binop("imul", tint, _2src_commutative + associative, """
695   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
696   dst = (uint64_t)src0 * (uint64_t)src1;
697""")
698
699# Generate 64 bit result from 2 32 bits quantity
700binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
701              "(int64_t)src0 * (int64_t)src1")
702binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
703              "(uint64_t)src0 * (uint64_t)src1")
704
705# high 32-bits of signed integer multiply
706binop("imul_high", tint, _2src_commutative, """
707if (bit_size == 64) {
708   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
709    * extension to work properly.  The casts are kind-of annoying but needed
710    * to prevent compiler warnings.
711    */
712   uint32_t src0_u32[4] = {
713      src0,
714      (int64_t)src0 >> 32,
715      (int64_t)src0 >> 63,
716      (int64_t)src0 >> 63,
717   };
718   uint32_t src1_u32[4] = {
719      src1,
720      (int64_t)src1 >> 32,
721      (int64_t)src1 >> 63,
722      (int64_t)src1 >> 63,
723   };
724   uint32_t prod_u32[4];
725   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
726   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
727} else {
728   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
729    * potential overflow of signed multiply */
730   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
731}
732""")
733
734# high 32-bits of unsigned integer multiply
735binop("umul_high", tuint, _2src_commutative, """
736if (bit_size == 64) {
737   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
738   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
739   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
740   uint32_t prod_u32[4];
741   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
742   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
743} else {
744   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
745}
746""")
747
# unsigned multiply of the low halves (low 16 bits) of the two 32-bit sources
749binop("umul_low", tuint32, _2src_commutative, """
750uint64_t mask = (1 << (bit_size / 2)) - 1;
751dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
752""")
753
754# Multiply 32-bits with low 16-bits.
755binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
756binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")
757
758binop("fdiv", tfloat, "", "src0 / src1")
759binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
760binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
761
762# returns an integer (1 or 0) representing the carry resulting from the
763# addition of the two unsigned arguments.
764
765binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")
766
767# returns an integer (1 or 0) representing the borrow resulting from the
768# subtraction of the two unsigned arguments.
769
770binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
771
772# hadd: (a + b) >> 1 (without overflow)
773# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
774#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
775#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
776#       =     ((x & y) << 1) + (x ^ y)
777#
778# Since we know that the bottom bit of (x & y) << 1 is zero,
779#
780# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
781#              =   (x & y) +      ((x ^ y)  >> 1)
782binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
783binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
784
785# rhadd: (a + b + 1) >> 1 (without overflow)
786# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
787#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
788#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
789#           =     ((x | y) << 1) - (x ^ y) + 1
790#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)      >> 1)
795binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
796binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
797
798binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
799
800# For signed integers, there are several different possible definitions of
801# "modulus" or "remainder".  We follow the conventions used by LLVM and
802# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
803# operation while the imod opcode implements the more mathematical
804# "modulus" operation.  For details on the difference, see
805#
806# http://mathforum.org/library/drmath/view/52343.html
807
808binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
809binop("imod", tint, "",
810      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
811      "                 src0 % src1 : src0 % src1 + src1)")
812binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
813binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
814
815#
816# Comparisons
817#
818
819
820# these integer-aware comparisons return a boolean (0 or ~0)
821
822binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
823binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
824binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
825binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
826binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
827binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
828binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
829binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
830binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
831binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
832
833# integer-aware GLSL-style comparisons that compare floats and ints
834
835binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
836                       "{src0} && {src1}", "{src}")
837binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
838                       "{src0} || {src1}", "{src}")
839binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
840                       "{src0} && {src1}", "{src}")
841binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
842                       "{src0} || {src1}", "{src}")
843
844# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
845
846binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
847             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
848binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
849             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
850
851# These comparisons for integer-less hardware return 1.0 and 0.0 for true
852# and false respectively
853
854binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
855binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
856binop("seq", tfloat, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
857binop("sne", tfloat, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
858
859# SPIRV shifts are undefined for shift-operands >= bitsize,
860# but SM5 shifts are defined to use only the least significant bits.
861# The NIR definition is according to the SM5 specification.
862opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
863       "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))")
864opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
865       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
866opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
867       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
868
869opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
870   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
871   dst = (src0 << (src1 & rotate_mask)) |
872         (src0 >> (-src1 & rotate_mask));
873""")
874opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
875   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
876   dst = (src0 >> (src1 & rotate_mask)) |
877         (src0 << (-src1 & rotate_mask));
878""")
879
880# bitwise logic operators
881#
882# These are also used as boolean and, or, xor for hardware supporting
883# integers.
884
885
886binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
887binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
888binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
889
890
891binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
892             "{src}")
893
894binop_reduce("fdot", 0, tfloat, tfloat,
895             "{src0} * {src1}", "{src0} + {src1}", "{src}",
896             suffix="_replicated")
897
898opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
899       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
900opcode("fdph_replicated", 0, tfloat, [3, 4], [tfloat, tfloat], False, "",
901       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
902
903binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
904binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
905binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
906binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
907binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
908binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
909
# Use the double-precision pow() only for 64-bit operands; every smaller bit
# size goes through powf().  This is the same selection ldexp makes between
# ldexp() and ldexpf().
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
911
912binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
913            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
914
915binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
916              "src0 | ((uint64_t)src1 << 32)")
917
918binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
919              "src0 | ((uint32_t)src1 << 16)")
920
921opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
922       False, "",
923       "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")
924
925# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
926# and that of the "bfi1" i965 instruction. That is, the bits and offset values
927# are from the low five bits of src0 and src1, respectively.
928binop_convert("bfm", tuint32, tint32, "", """
929int bits = src0 & 0x1F;
930int offset = src1 & 0x1F;
931dst = ((1u << bits) - 1) << offset;
932""")
933
934opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
935dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
936/* flush denormals to zero. */
937if (!isnormal(dst))
938   dst = copysignf(0.0f, src0);
939""")
940
941# Combines the first component of each input to make a 2-component vector.
942
943binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
944dst.x = src0.x;
945dst.y = src1.x;
946""")
947
948# Byte extraction
949binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
950binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
951
952# Word extraction
953binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
954binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
955
956# Byte/word insertion
957binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
958binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")
959
960
def triop(name, ty, alg_props, const_expr):
   """Declare a three-source opcode whose result and sources all have type ty.

   The output size and all three input sizes handed to opcode() are 0, and
   False is passed for the argument that precedes alg_props.
   """
   src_sizes = [0] * 3
   src_types = [ty] * 3
   opcode(name, 0, ty, src_sizes, src_types, False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Declare a three-source tuint opcode with explicit per-source sizes.

   The result and every source are typed tuint; no algebraic properties are
   attached (an empty string is passed).
   """
   src_sizes = [src1_size, src2_size, src3_size]
   src_types = [tuint] * 3
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
967
968triop("ffma", tfloat, _2src_commutative, """
969if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
970   if (bit_size == 64)
971      dst = _mesa_double_fma_rtz(src0, src1, src2);
972   else if (bit_size == 32)
973      dst = _mesa_float_fma_rtz(src0, src1, src2);
974   else
975      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
976} else {
977   if (bit_size == 32)
978      dst = fmaf(src0, src1, src2);
979   else
980      dst = fma(src0, src1, src2);
981}
982""")
983
984# Unlike ffma, anything (even infinity or NaN) multiplied by zero is always zero.
985# ffmaz(0.0, inf, src2) and ffmaz(0.0, nan, src2) must be +/-0.0 + src2, even if
986# SIGNED_ZERO_INF_NAN_PRESERVE is not used. If SIGNED_ZERO_INF_NAN_PRESERVE is used, then
987# the result must be a positive zero plus src2 if either src0 or src1 is zero.
988triop("ffmaz", tfloat32, _2src_commutative, """
989if (src0 == 0.0 || src1 == 0.0)
990   dst = 0.0 + src2;
991else if (nir_is_rounding_mode_rtz(execution_mode, 32))
992   dst = _mesa_float_fma_rtz(src0, src1, src2);
993else
994   dst = fmaf(src0, src1, src2);
995""")
996
997triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
998
999# Ternary addition
1000triop("iadd3", tint, _2src_commutative + associative, "src0 + src1 + src2")
1001
1002# Conditional Select
1003#
1004# A vector conditional select instruction (like ?:, but operating per-
1005# component on vectors). There are two versions, one for floating point
1006# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
1007
1008triop("fcsel", tfloat32, selection, "(src0 != 0.0f) ? src1 : src2")
1009
1010opcode("bcsel", 0, tuint, [0, 0, 0],
1011       [tbool1, tuint, tuint], False, selection, "src0 ? src1 : src2")
1012opcode("b8csel", 0, tuint, [0, 0, 0],
1013       [tbool8, tuint, tuint], False, selection, "src0 ? src1 : src2")
1014opcode("b16csel", 0, tuint, [0, 0, 0],
1015       [tbool16, tuint, tuint], False, selection, "src0 ? src1 : src2")
1016opcode("b32csel", 0, tuint, [0, 0, 0],
1017       [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2")
1018
1019triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2")
1020triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2")
1021
1022triop("fcsel_gt", tfloat32, selection, "(src0 > 0.0f) ? src1 : src2")
1023triop("fcsel_ge", tfloat32, selection, "(src0 >= 0.0f) ? src1 : src2")
1024
1025# SM5 bfi assembly
1026triop("bfi", tuint32, "", """
1027unsigned mask = src0, insert = src1, base = src2;
1028if (mask == 0) {
1029   dst = base;
1030} else {
1031   unsigned tmp = mask;
1032   while (!(tmp & 1)) {
1033      tmp >>= 1;
1034      insert <<= 1;
1035   }
1036   dst = (base & ~mask) | (insert & mask);
1037}
1038""")
1039
1040
1041triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
1042
1043# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
1044opcode("ubfe", 0, tuint32,
1045       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
1046unsigned base = src0;
1047unsigned offset = src1 & 0x1F;
1048unsigned bits = src2 & 0x1F;
1049if (bits == 0) {
1050   dst = 0;
1051} else if (offset + bits < 32) {
1052   dst = (base << (32 - bits - offset)) >> (32 - bits);
1053} else {
1054   dst = base >> offset;
1055}
1056""")
1057opcode("ibfe", 0, tint32,
1058       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1059int base = src0;
1060unsigned offset = src1 & 0x1F;
1061unsigned bits = src2 & 0x1F;
1062if (bits == 0) {
1063   dst = 0;
1064} else if (offset + bits < 32) {
1065   dst = (base << (32 - bits - offset)) >> (32 - bits);
1066} else {
1067   dst = base >> offset;
1068}
1069""")
1070
1071# GLSL bitfieldExtract()
1072opcode("ubitfield_extract", 0, tuint32,
1073       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
1074unsigned base = src0;
1075int offset = src1, bits = src2;
1076if (bits == 0) {
1077   dst = 0;
1078} else if (bits < 0 || offset < 0 || offset + bits > 32) {
1079   dst = 0; /* undefined per the spec */
1080} else {
1081   dst = (base >> offset) & ((1ull << bits) - 1);
1082}
1083""")
1084opcode("ibitfield_extract", 0, tint32,
1085       [0, 0, 0], [tint32, tint32, tint32], False, "", """
1086int base = src0;
1087int offset = src1, bits = src2;
1088if (bits == 0) {
1089   dst = 0;
1090} else if (offset < 0 || bits < 0 || offset + bits > 32) {
1091   dst = 0;
1092} else {
1093   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
1094}
1095""")
1096
1097# Sum of absolute differences with accumulation.
1098# (Equivalent to AMD's v_sad_u8 instruction.)
1099# The first two sources contain packed 8-bit unsigned integers, the instruction
1100# will calculate the absolute difference of these, and then add them together.
1101# There is also a third source which is a 32-bit unsigned integer and added to the result.
1102triop_horiz("sad_u8x4", 1, 1, 1, 1, """
1103uint8_t s0_b0 = (src0.x & 0x000000ff) >> 0;
1104uint8_t s0_b1 = (src0.x & 0x0000ff00) >> 8;
1105uint8_t s0_b2 = (src0.x & 0x00ff0000) >> 16;
1106uint8_t s0_b3 = (src0.x & 0xff000000) >> 24;
1107
1108uint8_t s1_b0 = (src1.x & 0x000000ff) >> 0;
1109uint8_t s1_b1 = (src1.x & 0x0000ff00) >> 8;
1110uint8_t s1_b2 = (src1.x & 0x00ff0000) >> 16;
1111uint8_t s1_b3 = (src1.x & 0xff000000) >> 24;
1112
1113dst.x = src2.x +
1114        (s0_b0 > s1_b0 ? (s0_b0 - s1_b0) : (s1_b0 - s0_b0)) +
1115        (s0_b1 > s1_b1 ? (s0_b1 - s1_b1) : (s1_b1 - s0_b1)) +
1116        (s0_b2 > s1_b2 ? (s0_b2 - s1_b2) : (s1_b2 - s0_b2)) +
1117        (s0_b3 > s1_b3 ? (s0_b3 - s1_b3) : (s1_b3 - s0_b3));
1118""")
1119
1120# Combines the first component of each input to make a 3-component vector.
1121
1122triop_horiz("vec3", 3, 1, 1, 1, """
1123dst.x = src0.x;
1124dst.y = src1.x;
1125dst.z = src2.x;
1126""")
1127
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit per-source sizes.

   The result and every source are typed tuint; no algebraic properties are
   attached (an empty string is passed).
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * 4
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
1134
1135opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
1136       [tuint32, tuint32, tint32, tint32], False, "", """
1137unsigned base = src0, insert = src1;
1138int offset = src2, bits = src3;
1139if (bits == 0) {
1140   dst = base;
1141} else if (offset < 0 || bits < 0 || bits + offset > 32) {
1142   dst = 0;
1143} else {
1144   unsigned mask = ((1ull << bits) - 1) << offset;
1145   dst = (base & ~mask) | ((insert << offset) & mask);
1146}
1147""")
1148
1149quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1150dst.x = src0.x;
1151dst.y = src1.x;
1152dst.z = src2.x;
1153dst.w = src3.x;
1154""")
1155
1156opcode("vec5", 5, tuint,
1157       [1] * 5, [tuint] * 5,
1158       False, "", """
1159dst.x = src0.x;
1160dst.y = src1.x;
1161dst.z = src2.x;
1162dst.w = src3.x;
1163dst.e = src4.x;
1164""")
1165
1166opcode("vec8", 8, tuint,
1167       [1] * 8, [tuint] * 8,
1168       False, "", """
1169dst.x = src0.x;
1170dst.y = src1.x;
1171dst.z = src2.x;
1172dst.w = src3.x;
1173dst.e = src4.x;
1174dst.f = src5.x;
1175dst.g = src6.x;
1176dst.h = src7.x;
1177""")
1178
1179opcode("vec16", 16, tuint,
1180       [1] * 16, [tuint] * 16,
1181       False, "", """
1182dst.x = src0.x;
1183dst.y = src1.x;
1184dst.z = src2.x;
1185dst.w = src3.x;
1186dst.e = src4.x;
1187dst.f = src5.x;
1188dst.g = src6.x;
1189dst.h = src7.x;
1190dst.i = src8.x;
1191dst.j = src9.x;
1192dst.k = src10.x;
1193dst.l = src11.x;
1194dst.m = src12.x;
1195dst.n = src13.x;
1196dst.o = src14.x;
1197dst.p = src15.x;
1198""")
1199
1200# An integer multiply instruction for address calculation.  This is
1201# similar to imul, except that the results are undefined in case of
1202# overflow.  Overflow is defined according to the size of the variable
1203# being dereferenced.
1204#
1205# This relaxed definition, compared to imul, allows an optimization
1206# pass to propagate bounds (ie, from an load/store intrinsic) to the
1207# sources, such that lower precision integer multiplies can be used.
1208# This is useful on hw that has 24b or perhaps 16b integer multiply
1209# instructions.
1210binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1211
1212# ir3-specific instruction that maps directly to mul-add shift high mix,
1213# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on Freedreno backend.
1215opcode("imadsh_mix16", 0, tint32,
1216       [0, 0, 0], [tint32, tint32, tint32], False, "", """
1217dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
1218""")
1219
1220# ir3-specific instruction that maps directly to ir3 mad.s24.
1221#
1222# 24b multiply into 32b result (with sign extension) plus 32b int
1223triop("imad24_ir3", tint32, _2src_commutative,
1224      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
1225
1226# r600-specific instruction that evaluates unnormalized cube texture coordinates
1227# and face index
1228# The actual texture coordinates are evaluated from this according to
1229#    dst.yx / abs(dst.z) + 1.5
1230unop_horiz("cube_r600", 4, tfloat32, 3, tfloat32, """
1231   dst.x = dst.y = dst.z = 0.0;
1232   float absX = fabsf(src0.x);
1233   float absY = fabsf(src0.y);
1234   float absZ = fabsf(src0.z);
1235
1236   if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
1237   if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
1238   if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }
1239
1240   if (src0.x >= 0 && absX >= absY && absX >= absZ) {
1241      dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
1242   }
1243   if (src0.x < 0 && absX >= absY && absX >= absZ) {
1244      dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
1245   }
1246   if (src0.y >= 0 && absY >= absX && absY >= absZ) {
1247      dst.y = src0.x; dst.x = src0.z; dst.w = 2;
1248   }
1249   if (src0.y < 0 && absY >= absX && absY >= absZ) {
1250      dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
1251   }
1252   if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
1253      dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
1254   }
1255   if (src0.z < 0 && absZ >= absX && absZ >= absY) {
1256      dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
1257   }
1258""")
1259
1260# r600/gcn specific sin and cos
# these trigonometric functions need some lowering because the supported
1262# input values are expected to be normalized by dividing by (2 * pi)
1263unop("fsin_amd", tfloat, "sinf(6.2831853 * src0)")
1264unop("fcos_amd", tfloat, "cosf(6.2831853 * src0)")
1265
1266# AGX specific sin with input expressed in quadrants. Used in the lowering for
1267# fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
1268# the angle is further decomposed by quadrant, sinc is computed, and the angle
1269# is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
1270# additional ALU that NIR may be able to optimize.
1271unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")
1272
1273# 24b multiply into 32b result (with sign extension)
1274binop("imul24", tint32, _2src_commutative + associative,
1275      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")
1276
1277# unsigned 24b multiply into 32b result plus 32b int
1278triop("umad24", tuint32, _2src_commutative,
1279      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")
1280
# unsigned 24b multiply into 32b result uint
#
# Declared as tuint32 so that the opcode's type agrees with its unsigned
# arithmetic and with the other unsigned 24-bit opcodes (umad24,
# umul24_relaxed).
binop("umul24", tuint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")
1284
1285# relaxed versions of the above, which assume input is in the 24bit range (no clamping)
1286binop("imul24_relaxed", tint32, _2src_commutative + associative, "src0 * src1")
1287triop("umad24_relaxed", tuint32, _2src_commutative, "src0 * src1 + src2")
1288binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")
1289
1290unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
1291unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
1292unop_convert("fisfinite32", tbool32, tfloat, "isfinite(src0)")
1293
1294# vc4-specific opcodes
1295
1296# Saturated vector add for 4 8bit ints.
1297binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
1298dst = 0;
1299for (int i = 0; i < 32; i += 8) {
1300   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
1301}
1302""")
1303
1304# Saturated vector subtract for 4 8bit ints.
1305binop("ussub_4x8_vc4", tint32, "", """
1306dst = 0;
1307for (int i = 0; i < 32; i += 8) {
1308   int src0_chan = (src0 >> i) & 0xff;
1309   int src1_chan = (src1 >> i) & 0xff;
1310   if (src0_chan > src1_chan)
1311      dst |= (src0_chan - src1_chan) << i;
1312}
1313""")
1314
1315# vector min for 4 8bit ints.
1316binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
1317dst = 0;
1318for (int i = 0; i < 32; i += 8) {
1319   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
1320}
1321""")
1322
1323# vector max for 4 8bit ints.
1324binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
1325dst = 0;
1326for (int i = 0; i < 32; i += 8) {
1327   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
1328}
1329""")
1330
1331# unorm multiply: (a * b) / 255.
1332binop("umul_unorm_4x8_vc4", tint32, _2src_commutative + associative, """
1333dst = 0;
1334for (int i = 0; i < 32; i += 8) {
1335   int src0_chan = (src0 >> i) & 0xff;
1336   int src1_chan = (src1 >> i) & 0xff;
1337   dst |= ((src0_chan * src1_chan) / 255) << i;
1338}
1339""")
1340
1341# Mali-specific opcodes
1342unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
1343unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))
1344
1345# Magnitude equal to fddx/y, sign undefined. Derivative of a constant is zero.
1346unop("fddx_must_abs_mali", tfloat, "0.0")
1347unop("fddy_must_abs_mali", tfloat, "0.0")
1348
1349# DXIL specific double [un]pack
1350# DXIL doesn't support generic [un]pack instructions, so we want those
1351# lowered to bit ops. HLSL doesn't support 64bit bitcasts to/from
1352# double, only [un]pack. Technically DXIL does, but considering they
1353# can't be generated from HLSL, we want to match what would be coming from DXC.
1354# This is essentially just the standard [un]pack, except that it doesn't get
1355# lowered so we can handle it in the backend and turn it into MakeDouble/SplitDouble
1356unop_horiz("pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
1357           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
1358unop_horiz("unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
1359           "dst.x = src0.x; dst.y = src0.x >> 32;")
1360
1361# src0 and src1 are i8vec4 packed in an int32, and src2 is an int32.  The int8
1362# components are sign-extended to 32-bits, and a dot-product is performed on
1363# the resulting vectors.  src2 is added to the result of the dot-product.
1364opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1365       False, _2src_commutative, """
1366   const int32_t v0x = (int8_t)(src0      );
1367   const int32_t v0y = (int8_t)(src0 >>  8);
1368   const int32_t v0z = (int8_t)(src0 >> 16);
1369   const int32_t v0w = (int8_t)(src0 >> 24);
1370   const int32_t v1x = (int8_t)(src1      );
1371   const int32_t v1y = (int8_t)(src1 >>  8);
1372   const int32_t v1z = (int8_t)(src1 >> 16);
1373   const int32_t v1w = (int8_t)(src1 >> 24);
1374
1375   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1376""")
1377
1378# Like sdot_4x8_iadd, but unsigned.
1379opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
1380       False, _2src_commutative, """
1381   const uint32_t v0x = (uint8_t)(src0      );
1382   const uint32_t v0y = (uint8_t)(src0 >>  8);
1383   const uint32_t v0z = (uint8_t)(src0 >> 16);
1384   const uint32_t v0w = (uint8_t)(src0 >> 24);
1385   const uint32_t v1x = (uint8_t)(src1      );
1386   const uint32_t v1y = (uint8_t)(src1 >>  8);
1387   const uint32_t v1z = (uint8_t)(src1 >> 16);
1388   const uint32_t v1w = (uint8_t)(src1 >> 24);
1389
1390   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1391""")
1392
1393# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and
1394# src2 is an int32.  The 8-bit components are extended to 32-bits, and a
1395# dot-product is performed on the resulting vectors.  src2 is added to the
1396# result of the dot-product.
1397#
1398# NOTE: Unlike many of the other dp4a opcodes, this mixed signs of source 0
1399# and source 1 mean that this opcode is not 2-source commutative
1400opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1401       False, "", """
1402   const int32_t v0x = (int8_t)(src0      );
1403   const int32_t v0y = (int8_t)(src0 >>  8);
1404   const int32_t v0z = (int8_t)(src0 >> 16);
1405   const int32_t v0w = (int8_t)(src0 >> 24);
1406   const uint32_t v1x = (uint8_t)(src1      );
1407   const uint32_t v1y = (uint8_t)(src1 >>  8);
1408   const uint32_t v1z = (uint8_t)(src1 >> 16);
1409   const uint32_t v1w = (uint8_t)(src1 >> 24);
1410
1411   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1412""")
1413
# Like sdot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
1415opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1416       False, _2src_commutative, """
1417   const int64_t v0x = (int8_t)(src0      );
1418   const int64_t v0y = (int8_t)(src0 >>  8);
1419   const int64_t v0z = (int8_t)(src0 >> 16);
1420   const int64_t v0w = (int8_t)(src0 >> 24);
1421   const int64_t v1x = (int8_t)(src1      );
1422   const int64_t v1y = (int8_t)(src1 >>  8);
1423   const int64_t v1z = (int8_t)(src1 >> 16);
1424   const int64_t v1w = (int8_t)(src1 >> 24);
1425
1426   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1427
1428   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
1429""")
1430
# Like udot_4x8_uadd, but the result is clamped to the range [0, 0xffffffff].
opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint8_t)(src0      );
   const uint64_t v0y = (uint8_t)(src0 >>  8);
   const uint64_t v0z = (uint8_t)(src0 >> 16);
   const uint64_t v0w = (uint8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   /* src2 has a signed NIR type (tint32) but is an unsigned addend here.
    * Reinterpret it as uint32_t before it is widened to 64 bits; otherwise a
    * value with bit 31 set would be sign-extended into the upper half and
    * make the sum saturate spuriously. */
   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + (uint32_t)src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
1447
# Like sudot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
1449#
1450# NOTE: Unlike many of the other dp4a opcodes, this mixed signs of source 0
1451# and source 1 mean that this opcode is not 2-source commutative
1452opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1453       False, "", """
1454   const int64_t v0x = (int8_t)(src0      );
1455   const int64_t v0y = (int8_t)(src0 >>  8);
1456   const int64_t v0z = (int8_t)(src0 >> 16);
1457   const int64_t v0w = (int8_t)(src0 >> 24);
1458   const uint64_t v1x = (uint8_t)(src1      );
1459   const uint64_t v1y = (uint8_t)(src1 >>  8);
1460   const uint64_t v1z = (uint8_t)(src1 >> 16);
1461   const uint64_t v1w = (uint8_t)(src1 >> 24);
1462
1463   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1464
1465   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
1466""")
1467
1468# src0 and src1 are i16vec2 packed in an int32, and src2 is an int32.  The int16
1469# components are sign-extended to 32-bits, and a dot-product is performed on
1470# the resulting vectors.  src2 is added to the result of the dot-product.
1471opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1472       False, _2src_commutative, """
1473   const int32_t v0x = (int16_t)(src0      );
1474   const int32_t v0y = (int16_t)(src0 >> 16);
1475   const int32_t v1x = (int16_t)(src1      );
1476   const int32_t v1y = (int16_t)(src1 >> 16);
1477
1478   dst = (v0x * v1x) + (v0y * v1y) + src2;
1479""")
1480
1481# Like sdot_2x16_iadd, but unsigned.
1482opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
1483       False, _2src_commutative, """
1484   const uint32_t v0x = (uint16_t)(src0      );
1485   const uint32_t v0y = (uint16_t)(src0 >> 16);
1486   const uint32_t v1x = (uint16_t)(src1      );
1487   const uint32_t v1y = (uint16_t)(src1 >> 16);
1488
1489   dst = (v0x * v1x) + (v0y * v1y) + src2;
1490""")
1491
# Like sdot_2x16_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
1493opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1494       False, _2src_commutative, """
1495   const int64_t v0x = (int16_t)(src0      );
1496   const int64_t v0y = (int16_t)(src0 >> 16);
1497   const int64_t v1x = (int16_t)(src1      );
1498   const int64_t v1y = (int16_t)(src1 >> 16);
1499
1500   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;
1501
1502   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
1503""")
1504
# Like udot_2x16_uadd, but the result is clamped to the range [0, 0xffffffff].
opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint16_t)(src0      );
   const uint64_t v0y = (uint16_t)(src0 >> 16);
   const uint64_t v1x = (uint16_t)(src1      );
   const uint64_t v1y = (uint16_t)(src1 >> 16);

   /* src2 has a signed NIR type (tint32) but is an unsigned addend here.
    * Reinterpret it as uint32_t before it is widened to 64 bits; otherwise a
    * value with bit 31 set would be sign-extended into the upper half and
    * make the sum saturate spuriously. */
   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (uint32_t)src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")
1517