• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #
2 # Copyright (C) 2014 Connor Abbott
3 #
4 # Permission is hereby granted, free of charge, to any person obtaining a
5 # copy of this software and associated documentation files (the "Software"),
6 # to deal in the Software without restriction, including without limitation
7 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 # and/or sell copies of the Software, and to permit persons to whom the
9 # Software is furnished to do so, subject to the following conditions:
10 #
11 # The above copyright notice and this permission notice (including the next
12 # paragraph) shall be included in all copies or substantial portions of the
13 # Software.
14 #
15 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 # IN THE SOFTWARE.
22 #
23 # Authors:
24 #    Connor Abbott (cwabbott0@gmail.com)
25 
26 import re
27 
28 # Class that represents all the information we have about the opcode
29 # NOTE: this must be kept in sync with nir_op_info
30 
31 class Opcode(object):
32    """Class that represents all the information we have about the opcode
33    NOTE: this must be kept in sync with nir_op_info
34    """
35    def __init__(self, name, output_size, output_type, input_sizes,
36                 input_types, is_conversion, algebraic_properties, const_expr):
37       """Parameters:
38 
39       - name is the name of the opcode (prepend nir_op_ for the enum name)
40       - all types are strings that get nir_type_ prepended to them
41       - input_types is a list of types
42       - is_conversion is true if this opcode represents a type conversion
43       - algebraic_properties is a space-seperated string, where nir_op_is_ is
44         prepended before each entry
45       - const_expr is an expression or series of statements that computes the
46         constant value of the opcode given the constant values of its inputs.
47 
48       Constant expressions are formed from the variables src0, src1, ...,
49       src(N-1), where N is the number of arguments.  The output of the
50       expression should be stored in the dst variable.  Per-component input
51       and output variables will be scalars and non-per-component input and
52       output variables will be a struct with fields named x, y, z, and w
53       all of the correct type.  Input and output variables can be assumed
54       to already be of the correct type and need no conversion.  In
55       particular, the conversion from the C bool type to/from  NIR_TRUE and
56       NIR_FALSE happens automatically.
57 
58       For per-component instructions, the entire expression will be
59       executed once for each component.  For non-per-component
60       instructions, the expression is expected to store the correct values
61       in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
62       constant expression, an assignment to dst will happen automatically
63       and the result will be equivalent to "dst = <expression>" for
64       per-component instructions and "dst.x = dst.y = ... = <expression>"
65       for non-per-component instructions.
66       """
67       assert isinstance(name, str)
68       assert isinstance(output_size, int)
69       assert isinstance(output_type, str)
70       assert isinstance(input_sizes, list)
71       assert isinstance(input_sizes[0], int)
72       assert isinstance(input_types, list)
73       assert isinstance(input_types[0], str)
74       assert isinstance(is_conversion, bool)
75       assert isinstance(algebraic_properties, str)
76       assert isinstance(const_expr, str)
77       assert len(input_sizes) == len(input_types)
78       assert 0 <= output_size <= 5 or (output_size == 8) or (output_size == 16)
79       for size in input_sizes:
80          assert 0 <= size <= 5 or (size == 8) or (size == 16)
81          if output_size == 0:
82             assert size == 0
83          if output_size != 0:
84             assert size != 0
85       self.name = name
86       self.num_inputs = len(input_sizes)
87       self.output_size = output_size
88       self.output_type = output_type
89       self.input_sizes = input_sizes
90       self.input_types = input_types
91       self.is_conversion = is_conversion
92       self.algebraic_properties = algebraic_properties
93       self.const_expr = const_expr
94 
95 # helper variables for strings
96 tfloat = "float"
97 tint = "int"
98 tbool = "bool"
99 tbool1 = "bool1"
100 tbool8 = "bool8"
101 tbool16 = "bool16"
102 tbool32 = "bool32"
103 tuint = "uint"
104 tuint8 = "uint8"
105 tint16 = "int16"
106 tuint16 = "uint16"
107 tfloat16 = "float16"
108 tfloat32 = "float32"
109 tint32 = "int32"
110 tuint32 = "uint32"
111 tint64 = "int64"
112 tuint64 = "uint64"
113 tfloat64 = "float64"
114 
115 _TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
116 
117 def type_has_size(type_):
118     m = _TYPE_SPLIT_RE.match(type_)
119     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
120     return m.group('bits') is not None
121 
122 def type_size(type_):
123     m = _TYPE_SPLIT_RE.match(type_)
124     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
125     assert m.group('bits') is not None, \
126            'NIR type string has no bit size: "{}"'.format(type_)
127     return int(m.group('bits'))
128 
129 def type_sizes(type_):
130     if type_has_size(type_):
131         return [type_size(type_)]
132     elif type_ == 'bool':
133         return [1, 8, 16, 32]
134     elif type_ == 'float':
135         return [16, 32, 64]
136     else:
137         return [1, 8, 16, 32, 64]
138 
139 def type_base_type(type_):
140     m = _TYPE_SPLIT_RE.match(type_)
141     assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
142     return m.group('type')
143 
144 # Operation where the first two sources are commutative.
145 #
146 # For 2-source operations, this just mathematical commutativity.  Some
147 # 3-source operations, like ffma, are only commutative in the first two
148 # sources.
149 _2src_commutative = "2src_commutative "
150 associative = "associative "
151 
152 # global dictionary of opcodes
153 opcodes = {}
154 
155 def opcode(name, output_size, output_type, input_sizes, input_types,
156            is_conversion, algebraic_properties, const_expr):
157    assert name not in opcodes
158    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
159                           input_types, is_conversion, algebraic_properties,
160                           const_expr)
161 
162 def unop_convert(name, out_type, in_type, const_expr):
163    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)
164 
165 def unop(name, ty, const_expr):
166    opcode(name, 0, ty, [0], [ty], False, "", const_expr)
167 
168 def unop_horiz(name, output_size, output_type, input_size, input_type,
169                const_expr):
170    opcode(name, output_size, output_type, [input_size], [input_type],
171           False, "", const_expr)
172 
173 def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
174                 reduce_expr, final_expr):
175    def prereduce(src):
176       return "(" + prereduce_expr.format(src=src) + ")"
177    def final(src):
178       return final_expr.format(src="(" + src + ")")
179    def reduce_(src0, src1):
180       return reduce_expr.format(src0=src0, src1=src1)
181    src0 = prereduce("src0.x")
182    src1 = prereduce("src0.y")
183    src2 = prereduce("src0.z")
184    src3 = prereduce("src0.w")
185    unop_horiz(name + "2", output_size, output_type, 2, input_type,
186               final(reduce_(src0, src1)))
187    unop_horiz(name + "3", output_size, output_type, 3, input_type,
188               final(reduce_(reduce_(src0, src1), src2)))
189    unop_horiz(name + "4", output_size, output_type, 4, input_type,
190               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
191 
192 def unop_numeric_convert(name, out_type, in_type, const_expr):
193    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)
194 
195 unop("mov", tuint, "src0")
196 
197 unop("ineg", tint, "-src0")
198 unop("fneg", tfloat, "-src0")
199 unop("inot", tint, "~src0") # invert every bit of the integer
200 
201 # nir_op_fsign roughly implements the OpenGL / Vulkan rules for sign(float).
202 # The GLSL.std.450 FSign instruction is defined as:
203 #
204 #    Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.
205 #
206 # If the source is equal to zero, there is a preference for the result to have
207 # the same sign, but this is not required (it is required by OpenCL).  If the
208 # source is not a number, there is a preference for the result to be +0.0, but
209 # this is not required (it is required by OpenCL).  If the source is not a
210 # number, and the result is not +0.0, the result should definitely **not** be
211 # NaN.
212 #
213 # The values returned for constant folding match the behavior required by
214 # OpenCL.
215 unop("fsign", tfloat, ("bit_size == 64 ? " +
216                        "(isnan(src0) ? 0.0  : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0  : -1.0 )) : " +
217                        "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"))
218 unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
219 unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
220 unop("fabs", tfloat, "fabs(src0)")
221 unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
222 unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
223 unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
224 unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
225 unop("fexp2", tfloat, "exp2f(src0)")
226 unop("flog2", tfloat, "log2f(src0)")
227 
228 # Generate all of the numeric conversion opcodes
229 for src_t in [tint, tuint, tfloat, tbool]:
230    if src_t == tbool:
231       dst_types = [tfloat, tint, tbool]
232    elif src_t == tint:
233       dst_types = [tfloat, tint, tbool]
234    elif src_t == tuint:
235       dst_types = [tfloat, tuint]
236    elif src_t == tfloat:
237       dst_types = [tint, tuint, tfloat, tbool]
238 
239    for dst_t in dst_types:
240       for dst_bit_size in type_sizes(dst_t):
241           if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
242               rnd_modes = ['_rtne', '_rtz', '']
243               for rnd_mode in rnd_modes:
244                   if rnd_mode == '_rtne':
245                       conv_expr = """
246                       if (bit_size > 16) {
247                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
248                       } else {
249                          dst = src0;
250                       }
251                       """
252                   elif rnd_mode == '_rtz':
253                       conv_expr = """
254                       if (bit_size > 16) {
255                          dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
256                       } else {
257                          dst = src0;
258                       }
259                       """
260                   else:
261                       conv_expr = "src0"
262 
263                   unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
264                                                               dst_t[0],
265                                                               dst_bit_size,
266                                                               rnd_mode),
267                                        dst_t + str(dst_bit_size),
268                                        src_t, conv_expr)
269           elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
270               conv_expr = """
271               if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
272                  dst = _mesa_double_to_float_rtz(src0);
273               } else {
274                  dst = src0;
275               }
276               """
277               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
278                                                        dst_bit_size),
279                                    dst_t + str(dst_bit_size), src_t, conv_expr)
280           else:
281               conv_expr = "src0 != 0" if dst_t == tbool else "src0"
282               unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
283                                                        dst_bit_size),
284                                    dst_t + str(dst_bit_size), src_t, conv_expr)
285 
286 # Special opcode that is the same as f2f16, i2i16, u2u16 except that it is safe
287 # to remove it if the result is immediately converted back to 32 bits again.
288 # This is generated as part of the precision lowering pass. mp stands for medium
289 # precision.
290 unop_numeric_convert("f2fmp", tfloat16, tfloat32, opcodes["f2f16"].const_expr)
291 unop_numeric_convert("i2imp", tint16, tint32, opcodes["i2i16"].const_expr)
292 # u2ump isn't defined, because the behavior is equal to i2imp
293 unop_numeric_convert("f2imp", tint16, tfloat32, opcodes["f2i16"].const_expr)
294 unop_numeric_convert("f2ump", tuint16, tfloat32, opcodes["f2u16"].const_expr)
295 unop_numeric_convert("i2fmp", tfloat16, tint32, opcodes["i2f16"].const_expr)
296 unop_numeric_convert("u2fmp", tfloat16, tuint32, opcodes["u2f16"].const_expr)
297 
298 # Unary floating-point rounding operations.
299 
300 
301 unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
302 unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
303 unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
304 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
305 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
306 
307 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
308 
309 # Trigonometric operations.
310 
311 
312 unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
313 unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
314 
315 # dfrexp
316 unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
317 unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")
318 
319 # Partial derivatives.
320 
321 
322 unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
323 unop("fddy", tfloat, "0.0")
324 unop("fddx_fine", tfloat, "0.0")
325 unop("fddy_fine", tfloat, "0.0")
326 unop("fddx_coarse", tfloat, "0.0")
327 unop("fddy_coarse", tfloat, "0.0")
328 
329 
330 # Floating point pack and unpack operations.
331 
332 def pack_2x16(fmt):
333    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
334 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
335 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
336 """.replace("fmt", fmt))
337 
338 def pack_4x8(fmt):
339    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
340 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
341 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
342 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
343 dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
344 """.replace("fmt", fmt))
345 
346 def unpack_2x16(fmt):
347    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
348 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
349 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
350 """.replace("fmt", fmt))
351 
352 def unpack_4x8(fmt):
353    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
354 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
355 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
356 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
357 dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
358 """.replace("fmt", fmt))
359 
360 
361 pack_2x16("snorm")
362 pack_4x8("snorm")
363 pack_2x16("unorm")
364 pack_4x8("unorm")
365 pack_2x16("half")
366 unpack_2x16("snorm")
367 unpack_4x8("snorm")
368 unpack_2x16("unorm")
369 unpack_4x8("unorm")
370 unpack_2x16("half")
371 
372 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
373 dst.x = (src0.x & 0xffff) | (src0.y << 16);
374 """)
375 
376 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
377 dst.x = (src0.x <<  0) |
378         (src0.y <<  8) |
379         (src0.z << 16) |
380         (src0.w << 24);
381 """)
382 
383 unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
384            "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")
385 
386 unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
387            "dst.x = src0.x | ((uint32_t)src0.y << 16);")
388 
389 unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
390            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
391 
392 unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
393            "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")
394 
395 unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
396            "dst.x = src0.x; dst.y = src0.x >> 32;")
397 
398 unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
399            "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.w >> 48;")
400 
401 unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
402            "dst.x = src0.x; dst.y = src0.x >> 16;")
403 
404 unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
405            "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
406 
407 unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
408 dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
409 dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x << 16));
410 """)
411 
412 # Lowered floating point unpacking operations.
413 
414 unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
415              "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
416 unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
417              "unpack_half_1x16((uint16_t)(src0 >> 16))")
418 
419 unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
420              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
421 unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
422              "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")
423 
424 unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
425 unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
426 
427 unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
428 unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
429 
430 # Bit operations, part of ARB_gpu_shader5.
431 
432 
433 unop("bitfield_reverse", tuint32, """
434 /* we're not winning any awards for speed here, but that's ok */
435 dst = 0;
436 for (unsigned bit = 0; bit < 32; bit++)
437    dst |= ((src0 >> bit) & 1) << (31 - bit);
438 """)
439 unop_convert("bit_count", tuint32, tuint, """
440 dst = 0;
441 for (unsigned bit = 0; bit < bit_size; bit++) {
442    if ((src0 >> bit) & 1)
443       dst++;
444 }
445 """)
446 
447 unop_convert("ufind_msb", tint32, tuint, """
448 dst = -1;
449 for (int bit = bit_size - 1; bit >= 0; bit--) {
450    if ((src0 >> bit) & 1) {
451       dst = bit;
452       break;
453    }
454 }
455 """)
456 
457 unop_convert("ufind_msb_rev", tint32, tuint, """
458 dst = -1;
459 for (int bit = 0; bit < bit_size; bit++) {
460    if ((src0 << bit) & 0x80000000) {
461       dst = bit;
462       break;
463    }
464 }
465 """)
466 
467 unop("uclz", tuint32, """
468 int bit;
469 for (bit = bit_size - 1; bit >= 0; bit--) {
470    if ((src0 & (1u << bit)) != 0)
471       break;
472 }
473 dst = (unsigned)(31 - bit);
474 """)
475 
476 unop("ifind_msb", tint32, """
477 dst = -1;
478 for (int bit = 31; bit >= 0; bit--) {
479    /* If src0 < 0, we're looking for the first 0 bit.
480     * if src0 >= 0, we're looking for the first 1 bit.
481     */
482    if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
483       (!((src0 >> bit) & 1) && (src0 < 0))) {
484       dst = bit;
485       break;
486    }
487 }
488 """)
489 
490 unop_convert("ifind_msb_rev", tint32, tint, """
491 dst = -1;
492 if (src0 != 0 && src0 != -1) {
493    for (int bit = 0; bit < 31; bit++) {
494       /* If src0 < 0, we're looking for the first 0 bit.
495        * if src0 >= 0, we're looking for the first 1 bit.
496        */
497       if ((((src0 << bit) & 0x40000000) && (src0 >= 0)) ||
498           ((!((src0 << bit) & 0x40000000)) && (src0 < 0))) {
499          dst = bit;
500          break;
501       }
502    }
503 }
504 """)
505 
506 unop_convert("find_lsb", tint32, tint, """
507 dst = -1;
508 for (unsigned bit = 0; bit < bit_size; bit++) {
509    if ((src0 >> bit) & 1) {
510       dst = bit;
511       break;
512    }
513 }
514 """)
515 
516 # AMD_gcn_shader extended instructions
517 unop_horiz("cube_face_coord_amd", 2, tfloat32, 3, tfloat32, """
518 dst.x = dst.y = 0.0;
519 float absX = fabsf(src0.x);
520 float absY = fabsf(src0.y);
521 float absZ = fabsf(src0.z);
522 
523 float ma = 0.0;
524 if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
525 if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
526 if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }
527 
528 if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
529 if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
530 if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
531 if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
532 if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
533 if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }
534 
535 dst.x = dst.x * (1.0f / ma) + 0.5f;
536 dst.y = dst.y * (1.0f / ma) + 0.5f;
537 """)
538 
539 unop_horiz("cube_face_index_amd", 1, tfloat32, 3, tfloat32, """
540 dst.x = 0.0;
541 float absX = fabsf(src0.x);
542 float absY = fabsf(src0.y);
543 float absZ = fabsf(src0.z);
544 if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
545 if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
546 if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
547 if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
548 if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
549 if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
550 """)
551 
552 # Sum of vector components
553 unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
554 
555 def binop_convert(name, out_type, in_type, alg_props, const_expr):
556    opcode(name, 0, out_type, [0, 0], [in_type, in_type],
557           False, alg_props, const_expr)
558 
559 def binop(name, ty, alg_props, const_expr):
560    binop_convert(name, ty, ty, alg_props, const_expr)
561 
562 def binop_compare(name, ty, alg_props, const_expr):
563    binop_convert(name, tbool1, ty, alg_props, const_expr)
564 
565 def binop_compare8(name, ty, alg_props, const_expr):
566    binop_convert(name, tbool8, ty, alg_props, const_expr)
567 
568 def binop_compare16(name, ty, alg_props, const_expr):
569    binop_convert(name, tbool16, ty, alg_props, const_expr)
570 
571 def binop_compare32(name, ty, alg_props, const_expr):
572    binop_convert(name, tbool32, ty, alg_props, const_expr)
573 
574 def binop_compare_all_sizes(name, ty, alg_props, const_expr):
575    binop_compare(name, ty, alg_props, const_expr)
576    binop_compare8(name + "8", ty, alg_props, const_expr)
577    binop_compare16(name + "16", ty, alg_props, const_expr)
578    binop_compare32(name + "32", ty, alg_props, const_expr)
579 
580 def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
581                 src2_type, const_expr):
582    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
583           False, "", const_expr)
584 
585 def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
586                  reduce_expr, final_expr, suffix=""):
587    def final(src):
588       return final_expr.format(src= "(" + src + ")")
589    def reduce_(src0, src1):
590       return reduce_expr.format(src0=src0, src1=src1)
591    def prereduce(src0, src1):
592       return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
593    srcs = [prereduce("src0." + letter, "src1." + letter) for letter in "xyzwefghijklmnop"]
594    def pairwise_reduce(start, size):
595       if (size == 1):
596          return srcs[start]
597       return reduce_(pairwise_reduce(start + size // 2, size // 2), pairwise_reduce(start, size // 2))
598    for size in [2, 4, 8, 16]:
599       opcode(name + str(size) + suffix, output_size, output_type,
600              [size, size], [src_type, src_type], False, _2src_commutative,
601              final(pairwise_reduce(0, size)))
602    opcode(name + "3" + suffix, output_size, output_type,
603           [3, 3], [src_type, src_type], False, _2src_commutative,
604           final(reduce_(reduce_(srcs[2], srcs[1]), srcs[0])))
605    opcode(name + "5" + suffix, output_size, output_type,
606           [5, 5], [src_type, src_type], False, _2src_commutative,
607           final(reduce_(srcs[4], reduce_(reduce_(srcs[3], srcs[2]), reduce_(srcs[1], srcs[0])))))
608 
609 def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
610                            reduce_expr, final_expr):
611    binop_reduce(name, output_size, tbool1, src_type,
612                 prereduce_expr, reduce_expr, final_expr)
613    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
614                 prereduce_expr, reduce_expr, final_expr)
615    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
616                 prereduce_expr, reduce_expr, final_expr)
617    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
618                 prereduce_expr, reduce_expr, final_expr)
619 
620 binop("fadd", tfloat, _2src_commutative + associative,"""
621 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
622    if (bit_size == 64)
623       dst = _mesa_double_add_rtz(src0, src1);
624    else
625       dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
626 } else {
627    dst = src0 + src1;
628 }
629 """)
630 binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
631 binop("iadd_sat", tint, _2src_commutative, """
632       src1 > 0 ?
633          (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
634          (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
635 """)
636 binop("uadd_sat", tuint, _2src_commutative,
637       "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
638 binop("isub_sat", tint, "", """
639       src1 < 0 ?
640          (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
641          (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
642 """)
643 binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")
644 
645 binop("fsub", tfloat, "", """
646 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
647    if (bit_size == 64)
648       dst = _mesa_double_sub_rtz(src0, src1);
649    else
650       dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
651 } else {
652    dst = src0 - src1;
653 }
654 """)
655 binop("isub", tint, "", "src0 - src1")
656 binop_convert("uabs_isub", tuint, tint, "", """
657               src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
658                           : (uint64_t) src0 - (uint64_t) src1
659 """)
660 binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
661 
662 binop("fmul", tfloat, _2src_commutative + associative, """
663 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
664    if (bit_size == 64)
665       dst = _mesa_double_mul_rtz(src0, src1);
666    else
667       dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
668 } else {
669    dst = src0 * src1;
670 }
671 """)
672 # low 32-bits of signed/unsigned integer multiply
673 binop("imul", tint, _2src_commutative + associative, """
674    /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
675    dst = (uint64_t)src0 * (uint64_t)src1;
676 """)
677 
678 # Generate 64 bit result from 2 32 bits quantity
679 binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
680               "(int64_t)src0 * (int64_t)src1")
681 binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
682               "(uint64_t)src0 * (uint64_t)src1")
683 
684 # high 32-bits of signed integer multiply
685 binop("imul_high", tint, _2src_commutative, """
686 if (bit_size == 64) {
687    /* We need to do a full 128-bit x 128-bit multiply in order for the sign
688     * extension to work properly.  The casts are kind-of annoying but needed
689     * to prevent compiler warnings.
690     */
691    uint32_t src0_u32[4] = {
692       src0,
693       (int64_t)src0 >> 32,
694       (int64_t)src0 >> 63,
695       (int64_t)src0 >> 63,
696    };
697    uint32_t src1_u32[4] = {
698       src1,
699       (int64_t)src1 >> 32,
700       (int64_t)src1 >> 63,
701       (int64_t)src1 >> 63,
702    };
703    uint32_t prod_u32[4];
704    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
705    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
706 } else {
707    /* First, sign-extend to 64-bit, then convert to unsigned to prevent
708     * potential overflow of signed multiply */
709    dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
710 }
711 """)
712 
713 # high 32-bits of unsigned integer multiply
714 binop("umul_high", tuint, _2src_commutative, """
715 if (bit_size == 64) {
716    /* The casts are kind-of annoying but needed to prevent compiler warnings. */
717    uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
718    uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
719    uint32_t prod_u32[4];
720    ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
721    dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
722 } else {
723    dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
724 }
725 """)
726 
727 # low 32-bits of unsigned integer multiply
728 binop("umul_low", tuint32, _2src_commutative, """
729 uint64_t mask = (1 << (bit_size / 2)) - 1;
730 dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
731 """)
732 
733 # Multiply 32-bits with low 16-bits.
734 binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
735 binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")
736 
737 binop("fdiv", tfloat, "", "src0 / src1")
738 binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
739 binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
740 
741 # returns a boolean representing the carry resulting from the addition of
742 # the two unsigned arguments.
743 
744 binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")
745 
746 # returns a boolean representing the borrow resulting from the subtraction
747 # of the two unsigned arguments.
748 
749 binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")
750 
751 # hadd: (a + b) >> 1 (without overflow)
752 # x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
753 #       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
754 #       = 2 *  (x & y) + (x & ~y) +                (~x & y)
755 #       =     ((x & y) << 1) + (x ^ y)
756 #
757 # Since we know that the bottom bit of (x & y) << 1 is zero,
758 #
759 # (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
760 #              =   (x & y) +      ((x ^ y)  >> 1)
761 binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
762 binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
763 
764 # rhadd: (a + b + 1) >> 1 (without overflow)
765 # x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
766 #           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
767 #           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
768 #           =     ((x | y) << 1) - (x ^ y) + 1
769 #
770 # Since we know that the bottom bit of (x & y) << 1 is zero,
771 #
772 # (x + y + 1) >> 1 = (x | y) + (-(x ^ y) + 1) >> 1)
773 #                  = (x | y) -  ((x ^ y)      >> 1)
774 binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
775 binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
776 
777 binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")
778 
779 # For signed integers, there are several different possible definitions of
780 # "modulus" or "remainder".  We follow the conventions used by LLVM and
781 # SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
782 # operation while the imod opcode implements the more mathematical
783 # "modulus" operation.  For details on the difference, see
784 #
785 # http://mathforum.org/library/drmath/view/52343.html
786 
787 binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
788 binop("imod", tint, "",
789       "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
790       "                 src0 % src1 : src0 % src1 + src1)")
791 binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
792 binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
793 
794 #
795 # Comparisons
796 #
797 
798 
799 # these integer-aware comparisons return a boolean (0 or ~0)
800 
801 binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
802 binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
803 binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
804 binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
805 binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
806 binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
807 binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
808 binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
809 binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
810 binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
811 
812 # integer-aware GLSL-style comparisons that compare floats and ints
813 
814 binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
815                        "{src0} && {src1}", "{src}")
816 binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
817                        "{src0} || {src1}", "{src}")
818 binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
819                        "{src0} && {src1}", "{src}")
820 binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
821                        "{src0} || {src1}", "{src}")
822 
823 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
824 
825 binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
826              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
827 binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
828              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
829 
830 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
831 # and false respectively
832 
833 binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
834 binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
835 binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
836 binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
837 
838 # SPIRV shifts are undefined for shift-operands >= bitsize,
839 # but SM5 shifts are defined to use only the least significant bits.
840 # The NIR definition is according to the SM5 specification.
841 opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
842        "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))")
843 opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
844        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
845 opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
846        "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
847 
848 opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
849    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
850    dst = (src0 << (src1 & rotate_mask)) |
851          (src0 >> (-src1 & rotate_mask));
852 """)
853 opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
854    uint32_t rotate_mask = sizeof(src0) * 8 - 1;
855    dst = (src0 >> (src1 & rotate_mask)) |
856          (src0 << (-src1 & rotate_mask));
857 """)
858 
859 # bitwise logic operators
860 #
861 # These are also used as boolean and, or, xor for hardware supporting
862 # integers.
863 
864 
865 binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
866 binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
867 binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
868 
869 
870 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
871              "{src}")
872 
873 binop_reduce("fdot", 4, tfloat, tfloat,
874              "{src0} * {src1}", "{src0} + {src1}", "{src}",
875              suffix="_replicated")
876 
877 opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
878        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
879 opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
880        "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
881 
882 binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
883 binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
884 binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
885 binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
886 binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
887 binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
888 
889 binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
890 
891 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
892             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
893 
894 binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
895               "src0 | ((uint64_t)src1 << 32)")
896 
897 binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
898               "src0 | ((uint32_t)src1 << 16)")
899 
900 opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
901        False, "",
902        "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")
903 
904 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
905 # and that of the "bfi1" i965 instruction. That is, the bits and offset values
906 # are from the low five bits of src0 and src1, respectively.
907 binop_convert("bfm", tuint32, tint32, "", """
908 int bits = src0 & 0x1F;
909 int offset = src1 & 0x1F;
910 dst = ((1u << bits) - 1) << offset;
911 """)
912 
913 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
914 dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
915 /* flush denormals to zero. */
916 if (!isnormal(dst))
917    dst = copysignf(0.0f, src0);
918 """)
919 
920 # Combines the first component of each input to make a 2-component vector.
921 
922 binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
923 dst.x = src0.x;
924 dst.y = src1.x;
925 """)
926 
927 # Byte extraction
928 binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
929 binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")
930 
931 # Word extraction
932 binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
933 binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
934 
935 # Byte/word insertion
936 binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
937 binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")
938 
939 
940 def triop(name, ty, alg_props, const_expr):
941    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
942 def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
943    opcode(name, output_size, tuint,
944    [src1_size, src2_size, src3_size],
945    [tuint, tuint, tuint], False, "", const_expr)
946 
947 triop("ffma", tfloat, _2src_commutative, """
948 if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
949    if (bit_size == 64)
950       dst = _mesa_double_fma_rtz(src0, src1, src2);
951    else if (bit_size == 32)
952       dst = _mesa_float_fma_rtz(src0, src1, src2);
953    else
954       dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
955 } else {
956    if (bit_size == 32)
957       dst = fmaf(src0, src1, src2);
958    else
959       dst = fma(src0, src1, src2);
960 }
961 """)
962 
963 triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
964 
965 # Ternary addition
966 triop("iadd3", tint, _2src_commutative + associative, "src0 + src1 + src2")
967 
968 # Conditional Select
969 #
970 # A vector conditional select instruction (like ?:, but operating per-
971 # component on vectors). There are two versions, one for floating point
972 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
973 
974 triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")
975 
976 opcode("bcsel", 0, tuint, [0, 0, 0],
977        [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
978 opcode("b8csel", 0, tuint, [0, 0, 0],
979        [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
980 opcode("b16csel", 0, tuint, [0, 0, 0],
981        [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
982 opcode("b32csel", 0, tuint, [0, 0, 0],
983        [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
984 
985 triop("i32csel_gt", tint32, "", "(src0 > 0.0f) ? src1 : src2")
986 triop("i32csel_ge", tint32, "", "(src0 >= 0.0f) ? src1 : src2")
987 
988 triop("fcsel_gt", tfloat32, "", "(src0 > 0.0f) ? src1 : src2")
989 triop("fcsel_ge", tfloat32, "", "(src0 >= 0.0f) ? src1 : src2")
990 
991 # SM5 bfi assembly
992 triop("bfi", tuint32, "", """
993 unsigned mask = src0, insert = src1, base = src2;
994 if (mask == 0) {
995    dst = base;
996 } else {
997    unsigned tmp = mask;
998    while (!(tmp & 1)) {
999       tmp >>= 1;
1000       insert <<= 1;
1001    }
1002    dst = (base & ~mask) | (insert & mask);
1003 }
1004 """)
1005 
1006 
1007 triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
1008 
1009 # SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
1010 opcode("ubfe", 0, tuint32,
1011        [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
1012 unsigned base = src0;
1013 unsigned offset = src1 & 0x1F;
1014 unsigned bits = src2 & 0x1F;
1015 if (bits == 0) {
1016    dst = 0;
1017 } else if (offset + bits < 32) {
1018    dst = (base << (32 - bits - offset)) >> (32 - bits);
1019 } else {
1020    dst = base >> offset;
1021 }
1022 """)
1023 opcode("ibfe", 0, tint32,
1024        [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
1025 int base = src0;
1026 unsigned offset = src1 & 0x1F;
1027 unsigned bits = src2 & 0x1F;
1028 if (bits == 0) {
1029    dst = 0;
1030 } else if (offset + bits < 32) {
1031    dst = (base << (32 - bits - offset)) >> (32 - bits);
1032 } else {
1033    dst = base >> offset;
1034 }
1035 """)
1036 
1037 # GLSL bitfieldExtract()
1038 opcode("ubitfield_extract", 0, tuint32,
1039        [0, 0, 0], [tuint32, tint32, tint32], False, "", """
1040 unsigned base = src0;
1041 int offset = src1, bits = src2;
1042 if (bits == 0) {
1043    dst = 0;
1044 } else if (bits < 0 || offset < 0 || offset + bits > 32) {
1045    dst = 0; /* undefined per the spec */
1046 } else {
1047    dst = (base >> offset) & ((1ull << bits) - 1);
1048 }
1049 """)
1050 opcode("ibitfield_extract", 0, tint32,
1051        [0, 0, 0], [tint32, tint32, tint32], False, "", """
1052 int base = src0;
1053 int offset = src1, bits = src2;
1054 if (bits == 0) {
1055    dst = 0;
1056 } else if (offset < 0 || bits < 0 || offset + bits > 32) {
1057    dst = 0;
1058 } else {
1059    dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
1060 }
1061 """)
1062 
1063 # Sum of absolute differences with accumulation.
1064 # (Equivalent to AMD's v_sad_u8 instruction.)
1065 # The first two sources contain packed 8-bit unsigned integers, the instruction
1066 # will calculate the absolute difference of these, and then add them together.
1067 # There is also a third source which is a 32-bit unsigned integer and added to the result.
1068 triop_horiz("sad_u8x4", 1, 1, 1, 1, """
1069 uint8_t s0_b0 = (src0.x & 0x000000ff) >> 0;
1070 uint8_t s0_b1 = (src0.x & 0x0000ff00) >> 8;
1071 uint8_t s0_b2 = (src0.x & 0x00ff0000) >> 16;
1072 uint8_t s0_b3 = (src0.x & 0xff000000) >> 24;
1073 
1074 uint8_t s1_b0 = (src1.x & 0x000000ff) >> 0;
1075 uint8_t s1_b1 = (src1.x & 0x0000ff00) >> 8;
1076 uint8_t s1_b2 = (src1.x & 0x00ff0000) >> 16;
1077 uint8_t s1_b3 = (src1.x & 0xff000000) >> 24;
1078 
1079 dst.x = src2.x +
1080         (s0_b0 > s1_b0 ? (s0_b0 - s1_b0) : (s1_b0 - s0_b0)) +
1081         (s0_b1 > s1_b1 ? (s0_b1 - s1_b1) : (s1_b1 - s0_b1)) +
1082         (s0_b2 > s1_b2 ? (s0_b2 - s1_b2) : (s1_b2 - s0_b2)) +
1083         (s0_b3 > s1_b3 ? (s0_b3 - s1_b3) : (s1_b3 - s0_b3));
1084 """)
1085 
1086 # Combines the first component of each input to make a 3-component vector.
1087 
1088 triop_horiz("vec3", 3, 1, 1, 1, """
1089 dst.x = src0.x;
1090 dst.y = src1.x;
1091 dst.z = src2.x;
1092 """)
1093 
1094 def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
1095                  src4_size, const_expr):
1096    opcode(name, output_size, tuint,
1097           [src1_size, src2_size, src3_size, src4_size],
1098           [tuint, tuint, tuint, tuint],
1099           False, "", const_expr)
1100 
1101 opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
1102        [tuint32, tuint32, tint32, tint32], False, "", """
1103 unsigned base = src0, insert = src1;
1104 int offset = src2, bits = src3;
1105 if (bits == 0) {
1106    dst = base;
1107 } else if (offset < 0 || bits < 0 || bits + offset > 32) {
1108    dst = 0;
1109 } else {
1110    unsigned mask = ((1ull << bits) - 1) << offset;
1111    dst = (base & ~mask) | ((insert << offset) & mask);
1112 }
1113 """)
1114 
1115 quadop_horiz("vec4", 4, 1, 1, 1, 1, """
1116 dst.x = src0.x;
1117 dst.y = src1.x;
1118 dst.z = src2.x;
1119 dst.w = src3.x;
1120 """)
1121 
1122 opcode("vec5", 5, tuint,
1123        [1] * 5, [tuint] * 5,
1124        False, "", """
1125 dst.x = src0.x;
1126 dst.y = src1.x;
1127 dst.z = src2.x;
1128 dst.w = src3.x;
1129 dst.e = src4.x;
1130 """)
1131 
1132 opcode("vec8", 8, tuint,
1133        [1] * 8, [tuint] * 8,
1134        False, "", """
1135 dst.x = src0.x;
1136 dst.y = src1.x;
1137 dst.z = src2.x;
1138 dst.w = src3.x;
1139 dst.e = src4.x;
1140 dst.f = src5.x;
1141 dst.g = src6.x;
1142 dst.h = src7.x;
1143 """)
1144 
1145 opcode("vec16", 16, tuint,
1146        [1] * 16, [tuint] * 16,
1147        False, "", """
1148 dst.x = src0.x;
1149 dst.y = src1.x;
1150 dst.z = src2.x;
1151 dst.w = src3.x;
1152 dst.e = src4.x;
1153 dst.f = src5.x;
1154 dst.g = src6.x;
1155 dst.h = src7.x;
1156 dst.i = src8.x;
1157 dst.j = src9.x;
1158 dst.k = src10.x;
1159 dst.l = src11.x;
1160 dst.m = src12.x;
1161 dst.n = src13.x;
1162 dst.o = src14.x;
1163 dst.p = src15.x;
1164 """)
1165 
1166 # An integer multiply instruction for address calculation.  This is
1167 # similar to imul, except that the results are undefined in case of
1168 # overflow.  Overflow is defined according to the size of the variable
1169 # being dereferenced.
1170 #
1171 # This relaxed definition, compared to imul, allows an optimization
1172 # pass to propagate bounds (ie, from an load/store intrinsic) to the
1173 # sources, such that lower precision integer multiplies can be used.
1174 # This is useful on hw that has 24b or perhaps 16b integer multiply
1175 # instructions.
1176 binop("amul", tint, _2src_commutative + associative, "src0 * src1")
1177 
1178 # ir3-specific instruction that maps directly to mul-add shift high mix,
1179 # (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
1180 # multiplication (imul) on Freedreno backend..
1181 opcode("imadsh_mix16", 0, tint32,
1182        [0, 0, 0], [tint32, tint32, tint32], False, "", """
1183 dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
1184 """)
1185 
1186 # ir3-specific instruction that maps directly to ir3 mad.s24.
1187 #
1188 # 24b multiply into 32b result (with sign extension) plus 32b int
1189 triop("imad24_ir3", tint32, _2src_commutative,
1190       "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")
1191 
1192 # r600-specific instruction that evaluates unnormalized cube texture coordinates
1193 # and face index
1194 # The actual texture coordinates are evaluated from this according to
1195 #    dst.yx / abs(dst.z) + 1.5
1196 unop_horiz("cube_r600", 4, tfloat32, 3, tfloat32, """
1197    dst.x = dst.y = dst.z = 0.0;
1198    float absX = fabsf(src0.x);
1199    float absY = fabsf(src0.y);
1200    float absZ = fabsf(src0.z);
1201 
1202    if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
1203    if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
1204    if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }
1205 
1206    if (src0.x >= 0 && absX >= absY && absX >= absZ) {
1207       dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
1208    }
1209    if (src0.x < 0 && absX >= absY && absX >= absZ) {
1210       dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
1211    }
1212    if (src0.y >= 0 && absY >= absX && absY >= absZ) {
1213       dst.y = src0.x; dst.x = src0.z; dst.w = 2;
1214    }
1215    if (src0.y < 0 && absY >= absX && absY >= absZ) {
1216       dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
1217    }
1218    if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
1219       dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
1220    }
1221    if (src0.z < 0 && absZ >= absX && absZ >= absY) {
1222       dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
1223    }
1224 """)
1225 
1226 # r600 specific sin and cos
1227 # these trigeometric functions need some lowering because the supported
1228 # input values are expected to be normalized by dividing by (2 * pi)
1229 unop("fsin_r600", tfloat32, "sinf(6.2831853 * src0)")
1230 unop("fcos_r600", tfloat32, "cosf(6.2831853 * src0)")
1231 
1232 # AGX specific sin with input expressed in quadrants. Used in the lowering for
1233 # fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
1234 # the angle is further decomposed by quadrant, sinc is computed, and the angle
1235 # is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
1236 # additional ALU that NIR may be able to optimize.
1237 unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")
1238 
1239 # 24b multiply into 32b result (with sign extension)
1240 binop("imul24", tint32, _2src_commutative + associative,
1241       "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")
1242 
1243 # unsigned 24b multiply into 32b result plus 32b int
1244 triop("umad24", tuint32, _2src_commutative,
1245       "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")
1246 
1247 # unsigned 24b multiply into 32b result uint
1248 binop("umul24", tint32, _2src_commutative + associative,
1249       "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")
1250 
1251 # relaxed versions of the above, which assume input is in the 24bit range (no clamping)
1252 binop("imul24_relaxed", tint32, _2src_commutative + associative, "src0 * src1")
1253 triop("umad24_relaxed", tuint32, _2src_commutative, "src0 * src1 + src2")
1254 binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")
1255 
1256 unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
1257 unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
1258 unop_convert("fisfinite32", tint32, tfloat, "isfinite(src0)")
1259 
1260 # vc4-specific opcodes
1261 
1262 # Saturated vector add for 4 8bit ints.
1263 binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
1264 dst = 0;
1265 for (int i = 0; i < 32; i += 8) {
1266    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
1267 }
1268 """)
1269 
1270 # Saturated vector subtract for 4 8bit ints.
1271 binop("ussub_4x8_vc4", tint32, "", """
1272 dst = 0;
1273 for (int i = 0; i < 32; i += 8) {
1274    int src0_chan = (src0 >> i) & 0xff;
1275    int src1_chan = (src1 >> i) & 0xff;
1276    if (src0_chan > src1_chan)
1277       dst |= (src0_chan - src1_chan) << i;
1278 }
1279 """)
1280 
1281 # vector min for 4 8bit ints.
1282 binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
1283 dst = 0;
1284 for (int i = 0; i < 32; i += 8) {
1285    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
1286 }
1287 """)
1288 
1289 # vector max for 4 8bit ints.
1290 binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
1291 dst = 0;
1292 for (int i = 0; i < 32; i += 8) {
1293    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
1294 }
1295 """)
1296 
1297 # unorm multiply: (a * b) / 255.
1298 binop("umul_unorm_4x8_vc4", tint32, _2src_commutative + associative, """
1299 dst = 0;
1300 for (int i = 0; i < 32; i += 8) {
1301    int src0_chan = (src0 >> i) & 0xff;
1302    int src1_chan = (src1 >> i) & 0xff;
1303    dst |= ((src0_chan * src1_chan) / 255) << i;
1304 }
1305 """)
1306 
1307 # Mali-specific opcodes
1308 unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
1309 unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))
1310 
1311 # Magnitude equal to fddx/y, sign undefined. Derivative of a constant is zero.
1312 unop("fddx_must_abs_mali", tfloat, "0.0")
1313 unop("fddy_must_abs_mali", tfloat, "0.0")
1314 
1315 # DXIL specific double [un]pack
1316 # DXIL doesn't support generic [un]pack instructions, so we want those
1317 # lowered to bit ops. HLSL doesn't support 64bit bitcasts to/from
1318 # double, only [un]pack. Technically DXIL does, but considering they
1319 # can't be generated from HLSL, we want to match what would be coming from DXC.
1320 # This is essentially just the standard [un]pack, except that it doesn't get
1321 # lowered so we can handle it in the backend and turn it into MakeDouble/SplitDouble
1322 unop_horiz("pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
1323            "dst.x = src0.x | ((uint64_t)src0.y << 32);")
1324 unop_horiz("unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
1325            "dst.x = src0.x; dst.y = src0.x >> 32;")
1326 
1327 # src0 and src1 are i8vec4 packed in an int32, and src2 is an int32.  The int8
1328 # components are sign-extended to 32-bits, and a dot-product is performed on
1329 # the resulting vectors.  src2 is added to the result of the dot-product.
1330 opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1331        False, _2src_commutative, """
1332    const int32_t v0x = (int8_t)(src0      );
1333    const int32_t v0y = (int8_t)(src0 >>  8);
1334    const int32_t v0z = (int8_t)(src0 >> 16);
1335    const int32_t v0w = (int8_t)(src0 >> 24);
1336    const int32_t v1x = (int8_t)(src1      );
1337    const int32_t v1y = (int8_t)(src1 >>  8);
1338    const int32_t v1z = (int8_t)(src1 >> 16);
1339    const int32_t v1w = (int8_t)(src1 >> 24);
1340 
1341    dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1342 """)
1343 
1344 # Like sdot_4x8_iadd, but unsigned.
1345 opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
1346        False, _2src_commutative, """
1347    const uint32_t v0x = (uint8_t)(src0      );
1348    const uint32_t v0y = (uint8_t)(src0 >>  8);
1349    const uint32_t v0z = (uint8_t)(src0 >> 16);
1350    const uint32_t v0w = (uint8_t)(src0 >> 24);
1351    const uint32_t v1x = (uint8_t)(src1      );
1352    const uint32_t v1y = (uint8_t)(src1 >>  8);
1353    const uint32_t v1z = (uint8_t)(src1 >> 16);
1354    const uint32_t v1w = (uint8_t)(src1 >> 24);
1355 
1356    dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1357 """)
1358 
1359 # src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and
1360 # src2 is an int32.  The 8-bit components are extended to 32-bits, and a
1361 # dot-product is performed on the resulting vectors.  src2 is added to the
1362 # result of the dot-product.
1363 #
1364 # NOTE: Unlike many of the other dp4a opcodes, this mixed signs of source 0
1365 # and source 1 mean that this opcode is not 2-source commutative
1366 opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1367        False, "", """
1368    const int32_t v0x = (int8_t)(src0      );
1369    const int32_t v0y = (int8_t)(src0 >>  8);
1370    const int32_t v0z = (int8_t)(src0 >> 16);
1371    const int32_t v0w = (int8_t)(src0 >> 24);
1372    const uint32_t v1x = (uint8_t)(src1      );
1373    const uint32_t v1y = (uint8_t)(src1 >>  8);
1374    const uint32_t v1z = (uint8_t)(src1 >> 16);
1375    const uint32_t v1w = (uint8_t)(src1 >> 24);
1376 
1377    dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1378 """)
1379 
1380 # Like sdot_4x8_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff].
1381 opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1382        False, _2src_commutative, """
1383    const int64_t v0x = (int8_t)(src0      );
1384    const int64_t v0y = (int8_t)(src0 >>  8);
1385    const int64_t v0z = (int8_t)(src0 >> 16);
1386    const int64_t v0w = (int8_t)(src0 >> 24);
1387    const int64_t v1x = (int8_t)(src1      );
1388    const int64_t v1y = (int8_t)(src1 >>  8);
1389    const int64_t v1z = (int8_t)(src1 >> 16);
1390    const int64_t v1w = (int8_t)(src1 >> 24);
1391 
1392    const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1393 
1394    dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
1395 """)
1396 
1397 # Like udot_4x8_uadd, but the result is clampled to the range [0, 0xfffffffff].
1398 opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1399        False, _2src_commutative, """
1400    const uint64_t v0x = (uint8_t)(src0      );
1401    const uint64_t v0y = (uint8_t)(src0 >>  8);
1402    const uint64_t v0z = (uint8_t)(src0 >> 16);
1403    const uint64_t v0w = (uint8_t)(src0 >> 24);
1404    const uint64_t v1x = (uint8_t)(src1      );
1405    const uint64_t v1y = (uint8_t)(src1 >>  8);
1406    const uint64_t v1z = (uint8_t)(src1 >> 16);
1407    const uint64_t v1w = (uint8_t)(src1 >> 24);
1408 
1409    const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1410 
1411    dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
1412 """)
1413 
1414 # Like sudot_4x8_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff].
1415 #
1416 # NOTE: Unlike many of the other dp4a opcodes, this mixed signs of source 0
1417 # and source 1 mean that this opcode is not 2-source commutative
1418 opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1419        False, "", """
1420    const int64_t v0x = (int8_t)(src0      );
1421    const int64_t v0y = (int8_t)(src0 >>  8);
1422    const int64_t v0z = (int8_t)(src0 >> 16);
1423    const int64_t v0w = (int8_t)(src0 >> 24);
1424    const uint64_t v1x = (uint8_t)(src1      );
1425    const uint64_t v1y = (uint8_t)(src1 >>  8);
1426    const uint64_t v1z = (uint8_t)(src1 >> 16);
1427    const uint64_t v1w = (uint8_t)(src1 >> 24);
1428 
1429    const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
1430 
1431    dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
1432 """)
1433 
1434 # src0 and src1 are i16vec2 packed in an int32, and src2 is an int32.  The int16
1435 # components are sign-extended to 32-bits, and a dot-product is performed on
1436 # the resulting vectors.  src2 is added to the result of the dot-product.
1437 opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1438        False, _2src_commutative, """
1439    const int32_t v0x = (int16_t)(src0      );
1440    const int32_t v0y = (int16_t)(src0 >> 16);
1441    const int32_t v1x = (int16_t)(src1      );
1442    const int32_t v1y = (int16_t)(src1 >> 16);
1443 
1444    dst = (v0x * v1x) + (v0y * v1y) + src2;
1445 """)
1446 
1447 # Like sdot_2x16_iadd, but unsigned.
1448 opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
1449        False, _2src_commutative, """
1450    const uint32_t v0x = (uint16_t)(src0      );
1451    const uint32_t v0y = (uint16_t)(src0 >> 16);
1452    const uint32_t v1x = (uint16_t)(src1      );
1453    const uint32_t v1y = (uint16_t)(src1 >> 16);
1454 
1455    dst = (v0x * v1x) + (v0y * v1y) + src2;
1456 """)
1457 
1458 # Like sdot_2x16_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff].
1459 opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1460        False, _2src_commutative, """
1461    const int64_t v0x = (int16_t)(src0      );
1462    const int64_t v0y = (int16_t)(src0 >> 16);
1463    const int64_t v1x = (int16_t)(src1      );
1464    const int64_t v1y = (int16_t)(src1 >> 16);
1465 
1466    const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;
1467 
1468    dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
1469 """)
1470 
1471 # Like udot_2x16_uadd, but the result is clampled to the range [0, 0xfffffffff].
1472 opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
1473        False, _2src_commutative, """
1474    const uint64_t v0x = (uint16_t)(src0      );
1475    const uint64_t v0y = (uint16_t)(src0 >> 16);
1476    const uint64_t v1x = (uint16_t)(src1      );
1477    const uint64_t v1y = (uint16_t)(src1 >> 16);
1478 
1479    const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;
1480 
1481    dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
1482 """)
1483