• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Connor Abbott (cwabbott0@gmail.com)
25
26import re
27
28# Class that represents all the information we have about the opcode
29# NOTE: this must be kept in sync with nir_op_info
30
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - output_size and each entry of input_sizes is a component count:
        0 through 4, 8 or 16.  If output_size is non-zero, no input size
        may be 0.
      - all types are strings that get nir_type_ prepended to them
      - input_sizes and input_types are non-empty lists of equal length
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.

      Every argument is validated with assert; a malformed opcode
      description raises AssertionError.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_types, list)
      # At least one input is required, and every entry (not only the
      # first) must have the right type so that a bad description is
      # reported here rather than by a later consumer.
      assert input_sizes, "input_sizes must not be empty"
      assert input_types, "input_types must not be empty"
      assert all(isinstance(size, int) for size in input_sizes)
      assert all(isinstance(type_, str) for type_ in input_types)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
      for size in input_sizes:
         assert 0 <= size <= 4 or (size == 8) or (size == 16)
         # An opcode with a fixed output size cannot have a size-0 input.
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
92
# helper variables for strings
#
# Each name holds a NIR type string: a base type (int, uint, float or bool)
# optionally followed by a bit size.  The unsized names (tfloat, tint, tuint,
# tbool) carry no bit size.
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

# Splits a type string into its base type ('type' group) and its optional
# trailing bit size ('bits' group, None when the type is unsized).
_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')
114
def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size.

    Asserts that type_ is a recognizable NIR type string.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    return bits is not None
119
def type_size(type_):
    """Return the bit size encoded in a sized NIR type string as an int.

    Asserts that type_ is a recognizable NIR type string and that it
    actually has a bit size.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    bits = parsed.group('bits')
    assert bits is not None, \
           'NIR type string has no bit size: "{}"'.format(type_)
    return int(bits)
126
def type_sizes(type_):
    """Return the list of bit sizes a NIR type string can take.

    A sized type yields a one-element list holding its own size.  An
    unsized type yields every size listed for its base type.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    # Unsized types: bool and float have their own lists, everything else
    # (int, uint) shares the last one.
    if type_ == 'bool':
        sizes = [1, 8, 16, 32]
    elif type_ == 'float':
        sizes = [16, 32, 64]
    else:
        sizes = [1, 8, 16, 32, 64]
    return sizes
136
def type_base_type(type_):
    """Return the base type (int, uint, float or bool) of a NIR type string.

    Asserts that type_ is a recognizable NIR type string.
    """
    parsed = _TYPE_SPLIT_RE.match(type_)
    assert parsed is not None, 'Invalid NIR type string: "{}"'.format(type_)
    base = parsed.group('type')
    return base
141
# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
#
# Both property strings end in a space so that they can be joined with "+"
# into one space-separated algebraic_properties string.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes, mapping each opcode name to its Opcode
opcodes = {}
152
def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
   """Build an Opcode from the arguments and record it in opcodes.

   Asserts that no opcode called name has been registered already.
   """
   assert name not in opcodes
   info = Opcode(name, output_size, output_type, input_sizes, input_types,
                 is_conversion, algebraic_properties, const_expr)
   opcodes[name] = info
159
def unop_convert(name, out_type, in_type, const_expr):
   """Register a per-component one-source opcode whose output type may
   differ from its input type.  It is not flagged as a conversion.
   """
   opcode(name,
          output_size=0, output_type=out_type,
          input_sizes=[0], input_types=[in_type],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr)
162
def unop(name, ty, const_expr):
   """Register a per-component one-source opcode whose input and output
   share the type ty.
   """
   opcode(name,
          output_size=0, output_type=ty,
          input_sizes=[0], input_types=[ty],
          is_conversion=False, algebraic_properties="",
          const_expr=const_expr)
165
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Register a one-source opcode with explicit output and input sizes."""
   sizes = [input_size]
   types = [input_type]
   opcode(name, output_size, output_type, sizes, types, False, "",
          const_expr)
170
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Register name2, name3 and name4: reductions over the components of a
   2-, 3- and 4-component source.

   prereduce_expr is formatted with {src} for each component and wrapped in
   parentheses, reduce_expr combines two such terms through {src0} and
   {src1}, and final_expr receives the parenthesized reduction as {src}.
   """
   def term(component):
      return "(" + prereduce_expr.format(src="src0." + component) + ")"

   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   x, y, z, w = [term(component) for component in "xyzw"]

   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              finish(combine(x, y)))
   # Three components fold left to right ...
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              finish(combine(combine(x, y), z)))
   # ... while four components are combined as two pairs.
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              finish(combine(combine(x, y), combine(z, w))))
189
def unop_numeric_convert(name, out_type, in_type, const_expr):
   """Register a per-component one-source opcode that is flagged as a
   type conversion (is_conversion is True).
   """
   opcode(name,
          output_size=0, output_type=out_type,
          input_sizes=[0], input_types=[in_type],
          is_conversion=True, algebraic_properties="",
          const_expr=const_expr)
192
# Basic per-component unary opcodes.  Several constant expressions are
# written as "bit_size == 64 ? <double form> : <float form>" so that 64-bit
# values use the double-precision C function or literal.
unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
# Clamps: fsat to [0, 1], fsat_signed to [-1, 1], fclamp_pos to [0, inf).
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
# NOTE(review): fexp2 and flog2 call the float functions for every bit size,
# unlike the opcodes above -- confirm this is intended.
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
212
# Generate all of the numeric conversion opcodes
#
# Opcode names are built as <s>2<d><bits>[<rounding>], where <s> and <d> are
# the first letters of the source and destination base types (for example
# f2f16_rtz or i2b1).  The source is left unsized; the destination gets one
# opcode per bit size returned by type_sizes().
for src_t in [tint, tuint, tfloat, tbool]:
   # Destination base types that are generated for each source base type.
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
          if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
              # float -> float16 comes in three flavours: explicit
              # round-to-nearest-even, explicit round-toward-zero, and an
              # unsuffixed one that just assigns src0.
              rnd_modes = ['_rtne', '_rtz', '']
              for rnd_mode in rnd_modes:
                  if rnd_mode == '_rtne':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  elif rnd_mode == '_rtz':
                      conv_expr = """
                      if (bit_size > 16) {
                         dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                      } else {
                         dst = src0;
                      }
                      """
                  else:
                      conv_expr = "src0"

                  unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                              dst_t[0],
                                                              dst_bit_size,
                                                              rnd_mode),
                                       dst_t + str(dst_bit_size),
                                       src_t, conv_expr)
          elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
              # float -> float32 only needs special handling when narrowing
              # from a wider source under round-toward-zero execution mode.
              conv_expr = """
              if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                 dst = _mesa_double_to_float_rtz(src0);
              } else {
                 dst = src0;
              }
              """
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
          else:
              # Conversions to bool test against zero; everything else is a
              # plain assignment of src0.
              conv_expr = "src0 != 0" if dst_t == tbool else "src0"
              unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                       dst_bit_size),
                                   dst_t + str(dst_bit_size), src_t, conv_expr)
270
# Special opcode that is the same as f2f16, i2i16, u2u16 except that it is safe
# to remove it if the result is immediately converted back to 32 bits again.
# This is generated as part of the precision lowering pass. mp stands for medium
# precision.
#
# Each *mp opcode reuses the constant expression of the already-registered
# 16-bit conversion it mirrors, so these lines must stay after the
# conversion-generating loop.
unop_numeric_convert("f2fmp", tfloat16, tfloat32, opcodes["f2f16"].const_expr)
unop_numeric_convert("i2imp", tint16, tint32, opcodes["i2i16"].const_expr)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert("f2imp", tint16, tfloat32, opcodes["f2i16"].const_expr)
unop_numeric_convert("f2ump", tuint16, tfloat32, opcodes["f2u16"].const_expr)
unop_numeric_convert("i2fmp", tfloat16, tint32, opcodes["i2f16"].const_expr)
unop_numeric_convert("u2fmp", tfloat16, tuint32, opcodes["u2f16"].const_expr)

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

# Quantize through half precision: magnitudes below 2^-14 become a zero that
# keeps the sign of src0, everything else is round-tripped through a half.
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# frexp split into two opcodes: frexp_exp yields the exponent that C frexp()
# stores through its pointer argument, frexp_sig yields its return value.
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
313
314
315# Floating point pack and unpack operations.
316
def pack_2x16(fmt):
   """Register pack_<fmt>_2x16: pack two float32 components into one
   uint32, x in the low 16 bits and y in the high 16 bits.

   Every "fmt" in the template is replaced by the format name, so the
   expression calls pack_<fmt>_1x16().
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
   op_name = "pack_" + fmt + "_2x16"
   unop_horiz(op_name, 1, tuint32, 2, tfloat32, template.replace("fmt", fmt))
322
def pack_4x8(fmt):
   """Register pack_<fmt>_4x8: pack four float32 components into one
   uint32, one byte each, with x in the lowest byte and w in the highest.

   Every "fmt" in the template is replaced by the format name, so the
   expression calls pack_<fmt>_1x8().
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
   op_name = "pack_" + fmt + "_4x8"
   unop_horiz(op_name, 1, tuint32, 4, tfloat32, template.replace("fmt", fmt))
330
def unpack_2x16(fmt):
   """Register unpack_<fmt>_2x16: unpack one uint32 into two float32
   components, x from the low 16 bits and y from the high 16 bits.

   Every "fmt" in the template is replaced by the format name, so the
   expression calls unpack_<fmt>_1x16().
   """
   # The high half must be shifted *down* before the uint16_t cast; shifting
   # left by 16 would leave only zero bits in the low 16 that the cast keeps.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
336
def unpack_4x8(fmt):
   """Register unpack_<fmt>_4x8: unpack one uint32 into four float32
   components, one byte each, with x from the lowest byte and w from the
   highest.

   Every "fmt" in the template is replaced by the format name, so the
   expression calls unpack_<fmt>_1x8().
   """
   template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
   op_name = "unpack_" + fmt + "_4x8"
   unop_horiz(op_name, 4, tfloat32, 1, tuint32, template.replace("fmt", fmt))
344
345
# Instantiate the float pack/unpack opcodes.  snorm and unorm get both the
# 2x16 and 4x8 variants; half only exists as 2x16.
pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
356
# Integer packing.  In every opcode below component x supplies the lowest
# bits of the result.

# Low 16 bits of x combined with y shifted into the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

# Four 32-bit components shifted by 0, 8, 16 and 24 and OR-ed together; the
# components are not masked first.
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# The sources below are narrower than the destination, so each component is
# cast to the destination width before being shifted into place.
unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

# Split a 64-bit value into its low (x) and high (y) 32-bit halves.
unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
382
# Split a 64-bit value into four 16-bit components, lowest bits in x.  The
# source has a single component, so every output (including w) must be read
# from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
385
# Split a 32-bit value into two 16-bit halves (x = low, y = high).
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Split a 32-bit value into four bytes (x = lowest, w = highest).
unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")
391
# Unpack two half floats from one uint32 with unpack_half_1x16_flush_to_zero():
# x comes from the low 16 bits and y from the high 16 bits.  The high half
# must be shifted down before the uint16_t cast; a left shift by 16 would
# leave only zero bits in the low 16 that the cast keeps.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")
396
# Lowered floating point unpacking operations.
#
# Each "split" opcode is per-component and extracts one half of its source:
# _x takes the low bits, _y the high bits.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

# Integer variants: the destination type is narrower than the source.
unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")
414
# Bit operations, part of ARB_gpu_shader5.


# Mirror the 32 bits of the source: bit i moves to bit 31 - i.
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Number of set bits in the source; the result is always 32-bit.
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the most significant set bit, or -1 when the source is zero.
unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Count leading zeros: the loop stops at the highest set bit, or leaves
# bit at -1 for a zero source so that the result is 32.
unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(31 - bit);
""")

# Signed find-MSB: index of the highest bit that differs from the sign, or
# -1 when no such bit exists.
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the least significant set bit, or -1 when the source is zero.
unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
474
# AMD_gcn_shader extended instructions
#
# cube_face_coord: pick the axis with the largest magnitude (ma is twice that
# component), select the two face coordinates according to the axis and its
# sign, then scale by 1 / ma and bias by 0.5.
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x * (1.0f / ma) + 0.5f;
dst.y = dst.y * (1.0f / ma) + 0.5f;
""")

# cube_face_index: face number 0..5 in the order +X, -X, +Y, -Y, +Z, -Z.
# The tests are not exclusive, so on ties the last matching test wins.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
512
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Register a per-component two-source opcode whose sources share
   in_type and whose result has out_type.
   """
   opcode(name,
          output_size=0, output_type=out_type,
          input_sizes=[0, 0], input_types=[in_type, in_type],
          is_conversion=False, algebraic_properties=alg_props,
          const_expr=const_expr)
516
def binop(name, ty, alg_props, const_expr):
   """Register a per-component two-source opcode whose sources and result
   all have type ty.
   """
   binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
519
def binop_compare(name, ty, alg_props, const_expr):
   """Register a two-source comparison on ty that produces a bool1."""
   binop_convert(name, out_type=tbool1, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
522
def binop_compare8(name, ty, alg_props, const_expr):
   """Register a two-source comparison on ty that produces a bool8."""
   binop_convert(name, out_type=tbool8, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
525
def binop_compare16(name, ty, alg_props, const_expr):
   """Register a two-source comparison on ty that produces a bool16."""
   binop_convert(name, out_type=tbool16, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
528
def binop_compare32(name, ty, alg_props, const_expr):
   """Register a two-source comparison on ty that produces a bool32."""
   binop_convert(name, out_type=tbool32, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
531
def binop_compare_all_sizes(name, ty, alg_props, const_expr):
   """Register a comparison once per boolean width: name (bool1), name8,
   name16 and name32.
   """
   registrars = (
      ("", binop_compare),
      ("8", binop_compare8),
      ("16", binop_compare16),
      ("32", binop_compare32),
   )
   for width_suffix, register in registrars:
      register(name + width_suffix, ty, alg_props, const_expr)
537
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Register a two-source opcode with explicit sizes and types for the
   output and for each source, and no algebraic properties.
   """
   sizes = [src1_size, src2_size]
   types = [src1_type, src2_type]
   opcode(name, out_size, out_type, sizes, types, False, "", const_expr)
542
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix=""):
   """Register the component-wise-then-reduce opcodes name2, name3, name4,
   name8 and name16 (each followed by suffix), all 2src_commutative.

   prereduce_expr is formatted with {src0}/{src1} for each matching pair of
   components and parenthesized, reduce_expr combines two such terms, and
   final_expr receives the parenthesized reduction as {src}.
   """
   def term(letter):
      body = prereduce_expr.format(src0="src0." + letter,
                                   src1="src1." + letter)
      return "(" + body + ")"

   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   # One pre-reduced term per component, for up to 16 components.
   terms = [term(letter) for letter in "xyzwefghijklmnop"]

   def tree(first, count):
      # Balanced reduction over terms[first:first + count]; the upper half
      # is placed on the left-hand side of each combination.
      if count == 1:
         return terms[first]
      half = count // 2
      upper = tree(first + half, half)
      lower = tree(first, half)
      return combine(upper, lower)

   for width in (2, 4, 8, 16):
      opcode(name + str(width) + suffix, output_size, output_type,
             [width, width], [src_type, src_type], False, _2src_commutative,
             finish(tree(0, width)))

   # Three components are not a power of two, so they are folded by hand.
   three = combine(combine(terms[2], terms[1]), terms[0])
   opcode(name + "3" + suffix, output_size, output_type,
          [3, 3], [src_type, src_type], False, _2src_commutative,
          finish(three))
563
def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
   """Register a reduce opcode family once per boolean width.

   The bool1 family keeps name as given; the bool8, bool16 and bool32
   families replace the first character of name with b8, b16 and b32.
   """
   stem = name[1:]
   families = (
      (name, tbool1),
      ("b8" + stem, tbool8),
      ("b16" + stem, tbool16),
      ("b32" + stem, tbool32),
   )
   for family_name, bool_type in families:
      binop_reduce(family_name, output_size, bool_type, src_type,
                   prereduce_expr, reduce_expr, final_expr)
574
# Addition and subtraction.
#
# The float variants honour round-toward-zero execution mode: 64-bit values
# go through the _mesa_double_*_rtz helpers, narrower ones are computed in
# double and then narrowed with _mesa_double_to_float_rtz.
binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
# Saturating signed add: on overflow yield (1 << (bit_size - 1)) - 1 when
# adding a positive src1, or 1 << (bit_size - 1) otherwise.
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
# Saturating unsigned add: a sum smaller than src0 means it wrapped.
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
# Saturating signed subtract, mirroring iadd_sat.
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
# Saturating unsigned subtract: clamps at 0 instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
# Absolute difference of two signed values, returned as unsigned; the
# subtraction is done on uint64_t casts of the sources.
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
# Absolute difference of two unsigned values.
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
616
# Multiplication and division.
#
# fmul follows the same round-toward-zero handling as fadd/fsub.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low bits of signed/unsigned integer multiply (the product is truncated to
# the destination's bit size)
binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""")

# Generate a 64-bit result from two 32-bit quantities
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high half of signed integer multiply (the upper bit_size bits of the
# double-width product)
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
    * potential overflow of signed multiply */
   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
}
""")

# high half of unsigned integer multiply (the upper bit_size bits of the
# double-width product)
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# unsigned multiply of the low halves of both sources: each source is
# masked to its low bit_size / 2 bits before multiplying
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")

# Integer division by zero is defined to produce 0 here rather than
# invoking C undefined behaviour.
binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")
695
# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.  The result type is uint, not bool: the C
# comparison yields 0 or 1.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments (again as a uint holding 0 or 1).

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) +      (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +                (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
718
# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)       >> 1)
#
# The half of the xor is therefore *subtracted*, unlike in hadd.  For
# example x = 3, y = 0: (3 + 0 + 1) >> 1 = 2 = (3 | 0) - ((3 ^ 0) >> 1).
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
731
# Unsigned modulus; a zero divisor yields 0.
binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
# imod: when the C remainder is non-zero and the operands differ in sign,
# src1 is added so that the result takes the sign of the divisor.
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod and frem call floorf/truncf for every bit size, unlike
# ffloor/ftrunc which switch on bit_size == 64 -- confirm this is intended.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")
748
#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)
#
# Each call registers four opcodes, one per boolean width: the name as given
# (bool1) plus the same name with 8, 16 and 32 appended.

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")
766
# integer-aware GLSL-style comparisons that compare floats and ints
#
# The three string templates are, presumably (confirm against the
# binop_reduce helpers defined earlier in this file): the per-component
# comparison of {src0} and {src1}, the expression that combines two partial
# results, and the final expression applied to the combined {src}.  The
# "all equal" forms combine with "&&", the "any not equal" forms with "||".

binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
#
# Same templates as above, except that the final expression converts the
# combined truth value to 1.0f or 0.0f.

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with tfloat while slt, seq and sne use
# tfloat32 -- confirm whether the difference is intentional.

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
792
# SPIR-V shifts are undefined for shift operands >= the bit size, but SM5
# shifts are defined to use only the least significant bits of the shift
# count.  The NIR definition follows the SM5 specification.
#
# In every expression below the shift count (src1, declared tuint32) is
# masked with "sizeof(src0) * 8 - 1", i.e. the bit width of src0 minus one.
# ishl additionally casts src0 to uint64_t before shifting.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Rotate left: the masked count shifts src0 left, and the complementary
# count (-src1 & rotate_mask) shifts it right; the two halves are ORed.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
# Rotate right: mirror image of urol with the shift directions swapped.
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")
813
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")


# Dot product: component-wise products of src0 and src1 combined with "+".
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Same templates, declared with 4 instead of 1 and the "_replicated" suffix
# (presumably a 4-component result holding the same value in every
# component -- confirm against binop_reduce).
binop_reduce("fdot", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}",
             suffix="_replicated")

# Homogeneous dot product: a 3-component src0 against a 4-component src1,
# where src1.w is added unscaled (as if src0 had w == 1).
opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum / maximum.  The float variants defer to C fmin()/fmax(); the
# integer variants select with a single "src1 > src0" comparison.
binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
843
# The *_4x8 opcodes below treat each 32-bit source as four 8-bit lanes.  The
# C loops step i through 0, 8, 16 and 24, isolate one lane of each source
# with "(src >> i) & 0xff" and OR the lane result back into dst at bit i.

# Saturated vector add for 4 8bit ints.
# Each lane sum is clamped to 0xff with MIN2.
binop("usadd_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
# Lanes where src0 is not greater than src1 are left at 0.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
# The division is an integer division on int lane values.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
888
# Power function.  64-bit operands must go through the double-precision
# pow(); narrower operands use the single-precision powf().  This matches how
# ldexp and ffma in this file choose between the double and float libm
# variants (the selection here used to be inverted, truncating doubles
# through powf()).
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
890
# Packs two 32-bit floats into one 32-bit word via pack_half_1x16(): src0
# lands in the low 16 bits, src1 in the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# Builds a 64-bit value from two 32-bit halves: src0 low, src1 high.  src1
# is widened to uint64_t before the shift.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# Builds a 32-bit value from two 16-bit halves: src0 low, src1 high.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.
# The result is a mask of "bits" ones shifted left by "offset".
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")

# ldexp: scales src0 by a power of two given by the 32-bit integer src1,
# using ldexp() for 64-bit and ldexpf() otherwise.  Any result for which
# isnormal() is false is replaced by a zero carrying the sign of src0.
# NOTE(review): C isnormal() is also false for infinities and NaN, so those
# results become signed zero as well -- confirm that this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
915
# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
# src1 selects the byte: src0 is shifted right by src1 * 8 bits and the
# result is truncated by the uint8_t (unsigned) or int8_t (signed) cast.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
# Same scheme with 16-bit words: shift by src1 * 16 bits, then cast to
# uint16_t or int16_t.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")
930
931
def triop(name, ty, alg_props, const_expr):
   """Declare a three-source opcode whose sources and result all use ty.

   All three input sizes and the output size are passed to opcode() as 0,
   and the opcode is registered as a non-conversion.
   """
   src_sizes = [0] * 3
   src_types = [ty] * 3
   opcode(name, 0, ty, src_sizes, src_types, False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Declare a three-source tuint opcode with explicit component counts.

   The output size and each source size are forwarded to opcode() as given;
   the result and all sources are typed tuint, no algebraic properties are
   set, and the opcode is registered as a non-conversion.
   """
   src_sizes = [src1_size, src2_size, src3_size]
   src_types = [tuint] * len(src_sizes)
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
938
# Fused multiply-add of src0, src1 and src2.
#
# When nir_is_rounding_mode_rtz() reports true for this execution mode and
# bit size, the _mesa_*_fma_rtz helpers are used: the double helper for
# 64-bit, the float helper for 32-bit, and for any other size the double
# helper followed by _mesa_double_to_float_rtz().  Otherwise 32-bit uses
# fmaf() and every other size uses fma().
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# Linear interpolation: yields src0 when src2 == 0 and src1 when src2 == 1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
956
# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There is one version for floating point bools
# (0.0 vs 1.0), fcsel, and one version per integer boolean type (0 vs ~0):
# bcsel, b8csel, b16csel and b32csel take a tbool1, tbool8, tbool16 and
# tbool32 condition respectively.

# Selects src1 when the float condition compares unequal to 0.0f.
triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")

# The integer-boolean selects differ only in the type of the condition
# source; the selected values and the result are tuint.
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")
973
# SM5 bfi assembly
#
# src0 is the mask, src1 the value to insert and src2 the base.  The insert
# value is shifted left once for every trailing zero bit of the mask, then
# merged into the base under the mask.  A zero mask returns the base
# unchanged; handling it up front also keeps the while loop, which stops at
# the first set bit, from running forever.
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")


# Bitwise mux: takes the bits of src1 where src0 has a 1 and the bits of
# src2 where src0 has a 0.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
991
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
#
# Both opcodes return 0 when bits == 0.  When offset + bits < 32 the field is
# first shifted up so that its top bit sits at bit 31 and then shifted back
# down by (32 - bits); otherwise the result is simply base >> offset.
# ubfe works on an unsigned base, so the upper bits of the result are zero.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
# ibfe is identical except that base is a signed int, so the right shifts
# are expected to sign-extend the extracted field.
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
1019
# GLSL bitfieldExtract()
#
# Unsigned variant: offset and bits are signed 32-bit sources.  A zero width
# yields 0, and so does any negative or out-of-range offset/bits combination
# (offset + bits > 32).  Otherwise the field is shifted down to bit 0 and
# masked; the mask is built from 1ull, presumably so that bits == 32 does
# not overflow the shift.
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed GLSL bitfieldExtract(): extracts "bits" bits of base starting at
# "offset" and sign-extends them.  A zero width yields 0, as does any
# negative or out-of-range offset/bits combination.
#
# The field is first shifted left so that its top bit lands on bit 31, then
# shifted right by (32 - bits) so that it ends at bit 0 with its sign
# replicated above it.  Shifting right by "offset" instead would leave the
# field misplaced (e.g. base = 0xF0, offset = 4, bits = 4 must give -1).
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
1045
# Combines the first component of each input to make a 3-component vector.
# Declared through triop_horiz with an output size of 3 and three sources of
# size 1.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
1053
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit component counts.

   The output size and each source size are forwarded to opcode() as given;
   the result and all sources are typed tuint, no algebraic properties are
   set, and the opcode is registered as a non-conversion.
   """
   src_sizes = [src1_size, src2_size, src3_size, src4_size]
   src_types = [tuint] * len(src_sizes)
   opcode(name, output_size, tuint, src_sizes, src_types, False, "",
          const_expr)
1060
# Inserts the low "bits" bits of src1 into src0 at bit position "offset"
# (src2 = offset, src3 = bits, both signed 32-bit).  A zero width returns
# the base unchanged; a negative or out-of-range offset/bits combination
# (bits + offset > 32) yields 0.  The mask is built from 1ull, presumably so
# that bits == 32 does not overflow the shift.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")
1074
# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

# 8-component version: eight single-component tuint sources.  Components
# past w are addressed as e, f, g and h.
opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

# 16-component version: sixteen single-component tuint sources.  Components
# past w are addressed as e through p.
opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")
1115
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e. from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")

# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
#
# The expression multiplies the high 16 bits of src0 by the low 16 bits of
# src1, shifts the product left by 16 and adds src2.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")

# The 24-bit multiplies below narrow each factor with "<< 8" followed by
# ">> 8": on an int32_t cast this is expected to sign-extend the low 24
# bits, on a uint32_t cast it zero-extends them.

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
# NOTE(review): declared with tint32 although the expression is unsigned and
# umad24 above uses tuint32 -- confirm whether this is intentional.
binop("umul24", tint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")

# Float classification predicates: each converts a float source to a tbool1
# result using the C isnormal() / isfinite() macros.
unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
1156