#
# Copyright (C) 2014 Connor Abbott
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)

import re

# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_sizes[0], int)
      assert isinstance(input_types, list)
      assert isinstance(input_types[0], str)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 5 or (output_size == 8) or (output_size == 16)
      for size in input_sizes:
         assert 0 <= size <= 5 or (size == 8) or (size == 16)
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr

# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def type_has_size(type_):
   m = _TYPE_SPLIT_RE.match(type_)
   assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
   return m.group('bits') is not None

def type_size(type_):
   m = _TYPE_SPLIT_RE.match(type_)
   assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
   assert m.group('bits') is not None, \
      'NIR type string has no bit size: "{}"'.format(type_)
   return int(m.group('bits'))

def type_sizes(type_):
   if type_has_size(type_):
      return [type_size(type_)]
   elif type_ == 'bool':
      return [1, 8, 16, 32]
   elif type_ == 'float':
      return [16, 32, 64]
   else:
      return [1, 8, 16, 32, 64]

def type_base_type(type_):
   m = _TYPE_SPLIT_RE.match(type_)
   assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
   return m.group('type')
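
# Illustrative self-check of the type helpers above (editorial addition; these
# asserts are cheap and assumed acceptable to evaluate at import time):
assert type_has_size(tfloat32) and not type_has_size(tfloat)
assert type_size(tuint16) == 16
assert type_sizes(tbool) == [1, 8, 16, 32]
assert type_base_type(tuint64) == "uint"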

# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = "2src_commutative "
associative = "associative "
selection = "selection "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
   assert name not in opcodes
   opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                          input_types, is_conversion, algebraic_properties,
                          const_expr)

def unop_convert(name, out_type, in_type, const_expr):
   opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)

def unop(name, ty, const_expr):
   opcode(name, 0, ty, [0], [ty], False, "", const_expr)

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   opcode(name, output_size, output_type, [input_size], [input_type],
          False, "", const_expr)

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   def prereduce(src):
      return "(" + prereduce_expr.format(src=src) + ")"
   def final(src):
      return final_expr.format(src="(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   src0 = prereduce("src0.x")
   src1 = prereduce("src0.y")
   src2 = prereduce("src0.z")
   src3 = prereduce("src0.w")
   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              final(reduce_(src0, src1)))
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              final(reduce_(reduce_(src0, src1), src2)))
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))

def unop_numeric_convert(name, out_type, in_type, const_expr):
   opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)

unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer

# nir_op_fsign roughly implements the OpenGL / Vulkan rules for sign(float).
# The GLSL.std.450 FSign instruction is defined as:
#
#    Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.
#
# If the source is equal to zero, there is a preference for the result to have
# the same sign, but this is not required (it is required by OpenCL).  If the
# source is not a number, there is a preference for the result to be +0.0, but
# this is not required (it is required by OpenCL).  If the source is not a
# number, and the result is not +0.0, the result should definitely **not** be
# NaN.
#
# The values returned for constant folding match the behavior required by
# OpenCL.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "(isnan(src0) ? 0.0  : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0  : -1.0 )) : " +
                       "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat, tbool]:
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat, tbool]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
         if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
               if rnd_mode == '_rtne':
                  conv_expr = """
if (bit_size > 16) {
   dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
} else {
   dst = src0;
}
"""
               elif rnd_mode == '_rtz':
                  conv_expr = """
if (bit_size > 16) {
   dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
} else {
   dst = src0;
}
"""
               else:
                  conv_expr = "src0"

               unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                           dst_t[0],
                                                           dst_bit_size,
                                                           rnd_mode),
                                    dst_t + str(dst_bit_size),
                                    src_t, conv_expr)
         elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
            conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
   dst = _mesa_double_to_float_rtz(src0);
} else {
   dst = src0;
}
"""
            unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                     dst_bit_size),
                                 dst_t + str(dst_bit_size), src_t, conv_expr)
         else:
            conv_expr = "src0 != 0" if dst_t == tbool else "src0"
            unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                     dst_bit_size),
                                 dst_t + str(dst_bit_size), src_t, conv_expr)

# Special opcode that is the same as f2f16, i2i16, u2u16 except that it is safe
# to remove it if the result is immediately converted back to 32 bits again.
# This is generated as part of the precision lowering pass. mp stands for medium
# precision.
unop_numeric_convert("f2fmp", tfloat16, tfloat32, opcodes["f2f16"].const_expr)
unop_numeric_convert("i2imp", tint16, tint32, opcodes["i2i16"].const_expr)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert("f2imp", tint16, tfloat32, opcodes["f2i16"].const_expr)
unop_numeric_convert("f2ump", tuint16, tfloat32, opcodes["f2u16"].const_expr)
unop_numeric_convert("i2fmp", tfloat16, tint32, opcodes["i2f16"].const_expr)
unop_numeric_convert("u2fmp", tfloat16, tuint32, opcodes["u2f16"].const_expr)

# Unary floating-point rounding operations.

unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.

unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
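
# Illustrative spot check of a few conversion opcodes registered by the loop
# above (editorial addition; names follow "<src>2<dst><bits>[<round>]"):
assert "f2f16_rtne" in opcodes and "f2f16_rtz" in opcodes and "f2f16" in opcodes
assert "i2b1" in opcodes and "u2f64" in opcodes and "f2u32" in opcodes
assert opcodes["f2fmp"].is_conversion and opcodes["i2imp"].is_conversion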

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.

unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")

# Floating point pack and unpack operations.

def pack_2x16(fmt):
   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))

pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

# Convert two unsigned integers into a packed unsigned short (clamp is applied).
unop_horiz("pack_uint_2x16", 1, tuint32, 2, tuint32, """
dst.x = _mesa_unsigned_to_unsigned(src0.x, 16);
dst.x |= _mesa_unsigned_to_unsigned(src0.y, 16) << 16;
""")
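
# Illustrative spot check of the pack/unpack opcodes generated above
# (editorial addition; cheap import-time asserts):
assert "pack_unorm_4x8" in opcodes and "unpack_half_2x16" in opcodes
assert opcodes["pack_half_2x16"].input_sizes == [2]
assert opcodes["unpack_snorm_4x8"].output_size == 4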

# Convert two signed integers into a packed signed short (clamp is applied).
unop_horiz("pack_sint_2x16", 1, tint32, 2, tint32, """
dst.x = _mesa_signed_to_signed(src0.x, 16) & 0xffff;
dst.x |= _mesa_signed_to_signed(src0.y, 16) << 16;
""")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")

unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")

# Lowered floating point unpacking operations.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.

unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop_convert("ufind_msb_rev", tint32, tuint, """
dst = -1;
for (int bit = 0; bit < bit_size; bit++) {
   if ((src0 << bit) & 0x80000000) {
      dst = bit;
      break;
   }
}
""")

unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(bit_size - bit - 1);
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
       (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop_convert("ifind_msb_rev", tint32, tint, """
dst = -1;
/* We are looking for the highest bit that's not the same as the sign bit. */
uint32_t sign = src0 & 0x80000000u;
for (int bit = 0; bit < 32; bit++) {
   if (((src0 << bit) & 0x80000000u) != sign) {
      dst = bit;
      break;
   }
}
""")

unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# AMD_gcn_shader extended instructions
unop_horiz("cube_face_coord_amd", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x * (1.0f / ma) + 0.5f;
dst.y = dst.y * (1.0f / ma) + 0.5f;
""")

unop_horiz("cube_face_index_amd", 1, tfloat32, 3, tfloat32, """
dst.x = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Sum of vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")

def binop_convert(name, out_type, in_type, alg_props, const_expr):
   opcode(name, 0, out_type, [0, 0], [in_type, in_type],
          False, alg_props, const_expr)

def binop(name, ty, alg_props, const_expr):
   binop_convert(name, ty, ty, alg_props, const_expr)

def binop_compare(name, ty, alg_props, const_expr):
   binop_convert(name, tbool1, ty, alg_props, const_expr)

def binop_compare8(name, ty, alg_props, const_expr):
   binop_convert(name, tbool8, ty, alg_props, const_expr)

def binop_compare16(name, ty, alg_props, const_expr):
   binop_convert(name, tbool16, ty, alg_props, const_expr)

def binop_compare32(name, ty, alg_props, const_expr):
   binop_convert(name, tbool32, ty, alg_props, const_expr)

def binop_compare_all_sizes(name, ty, alg_props, const_expr):
   binop_compare(name, ty, alg_props, const_expr)
   binop_compare8(name + "8", ty, alg_props, const_expr)
   binop_compare16(name + "16", ty, alg_props, const_expr)
   binop_compare32(name + "32", ty, alg_props, const_expr)

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
          False, "", const_expr)

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix=""):
   def final(src):
      return final_expr.format(src= "(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   def prereduce(src0, src1):
      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
   srcs = [prereduce("src0." + letter, "src1." + letter) for letter in "xyzwefghijklmnop"]
   def pairwise_reduce(start, size):
      if (size == 1):
         return srcs[start]
      return reduce_(pairwise_reduce(start + size // 2, size // 2), pairwise_reduce(start, size // 2))
   for size in [2, 4, 8, 16]:
      opcode(name + str(size) + suffix, output_size, output_type,
             [size, size], [src_type, src_type], False, _2src_commutative,
             final(pairwise_reduce(0, size)))
   opcode(name + "3" + suffix, output_size, output_type,
          [3, 3], [src_type, src_type], False, _2src_commutative,
          final(reduce_(reduce_(srcs[2], srcs[1]), srcs[0])))
   opcode(name + "5" + suffix, output_size, output_type,
          [5, 5], [src_type, src_type], False, _2src_commutative,
          final(reduce_(srcs[4], reduce_(reduce_(srcs[3], srcs[2]), reduce_(srcs[1], srcs[0])))))

def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
   binop_reduce(name, output_size, tbool1, src_type,
                prereduce_expr, reduce_expr, final_expr)
   binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                prereduce_expr, reduce_expr, final_expr)
   binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                prereduce_expr, reduce_expr, final_expr)
   binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                prereduce_expr, reduce_expr, final_expr)

binop("fadd", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
binop("iadd_sat", tint, _2src_commutative, """
   src1 > 0 ?
      (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
      (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
   src1 < 0 ?
      (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
      (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
binop_convert("uabs_isub", tuint, tint, "", """
   src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
               : (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")

binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")

# Unlike fmul, anything (even infinity or NaN) multiplied by zero is always zero.
# fmulz(0.0, inf) and fmulz(0.0, nan) must be +/-0.0, even if
# SIGNED_ZERO_INF_NAN_PRESERVE is not used. If SIGNED_ZERO_INF_NAN_PRESERVE is used, then
# the result must be a positive zero if either operand is zero.
binop("fmulz", tfloat32, _2src_commutative + associative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
else
   dst = src0 * src1;
""")

# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""")

# Generate a 64-bit result from two 32-bit quantities
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
    * potential overflow of signed multiply */
   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns an integer (1 or 0) representing the carry resulting from the
# addition of the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns an integer (1 or 0) representing the borrow resulting from the
# subtraction of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       = (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 * (x & y) + (x & ~y) + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) - ((x ^ y) >> 1)
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
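
# Spot check of the hadd/rhadd identities derived above (editorial addition;
# plain Python integers, so the "without overflow" property holds trivially):
_x, _y = 0xffffffff, 0xfffffffd
assert (_x & _y) + ((_x ^ _y) >> 1) == (_x + _y) >> 1       # ihadd / uhadd
assert (_x | _y) - ((_x ^ _y) >> 1) == (_x + _y + 1) >> 1   # irhadd / urhadd
del _x, _y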

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#

# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce_all_sizes("ball_fequal", 1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal", 1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
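
# Illustrative spot check of the sized comparison variants generated above
# (editorial addition; cheap import-time asserts):
assert "flt" in opcodes and "flt32" in opcodes and "ieq16" in opcodes
assert opcodes["flt"].output_type == tbool1
assert opcodes["uge32"].output_type == tbool32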

# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use only the least significant bits.
# The NIR definition is according to the SM5 specification.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.

binop("iand", tuint, _2src_commutative + associative, "src0 & src1")
binop("ior", tuint, _2src_commutative + associative, "src0 | src1")
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")

binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot", 0, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}",
             suffix="_replicated")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 0, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

binop("fmin", tfloat, _2src_commutative + associative, "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, _2src_commutative + associative, "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")

binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
       False, "",
       "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")
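
# Worked example of the mask that bfm (above) constructs: bits = 5 and
# offset = 4 yield a five-bit mask starting at bit 4 (editorial addition):
assert ((1 << 5) - 1) << 4 == 0x1f0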

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")

# Byte/word insertion
binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")

def triop(name, ty, alg_props, const_expr):
   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size],
          [tuint, tuint, tuint], False, "", const_expr)

triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# Unlike ffma, anything (even infinity or NaN) multiplied by zero is always zero.
# ffmaz(0.0, inf, src2) and ffmaz(0.0, nan, src2) must be +/-0.0 + src2, even if
# SIGNED_ZERO_INF_NAN_PRESERVE is not used. If SIGNED_ZERO_INF_NAN_PRESERVE is used, then
# the result must be a positive zero plus src2 if either src0 or src1 is zero.
triop("ffmaz", tfloat32, _2src_commutative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0 + src2;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_float_fma_rtz(src0, src1, src2);
else
   dst = fmaf(src0, src1, src2);
""")

triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

# Ternary addition
triop("iadd3", tint, _2src_commutative + associative, "src0 + src1 + src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).

triop("fcsel", tfloat32, selection, "(src0 != 0.0f) ? src1 : src2")

opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, selection, "src0 ? src1 : src2")
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, selection, "src0 ? src1 : src2")
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, selection, "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2")

triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2")
triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2")

triop("fcsel_gt", tfloat32, selection, "(src0 > 0.0f) ? src1 : src2")
triop("fcsel_ge", tfloat32, selection, "(src0 >= 0.0f) ? src1 : src2")
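
# Illustrative spot check of the conditional-select family defined above
# (editorial addition; cheap import-time asserts):
assert "b8csel" in opcodes and "fcsel_gt" in opcodes
assert opcodes["b32csel"].input_types == [tbool32, tuint, tuint]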

# SM5 bfi assembly
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")

# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Sum of absolute differences with accumulation.
# (Equivalent to AMD's v_sad_u8 instruction.)
# The first two sources contain packed 8-bit unsigned integers; the instruction
# calculates the absolute differences of these and then adds them together.
# There is also a third source, a 32-bit unsigned integer, which is added to the result.
triop_horiz("sad_u8x4", 1, 1, 1, 1, """
uint8_t s0_b0 = (src0.x & 0x000000ff) >> 0;
uint8_t s0_b1 = (src0.x & 0x0000ff00) >> 8;
uint8_t s0_b2 = (src0.x & 0x00ff0000) >> 16;
uint8_t s0_b3 = (src0.x & 0xff000000) >> 24;

uint8_t s1_b0 = (src1.x & 0x000000ff) >> 0;
uint8_t s1_b1 = (src1.x & 0x0000ff00) >> 8;
uint8_t s1_b2 = (src1.x & 0x00ff0000) >> 16;
uint8_t s1_b3 = (src1.x & 0xff000000) >> 24;

dst.x = src2.x +
        (s0_b0 > s1_b0 ? (s0_b0 - s1_b0) : (s1_b0 - s0_b0)) +
        (s0_b1 > s1_b1 ? (s0_b1 - s1_b1) : (s1_b1 - s0_b1)) +
        (s0_b2 > s1_b2 ? (s0_b2 - s1_b2) : (s1_b2 - s0_b2)) +
        (s0_b3 > s1_b3 ? (s0_b3 - s1_b3) : (s1_b3 - s0_b3));
""")
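
# Worked example of the sad_u8x4 semantics above, modelled with plain Python
# integers (editorial addition): two packed u8x4 words and an accumulator.
_a, _b, _acc = 0x01020304, 0x04030201, 10
assert sum(abs(((_a >> s) & 0xff) - ((_b >> s) & 0xff)) for s in (0, 8, 16, 24)) + _acc == 18
del _a, _b, _acc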

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size, src4_size],
          [tuint, tuint, tuint, tuint],
          False, "", const_expr)

opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

opcode("vec5", 5, tuint,
       [1] * 5, [tuint] * 5,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
""")

opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")

# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")

# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# r600-specific instruction that evaluates unnormalized cube texture coordinates
# and face index
# The actual texture coordinates are evaluated from this according to
#    dst.yx / abs(dst.z) + 1.5
unop_horiz("cube_r600", 4, tfloat32, 3, tfloat32, """
   dst.x = dst.y = dst.z = 0.0;
   float absX = fabsf(src0.x);
   float absY = fabsf(src0.y);
   float absZ = fabsf(src0.z);

   if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
   if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
   if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }

   if (src0.x >= 0 && absX >= absY && absX >= absZ) {
      dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
   }
   if (src0.x < 0 && absX >= absY && absX >= absZ) {
      dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
   }
   if (src0.y >= 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = src0.z; dst.w = 2;
   }
   if (src0.y < 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
   }
   if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
      dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
   }
   if (src0.z < 0 && absZ >= absX && absZ >= absY) {
      dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
   }
""")

# r600/gcn specific sin and cos
# these trigonometric functions need some lowering because the supported
# input values are expected to be normalized by dividing by (2 * pi)
unop("fsin_amd", tfloat, "sinf(6.2831853 * src0)")
unop("fcos_amd", tfloat, "cosf(6.2831853 * src0)")

# AGX specific sin with input expressed in quadrants. Used in the lowering for
# fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
# the angle is further decomposed by quadrant, sinc is computed, and the angle
# is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
# additional ALU that NIR may be able to optimize.
unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")

# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
binop("umul24", tint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")

# relaxed versions of the above, which assume input is in the 24bit range (no clamping)
binop("imul24_relaxed", tint32, _2src_commutative + associative, "src0 * src1")
triop("umad24_relaxed", tuint32, _2src_commutative, "src0 * src1 + src2")
binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")
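
# Worked example of the "(x << 8) >> 8" 24-bit sign extension used by imul24
# and umad24 above, modelled with plain Python integers (editorial addition):
# 0x00ffffff is -1 in 24-bit two's complement.
_v = 0x00ffffff << 8
_sext = _v - (1 << 32) if _v & 0x80000000 else _v
assert _sext >> 8 == -1
del _v, _sext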

unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
unop_convert("fisfinite32", tbool32, tfloat, "isfinite(src0)")

# vc4-specific opcodes

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8_vc4", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# Mali-specific opcodes
unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))

# Magnitude equal to fddx/y, sign undefined. Derivative of a constant is zero.
unop("fddx_must_abs_mali", tfloat, "0.0")
unop("fddy_must_abs_mali", tfloat, "0.0")

# DXIL specific double [un]pack
# DXIL doesn't support generic [un]pack instructions, so we want those
# lowered to bit ops. HLSL doesn't support 64bit bitcasts to/from
# double, only [un]pack. Technically DXIL does, but considering they
# can't be generated from HLSL, we want to match what would be coming from DXC.
# This is essentially just the standard [un]pack, except that it doesn't get
# lowered so we can handle it in the backend and turn it into MakeDouble/SplitDouble
unop_horiz("pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
unop_horiz("unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# src0 and src1 are i8vec4 packed in an int32, and src2 is an int32.  The int8
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors.  src2 is added to the result of the dot-product.
opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const int32_t v1x = (int8_t)(src1      );
   const int32_t v1y = (int8_t)(src1 >>  8);
   const int32_t v1z = (int8_t)(src1 >> 16);
   const int32_t v1w = (int8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")
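
# Worked example of the packed signed 8-bit dot product defined just above,
# modelled with plain Python byte extraction and sign extension (editorial
# addition; the names below are purely local to this sketch):
def _s8(word, shift):
   byte = (word >> shift) & 0xff
   return byte - 0x100 if byte & 0x80 else byte
_a, _b = 0x00000002, 0x000000ff          # <2, 0, 0, 0> dot <-1, 0, 0, 0>
assert sum(_s8(_a, s) * _s8(_b, s) for s in (0, 8, 16, 24)) + 5 == 3
del _s8, _a, _b
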
# Like sdot_4x8_iadd, but unsigned.
opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint32_t v0x = (uint8_t)(src0      );
   const uint32_t v0y = (uint8_t)(src0 >>  8);
   const uint32_t v0z = (uint8_t)(src0 >> 16);
   const uint32_t v0w = (uint8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")

# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and
# src2 is an int32.  The 8-bit components are extended to 32-bits, and a
# dot-product is performed on the resulting vectors.  src2 is added to the
# result of the dot-product.
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative.
opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, "", """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")

# Like sdot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const int64_t v1x = (int8_t)(src1      );
   const int64_t v1y = (int8_t)(src1 >>  8);
   const int64_t v1z = (int8_t)(src1 >> 16);
   const int64_t v1w = (int8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")

# Like udot_4x8_uadd, but the result is clamped to the range [0, 0xffffffff].
opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint8_t)(src0      );
   const uint64_t v0y = (uint8_t)(src0 >>  8);
   const uint64_t v0z = (uint8_t)(src0 >> 16);
   const uint64_t v0w = (uint8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")

# Like sudot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative.
opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, "", """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")

# src0 and src1 are i16vec2 packed in an int32, and src2 is an int32.  The int16
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors.  src2 is added to the result of the dot-product.
opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int32_t v0x = (int16_t)(src0      );
   const int32_t v0y = (int16_t)(src0 >> 16);
   const int32_t v1x = (int16_t)(src1      );
   const int32_t v1y = (int16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""")

# Like sdot_2x16_iadd, but unsigned.
opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint32_t v0x = (uint16_t)(src0      );
   const uint32_t v0y = (uint16_t)(src0 >> 16);
   const uint32_t v1x = (uint16_t)(src1      );
   const uint32_t v1y = (uint16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""")

# Like sdot_2x16_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int64_t v0x = (int16_t)(src0      );
   const int64_t v0y = (int16_t)(src0 >> 16);
   const int64_t v1x = (int16_t)(src1      );
   const int64_t v1y = (int16_t)(src1 >> 16);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")

# Like udot_2x16_uadd, but the result is clamped to the range [0, 0xffffffff].
opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint16_t)(src0      );
   const uint64_t v0y = (uint16_t)(src0 >> 16);
   const uint64_t v1x = (uint16_t)(src1      );
   const uint64_t v1y = (uint16_t)(src1 >> 16);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")