#
# Copyright (C) 2014 Connor Abbott
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)

import re


# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr,
                 description):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.
        - Optional description of the opcode for documentation.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        # Validate every entry, not only the first one, so a bad trailing
        # element is reported here instead of slipping into the table.
        assert all(isinstance(size, int) for size in input_sizes)
        assert isinstance(input_types, list)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 5 or (output_size == 8) or (output_size == 16)
        for size in input_sizes:
            assert 0 <= size <= 5 or (size == 8) or (size == 16)
            # An opcode with a fixed output size may not take a size-0 input.
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr
        self.description = description


# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('bits') is not None


def type_size(type_):
    """Return the bit size encoded in a sized NIR type string as an int."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    assert m.group('bits') is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(m.group('bits'))


def type_sizes(type_):
    """Return the list of bit sizes a NIR type string can take.

    A sized type yields a one-element list; the unsized 'bool' and 'float'
    types have their own lists and every other unsized type gets
    [1, 8, 16, 32, 64].
    """
    if type_has_size(type_):
        return [type_size(type_)]
    elif type_ == 'bool':
        return [1, 8, 16, 32]
    elif type_ == 'float':
        return [16, 32, 64]
    else:
        return [1, 8, 16, 32, 64]


def type_base_type(type_):
    """Return the base type ('int', 'uint', 'float' or 'bool') of a type string."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('type')


# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = "2src_commutative "
associative = "associative "
selection = "selection "
derivative = "derivative "

# global dictionary of opcodes
opcodes = {}


def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr, description = ""):
    """Create an Opcode and register it in the global ``opcodes`` dictionary.

    Asserts that no opcode with the same name has been registered before.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, is_conversion, algebraic_properties,
                           const_expr, description)


def unop_convert(name, out_type, in_type, const_expr, description = ""):
    """Register a one-source opcode (sizes 0) whose output type may differ
    from its input type; it is not flagged as a conversion."""
    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr, description)


def unop(name, ty, const_expr, description = "", algebraic_properties = ""):
    """Register a one-source opcode (sizes 0) whose input and output share ``ty``."""
    opcode(name, 0, ty, [0], [ty], False, algebraic_properties, const_expr,
           description)


def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr, description = ""):
    """Register a one-source opcode with explicit output and input sizes."""
    opcode(name, output_size, output_type, [input_size], [input_type],
           False, "", const_expr, description)


def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr, description = ""):
    """Register ``name2``, ``name3`` and ``name4`` reduction opcodes.

    ``prereduce_expr`` (formatted with ``src``) is applied to each component,
    ``reduce_expr`` (formatted with ``src0``/``src1``) combines two partial
    results and ``final_expr`` (formatted with ``src``) wraps the result.
    """
    def prereduce(src):
        return "(" + prereduce_expr.format(src=src) + ")"
    def final(src):
        return final_expr.format(src="(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)), description)
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)), description)
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))),
               description)


def unop_numeric_convert(name, out_type, in_type, const_expr, description = ""):
    """Register a one-source opcode (sizes 0) flagged as a type conversion."""
    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr, description)


unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0", description = "Invert every bit of the integer")

unop("fsign", tfloat, ("bit_size == 64 ? " +
     "(isnan(src0) ? 0.0 : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0 : -1.0 )) : " +
     "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"),
     description = """
Roughly implements the OpenGL / Vulkan rules for ``sign(float)``.
The ``GLSL.std.450 FSign`` instruction is defined as:

    Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.

If the source is equal to zero, there is a preference for the result to have
the same sign, but this is not required (it is required by OpenCL).  If the
source is not a number, there is a preference for the result to be +0.0, but
this is not required (it is required by OpenCL).  If the source is not a
number, and the result is not +0.0, the result should definitely **not** be
NaN.

The values returned for constant folding match the behavior required by
OpenCL.
""")

unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat, tbool]:
    if src_t == tbool:
        dst_types = [tfloat, tint, tbool]
    elif src_t == tint:
        dst_types = [tfloat, tint]
    elif src_t == tuint:
        dst_types = [tfloat, tuint]
    elif src_t == tfloat:
        dst_types = [tint, tuint, tfloat]

    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                # float -> float16 comes in three flavours: explicit
                # round-to-nearest-even, explicit round-toward-zero, and one
                # that follows the execution mode.
                rnd_modes = ['_rtne', '_rtz', '']
                for rnd_mode in rnd_modes:
                    if rnd_mode == '_rtne':
                        conv_expr = """
                        if (bit_size > 32) {
                           dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                        } else if (bit_size > 16) {
                           dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                        } else {
                           dst = src0;
                        }
                        """
                    elif rnd_mode == '_rtz':
                        conv_expr = """
                        if (bit_size > 32) {
                           dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                        } else if (bit_size > 16) {
                           dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                        } else {
                           dst = src0;
                        }
                        """
                    else:
                        conv_expr = """
                        if (bit_size > 32) {
                           if (nir_is_rounding_mode_rtz(execution_mode, 16))
                              dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                           else
                              dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                        } else if (bit_size > 16) {
                           if (nir_is_rounding_mode_rtz(execution_mode, 16))
                              dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                           else
                              dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                        } else {
                           dst = src0;
                        }
                        """

                    unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                                dst_t[0],
                                                                dst_bit_size,
                                                                rnd_mode),
                                         dst_t + str(dst_bit_size),
                                         src_t, conv_expr)
            elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
                conv_expr = """
                if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
                   dst = _mesa_double_to_float_rtz(src0);
                } else {
                   dst = src0;
                }
                """
                unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                         dst_bit_size),
                                     dst_t + str(dst_bit_size), src_t, conv_expr)
            else:
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                         dst_bit_size),
                                     dst_t + str(dst_bit_size), src_t, conv_expr)


def unop_numeric_convert_mp(base, src_t, dst_t):
    """Register ``<base>mp``, reusing the constant expression of ``<base>16``.

    NOTE: the parameter names are misleading.  ``src_t`` is forwarded as the
    *output* type of the new opcode and ``dst_t`` as its *input* type, which
    is how the calls below use them (e.g. "f2f" produces float16 from
    float32).  The names are kept for compatibility.
    """
    op_like = base + "16"
    unop_numeric_convert(base + "mp", src_t, dst_t, opcodes[op_like].const_expr,
                         description = """
Special opcode that is the same as :nir:alu-op:`{}` except that it is safe to
remove it if the result is immediately converted back to 32 bits again. This is
generated as part of the precision lowering pass. ``mp`` stands for medium
precision.
""".format(op_like))


unop_numeric_convert_mp("f2f", tfloat16, tfloat32)
unop_numeric_convert_mp("i2i", tint16, tint32)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert_mp("f2i", tint16, tfloat32)
unop_numeric_convert_mp("f2u", tuint16, tfloat32)
unop_numeric_convert_mp("i2f", tfloat16, tint32)
unop_numeric_convert_mp("u2f", tfloat16, tuint32)

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.
deriv_template = """
Calculate the screen-space partial derivative using {} derivatives of the input
with respect to the {}-axis. The constant folding is trivial as the derivative
of a constant is 0 if the constant is not Inf or NaN.
"""

for mode, suffix in [("either fine or coarse", ""), ("fine", "_fine"), ("coarse", "_coarse")]:
    for axis in ["x", "y"]:
        unop(f"fdd{axis}{suffix}", tfloat, "isfinite(src0) ? 0.0 : NAN",
             algebraic_properties = derivative,
             description = deriv_template.format(mode, axis.upper()))

# Floating point pack and unpack operations.

def pack_2x16(fmt, in_type):
    """Register ``pack_<fmt>_2x16``: two ``in_type`` components packed into
    the low and high 16 bits of one uint32."""
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, in_type, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))


def pack_4x8(fmt):
    """Register ``pack_<fmt>_4x8``: four float32 components packed into the
    four bytes of one uint32, x in the lowest byte."""
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))


def unpack_2x16(fmt):
    """Register ``unpack_<fmt>_2x16``: one uint32 split into two float32
    components, x from the low 16 bits and y from the high 16 bits."""
    # The high half must be shifted *down* before the uint16_t cast; shifting
    # left by 16 and truncating to 16 bits always produced 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))


def unpack_4x8(fmt):
    """Register ``unpack_<fmt>_4x8``: one uint32 split into four float32
    components, x from the lowest byte."""
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm", tfloat)
pack_4x8("snorm")
pack_2x16("unorm", tfloat)
pack_4x8("unorm")
pack_2x16("half", tfloat32)
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uint_2x16", 1, tuint32, 2, tuint32, """
dst.x = _mesa_unsigned_to_unsigned(src0.x, 16);
dst.x |= _mesa_unsigned_to_unsigned(src0.y, 16) << 16;
""", description = """
Convert two unsigned integers into a packed unsigned short (clamp is applied).
""")

unop_horiz("pack_sint_2x16", 1, tint32, 2, tint32, """
dst.x = _mesa_signed_to_signed(src0.x, 16) & 0xffff;
dst.x |= _mesa_signed_to_signed(src0.y, 16) << 16;
""", description = """
Convert two signed integers into a packed signed short (clamp is applied).
""")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")

# Same high-half fix as unpack_2x16: use >> 16, matching the *_split_y
# opcodes, so dst.y is read from the upper 16 bits instead of always being 0.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")

# Lowered floating point unpacking operations.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop_convert("ufind_msb_rev", tint32, tuint, """
dst = -1;
for (int bit = 0; bit < bit_size; bit++) {
   if ((src0 << bit) & 0x80000000) {
      dst = bit;
      break;
   }
}
""")

unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(bit_size - bit - 1);
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb_rev", tint32, """
dst = -1;
/* We are looking for the highest bit that's not the same as the sign bit. */
uint32_t sign = src0 & 0x80000000u;
for (int bit = 0; bit < 32; bit++) {
   if (((src0 << bit) & 0x80000000u) != sign) {
      dst = bit;
      break;
   }
}
""")

unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}",
            description = "Sum of vector components")


def binop_convert(name, out_type, in_type1, alg_props, const_expr, description="", in_type2=None):
    """Register a two-source opcode (sizes 0) with a separate output type.

    ``in_type2`` defaults to ``in_type1`` when it is None.
    """
    if in_type2 is None:
        in_type2 = in_type1
    opcode(name, 0, out_type, [0, 0], [in_type1, in_type2],
           False, alg_props, const_expr, description)


def binop(name, ty, alg_props, const_expr, description = ""):
    """Register a two-source opcode whose sources and output share ``ty``."""
    binop_convert(name, ty, ty, alg_props, const_expr, description)


def binop_compare(name, ty, alg_props, const_expr, description = "", ty2=None):
    """Register a two-source opcode with a bool1 output."""
    binop_convert(name, tbool1, ty, alg_props, const_expr, description, ty2)


def binop_compare8(name, ty, alg_props, const_expr, description = "", ty2=None):
    """Register a two-source opcode with a bool8 output."""
    binop_convert(name, tbool8, ty, alg_props, const_expr, description, ty2)


def binop_compare16(name, ty, alg_props, const_expr, description = "", ty2=None):
    """Register a two-source opcode with a bool16 output."""
    binop_convert(name, tbool16, ty, alg_props, const_expr, description, ty2)


def binop_compare32(name, ty, alg_props, const_expr, description = "", ty2=None):
    """Register a two-source opcode with a bool32 output."""
    binop_convert(name, tbool32, ty, alg_props, const_expr, description, ty2)


def binop_compare_all_sizes(name, ty, alg_props, const_expr, description = "", ty2=None):
    """Register ``name``, ``name8``, ``name16`` and ``name32`` with bool1,
    bool8, bool16 and bool32 outputs respectively."""
    binop_compare(name, ty, alg_props, const_expr, description, ty2)
    binop_compare8(name + "8", ty, alg_props, const_expr, description, ty2)
    binop_compare16(name + "16", ty, alg_props, const_expr, description, ty2)
    binop_compare32(name + "32", ty, alg_props, const_expr, description, ty2)


def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr, description = ""):
    """Register a two-source opcode with explicit sizes for output and sources."""
    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
           False, "", const_expr, description)


def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix="", description = ""):
    """Register two-source reduction opcodes for sizes 2, 3, 4, 5, 8 and 16.

    Each opcode is named ``name + <size> + suffix`` and is marked
    2src_commutative.  ``prereduce_expr`` (formatted with ``src0``/``src1``)
    combines matching components of the two sources, ``reduce_expr``
    combines two partial results and ``final_expr`` (formatted with ``src``)
    wraps the reduced value.
    """
    def final(src):
        return final_expr.format(src= "(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    def prereduce(src0, src1):
        return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
    srcs = [prereduce("src0." + letter, "src1." + letter) for letter in "xyzwefghijklmnop"]
    def pairwise_reduce(start, size):
        # Reduce the upper half first, then the lower half.
        if size == 1:
            return srcs[start]
        return reduce_(pairwise_reduce(start + size // 2, size // 2), pairwise_reduce(start, size // 2))
    for size in [2, 4, 8, 16]:
        opcode(name + str(size) + suffix, output_size, output_type,
               [size, size], [src_type, src_type], False, _2src_commutative,
               final(pairwise_reduce(0, size)), description)
    opcode(name + "3" + suffix, output_size, output_type,
           [3, 3], [src_type, src_type], False, _2src_commutative,
           final(reduce_(reduce_(srcs[2], srcs[1]), srcs[0])), description)
    opcode(name + "5" + suffix, output_size, output_type,
           [5, 5], [src_type, src_type], False, _2src_commutative,
           final(reduce_(srcs[4], reduce_(reduce_(srcs[3], srcs[2]),
                                          reduce_(srcs[1], srcs[0])))),
           description)


def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr, description = ""):
    """Register the bool1, bool8, bool16 and bool32 variants of a reduction.

    The bool1 variant keeps ``name``; the others replace its first character
    with ``b8``, ``b16`` and ``b32``.
    """
    # ``description`` must be passed by keyword: the eighth positional
    # parameter of binop_reduce is ``suffix``, so a positional description
    # would be appended to every opcode name instead.
    binop_reduce(name, output_size, tbool1, src_type,
                 prereduce_expr, reduce_expr, final_expr,
                 description=description)
    binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                 prereduce_expr, reduce_expr, final_expr,
                 description=description)
    binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                 prereduce_expr, reduce_expr, final_expr,
                 description=description)
    binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                 prereduce_expr, reduce_expr, final_expr,
                 description=description)


binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
         (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
         (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")

binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")

binop("fmulz", tfloat32, _2src_commutative + associative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
else
   dst = src0 * src1;
""", description = """
Unlike :nir:alu-op:`fmul`, anything (even infinity or NaN) multiplied by zero is
always zero. ``fmulz(0.0, inf)`` and ``fmulz(0.0, nan)`` must be +/-0.0, even
if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If ``SIGNED_ZERO_PRESERVE`` is
used, then the result must be a positive zero if either operand is zero.
""")


binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""", description = "Low 32-bits of signed/unsigned integer multiply")

binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1",
              description = "Multiply signed 32-bit integers, 64-bit result")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1",
              description = "Multiply unsigned 32-bit integers, 64-bit result")

binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
    * potential overflow of signed multiply */
   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
}
""", description = "High 32-bits of signed integer multiply")

binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""", description = "High 32-bits of unsigned integer multiply")

binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""", description = "Low 32-bits of unsigned integer multiply")

binop("imul_32x16", tint32, "", "src0 * (int16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with sign extension")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with zero extension")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

binop_convert("uadd_carry", tuint, tuint, _2src_commutative,
              "src0 + src1 < src0",
              description = """
Return an integer (1 or 0) representing the carry resulting from the
addition of the two unsigned arguments.
""")

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1", description = """
Return an integer (1 or 0) representing the borrow resulting from the
subtraction of the two unsigned arguments.
""")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =     (x & y)  + (x & ~y) +     (x & y)  + (~x & y)
#       = 2 * (x & y)  + (x & ~y)                + (~x & y)
#       =    ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +     (x | y)  - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                  = (x | y) -  ((x ^ y)      >> 1)
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.
# The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop(
   "imod", tint, "",
   "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
   " src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# Single-bit tests: is bit (src1 mod bit_size) of src0 set / clear?
binop_compare_all_sizes(
   "bitnz", tuint, "",
   "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x1",
   "only uses the least significant bits like SM5 shifts", tuint32)

binop_compare_all_sizes(
   "bitz", tuint, "",
   "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x0",
   "only uses the least significant bits like SM5 shifts", tuint32)

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce_all_sizes("ball_fequal", 1, tfloat,
                       "{src0} == {src1}", "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat,
                       "{src0} != {src1}", "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal", 1, tint,
                       "{src0} == {src1}", "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint,
                       "{src0} != {src1}", "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32,
             "{src0} == {src1}", "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32,
             "{src0} != {src1}", "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

# Set on Less Than
binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f")
# Set on Greater or Equal
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f")
# Set on Equal
binop("seq", tfloat, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f")
# Set on Not Equal
binop("sne", tfloat, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f")

# Text appended to the description of each shift opcode below.
shift_note = """
SPIRV shifts are undefined for shift-operands >= bitsize,
but SM5 shifts are defined to use only the least significant bits.
The NIR definition is according to the SM5 specification.
"""

opcode(
   "ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
   "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))",
   description="Left shift." + shift_note)
opcode(
   "ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
   "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
   description="Signed right-shift." + shift_note)
opcode(
   "ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
   "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
   description="Unsigned right-shift." + shift_note)

# Rotates: the rotate count is masked to the operand's bit width.
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")

# Description template shared by iand/ior/ixor; {0} is the operation name.
bitwise_description = """
Bitwise {0}, also used as a boolean {0} for hardware supporting integers.
"""

binop("iand", tuint, _2src_commutative + associative, "src0 & src1",
      description=bitwise_description.format("AND"))
binop("ior", tuint, _2src_commutative + associative, "src0 | src1",
      description=bitwise_description.format("OR"))
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1",
      description=bitwise_description.format("XOR"))


# Dot products: multiply pairwise, then sum.
binop_reduce("fdot", 1, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

binop_reduce("fdot", 0, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}",
             suffix="_replicated")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 0, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# Minimum / maximum
binop("fmin", tfloat, _2src_commutative + associative,
      "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative,
      "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative,
      "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, _2src_commutative + associative,
      "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative,
      "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative,
      "src1 > src0 ? src1 : src0")

binop("fpow", tfloat, "",
      "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

# Split-source packing: each source supplies one field of the result.
binop_horiz(
   "pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
   "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_horiz(
   "pack_half_2x16_rtz_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
   "pack_half_1x16_rtz(src0.x) | (pack_half_1x16_rtz(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

opcode(
   "pack_32_4x8_split", 0, tuint32,
   [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
   False, "",
   "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")

binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""", description="""
Implements the behavior of the first operation of the SM5 "bfi" assembly
and that of the "bfi1" i965 instruction. That is, the bits and offset values
are from the low five bits of src0 and src1, respectively.
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""", description="""
Combines the first component of each input to make a 2-component vector.
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")

# Byte/word insertion
binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")


def triop(name, ty, alg_props, const_expr, description=""):
   """Declare a three-source opcode whose sources and result all use `ty`.

   Forwards to opcode() with an output size of 0, input sizes of
   [0, 0, 0] and is_conversion set to False.
   """
   opcode(name, 0, ty, [0] * 3, [ty] * 3, False, alg_props, const_expr,
          description)


def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr,
                description=""):
   """Declare a three-source tuint opcode with explicit component counts.

   Forwards to opcode() with no algebraic properties and is_conversion
   set to False.
   """
   input_sizes = [src1_size, src2_size, src3_size]
   opcode(name, output_size, tuint, input_sizes, [tuint] * 3,
          False, "", const_expr, description)


triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

triop("ffmaz", tfloat32, _2src_commutative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0 + src2;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_float_fma_rtz(src0, src1, src2);
else
   dst = fmaf(src0, src1, src2);
""", description="""
Floating-point multiply-add with modified zero handling.

Unlike :nir:alu-op:`ffma`, anything (even infinity or NaN) multiplied by zero is
always zero. ``ffmaz(0.0, inf, src2)`` and ``ffmaz(0.0, nan, src2)`` must be
``+/-0.0 + src2``, even if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If
``SIGNED_ZERO_PRESERVE`` is used, then the result must be a positive
zero plus src2 if either src0 or src1 is zero.
""")

triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

triop("iadd3", tint, _2src_commutative + associative,
      "src0 + src1 + src2",
      description="Ternary addition")

triop("imad", tint, _2src_commutative + associative,
      "src0 * src1 + src2",
      description="Integer multiply-add")

# Description template for the select opcodes: the two fields are the kind
# of boolean used as the condition and its false/true encoding.
csel_description = """
A vector conditional select instruction (like ?:, but operating per-
component on vectors). The condition is {} bool ({}).
"""

triop("fcsel", tfloat32, selection, "(src0 != 0.0f) ? src1 : src2",
      description=csel_description.format("a floating point", "0.0 vs 1.0"))
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description=csel_description.format("a 1-bit", "0 vs 1"))
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description=csel_description.format("an 8-bit", "0 vs ~0"))
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description=csel_description.format("a 16-bit", "0 vs ~0"))
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description=csel_description.format("a 32-bit", "0 vs ~0"))

# Selects whose condition is a comparison of src0 against zero.
triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2")
triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2")

triop("fcsel_gt", tfloat32, selection, "(src0 > 0.0f) ? src1 : src2")
triop("fcsel_ge", tfloat32, selection, "(src0 >= 0.0f) ? src1 : src2")

triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""", description="SM5 bfi assembly")


triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")

# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

triop("msad_4x8", tuint32, "", """
dst = msad(src0, src1, src2);
""", description="""
Masked sum of absolute differences with accumulation. Equivalent to AMD's v_msad_u8
instruction and DXIL's MSAD.

The first two sources contain packed 8-bit unsigned integers, the instruction
will calculate the absolute difference of integers when src0's is non-zero, and
then add them together. There is also a third source which is a 32-bit unsigned
integer and added to the result.
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")


def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source tuint opcode with explicit component counts.

   Forwards to opcode() with no algebraic properties and is_conversion
   set to False; no description is passed.
   """
   input_sizes = [src1_size, src2_size, src3_size, src4_size]
   opcode(name, output_size, tuint, input_sizes, [tuint] * 4,
          False, "", const_expr)


opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Vector constructors: component i of the result is src<i>.x.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

opcode("vec5", 5, tuint,
       [1] * 5, [tuint] * 5,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
""")

opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")

# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (ie, from an load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")

# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop(
   "imad24_ir3", tint32, _2src_commutative,
   "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# r600/gcn specific instruction that evaluates unnormalized cube texture
# coordinates and face index.
# The actual texture coordinates are evaluated from this according to
#    dst.yx / abs(dst.z) + 1.5
unop_horiz("cube_amd", 4, tfloat32, 3, tfloat32, """
   dst.x = dst.y = dst.z = 0.0;
   float absX = fabsf(src0.x);
   float absY = fabsf(src0.y);
   float absZ = fabsf(src0.z);

   if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
   if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
   if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }

   if (src0.x >= 0 && absX >= absY && absX >= absZ) {
      dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
   }
   if (src0.x < 0 && absX >= absY && absX >= absZ) {
      dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
   }
   if (src0.y >= 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = src0.z; dst.w = 2;
   }
   if (src0.y < 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
   }
   if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
      dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
   }
   if (src0.z < 0 && absZ >= absX && absZ >= absY) {
      dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
   }
""")

# r600/gcn specific sin and cos
# these trigonometric functions need some lowering because the supported
# input values are expected to be normalized by dividing by (2 * pi)
unop("fsin_amd", tfloat, "sinf(6.2831853 * src0)")
unop("fcos_amd", tfloat, "cosf(6.2831853 * src0)")

# Midgard specific sin and cos
# These expect their inputs to be divided by pi.
unop("fsin_mdg", tfloat, "sinf(3.141592653589793 * src0)")
unop("fcos_mdg", tfloat, "cosf(3.141592653589793 * src0)")

# AGX specific sin with input expressed in quadrants. Used in the lowering for
# fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
# the angle is further decomposed by quadrant, sinc is computed, and the angle
# is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
# additional ALU that NIR may be able to optimize.
unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")

# AGX specific bitfield extraction from a pair of 32bit registers.
# src0,src1: the two registers
# src2: bit position of the LSB of the bitfield
# src3: number of bits in the bitfield if src3 > 0
#       src3 = 0 is equivalent to src3 = 32
# NOTE: src3 is a nir constant by contract
#
# The mask is built from an unsigned 1 so that src3 == 31 does not shift a
# signed int into its sign bit (undefined behaviour in C).
opcode("extr_agx", 0, tuint32,
       [0, 0, 0, 0], [tuint32, tuint32, tuint32, tuint32], False, "", """
   uint32_t mask = 0xFFFFFFFF;
   uint8_t shift = src2 & 0x7F;
   if (src3 != 0) {
      mask = (1u << src3) - 1;
   }
   if (shift >= 64) {
      dst = 0;
   } else {
      dst = (((((uint64_t) src1) << 32) | (uint64_t) src0) >> shift) & mask;
   }
""")

# AGX multiply-shift-add. Corresponds to iadd/isub/imad/imsub instructions.
# The shift must be <= 4 (domain restriction). For performance, it should be
# constant.
opcode("imadshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", "(src0 * src1) + (src2 << src3)")
opcode("imsubshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", "(src0 * src1) - (src2 << src3)")

# The 16-bit sources are widened to uint32_t before shifting: left as
# promoted ints, bit 15 of src1 would be shifted into the sign bit of a
# signed int, which is undefined behaviour in C.
binop_convert("interleave_agx", tuint32, tuint16, "", """
   dst = 0;
   for (unsigned bit = 0; bit < 16; bit++) {
      dst |= ((uint32_t)src0 & (1u << bit)) << bit;
      dst |= ((uint32_t)src1 & (1u << bit)) << (bit + 1);
   }""", description="""
   Interleave bits of 16-bit integers to calculate a 32-bit integer. This can
   be used as-is for Morton encoding.
   """)

# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
# NOTE(review): declared as tint32 although the expression is unsigned and
# umul24_relaxed below uses tuint32 -- confirm whether this is intentional.
binop("umul24", tint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")

# relaxed versions of the above, which assume input is in the 24bit range (no clamping)
binop("imul24_relaxed", tint32, _2src_commutative + associative, "src0 * src1")
triop("umad24_relaxed", tuint32, _2src_commutative, "src0 * src1 + src2")
binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")

# Floating-point classification tests.
unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
unop_convert("fisfinite32", tbool32, tfloat, "isfinite(src0)")

# vc4-specific opcodes

# Saturated vector add for 4 8bit ints.
binop(
   "usadd_4x8_vc4", tint32, _2src_commutative + associative,
   """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop(
   "ussub_4x8_vc4", tint32, "",
   """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop(
   "umin_4x8_vc4", tint32, _2src_commutative + associative,
   """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop(
   "umax_4x8_vc4", tint32, _2src_commutative + associative,
   """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop(
   "umul_unorm_4x8_vc4", tint32, _2src_commutative + associative,
   """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# v3d-specific opcodes

# v3d-specific (v71) instruction that packs bits of 2 2x16 floating point into
# r11g11b10 bits, rounding to nearest even, so
#  dst[10:0]  = float16_to_float11 (src0[15:0])
#  dst[21:11] = float16_to_float11 (src0[31:16])
#  dst[31:22] = float16_to_float10 (src1[15:0])
binop_convert(
   "pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
   "pack_32_to_r11g11b10_v3d(src0, src1)")

# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
# difference with pack_32_2x16_split is that the sources are 32bit too.
# So it receives two 32-bit integers and packs their lower halfwords as 2x16
# into a 32-bit integer.
binop_horiz(
   "pack_2x32_to_2x16_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
   "(src0.x & 0xffff) | (src1.x << 16)")

# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
# r10g10b10a2:
#   dst[9:0]   = src0[9:0]
#   dst[19:10] = src0[25:16]
#   dst[29:20] = src1[9:0]
#   dst[31:30] = src1[17:16]
binop_convert(
   "pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
   "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")

# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
#   dst[7:0]   = src0[7:0]
#   dst[15:8]  = src0[23:16]
#   dst[23:16] = src1[7:0]
#   dst[31:24] = src1[23:16]
opcode(
   "pack_4x16_to_4x8_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
   False, "",
   "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")

# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
unop(
   "pack_2x16_to_unorm_2x8_v3d", tuint32,
   "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
unop(
   "pack_2x16_to_snorm_2x8_v3d", tuint32,
   "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")

# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
unop("f2unorm_16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
unop("f2snorm_16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")

# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
unop("pack_2x16_to_unorm_2x10_v3d", tuint32, "pack_2x16_to_unorm_2x10(src0)")

# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
# and one 10 bit unorm
unop("pack_2x16_to_unorm_10_2_v3d", tuint32, "pack_2x16_to_unorm_10_2(src0)")

# Mali-specific opcodes
unop("fsat_signed_mali", tfloat, "fmin(fmax(src0, -1.0), 1.0)")
unop("fclamp_pos_mali", tfloat, "fmax(src0, 0.0)")

opcode(
   "b32fcsel_mdg", 0, tuint, [0, 0, 0],
   [tbool32, tfloat, tfloat], False, selection, "src0 ? src1 : src2",
   description=csel_description.format("a 32-bit", "0 vs ~0") + """
   This Midgard-specific variant takes floating-point sources, rather than
   integer sources. That includes support for floating point modifiers in
   the backend.
   """)

# Magnitude equal to fddx/y, sign undefined. Derivative of a constant is zero.
unop("fddx_must_abs_mali", tfloat, "0.0", algebraic_properties="derivative")
unop("fddy_must_abs_mali", tfloat, "0.0", algebraic_properties="derivative")

# DXIL specific double [un]pack
# DXIL doesn't support generic [un]pack instructions, so we want those
# lowered to bit ops. HLSL doesn't support 64bit bitcasts to/from
# double, only [un]pack. Technically DXIL does, but considering they
# can't be generated from HLSL, we want to match what would be coming from DXC.
# This is essentially just the standard [un]pack, except that it doesn't get
# lowered so we can handle it in the backend and turn it into MakeDouble/SplitDouble
unop_horiz(
   "pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
   "dst.x = src0.x | ((uint64_t)src0.y << 32);")
unop_horiz(
   "unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
   "dst.x = src0.x; dst.y = src0.x >> 32;")

# src0 and src1 are i8vec4 packed in an int32, and src2 is an int32.  The int8
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors.  src2 is added to the result of the dot-product.
opcode(
   "sdot_4x8_iadd", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, _2src_commutative, """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const int32_t v1x = (int8_t)(src1      );
   const int32_t v1y = (int8_t)(src1 >>  8);
   const int32_t v1z = (int8_t)(src1 >> 16);
   const int32_t v1w = (int8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")

# Like sdot_4x8_iadd, but unsigned.
opcode(
   "udot_4x8_uadd", 0, tuint32,
   [0, 0, 0], [tuint32, tuint32, tuint32],
   False, _2src_commutative, """
   const uint32_t v0x = (uint8_t)(src0      );
   const uint32_t v0y = (uint8_t)(src0 >>  8);
   const uint32_t v0z = (uint8_t)(src0 >> 16);
   const uint32_t v0w = (uint8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")

# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and
# src2 is an int32.  The 8-bit components are extended to 32-bits, and a
# dot-product is performed on the resulting vectors.  src2 is added to the
# result of the dot-product.
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative
opcode(
   "sudot_4x8_iadd", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, "", """
   const int32_t v0x = (int8_t)(src0      );
   const int32_t v0y = (int8_t)(src0 >>  8);
   const int32_t v0z = (int8_t)(src0 >> 16);
   const int32_t v0w = (int8_t)(src0 >> 24);
   const uint32_t v1x = (uint8_t)(src1      );
   const uint32_t v1y = (uint8_t)(src1 >>  8);
   const uint32_t v1z = (uint8_t)(src1 >> 16);
   const uint32_t v1w = (uint8_t)(src1 >> 24);

   dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;
""")

# Like sdot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
opcode(
   "sdot_4x8_iadd_sat", 0, tint32,
   [0, 0, 0], [tuint32, tuint32, tint32],
   False, _2src_commutative, """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const int64_t v1x = (int8_t)(src1      );
   const int64_t v1y = (int8_t)(src1 >>  8);
   const int64_t v1z = (int8_t)(src1 >> 16);
   const int64_t v1w = (int8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")

# Like udot_4x8_uadd, but the result is clamped to the range [0, 0xffffffff].
# src2 is declared tint32, so the constant expression sees it as a signed
# 32-bit value.  It is cast to uint32_t before being widened: otherwise an
# accumulator with bit 31 set would be sign-extended into the uint64_t sum
# and the result would always saturate to UINT32_MAX.
opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint8_t)(src0      );
   const uint64_t v0y = (uint8_t)(src0 >>  8);
   const uint64_t v0z = (uint8_t)(src0 >> 16);
   const uint64_t v0w = (uint8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + (uint32_t)src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")

# Like sudot_4x8_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
#
# NOTE: Unlike many of the other dp4a opcodes, the mixed signs of source 0
# and source 1 mean that this opcode is not 2-source commutative
opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, "", """
   const int64_t v0x = (int8_t)(src0      );
   const int64_t v0y = (int8_t)(src0 >>  8);
   const int64_t v0z = (int8_t)(src0 >> 16);
   const int64_t v0w = (int8_t)(src0 >> 24);
   const uint64_t v1x = (uint8_t)(src1      );
   const uint64_t v1y = (uint8_t)(src1 >>  8);
   const uint64_t v1z = (uint8_t)(src1 >> 16);
   const uint64_t v1w = (uint8_t)(src1 >> 24);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")

# src0 and src1 are i16vec2 packed in an int32, and src2 is an int32.  The int16
# components are sign-extended to 32-bits, and a dot-product is performed on
# the resulting vectors.  src2 is added to the result of the dot-product.
opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int32_t v0x = (int16_t)(src0      );
   const int32_t v0y = (int16_t)(src0 >> 16);
   const int32_t v1x = (int16_t)(src1      );
   const int32_t v1y = (int16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""")

# Like sdot_2x16_iadd, but unsigned.
opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, _2src_commutative, """
   const uint32_t v0x = (uint16_t)(src0      );
   const uint32_t v0y = (uint16_t)(src0 >> 16);
   const uint32_t v1x = (uint16_t)(src1      );
   const uint32_t v1y = (uint16_t)(src1 >> 16);

   dst = (v0x * v1x) + (v0y * v1y) + src2;
""")

# Like sdot_2x16_iadd, but the result is clamped to the range [-0x80000000, 0x7fffffff].
opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const int64_t v0x = (int16_t)(src0      );
   const int64_t v0y = (int16_t)(src0 >> 16);
   const int64_t v1x = (int16_t)(src1      );
   const int64_t v1y = (int16_t)(src1 >> 16);

   const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2;

   dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp);
""")

# Like udot_2x16_uadd, but the result is clamped to the range [0, 0xffffffff].
#
# src2 is declared tint32, so the constant expression sees it as a signed
# 32-bit value.  It is cast to uint32_t before being widened: otherwise an
# accumulator with bit 31 set would be sign-extended into the uint64_t sum
# and the result would always saturate to UINT32_MAX.
opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32],
       False, _2src_commutative, """
   const uint64_t v0x = (uint16_t)(src0      );
   const uint64_t v0y = (uint16_t)(src0 >> 16);
   const uint64_t v1x = (uint16_t)(src1      );
   const uint64_t v1y = (uint16_t)(src1 >> 16);

   const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (uint32_t)src2;

   dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp;
""")