#
# Copyright (C) 2014 Connor Abbott
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)

import re

# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, is_conversion, algebraic_properties, const_expr,
                description):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - all types are strings that get nir_type_ prepended to them
      - input_types is a list of types
      - is_conversion is true if this opcode represents a type conversion
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.
      - description is an optional description of the opcode for documentation.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_sizes[0], int)
      assert isinstance(input_types, list)
      assert isinstance(input_types[0], str)
      assert isinstance(is_conversion, bool)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 5 or (output_size == 8) or (output_size == 16)
      for size in input_sizes:
         assert 0 <= size <= 5 or (size == 8) or (size == 16)
         if output_size != 0:
            assert size != 0

      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.is_conversion = is_conversion
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
      self.description = description

# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')

def type_has_size(type_):
   m = _TYPE_SPLIT_RE.match(type_)
   assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
   return m.group('bits') is not None

def type_size(type_):
   m = _TYPE_SPLIT_RE.match(type_)
   assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
   assert m.group('bits') is not None, \
          'NIR type string has no bit size: "{}"'.format(type_)
   return int(m.group('bits'))

def type_sizes(type_):
   if type_has_size(type_):
      return [type_size(type_)]
   elif type_ == 'bool':
      return [1, 8, 16, 32]
   elif type_ == 'float':
      return [16, 32, 64]
   else:
      return [1, 8, 16, 32, 64]

def type_base_type(type_):
   m = _TYPE_SPLIT_RE.match(type_)
   assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
   return m.group('type')
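
# A few illustrative sanity checks of the type helpers above.  The expected
# values are simply read off the definitions; they are cheap enough to run
# every time the generator is imported.
assert not type_has_size(tfloat)
assert type_size(tuint16) == 16
assert type_sizes(tfloat) == [16, 32, 64]
assert type_base_type(tint64) == "int"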

# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = "2src_commutative "
associative = "associative "
selection = "selection "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr, description = ""):
   assert name not in opcodes
   opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                          input_types, is_conversion, algebraic_properties,
                          const_expr, description)

def unop_convert(name, out_type, in_type, const_expr, description = ""):
   opcode(name, 0, out_type, [0], [in_type], False, "", const_expr, description)

def unop(name, ty, const_expr, description = "", algebraic_properties = ""):
   opcode(name, 0, ty, [0], [ty], False, algebraic_properties, const_expr,
          description)

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr, description = ""):
   opcode(name, output_size, output_type, [input_size], [input_type],
          False, "", const_expr, description)

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr, description = ""):
   def prereduce(src):
      return "(" + prereduce_expr.format(src=src) + ")"
   def final(src):
      return final_expr.format(src="(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   src0 = prereduce("src0.x")
   src1 = prereduce("src0.y")
   src2 = prereduce("src0.z")
   src3 = prereduce("src0.w")
   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              final(reduce_(src0, src1)), description)
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              final(reduce_(reduce_(src0, src1), src2)), description)
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              final(reduce_(reduce_(src0, src1), reduce_(src2, src3))),
              description)

def unop_numeric_convert(name, out_type, in_type, const_expr, description = ""):
   opcode(name, 0, out_type, [0], [in_type], True, "", const_expr, description)

unop("mov", tuint, "src0")

unop("ineg", tint, "src0 == u_intN_min(bit_size) ? src0 : -src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0", description = "Invert every bit of the integer")

unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "(isnan(src0) ? 0.0  : ((src0 == 0.0 ) ? src0 : (src0 > 0.0 ) ? 1.0  : -1.0 )) : " +
                       "(isnan(src0) ? 0.0f : ((src0 == 0.0f) ? src0 : (src0 > 0.0f) ? 1.0f : -1.0f))"),
     description = """
Roughly implements the OpenGL / Vulkan rules for ``sign(float)``.
The ``GLSL.std.450 FSign`` instruction is defined as:

   Result is 1.0 if x > 0, 0.0 if x = 0, or -1.0 if x < 0.

If the source is equal to zero, there is a preference for the result to have
the same sign, but this is not required (it is required by OpenCL).  If the
source is not a number, there is a preference for the result to be +0.0, but
this is not required (it is required by OpenCL).  If the source is not a
number, and the result is not +0.0, the result should definitely **not** be
NaN.

The values returned for constant folding match the behavior required by
OpenCL.
   """)

unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")

# Generate all of the numeric conversion opcodes
for src_t in [tint, tuint, tfloat, tbool]:
   if src_t == tbool:
      dst_types = [tfloat, tint, tbool]
   elif src_t == tint:
      dst_types = [tfloat, tint]
   elif src_t == tuint:
      dst_types = [tfloat, tuint]
   elif src_t == tfloat:
      dst_types = [tint, tuint, tfloat]

   for dst_t in dst_types:
      for dst_bit_size in type_sizes(dst_t):
         if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
               if rnd_mode == '_rtne':
                  conv_expr = """
                  if (bit_size > 32) {
                     dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                  } else if (bit_size > 16) {
                     dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                  } else {
                     dst = src0;
                  }
                  """
               elif rnd_mode == '_rtz':
                  conv_expr = """
                  if (bit_size > 32) {
                     dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                  } else if (bit_size > 16) {
                     dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                  } else {
                     dst = src0;
                  }
                  """
               else:
                  conv_expr = """
                  if (bit_size > 32) {
                     if (nir_is_rounding_mode_rtz(execution_mode, 16))
                        dst = _mesa_half_to_float(_mesa_double_to_float16_rtz(src0));
                     else
                        dst = _mesa_half_to_float(_mesa_double_to_float16_rtne(src0));
                  } else if (bit_size > 16) {
                     if (nir_is_rounding_mode_rtz(execution_mode, 16))
                        dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                     else
                        dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                  } else {
                     dst = src0;
                  }
                  """

               unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0],
                                                           dst_t[0],
                                                           dst_bit_size,
                                                           rnd_mode),
                                    dst_t + str(dst_bit_size),
                                    src_t, conv_expr)
         elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
            conv_expr = """
            if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
               dst = _mesa_double_to_float_rtz(src0);
            } else {
               dst = src0;
            }
            """
            unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                     dst_bit_size),
                                 dst_t + str(dst_bit_size), src_t, conv_expr)
         else:
            conv_expr = "src0 != 0" if dst_t == tbool else "src0"
            unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                     dst_bit_size),
                                 dst_t + str(dst_bit_size), src_t, conv_expr)

def unop_numeric_convert_mp(base, src_t, dst_t):
   op_like = base + "16"
   unop_numeric_convert(base + "mp", src_t, dst_t, opcodes[op_like].const_expr,
                        description = """
Special opcode that is the same as :nir:alu-op:`{}` except that it is safe to
remove it if the result is immediately converted back to 32 bits again. This is
generated as part of the precision lowering pass. ``mp`` stands for medium
precision.
   """.format(op_like))

unop_numeric_convert_mp("f2f", tfloat16, tfloat32)
unop_numeric_convert_mp("i2i", tint16, tint32)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert_mp("f2i", tint16, tfloat32)
unop_numeric_convert_mp("f2u", tuint16, tfloat32)
unop_numeric_convert_mp("i2f", tfloat16, tint32)
unop_numeric_convert_mp("u2f", tfloat16, tuint32)
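
# A small illustration of the naming scheme the conversion loop above
# produces (source type letter, "2", destination type letter, bit size, and
# an optional rounding suffix).  The expected names are assumptions read off
# the loop; the checks are cheap and run with the generator.
assert "f2f16_rtne" in opcodes and "f2f16_rtz" in opcodes and "f2f16" in opcodes
assert "b2i32" in opcodes and "u2f16" in opcodes
assert opcodes["f2f32"].is_conversion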

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Floating point pack and unpack operations.

def pack_2x16(fmt, in_type):
   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, in_type, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm", tfloat)
pack_4x8("snorm")
pack_2x16("unorm", tfloat)
pack_4x8("unorm")
pack_2x16("half", tfloat32)
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")

unop_horiz("pack_uint_2x16", 1, tuint32, 2, tuint32, """
dst.x = _mesa_unsigned_to_unsigned(src0.x, 16);
dst.x |= _mesa_unsigned_to_unsigned(src0.y, 16) << 16;
""", description = """
Convert two unsigned integers into a packed unsigned short (clamp is applied).
""")

unop_horiz("pack_sint_2x16", 1, tint32, 2, tint32, """
dst.x = _mesa_signed_to_signed(src0.x, 16) & 0xffff;
dst.x |= _mesa_signed_to_signed(src0.y, 16) << 16;
""", description = """
Convert two signed integers into a packed signed short (clamp is applied).
""")
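
# For illustration, the textual "fmt" substitution in the helpers above means
# that, e.g., pack_half_2x16 constant-folds with (the assumed expansion of the
# template):
#
#    dst.x = (uint32_t) pack_half_1x16(src0.x);
#    dst.x |= ((uint32_t) pack_half_1x16(src0.y)) << 16;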

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")

unop_horiz("unpack_half_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16((uint16_t)(src0.x & 0xffff), nir_is_denorm_flush_to_zero(execution_mode, 16));
dst.y = unpack_half_1x16((uint16_t)(src0.x >> 16), nir_is_denorm_flush_to_zero(execution_mode, 16));
""")

# Lowered floating point unpacking operations.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff), nir_is_denorm_flush_to_zero(execution_mode, 16))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16), nir_is_denorm_flush_to_zero(execution_mode, 16))")


unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")

unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop_convert("ufind_msb_rev", tint32, tuint, """
dst = -1;
for (int bit = 0; bit < bit_size; bit++) {
   if ((src0 << bit) & 0x80000000) {
      dst = bit;
      break;
   }
}
""")

unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(bit_size - bit - 1);
""")
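
# Illustrative values for the bit-scan opcodes above (assuming 32-bit
# operands, read off the constant-folding loops):
#
#    ufind_msb(0x00000001) = 0      ufind_msb(0) = -1
#    uclz(0x00000001)      = 31     uclz(0)      = 32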

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
       (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb_rev", tint32, """
dst = -1;
/* We are looking for the highest bit that's not the same as the sign bit. */
uint32_t sign = src0 & 0x80000000u;
for (int bit = 0; bit < 32; bit++) {
   if (((src0 << bit) & 0x80000000u) != sign) {
      dst = bit;
      break;
   }
}
""")

unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}",
            description = "Sum of vector components")

def binop_convert(name, out_type, in_type1, alg_props, const_expr, description="", in_type2=None):
   if in_type2 is None:
      in_type2 = in_type1
   opcode(name, 0, out_type, [0, 0], [in_type1, in_type2],
          False, alg_props, const_expr, description)

def binop(name, ty, alg_props, const_expr, description = ""):
   binop_convert(name, ty, ty, alg_props, const_expr, description)

def binop_compare(name, ty, alg_props, const_expr, description = "", ty2=None):
   binop_convert(name, tbool1, ty, alg_props, const_expr, description, ty2)

def binop_compare8(name, ty, alg_props, const_expr, description = "", ty2=None):
   binop_convert(name, tbool8, ty, alg_props, const_expr, description, ty2)

def binop_compare16(name, ty, alg_props, const_expr, description = "", ty2=None):
   binop_convert(name, tbool16, ty, alg_props, const_expr, description, ty2)

def binop_compare32(name, ty, alg_props, const_expr, description = "", ty2=None):
   binop_convert(name, tbool32, ty, alg_props, const_expr, description, ty2)

def binop_compare_all_sizes(name, ty, alg_props, const_expr, description = "", ty2=None):
   binop_compare(name, ty, alg_props, const_expr, description, ty2)
   binop_compare8(name + "8", ty, alg_props, const_expr, description, ty2)
   binop_compare16(name + "16", ty, alg_props, const_expr, description, ty2)
   binop_compare32(name + "32", ty, alg_props, const_expr, description, ty2)

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr, description = ""):
   opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
          False, "", const_expr, description)

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix="", description = ""):
   def final(src):
      return final_expr.format(src= "(" + src + ")")
   def reduce_(src0, src1):
      return reduce_expr.format(src0=src0, src1=src1)
   def prereduce(src0, src1):
      return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
   srcs = [prereduce("src0." + letter, "src1." + letter) for letter in "xyzwefghijklmnop"]
   def pairwise_reduce(start, size):
      if (size == 1):
         return srcs[start]
      return reduce_(pairwise_reduce(start + size // 2, size // 2), pairwise_reduce(start, size // 2))
   for size in [2, 4, 8, 16]:
      opcode(name + str(size) + suffix, output_size, output_type,
             [size, size], [src_type, src_type], False, _2src_commutative,
             final(pairwise_reduce(0, size)), description)
   opcode(name + "3" + suffix, output_size, output_type,
          [3, 3], [src_type, src_type], False, _2src_commutative,
          final(reduce_(reduce_(srcs[2], srcs[1]), srcs[0])), description)
   opcode(name + "5" + suffix, output_size, output_type,
          [5, 5], [src_type, src_type], False, _2src_commutative,
          final(reduce_(srcs[4], reduce_(reduce_(srcs[3], srcs[2]),
                                         reduce_(srcs[1], srcs[0])))),
          description)

def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr, description = ""):
   binop_reduce(name, output_size, tbool1, src_type,
                prereduce_expr, reduce_expr, final_expr, description)
   binop_reduce("b8" + name[1:], output_size, tbool8, src_type,
                prereduce_expr, reduce_expr, final_expr, description)
   binop_reduce("b16" + name[1:], output_size, tbool16, src_type,
                prereduce_expr, reduce_expr, final_expr, description)
   binop_reduce("b32" + name[1:], output_size, tbool32, src_type,
                prereduce_expr, reduce_expr, final_expr, description)

binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "(uint64_t)src0 + (uint64_t)src1")
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? u_intN_max(bit_size) : src0 + src1) :
         (src0 < src0 + src1 ? u_intN_min(bit_size) : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? u_uintN_max(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? u_intN_max(bit_size) : src0 - src1) :
         (src0 < src0 - src1 ? u_intN_min(bit_size) : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")
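
# A couple of worked values for the saturating and absolute-difference
# opcodes above (assuming 8-bit operands for brevity):
#
#    uadd_sat(0xff, 0x02) = 0xff      iadd_sat(0x7f, 1) = 0x7f
#    uabs_usub(3, 5) = 2              uabs_isub(-3, 5) = 8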

binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")

binop("fmulz", tfloat32, _2src_commutative + associative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
else
   dst = src0 * src1;
""", description = """
Unlike :nir:alu-op:`fmul`, anything (even infinity or NaN) multiplied by zero is
always zero. ``fmulz(0.0, inf)`` and ``fmulz(0.0, nan)`` must be +/-0.0, even
if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If ``SIGNED_ZERO_PRESERVE`` is
used, then the result must be a positive zero if either operand is zero.
""")


binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""", description = "Low 32-bits of signed/unsigned integer multiply")

binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1",
              description = "Multiply signed 32-bit integers, 64-bit result")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1",
              description = "Multiply unsigned 32-bit integers, 64-bit result")

binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly.  The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
    * potential overflow of signed multiply */
   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
}
""", description = "High 32-bits of signed integer multiply")
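
# A quick worked value for imul_high (assuming 32-bit operands): the full
# product of 0x40000000 and 4 is 0x100000000, so imul_high(0x40000000, 4) = 1;
# imul_high(-1, -1) = 0 because the product 1 has no high bits set.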

binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""", description = "High 32-bits of unsigned integer multiply")

binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""", description = "Low 32-bits of unsigned integer multiply")

binop("imul_32x16", tint32, "", "src0 * (int16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with sign extension")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1",
      description = "Multiply 32-bits with low 16-bits, with zero extension")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

binop_convert("uadd_carry", tuint, tuint, _2src_commutative,
              "src0 + src1 < src0",
              description = """
Return an integer (1 or 0) representing the carry resulting from the
addition of the two unsigned arguments.
   """)

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1", description = """
Return an integer (1 or 0) representing the borrow resulting from the
subtraction of the two unsigned arguments.
   """)

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       = (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 * (x & y) + (x & ~y) + (~x & y)
#       = ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
# (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#              = (x & y) + ((x ^ y) >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           = (x | y) - (~x & y) + (x | y) - (x & ~y) + 1
#           = 2 * (x | y) - ((~x & y) + (x & ~y)) + 1
#           = ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
# (x + y + 1) >> 1 = (((x | y) << 1) - (x ^ y) + 1) >> 1
#                  = (x | y) - ((x ^ y) >> 1)
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
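
# A quick illustration of why the expressions above avoid overflow (8-bit
# example; x + y itself would not fit in 8 bits):
#
#    x = 0xff, y = 0xfd:
#    (x & y) + ((x ^ y) >> 1) = 0xfd + 1 = 0xfe = (x + y) >> 1
#    (x | y) - ((x ^ y) >> 1) = 0xff - 1 = 0xfe = (x + y + 1) >> 1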

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("fltu", tfloat, "", "isnan(src0) || isnan(src1) || src0 < src1")
binop_compare_all_sizes("fgeu", tfloat, "", "isnan(src0) || isnan(src1) || src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("fequ", tfloat, _2src_commutative, "isnan(src0) || isnan(src1) || src0 == src1")
binop_compare_all_sizes("fneo", tfloat, _2src_commutative, "!isnan(src0) && !isnan(src1) && src0 != src1")
binop_compare_all_sizes("funord", tfloat, _2src_commutative, "isnan(src0) || isnan(src1)")
binop_compare_all_sizes("ford", tfloat, _2src_commutative, "!isnan(src0) && !isnan(src1)")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

binop_compare_all_sizes("bitnz", tuint, "", "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x1",
                        "only uses the least significant bits like SM5 shifts", tuint32)

binop_compare_all_sizes("bitz", tuint, "", "((uint64_t)src0 >> (src1 & (bit_size - 1)) & 0x1) == 0x0",
                        "only uses the least significant bits like SM5 shifts", tuint32)

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce_all_sizes("ball_fequal", 1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal", 1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal

shift_note = """
SPIR-V shifts are undefined for shift-operands >= bitsize,
but SM5 shifts are defined to use only the least significant bits.
The NIR definition is according to the SM5 specification.
"""

opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))",
       description = "Left shift." + shift_note)
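# For example, with 32-bit sources the masking above means a shift count of
# 33 behaves like a shift by 1: only the low five bits of src1 are used, as
# SM5 requires.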
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
       description = "Signed right-shift." + shift_note)
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))",
       description = "Unsigned right-shift." + shift_note)

opcode("udiv_aligned_4", 0, tuint, [0], [tuint], False, "",
       "src0 >> 2", description = "Divide a multiple of 4 by 4")

opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")

opcode("shfr", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   uint64_t src = src1 | ((uint64_t)src0 << 32);
   dst = src >> (src2 & rotate_mask);
""")

bitwise_description = """
Bitwise {0}, also used as a boolean {0} for hardware supporting integers.
"""

binop("iand", tuint, _2src_commutative + associative, "src0 & src1",
      description = bitwise_description.format("AND"))
binop("ior", tuint, _2src_commutative + associative, "src0 | src1",
      description = bitwise_description.format("OR"))
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1",
      description = bitwise_description.format("XOR"))


binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot", 0, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}",
             suffix="_replicated")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 0, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

# The C fmin/fmax functions have implementation-defined behaviour for signed
# zeroes. However, SPIR-V requires:
#
#   fmin(-0, +0) = -0
#   fmax(+0, -0) = +0
#
# The NIR opcodes match SPIR-V. Furthermore, the NIR opcodes are commutative, so
# we must also ensure:
#
#   fmin(+0, -0) = -0
#   fmax(-0, +0) = +0
#
# To implement the constant folding, when the sources are equal, we use the
# min/max of the bit patterns which will order the signed zeroes while
# preserving all other values.
for op, macro in [("fmin", "MIN2"), ("fmax", "MAX2")]:
   binop(op, tfloat, _2src_commutative + associative,
         "bit_size == 64 ? " +
         f"(src0 == src1 ? uid({macro}((int64_t)dui(src0), (int64_t)dui(src1))) : {op}(src0, src1)) :"
         f"(src0 == src1 ? uif({macro}((int32_t)fui(src0), (int32_t)fui(src1))) : {op}f(src0, src1))")
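
# Worked example of the bit-pattern trick above: fui(-0.0) is 0x80000000 and
# fui(+0.0) is 0x00000000, so when compared as signed 32-bit integers MIN2
# picks the -0.0 pattern and MAX2 picks the +0.0 pattern, which is exactly the
# ordering of signed zeroes that SPIR-V requires.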

binop("imin", tint, _2src_commutative + associative, "MIN2(src0, src1)")
binop("umin", tuint, _2src_commutative + associative, "MIN2(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "MAX2(src0, src1)")
binop("umax", tuint, _2src_commutative + associative, "MAX2(src0, src1)")

binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | ((uint32_t)(pack_half_1x16(src1.x)) << 16)")

binop_horiz("pack_half_2x16_rtz_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16_rtz(src0.x) | (uint32_t)(pack_half_1x16_rtz(src1.x) << 16)")

binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

opcode("pack_32_4x8_split", 0, tuint32, [0, 0, 0, 0], [tuint8, tuint8, tuint8, tuint8],
       False, "",
       "src0 | ((uint32_t)src1 << 8) | ((uint32_t)src2 << 16) | ((uint32_t)src3 << 24)")

binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""", description = """
Implements the behavior of the first operation of the SM5 "bfi" assembly
and that of the "bfi1" i965 instruction. That is, the bits and offset values
are from the low five bits of src0 and src1, respectively.
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""", description = """
Combines the first component of each input to make a 2-component vector.
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")

# Byte/word insertion
binop("insert_u8", tuint, "", "(src0 & 0xff) << (src1 * 8)")
binop("insert_u16", tuint, "", "(src0 & 0xffff) << (src1 * 16)")


def triop(name, ty, alg_props, const_expr, description = ""):
   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr,
          description)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr,
                description = ""):
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size],
          [tuint, tuint, tuint], False, "", const_expr, description)

triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

triop("ffmaz", tfloat32, _2src_commutative, """
if (src0 == 0.0 || src1 == 0.0)
   dst = 0.0 + src2;
else if (nir_is_rounding_mode_rtz(execution_mode, 32))
   dst = _mesa_float_fma_rtz(src0, src1, src2);
else
   dst = fmaf(src0, src1, src2);
""", description = """
Floating-point multiply-add with modified zero handling.

Unlike :nir:alu-op:`ffma`, anything (even infinity or NaN) multiplied by zero is
always zero. ``ffmaz(0.0, inf, src2)`` and ``ffmaz(0.0, nan, src2)`` must be
``+/-0.0 + src2``, even if ``INF_PRESERVE/NAN_PRESERVE`` is not used. If
``SIGNED_ZERO_PRESERVE`` is used, then the result must be a positive
zero plus src2 if either src0 or src1 is zero.
""")

triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

triop("iadd3", tint, _2src_commutative + associative, "src0 + src1 + src2",
      description = "Ternary addition")

triop("imad", tint, _2src_commutative + associative, "src0 * src1 + src2",
      description = "Integer multiply-add")

csel_description = """
A vector conditional select instruction (like ?:, but operating per-
component on vectors). The condition is {} bool ({}).
"""

triop("fcsel", tfloat32, selection, "(src0 != 0.0f) ? src1 : src2",
      description = csel_description.format("a floating point", "0.0 vs 1.0"))
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 1-bit", "0 vs 1"))
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("an 8-bit", "0 vs ~0"))
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 16-bit", "0 vs ~0"))
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 32-bit", "0 vs ~0"))

triop("icsel_eqz", tint, selection, "(src0 == 0) ? src1 : src2")

triop("i32csel_gt", tint32, selection, "(src0 > 0) ? src1 : src2")
triop("i32csel_ge", tint32, selection, "(src0 >= 0) ? src1 : src2")

triop("fcsel_gt", tfloat32, selection, "(src0 > 0.0f) ? src1 : src2")
triop("fcsel_ge", tfloat32, selection, "(src0 >= 0.0f) ? src1 : src2")

triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""", description = "SM5 bfi assembly")


triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
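
# A worked value for bitfield_select above: with the mask src0 = 0xff00,
# bitfield_select(0xff00, 0xabcd, 0x1234) = 0xab34 -- bits come from src1
# where the mask is set and from src2 where it is clear.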

# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

triop("msad_4x8", tuint32, "", """
dst = msad(src0, src1, src2);
""", description = """
Masked sum of absolute differences with accumulation. Equivalent to AMD's v_msad_u8
instruction and DXIL's MSAD.

The first two sources contain packed 8-bit unsigned integers; the instruction
computes the absolute difference for each byte pair whose byte in src0 is
non-zero, and sums those differences. The third source is a 32-bit unsigned
integer that is added to the result.
""")

opcode("mqsad_4x8", 4, tuint32, [1, 2, 4], [tuint32, tuint32, tuint32], False, "", """
uint64_t src = src1.x | ((uint64_t)src1.y << 32);
dst.x = msad(src0.x, src, src2.x);
dst.y = msad(src0.x, src >> 8, src2.y);
dst.z = msad(src0.x, src >> 16, src2.z);
dst.w = msad(src0.x, src >> 24, src2.w);
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size, src4_size],
          [tuint, tuint, tuint, tuint],
          False, "", const_expr)

opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

opcode("vec5", 5, tuint,
       [1] * 5, [tuint] * 5,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
""")

opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")

# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hardware that has 24-bit or perhaps 16-bit integer
# multiply instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")

# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. al * bh << 16 + c). It is used for lowering integer
# multiplication (imul) in the Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0x0000ffff) << 16) * (src1 & 0xffff0000)) >> 16) + src2;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

def triop_shift_ir3(name, shift_op, bit_op):
   opcode(name, 0, tuint, [0, 0, 0], [tuint, tuint32, tuint], False, "",
          f"(src0 {shift_op} (src1 & (sizeof(src0) * 8 - 1))) {bit_op} src2")

triop_shift_ir3("shrm_ir3", ">>", "&")
triop_shift_ir3("shlm_ir3", "<<", "&")
triop_shift_ir3("shrg_ir3", ">>", "|")
triop_shift_ir3("shlg_ir3", "<<", "|")
triop("andg_ir3", tuint, _2src_commutative, "(src0 & src1) | src2")

# r600/gcn specific instruction that evaluates unnormalized cube texture coordinates
# and face index
# The actual texture coordinates are evaluated from this according to
#    dst.yx / abs(dst.z) + 1.5
unop_horiz("cube_amd", 4, tfloat32, 3, tfloat32, """
   dst.x = dst.y = dst.z = 0.0;
   float absX = fabsf(src0.x);
   float absY = fabsf(src0.y);
   float absZ = fabsf(src0.z);

   if (absX >= absY && absX >= absZ) { dst.z = 2 * src0.x; }
   if (absY >= absX && absY >= absZ) { dst.z = 2 * src0.y; }
   if (absZ >= absX && absZ >= absY) { dst.z = 2 * src0.z; }

   if (src0.x >= 0 && absX >= absY && absX >= absZ) {
      dst.y = -src0.z; dst.x = -src0.y; dst.w = 0;
   }
   if (src0.x < 0 && absX >= absY && absX >= absZ) {
      dst.y = src0.z; dst.x = -src0.y; dst.w = 1;
   }
   if (src0.y >= 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = src0.z; dst.w = 2;
   }
   if (src0.y < 0 && absY >= absX && absY >= absZ) {
      dst.y = src0.x; dst.x = -src0.z; dst.w = 3;
   }
   if (src0.z >= 0 && absZ >= absX && absZ >= absY) {
      dst.y = src0.x; dst.x = -src0.y; dst.w = 4;
   }
   if (src0.z < 0 && absZ >= absX && absZ >= absY) {
      dst.y = -src0.x; dst.x = -src0.y; dst.w = 5;
   }
""")

# r600/gcn specific sin and cos
# These trigonometric functions need some lowering because the supported
# input values are expected to be normalized by dividing by (2 * pi)
unop("fsin_amd", tfloat, "sinf(6.2831853 * src0)")
unop("fcos_amd", tfloat, "cosf(6.2831853 * src0)")

opcode("alignbyte_amd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
   uint64_t src = src1 | ((uint64_t)src0 << 32);
   dst = src >> ((src2 & 0x3) * 8);
""")

# Midgard specific sin and cos
# These expect their inputs to be divided by pi.
unop("fsin_mdg", tfloat, "sinf(3.141592653589793 * src0)")
unop("fcos_mdg", tfloat, "cosf(3.141592653589793 * src0)")

# AGX specific sin with input expressed in quadrants. Used in the lowering for
# fsin/fcos. This corresponds to a sequence of 3 ALU ops in the backend (where
# the angle is further decomposed by quadrant, sinc is computed, and the angle
# is multiplied back for sin). Lowering fsin/fcos to fsin_agx requires some
# additional ALU that NIR may be able to optimize.
unop("fsin_agx", tfloat, "sinf(src0 * (6.2831853/4.0))")
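
# Quadrant-based example for fsin_agx: an input of 1.0 means one quarter turn,
# so fsin_agx(1.0) constant-folds to sinf(2*pi / 4) = sin(pi/2) = 1.0.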

# AGX specific bitfield extraction from a pair of 32bit registers.
# src0,src1: the two registers
# src2: bit position of the LSB of the bitfield
# src3: number of bits in the bitfield if src3 > 0
#       src3 = 0 is equivalent to src3 = 32
# NOTE: src3 is a nir constant by contract
opcode("extr_agx", 0, tuint32,
       [0, 0, 0, 0], [tuint32, tuint32, tuint32, tuint32], False, "", """
   uint32_t mask = 0xFFFFFFFF;
   uint8_t shift = src2 & 0x7F;
   if (src3 != 0) {
      mask = (1 << src3) - 1;
   }
   if (shift >= 64) {
      dst = 0;
   } else {
      dst = (((((uint64_t) src1) << 32) | (uint64_t) src0) >> shift) & mask;
   }
""");

# AGX multiply-shift-add. Corresponds to iadd/isub/imad/imsub instructions.
# The shift must be <= 4 (domain restriction). For performance, it should be
# constant.
opcode("imadshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", f"(src0 * src1) + (src2 << src3)")
opcode("imsubshl_agx", 0, tint, [0, 0, 0, 0], [tint, tint, tint, tint], False,
       "", f"(src0 * src1) - (src2 << src3)")

# Address arithmetic instructions: extend, shift, and add
# Shift must be a small constant.
opcode("ilea_agx", 0, tuint64, [0, 0, 0], [tuint64, tint32, tuint32], False,
       "", f"src0 + (((int64_t)src1) << src2)")
opcode("ulea_agx", 0, tuint64, [0, 0, 0], [tuint64, tuint32, tuint32], False,
       "", f"src0 + (((uint64_t)src1) << src2)")

# Bounds check instruction.
#
# Sources: <data, end offset, bounds>
opcode("bounds_agx", 0, tint, [0, 0, 0],
       [tint, tint, tint], False,
       "", "src1 <= src2 ? src0 : 0")

binop_convert("interleave_agx", tuint32, tuint16, "", """
      dst = 0;
      for (unsigned bit = 0; bit < 16; bit++) {
         dst |= (src0 & (1 << bit)) << bit;
         dst |= (src1 & (1 << bit)) << (bit + 1);
      }""", description="""
      Interleave bits of 16-bit integers to calculate a 32-bit integer. This can
      be used as-is for Morton encoding.
      """)

# These are like fmin/fmax, but do not flush denorms on the output which is why
# they're modeled as conversions. AGX flushes fp32 denorms but preserves fp16
# denorms, so fp16 fmin/fmax work without lowering.
binop_convert("fmin_agx", tuint32, tfloat32, _2src_commutative + associative,
              "(src0 < src1 || isnan(src1)) ? src0 : src1")
binop_convert("fmax_agx", tuint32, tfloat32, _2src_commutative + associative,
              "(src0 > src1 || isnan(src1)) ? src0 : src1")

# NVIDIA PRMT
opcode("prmt_nv", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32],
       False, "", """
   dst = 0;
   for (unsigned i = 0; i < 4; i++) {
      uint8_t byte = (src0 >> (i * 4)) & 0x7;
      uint8_t x = byte < 4 ? (src1 >> (byte * 8))
                           : (src2 >> ((byte - 4) * 8));
      if ((src0 >> (i * 4)) & 0x8)
         x = ((int8_t)x) >> 7;
      dst |= ((uint32_t)x) << i * 8;
   }""")

# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
binop("umul24", tint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")

# relaxed versions of the above, which assume input is in the 24bit range (no clamping)
binop("imul24_relaxed", tint32, _2src_commutative + associative, "src0 * src1")
triop("umad24_relaxed", tuint32, _2src_commutative, "src0 * src1 + src2")
binop("umul24_relaxed", tuint32, _2src_commutative + associative, "src0 * src1")

unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")
unop_convert("fisfinite32", tbool32, tfloat, "isfinite(src0)")

# vc4-specific opcodes

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8_vc4", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8_vc4", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8_vc4", tuint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   uint32_t src0_chan = (src0 >> i) & 0xff;
   uint32_t src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# v3d-specific opcodes

# v3d-specific (v71) instruction that packs bits of 2 2x16 floating point into
# r11g11b10 bits, rounding to nearest even, so
#  dst[10:0]  = float16_to_float11 (src0[15:0])
#  dst[21:11] = float16_to_float11 (src0[31:16])
#  dst[31:22] = float16_to_float10 (src1[15:0])
binop_convert("pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
              "pack_32_to_r11g11b10_v3d(src0, src1)")

# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
# difference from pack_32_2x16_split is that the sources are 32-bit too: it
# receives two 32-bit integers and packs their lower halfwords as 2x16 into a
# 32-bit integer.
binop_horiz("pack_2x32_to_2x16_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
            "(src0.x & 0xffff) | (src1.x << 16)")

# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
# r10g10b10a2:
# dst[9:0]   = src0[9:0]
# dst[19:10] = src0[25:16]
# dst[29:20] = src1[9:0]
# dst[31:30] = src1[17:16]
binop_convert("pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
              "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")

# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
# dst[7:0]   = src0[7:0]
# dst[15:8]  = src0[23:16]
# dst[23:16] = src1[7:0]
# dst[31:24] = src1[23:16]
opcode("pack_4x16_to_4x8_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
       False, "",
       "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")

# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
unop("pack_2x16_to_unorm_2x8_v3d", tuint32,
     "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
unop("pack_2x16_to_snorm_2x8_v3d", tuint32,
     "_mesa_half_to_snorm(src0 & 0xffff, 8) | ((uint32_t)(_mesa_half_to_snorm(src0 >> 16, 8)) << 16)")

# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
unop("f2unorm_16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
unop("f2snorm_16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")

# v3d-specific (v71) instruction to convert 2x16 bit floating point to 2x10 bit unorm
unop("pack_2x16_to_unorm_2x10_v3d", tuint32, "pack_2x16_to_unorm_2x10(src0)")

# v3d-specific (v71) instruction to convert 2x16 bit floating point to one 2-bit
# and one 10-bit unorm
unop("pack_2x16_to_unorm_10_2_v3d", tuint32, "pack_2x16_to_unorm_10_2(src0)")

# These opcodes are used by both Mali and V3D
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))

opcode("b32fcsel_mdg", 0, tuint, [0, 0, 0],
       [tbool32, tfloat, tfloat], False, selection, "src0 ? src1 : src2",
       description = csel_description.format("a 32-bit", "0 vs ~0") + """
       This Midgard-specific variant takes floating-point sources rather than
       integer sources, which enables floating-point modifier support in the
       backend.
       """)

# DXIL-specific double [un]pack
# DXIL doesn't support generic [un]pack instructions, so we want those
# lowered to bit ops. HLSL doesn't support 64-bit bitcasts to/from
# double, only [un]pack. Technically DXIL does, but since they can't be
# generated from HLSL, we want to match what would be coming from DXC.
# This is essentially just the standard [un]pack, except that it doesn't get
# lowered, so we can handle it in the backend and turn it into
# MakeDouble/SplitDouble.
unop_horiz("pack_double_2x32_dxil", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")
unop_horiz("unpack_double_2x32_dxil", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")
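
# As a quick sanity check of the bit layout described above, an illustrative
# Python round trip (the _ref_* helpers are hypothetical and not used by the
# generator; the .x/.y components are modeled as separate arguments/results):
def _ref_pack_double_2x32_dxil(x, y):
    """Model of dst.x = src0.x | ((uint64_t)src0.y << 32)."""
    return ((y & 0xffffffff) << 32) | (x & 0xffffffff)

def _ref_unpack_double_2x32_dxil(v):
    """Model of dst.x = low 32 bits, dst.y = high 32 bits."""
    return v & 0xffffffff, (v >> 32) & 0xffffffff

# Round trip: _ref_unpack_double_2x32_dxil(_ref_pack_double_2x32_dxil(a, b))
# == (a, b) for any 32-bit a and b.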
1562opcode("sdot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1563 False, _2src_commutative, """ 1564 const int32_t v0x = (int8_t)(src0 ); 1565 const int32_t v0y = (int8_t)(src0 >> 8); 1566 const int32_t v0z = (int8_t)(src0 >> 16); 1567 const int32_t v0w = (int8_t)(src0 >> 24); 1568 const int32_t v1x = (int8_t)(src1 ); 1569 const int32_t v1y = (int8_t)(src1 >> 8); 1570 const int32_t v1z = (int8_t)(src1 >> 16); 1571 const int32_t v1w = (int8_t)(src1 >> 24); 1572 1573 dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; 1574""") 1575 1576# Like sdot_4x8_iadd, but unsigned. 1577opcode("udot_4x8_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], 1578 False, _2src_commutative, """ 1579 const uint32_t v0x = (uint8_t)(src0 ); 1580 const uint32_t v0y = (uint8_t)(src0 >> 8); 1581 const uint32_t v0z = (uint8_t)(src0 >> 16); 1582 const uint32_t v0w = (uint8_t)(src0 >> 24); 1583 const uint32_t v1x = (uint8_t)(src1 ); 1584 const uint32_t v1y = (uint8_t)(src1 >> 8); 1585 const uint32_t v1z = (uint8_t)(src1 >> 16); 1586 const uint32_t v1w = (uint8_t)(src1 >> 24); 1587 1588 dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; 1589""") 1590 1591# src0 is i8vec4 packed in an int32, src1 is u8vec4 packed in an int32, and 1592# src2 is an int32. The 8-bit components are extended to 32-bits, and a 1593# dot-product is performed on the resulting vectors. src2 is added to the 1594# result of the dot-product. 1595# 1596# NOTE: Unlike many of the other dp4a opcodes, this mixed signs of source 0 1597# and source 1 mean that this opcode is not 2-source commutative 1598opcode("sudot_4x8_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1599 False, "", """ 1600 const int32_t v0x = (int8_t)(src0 ); 1601 const int32_t v0y = (int8_t)(src0 >> 8); 1602 const int32_t v0z = (int8_t)(src0 >> 16); 1603 const int32_t v0w = (int8_t)(src0 >> 24); 1604 const uint32_t v1x = (uint8_t)(src1 ); 1605 const uint32_t v1y = (uint8_t)(src1 >> 8); 1606 const uint32_t v1z = (uint8_t)(src1 >> 16); 1607 const uint32_t v1w = (uint8_t)(src1 >> 24); 1608 1609 dst = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; 1610""") 1611 1612# Like sdot_4x8_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff]. 1613opcode("sdot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1614 False, _2src_commutative, """ 1615 const int64_t v0x = (int8_t)(src0 ); 1616 const int64_t v0y = (int8_t)(src0 >> 8); 1617 const int64_t v0z = (int8_t)(src0 >> 16); 1618 const int64_t v0w = (int8_t)(src0 >> 24); 1619 const int64_t v1x = (int8_t)(src1 ); 1620 const int64_t v1y = (int8_t)(src1 >> 8); 1621 const int64_t v1z = (int8_t)(src1 >> 16); 1622 const int64_t v1w = (int8_t)(src1 >> 24); 1623 1624 const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; 1625 1626 dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp); 1627""") 1628 1629# Like udot_4x8_uadd, but the result is clampled to the range [0, 0xfffffffff]. 
1630opcode("udot_4x8_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1631 False, _2src_commutative, """ 1632 const uint64_t v0x = (uint8_t)(src0 ); 1633 const uint64_t v0y = (uint8_t)(src0 >> 8); 1634 const uint64_t v0z = (uint8_t)(src0 >> 16); 1635 const uint64_t v0w = (uint8_t)(src0 >> 24); 1636 const uint64_t v1x = (uint8_t)(src1 ); 1637 const uint64_t v1y = (uint8_t)(src1 >> 8); 1638 const uint64_t v1z = (uint8_t)(src1 >> 16); 1639 const uint64_t v1w = (uint8_t)(src1 >> 24); 1640 1641 const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; 1642 1643 dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp; 1644""") 1645 1646# Like sudot_4x8_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff]. 1647# 1648# NOTE: Unlike many of the other dp4a opcodes, this mixed signs of source 0 1649# and source 1 mean that this opcode is not 2-source commutative 1650opcode("sudot_4x8_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1651 False, "", """ 1652 const int64_t v0x = (int8_t)(src0 ); 1653 const int64_t v0y = (int8_t)(src0 >> 8); 1654 const int64_t v0z = (int8_t)(src0 >> 16); 1655 const int64_t v0w = (int8_t)(src0 >> 24); 1656 const uint64_t v1x = (uint8_t)(src1 ); 1657 const uint64_t v1y = (uint8_t)(src1 >> 8); 1658 const uint64_t v1z = (uint8_t)(src1 >> 16); 1659 const uint64_t v1w = (uint8_t)(src1 >> 24); 1660 1661 const int64_t tmp = (v0x * v1x) + (v0y * v1y) + (v0z * v1z) + (v0w * v1w) + src2; 1662 1663 dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp); 1664""") 1665 1666# src0 and src1 are i16vec2 packed in an int32, and src2 is an int32. The int16 1667# components are sign-extended to 32-bits, and a dot-product is performed on 1668# the resulting vectors. src2 is added to the result of the dot-product. 1669opcode("sdot_2x16_iadd", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1670 False, _2src_commutative, """ 1671 const int32_t v0x = (int16_t)(src0 ); 1672 const int32_t v0y = (int16_t)(src0 >> 16); 1673 const int32_t v1x = (int16_t)(src1 ); 1674 const int32_t v1y = (int16_t)(src1 >> 16); 1675 1676 dst = (v0x * v1x) + (v0y * v1y) + src2; 1677""") 1678 1679# Like sdot_2x16_iadd, but unsigned. 1680opcode("udot_2x16_uadd", 0, tuint32, [0, 0, 0], [tuint32, tuint32, tuint32], 1681 False, _2src_commutative, """ 1682 const uint32_t v0x = (uint16_t)(src0 ); 1683 const uint32_t v0y = (uint16_t)(src0 >> 16); 1684 const uint32_t v1x = (uint16_t)(src1 ); 1685 const uint32_t v1y = (uint16_t)(src1 >> 16); 1686 1687 dst = (v0x * v1x) + (v0y * v1y) + src2; 1688""") 1689 1690# Like sdot_2x16_iadd, but the result is clampled to the range [-0x80000000, 0x7ffffffff]. 1691opcode("sdot_2x16_iadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1692 False, _2src_commutative, """ 1693 const int64_t v0x = (int16_t)(src0 ); 1694 const int64_t v0y = (int16_t)(src0 >> 16); 1695 const int64_t v1x = (int16_t)(src1 ); 1696 const int64_t v1y = (int16_t)(src1 >> 16); 1697 1698 const int64_t tmp = (v0x * v1x) + (v0y * v1y) + src2; 1699 1700 dst = tmp >= INT32_MAX ? INT32_MAX : (tmp <= INT32_MIN ? INT32_MIN : tmp); 1701""") 1702 1703# Like udot_2x16_uadd, but the result is clampled to the range [0, 0xfffffffff]. 
1704opcode("udot_2x16_uadd_sat", 0, tint32, [0, 0, 0], [tuint32, tuint32, tint32], 1705 False, _2src_commutative, """ 1706 const uint64_t v0x = (uint16_t)(src0 ); 1707 const uint64_t v0y = (uint16_t)(src0 >> 16); 1708 const uint64_t v1x = (uint16_t)(src1 ); 1709 const uint64_t v1y = (uint16_t)(src1 >> 16); 1710 1711 const uint64_t tmp = (v0x * v1x) + (v0y * v1y) + src2; 1712 1713 dst = tmp >= UINT32_MAX ? UINT32_MAX : tmp; 1714""") 1715