1# 2# Copyright (C) 2014 Connor Abbott 3# 4# Permission is hereby granted, free of charge, to any person obtaining a 5# copy of this software and associated documentation files (the "Software"), 6# to deal in the Software without restriction, including without limitation 7# the rights to use, copy, modify, merge, publish, distribute, sublicense, 8# and/or sell copies of the Software, and to permit persons to whom the 9# Software is furnished to do so, subject to the following conditions: 10# 11# The above copyright notice and this permission notice (including the next 12# paragraph) shall be included in all copies or substantial portions of the 13# Software. 14# 15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21# IN THE SOFTWARE. 
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)

import re


class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, is_conversion, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes must be 0..4, 8 or 16;
          if output_size is non-zero, no input size may be zero
        - all types are strings that get nir_type_ prepended to them
        - input_types is a list of types, one per entry of input_sizes
        - is_conversion is true if this opcode represents a type conversion
        - algebraic_properties is a space-separated string, where nir_op_is_ is
          prepended before each entry
        - const_expr is an expression or series of statements that computes the
          constant value of the opcode given the constant values of its inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component input
        and output variables will be scalars and non-per-component input and
        output variables will be a struct with fields named x, y, z, and w
        all of the correct type.  Input and output variables can be assumed
        to already be of the correct type and need no conversion.  In
        particular, the conversion from the C bool type to/from NIR_TRUE and
        NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_types, list)
        # An opcode always has at least one source; state that explicitly
        # instead of failing with an IndexError while probing element 0.
        assert input_sizes, 'Opcode "{}" has no sources'.format(name)
        # Check every element, not just the first one.
        assert all(isinstance(size, int) for size in input_sizes)
        assert all(isinstance(type_, str) for type_ in input_types)
        assert isinstance(is_conversion, bool)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4 or (output_size == 8) or (output_size == 16)
        for size in input_sizes:
            assert 0 <= size <= 4 or (size == 8) or (size == 16)
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.is_conversion = is_conversion
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr


# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool"
tbool1 = "bool1"
tbool8 = "bool8"
tbool16 = "bool16"
tbool32 = "bool32"
tuint = "uint"
tuint8 = "uint8"
tint16 = "int16"
tuint16 = "uint16"
tfloat16 = "float16"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tint64 = "int64"
tuint64 = "uint64"
tfloat64 = "float64"

_TYPE_SPLIT_RE = re.compile(r'(?P<type>int|uint|float|bool)(?P<bits>\d+)?')


def type_has_size(type_):
    """Return True if the NIR type string carries an explicit bit size."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('bits') is not None


def type_size(type_):
    """Return the bit size of a sized NIR type string (e.g. 32 for "uint32")."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    assert m.group('bits') is not None, \
        'NIR type string has no bit size: "{}"'.format(type_)
    return int(m.group('bits'))


def type_sizes(type_):
    """Return the list of bit sizes a NIR type string can take.

    A sized type yields just its own size; unsized types yield the fixed
    lists below.
    """
    if type_has_size(type_):
        return [type_size(type_)]
    elif type_ == 'bool':
        return [1, 8, 16, 32]
    elif type_ == 'float':
        return [16, 32, 64]
    else:
        return [1, 8, 16, 32, 64]


def type_base_type(type_):
    """Return the base type ("int", "uint", "float" or "bool") of a type string."""
    m = _TYPE_SPLIT_RE.match(type_)
    assert m is not None, 'Invalid NIR type string: "{}"'.format(type_)
    return m.group('type')


# Operation where the first two sources are commutative.
#
# For 2-source operations, this is just mathematical commutativity.  Some
# 3-source operations, like ffma, are only commutative in the first two
# sources.
_2src_commutative = "2src_commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}


def opcode(name, output_size, output_type, input_sizes, input_types,
           is_conversion, algebraic_properties, const_expr):
    """Create an Opcode and register it in the global opcodes dictionary.

    Registering the same name twice is a programming error and asserts.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, is_conversion, algebraic_properties,
                           const_expr)


def unop_convert(name, out_type, in_type, const_expr):
    """Register a per-component unary opcode whose output type differs from its input."""
    opcode(name, 0, out_type, [0], [in_type], False, "", const_expr)


def unop(name, ty, const_expr):
    """Register a per-component unary opcode with identical input and output type."""
    opcode(name, 0, ty, [0], [ty], False, "", const_expr)


def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a unary opcode with explicit input and output vector sizes."""
    opcode(name, output_size, output_type, [input_size], [input_type],
           False, "", const_expr)


def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register the 2-, 3- and 4-component variants of a unary reduction.

    prereduce_expr is applied to each component ({src}), reduce_expr combines
    two partial results ({src0}, {src1}) and final_expr post-processes the
    fully reduced value ({src}).
    """
    def prereduce(src):
        return "(" + prereduce_expr.format(src=src) + ")"

    def final(src):
        return final_expr.format(src="(" + src + ")")

    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)

    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))


def unop_numeric_convert(name, out_type, in_type, const_expr):
    """Register a per-component unary opcode flagged as a type conversion."""
    opcode(name, 0, out_type, [0], [in_type], True, "", const_expr)


unop("mov", tuint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "fabs(src0)")
unop("fsat", tfloat, ("fmin(fmax(src0, 0.0), 1.0)"))
unop("fsat_signed", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
unop("fclamp_pos", tfloat, ("fmax(src0, 0.0)"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")


def _f2f16_round_expr(rnd_mode):
    """Return the f2f16 constant expression for an explicit rounding suffix.

    rnd_mode is the opcode suffix ("_rtne" or "_rtz"); it selects the matching
    _mesa_float_to_float16<suffix>() helper in the generated C code.
    """
    return """
if (bit_size > 16) {
   dst = _mesa_half_to_float(_mesa_float_to_float16""" + rnd_mode + """(src0));
} else {
   dst = src0;
}
"""


# Generate all of the numeric conversion opcodes
for src_t, dst_types in [
        (tint, [tfloat, tint, tbool]),
        (tuint, [tfloat, tuint]),
        (tfloat, [tint, tuint, tfloat, tbool]),
        (tbool, [tfloat, tint, tbool]),
]:
    for dst_t in dst_types:
        for dst_bit_size in type_sizes(dst_t):
            conv_name = "{0}2{1}{2}".format(src_t[0], dst_t[0], dst_bit_size)
            sized_dst_t = dst_t + str(dst_bit_size)
            if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
                # f2f16 comes in three flavours: two with an explicit
                # rounding mode and one plain variant.
                for rnd_mode in ['_rtne', '_rtz', '']:
                    if rnd_mode:
                        conv_expr = _f2f16_round_expr(rnd_mode)
                    else:
                        conv_expr = "src0"
                    unop_numeric_convert(conv_name + rnd_mode, sized_dst_t,
                                         src_t, conv_expr)
            elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
                conv_expr = """
if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
   dst = _mesa_double_to_float_rtz(src0);
} else {
   dst = src0;
}
"""
                unop_numeric_convert(conv_name, sized_dst_t, src_t, conv_expr)
            else:
                conv_expr = "src0 != 0" if dst_t == tbool else "src0"
                unop_numeric_convert(conv_name, sized_dst_t, src_t, conv_expr)

# Special opcode that is the same as f2f16, i2i16, u2u16 except that it is safe
# to remove it if the result is immediately converted back to 32 bits again.
# This is generated as part of the precision lowering pass. mp stands for medium
# precision.
unop_numeric_convert("f2fmp", tfloat16, tfloat32, opcodes["f2f16"].const_expr)
unop_numeric_convert("i2imp", tint16, tint32, opcodes["i2i16"].const_expr)
# u2ump isn't defined, because the behavior is equal to i2imp
unop_numeric_convert("f2imp", tint16, tfloat32, opcodes["f2i16"].const_expr)
unop_numeric_convert("f2ump", tuint16, tfloat32, opcodes["f2u16"].const_expr)
unop_numeric_convert("i2fmp", tfloat16, tint32, opcodes["i2f16"].const_expr)
unop_numeric_convert("u2fmp", tfloat16, tuint32, opcodes["u2f16"].const_expr)

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp
unop_convert("frexp_exp", tint32, tfloat, "frexp(src0, &dst);")
unop_convert("frexp_sig", tfloat, tfloat, "int n; dst = frexp(src0, &n);")

# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")


# Floating point pack and unpack operations.

def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components packed into one uint32.

    The x component lands in the low 16 bits and y in the high 16 bits; the
    "fmt" placeholder in the C template is replaced by the format name.
    """
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))


def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components packed into one uint32.

    Components x, y, z, w occupy bytes 0..3 (low to high) of the result.
    """
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))


def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 unpacked into two float32 values.

    x comes from the low 16 bits and y from the high 16 bits, mirroring
    pack_2x16.
    """
    # The high half must be shifted *down*; shifting left by 16 and truncating
    # to uint16_t would always yield 0.
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))


def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 unpacked into four float32 values.

    Bytes 0..3 (low to high) of the source become x, y, z, w.
    """
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_32_4x8", 1, tuint32, 4, tuint8,
           "dst.x = src0.x | ((uint32_t)src0.y << 8) | ((uint32_t)src0.z << 16) | ((uint32_t)src0.w << 24);")

unop_horiz("pack_32_2x16", 1, tuint32, 2, tuint16,
           "dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", 1, tuint64, 4, tuint16,
           "dst.x = src0.x | ((uint64_t)src0.y << 16) | ((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# The source has a single component, so every output lane reads src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")

unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

unop_horiz("unpack_32_4x8", 4, tuint8, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 8; dst.z = src0.x >> 16; dst.w = src0.x >> 24;")

# As in unpack_2x16, the high half is obtained with a right shift.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")

# Lowered floating point unpacking operations.

unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")

unop_convert("unpack_64_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_64_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop_convert("bit_count", tuint32, tuint, """
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

unop_convert("ufind_msb", tint32, tuint, """
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop("uclz", tuint32, """
int bit;
for (bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 & (1u << bit)) != 0)
      break;
}
dst = (unsigned)(31 - bit);
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop_convert("find_lsb", tint32, tint, """
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Extended instructions from AMD_gcn_shader
unop_horiz("cube_face_coord", 2, tfloat32, 3, tfloat32, """
dst.x = dst.y = 0.0;
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);

float ma = 0.0;
if (absX >= absY && absX >= absZ) { ma = 2 * src0.x; }
if (absY >= absX && absY >= absZ) { ma = 2 * src0.y; }
if (absZ >= absX && absZ >= absY) { ma = 2 * src0.z; }

if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.z; dst.y = -src0.y; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = src0.z; dst.y = -src0.y; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = src0.z; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = src0.x; dst.y = -src0.z; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = src0.x; dst.y = -src0.y; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.x; dst.y = -src0.y; }

dst.x = dst.x * (1.0f / ma) + 0.5f;
dst.y = dst.y * (1.0f / ma) + 0.5f;
""")

unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
float absX = fabsf(src0.x);
float absY = fabsf(src0.y);
float absZ = fabsf(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")

# Horizontal add of all vector components
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")


def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a per-component binary opcode; both sources share in_type."""
    source_sizes = [0, 0]
    source_types = [in_type, in_type]
    opcode(name, 0, out_type, source_sizes, source_types,
           False, alg_props, const_expr)


def binop(name, ty, alg_props, const_expr):
    """Register a per-component binary opcode whose sources and result share ty."""
    binop_convert(name, ty, ty, alg_props, const_expr)


def binop_compare(name, ty, alg_props, const_expr):
    """Register a binary comparison producing a 1-bit boolean."""
    binop_convert(name, tbool1, ty, alg_props, const_expr)


def binop_compare8(name, ty, alg_props, const_expr):
    """Register a binary comparison producing an 8-bit boolean."""
    binop_convert(name, tbool8, ty, alg_props, const_expr)


def binop_compare16(name, ty, alg_props, const_expr):
    """Register a binary comparison producing a 16-bit boolean."""
    binop_convert(name, tbool16, ty, alg_props, const_expr)


def binop_compare32(name, ty, alg_props, const_expr):
    """Register a binary comparison producing a 32-bit boolean."""
    binop_convert(name, tbool32, ty, alg_props, const_expr)


def binop_compare_all_sizes(name, ty, alg_props, const_expr):
    """Register a comparison once per boolean width.

    The 1-bit variant keeps the plain name; the others get the width
    ("8", "16", "32") appended.
    """
    binop_compare(name, ty, alg_props, const_expr)
    sized_variants = (
        ("8", binop_compare8),
        ("16", binop_compare16),
        ("32", binop_compare32),
    )
    for width, register in sized_variants:
        register(name + width, ty, alg_props, const_expr)


def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a binary opcode with explicit vector sizes and no algebraic properties."""
    sizes = [src1_size, src2_size]
    types = [src1_type, src2_type]
    opcode(name, out_size, out_type, sizes, types, False, "", const_expr)


def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr, suffix=""):
    """Register the 2, 4, 8, 16 and 3-component variants of a binary reduction.

    prereduce_expr combines matching components of both sources
    ({src0}, {src1}), reduce_expr merges two partial results and final_expr
    post-processes the fully reduced value ({src}).  Every variant is
    registered as 2src_commutative under name + <size> + suffix.
    """
    def combine(lhs, rhs):
        return reduce_expr.format(src0=lhs, src1=rhs)

    def finish(expr):
        return final_expr.format(src="(" + expr + ")")

    # One parenthesised per-component term for each of the 16 possible lanes.
    terms = []
    for component in "xyzwefghijklmnop":
        per_component = prereduce_expr.format(src0="src0." + component,
                                              src1="src1." + component)
        terms.append("(" + per_component + ")")

    def fold(first, count):
        # Tree reduction over terms[first : first + count], upper half first.
        if count == 1:
            return terms[first]
        half = count // 2
        return combine(fold(first + half, half), fold(first, half))

    for width in (2, 4, 8, 16):
        opcode(name + str(width) + suffix, output_size, output_type,
               [width, width], [src_type, src_type], False, _2src_commutative,
               finish(fold(0, width)))

    # Three components do not split evenly, so they are chained by hand.
    three_way = combine(combine(terms[2], terms[1]), terms[0])
    opcode(name + "3" + suffix, output_size, output_type,
           [3, 3], [src_type, src_type], False, _2src_commutative,
           finish(three_way))


def binop_reduce_all_sizes(name, output_size, src_type, prereduce_expr,
                           reduce_expr, final_expr):
    """Register a boolean reduction once per boolean width.

    The 1-bit variant keeps name as given; for the 8/16/32-bit variants the
    first character of name is replaced by "b8", "b16" or "b32".
    """
    stem = name[1:]
    variants = (
        (name, tbool1),
        ("b8" + stem, tbool8),
        ("b16" + stem, tbool16),
        ("b32" + stem, tbool32),
    )
    for variant_name, bool_type in variants:
        binop_reduce(variant_name, output_size, bool_type, src_type,
                     prereduce_expr, reduce_expr, final_expr)


binop("fadd", tfloat, _2src_commutative + associative,"""
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
binop("iadd_sat", tint, _2src_commutative, """
      src1 > 0 ?
         (src0 + src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 + src1) :
         (src0 < src0 + src1 ? (1ull << (bit_size - 1))     : src0 + src1)
""")
binop("uadd_sat", tuint, _2src_commutative,
      "(src0 + src1) < src0 ? MAX_UINT_FOR_SIZE(sizeof(src0) * 8) : (src0 + src1)")
binop("isub_sat", tint, "", """
      src1 < 0 ?
         (src0 - src1 < src0 ? (1ull << (bit_size - 1)) - 1 : src0 - src1) :
         (src0 < src0 - src1 ? (1ull << (bit_size - 1))     : src0 - src1)
""")
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
binop("isub", tint, "", "src0 - src1")
binop_convert("uabs_isub", tuint, tint, "", """
              src1 > src0 ? (uint64_t) src1 - (uint64_t) src0
                          : (uint64_t) src0 - (uint64_t) src1
""")
binop("uabs_usub", tuint, "", "(src1 > src0) ? (src1 - src0) : (src0 - src1)")

binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, _2src_commutative + associative, """
   /* Use 64-bit multiplies to prevent overflow of signed arithmetic */
   dst = (uint64_t)src0 * (uint64_t)src1;
""")

# Full 64-bit product of two 32-bit quantities
binop_convert("imul_2x32_64", tint64, tint32, _2src_commutative,
              "(int64_t)src0 * (int64_t)src1")
binop_convert("umul_2x32_64", tuint64, tuint32, _2src_commutative,
              "(uint64_t)src0 * (uint64_t)src1")

# high 32-bits of signed integer multiply
binop("imul_high", tint, _2src_commutative, """
if (bit_size == 64) {
   /* We need to do a full 128-bit x 128-bit multiply in order for the sign
    * extension to work properly. The casts are kind-of annoying but needed
    * to prevent compiler warnings.
    */
   uint32_t src0_u32[4] = {
      src0,
      (int64_t)src0 >> 32,
      (int64_t)src0 >> 63,
      (int64_t)src0 >> 63,
   };
   uint32_t src1_u32[4] = {
      src1,
      (int64_t)src1 >> 32,
      (int64_t)src1 >> 63,
      (int64_t)src1 >> 63,
   };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   /* First, sign-extend to 64-bit, then convert to unsigned to prevent
    * potential overflow of signed multiply */
   dst = ((uint64_t)(int64_t)src0 * (uint64_t)(int64_t)src1) >> bit_size;
}
""")

# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, _2src_commutative, """
if (bit_size == 64) {
   /* The casts are kind-of annoying but needed to prevent compiler warnings. */
   uint32_t src0_u32[2] = { src0, (uint64_t)src0 >> 32 };
   uint32_t src1_u32[2] = { src1, (uint64_t)src1 >> 32 };
   uint32_t prod_u32[4];
   ubm_mul_u32arr(prod_u32, src0_u32, src1_u32);
   dst = (uint64_t)prod_u32[2] | ((uint64_t)prod_u32[3] << 32);
} else {
   dst = ((uint64_t)src0 * (uint64_t)src1) >> bit_size;
}
""")

# low 32-bits of unsigned integer multiply
binop("umul_low", tuint32, _2src_commutative, """
uint64_t mask = (1 << (bit_size / 2)) - 1;
dst = ((uint64_t)src0 & mask) * ((uint64_t)src1 & mask);
""")

# Multiply 32-bits with low 16-bits.
binop("imul_32x16", tint32, "", "src0 * (int16_t) src1")
binop("umul_32x16", tuint32, "", "src0 * (uint16_t) src1")

binop("fdiv", tfloat, "", "src0 / src1")
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, _2src_commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

# hadd: (a + b) >> 1 (without overflow)
# x + y = x - (x & ~y) + (x & ~y) + y - (~x & y) + (~x & y)
#       =      (x & y) + (x & ~y) + (x & y) + (~x & y)
#       = 2 *  (x & y) + (x & ~y) +           (~x & y)
#       =     ((x & y) << 1) + (x ^ y)
#
# Since we know that the bottom bit of (x & y) << 1 is zero,
#
#      (x + y) >> 1 = (((x & y) << 1) + (x ^ y)) >> 1
#                   =   (x & y) +      ((x ^ y)  >> 1)
binop("ihadd", tint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")
binop("uhadd", tuint, _2src_commutative, "(src0 & src1) + ((src0 ^ src1) >> 1)")

# rhadd: (a + b + 1) >> 1 (without overflow)
# x + y + 1 = x + (~x & y) - (~x & y) + y + (x & ~y) - (x & ~y) + 1
#           =      (x | y) - (~x & y) +      (x | y) - (x & ~y) + 1
#           = 2 *  (x | y) - ((~x & y) +               (x & ~y)) + 1
#           =     ((x | y) << 1) - (x ^ y) + 1
#
# Since we know that the bottom bit of (x | y) << 1 is zero,
#
#    (x + y + 1) >> 1 = (x | y) + ((-(x ^ y) + 1) >> 1)
#                     = (x | y) - ((x ^ y) >> 1)
#
# Note the subtraction: e.g. rhadd(3, 0) = (3 | 0) - (3 >> 1) = 2.
binop("irhadd", tint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")
binop("urhadd", tuint, _2src_commutative, "(src0 | src1) - ((src0 ^ src1) >> 1)")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      "                 src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare_all_sizes("flt", tfloat, "", "src0 < src1")
binop_compare_all_sizes("fge", tfloat, "", "src0 >= src1")
binop_compare_all_sizes("feq", tfloat, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("fneu", tfloat, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ilt", tint, "", "src0 < src1")
binop_compare_all_sizes("ige", tint, "", "src0 >= src1")
binop_compare_all_sizes("ieq", tint, _2src_commutative, "src0 == src1")
binop_compare_all_sizes("ine", tint, _2src_commutative, "src0 != src1")
binop_compare_all_sizes("ult", tuint, "", "src0 < src1")
binop_compare_all_sizes("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce_all_sizes("ball_fequal",  1, tfloat, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_fnequal", 1, tfloat, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")
binop_reduce_all_sizes("ball_iequal",  1, tint, "{src0} == {src1}",
                       "{src0} && {src1}", "{src}")
binop_reduce_all_sizes("bany_inequal", 1, tint, "{src0} != {src1}",
                       "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, _2src_commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, _2src_commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal

# SPIRV shifts are undefined for shift-operands >= bitsize,
# but SM5 shifts are defined to use the least significant bits, only
# The NIR definition is according to the SM5 specification.
opcode("ishl", 0, tint, [0, 0], [tint, tuint32], False, "",
       "(uint64_t)src0 << (src1 & (sizeof(src0) * 8 - 1))")
opcode("ishr", 0, tint, [0, 0], [tint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, alg_props=_2src_commutative + associative,
      const_expr="src0 & src1")
binop("ior", tuint, alg_props=_2src_commutative + associative,
      const_expr="src0 | src1")
binop("ixor", tuint, alg_props=_2src_commutative + associative,
      const_expr="src0 ^ src1")


# Dot product: multiply matching components, then sum the products.
binop_reduce("fdot", 1, tfloat, tfloat,
             prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}",
             final_expr="{src}")

# Same reduction, registered with a 4-component output and a "_replicated"
# name suffix.
binop_reduce("fdot", 4, tfloat, tfloat,
             prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}",
             final_expr="{src}",
             suffix="_replicated")

opcode("fdph", output_size=1, output_type=tfloat,
       input_sizes=[3, 4], input_types=[tfloat, tfloat],
       is_conversion=False, algebraic_properties="",
       const_expr="src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", output_size=4, output_type=tfloat,
       input_sizes=[3, 4], input_types=[tfloat, tfloat],
       is_conversion=False, algebraic_properties="",
       const_expr="src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

binop("fmin", tfloat, alg_props=_2src_commutative + associative,
      const_expr="fmin(src0, src1)")
binop("imin", tint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop("umin", tuint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop("fmax", tfloat, alg_props=_2src_commutative + associative,
      const_expr="fmax(src0, src1)")
binop("imax", tint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")
binop("umax", tuint, alg_props=_2src_commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")

# Per-byte saturating add over the four 8-bit lanes of a 32-bit value.
binop("usadd_4x8", tint32, alg_props=_2src_commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Per-byte saturating subtract over the four 8-bit lanes of a 32-bit value.
binop("ussub_4x8", tint32, alg_props="", const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255, applied to each of the 4 8bit lanes.
binop("umul_unorm_4x8", tint32, _2src_commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# Use the double-precision pow() only for 64-bit operands and powf() for the
# narrower sizes, matching how ldexp below selects ldexp()/ldexpf().  (The
# two calls used to be swapped, which evaluated 64-bit fpow in single
# precision.)
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

# Packs two 32-bit floats as half floats: src0 in the low 16 bits, src1 in
# the high 16 bits.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

# src0 supplies the low 32 bits and src1 the high 32 bits of the result.
binop_convert("pack_64_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# src0 supplies the low 16 bits and src1 the high 16 bits of the result.
binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
              "src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, the bits and offset values
# are from the low five bits of src0 and src1, respectively.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0 & 0x1F;
int offset = src1 & 0x1F;
dst = ((1u << bits) - 1) << offset;
""")

# NOTE(review): isnormal() is false for zero, infinity and NaN as well as for
# denormals, so those results are also replaced by a zero carrying the sign
# of src0 -- confirm that this is intended.
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction: src1 selects which byte of src0 to return.
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction: src1 selects which 16-bit word of src0 to return.
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")


def triop(name, ty, alg_props, const_expr):
   """Register a three-source opcode whose sources and destination all have
   type ``ty``.

   Every size passed to opcode() is 0, and ``alg_props`` and ``const_expr``
   are forwarded to it unchanged.
   """
   opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], False, alg_props, const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Register a three-source tuint opcode with explicit output and source
   sizes and no algebraic properties.
   """
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size],
          [tuint, tuint, tuint], False, "", const_expr)

# Fused multiply-add.  The round-toward-zero helpers are used when
# nir_is_rounding_mode_rtz() reports that mode for this bit size; otherwise
# the libm fma()/fmaf() functions are used.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")

# Linear interpolation between src0 and src1, weighted by src2.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).

triop("fcsel", tfloat32, "", "(src0 != 0.0f) ? src1 : src2")

opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool1, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b8csel", 0, tuint, [0, 0, 0],
       [tbool8, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b16csel", 0, tuint, [0, 0, 0],
       [tbool16, tuint, tuint], False, "", "src0 ? src1 : src2")
opcode("b32csel", 0, tuint, [0, 0, 0],
       [tbool32, tuint, tuint], False, "", "src0 ? src1 : src2")

# SM5 bfi assembly
triop("bfi", tuint32, "", """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")


# Per-bit select: takes src1 where src0 has a 1 bit and src2 elsewhere.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")

# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
unsigned base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
int base = src0;
unsigned offset = src1 & 0x1F;
unsigned bits = src2 & 0x1F;
if (bits == 0) {
   dst = 0;
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# The left shift moves the top bit of the field to bit 31; shifting back
# right by (32 - bits) then lands the field at bit 0 with sign extension,
# exactly as ibfe does above.  (The right shift previously used "offset",
# which is only correct when offset == 32 - bits.)
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Register a four-source tuint opcode with explicit output and source
   sizes and no algebraic properties.
   """
   opcode(name, output_size, tuint,
          [src1_size, src2_size, src3_size, src4_size],
          [tuint, tuint, tuint, tuint],
          False, "", const_expr)

# GLSL-style bitfield insert: replaces `bits` bits of base, starting at
# `offset`, with the low bits of insert.  Out-of-range offset/bits yield 0.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], False, "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

# Combines the first component of each input to make an 8-component vector.
opcode("vec8", 8, tuint,
       [1] * 8, [tuint] * 8,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
""")

# Combines the first component of each input to make a 16-component vector.
opcode("vec16", 16, tuint,
       [1] * 16, [tuint] * 16,
       False, "", """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
dst.e = src4.x;
dst.f = src5.x;
dst.g = src6.x;
dst.h = src7.x;
dst.i = src8.x;
dst.j = src9.x;
dst.k = src10.x;
dst.l = src11.x;
dst.m = src12.x;
dst.n = src13.x;
dst.o = src14.x;
dst.p = src15.x;
""")

# An integer multiply instruction for address calculation.
# This is similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")

# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on the Freedreno backend.
opcode("imadsh_mix16", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], False, "", """
dst = ((((src0 & 0xffff0000) >> 16) * (src1 & 0x0000ffff)) << 16) + src2;
""")

# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension)
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")

# unsigned 24b multiply into 32b result plus 32b int
triop("umad24", tuint32, _2src_commutative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8) + src2")

# unsigned 24b multiply into 32b result uint
#
# Declared as tuint32 to agree with umad24 above and with the uint32_t
# arithmetic in the expression.
binop("umul24", tuint32, _2src_commutative + associative,
      "(((uint32_t)src0 << 8) >> 8) * (((uint32_t)src1 << 8) >> 8)")

# Floating-point classification tests; each returns a 1-bit boolean.
unop_convert("fisnormal", tbool1, tfloat, "isnormal(src0)")
unop_convert("fisfinite", tbool1, tfloat, "isfinite(src0)")