#! /usr/bin/env python
#
# Copyright (C) 2014 Connor Abbott
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
# Authors:
#    Connor Abbott (cwabbott0@gmail.com)


# Class that represents all the information we have about the opcode
# NOTE: this must be kept in sync with nir_op_info

class Opcode(object):
    """Class that represents all the information we have about the opcode
    NOTE: this must be kept in sync with nir_op_info
    """
    def __init__(self, name, output_size, output_type, input_sizes,
                 input_types, algebraic_properties, const_expr):
        """Parameters:

        - name is the name of the opcode (prepend nir_op_ for the enum name)
        - output_size and every entry of input_sizes are integers from 0 to
          4; when output_size is non-zero no input size may be 0
        - all types are strings that get nir_type_ prepended to them
        - input_sizes and input_types are non-empty lists of equal length
        - algebraic_properties is a space-separated string, where nir_op_is_
          is prepended before each entry
        - const_expr is an expression or series of statements that computes
          the constant value of the opcode given the constant values of its
          inputs.

        Constant expressions are formed from the variables src0, src1, ...,
        src(N-1), where N is the number of arguments.  The output of the
        expression should be stored in the dst variable.  Per-component
        input and output variables will be scalars and non-per-component
        input and output variables will be a struct with fields named x, y,
        z, and w all of the correct type.  Input and output variables can be
        assumed to already be of the correct type and need no conversion.
        In particular, the conversion from the C bool type to/from NIR_TRUE
        and NIR_FALSE happens automatically.

        For per-component instructions, the entire expression will be
        executed once for each component.  For non-per-component
        instructions, the expression is expected to store the correct values
        in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
        constant expression, an assignment to dst will happen automatically
        and the result will be equivalent to "dst = <expression>" for
        per-component instructions and "dst.x = dst.y = ... = <expression>"
        for non-per-component instructions.
        """
        assert isinstance(name, str)
        assert isinstance(output_size, int)
        assert isinstance(output_type, str)
        assert isinstance(input_sizes, list)
        assert isinstance(input_types, list)
        # Check every entry, not only the first one, so that a stray value
        # later in either list is rejected at registration time.
        assert input_sizes and all(isinstance(s, int) for s in input_sizes)
        assert input_types and all(isinstance(t, str) for t in input_types)
        assert isinstance(algebraic_properties, str)
        assert isinstance(const_expr, str)
        assert len(input_sizes) == len(input_types)
        assert 0 <= output_size <= 4
        for size in input_sizes:
            assert 0 <= size <= 4
            if output_size != 0:
                assert size != 0
        self.name = name
        self.num_inputs = len(input_sizes)
        self.output_size = output_size
        self.output_type = output_type
        self.input_sizes = input_sizes
        self.input_types = input_types
        self.algebraic_properties = algebraic_properties
        self.const_expr = const_expr

# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tuint64 = "uint64"
tfloat64 = "float64"

commutative = "commutative "
associative = "associative "

# global dictionary of opcodes
opcodes = {}

def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
    """Build an Opcode from the arguments and store it in opcodes[name].

    Asserts that no opcode with the same name was registered before.
    """
    assert name not in opcodes
    opcodes[name] = Opcode(name, output_size, output_type, input_sizes,
                           input_types, algebraic_properties, const_expr)

def unop_convert(name, out_type, in_type, const_expr):
    """Register a one-input opcode from in_type to out_type.

    Output size and input size are both 0 and no algebraic properties are
    set.
    """
    opcode(name, 0, out_type, [0], [in_type], "", const_expr)

def unop(name, ty, const_expr):
    """Register a one-input opcode whose input and output share type ty.

    Output size and input size are both 0 and no algebraic properties are
    set.
    """
    opcode(name, 0, ty, [0], [ty], "", const_expr)

def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
    """Register a one-input opcode with explicit output and input sizes."""
    opcode(name, output_size, output_type, [input_size], [input_type], "",
           const_expr)

def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
    """Register name + "2", name + "3" and name + "4" reduction opcodes.

    The constant expression is assembled from three format strings:

    - prereduce_expr is applied to each component; "{src}" is replaced by
      src0.x, src0.y, ... and the result is parenthesized
    - reduce_expr combines two partial results given as "{src0}" and
      "{src1}"
    - final_expr receives the parenthesized reduction as "{src}"

    The opcode with suffix N takes a single input of size N.
    """
    def prereduce(src):
        return "(" + prereduce_expr.format(src=src) + ")"
    def final(src):
        return final_expr.format(src="(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    src0 = prereduce("src0.x")
    src1 = prereduce("src0.y")
    src2 = prereduce("src0.z")
    src3 = prereduce("src0.w")
    unop_horiz(name + "2", output_size, output_type, 2, input_type,
               final(reduce_(src0, src1)))
    unop_horiz(name + "3", output_size, output_type, 3, input_type,
               final(reduce_(reduce_(src0, src1), src2)))
    unop_horiz(name + "4", output_size, output_type, 4, input_type,
               final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))


# These two move instructions differ in what modifiers they support and what
# the negate modifier means. Otherwise, they are identical.
unop("fmov", tfloat, "src0")
unop("imov", tint, "src0")

unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# Each branch uses literals of its own precision (double for bit_size == 64,
# float otherwise), like fsign and fsat below.
unop("fnot", tfloat, ("bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0) : " +
                      "((src0 == 0.0f) ? 1.0f : 0.0f)"))
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
unop("isign", tint, "(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", tint, "(src0 < 0) ? -src0 : src0")
unop("fabs", tfloat, "bit_size == 64 ? fabs(src0) : fabsf(src0)")
unop("fsat", tfloat, ("bit_size == 64 ? " +
                      "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : " +
                      "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))"))
unop("frcp", tfloat, "bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", tfloat, "bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
# Float-to-boolean conversion
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
# Boolean-to-float conversion
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
# double-to-float conversion
unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision

# Unary floating-point rounding operations.


unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat,
     "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

unop("fquantize2f16", tfloat,
     "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
     "_mesa_half_to_float(_mesa_float_to_half(src0))")

# Trigonometric operations.


unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


# Partial derivatives.


unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")


# Floating point pack and unpack operations.

def pack_2x16(fmt):
    """Register pack_<fmt>_2x16: two float32 components into one uint32.

    The C snippet calls pack_<fmt>_1x16 on each component; src0.x lands in
    the low 16 bits and src0.y in the high 16 bits.
    """
    unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))

def pack_4x8(fmt):
    """Register pack_<fmt>_4x8: four float32 components into one uint32.

    The C snippet calls pack_<fmt>_1x8 on each component; src0.x lands in
    the lowest byte and src0.w in the highest.
    """
    unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))

def unpack_2x16(fmt):
    """Register unpack_<fmt>_2x16: one uint32 into two float32 components.

    This is the inverse layout of pack_2x16: dst.x is decoded from the low
    16 bits and dst.y from the high 16 bits, so the second value has to be
    shifted right (a left shift by 16 followed by the uint16_t cast would
    always yield 0).
    """
    unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))

def unpack_4x8(fmt):
    """Register unpack_<fmt>_4x8: one uint32 into four float32 components.

    dst.x is decoded from the lowest byte and dst.w from the highest.
    """
    unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
""".replace("fmt", fmt))


pack_2x16("snorm")
pack_4x8("snorm")
pack_2x16("unorm")
pack_4x8("unorm")
pack_2x16("half")
unpack_2x16("snorm")
unpack_4x8("snorm")
unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")

unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
        (src0.y << 8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32,
           "dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 32;")

# Lowered floating point unpacking operations.


unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")

unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, "src0")
unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, "src0 >> 32")

# Bit operations, part of ARB_gpu_shader5.


unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# The scan has to include bit 0 (bit >= 0); otherwise an input of 1 would
# fall through the loop and report -1 ("no bit set") instead of 0.
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")


# fnoise<i>_<j>: output size i, input size j, for every i and j from 1 to 4.
# range() is used rather than xrange() so the script also runs on Python 3.
for i in range(1, 5):
    for j in range(1, 5):
        unop_horiz("fnoise{0}_{1}".format(i, j), i, tfloat, j, tfloat, "0.0f")

def binop_convert(name, out_type, in_type, alg_props, const_expr):
    """Register a two-input opcode; both inputs share in_type.

    Output size and both input sizes are 0.
    """
    opcode(name, 0, out_type, [0, 0], [in_type, in_type], alg_props, const_expr)

def binop(name, ty, alg_props, const_expr):
    """Register a two-input opcode whose inputs and output all have type ty."""
    binop_convert(name, ty, ty, alg_props, const_expr)

def binop_compare(name, ty, alg_props, const_expr):
    """Register a two-input opcode with inputs of type ty and a tbool output."""
    binop_convert(name, tbool, ty, alg_props, const_expr)

def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
    """Register a two-input opcode with explicit sizes and types.

    No algebraic properties are set.
    """
    opcode(name, out_size, out_type, [src1_size, src2_size], [src1_type, src2_type],
           "", const_expr)

def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
    """Register name + "2", name + "3" and name + "4" two-input reductions.

    The constant expression is assembled from three format strings:

    - prereduce_expr is applied per component with "{src0}" and "{src1}"
      replaced by src0.<c> and src1.<c>; the result is parenthesized
    - reduce_expr combines two partial results given as "{src0}" and
      "{src1}"
    - final_expr receives the parenthesized reduction as "{src}"

    The opcode with suffix N takes two inputs of size N and type src_type
    and is registered as commutative.
    """
    def final(src):
        return final_expr.format(src= "(" + src + ")")
    def reduce_(src0, src1):
        return reduce_expr.format(src0=src0, src1=src1)
    def prereduce(src0, src1):
        return "(" + prereduce_expr.format(src0=src0, src1=src1) + ")"
    src0 = prereduce("src0.x", "src1.x")
    src1 = prereduce("src0.y", "src1.y")
    src2 = prereduce("src0.z", "src1.z")
    src3 = prereduce("src0.w", "src1.w")
    opcode(name + "2", output_size, output_type,
           [2, 2], [src_type, src_type], commutative,
           final(reduce_(src0, src1)))
    opcode(name + "3", output_size, output_type,
           [3, 3], [src_type, src_type], commutative,
           final(reduce_(reduce_(src0, src1), src2)))
    opcode(name + "4", output_size, output_type,
           [4, 4], [src_type, src_type], commutative,
           final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))

binop("fadd", tfloat, commutative + associative, "src0 + src1")
binop("iadd", tint, commutative + associative, "src0 + src1")
binop("fsub", tfloat, "", "src0 - src1")
binop("isub", tint, "", "src0 - src1")

binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

binop("fdiv", tfloat, "", "src0 / src1")
# Integer division by zero must not be evaluated by the constant folder, so
# it yields 0 here, the same convention umod, irem and imod use below.
binop("idiv", tint, "", "src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, "", "src1 == 0 ? 0 : (src0 / src1)")

# returns a boolean representing the carry resulting from the addition of
# the two unsigned arguments.

binop_convert("uadd_carry", tuint, tuint, commutative, "src0 + src1 < src0")

# returns a boolean representing the borrow resulting from the subtraction
# of the two unsigned arguments.

binop_convert("usub_borrow", tuint, tuint, "", "src0 < src1")

binop("umod", tuint, "", "src1 == 0 ? 0 : src0 % src1")

# For signed integers, there are several different possible definitions of
# "modulus" or "remainder".  We follow the conventions used by LLVM and
# SPIR-V.  The irem opcode implements the standard C/C++ signed "%"
# operation while the imod opcode implements the more mathematical
# "modulus" operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, "", "src1 == 0 ? 0 : src0 % src1")
binop("imod", tint, "",
      "src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
      " src0 % src1 : src0 % src1 + src1)")
# NOTE(review): fmod/frem (and fmin/fmax further down) always call the float
# variants floorf/truncf/fminf/fmaxf, unlike fpow which selects on bit_size.
# Confirm whether 64-bit operands need the double versions.
binop("fmod", tfloat, "", "src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, "", "src0 - src1 * truncf(src0 / src1)")

#
# Comparisons
#


# these integer-aware comparisons return a boolean (0 or ~0)

binop_compare("flt", tfloat, "", "src0 < src1")
binop_compare("fge", tfloat, "", "src0 >= src1")
binop_compare("feq", tfloat, commutative, "src0 == src1")
binop_compare("fne", tfloat, commutative, "src0 != src1")
binop_compare("ilt", tint, "", "src0 < src1")
binop_compare("ige", tint, "", "src0 >= src1")
binop_compare("ieq", tint, commutative, "src0 == src1")
binop_compare("ine", tint, commutative, "src0 != src1")
binop_compare("ult", tuint, "", "src0 < src1")
binop_compare("uge", tuint, "", "src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal", 1, tbool, tfloat, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")
binop_reduce("ball_iequal", 1, tbool, tint, "{src0} == {src1}",
             "{src0} && {src1}", "{src}")
binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
             "{src0} || {src1}", "{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively

binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


binop("ishl", tint, "", "src0 << src1")
binop("ishr", tint, "", "src0 >> src1")
binop("ushr", tuint, "", "src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, commutative + associative, "src0 & src1")
binop("ior", tuint, commutative + associative, "src0 | src1")
binop("ixor", tuint, commutative + associative, "src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f)"
      " ? 1.0f : 0.0f")

binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             "{src0} * {src1}", "{src0} + {src1}", "{src}")

opcode("fdph", 1, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")

binop("fmin", tfloat, "", "fminf(src0, src1)")
binop("imin", tint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmaxf(src0, src1)")
binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")

# pow() is the double-precision function and powf() the single-precision
# one, so the 64-bit case must select pow().
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")

binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

binop_convert("pack_double_2x32_split", tuint64, tuint32, "",
              "src0 | ((uint64_t)src1 << 32)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], "", """
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, """
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
binop("extract_u8", tuint, "", "(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, "", "(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, "", "(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, "", "(int16_t)(src0 >> (src1 * 16))")


def triop(name, ty, const_expr):
    """Register a three-input opcode whose inputs and output all have type ty.

    Output size and all input sizes are 0; no algebraic properties are set.
    """
    opcode(name, 0, ty, [0, 0, 0], [ty, ty, ty], "", const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
    """Register a three-input tuint opcode with explicit sizes.

    The output and every input use type tuint; no algebraic properties are
    set.
    """
    opcode(name, output_size, tuint,
           [src1_size, src2_size, src3_size],
           [tuint, tuint, tuint], "", const_expr)

triop("ffma", tfloat, "src0 * src1 + src2")

triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")

# SM5 bfi assembly
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
opcode("ubfe", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")

# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint32,
       [0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# The left shift moves the top bit of the field to bit 31; the right shift
# must then be by (32 - bits), as in ibfe above, so that the field ends up in
# the low bits with its sign extended.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")

# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", 3, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")

def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
    """Register a four-input tuint opcode with explicit sizes.

    The output and every input use type tuint; no algebraic properties are
    set.
    """
    opcode(name, output_size, tuint,
           [src1_size, src2_size, src3_size, src4_size],
           [tuint, tuint, tuint, tuint],
           "", const_expr)

# GLSL bitfieldInsert(): the low `bits` bits of insert replace the bits of
# base starting at `offset`, so insert is shifted by offset (the mask is
# built at the same position).  With bits == 0 nothing is inserted and the
# result is base.
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
       [tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

quadop_horiz("vec4", 4, 1, 1, 1, 1, """
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")

