1# -*- coding: utf-8 -*- 2# 3# Copyright (C) 2014 Intel Corporation 4# 5# Permission is hereby granted, free of charge, to any person obtaining a 6# copy of this software and associated documentation files (the "Software"), 7# to deal in the Software without restriction, including without limitation 8# the rights to use, copy, modify, merge, publish, distribute, sublicense, 9# and/or sell copies of the Software, and to permit persons to whom the 10# Software is furnished to do so, subject to the following conditions: 11# 12# The above copyright notice and this permission notice (including the next 13# paragraph) shall be included in all copies or substantial portions of the 14# Software. 15# 16# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22# IN THE SOFTWARE. 23 24import argparse 25from collections import OrderedDict 26import nir_algebraic 27from nir_opcodes import type_sizes 28import itertools 29import struct 30from math import pi 31import math 32 33# Convenience variables 34a = 'a' 35b = 'b' 36c = 'c' 37d = 'd' 38e = 'e' 39NAN = math.nan 40 41has_fmulz = '(options->has_fmulz || \ 42 (options->has_fmulz_no_denorms && \ 43 !nir_is_denorm_preserve(info->float_controls_execution_mode, 32)))' 44 45ignore_exact = nir_algebraic.ignore_exact 46 47# Written in the form (<search>, <replace>) where <search> is an expression 48# and <replace> is either an expression or a value. An expression is 49# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>) 50# where each source is either an expression or a value. A value can be 51# either a numeric constant or a string representing a variable name. 52# 53# If the opcode in a search expression is prefixed by a '~' character, this 54# indicates that the operation is inexact. Such operations will only get 55# applied to SSA values that do not have the exact bit set. This should be 56# used by by any optimizations that are not bit-for-bit exact. It should not, 57# however, be used for backend-requested lowering operations as those need to 58# happen regardless of precision. 59# 60# Variable names are specified as "[#]name[@type][(cond)][.swiz]" where: 61# "#" indicates that the given variable will only match constants, 62# type indicates that the given variable will only match values from ALU 63# instructions with the given output type, 64# (cond) specifies an additional condition function (see nir_search_helpers.h), 65# swiz is a swizzle applied to the variable (only in the <replace> expression) 66# 67# For constants, you have to be careful to make sure that it is the right 68# type because python is unaware of the source and destination types of the 69# opcodes. 70# 71# All expression types can have a bit-size specified. For opcodes, this 72# looks like "op@32", for variables it is "a@32" or "a@uint32" to specify a 73# type and size. In the search half of the expression this indicates that it 74# should only match that particular bit-size. In the replace half of the 75# expression this indicates that the constructed value should have that 76# bit-size. 77# 78# If the opcode in a replacement expression is prefixed by a '!' character, 79# this indicated that the new expression will be marked exact. 80# 81# A special condition "many-comm-expr" can be used with expressions to note 82# that the expression and its subexpressions have more commutative expressions 83# than nir_replace_instr can handle. If this special condition is needed with 84# another condition, the two can be separated by a comma (e.g., 85# "(many-comm-expr,is_used_once)"). 86# 87# Another set of special "conditions" are 88# "nsz": sign of zero is not preserved 89# "ninf": infinities are not preserved 90# "nnan": nan is not preserved 91# These relate to the float controls/fpfastmath and more descriptions of the 92# expression than conditions. That is, an expression with the "nsz" condition 93# means that the replacement expression won't preserve the sign of zero of the 94# result, and so it will be skipped if the matching instruction has the 95# 'signed_zero_preserve' flag set. 96 97# based on https://web.archive.org/web/20180105155939/http://forum.devmaster.net/t/fast-and-accurate-sine-cosine/9648 98def lowered_sincos(c): 99 x = ('fsub', ('fmul', 2.0, ('ffract', ('fadd', ('fmul', 0.5 / pi, a), c))), 1.0) 100 x = ('fmul', ('fsub', x, ('fmul', x, ('fabs', x))), 4.0) 101 return ('ffma', ('ffma', x, ('fabs', x), ('fneg', x)), 0.225, x) 102 103def intBitsToFloat(i): 104 return struct.unpack('!f', struct.pack('!I', i))[0] 105 106# Takes a pattern as input and returns a list of patterns where each 107# pattern has a different permutation of fneg/fabs(value) as the replacement 108# for the key operands in replacements. 109def add_fabs_fneg(pattern, replacements, commutative = True): 110 def to_list(pattern): 111 return [to_list(i) if isinstance(i, tuple) else i for i in pattern] 112 113 def to_tuple(pattern): 114 return tuple(to_tuple(i) if isinstance(i, list) else i for i in pattern) 115 116 def replace_varible(pattern, search, replace): 117 for i in range(len(pattern)): 118 if pattern[i] == search: 119 pattern[i] = replace 120 elif isinstance(pattern[i], list): 121 replace_varible(pattern[i], search, replace) 122 123 if commutative: 124 perms = itertools.combinations_with_replacement(range(4), len(replacements)) 125 else: 126 perms = itertools.product(range(4), repeat=len(replacements)) 127 128 result = [] 129 130 for perm in perms: 131 curr = to_list(pattern) 132 133 for i, (search, base) in enumerate(replacements.items()): 134 if perm[i] == 0: 135 replace = ['fneg', ['fabs', base]] 136 elif perm[i] == 1: 137 replace = ['fabs', base] 138 elif perm[i] == 2: 139 replace = ['fneg', base] 140 elif perm[i] == 3: 141 replace = base 142 143 replace_varible(curr, search, replace) 144 145 result.append(to_tuple(curr)) 146 return result 147 148 149optimizations = [ 150 # These will be recreated by late_algebraic if supported. 151 # Lowering here means we don't have to duplicate all other optimization patterns. 152 (('fgeu', a, b), ('inot', ('flt', a, b))), 153 (('fltu', a, b), ('inot', ('fge', a, b))), 154 (('fneo', 0.0, a), ('flt', 0.0, ('fabs', a))), 155 (('fequ', 0.0, a), ('inot', ('flt', 0.0, ('fabs', a)))), 156 157 158 (('imul', a, '#b(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), 159 (('imul', 'a@8', 0x80), ('ishl', a, 7), '!options->lower_bitops'), 160 (('imul', 'a@16', 0x8000), ('ishl', a, 15), '!options->lower_bitops'), 161 (('imul', 'a@32', 0x80000000), ('ishl', a, 31), '!options->lower_bitops'), 162 (('imul', 'a@64', 0x8000000000000000), ('ishl', a, 63), '!options->lower_bitops'), 163 (('imul', a, '#b(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), 164 (('ishl', a, '#b'), ('imul', a, ('ishl', 1, b)), 'options->lower_bitops'), 165 166 (('imul@64', a, '#b(is_bitcount2)'), ('iadd', ('ishl', a, ('ufind_msb', b)), ('ishl', a, ('find_lsb', b))), 167 '!options->lower_bitops && (options->lower_int64_options & (nir_lower_imul64 | nir_lower_shift64)) == nir_lower_imul64'), 168 169 (('unpack_64_2x32_split_x', ('imul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 170 (('unpack_64_2x32_split_x', ('umul_2x32_64(is_used_once)', a, b)), ('imul', a, b)), 171 (('imul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('imul_high', a, b)), 'options->lower_mul_2x32_64'), 172 (('umul_2x32_64', a, b), ('pack_64_2x32_split', ('imul', a, b), ('umul_high', a, b)), 'options->lower_mul_2x32_64'), 173 (('udiv', a, 1), a), 174 (('idiv', a, 1), a), 175 (('umod', a, 1), 0), 176 (('imod', a, 1), 0), 177 (('imod', a, -1), 0), 178 (('irem', a, 1), 0), 179 (('irem', a, -1), 0), 180 (('udiv', a, '#b(is_pos_power_of_two)'), ('ushr', a, ('find_lsb', b)), '!options->lower_bitops'), 181 (('idiv', a, '#b(is_pos_power_of_two)'), ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', b))), '!options->lower_bitops'), 182 (('idiv', a, '#b(is_neg_power_of_two)'), ('ineg', ('imul', ('isign', a), ('ushr', ('iabs', a), ('find_lsb', ('iabs', b))))), '!options->lower_bitops'), 183 (('umod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'), 184 (('imod', a, '#b(is_pos_power_of_two)'), ('iand', a, ('isub', b, 1)), '!options->lower_bitops'), 185 (('imod', a, '#b(is_neg_power_of_two)'), ('bcsel', ('ieq', ('ior', a, b), b), 0, ('ior', a, b)), '!options->lower_bitops'), 186 # 'irem(a, b)' -> 'a - ((a < 0 ? (a + b - 1) : a) & -b)' 187 (('irem', a, '#b(is_pos_power_of_two)'), 188 ('isub', a, ('iand', ('bcsel', ('ilt', a, 0), ('iadd', a, ('isub', b, 1)), a), ('ineg', b))), 189 '!options->lower_bitops'), 190 (('irem', a, '#b(is_neg_power_of_two)'), ('irem', a, ('iabs', b)), '!options->lower_bitops'), 191 192 (('~fmul', ('fsign', a), ('ffloor', ('fadd', ('fabs', a), 0.5))), ('ftrunc', ('fadd', a, ('fmul', ('fsign', a), 0.5))), '!options->lower_ftrunc || options->lower_ffloor'), 193 194 (('~fneg', ('fneg', a)), a), 195 (('ineg', ('ineg', a)), a), 196 (('fabs', ('fneg', a)), ('fabs', a)), 197 (('fabs', ('u2f', a)), ('u2f', a)), 198 (('iabs', ('iabs', a)), ('iabs', a)), 199 (('iabs', ('ineg', a)), ('iabs', a)), 200 (('~fadd', a, 0.0), a), 201 # a+0.0 is 'a' unless 'a' is denormal or -0.0. If it's only used by a 202 # floating point instruction, they should flush any input denormals and we 203 # can replace -0.0 with 0.0 if the float execution mode allows it. 204 (('fadd(is_only_used_as_float,nsz)', 'a', 0.0), a), 205 (('fadd(is_only_used_as_float)', a, '#b(is_negative_zero)'), a), 206 (('fadd', ('fneg', a), '#b(is_negative_zero)'), ('fneg', a)), 207 (('iadd', a, 0), a), 208 (('iadd_sat', a, 0), a), 209 (('isub_sat', a, 0), a), 210 (('uadd_sat', a, 0), a), 211 (('usub_sat', a, 0), a), 212 (('usadd_4x8_vc4', a, 0), a), 213 (('usadd_4x8_vc4', a, ~0), ~0), 214 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 215 (('~fadd', ('fmulz', a, b), ('fmulz', a, c)), ('fmulz', a, ('fadd', b, c))), 216 (('~ffma', a, b, ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)), 217 (('~ffma', a, b, ('fmul(is_used_once)', a, c)), ('fmul', a, ('fadd', b, c))), 218 (('~fadd', ('fmul(is_used_once)', a, b), ('ffma(is_used_once)', a, c, d)), ('ffma', a, ('fadd', b, c), d)), 219 (('~ffma', a, ('fmul(is_used_once)', b, c), ('fmul(is_used_once)', b, d)), ('fmul', b, ('ffma', a, c, d))), 220 (('~ffmaz', a, b, ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)), 221 (('~ffmaz', a, b, ('fmulz(is_used_once)', a, c)), ('fmulz', a, ('fadd', b, c))), 222 (('~fadd', ('fmulz(is_used_once)', a, b), ('ffmaz(is_used_once)', a, c, d)), ('ffmaz', a, ('fadd', b, c), d)), 223 (('~ffmaz', a, ('fmulz(is_used_once)', b, c), ('fmulz(is_used_once)', b, d)), ('fmulz', b, ('ffmaz', a, c, d))), 224 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 225 (('iadd', ('ishl', b, a), ('ishl', c, a)), ('ishl', ('iadd', b, c), a)), 226 (('iand', ('iand', a, b), ('iand(is_used_once)', a, c)), ('iand', ('iand', a, b), c)), 227 (('ior', ('ior', a, b), ('ior(is_used_once)', a, c)), ('ior', ('ior', a, b), c)), 228 (('iand', ('ior(is_used_once)', a, b), ('ior(is_used_once)', a, c)), ('ior', a, ('iand', b, c))), 229 (('ior', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)), ('iand', a, ('ior', b, c))), 230 # (a & b) | (a | c) => ((a & b) | a) | c => a | c 231 (('ior', ('iand', a, b), ('ior', a, c)), ('ior', a, c)), 232 # (a & b) & (a | c) => b & (a & (a | c)) => b & a 233 (('iand', ('iand', a, b), ('ior', a, c)), ('iand', a, b)), 234 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ine', ('iand', a, b), 0)), 235 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('ieq', ('iand', a, b), 0)), 236 (('ieq', ('ushr(is_used_once)', a, '#b'), 0), ('ult', a, ('ishl', 1, b))), 237 (('ine', ('ushr(is_used_once)', a, '#b'), 0), ('uge', a, ('ishl', 1, b))), 238 (('~fadd', ('fneg', a), a), 0.0), 239 (('iadd', ('ineg', a), a), 0), 240 (('iadd', ('ineg', a), ('iadd', a, b)), b), 241 (('iadd', a, ('iadd', ('ineg', a), b)), b), 242 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 243 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 244 (('fadd', ('fsat', a), ('fsat', ('fneg', a))), ('fsat', ('fabs', a))), 245 (('~fmul', a, 0.0), 0.0), 246 # The only effect a*0.0 should have is when 'a' is infinity, -0.0 or NaN 247 (('fmul(nsz,nnan)', 'a', 0.0), 0.0), 248 (('fmulz', a, 0.0), 0.0), 249 (('fmulz(nsz)', a, 'b(is_finite_not_zero)'), ('fmul', a, b)), 250 (('fmulz', 'a(is_finite)', 'b(is_finite)'), ('fmul', a, b)), 251 (('fmulz', a, a), ('fmul', a, a)), 252 (('ffmaz(nsz)', a, 'b(is_finite_not_zero)', c), ('ffma', a, b, c)), 253 (('ffmaz', 'a(is_finite)', 'b(is_finite)', c), ('ffma', a, b, c)), 254 (('ffmaz', a, a, b), ('ffma', a, a, b)), 255 (('imul', a, 0), 0), 256 (('umul_unorm_4x8_vc4', a, 0), 0), 257 (('umul_unorm_4x8_vc4', a, ~0), a), 258 (('~fmul', a, 1.0), a), 259 (('~fmulz', a, 1.0), a), 260 # The only effect a*1.0 can have is flushing denormals. If it's only used by 261 # a floating point instruction, they should flush any input denormals and 262 # this multiplication isn't needed. 263 (('fmul(is_only_used_as_float)', a, 1.0), a), 264 (('imul', a, 1), a), 265 (('fmul', a, -1.0), ('fneg', a)), 266 (('imul', a, -1), ('ineg', a)), 267 # If a < 0: fsign(a)*a*a => -1*a*a => -a*a => abs(a)*a 268 # If a > 0: fsign(a)*a*a => 1*a*a => a*a => abs(a)*a 269 # If a == 0: fsign(a)*a*a => 0*0*0 => abs(0)*0 270 # If a != a: fsign(a)*a*a => 0*NaN*NaN => abs(NaN)*NaN 271 (('fmul', ('fsign', a), ('fmul', a, a)), ('fmul', ('fabs', a), a)), 272 (('fmul', ('fmul', ('fsign', a), a), a), ('fmul', ('fabs', a), a)), 273 (('~ffma', 0.0, a, b), b), 274 (('ffma(is_only_used_as_float,nsz,nnan,ninf)', 0.0, a, b), b), 275 (('ffmaz', 0.0, a, b), ('fadd', 0.0, b)), 276 (('~ffma', a, b, 0.0), ('fmul', a, b)), 277 (('ffma(nsz)', a, b, 0.0), ('fmul', a, b)), 278 (('ffmaz(nsz)', a, b, 0.0), ('fmulz', a, b)), 279 (('ffma', 1.0, a, b), ('fadd', a, b)), 280 (('ffmaz(nsz)', 1.0, a, b), ('fadd', a, b)), 281 (('ffma', -1.0, a, b), ('fadd', ('fneg', a), b)), 282 (('ffmaz(nsz)', -1.0, a, b), ('fadd', ('fneg', a), b)), 283 (('~ffma', '#a', '#b', c), ('fadd', ('fmul', a, b), c)), 284 (('~ffmaz', '#a', '#b', c), ('fadd', ('fmulz', a, b), c)), 285 (('~flrp', a, b, 0.0), a), 286 (('~flrp', a, b, 1.0), b), 287 (('~flrp', a, a, b), a), 288 (('~flrp', 0.0, a, b), ('fmul', a, b)), 289 290 # flrp(a, a + b, c) => a + flrp(0, b, c) => a + (b * c) 291 (('~flrp', a, ('fadd(is_used_once)', a, b), c), ('fadd', ('fmul', b, c), a)), 292 293 (('sdot_4x8_iadd', a, 0, b), b), 294 (('udot_4x8_uadd', a, 0, b), b), 295 (('sdot_4x8_iadd_sat', a, 0, b), b), 296 (('udot_4x8_uadd_sat', a, 0, b), b), 297 (('sdot_2x16_iadd', a, 0, b), b), 298 (('udot_2x16_uadd', a, 0, b), b), 299 (('sdot_2x16_iadd_sat', a, 0, b), b), 300 (('udot_2x16_uadd_sat', a, 0, b), b), 301 302 # sudot_4x8_iadd is not commutative at all, so the patterns must be 303 # duplicated with zeros on each of the first positions. 304 (('sudot_4x8_iadd', a, 0, b), b), 305 (('sudot_4x8_iadd', 0, a, b), b), 306 (('sudot_4x8_iadd_sat', a, 0, b), b), 307 (('sudot_4x8_iadd_sat', 0, a, b), b), 308 309 (('iadd', ('sdot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_4x8_iadd', a, b, ('iadd', c, d))), 310 (('iadd', ('udot_4x8_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_4x8_uadd', a, b, ('iadd', c, d))), 311 (('iadd', ('sudot_4x8_iadd(is_used_once)', a, b, '#c'), '#d'), ('sudot_4x8_iadd', a, b, ('iadd', c, d))), 312 (('iadd', ('sdot_2x16_iadd(is_used_once)', a, b, '#c'), '#d'), ('sdot_2x16_iadd', a, b, ('iadd', c, d))), 313 (('iadd', ('udot_2x16_uadd(is_used_once)', a, b, '#c'), '#d'), ('udot_2x16_uadd', a, b, ('iadd', c, d))), 314 315 # Try to let constant folding eliminate the dot-product part. These are 316 # safe because the dot product cannot overflow 32 bits. 317 (('iadd', ('sdot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sdot_4x8_iadd', a, b, c)), 318 (('iadd', ('udot_4x8_uadd', 'a(is_not_const)', b, 0), c), ('udot_4x8_uadd', a, b, c)), 319 (('iadd', ('sudot_4x8_iadd', 'a(is_not_const)', b, 0), c), ('sudot_4x8_iadd', a, b, c)), 320 (('iadd', ('sudot_4x8_iadd', a, 'b(is_not_const)', 0), c), ('sudot_4x8_iadd', a, b, c)), 321 (('iadd', ('sdot_2x16_iadd', 'a(is_not_const)', b, 0), c), ('sdot_2x16_iadd', a, b, c)), 322 (('iadd', ('udot_2x16_uadd', 'a(is_not_const)', b, 0), c), ('udot_2x16_uadd', a, b, c)), 323 (('sdot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_4x8_iadd', a, b, 0), c)), 324 (('udot_4x8_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_4x8_uadd', a, b, 0), c)), 325 (('sudot_4x8_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sudot_4x8_iadd', a, b, 0), c)), 326 (('sdot_2x16_iadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('sdot_2x16_iadd', a, b, 0), c)), 327 (('udot_2x16_uadd', '#a', '#b', 'c(is_not_const)'), ('iadd', ('udot_2x16_uadd', a, b, 0), c)), 328 (('sdot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'), 329 (('udot_4x8_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->lower_uadd_sat'), 330 (('sudot_4x8_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->lower_iadd_sat'), 331 (('sdot_2x16_iadd_sat', '#a', '#b', 'c(is_not_const)'), ('iadd_sat', ('sdot_2x16_iadd', a, b, 0), c), '!options->lower_iadd_sat'), 332 (('udot_2x16_uadd_sat', '#a', '#b', 'c(is_not_const)'), ('uadd_sat', ('udot_2x16_uadd', a, b, 0), c), '!options->lower_uadd_sat'), 333 334 # Optimize open-coded fmulz. 335 # (b==0.0 ? 0.0 : a) * (a==0.0 ? 0.0 : b) -> fmulz(a, b) 336 *add_fabs_fneg((('fmul@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb')), 337 ('fmulz', 'ma', 'mb'), has_fmulz), {'ma' : a, 'mb' : b}), 338 *add_fabs_fneg((('fmul@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)')), 339 ('fmulz', 'ma', b), has_fmulz), {'ma' : a}), 340 341 # ffma(b==0.0 ? 0.0 : a, a==0.0 ? 0.0 : b, c) -> ffmaz(a, b, c) 342 *add_fabs_fneg((('ffma@32(nsz)', ('bcsel', ignore_exact('feq', b, 0.0), 0.0, 'ma'), ('bcsel', ignore_exact('feq', a, 0.0), 0.0, 'mb'), c), 343 ('ffmaz', 'ma', 'mb', c), has_fmulz), {'ma' : a, 'mb' : b}), 344 *add_fabs_fneg((('ffma@32(nsz)', 'ma', ('bcsel', ignore_exact('feq', a, 0.0), 0.0, '#b(is_not_const_zero)'), c), 345 ('ffmaz', 'ma', b, c), has_fmulz), {'ma' : a}), 346 347 # b == 0.0 ? 1.0 : fexp2(fmul(a, b)) -> fexp2(fmulz(a, b)) 348 *add_fabs_fneg((('bcsel(nsz,nnan,ninf)', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmul@32', a, 'mb'))), 349 ('fexp2', ('fmulz', a, 'mb')), 350 has_fmulz), {'mb': b}), 351 *add_fabs_fneg((('bcsel', ignore_exact('feq', b, 0.0), 1.0, ('fexp2', ('fmulz', a, 'mb'))), 352 ('fexp2', ('fmulz', a, 'mb'))), {'mb': b}), 353] 354 355# Bitwise operations affecting the sign may be replaced by equivalent 356# floating point operations, except possibly for denormal 357# behaviour hence the is_only_used_as_float. 358for sz in (16, 32, 64): 359 sign_bit = 1 << (sz - 1) 360 361 optimizations.extend([ 362 (('iand(is_only_used_as_float)', f'a@{sz}', sign_bit - 1), ('fabs', a)), 363 (('ixor(is_only_used_as_float)', f'a@{sz}', sign_bit), ('fneg', a)), 364 (('ior(is_only_used_as_float)', f'a@{sz}', sign_bit), ('fneg', ('fabs', a))), 365 ]) 366 367# Shorthand for the expansion of just the dot product part of the [iu]dp4a 368# instructions. 369sdot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_i8', b, 0)), 370 ('imul', ('extract_i8', a, 1), ('extract_i8', b, 1))), 371 ('iadd', ('imul', ('extract_i8', a, 2), ('extract_i8', b, 2)), 372 ('imul', ('extract_i8', a, 3), ('extract_i8', b, 3)))) 373udot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_u8', a, 0), ('extract_u8', b, 0)), 374 ('imul', ('extract_u8', a, 1), ('extract_u8', b, 1))), 375 ('iadd', ('imul', ('extract_u8', a, 2), ('extract_u8', b, 2)), 376 ('imul', ('extract_u8', a, 3), ('extract_u8', b, 3)))) 377sudot_4x8_a_b = ('iadd', ('iadd', ('imul', ('extract_i8', a, 0), ('extract_u8', b, 0)), 378 ('imul', ('extract_i8', a, 1), ('extract_u8', b, 1))), 379 ('iadd', ('imul', ('extract_i8', a, 2), ('extract_u8', b, 2)), 380 ('imul', ('extract_i8', a, 3), ('extract_u8', b, 3)))) 381sdot_2x16_a_b = ('iadd', ('imul', ('extract_i16', a, 0), ('extract_i16', b, 0)), 382 ('imul', ('extract_i16', a, 1), ('extract_i16', b, 1))) 383udot_2x16_a_b = ('iadd', ('imul', ('extract_u16', a, 0), ('extract_u16', b, 0)), 384 ('imul', ('extract_u16', a, 1), ('extract_u16', b, 1))) 385 386optimizations.extend([ 387 (('sdot_4x8_iadd', a, b, c), ('iadd', sdot_4x8_a_b, c), '!options->has_sdot_4x8'), 388 (('udot_4x8_uadd', a, b, c), ('iadd', udot_4x8_a_b, c), '!options->has_udot_4x8'), 389 (('sudot_4x8_iadd', a, b, c), ('iadd', sudot_4x8_a_b, c), '!options->has_sudot_4x8'), 390 (('sdot_2x16_iadd', a, b, c), ('iadd', sdot_2x16_a_b, c), '!options->has_dot_2x16'), 391 (('udot_2x16_uadd', a, b, c), ('iadd', udot_2x16_a_b, c), '!options->has_dot_2x16'), 392 393 # For the unsigned dot-product, the largest possible value 4*(255*255) = 394 # 0x3f804, so we don't have to worry about that intermediate result 395 # overflowing. 0x100000000 - 0x3f804 = 0xfffc07fc. If c is a constant 396 # that is less than 0xfffc07fc, then the result cannot overflow ever. 397 (('udot_4x8_uadd_sat', a, b, '#c(is_ult_0xfffc07fc)'), ('udot_4x8_uadd', a, b, c)), 398 (('udot_4x8_uadd_sat', a, b, c), ('uadd_sat', ('udot_4x8_uadd', a, b, 0), c), '!options->has_udot_4x8_sat'), 399 400 # For the signed dot-product, the largest positive value is 4*(-128*-128) = 401 # 0x10000, and the largest negative value is 4*(-128*127) = -0xfe00. We 402 # don't have to worry about that intermediate result overflowing or 403 # underflowing. 404 (('sdot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sdot_4x8_iadd', a, b, 0), c), '!options->has_sdot_4x8_sat'), 405 406 (('sudot_4x8_iadd_sat', a, b, c), ('iadd_sat', ('sudot_4x8_iadd', a, b, 0), c), '!options->has_sudot_4x8_sat'), 407 408 (('udot_2x16_uadd_sat', a, b, c), ('uadd_sat', udot_2x16_a_b, c), '!options->has_dot_2x16'), 409 (('sdot_2x16_iadd_sat', a, b, c), ('iadd_sat', sdot_2x16_a_b, c), '!options->has_dot_2x16'), 410]) 411 412# Float sizes 413for s in [16, 32, 64]: 414 optimizations.extend([ 415 (('~flrp@{}'.format(s), a, b, ('b2f', 'c@1')), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)), 416 417 (('~flrp@{}'.format(s), a, ('fadd', a, b), c), ('fadd', ('fmul', b, c), a), 'options->lower_flrp{}'.format(s)), 418 (('~flrp@{}'.format(s), ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a), 'options->lower_flrp{}'.format(s)), 419 (('~flrp@{}'.format(s), a, ('fmul(is_used_once)', a, b), c), ('fmul', ('flrp', 1.0, b, c), a), 'options->lower_flrp{}'.format(s)), 420 421 (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)), 422 # These are the same as the previous three rules, but it depends on 423 # 1-fsat(x) <=> fsat(1-x). See below. 424 (('~fadd@{}'.format(s), ('fmul', a, ('fsat', ('fadd', 1.0, ('fneg', c)))), ('fmul', b, ('fsat', c))), ('flrp', a, b, ('fsat', c)), '!options->lower_flrp{}'.format(s)), 425 (('~fadd@{}'.format(s), a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp{}'.format(s)), 426 427 (('~fadd@{}'.format(s), ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1')))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)), 428 (('~fadd@{}'.format(s), a, ('fmul', ('b2f', 'c@1'), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp{}'.format(s)), 429 430 (('~ffma@{}'.format(s), a, ('fadd', 1.0, ('fneg', ('b2f', 'c@1'))), ('fmul', b, ('b2f', 'c@1'))), ('bcsel', c, b, a)), 431 (('~ffma@{}'.format(s), b, ('b2f', 'c@1'), ('ffma', ('fneg', a), ('b2f', 'c@1'), a)), ('bcsel', c, b, a)), 432 433 # These two aren't flrp lowerings, but do appear in some shaders. 434 (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('fadd', b, ('fneg', a)), a), ('bcsel', c, b, a)), 435 (('~ffma@{}'.format(s), ('b2f', 'c@1'), ('ffma', ('fneg', a), b, d), ('fmul', a, b)), ('bcsel', c, d, ('fmul', a, b))), 436 437 # 1 - ((1 - a) * (1 - b)) 438 # 1 - (1 - a - b + a*b) 439 # 1 - 1 + a + b - a*b 440 # a + b - a*b 441 # a + b*(1 - a) 442 # b*(1 - a) + 1*a 443 # flrp(b, 1, a) 444 (('~fadd@{}'.format(s), 1.0, ('fneg', ('fmul', ('fadd', 1.0, ('fneg', a)), ('fadd', 1.0, ('fneg', b))))), ('flrp', b, 1.0, a), '!options->lower_flrp{}'.format(s)), 445 ]) 446 447optimizations.extend([ 448 (('~flrp', ('fmul(is_used_once)', a, b), ('fmul(is_used_once)', a, c), d), ('fmul', ('flrp', b, c, d), a)), 449 450 (('~flrp', a, 0.0, c), ('fadd', ('fmul', ('fneg', a), c), a)), 451 452 # D3D9 vertex shader trunc 453 (('fadd', ('ffloor', a), ('b2f', ('iand', ('flt', a, 0), ('flt', ('fneg', ('ffract', a)), ('ffract', a))))), ('ftrunc', ('fadd', a, 0))), 454 # D3D9 pixel shader trunc 455 (('fadd', ('ffloor', a), ('b2f', ('inot', ('fge', 0, ('fmin', ('fneg', a), ('ffract', a)))))), ('ftrunc', ('fadd', a, 0))), 456 (('fadd', ('ffloor', a), ('b2f', ('flt', 0, ('fmin', ('fneg', a), ('ffract', a))))), ('ftrunc', ('fadd', a, 0))), 457 458 (('fadd(nnan,nsz)', a, ('ffract', ('fneg', a))), ('fceil', a), '!options->lower_fceil'), 459 460 (('ftrunc@16', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'), 461 (('ftrunc@32', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 'options->lower_ftrunc'), 462 (('ftrunc@64', a), ('bcsel', ('flt', a, 0.0), ('fneg', ('ffloor', ('fabs', a))), ('ffloor', ('fabs', a))), 463 '(options->lower_ftrunc || (options->lower_doubles_options & nir_lower_dtrunc)) && (!(options->lower_doubles_options & nir_lower_dfloor) || !(options->lower_doubles_options & nir_lower_dfract))'), 464 465 (('ffloor@16', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'), 466 (('ffloor@32', a), ('fsub', a, ('ffract', a)), 'options->lower_ffloor'), 467 (('ffloor@64', a), ('fsub', a, ('ffract', a)), '(options->lower_ffloor || (options->lower_doubles_options & nir_lower_dfloor)) && !(options->lower_doubles_options & nir_lower_dfract)'), 468 (('fadd@16', a, ('fadd@16', b, ('fneg', ('ffract', a)))), ('fadd@16', b, ('ffloor', a)), '!options->lower_ffloor'), 469 (('fadd@32', a, ('fadd@32', b, ('fneg', ('ffract', a)))), ('fadd@32', b, ('ffloor', a)), '!options->lower_ffloor'), 470 (('fadd@64', a, ('fadd@64', b, ('fneg', ('ffract', a)))), ('fadd@64', b, ('ffloor', a)), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'), 471 (('fadd@16(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'), 472 (('fadd@32(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor'), 473 (('fadd@64(nnan)', a, ('fneg', ('ffract', a))), ('ffloor', a), '!options->lower_ffloor && !(options->lower_doubles_options & nir_lower_dfloor)'), 474 (('ffract@16', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'), 475 (('ffract@32', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'), 476 (('ffract@64', a), ('fsub', a, ('ffloor', a)), 477 '(options->lower_ffract || (options->lower_doubles_options & nir_lower_dfract)) && !(options->lower_doubles_options & nir_lower_dfloor)'), 478 (('fceil', a), ('fneg', ('ffloor', ('fneg', a))), 'options->lower_fceil'), 479 (('ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma16'), 480 (('ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma32'), 481 (('ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma64'), 482 (('ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->lower_ffma32'), 483 # Always lower inexact ffma, because it will be fused back by late optimizations (nir_opt_algebraic_late). 484 (('~ffma@16', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma16'), 485 (('~ffma@32', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma32'), 486 (('~ffma@64', a, b, c), ('fadd', ('fmul', a, b), c), 'options->fuse_ffma64'), 487 (('~ffmaz', a, b, c), ('fadd', ('fmulz', a, b), c), 'options->fuse_ffma32'), 488 489 (('~fmul', ('fadd', ('iand', ('ineg', ('b2i', 'a@bool')), ('fmul', b, c)), '#d'), '#e'), 490 ('bcsel', a, ('fmul', ('fadd', ('fmul', b, c), d), e), ('fmul', d, e))), 491 492 (('fdph', a, b), ('fdot4', ('vec4', 'a.x', 'a.y', 'a.z', 1.0), b), 'options->lower_fdph'), 493 494 (('fdot4', a, 0.0), 0.0), 495 (('fdot3', a, 0.0), 0.0), 496 (('fdot2', a, 0.0), 0.0), 497 498 (('fdot4', ('vec4', a, b, c, 1.0), d), ('fdph', ('vec3', a, b, c), d), '!options->lower_fdph'), 499 (('fdot4', ('vec4', a, 0.0, 0.0, 0.0), b), ('fmul', a, b)), 500 (('fdot4', ('vec4', a, b, 0.0, 0.0), c), ('fdot2', ('vec2', a, b), c)), 501 (('fdot4', ('vec4', a, b, c, 0.0), d), ('fdot3', ('vec3', a, b, c), d)), 502 503 (('fdot3', ('vec3', a, 0.0, 0.0), b), ('fmul', a, b)), 504 (('fdot3', ('vec3', a, b, 0.0), c), ('fdot2', ('vec2', a, b), c)), 505 506 (('fdot2', ('vec2', a, 0.0), b), ('fmul', a, b)), 507 (('fdot2', a, 1.0), ('fadd', 'a.x', 'a.y')), 508 509 # Lower fdot to fsum when it is available 510 (('fdot2', a, b), ('fsum2', ('fmul', a, b)), 'options->lower_fdot'), 511 (('fdot3', a, b), ('fsum3', ('fmul', a, b)), 'options->lower_fdot'), 512 (('fdot4', a, b), ('fsum4', ('fmul', a, b)), 'options->lower_fdot'), 513 (('fsum2', a), ('fadd', 'a.x', 'a.y'), 'options->lower_fdot'), 514 515 # If x >= 0 and x <= 1: fsat(1 - x) == 1 - fsat(x) trivially 516 # If x < 0: 1 - fsat(x) => 1 - 0 => 1 and fsat(1 - x) => fsat(> 1) => 1 517 # If x > 1: 1 - fsat(x) => 1 - 1 => 0 and fsat(1 - x) => fsat(< 0) => 0 518 (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 519 520 # (a * #b + #c) << #d 521 # ((a * #b) << #d) + (#c << #d) 522 # (a * (#b << #d)) + (#c << #d) 523 (('ishl', ('iadd', ('imul', a, '#b'), '#c'), '#d'), 524 ('iadd', ('imul', a, ('ishl', b, d)), ('ishl', c, d))), 525 526 # (a * #b) << #c 527 # a * (#b << #c) 528 (('ishl', ('imul', a, '#b'), '#c'), ('imul', a, ('ishl', b, c))), 529]) 530 531# Care must be taken here. Shifts in NIR uses only the lower log2(bitsize) 532# bits of the second source. These replacements must correctly handle the 533# case where (b % bitsize) + (c % bitsize) >= bitsize. 534for s in [8, 16, 32, 64]: 535 mask = s - 1 536 537 ishl = "ishl@{}".format(s) 538 ishr = "ishr@{}".format(s) 539 ushr = "ushr@{}".format(s) 540 541 in_bounds = ('ult', ('iadd', ('iand', b, mask), ('iand', c, mask)), s) 542 543 optimizations.extend([ 544 ((ishl, (ishl, a, '#b'), '#c'), ('bcsel', in_bounds, (ishl, a, ('iadd', b, c)), 0)), 545 ((ushr, (ushr, a, '#b'), '#c'), ('bcsel', in_bounds, (ushr, a, ('iadd', b, c)), 0)), 546 547 # To get get -1 for large shifts of negative values, ishr must instead 548 # clamp the shift count to the maximum value. 549 ((ishr, (ishr, a, '#b'), '#c'), 550 (ishr, a, ('imin', ('iadd', ('iand', b, mask), ('iand', c, mask)), s - 1))), 551 ]) 552 553# Optimize a pattern of address calculation created by DXVK where the offset is 554# divided by 4 and then multipled by 4. This can be turned into an iand and the 555# additions before can be reassociated to CSE the iand instruction. 556 557for size, mask in ((8, 0xff), (16, 0xffff), (32, 0xffffffff), (64, 0xffffffffffffffff)): 558 a_sz = 'a@{}'.format(size) 559 560 optimizations.extend([ 561 # 'a >> #b << #b' -> 'a & ~((1 << #b) - 1)' 562 (('ishl', ('ushr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))), 563 (('ishl', ('ishr', a_sz, '#b'), b), ('iand', a, ('ishl', mask, b))), 564 565 # This does not trivially work with ishr. 566 (('ushr', ('ishl', a_sz, '#b'), b), ('iand', a, ('ushr', mask, b))), 567 ]) 568 569# Collapses ubfe(ubfe(a, b, c), d, e) when b, c, d, e are constants. 570def ubfe_ubfe(a, b, c, d, e): 571 inner_offset = ('iand', b, 0x1f) 572 inner_bits = ('umin', ('iand', c, 0x1f), ('isub', 32, inner_offset)) 573 outer_offset = ('iand', d, 0x1f) 574 outer_bits = ('iand', e, 0x1f) 575 576 offset = ('iadd', inner_offset, outer_offset) 577 bits = ('umin', outer_bits, ('imax', ('isub', inner_bits, outer_offset), 0)) 578 collapsed = ('ubfe', a, offset, bits) 579 offset_out_of_range = ('ilt', 31, offset) 580 581 # This will be constant-folded to either 0 or the collapsed ubfe, 582 # whose offset and bits operands will also be constant folded. 583 return ('bcsel', offset_out_of_range, 0, collapsed) 584 585optimizations.extend([ 586 # Create bitfield extract from right-shift + and pattern. 587 (('iand@32', ('ushr@32(is_used_once)', a, b), '#c(is_const_bitmask)'), 588 ('ubfe', a, b, ('bit_count', c)), 589 'options->has_bfe && !options->avoid_ternary_with_two_constants'), 590 591 (('iand@32', ('ushr@32', a, b), ('bfm', c, 0)), 592 ('ubfe', a, b, c), 'options->has_bfe'), 593 594 (('ushr', ('iand', a, ('bfm', c, b)), b), 595 ('ubfe', a, b, c), 'options->has_bfe'), 596 597 # Collapse two bitfield extracts with constant operands into a single one. 598 (('ubfe', ('ubfe', a, '#b', '#c'), '#d', '#e'), 599 ubfe_ubfe(a, b, c, d, e)), 600 601 # Collapse non-zero right-shift into bitfield extract. 602 (('ushr@32', ('ubfe', a, '#b', '#c'), '#d(is_5lsb_not_zero)'), 603 ubfe_ubfe(a, b, c, d, 31)), 604 605 (('iand', ('ishl', 'a@32', '#b(is_first_5_bits_uge_2)'), -4), ('ishl', a, b)), 606 (('iand', ('imul', a, '#b(is_unsigned_multiple_of_4)'), -4), ('imul', a, b)), 607]) 608 609for log2 in range(1, 7): # powers of two from 2 to 64 610 v = 1 << log2 611 mask = 0xffffffff & ~(v - 1) 612 b_is_multiple = '#b(is_unsigned_multiple_of_{})'.format(v) 613 614 optimizations.extend([ 615 # Reassociate for improved CSE 616 (('iand@32', ('iadd@32', a, b_is_multiple), mask), ('iadd', ('iand', a, mask), b)), 617 ]) 618 619# To save space in the state tables, reduce to the set that is known to help. 620# Previously, this was range(1, 32). In addition, a couple rules inside the 621# loop are commented out. Revisit someday, probably after mesa/#2635 has some 622# resolution. 623for i in [1, 2, 16, 24]: 624 lo_mask = 0xffffffff >> i 625 hi_mask = (0xffffffff << i) & 0xffffffff 626 627 optimizations.extend([ 628 # This pattern seems to only help in the soft-fp64 code. 629 (('ishl@32', ('iand', 'a@32', lo_mask), i), ('ishl', a, i)), 630# (('ushr@32', ('iand', 'a@32', hi_mask), i), ('ushr', a, i)), 631# (('ishr@32', ('iand', 'a@32', hi_mask), i), ('ishr', a, i)), 632 633 (('iand', ('ishl', 'a@32', i), hi_mask), ('ishl', a, i)), 634 (('iand', ('ushr', 'a@32', i), lo_mask), ('ushr', a, i)), 635# (('iand', ('ishr', 'a@32', i), lo_mask), ('ushr', a, i)), # Yes, ushr is correct 636 ]) 637 638optimizations.extend([ 639 # This is common for address calculations. Reassociating may enable the 640 # 'a<<c' to be CSE'd. It also helps architectures that have an ISHLADD 641 # instruction or a constant offset field for in load / store instructions. 642 (('ishl', ('iadd', a, '#b'), '#c'), ('iadd', ('ishl', a, c), ('ishl', b, c))), 643 644 # (a + #b) * #c => (a * #c) + (#b * #c) 645 (('imul', ('iadd(is_used_once)', a, '#b'), '#c'), ('iadd', ('imul', a, c), ('imul', b, c))), 646 647 # ((a + #b) + c) * #d => ((a + c) * #d) + (#b * #d) 648 (('imul', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'), 649 ('iadd', ('imul', ('iadd', a, c), d), ('imul', b, d))), 650 (('ishl', ('iadd(is_used_once)', ('iadd(is_used_once)', a, '#b'), c), '#d'), 651 ('iadd', ('ishl', ('iadd', a, c), d), ('ishl', b, d))), 652 653 # Comparison simplifications 654 (('inot', ('flt(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('fge', a, b)), 655 (('inot', ('fge(is_used_once)', 'a(is_a_number)', 'b(is_a_number)')), ('flt', a, b)), 656 (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)), 657 (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)), 658 (('inot', ('ilt(is_used_once)', a, b)), ('ige', a, b)), 659 (('inot', ('ult(is_used_once)', a, b)), ('uge', a, b)), 660 (('inot', ('ige(is_used_once)', a, b)), ('ilt', a, b)), 661 (('inot', ('uge(is_used_once)', a, b)), ('ult', a, b)), 662 (('inot', ('ieq(is_used_once)', a, b)), ('ine', a, b)), 663 (('inot', ('ine(is_used_once)', a, b)), ('ieq', a, b)), 664 665 (('iand', ('feq', a, b), ('fneu', a, b)), False), 666 (('iand', ('flt', a, b), ('flt', b, a)), False), 667 (('iand', ('ieq', a, b), ('ine', a, b)), False), 668 (('iand', ('ilt', a, b), ('ilt', b, a)), False), 669 (('iand', ('ult', a, b), ('ult', b, a)), False), 670 671 # This helps some shaders because, after some optimizations, they end up 672 # with patterns like (-a < -b) || (b < a). In an ideal world, this sort of 673 # matching would be handled by CSE. 674 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 675 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 676 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 677 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 678 (('flt', ('fneg', 'a(is_not_const)'), '#b'), ('flt', ('fneg', b), a)), 679 (('flt', '#b', ('fneg', 'a(is_not_const)')), ('flt', a, ('fneg', b))), 680 (('fge', ('fneg', 'a(is_not_const)'), '#b'), ('fge', ('fneg', b), a)), 681 (('fge', '#b', ('fneg', 'a(is_not_const)')), ('fge', a, ('fneg', b))), 682 (('fneu', ('fneg', 'a(is_not_const)'), '#b'), ('fneu', ('fneg', b), a)), 683 (('feq', '#b', ('fneg', 'a(is_not_const)')), ('feq', a, ('fneg', b))), 684 (('flt', a, '#b(is_negative_zero)'), ('flt', a, 0.0)), 685 (('flt', '#b(is_negative_zero)', a), ('flt', 0.0, a)), 686 (('fge', a, '#b(is_negative_zero)'), ('fge', a, 0.0)), 687 (('fge', '#b(is_negative_zero)', a), ('fge', 0.0, a)), 688 (('fneu', a, '#b(is_negative_zero)'), ('fneu', 0.0, a)), 689 (('feq', '#b(is_negative_zero)', a), ('feq', a, 0.0)), 690 691 (('ieq', ('ineg', a), 0), ('ieq', a, 0)), 692 (('ine', ('ineg', a), 0), ('ine', a, 0)), 693 (('ieq', ('iabs', a), 0), ('ieq', a, 0)), 694 (('ine', ('iabs', a), 0), ('ine', a, 0)), 695 (('fneu', ('fabs', a), 0.0), ('fneu', a, 0.0)), 696 (('feq', ('fabs', a), 0.0), ('feq', a, 0.0)), 697 (('fneu', ('fabs', a), ('fabs', a)), ('fneu', a, a)), 698 (('feq', ('fabs', a), ('fabs', a)), ('feq', a, a)), 699 700 # b < fsat(NaN) -> b < 0 -> false, and b < Nan -> false. 701 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 702 703 # fsat(NaN) >= b -> 0 >= b -> false, and NaN >= b -> false. 704 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 705 706 # b == fsat(NaN) -> b == 0 -> false, and b == NaN -> false. 707 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 708 709 # b != fsat(NaN) -> b != 0 -> true, and b != NaN -> true. 710 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 711 712 # fsat(NaN) >= 1 -> 0 >= 1 -> false, and NaN >= 1 -> false. 713 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 714 715 # 0 < fsat(NaN) -> 0 < 0 -> false, and 0 < NaN -> false. 716 (('flt', 0.0, ('fsat(is_used_once)', a)), ('flt', 0.0, a)), 717 718 # 0.0 >= b2f(a) 719 # b2f(a) <= 0.0 720 # b2f(a) == 0.0 because b2f(a) can only be 0 or 1 721 # inot(a) 722 (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)), 723 724 (('fge', ('fneg', ('b2f', 'a@1')), 0.0), ('inot', a)), 725 726 (('fneu', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('ior', a, b)), 727 (('fneu', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('ior', a, b)), 728 (('fneu', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('ior', a, b)), 729 (('fneu', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('iand', a, b)), 730 (('fneu', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('iand', a, b)), 731 (('fneu', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ixor', a, b)), 732 (('fneu', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ixor', a, b)), 733 (('fneu', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ixor', a, b)), 734 (('feq', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('ior', a, b))), 735 (('feq', ('bcsel', a, 1.0, ('b2f', 'b@1')) , 0.0), ('inot', ('ior', a, b))), 736 (('feq', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), ('inot', ('ior', a, b))), 737 (('feq', ('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), 0.0), ('inot', ('iand', a, b))), 738 (('feq', ('bcsel', a, ('b2f', 'b@1'), 0.0) , 0.0), ('inot', ('iand', a, b))), 739 (('feq', ('fadd', ('b2f', 'a@1'), ('fneg', ('b2f', 'b@1'))), 0.0), ('ieq', a, b)), 740 (('feq', ('b2f', 'a@1') , ('b2f', 'b@1') ), ('ieq', a, b)), 741 (('feq', ('fneg', ('b2f', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('ieq', a, b)), 742 743 # -(b2f(a) + b2f(b)) < 0 744 # 0 < b2f(a) + b2f(b) 745 # 0 != b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative 746 # a || b 747 (('flt', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('ior', a, b)), 748 (('flt', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('ior', a, b)), 749 750 # -(b2f(a) + b2f(b)) >= 0 751 # 0 >= b2f(a) + b2f(b) 752 # 0 == b2f(a) + b2f(b) b2f must be 0 or 1, so the sum is non-negative 753 # !(a || b) 754 (('fge', ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), 0.0), ('inot', ('ior', a, b))), 755 (('fge', 0.0, ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('inot', ('ior', a, b))), 756 757 (('flt', a, ('fneg', a)), ('flt', a, 0.0)), 758 (('fge', a, ('fneg', a)), ('fge', a, 0.0)), 759 760 # Some optimizations (below) convert things like (a < b || c < b) into 761 # (min(a, c) < b). However, this interfers with the previous optimizations 762 # that try to remove comparisons with negated sums of b2f. This just 763 # breaks that apart. 764 (('flt', ('fmin', c, ('fneg', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1')))), 0.0), 765 ('ior', ('flt', c, 0.0), ('ior', a, b))), 766 767 (('~flt', ('fadd', a, b), a), ('flt', b, 0.0)), 768 (('~fge', ('fadd', a, b), a), ('fge', b, 0.0)), 769 (('~feq', ('fadd', a, b), a), ('feq', b, 0.0)), 770 (('~fneu', ('fadd', a, b), a), ('fneu', b, 0.0)), 771 (('~flt', ('fadd(is_used_once)', a, '#b'), '#c'), ('flt', a, ('fadd', c, ('fneg', b)))), 772 (('~flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('flt', ('fneg', ('fadd', c, b)), a)), 773 (('~fge', ('fadd(is_used_once)', a, '#b'), '#c'), ('fge', a, ('fadd', c, ('fneg', b)))), 774 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fge', ('fneg', ('fadd', c, b)), a)), 775 (('~feq', ('fadd(is_used_once)', a, '#b'), '#c'), ('feq', a, ('fadd', c, ('fneg', b)))), 776 (('~feq', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('feq', ('fneg', ('fadd', c, b)), a)), 777 (('~fneu', ('fadd(is_used_once)', a, '#b'), '#c'), ('fneu', a, ('fadd', c, ('fneg', b)))), 778 (('~fneu', ('fneg(is_used_once)', ('fadd(is_used_once)', a, '#b')), '#c'), ('fneu', ('fneg', ('fadd', c, b)), a)), 779 780 # Cannot remove the addition from ilt or ige due to overflow. 781 (('ieq', ('iadd', a, b), a), ('ieq', b, 0)), 782 (('ine', ('iadd', a, b), a), ('ine', b, 0)), 783 784 (('feq', ('b2f', 'a@1'), 0.0), ('inot', a)), 785 (('fge', 0.0, ('b2f', 'a@1')), ('inot', a)), 786 (('fneu', ('b2f', 'a@1'), 0.0), a), 787 (('flt', 0.0, ('b2f', 'a@1')), a), 788 (('ieq', ('b2i', 'a@1'), 0), ('inot', a)), 789 (('ine', ('b2i', 'a@1'), 0), a), 790 (('ieq', 'a@1', False), ('inot', a)), 791 (('ieq', 'a@1', True), a), 792 (('ine', 'a@1', False), a), 793 (('ine', 'a@1', True), ('inot', a)), 794 795 (('fneu', ('u2f', a), 0.0), ('ine', a, 0)), 796 (('feq', ('u2f', a), 0.0), ('ieq', a, 0)), 797 (('fge', ('u2f', a), 0.0), True), 798 (('fge', 0.0, ('u2f', a)), ('uge', 0, a)), # ieq instead? 799 (('flt', ('u2f', a), 0.0), False), 800 (('flt', 0.0, ('u2f', a)), ('ult', 0, a)), # ine instead? 801 (('fneu', ('i2f', a), 0.0), ('ine', a, 0)), 802 (('feq', ('i2f', a), 0.0), ('ieq', a, 0)), 803 (('fge', ('i2f', a), 0.0), ('ige', a, 0)), 804 (('fge', 0.0, ('i2f', a)), ('ige', 0, a)), 805 (('flt', ('i2f', a), 0.0), ('ilt', a, 0)), 806 (('flt', 0.0, ('i2f', a)), ('ilt', 0, a)), 807 808 # 0.0 < fabs(a) 809 # fabs(a) > 0.0 810 # fabs(a) != 0.0 because fabs(a) must be >= 0 811 # a != 0.0 812 (('~flt', 0.0, ('fabs', a)), ('fneu', a, 0.0)), 813 814 # -fabs(a) < 0.0 815 # fabs(a) > 0.0 816 (('~flt', ('fneg', ('fabs', a)), 0.0), ('fneu', a, 0.0)), 817 818 # 0.0 >= fabs(a) 819 # 0.0 == fabs(a) because fabs(a) must be >= 0 820 # 0.0 == a 821 (('fge', 0.0, ('fabs', a)), ('feq', a, 0.0)), 822 823 # -fabs(a) >= 0.0 824 # 0.0 >= fabs(a) 825 (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 826 827 # (a >= 0.0) && (a <= 1.0) -> fsat(a) == a 828 # 829 # This should be NaN safe. 830 # 831 # NaN >= 0 && 1 >= NaN -> false && false -> false 832 # 833 # vs. 834 # 835 # NaN == fsat(NaN) -> NaN == 0 -> false 836 (('iand', ('fge', a, 0.0), ('fge', 1.0, a)), ('feq', a, ('fsat', a)), '!options->lower_fsat'), 837 838 # Note: fmin(-a, -b) == -fmax(a, b) 839 (('fmax', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('ior', a, b))), 840 (('fmax', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('iand', a, b)))), 841 (('fmin', ('b2f(is_used_once)', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 842 (('fmin', ('fneg(is_used_once)', ('b2f(is_used_once)', 'a@1')), ('fneg', ('b2f', 'b@1'))), ('fneg', ('b2f', ('ior', a, b)))), 843 844 # fmin(b2f(a), b) 845 # bcsel(a, fmin(b2f(a), b), fmin(b2f(a), b)) 846 # bcsel(a, fmin(b2f(True), b), fmin(b2f(False), b)) 847 # bcsel(a, fmin(1.0, b), fmin(0.0, b)) 848 # 849 # Since b is a constant, constant folding will eliminate the fmin and the 850 # fmax. If b is > 1.0, the bcsel will be replaced with a b2f. 851 (('fmin', ('b2f', 'a@1'), '#b'), ('bcsel', a, ('fmin', b, 1.0), ('fmin', b, 0.0))), 852 853 (('flt', ('fadd(is_used_once)', a, ('fneg', b)), 0.0), ('flt', a, b)), 854 855 (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), 856 (('~bcsel', ('flt', b, a), b, a), ('fmin', a, b)), 857 (('~bcsel', ('flt', a, b), b, a), ('fmax', a, b)), 858 (('~bcsel', ('fge', a, b), b, a), ('fmin', a, b)), 859 (('~bcsel', ('fge', b, a), b, a), ('fmax', a, b)), 860 (('bcsel', ('inot', a), b, c), ('bcsel', a, c, b)), 861 (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), 862 (('bcsel', a, b, ('bcsel', a, c, d)), ('bcsel', a, b, d)), 863 (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 864 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, c, 'e')), ('bcsel', b, c, ('bcsel', a, d, 'e'))), 865 (('bcsel', a, ('bcsel', b, c, d), ('bcsel(is_used_once)', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 866 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), ('bcsel', b, 'e', d)), ('bcsel', b, ('bcsel', a, c, 'e'), d)), 867 (('bcsel', a, True, b), ('ior', a, b)), 868 (('bcsel', a, a, b), ('ior', a, b)), 869 (('bcsel', a, b, False), ('iand', a, b)), 870 (('bcsel', a, b, a), ('iand', a, b)), 871 (('~fmin', a, a), a), 872 (('~fmax', a, a), a), 873 (('imin', a, a), a), 874 (('imax', a, a), a), 875 (('umin', a, a), a), 876 (('umin', a, 0), 0), 877 (('umin', a, -1), a), 878 (('umax', a, a), a), 879 (('umax', a, 0), a), 880 (('umax', a, -1), -1), 881 (('fmax', ('fmax', a, b), b), ('fmax', a, b)), 882 (('umax', ('umax', a, b), b), ('umax', a, b)), 883 (('imax', ('imax', a, b), b), ('imax', a, b)), 884 (('fmin', ('fmin', a, b), b), ('fmin', a, b)), 885 (('umin', ('umin', a, b), b), ('umin', a, b)), 886 (('imin', ('imin', a, b), b), ('imin', a, b)), 887 (('fmax', ('fmax', ('fmax', a, b), c), a), ('fmax', ('fmax', a, b), c)), 888 (('umax', ('umax', ('umax', a, b), c), a), ('umax', ('umax', a, b), c)), 889 (('imax', ('imax', ('imax', a, b), c), a), ('imax', ('imax', a, b), c)), 890 (('fmin', ('fmin', ('fmin', a, b), c), a), ('fmin', ('fmin', a, b), c)), 891 (('umin', ('umin', ('umin', a, b), c), a), ('umin', ('umin', a, b), c)), 892 (('imin', ('imin', ('imin', a, b), c), a), ('imin', ('imin', a, b), c)), 893 (('fmin', ('fmax', 'a(is_finite)', b), a), ('fmul', 1.0, a)), 894 (('fmax', ('fmin', 'a(is_finite)', b), a), ('fmul', 1.0, a)), 895 (('umin', ('umax', a, b), a), a), 896 (('umax', ('umin', a, b), a), a), 897 (('imin', ('imax', a, b), a), a), 898 (('imax', ('imin', a, b), a), a), 899]) 900 901for N in [8, 16, 32, 64]: 902 b2iN = 'b2i{0}'.format(N) 903 optimizations.extend([ 904 (('ieq', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ieq', a, b)), 905 (('ine', (b2iN, 'a@1'), (b2iN, 'b@1')), ('ine', a, b)), 906 ]) 907 908for N in [16, 32, 64]: 909 b2fN = 'b2f{0}'.format(N) 910 optimizations.extend([ 911 (('feq', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ieq', a, b)), 912 (('fneu', (b2fN, 'a@1'), (b2fN, 'b@1')), ('ine', a, b)), 913 ]) 914 915# Integer sizes 916for s in [8, 16, 32, 64]: 917 optimizations.extend([ 918 (('iand@{}'.format(s), a, ('inot', ('ishr', a, s - 1))), ('imax', a, 0)), 919 920 # Simplify logic to detect sign of an integer. 921 (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ige', a, 0)), 922 (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ige', a, 0)), 923 (('ine', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 0), ('ilt', a, 0)), 924 (('ieq', ('iand', 'a@{}'.format(s), 1 << (s - 1)), 1 << (s - 1)), ('ilt', a, 0)), 925 (('ine', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)), 926 (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)), 927 (('ieq', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ilt', a, 0)), 928 (('ine', ('ushr', 'a@{}'.format(s), s - 1), 1), ('ige', a, 0)), 929 (('ine', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ilt', a, 0)), 930 (('ieq', ('ishr', 'a@{}'.format(s), s - 1), 0), ('ige', a, 0)), 931 (('ieq', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ilt', a, 0)), 932 (('ine', ('ishr', 'a@{}'.format(s), s - 1), -1), ('ige', a, 0)), 933 ]) 934 935optimizations.extend([ 936 (('fmin', a, ('fneg', a)), ('fneg', ('fabs', a))), 937 (('imin', a, ('ineg', a)), ('ineg', ('iabs', a))), 938 (('fmin', a, ('fneg', ('fabs', a))), ('fneg', ('fabs', a))), 939 (('imin', a, ('ineg', ('iabs', a))), ('ineg', ('iabs', a))), 940 (('~fmin', a, ('fabs', a)), a), 941 (('imin', a, ('iabs', a)), a), 942 (('~fmax', a, ('fneg', ('fabs', a))), a), 943 (('imax', a, ('ineg', ('iabs', a))), a), 944 (('fmax', a, ('fabs', a)), ('fabs', a)), 945 (('imax', a, ('iabs', a)), ('iabs', a)), 946 (('fmax', a, ('fneg', a)), ('fabs', a)), 947 (('imax', a, ('ineg', a)), ('iabs', a), '!options->lower_iabs'), 948 (('~fmax', ('fabs', a), 0.0), ('fabs', a)), 949 (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), 950 # fmax(fmin(a, 1.0), 0.0) is inexact because it returns 1.0 on NaN, while 951 # fsat(a) returns 0.0. 952 (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), 953 # fmin(fmax(a, -1.0), 0.0) is inexact because it returns -1.0 on NaN, while 954 # fneg(fsat(fneg(a))) returns -0.0 on NaN. 955 (('~fmin', ('fmax', a, -1.0), 0.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), 956 # fmax(fmin(a, 0.0), -1.0) is inexact because it returns 0.0 on NaN, while 957 # fneg(fsat(fneg(a))) returns -0.0 on NaN. This only matters if 958 # SignedZeroInfNanPreserve is set, but we don't currently have any way of 959 # representing this in the optimizations other than the usual ~. 960 (('~fmax', ('fmin', a, 0.0), -1.0), ('fneg', ('fsat', ('fneg', a))), '!options->lower_fsat'), 961 # fsat(fsign(NaN)) = fsat(0) = 0, and b2f(0 < NaN) = b2f(False) = 0. Mark 962 # the new comparison precise to prevent it being changed to 'a != 0'. 963 (('fsat', ('fsign', a)), ('b2f', ('!flt', 0.0, a))), 964 (('fsat', ('b2f', a)), ('b2f', a)), 965 (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), 966 (('fsat', ('fsat', a)), ('fsat', a)), 967 (('fsat', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fsat', ('fadd', ('fneg', a), ('fneg', b))), '!options->lower_fsat'), 968 (('fsat', ('fneg(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fneg', a), b)), '!options->lower_fsat'), 969 (('fsat(nsz)', ('fneg(is_used_once)', ('fmulz(is_used_once)', a, b))), ('fsat', ('fmulz', ('fneg', a), b)), '!options->lower_fsat'), 970 (('fsat', ('fabs(is_used_once)', ('fmul(is_used_once)', a, b))), ('fsat', ('fmul', ('fabs', a), ('fabs', b))), '!options->lower_fsat'), 971 (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)), 972 (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)), 973 (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)), 974 # Both the left and right patterns are "b" when isnan(a), so this is exact. 975 (('fmax', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmax', a, b))), 976 (('fmax', ('fsat(is_used_once)', a), ('fsat(is_used_once)', b)), ('fsat', ('fmax', a, b))), 977 # The left pattern is 0.0 when isnan(a) (because fmin(fsat(NaN), b) -> 978 # fmin(0.0, b)) while the right one is "b", so this optimization is inexact. 979 (('~fmin', ('fsat', a), '#b(is_zero_to_one)'), ('fsat', ('fmin', a, b))), 980 981 # If a >= 0 ... 1 + a >= 1 ... so fsat(1 + a) = 1 982 (('fsat', ('fadd', 1.0, 'a(is_ge_zero)')), 1.0), 983 984 # Let constant folding do its job. This can have emergent behaviour. 985 (('fneg', ('bcsel(is_used_once)', a, '#b', '#c')), ('bcsel', a, ('fneg', b), ('fneg', c))), 986 987 # max(-min(b, a), b) -> max(abs(b), -a) 988 # min(-max(b, a), b) -> min(-abs(b), -a) 989 (('fmax', ('fneg', ('fmin', b, a)), b), ('fmax', ('fabs', b), ('fneg', a))), 990 (('fmin', ('fneg', ('fmax', b, a)), b), ('fmin', ('fneg', ('fabs', b)), ('fneg', a))), 991 992 # If a in [0,b] then b-a is also in [0,b]. Since b in [0,1], max(b-a, 0) = 993 # fsat(b-a). 994 # 995 # If a > b, then b-a < 0 and max(b-a, 0) = fsat(b-a) = 0 996 # 997 # This should be NaN safe since max(NaN, 0) = fsat(NaN) = 0. 998 (('fmax', ('fadd(is_used_once)', ('fneg', 'a(is_not_negative)'), '#b(is_zero_to_one)'), 0.0), 999 ('fsat', ('fadd', ('fneg', a), b)), '!options->lower_fsat'), 1000 1001 (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)), 1002 1003 # The ior versions are exact because fmin and fmax will always pick a 1004 # non-NaN value, if one exists. Therefore (a < NaN) || (a < c) == a < 1005 # fmax(NaN, c) == a < c. Mark the fmin or fmax in the replacement as exact 1006 # to prevent other optimizations from ruining the "NaN clensing" property 1007 # of the fmin or fmax. 1008 (('ior', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('!fmax', b, c))), 1009 (('ior', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('!fmin', a, b), c)), 1010 (('ior', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('!fmin', b, c))), 1011 (('ior', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('!fmax', a, b), c)), 1012 (('ior', ('flt', a, '#b'), ('flt', a, '#c')), ('flt', a, ('!fmax', b, c))), 1013 (('ior', ('flt', '#a', c), ('flt', '#b', c)), ('flt', ('!fmin', a, b), c)), 1014 (('ior', ('fge', a, '#b'), ('fge', a, '#c')), ('fge', a, ('!fmin', b, c))), 1015 (('ior', ('fge', '#a', c), ('fge', '#b', c)), ('fge', ('!fmax', a, b), c)), 1016 (('~iand', ('flt(is_used_once)', a, b), ('flt', a, c)), ('flt', a, ('fmin', b, c))), 1017 (('~iand', ('flt(is_used_once)', a, c), ('flt', b, c)), ('flt', ('fmax', a, b), c)), 1018 (('~iand', ('fge(is_used_once)', a, b), ('fge', a, c)), ('fge', a, ('fmax', b, c))), 1019 (('~iand', ('fge(is_used_once)', a, c), ('fge', b, c)), ('fge', ('fmin', a, b), c)), 1020 (('iand', ('flt', a, '#b(is_a_number)'), ('flt', a, '#c(is_a_number)')), ('flt', a, ('fmin', b, c))), 1021 (('iand', ('flt', '#a(is_a_number)', c), ('flt', '#b(is_a_number)', c)), ('flt', ('fmax', a, b), c)), 1022 (('iand', ('fge', a, '#b(is_a_number)'), ('fge', a, '#c(is_a_number)')), ('fge', a, ('fmax', b, c))), 1023 (('iand', ('fge', '#a(is_a_number)', c), ('fge', '#b(is_a_number)', c)), ('fge', ('fmin', a, b), c)), 1024 1025 (('ior', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imax', b, c))), 1026 (('ior', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imin', a, b), c)), 1027 (('ior', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imin', b, c))), 1028 (('ior', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imax', a, b), c)), 1029 (('ior', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umax', b, c))), 1030 (('ior', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umin', a, b), c)), 1031 (('ior', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umin', b, c))), 1032 (('ior', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umax', a, b), c)), 1033 (('iand', ('ilt(is_used_once)', a, b), ('ilt', a, c)), ('ilt', a, ('imin', b, c))), 1034 (('iand', ('ilt(is_used_once)', a, c), ('ilt', b, c)), ('ilt', ('imax', a, b), c)), 1035 (('iand', ('ige(is_used_once)', a, b), ('ige', a, c)), ('ige', a, ('imax', b, c))), 1036 (('iand', ('ige(is_used_once)', a, c), ('ige', b, c)), ('ige', ('imin', a, b), c)), 1037 (('iand', ('ult(is_used_once)', a, b), ('ult', a, c)), ('ult', a, ('umin', b, c))), 1038 (('iand', ('ult(is_used_once)', a, c), ('ult', b, c)), ('ult', ('umax', a, b), c)), 1039 (('iand', ('uge(is_used_once)', a, b), ('uge', a, c)), ('uge', a, ('umax', b, c))), 1040 (('iand', ('uge(is_used_once)', a, c), ('uge', b, c)), ('uge', ('umin', a, b), c)), 1041 1042 # A number of shaders contain a pattern like a.x < 0.0 || a.x > 1.0 || a.y 1043 # < 0.0, || a.y > 1.0 || ... These patterns rearrange and replace in a 1044 # single step. Doing just the replacement can lead to an infinite loop as 1045 # the pattern is repeatedly applied to the result of the previous 1046 # application of the pattern. 1047 (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, c), d), ('flt', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)), 1048 (('ior', ('ior(is_used_once)', ('flt', a, c), d), ('flt(is_used_once)', b, c)), ('ior', ('flt', ('!fmin', a, b), c), d)), 1049 (('ior', ('ior(is_used_once)', ('flt(is_used_once)', a, b), d), ('flt', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), 1050 (('ior', ('ior(is_used_once)', ('flt', a, b), d), ('flt(is_used_once)', a, c)), ('ior', ('flt', a, ('!fmax', b, c)), d)), 1051 1052 # This is how SpvOpFOrdNotEqual might be implemented. If both values are 1053 # numbers, then it can be replaced with fneu. 1054 (('ior', ('flt', 'a(is_a_number)', 'b(is_a_number)'), ('flt', b, a)), ('fneu', a, b)), 1055 1056 # Other patterns may optimize the resulting iand tree further. 1057 (('umin', ('iand', a, '#b(is_pos_power_of_two)'), ('iand', c, b)), 1058 ('iand', ('iand', a, b), ('iand', c, b))), 1059]) 1060 1061# Float sizes 1062for s in [16, 32, 64]: 1063 if s == 64: 1064 match_fsign_cond = "!options->lower_fsign & !(options->lower_doubles_options & nir_lower_dsign)" 1065 else: 1066 match_fsign_cond = "!options->lower_fsign" 1067 optimizations.extend([ 1068 # These derive from the previous patterns with the application of b < 0 <=> 1069 # 0 < -b. The transformation should be applied if either comparison is 1070 # used once as this ensures that the number of comparisons will not 1071 # increase. The sources to the ior and iand are not symmetric, so the 1072 # rules have to be duplicated to get this behavior. 1073 (('ior', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))), 1074 (('ior', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmax', a, ('fneg', b)))), 1075 (('ior', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))), 1076 (('ior', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmin', a, ('fneg', b)))), 1077 (('~iand', ('flt(is_used_once)', 0.0, 'a@{}'.format(s)), ('flt', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))), 1078 (('~iand', ('flt', 0.0, 'a@{}'.format(s)), ('flt(is_used_once)', 'b@{}'.format(s), 0.0)), ('flt', 0.0, ('fmin', a, ('fneg', b)))), 1079 (('~iand', ('fge(is_used_once)', 0.0, 'a@{}'.format(s)), ('fge', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))), 1080 (('~iand', ('fge', 0.0, 'a@{}'.format(s)), ('fge(is_used_once)', 'b@{}'.format(s), 0.0)), ('fge', 0.0, ('fmax', a, ('fneg', b)))), 1081 1082 (('ior', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fmin', ('fabs', a), ('fabs', b)), 0.0)), 1083 (('ior', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fadd', ('fabs', a), ('fabs', b)), 0.0)), 1084 (('iand', ('feq(is_used_once)', 'a@{}'.format(s), 0.0), ('feq', 'b@{}'.format(s), 0.0)), ('feq', ('fadd', ('fabs', a), ('fabs', b)), 0.0)), 1085 (('iand', ('fneu(is_used_once)', 'a@{}'.format(s), 0.0), ('fneu', 'b@{}'.format(s), 0.0)), ('fneu', ('fmin', ('fabs', a), ('fabs', b)), 0.0)), 1086 1087 # The (i2f32, ...) part is an open-coded fsign. When that is combined 1088 # with the bcsel, it's basically copysign(1.0, a). There are some 1089 # behavior differences between this pattern and copysign w.r.t. ±0 and 1090 # NaN. copysign(x, y) blindly takes the sign bit from y and applies it 1091 # to x, regardless of whether either or both values are NaN. 1092 # 1093 # If a != a: bcsel(False, 1.0, i2f(b2i(False) - b2i(False))) = 0, 1094 # int(NaN >= 0.0) - int(NaN < 0.0) = 0 - 0 = 0 1095 # If a == ±0: bcsel(True, 1.0, ...) = 1.0, 1096 # int(±0.0 >= 0.0) - int(±0.0 < 0.0) = 1 - 0 = 1 1097 # 1098 # For all other values of 'a', the original and replacement behave as 1099 # copysign. 1100 # 1101 # Marking the replacement comparisons as precise prevents any future 1102 # optimizations from replacing either of the comparisons with the 1103 # logical-not of the other. 1104 # 1105 # Note: Use b2i32 in the replacement because some platforms that 1106 # support fp16 don't support int16. 1107 (('bcsel@{}'.format(s), ('feq', a, 0.0), 1.0, ('i2f{}'.format(s), ('iadd', ('b2i{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))))), 1108 ('i2f{}'.format(s), ('iadd', ('b2i32', ('!fge', a, 0.0)), ('ineg', ('b2i32', ('!flt', a, 0.0)))))), 1109 1110 (('bcsel', a, ('b2f(is_used_once)', 'b@{}'.format(s)), ('b2f', 'c@{}'.format(s))), ('b2f', ('bcsel', a, b, c))), 1111 1112 # The C spec says, "If the value of the integral part cannot be represented 1113 # by the integer type, the behavior is undefined." "Undefined" can mean 1114 # "the conversion doesn't happen at all." 1115 (('~i2f{}'.format(s), ('f2i', 'a@{}'.format(s))), ('ftrunc', a)), 1116 1117 # Ironically, mark these as imprecise because removing the conversions may 1118 # preserve more precision than doing the conversions (e.g., 1119 # uint(float(0x81818181u)) == 0x81818200). 1120 (('~f2i{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 1121 (('~f2i{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 1122 (('~f2u{}'.format(s), ('i2f', 'a@{}'.format(s))), a), 1123 (('~f2u{}'.format(s), ('u2f', 'a@{}'.format(s))), a), 1124 1125 (('fadd', ('b2f{}'.format(s), ('flt', 0.0, 'a@{}'.format(s))), ('fneg', ('b2f{}'.format(s), ('flt', 'a@{}'.format(s), 0.0)))), ('fsign', a), match_fsign_cond), 1126 (('iadd', ('b2i{}'.format(s), ('flt', 0, 'a@{}'.format(s))), ('ineg', ('b2i{}'.format(s), ('flt', 'a@{}'.format(s), 0)))), ('f2i{}'.format(s), ('fsign', a)), match_fsign_cond), 1127 1128 # float? -> float? -> floatS ==> float? -> floatS 1129 (('~f2f{}'.format(s), ('f2f', a)), ('f2f{}'.format(s), a)), 1130 1131 # int? -> float? -> floatS ==> int? -> floatS 1132 (('~f2f{}'.format(s), ('u2f', a)), ('u2f{}'.format(s), a)), 1133 (('~f2f{}'.format(s), ('i2f', a)), ('i2f{}'.format(s), a)), 1134 1135 # float? -> float? -> intS ==> float? -> intS 1136 (('~f2u{}'.format(s), ('f2f', a)), ('f2u{}'.format(s), a)), 1137 (('~f2i{}'.format(s), ('f2f', a)), ('f2i{}'.format(s), a)), 1138 1139 # HLSL's sign function returns an integer 1140 (('i2f{}'.format(s), ('f2i', ('fsign', 'a@{}'.format(s)))), ('fsign', a)), 1141 ]) 1142 1143 for B in [32, 64]: 1144 if s < B: 1145 optimizations.extend([ 1146 # S = smaller, B = bigger 1147 # floatS -> floatB -> floatS ==> identity 1148 (('~f2f{}'.format(s), ('f2f{}'.format(B), 'a@{}'.format(s))), a), 1149 1150 # floatS -> floatB -> intB ==> floatS -> intB 1151 (('f2u{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2u{}'.format(B), a)), 1152 (('f2i{}'.format(B), ('f2f{}'.format(B), 'a@{}'.format(s))), ('f2i{}'.format(B), a)), 1153 1154 # int? -> floatB -> floatS ==> int? -> floatS 1155 (('f2f{}'.format(s), ('u2f{}'.format(B), a)), ('u2f{}'.format(s), a)), 1156 (('f2f{}'.format(s), ('i2f{}'.format(B), a)), ('i2f{}'.format(s), a)), 1157 ]) 1158 1159for S in [1, 8, 16, 32]: 1160 for B in [8, 16, 32, 64]: 1161 if B <= S: 1162 continue 1163 optimizations.extend([ 1164 # intS -> intB -> intS ==> identity 1165 (('i2i{}'.format(S), ('i2i{}'.format(B), 'a@{}'.format(S))), a), 1166 (('u2u{}'.format(S), ('u2u{}'.format(B), 'a@{}'.format(S))), a), 1167 ]) 1168 1169 if B < 16: 1170 continue 1171 for C in [8, 16, 32, 64]: 1172 if C <= S: 1173 continue 1174 optimizations.extend([ 1175 # intS -> intC -> floatB ==> intS -> floatB 1176 (('u2f{}'.format(B), ('u2u{}'.format(C), 'a@{}'.format(S))), ('u2f{}'.format(B), a)), 1177 (('i2f{}'.format(B), ('i2i{}'.format(C), 'a@{}'.format(S))), ('i2f{}'.format(B), a)), 1178 ]) 1179 1180# mediump variants of the above 1181optimizations.extend([ 1182 # int32 -> float32 -> float16 ==> int32 -> float16 1183 (('f2fmp', ('u2f32', 'a@32')), ('u2fmp', a)), 1184 (('f2fmp', ('i2f32', 'a@32')), ('i2fmp', a)), 1185 1186 # float32 -> float16 -> int16 ==> float32 -> int16 1187 (('f2u16', ('f2fmp', 'a@32')), ('f2u16', a)), 1188 (('f2i16', ('f2fmp', 'a@32')), ('f2i16', a)), 1189 1190 # float32 -> int32 -> int16 ==> float32 -> int16 1191 (('i2imp', ('f2u32', 'a@32')), ('f2ump', a)), 1192 (('i2imp', ('f2i32', 'a@32')), ('f2imp', a)), 1193 1194 # int32 -> int16 -> float16 ==> int32 -> float16 1195 (('u2f16', ('i2imp', 'a@32')), ('u2f16', a)), 1196 (('i2f16', ('i2imp', 'a@32')), ('i2f16', a)), 1197]) 1198 1199# Clean up junk left from 8-bit integer to 16-bit integer lowering. 1200optimizations.extend([ 1201 # The u2u16(u2u8(X)) just masks off the upper 8-bits of X. This can be 1202 # accomplished by mask the upper 8-bit of the immediate operand to the 1203 # iand instruction. Often times, both patterns will end up being applied 1204 # to the same original expression tree. 1205 (('iand', ('u2u16', ('u2u8', 'a@16')), '#b'), ('iand', a, ('iand', b, 0xff))), 1206 (('u2u16', ('u2u8(is_used_once)', ('iand', 'a@16', '#b'))), ('iand', a, ('iand', b, 0xff))), 1207]) 1208 1209for op in ['iand', 'ior', 'ixor']: 1210 optimizations.extend([ 1211 (('u2u8', (op, ('u2u16', ('u2u8', 'a@16')), ('u2u16', ('u2u8', 'b@16')))), ('u2u8', (op, a, b))), 1212 (('u2u8', (op, ('u2u16', ('u2u8', 'a@32')), ('u2u16', ('u2u8', 'b@32')))), ('u2u8', (op, a, b))), 1213 1214 # Undistribute extract from a logic op 1215 ((op, ('extract_i8', a, '#b'), ('extract_i8', c, b)), ('extract_i8', (op, a, c), b)), 1216 ((op, ('extract_u8', a, '#b'), ('extract_u8', c, b)), ('extract_u8', (op, a, c), b)), 1217 ((op, ('extract_i16', a, '#b'), ('extract_i16', c, b)), ('extract_i16', (op, a, c), b)), 1218 ((op, ('extract_u16', a, '#b'), ('extract_u16', c, b)), ('extract_u16', (op, a, c), b)), 1219 1220 # Undistribute shifts from a logic op 1221 ((op, ('ushr(is_used_once)', a, '#b'), ('ushr', c, b)), ('ushr', (op, a, c), b)), 1222 ((op, ('ishr(is_used_once)', a, '#b'), ('ishr', c, b)), ('ishr', (op, a, c), b)), 1223 ((op, ('ishl(is_used_once)', a, '#b'), ('ishl', c, b)), ('ishl', (op, a, c), b)), 1224 ]) 1225 1226# Integer sizes 1227for s in [8, 16, 32, 64]: 1228 amount_bits = int(math.log2(s)) 1229 1230 lower_umin = 'options->lower_umin' 1231 lower_umax = 'options->lower_umax' 1232 lower_imin = 'false' 1233 lower_imax = 'false' 1234 lower_ior = 'options->lower_bitops' 1235 if s == 64: 1236 lower_umin = '(options->lower_umin || (options->lower_int64_options & nir_lower_minmax64) != 0)' 1237 lower_umax = '(options->lower_umax || (options->lower_int64_options & nir_lower_minmax64) != 0)' 1238 lower_imin = '((options->lower_int64_options & nir_lower_minmax64) != 0)' 1239 lower_imax = '((options->lower_int64_options & nir_lower_minmax64) != 0)' 1240 lower_ior = '(options->lower_bitops || (options->lower_int64_options & nir_lower_logic64) != 0)' 1241 1242 optimizations.extend([ 1243 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('ior', a, b), 0), lower_umax + ' && !' + lower_ior), 1244 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('ior', a, b), 0), lower_umin + ' && !' + lower_ior), 1245 (('iand', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umax', a, b), 0), '!'+lower_umax), 1246 (('ior', ('ieq', 'a@{}'.format(s), 0), ('ieq', 'b@{}'.format(s), 0)), ('ieq', ('umin', a, b), 0), '!'+lower_umin), 1247 (('iand', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umin', a, b), 0), '!'+lower_umin), 1248 (('ior', ('ine', 'a@{}'.format(s), 0), ('ine', 'b@{}'.format(s), 0)), ('ine', ('umax', a, b), 0), '!'+lower_umax), 1249 1250 (('bcsel', ('ult', 'b@{}'.format(s), a), b, a), ('umin', a, b), '!'+lower_umin), 1251 (('bcsel', ('ult', 'a@{}'.format(s), b), b, a), ('umax', a, b), '!'+lower_umax), 1252 (('bcsel', ('uge', 'a@{}'.format(s), b), b, a), ('umin', a, b), '!'+lower_umin), 1253 (('bcsel', ('uge', 'b@{}'.format(s), a), b, a), ('umax', a, b), '!'+lower_umax), 1254 (('bcsel', ('ilt', 'b@{}'.format(s), a), b, a), ('imin', a, b), '!'+lower_imin), 1255 (('bcsel', ('ilt', 'a@{}'.format(s), b), b, a), ('imax', a, b), '!'+lower_imax), 1256 (('bcsel', ('ige', 'a@{}'.format(s), b), b, a), ('imin', a, b), '!'+lower_imin), 1257 (('bcsel', ('ige', 'b@{}'.format(s), a), b, a), ('imax', a, b), '!'+lower_imax), 1258 1259 # True/False are ~0 and 0 in NIR. b2i of True is 1, and -1 is ~0 (True). 1260 (('ineg', ('b2i{}'.format(s), 'a@{}'.format(s))), a), 1261 1262 # SM5 32-bit shifts are defined to use the 5 least significant bits (or 4 bits for 16 bits) 1263 (('ishl', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishl', a, b)), 1264 (('ishr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ishr', a, b)), 1265 (('ushr', 'a@{}'.format(s), ('iand', s - 1, b)), ('ushr', a, b)), 1266 (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 1), amount_bits - 1)), ('ushr', a, ('ishl', b, amount_bits - 1))), 1267 (('ushr', 'a@{}'.format(s), ('ishl(is_used_once)', ('iand', b, 3), amount_bits - 2)), ('ushr', a, ('ishl', b, amount_bits - 2))), 1268 ]) 1269 1270optimizations.extend([ 1271 # Common pattern like 'if (i == 0 || i == 1 || ...)' 1272 (('ior', ('ieq', a, 0), ('ieq', a, 1)), ('uge', 1, a)), 1273 (('ior', ('uge', 1, a), ('ieq', a, 2)), ('uge', 2, a)), 1274 (('ior', ('uge', 2, a), ('ieq', a, 3)), ('uge', 3, a)), 1275 (('ior', a, ('ieq', a, False)), True), 1276 1277 (('uge', a, 1), ('ine', a, 0)), 1278 (('ult', a, 1), ('ieq', a, 0)), 1279 1280 (('ine', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), ('ine', a, b)), 1281 (('b2i', ('ine', 'a@1', 'b@1')), ('b2i', ('ixor', a, b))), 1282 1283 (('ishl', ('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), '#c'), 1284 ('bcsel', ('ige', ('iand', c, 31), ('find_lsb', b)), 1285 ('ishl', ('iand', a, b), ('iadd', ('iand', c, 31), ('ineg', ('find_lsb', b)))), 1286 ('ushr', ('iand', a, b), ('iadd', ('ineg', ('iand', c, 31)), ('find_lsb', b))) 1287 ) 1288 ), 1289 1290 (('b2i32', ('ine', ('iand', 'a@32', '#b(is_pos_power_of_two)'), 0)), 1291 ('ushr', ('iand', a, b), ('find_lsb', b)), '!options->lower_bitops'), 1292 1293 (('ior', ('b2i', a), ('iand', b, 1)), ('iand', ('ior', ('b2i', a), b), 1)), 1294 (('iand', ('b2i', a), ('iand', b, 1)), ('iand', ('b2i', a), b)), 1295 1296 # This pattern occurs coutresy of __flt64_nonnan in the soft-fp64 code. 1297 # The first part of the iand comes from the !__feq64_nonnan. 1298 # 1299 # The second pattern is a reformulation of the first based on the relation 1300 # (a == 0 || y == 0) <=> umin(a, y) == 0, where b in the first equation 1301 # happens to be y == 0. 1302 (('iand', ('inot', ('iand', ('ior', ('ieq', a, 0), b), c)), ('ilt', a, 0)), 1303 ('iand', ('inot', ('iand', b , c)), ('ilt', a, 0))), 1304 (('iand', ('inot', ('iand', ('ieq', ('umin', a, b), 0), c)), ('ilt', a, 0)), 1305 ('iand', ('inot', ('iand', ('ieq', b , 0), c)), ('ilt', a, 0))), 1306 1307 # These patterns can result when (a < b || a < c) => (a < min(b, c)) 1308 # transformations occur before constant propagation and loop-unrolling. 1309 # 1310 # The flt versions are exact. If isnan(a), the original pattern is 1311 # trivially false, and the replacements are false too. If isnan(b): 1312 # 1313 # a < fmax(NaN, a) => a < a => false vs a < NaN => false 1314 (('flt', a, ('fmax', b, a)), ('flt', a, b)), 1315 (('flt', ('fmin', a, b), a), ('flt', b, a)), 1316 (('~fge', a, ('fmin', b, a)), True), 1317 (('~fge', ('fmax', a, b), a), True), 1318 (('flt', a, ('fmin', b, a)), False), 1319 (('flt', ('fmax', a, b), a), False), 1320 (('~fge', a, ('fmax', b, a)), ('fge', a, b)), 1321 (('~fge', ('fmin', a, b), a), ('fge', b, a)), 1322 1323 (('ilt', a, ('imax', b, a)), ('ilt', a, b)), 1324 (('ilt', ('imin', a, b), a), ('ilt', b, a)), 1325 (('ige', a, ('imin', b, a)), True), 1326 (('ige', ('imax', a, b), a), True), 1327 (('ult', a, ('umax', b, a)), ('ult', a, b)), 1328 (('ult', ('umin', a, b), a), ('ult', b, a)), 1329 (('uge', a, ('umin', b, a)), True), 1330 (('uge', ('umax', a, b), a), True), 1331 (('ilt', a, ('imin', b, a)), False), 1332 (('ilt', ('imax', a, b), a), False), 1333 (('ige', a, ('imax', b, a)), ('ige', a, b)), 1334 (('ige', ('imin', a, b), a), ('ige', b, a)), 1335 (('ult', a, ('umin', b, a)), False), 1336 (('ult', ('umax', a, b), a), False), 1337 (('uge', a, ('umax', b, a)), ('uge', a, b)), 1338 (('uge', ('umin', a, b), a), ('uge', b, a)), 1339 (('ult', a, ('iand', b, a)), False), 1340 (('ult', ('ior', a, b), a), False), 1341 (('uge', a, ('iand', b, a)), True), 1342 (('uge', ('ior', a, b), a), True), 1343 1344 (('ilt', '#a', ('imax', '#b', c)), ('ior', ('ilt', a, b), ('ilt', a, c))), 1345 (('ilt', ('imin', '#a', b), '#c'), ('ior', ('ilt', a, c), ('ilt', b, c))), 1346 (('ige', '#a', ('imin', '#b', c)), ('ior', ('ige', a, b), ('ige', a, c))), 1347 (('ige', ('imax', '#a', b), '#c'), ('ior', ('ige', a, c), ('ige', b, c))), 1348 (('ult', '#a', ('umax', '#b', c)), ('ior', ('ult', a, b), ('ult', a, c))), 1349 (('ult', ('umin', '#a', b), '#c'), ('ior', ('ult', a, c), ('ult', b, c))), 1350 (('uge', '#a', ('umin', '#b', c)), ('ior', ('uge', a, b), ('uge', a, c))), 1351 (('uge', ('umax', '#a', b), '#c'), ('ior', ('uge', a, c), ('uge', b, c))), 1352 (('ilt', '#a', ('imin', '#b', c)), ('iand', ('ilt', a, b), ('ilt', a, c))), 1353 (('ilt', ('imax', '#a', b), '#c'), ('iand', ('ilt', a, c), ('ilt', b, c))), 1354 (('ige', '#a', ('imax', '#b', c)), ('iand', ('ige', a, b), ('ige', a, c))), 1355 (('ige', ('imin', '#a', b), '#c'), ('iand', ('ige', a, c), ('ige', b, c))), 1356 (('ult', '#a', ('umin', '#b', c)), ('iand', ('ult', a, b), ('ult', a, c))), 1357 (('ult', ('umax', '#a', b), '#c'), ('iand', ('ult', a, c), ('ult', b, c))), 1358 (('uge', '#a', ('umax', '#b', c)), ('iand', ('uge', a, b), ('uge', a, c))), 1359 (('uge', ('umin', '#a', b), '#c'), ('iand', ('uge', a, c), ('uge', b, c))), 1360 1361 # Thanks to sign extension, the ishr(a, b) is negative if and only if a is 1362 # negative. 1363 (('bcsel', ('ilt', a, 0), ('ineg', ('ishr', a, b)), ('ishr', a, b)), 1364 ('iabs', ('ishr', a, b))), 1365 (('iabs', ('ishr', ('iabs', a), b)), ('ushr', ('iabs', a), b)), 1366 (('iabs', ('ushr', ('iabs', a), b)), ('ushr', ('iabs', a), b)), 1367 1368 (('fabs', ('slt', a, b)), ('slt', a, b)), 1369 (('fabs', ('sge', a, b)), ('sge', a, b)), 1370 (('fabs', ('seq', a, b)), ('seq', a, b)), 1371 (('fabs', ('sne', a, b)), ('sne', a, b)), 1372 (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), 1373 (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), 1374 (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), 1375 (('sne', a, b), ('b2f', ('fneu', a, b)), 'options->lower_scmp'), 1376 (('seq', ('seq', a, b), 1.0), ('seq', a, b)), 1377 (('seq', ('sne', a, b), 1.0), ('sne', a, b)), 1378 (('seq', ('slt', a, b), 1.0), ('slt', a, b)), 1379 (('seq', ('sge', a, b), 1.0), ('sge', a, b)), 1380 (('sne', ('seq', a, b), 0.0), ('seq', a, b)), 1381 (('sne', ('sne', a, b), 0.0), ('sne', a, b)), 1382 (('sne', ('slt', a, b), 0.0), ('slt', a, b)), 1383 (('sne', ('sge', a, b), 0.0), ('sge', a, b)), 1384 (('seq', ('seq', a, b), 0.0), ('sne', a, b)), 1385 (('seq', ('sne', a, b), 0.0), ('seq', a, b)), 1386 (('seq', ('slt', a, b), 0.0), ('sge', a, b)), 1387 (('seq', ('sge', a, b), 0.0), ('slt', a, b)), 1388 (('sne', ('seq', a, b), 1.0), ('sne', a, b)), 1389 (('sne', ('sne', a, b), 1.0), ('seq', a, b)), 1390 (('sne', ('slt', a, b), 1.0), ('sge', a, b)), 1391 (('sne', ('sge', a, b), 1.0), ('slt', a, b)), 1392 (('fall_equal2', a, b), ('fmin', ('seq', 'a.x', 'b.x'), ('seq', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1393 (('fall_equal3', a, b), ('seq', ('fany_nequal3', a, b), 0.0), 'options->lower_vector_cmp'), 1394 (('fall_equal4', a, b), ('seq', ('fany_nequal4', a, b), 0.0), 'options->lower_vector_cmp'), 1395 (('fall_equal8', a, b), ('seq', ('fany_nequal8', a, b), 0.0), 'options->lower_vector_cmp'), 1396 (('fall_equal16', a, b), ('seq', ('fany_nequal16', a, b), 0.0), 'options->lower_vector_cmp'), 1397 (('fany_nequal2', a, b), ('fmax', ('sne', 'a.x', 'b.x'), ('sne', 'a.y', 'b.y')), 'options->lower_vector_cmp'), 1398 (('fany_nequal3', a, b), ('fsat', ('fdot3', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1399 (('fany_nequal4', a, b), ('fsat', ('fdot4', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1400 (('fany_nequal8', a, b), ('fsat', ('fdot8', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1401 (('fany_nequal16', a, b), ('fsat', ('fdot16', ('sne', a, b), ('sne', a, b))), 'options->lower_vector_cmp'), 1402]) 1403 1404def vector_cmp(reduce_op, cmp_op, comps): 1405 if len(comps) == 1: 1406 return (cmp_op, 'a.' + comps[0], 'b.' + comps[0]) 1407 else: 1408 mid = len(comps) // 2 1409 return (reduce_op, vector_cmp(reduce_op, cmp_op, comps[:mid]), 1410 vector_cmp(reduce_op, cmp_op, comps[mid:])) 1411 1412for op in [ 1413 ('ball_iequal', 'ieq', 'iand'), 1414 ('ball_fequal', 'feq', 'iand'), 1415 ('bany_inequal', 'ine', 'ior'), 1416 ('bany_fnequal', 'fneu', 'ior'), 1417]: 1418 optimizations.extend([ 1419 ((op[0] + '2', a, b), vector_cmp(op[2], op[1], 'xy'), 'options->lower_vector_cmp'), 1420 ((op[0] + '3', a, b), vector_cmp(op[2], op[1], 'xyz'), 'options->lower_vector_cmp'), 1421 ((op[0] + '4', a, b), vector_cmp(op[2], op[1], 'xyzw'), 'options->lower_vector_cmp'), 1422 ((op[0] + '8', a, b), vector_cmp(op[2], op[1], 'abcdefgh'), 'options->lower_vector_cmp'), 1423 ((op[0] + '16', a, b), vector_cmp(op[2], op[1], 'abcdefghijklmnop'), 'options->lower_vector_cmp'), 1424 ]) 1425 1426# D3D Boolean emulation 1427for s in [8, 16, 32, 64]: 1428 cond = 'true' 1429 if s == 64: 1430 cond = '!(options->lower_int64_options & nir_lower_conv64)' 1431 1432 optimizations.extend([ 1433 (('bcsel@{}'.format(s), a, -1, 0), ('ineg', ('b2i', 'a@1')), cond), 1434 (('bcsel@{}'.format(s), a, 0, -1), ('ineg', ('b2i', ('inot', a))), cond), 1435 (('bcsel@{}'.format(s), a, 1, 0), ('b2i', 'a@1'), cond), 1436 (('bcsel@{}'.format(s), a, 0, 1), ('b2i', ('inot', a)), cond), 1437 ]) 1438 1439optimizations.extend([ 1440 (('iand', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1441 ('ineg', ('b2i', ('iand', a, b)))), 1442 (('ior', ('ineg', ('b2i','a@1')), ('ineg', ('b2i', 'b@1'))), 1443 ('ineg', ('b2i', ('ior', a, b)))), 1444 (('ieq', ('ineg', ('b2i', 'a@1')), -1), a), 1445 (('ine', ('ineg', ('b2i', 'a@1')), -1), ('inot', a)), 1446 (('ige', ('ineg', ('b2i', 'a@1')), 0), ('inot', a)), 1447 (('ilt', ('ineg', ('b2i', 'a@1')), 0), a), 1448 (('ult', 0, ('ineg', ('b2i', 'a@1'))), a), 1449 (('iand', ('ineg', ('b2i', a)), 1.0), ('b2f', a)), 1450 (('iand', ('ineg', ('b2i', a)), 1), ('b2i', a)), 1451]) 1452 1453optimizations.extend([ 1454 (('feq', ('seq', a, b), 1.0), ('feq', a, b)), 1455 (('feq', ('sne', a, b), 1.0), ('fneu', a, b)), 1456 (('feq', ('slt', a, b), 1.0), ('flt', a, b)), 1457 (('feq', ('sge', a, b), 1.0), ('fge', a, b)), 1458 (('fneu', ('seq', a, b), 0.0), ('feq', a, b)), 1459 (('fneu', ('sne', a, b), 0.0), ('fneu', a, b)), 1460 (('fneu', ('slt', a, b), 0.0), ('flt', a, b)), 1461 (('fneu', ('sge', a, b), 0.0), ('fge', a, b)), 1462 (('feq', ('seq', a, b), 0.0), ('fneu', a, b)), 1463 (('feq', ('sne', a, b), 0.0), ('feq', a, b)), 1464 (('feq', ('slt', a, b), 0.0), ('fge', a, b)), 1465 (('feq', ('sge', a, b), 0.0), ('flt', a, b)), 1466 (('fneu', ('seq', a, b), 1.0), ('fneu', a, b)), 1467 (('fneu', ('sne', a, b), 1.0), ('feq', a, b)), 1468 (('fneu', ('slt', a, b), 1.0), ('fge', a, b)), 1469 (('fneu', ('sge', a, b), 1.0), ('flt', a, b)), 1470 1471 (('fneu', ('fneg', a), a), ('fneu', a, 0.0)), 1472 (('feq', ('fneg', a), a), ('feq', a, 0.0)), 1473 # Emulating booleans 1474 (('imul', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1475 (('iand', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1476 (('ior', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1477 (('fmul', ('b2f', 'a@1'), ('b2f', 'b@1')), ('b2f', ('iand', a, b))), 1478 (('ffma', ('b2f', 'a@1'), ('b2f', 'b@1'), c), ('fadd', ('b2f', ('iand', a, b)), c)), 1479 (('fsat', ('fadd', ('b2f', 'a@1'), ('b2f', 'b@1'))), ('b2f', ('ior', a, b))), 1480 (('iand', 'a@bool16', 1.0), ('b2f', a)), 1481 (('iand', 'a@bool32', 1.0), ('b2f', a)), 1482 (('flt', ('fneg', ('b2f', 'a@1')), 0), a), # Generated by TGSI KILL_IF. 1483 # Comparison with the same args. Note that these are only done for the 1484 # float versions when the source must be a number. Generally, NaN cmp NaN 1485 # produces the opposite result of X cmp X. flt is the outlier. NaN < NaN 1486 # is false, and, for any number X, X < X is also false. 1487 (('ilt', a, a), False), 1488 (('ige', a, a), True), 1489 (('ieq', a, a), True), 1490 (('ine', a, a), False), 1491 (('ult', a, a), False), 1492 (('uge', a, a), True), 1493 (('flt', a, a), False), 1494 (('fge', 'a(is_a_number)', a), True), 1495 (('feq', 'a(is_a_number)', a), True), 1496 (('fneu', 'a(is_a_number)', a), False), 1497 # Logical and bit operations 1498 (('iand', a, a), a), 1499 (('iand', a, 0), 0), 1500 (('iand', a, -1), a), 1501 (('iand', a, ('inot', a)), 0), 1502 (('ior', a, a), a), 1503 (('ior', a, 0), a), 1504 (('ior', a, -1), -1), 1505 (('ior', a, ('inot', a)), -1), 1506 (('ixor', a, a), 0), 1507 (('ixor', a, 0), a), 1508 (('ixor', a, ('ixor', a, b)), b), 1509 (('ixor', a, -1), ('inot', a)), 1510 (('inot', ('inot', a)), a), 1511 (('ior', ('iand', a, b), b), b), 1512 (('ior', ('ior', a, b), b), ('ior', a, b)), 1513 (('iand', ('ior', a, b), b), b), 1514 (('iand', ('iand', a, b), b), ('iand', a, b)), 1515 1516 # It is common for sequences of (x & 1) to occur in large trees. Replacing 1517 # an expression like ((a & 1) & (b & 1)) with ((a & b) & 1) allows the "& 1518 # 1" to eventually bubble up to the top of the tree. 1519 (('iand', ('iand(is_used_once)', a, b), ('iand(is_used_once)', a, c)), 1520 ('iand', a, ('iand', b, c))), 1521 1522 (('iand@64', a, '#b(is_lower_half_zero)'), 1523 ('pack_64_2x32_split', 0, 1524 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1525 '!options->lower_pack_64_2x32_split'), 1526 (('iand@64', a, '#b(is_upper_half_zero)'), 1527 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1528 0), 1529 '!options->lower_pack_64_2x32_split'), 1530 (('iand@64', a, '#b(is_lower_half_negative_one)'), 1531 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1532 ('iand', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1533 '!options->lower_pack_64_2x32_split'), 1534 (('iand@64', a, '#b(is_upper_half_negative_one)'), 1535 ('pack_64_2x32_split', ('iand', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1536 ('unpack_64_2x32_split_y', a)), 1537 '!options->lower_pack_64_2x32_split'), 1538 1539 (('ior@64', a, '#b(is_lower_half_zero)'), 1540 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1541 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1542 '!options->lower_pack_64_2x32_split'), 1543 (('ior@64', a, '#b(is_upper_half_zero)'), 1544 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1545 ('unpack_64_2x32_split_y', a)), 1546 '!options->lower_pack_64_2x32_split'), 1547 (('ior@64', a, '#b(is_lower_half_negative_one)'), 1548 ('pack_64_2x32_split', -1, 1549 ('ior', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1550 '!options->lower_pack_64_2x32_split'), 1551 (('ior@64', a, '#b(is_upper_half_negative_one)'), 1552 ('pack_64_2x32_split', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1553 -1), 1554 '!options->lower_pack_64_2x32_split'), 1555 1556 (('ixor@64', a, '#b(is_lower_half_zero)'), 1557 ('pack_64_2x32_split', ('unpack_64_2x32_split_x', a), 1558 ('ixor', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b))), 1559 '!options->lower_pack_64_2x32_split'), 1560 (('ixor@64', a, '#b(is_upper_half_zero)'), 1561 ('pack_64_2x32_split', ('ixor', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_x', b)), 1562 ('unpack_64_2x32_split_y', a)), 1563 '!options->lower_pack_64_2x32_split'), 1564 1565 # DeMorgan's Laws 1566 (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))), 1567 (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))), 1568 # Shift optimizations 1569 (('ishl', 0, a), 0), 1570 (('ishl', a, 0), a), 1571 (('ishr', 0, a), 0), 1572 (('ishr', -1, a), -1), 1573 (('ishr', a, 0), a), 1574 (('ushr', 0, a), 0), 1575 (('ushr', a, 0), a), 1576 (('bcsel', ('ieq', b, 0), a, ('ushr', a, b)), ('ushr', a, b)), 1577 (('bcsel', ('ieq', b, 0), a, ('ishr', a, b)), ('ishr', a, b)), 1578 (('bcsel', ('ieq', b, 0), a, ('ishl', a, b)), ('ishl', a, b)), 1579 (('bcsel', ('ine', b, 0), ('ushr', a, b), a), ('ushr', a, b)), 1580 (('bcsel', ('ine', b, 0), ('ishr', a, b), a), ('ishr', a, b)), 1581 (('bcsel', ('ine', b, 0), ('ishl', a, b), a), ('ishl', a, b)), 1582 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('iadd', 16, ('ineg', b)))), ('urol', a, b), 'options->has_rotate16'), 1583 (('ior', ('ishl@16', a, b), ('ushr@16', a, ('isub', 16, b))), ('urol', a, b), 'options->has_rotate16'), 1584 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('iadd', 32, ('ineg', b)))), ('urol', a, b), 'options->has_rotate32'), 1585 (('ior', ('ishl@32', a, b), ('ushr@32', a, ('isub', 32, b))), ('urol', a, b), 'options->has_rotate32'), 1586 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('iadd', 16, ('ineg', b)))), ('uror', a, b), 'options->has_rotate16'), 1587 (('ior', ('ushr@16', a, b), ('ishl@16', a, ('isub', 16, b))), ('uror', a, b), 'options->has_rotate16'), 1588 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('iadd', 32, ('ineg', b)))), ('uror', a, b), 'options->has_rotate32'), 1589 (('ior', ('ushr@32', a, b), ('ishl@32', a, ('isub', 32, b))), ('uror', a, b), 'options->has_rotate32'), 1590 (('urol@8', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 8, b))), '!options->has_rotate8'), 1591 (('urol@16', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 16, b))), '!options->has_rotate16'), 1592 (('urol@32', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 32, b))), '!options->has_rotate32'), 1593 (('urol@64', a, b), ('ior', ('ishl', a, b), ('ushr', a, ('isub', 64, b)))), 1594 (('uror@8', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 8, b))), '!options->has_rotate8'), 1595 (('uror@16', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 16, b))), '!options->has_rotate16'), 1596 (('uror@32', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 32, b))), '!options->has_rotate32'), 1597 (('uror@64', a, b), ('ior', ('ushr', a, b), ('ishl', a, ('isub', 64, b)))), 1598 1599 (('bitfield_select', 0xff000000, ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'), 1600 (('bitfield_select', 0xffff0000, ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'), 1601 (('bitfield_select', 0xffffff00, ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'), 1602 (('ior', ('ishl', 'b@32', 24), ('ushr', a, 8)), ('shfr', b, a, 8), 'options->has_shfr32'), 1603 (('ior', ('ishl', 'b@32', 16), ('extract_u16', a, 1)), ('shfr', b, a, 16), 'options->has_shfr32'), 1604 (('ior', ('ishl', 'b@32', 8), ('extract_u8', a, 3)), ('shfr', b, a, 24), 'options->has_shfr32'), 1605 (('bcsel', ('ieq', c, 0), a, ('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c))), ('shfr', b, a, c), 'options->has_shfr32'), 1606 (('bcsel', ('ine', c, 0), ('ior', ('ishl', 'b@32', ('iadd', 32, ('ineg', c))), ('ushr@32', a, c)), a), ('shfr', b, a, c), 'options->has_shfr32'), 1607 (('ior', ('ishl', 'a@32', ('iadd', 32, ('ineg', b))), ('ushr@32', a, b)), ('shfr', a, a, b), 'options->has_shfr32 && !options->has_rotate32'), 1608 1609 # bfi(X, a, b) = (b & ~X) | (a & X) 1610 # If X = ~0: (b & 0) | (a & 0xffffffff) = a 1611 # If X = 0: (b & 0xffffffff) | (a & 0) = b 1612 (('bfi', 0xffffffff, a, b), a), 1613 (('bfi', 0x00000000, a, b), b), 1614 1615 # The result of -int(some_bool) is 0 or 0xffffffff, so the result of the 1616 # bfi is either b or c. 1617 (('bfi', ('ineg', ('b2i', 'a@1')), b, c), ('bcsel', a, b, c)), 1618 1619 # bfi(a, 0, 0) = ((0 << find_lsb(a)) & a) | (0 & ~a) 1620 # = 0 1621 (('bfi', a, 0, 0), 0), 1622 1623 # bfi(a, b, b) = ((b << find_lsb(a)) & a) | (b & ~a) 1624 # = (a & b) | (b & ~a) If a is odd, find_lsb(a) == 0 1625 # = b 1626 (('bfi', '#a(is_odd)', b, b), b), 1627 1628 # bfi(a, a, b) = ((a << find_lsb(a)) & a) | (b & ~a) 1629 # = (a & a) | (b & ~a) If a is odd, find_lsb(a) == 0 1630 # = a | (b & ~a) 1631 # = a | b 1632 (('bfi', '#a(is_odd)', a, b), ('ior', a, b)), 1633 1634 # bfi(a, b, 0) = ((b << find_lsb(a)) & a) | (0 & ~a) 1635 # = ((b << find_lsb(a)) & a) 1636 # = (b & a) If a is odd, find_lsb(a) == 0 1637 (('bfi', '#a(is_odd)', b, 0), ('iand', a, b)), 1638 1639 # Because 'a' is a positive power of two, the result of the bfi is either 0 1640 # or 'a' depending on whether or not 'b' is odd. Use 'b&1' for the zero 1641 # value to help platforms that can't have two constants in a bcsel. 1642 (('u2f32', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1643 ('bcsel', ('ieq', ('iand', b, 1), 0), ('iand', b, 1), ('u2f', a))), 1644 (('u2f', ('bfi', '#a(is_pos_power_of_two)', b, 0)), 1645 ('bcsel', ('ieq', ('iand', b, 1), 0), 0, ('u2f', a))), 1646 1647 # Exponential/logarithmic identities 1648 (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a 1649 (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a 1650 # 32-bit fpow should use fmulz to fix https://gitlab.freedesktop.org/mesa/mesa/-/issues/11464 (includes apitrace) 1651 (('fpow@32', a, b), ('fexp2', ('fmulz', ('flog2', a), b)), 'options->lower_fpow && ' + has_fmulz), # a^b = 2^(lg2(a)*b) 1652 (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) 1653 (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b 1654 (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), 1655 ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d 1656 (('~fexp2', ('fmul', ('flog2', a), 0.5)), ('fsqrt', a)), 1657 (('~fexp2', ('fmul', ('flog2', a), 2.0)), ('fmul', a, a)), 1658 (('~fexp2', ('fmul', ('flog2', a), 3.0)), ('fmul', ('fmul', a, a), a)), 1659 (('~fexp2', ('fmul', ('flog2', a), 4.0)), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1660 (('~fexp2', ('fmul', ('flog2', a), 5.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), a)), 1661 (('~fexp2', ('fmul', ('flog2', a), 6.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', a, a))), 1662 (('~fexp2', ('fmul', ('flog2', a), 8.0)), ('fmul', ('fmul', ('fmul', a, a), ('fmul', a, a)), ('fmul', ('fmul', a, a), ('fmul', a, a)))), 1663 (('~fpow', a, 1.0), a), 1664 (('~fpow', a, 2.0), ('fmul', a, a)), 1665 (('~fpow', a, 3.0), ('fmul', ('fmul', a, a), a)), 1666 (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), 1667 (('~fpow', 2.0, a), ('fexp2', a)), 1668 (('~fpow', ('fpow', a, 2.2), 0.454545), a), 1669 (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), 1670 (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), 1671 (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), 1672 (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), 1673 (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), 1674 (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), 1675 (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), 1676 (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), 1677 (('~fmul', ('fexp2(is_used_once)', a), ('fexp2(is_used_once)', b)), ('fexp2', ('fadd', a, b))), 1678 (('bcsel', ('flt', a, 0.0), 0.0, ('fsqrt', a)), ('fsqrt', ('fmax', a, 0.0))), 1679 (('~fmul', ('fsqrt', a), ('fsqrt', a)), ('fabs',a)), 1680 (('~fmulz', ('fsqrt', a), ('fsqrt', a)), ('fabs', a)), 1681 # Division and reciprocal 1682 (('~fdiv', 1.0, a), ('frcp', a)), 1683 (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), 1684 (('~frcp', ('frcp', a)), a), 1685 (('~frcp', ('fsqrt', a)), ('frsq', a)), 1686 (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), 1687 (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), 1688 # Trig 1689 (('fsin', a), lowered_sincos(0.5), 'options->lower_sincos'), 1690 (('fcos', a), lowered_sincos(0.75), 'options->lower_sincos'), 1691 # Boolean simplifications 1692 (('ieq', a, True), a), 1693 (('ine(is_not_used_by_if)', a, True), ('inot', a)), 1694 (('ine', a, False), a), 1695 (('ieq(is_not_used_by_if)', a, False), ('inot', 'a')), 1696 (('bcsel', a, True, False), a), 1697 (('bcsel', a, False, True), ('inot', a)), 1698 (('bcsel', True, b, c), b), 1699 (('bcsel', False, b, c), c), 1700 1701 (('bcsel@16', a, 1.0, 0.0), ('b2f', a)), 1702 (('bcsel@16', a, 0.0, 1.0), ('b2f', ('inot', a))), 1703 (('bcsel@16', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1704 (('bcsel@16', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1705 (('bcsel@32', a, 1.0, 0.0), ('b2f', a)), 1706 (('bcsel@32', a, 0.0, 1.0), ('b2f', ('inot', a))), 1707 (('bcsel@32', a, -1.0, -0.0), ('fneg', ('b2f', a))), 1708 (('bcsel@32', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a)))), 1709 (('bcsel@64', a, 1.0, 0.0), ('b2f', a), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1710 (('bcsel@64', a, 0.0, 1.0), ('b2f', ('inot', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1711 (('bcsel@64', a, -1.0, -0.0), ('fneg', ('b2f', a)), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1712 (('bcsel@64', a, -0.0, -1.0), ('fneg', ('b2f', ('inot', a))), '!(options->lower_doubles_options & nir_lower_fp64_full_software)'), 1713 1714 (('bcsel', a, b, b), b), 1715 (('~fcsel', a, b, b), b), 1716 1717 # With D3D booleans, imax is AND and umax is OR 1718 (('imax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1719 ('ineg', ('b2i', ('iand', a, b)))), 1720 (('imin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1721 ('ineg', ('b2i', ('ior', a, b)))), 1722 (('umax', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1723 ('ineg', ('b2i', ('ior', a, b)))), 1724 (('umin', ('ineg', ('b2i', 'a@1')), ('ineg', ('b2i', 'b@1'))), 1725 ('ineg', ('b2i', ('iand', a, b)))), 1726 (('umax', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('ior', a, b))), 1727 (('umin', ('b2i', 'a@1'), ('b2i', 'b@1')), ('b2i', ('iand', a, b))), 1728 1729 # Clean up LLVM booleans. b2i output is 0/1 so iand is a no-op. 1730 (('iand', ('b2i', a), 1), ('b2i', a)), 1731 1732 (('ine', ('umin', ('ineg', ('b2i', 'a@1')), b), 0), ('iand', a, ('ine', b, 0))), 1733 (('ine', ('umax', ('ineg', ('b2i', 'a@1')), b), 0), ('ior' , a, ('ine', b, 0))), 1734 1735 # Conversions 1736 (('f2i', ('ftrunc', a)), ('f2i', a)), 1737 (('f2u', ('ftrunc', a)), ('f2u', a)), 1738 1739 # Conversions from 16 bits to 32 bits and back can always be removed 1740 (('f2fmp', ('f2f32', 'a@16')), a), 1741 (('i2imp', ('i2i32', 'a@16')), a), 1742 (('i2imp', ('u2u32', 'a@16')), a), 1743 1744 (('f2imp', ('f2f32', 'a@16')), ('f2i16', a)), 1745 (('f2ump', ('f2f32', 'a@16')), ('f2u16', a)), 1746 (('i2fmp', ('i2i32', 'a@16')), ('i2f16', a)), 1747 (('u2fmp', ('u2u32', 'a@16')), ('u2f16', a)), 1748 1749 (('f2fmp', ('b2f32', 'a@1')), ('b2f16', a)), 1750 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1751 (('i2imp', ('b2i32', 'a@1')), ('b2i16', a)), 1752 1753 (('f2imp', ('b2f32', 'a@1')), ('b2i16', a)), 1754 (('f2ump', ('b2f32', 'a@1')), ('b2i16', a)), 1755 (('i2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1756 (('u2fmp', ('b2i32', 'a@1')), ('b2f16', a)), 1757 1758 # Conversions to 16 bits would be lossy so they should only be removed if 1759 # the instruction was generated by the precision lowering pass. 1760 (('f2f32', ('f2fmp', 'a@32')), a), 1761 (('i2i32', ('i2imp', 'a@32')), a), 1762 (('u2u32', ('i2imp', 'a@32')), a), 1763 1764 # typeA@32 -> typeB@16 -> typeB@32 ==> typeA@32 -> typeB@32 1765 (('i2i32', ('f2imp', 'a@32')), ('f2i32', a)), 1766 (('u2u32', ('f2ump', 'a@32')), ('f2u32', a)), 1767 (('f2f32', ('i2fmp', 'a@32')), ('i2f32', a)), 1768 (('f2f32', ('u2fmp', 'a@32')), ('u2f32', a)), 1769 1770 # typeA@32 -> typeA@16 -> typeB@32 ==> typeA@32 -> typeB@32 1771 (('f2i32', ('f2fmp', 'a@32')), ('f2i32', a)), 1772 (('f2u32', ('f2fmp', 'a@32')), ('f2u32', a)), 1773 (('i2f32', ('i2imp', 'a@32')), ('i2f32', a)), 1774 1775 (('ffloor', 'a(is_integral)'), a), 1776 (('fceil', 'a(is_integral)'), a), 1777 (('ftrunc', 'a(is_integral)'), a), 1778 (('fround_even', 'a(is_integral)'), a), 1779 1780 # fract(x) = x - floor(x), so fract(NaN) = NaN 1781 (('~ffract', 'a(is_integral)'), 0.0), 1782 (('fabs', 'a(is_not_negative)'), a), 1783 (('iabs', 'a(is_not_negative)'), a), 1784 (('fsat', 'a(is_not_positive)'), 0.0), 1785 1786 (('~fmin', 'a(is_not_negative)', 1.0), ('fsat', a), '!options->lower_fsat'), 1787 1788 # The result of the multiply must be in [-1, 0], so the result of the ffma 1789 # must be in [0, 1]. 1790 (('flt', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), False), 1791 (('flt', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), False), 1792 (('fmax', ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0), 0.0), ('fadd', ('fmul', ('fsat', a), ('fneg', ('fsat', a))), 1.0)), 1793 (('fmax', ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0), 0.0), ('fadd', ('fneg', ('fmul', ('fsat', a), ('fsat', a))), 1.0)), 1794 1795 (('fneu', 'a(is_not_zero)', 0.0), True), 1796 (('feq', 'a(is_not_zero)', 0.0), False), 1797 1798 # In this chart, + means value > 0 and - means value < 0. 1799 # 1800 # + >= + -> unknown 0 >= + -> false - >= + -> false 1801 # + >= 0 -> true 0 >= 0 -> true - >= 0 -> false 1802 # + >= - -> true 0 >= - -> true - >= - -> unknown 1803 # 1804 # Using grouping conceptually similar to a Karnaugh map... 1805 # 1806 # (+ >= 0, + >= -, 0 >= 0, 0 >= -) == (is_not_negative >= is_not_positive) -> true 1807 # (0 >= +, - >= +) == (is_not_positive >= gt_zero) -> false 1808 # (- >= +, - >= 0) == (lt_zero >= is_not_negative) -> false 1809 # 1810 # The flt / ilt cases just invert the expected result. 1811 # 1812 # The results expecting true, must be marked imprecise. The results 1813 # expecting false are fine because NaN compared >= or < anything is false. 1814 1815 (('fge', 'a(is_a_number_not_negative)', 'b(is_a_number_not_positive)'), True), 1816 (('fge', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1817 (('fge', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1818 1819 (('flt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1820 (('flt', 'a(is_a_number_not_positive)', 'b(is_a_number_gt_zero)'), True), 1821 (('flt', 'a(is_a_number_lt_zero)', 'b(is_a_number_not_negative)'), True), 1822 1823 (('ine', 'a(is_not_zero)', 0), True), 1824 (('ieq', 'a(is_not_zero)', 0), False), 1825 1826 (('ige', 'a(is_not_negative)', 'b(is_not_positive)'), True), 1827 (('ige', 'a(is_not_positive)', 'b(is_gt_zero)'), False), 1828 (('ige', 'a(is_lt_zero)', 'b(is_not_negative)'), False), 1829 1830 (('ilt', 'a(is_not_negative)', 'b(is_not_positive)'), False), 1831 (('ilt', 'a(is_not_positive)', 'b(is_gt_zero)'), True), 1832 (('ilt', 'a(is_lt_zero)', 'b(is_not_negative)'), True), 1833 1834 (('ult', 0, 'a(is_gt_zero)'), True), 1835 (('ult', a, 0), False), 1836]) 1837 1838# Packing and then unpacking does nothing 1839for pack, bits, compbits in [('pack_64_2x32', 64, 32), ('pack_32_2x16', 32, 16)]: 1840 unpack = 'un' + pack 1841 optimizations += [ 1842 ((unpack + '_split_x', (pack + '_split', a, b)), a), 1843 ((unpack + '_split_y', (pack + '_split', a, b)), b), 1844 ((unpack + '_split_x', (pack, a)), 'a.x'), 1845 ((unpack + '_split_y', (pack, a)), 'a.y'), 1846 ((unpack + '_split_x', ('u2u' + str(bits), 'a@' + str(compbits))), a), 1847 ((unpack + '_split_x', ('i2i' + str(bits), 'a@' + str(compbits))), a), 1848 ((unpack + '_split_y', ('i2i' + str(bits) + '(is_used_once)', 'a@' + str(compbits))), ('ishr', a, compbits - 1)), 1849 ((unpack, (pack + '_split', a, b)), ('vec2', a, b)), 1850 ((unpack, (pack, a)), a), 1851 ((pack + '_split', (unpack + '_split_x', a), (unpack + '_split_y', a)), a), 1852 ((pack + '_split', (unpack, a), (unpack + '.y', a)), a), 1853 ((pack, ('vec2', (unpack + '_split_x', a), (unpack + '_split_y', a))), a), 1854 ((pack, (unpack, a)), a), 1855 ] 1856 1857optimizations.extend([ 1858 (('unpack_64_2x32_split_y', ('u2u64', 'a@1')), 0), 1859 (('unpack_64_2x32_split_y', ('u2u64', 'a@8')), 0), 1860 (('unpack_64_2x32_split_y', ('u2u64', 'a@16')), 0), 1861 (('unpack_64_2x32_split_y', ('u2u64', 'a@32')), 0), # Don't do that for u64 -> u64 1862 (('unpack_double_2x32_dxil', ('pack_double_2x32_dxil', a)), a), 1863 (('pack_double_2x32_dxil', ('unpack_double_2x32_dxil', a)), a), 1864 1865 (('unpack_64_4x16', ('pack_64_4x16', a)), a), 1866 (('pack_64_4x16', ('unpack_64_4x16', a)), a), 1867 (('unpack_32_4x8', ('pack_32_4x8', a)), a), 1868 (('pack_32_4x8', ('unpack_32_4x8', a)), a), 1869 1870 (('unpack_64_4x16', ('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)))), ('vec4', a, b, c, d)), 1871 (('unpack_64_4x16', ('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), ('vec4', a, b, c, d)), 1872 1873 (('pack_64_2x32_split', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d)), 1874 ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'), 1875 (('pack_64_2x32', ('vec2', ('pack_32_2x16_split', a, b), ('pack_32_2x16_split', c, d))), 1876 ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'), 1877 (('pack_64_2x32', ('vec2', ('pack_32_2x16', ('vec2', a, b)), ('pack_32_2x16', ('vec2', c, d)))), 1878 ('pack_64_4x16', ('vec4', a, b, c, d)), '!options->lower_pack_64_4x16'), 1879 1880 # Comparing two halves of an unpack separately. While this optimization 1881 # should be correct for non-constant values, it's less obvious that it's 1882 # useful in that case. For constant values, the pack will fold and we're 1883 # guaranteed to reduce the whole tree to one instruction. 1884 (('iand', ('ieq', ('unpack_32_2x16_split_x', a), '#b'), 1885 ('ieq', ('unpack_32_2x16_split_y', a), '#c')), 1886 ('ieq', a, ('pack_32_2x16_split', b, c))), 1887 1888 # Byte extraction 1889 (('ushr', 'a@16', 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1890 (('ushr', 'a@32', 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1891 (('ushr', 'a@64', 56), ('extract_u8', a, 7), '!options->lower_extract_byte'), 1892 (('ishr', 'a@16', 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1893 (('ishr', 'a@32', 24), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1894 (('ishr', 'a@64', 56), ('extract_i8', a, 7), '!options->lower_extract_byte'), 1895 (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1896 (('ishr', ('iand', 'a@32', 0x0000ff00), 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1897 (('ishr', ('iand', 'a@64', 0x0000ff00), 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1898 (('ishr', ('iand', a, 0x00ff0000), 16), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1899 1900 # Common pattern in many Vulkan CTS tests that read 8-bit integers from a 1901 # storage buffer. 1902 (('u2u8', ('extract_u16', a, 1)), ('u2u8', ('extract_u8', a, 2)), '!options->lower_extract_byte'), 1903 (('u2u8', ('ushr', a, 8)), ('u2u8', ('extract_u8', a, 1)), '!options->lower_extract_byte'), 1904 1905 # Common pattern after lowering 8-bit integers to 16-bit. 1906 (('i2i16', ('u2u8', ('extract_u8', a, b))), ('i2i16', ('extract_i8', a, b))), 1907 (('u2u16', ('u2u8', ('extract_u8', a, b))), ('u2u16', ('extract_u8', a, b))), 1908 1909 (('ubfe', a, 0, 8), ('extract_u8', a, 0), '!options->lower_extract_byte'), 1910 (('ubfe', a, 8, 8), ('extract_u8', a, 1), '!options->lower_extract_byte'), 1911 (('ubfe', a, 16, 8), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1912 (('ubfe', a, 24, 8), ('extract_u8', a, 3), '!options->lower_extract_byte'), 1913 (('ibfe', a, 0, 8), ('extract_i8', a, 0), '!options->lower_extract_byte'), 1914 (('ibfe', a, 8, 8), ('extract_i8', a, 1), '!options->lower_extract_byte'), 1915 (('ibfe', a, 16, 8), ('extract_i8', a, 2), '!options->lower_extract_byte'), 1916 (('ibfe', a, 24, 8), ('extract_i8', a, 3), '!options->lower_extract_byte'), 1917 1918 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 1919 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 1920 1921 # The extract_X8(a & 0xff) patterns aren't included because the iand will 1922 # already be converted to extract_u8. 1923 (('extract_i8', ('iand', a, 0x0000ff00), 1), ('extract_i8', a, 1)), 1924 (('extract_i8', ('iand', a, 0x00ff0000), 2), ('extract_i8', a, 2)), 1925 (('extract_i8', ('iand', a, 0xff000000), 3), ('extract_i8', a, 3)), 1926 1927 (('extract_u8', ('iand', a, 0x0000ff00), 1), ('extract_u8', a, 1)), 1928 (('extract_u8', ('iand', a, 0x00ff0000), 2), ('extract_u8', a, 2)), 1929 (('extract_u8', ('iand', a, 0xff000000), 3), ('extract_u8', a, 3)), 1930 1931 (('iand', ('extract_u8', a, 0), '#b'), ('iand', a, ('iand', b, 0x00ff))), 1932 (('iand', ('extract_u16', a, 0), '#b'), ('iand', a, ('iand', b, 0xffff))), 1933 1934 (('ieq', ('iand', ('extract_u8', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b), 8))), 0)), 1935 (('ine', ('iand', ('extract_u8', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0x00ff), ('imul', ('i2i32', b), 8))), 0)), 1936 (('ieq', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ieq', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)), 1937 (('ine', ('iand', ('extract_u16(is_used_once)', a, '#b'), '#c'), 0), ('ine', ('iand', a, ('ishl', ('iand', c, 0xffff), ('imul', ('i2i32', b), 16))), 0)), 1938 1939 # Word extraction 1940 (('ushr', ('ishl', 'a@32', 16), 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1941 (('ushr', 'a@32', 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1942 (('ishr', ('ishl', 'a@32', 16), 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1943 (('ishr', 'a@32', 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1944 (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), 1945 1946 (('ubfe', a, 0, 16), ('extract_u16', a, 0), '!options->lower_extract_word'), 1947 (('ubfe', a, 16, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), 1948 (('ibfe', a, 0, 16), ('extract_i16', a, 0), '!options->lower_extract_word'), 1949 (('ibfe', a, 16, 16), ('extract_i16', a, 1), '!options->lower_extract_word'), 1950 1951 # Packing a u8vec4 to write to an SSBO. 1952 (('ior', ('ishl', ('u2u32', 'a@8'), 24), ('ior', ('ishl', ('u2u32', 'b@8'), 16), ('ior', ('ishl', ('u2u32', 'c@8'), 8), ('u2u32', 'd@8')))), 1953 ('pack_32_4x8', ('vec4', d, c, b, a)), 'options->has_pack_32_4x8'), 1954 1955 (('extract_u16', ('extract_i16', a, b), 0), ('extract_u16', a, b)), 1956 (('extract_u16', ('extract_u16', a, b), 0), ('extract_u16', a, b)), 1957 1958 # The extract_X16(a & 0xff) patterns aren't included because the iand will 1959 # already be converted to extract_u8. 1960 (('extract_i16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), # extract_u8 is correct 1961 (('extract_u16', ('iand', a, 0x00ff0000), 1), ('extract_u8', a, 2), '!options->lower_extract_byte'), 1962 1963 # Lower pack/unpack 1964 (('pack_64_2x32_split', a, b), ('ior', ('u2u64', a), ('ishl', ('u2u64', b), 32)), 'options->lower_pack_64_2x32_split'), 1965 (('pack_32_2x16_split', a, b), ('ior', ('u2u32', a), ('ishl', ('u2u32', b), 16)), 'options->lower_pack_32_2x16_split || options->lower_pack_split'), 1966 (('pack_half_2x16_split', a, b), ('pack_half_2x16_rtz_split', a, b), 'options->has_pack_half_2x16_rtz'), 1967 (('unpack_64_2x32_split_x', a), ('u2u32', a), 'options->lower_unpack_64_2x32_split'), 1968 (('unpack_64_2x32_split_y', a), ('u2u32', ('ushr', a, 32)), 'options->lower_unpack_64_2x32_split'), 1969 (('unpack_32_2x16_split_x', a), ('u2u16', a), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1970 (('unpack_32_2x16_split_y', a), ('u2u16', ('ushr', a, 16)), 'options->lower_unpack_32_2x16_split || options->lower_pack_split'), 1971 1972 (('unpack_64_2x32_split_x', ('ushr', a, 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1973 (('u2u32', ('ushr', 'a@64', 32)), ('unpack_64_2x32_split_y', a), '!options->lower_unpack_64_2x32_split'), 1974 1975 # Useless masking before unpacking 1976 (('unpack_half_2x16_split_x', ('iand', a, 0xffff)), ('unpack_half_2x16_split_x', a)), 1977 (('unpack_32_2x16_split_x', ('iand', a, 0xffff)), ('unpack_32_2x16_split_x', a)), 1978 (('unpack_64_2x32_split_x', ('iand', a, 0xffffffff)), ('unpack_64_2x32_split_x', a)), 1979 (('unpack_half_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_half_2x16_split_y', a)), 1980 (('unpack_32_2x16_split_y', ('iand', a, 0xffff0000)), ('unpack_32_2x16_split_y', a)), 1981 (('unpack_64_2x32_split_y', ('iand', a, 0xffffffff00000000)), ('unpack_64_2x32_split_y', a)), 1982 1983 (('unpack_half_2x16_split_x', ('extract_u16', a, 0)), ('unpack_half_2x16_split_x', a)), 1984 (('unpack_half_2x16_split_x', ('extract_u16', a, 1)), ('unpack_half_2x16_split_y', a)), 1985 (('unpack_half_2x16_split_x', ('ushr', a, 16)), ('unpack_half_2x16_split_y', a)), 1986 (('unpack_32_2x16_split_x', ('extract_u16', a, 0)), ('unpack_32_2x16_split_x', a)), 1987 (('unpack_32_2x16_split_x', ('extract_u16', a, 1)), ('unpack_32_2x16_split_y', a)), 1988 1989 # Optimize half packing 1990 (('ishl', ('pack_half_2x16', ('vec2', a, 0)), 16), ('pack_half_2x16', ('vec2', 0, a))), 1991 (('ushr', ('pack_half_2x16', ('vec2', 0, a)), 16), ('pack_half_2x16', ('vec2', a, 0))), 1992 1993 (('iadd', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1994 ('pack_half_2x16', ('vec2', a, b))), 1995 (('ior', ('pack_half_2x16', ('vec2', a, 0)), ('pack_half_2x16', ('vec2', 0, b))), 1996 ('pack_half_2x16', ('vec2', a, b))), 1997 1998 (('ishl', ('pack_half_2x16_split', a, 0), 16), ('pack_half_2x16_split', 0, a)), 1999 (('ushr', ('pack_half_2x16_split', 0, a), 16), ('pack_half_2x16_split', a, 0)), 2000 (('extract_u16', ('pack_half_2x16_split', 0, a), 1), ('pack_half_2x16_split', a, 0)), 2001 2002 (('ishl', ('pack_half_2x16_rtz_split', a, 0), 16), ('pack_half_2x16_rtz_split', 0, a)), 2003 (('ushr', ('pack_half_2x16_rtz_split', 0, a), 16), ('pack_half_2x16_rtz_split', a, 0)), 2004 (('extract_u16', ('pack_half_2x16_rtz_split', 0, a), 1), ('pack_half_2x16_rtz_split', a, 0)), 2005 2006 (('iadd', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 2007 (('ior', ('pack_half_2x16_split', a, 0), ('pack_half_2x16_split', 0, b)), ('pack_half_2x16_split', a, b)), 2008 2009 (('iadd', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 2010 (('ior', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', 0, b)), ('pack_half_2x16_rtz_split', a, b)), 2011 2012 (('pack_uint_2x16', ('vec2', ('pack_half_2x16_rtz_split', a, 0), ('pack_half_2x16_rtz_split', b, 0))), ('pack_half_2x16_rtz_split', a, b)), 2013 2014 (('bfi', 0xffff0000, ('pack_half_2x16_split', a, b), ('pack_half_2x16_split', c, d)), 2015 ('pack_half_2x16_split', c, a)), 2016 2017 # The important part here is that ~0xf & 0xfffffffc = ~0xf. 2018 (('iand', ('bfi', 0x0000000f, '#a', b), 0xfffffffc), 2019 ('bfi', 0x0000000f, ('iand', a, 0xfffffffc), b)), 2020 (('iand', ('bfi', 0x00000007, '#a', b), 0xfffffffc), 2021 ('bfi', 0x00000007, ('iand', a, 0xfffffffc), b)), 2022 2023 # 0x0f << 3 == 0x78, so that's already the maximum possible value. 2024 (('umin', ('ishl', ('iand', a, 0xf), 3), 0x78), ('ishl', ('iand', a, 0xf), 3)), 2025 2026 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 0), ('i2i', a)), 2027 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 1), ('i2i', b)), 2028 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 2), ('i2i', c)), 2029 (('extract_i8', ('pack_32_4x8_split', a, b, c, d), 3), ('i2i', d)), 2030 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 0), ('u2u', a)), 2031 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 1), ('u2u', b)), 2032 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 2), ('u2u', c)), 2033 (('extract_u8', ('pack_32_4x8_split', a, b, c, d), 3), ('u2u', d)), 2034 2035 # Reduce intermediate precision with int64. 2036 (('u2u32', ('iadd(is_used_once)', 'a@64', b)), 2037 ('iadd', ('u2u32', a), ('u2u32', b))), 2038 2039 (('u2u32', ('imul(is_used_once)', 'a@64', b)), 2040 ('imul', ('u2u32', a), ('u2u32', b))), 2041 2042 (('u2f32', ('u2u64', 'a@32')), ('u2f32', a)), 2043 2044 # UINT32_MAX < a just checks the high half of a 64-bit value. This occurs 2045 # when lowering convert_uint_sat(ulong). Although the replacement is more 2046 # instructions, it replaces a 64-bit instruction with a 32-bit instruction 2047 # and a move that will likely be coalesced. 2048 (('ult', 0xffffffff, 'a@64'), ('ine', ('unpack_64_2x32_split_y', a), 0)), 2049 2050 # Redundant trip through 8-bit 2051 (('i2i16', ('u2u8', ('iand', 'a@16', 1))), ('iand', 'a@16', 1)), 2052 (('u2u16', ('u2u8', ('iand', 'a@16', 1))), ('iand', 'a@16', 1)), 2053 2054 # Reduce 16-bit integers to 1-bit booleans, hit with OpenCL. In turn, this 2055 # lets iand(b2i1(...), 1) get simplified. Backends can usually fuse iand/inot 2056 # so this should be no worse when it isn't strictly better. 2057 (('bcsel', a, 0, ('b2i16', 'b@1')), ('b2i16', ('iand', ('inot', a), b))), 2058 (('bcsel', a, ('b2i16', 'b@1'), ('b2i16', 'c@1')), ('b2i16', ('bcsel', a, b, c))), 2059 2060 # Lowered pack followed by lowered unpack, for the high bits 2061 (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@8')), 32)), ('u2u32', a)), 2062 (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@16')), 32)), ('u2u32', a)), 2063 (('u2u32', ('ushr', ('ior', ('ishl', a, 32), ('u2u64', 'b@32')), 32)), ('u2u32', a)), 2064 (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', 'b@8')), 16)), ('u2u16', a)), 2065 (('u2u16', ('ushr', ('ior', ('ishl', a, 16), ('u2u32', 'b@16')), 16)), ('u2u16', a)), 2066]) 2067 2068# After the ('extract_u8', a, 0) pattern, above, triggers, there will be 2069# patterns like those below. 2070for op in ('ushr', 'ishr'): 2071 optimizations.extend([(('extract_u8', (op, 'a@16', 8), 0), ('extract_u8', a, 1))]) 2072 optimizations.extend([(('extract_u8', (op, 'a@32', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 4)]) 2073 optimizations.extend([(('extract_u8', (op, 'a@64', 8 * i), 0), ('extract_u8', a, i)) for i in range(1, 8)]) 2074 2075optimizations.extend([(('extract_u8', ('extract_u16', a, 1), 0), ('extract_u8', a, 2))]) 2076 2077# After the ('extract_[iu]8', a, 3) patterns, above, trigger, there will be 2078# patterns like those below. 2079for op in ('extract_u8', 'extract_i8'): 2080 optimizations.extend([((op, ('ishl', 'a@16', 8), 1), (op, a, 0))]) 2081 optimizations.extend([((op, ('ishl', 'a@32', 24 - 8 * i), 3), (op, a, i)) for i in range(2, -1, -1)]) 2082 optimizations.extend([((op, ('ishl', 'a@64', 56 - 8 * i), 7), (op, a, i)) for i in range(6, -1, -1)]) 2083 2084for op, repl in [('ieq', 'ieq'), ('ine', 'ine'), 2085 ('ult', 'ult'), ('ilt', 'ult'), 2086 ('uge', 'uge'), ('ige', 'uge')]: 2087 optimizations.extend([ 2088 ((op, ('pack_64_2x32_split', a, 0), ('pack_64_2x32_split', b, 0)), (repl, a, b)), 2089 ((op, ('pack_64_2x32_split', a, 0), '#b(is_upper_half_zero)'), (repl, a, ('unpack_64_2x32_split_x', b))), 2090 ((op, '#a(is_upper_half_zero)', ('pack_64_2x32_split', b, 0)), (repl, ('unpack_64_2x32_split_x', a), b)), 2091 2092 ((op, ('pack_64_2x32_split', 0, a), ('pack_64_2x32_split', 0, b)), (op, a, b)), 2093 ((op, ('pack_64_2x32_split', 0, a), '#b(is_lower_half_zero)'), (op, a, ('unpack_64_2x32_split_y', b))), 2094 ((op, '#a(is_lower_half_zero)', ('pack_64_2x32_split', 0, b)), (op, ('unpack_64_2x32_split_y', a), b)), 2095 ]) 2096 2097optimizations.extend([ 2098 # Subtracts 2099 (('ussub_4x8_vc4', a, 0), a), 2100 (('ussub_4x8_vc4', a, ~0), 0), 2101 # Lower all Subtractions first - they can get recombined later 2102 (('fsub', a, b), ('fadd', a, ('fneg', b))), 2103 (('isub', a, b), ('iadd', a, ('ineg', b))), 2104 (('uabs_usub', a, b), ('bcsel', ('ult', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 2105 # This is correct. We don't need isub_sat because the result type is unsigned, so it cannot overflow. 2106 (('uabs_isub', a, b), ('bcsel', ('ilt', a, b), ('ineg', ('isub', a, b)), ('isub', a, b))), 2107 (('bitz', a, b), ('inot', ('bitnz', a, b))), 2108 2109 # Propagate negation up multiplication chains 2110 (('fmul(is_used_by_non_fsat)', ('fneg', a), b), ('fneg', ('fmul', a, b))), 2111 (('fmulz(is_used_by_non_fsat,nsz)', ('fneg', a), b), ('fneg', ('fmulz', a, b))), 2112 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 2113 (('ffmaz', ('fneg', a), ('fneg', b), c), ('ffmaz', a, b, c)), 2114 (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), 2115 2116 # Propagate constants up multiplication chains 2117 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmul', ('fmul', a, c), b)), 2118 (('~fmulz(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fmulz', ('fmulz', a, c), b)), 2119 (('~fmul(is_used_once)', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)'), ('fmulz', ('fmul', a, c), b)), 2120 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('imul', ('imul', a, c), b)), 2121 (('~ffma', ('fmul(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffma', ('fmul', a, c), b, d)), 2122 (('~ffmaz', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c', d), ('ffmaz', ('fmulz', a, c), b, d)), 2123 (('~ffma', ('fmulz(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_finite_not_zero)', d), ('ffmaz', ('fmul', a, c), b, d)), 2124 # Prefer moving out a multiplication for more MAD/FMA-friendly code 2125 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_fmul)'), '#c'), ('fadd', ('fadd', a, c), b)), 2126 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('fadd', ('fadd', a, c), b)), 2127 (('~fadd(is_used_once)', ('ffma(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffma', a, b, d), c)), 2128 (('~fadd(is_used_once)', ('ffmaz(is_used_once)', 'a(is_not_const)', b, 'c(is_not_const)'), '#d'), ('fadd', ('ffmaz', a, b, d), c)), 2129 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c'), ('iadd', ('iadd', a, c), b)), 2130 2131 # Reassociate constants in add/mul chains so they can be folded together. 2132 # For now, we mostly only handle cases where the constants are separated by 2133 # a single non-constant. We could do better eventually. 2134 (('~fmul', '#a', ('fmul', 'b(is_not_const)', '#c')), ('fmul', ('fmul', a, c), b)), 2135 (('~fmulz', '#a', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmulz', a, c), b)), 2136 (('~fmul', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c')), ('fmulz', ('fmul', a, c), b)), 2137 (('~ffma', '#a', ('fmul', 'b(is_not_const)', '#c'), d), ('ffma', ('fmul', a, c), b, d)), 2138 (('~ffmaz', '#a', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmulz', a, c), b, d)), 2139 (('~ffmaz', '#a(is_finite_not_zero)', ('fmulz', 'b(is_not_const)', '#c'), d), ('ffmaz', ('fmul', a, c), b, d)), 2140 (('imul', '#a', ('imul', 'b(is_not_const)', '#c')), ('imul', ('imul', a, c), b)), 2141 (('~fadd', '#a', ('fadd', 'b(is_not_const)', '#c')), ('fadd', ('fadd', a, c), b)), 2142 (('~fadd', '#a', ('fneg', ('fadd', 'b(is_not_const)', '#c'))), ('fadd', ('fadd', a, ('fneg', c)), ('fneg', b))), 2143 (('~fadd', '#a', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffma', b, c, ('fadd', a, d))), 2144 (('~fadd', '#a', ('fneg', ('ffma', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffma', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 2145 (('~fadd', '#a', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d')), ('ffmaz', b, c, ('fadd', a, d))), 2146 (('~fadd', '#a', ('fneg', ('ffmaz', 'b(is_not_const)', 'c(is_not_const)', '#d'))), ('ffmaz', ('fneg', b), c, ('fadd', a, ('fneg', d)))), 2147 (('iadd', '#a', ('iadd', 'b(is_not_const)', '#c')), ('iadd', ('iadd', a, c), b)), 2148 (('iand', '#a', ('iand', 'b(is_not_const)', '#c')), ('iand', ('iand', a, c), b)), 2149 (('ior', '#a', ('ior', 'b(is_not_const)', '#c')), ('ior', ('ior', a, c), b)), 2150 (('ixor', '#a', ('ixor', 'b(is_not_const)', '#c')), ('ixor', ('ixor', a, c), b)), 2151 (('ior', ('iand', a, '#c'), ('ior', b, ('iand', a, '#d'))), ('ior', b, ('iand', a, ('ior', c, d)))), 2152 2153 # Reassociate add chains for more MAD/FMA-friendly code 2154 (('~fadd', ('fadd(is_used_once)', 'a(is_fmul)', 'b(is_fmul)'), 'c(is_not_fmul)'), ('fadd', ('fadd', a, c), b)), 2155 2156 # Drop mul-div by the same value when there's no wrapping. 2157 (('idiv', ('imul(no_signed_wrap)', a, b), b), a), 2158 2159 # By definition... 2160 (('bcsel', ('ige', ('find_lsb', a), 0), ('find_lsb', a), -1), ('find_lsb', a)), 2161 (('bcsel', ('ige', ('ifind_msb', a), 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 2162 (('bcsel', ('ige', ('ufind_msb', a), 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 2163 (('bcsel', ('ige', ('ifind_msb_rev', a), 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2164 (('bcsel', ('ige', ('ufind_msb_rev', a), 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 2165 2166 (('bcsel', ('ine', a, 0), ('find_lsb', a), -1), ('find_lsb', a)), 2167 (('bcsel', ('ine', a, 0), ('ifind_msb', a), -1), ('ifind_msb', a)), 2168 (('bcsel', ('ine', a, 0), ('ufind_msb', a), -1), ('ufind_msb', a)), 2169 (('bcsel', ('ine', a, 0), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2170 (('bcsel', ('ine', a, 0), ('ufind_msb_rev', a), -1), ('ufind_msb_rev', a)), 2171 2172 (('bcsel', ('ine', a, -1), ('ifind_msb', a), -1), ('ifind_msb', a)), 2173 (('bcsel', ('ine', a, -1), ('ifind_msb_rev', a), -1), ('ifind_msb_rev', a)), 2174 2175 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), -1), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2176 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2177 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2178 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2179 (('bcsel', ('ine', ('ifind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb', a))), ('ifind_msb', a)), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2180 (('bcsel', ('ine', ('ufind_msb', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2181 (('bcsel', ('ieq', ('ifind_msb', 'a@32'), -1), ('ifind_msb', a), ('iadd', 31, ('ineg', ('ifind_msb', a)))), ('ifind_msb_rev', a), 'options->has_find_msb_rev'), 2182 (('bcsel', ('ieq', ('ufind_msb', 'a@32'), -1), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2183 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), -1), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2184 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2185 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb', a))), ('ufind_msb', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2186 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb', a), ('iadd', 31, ('ineg', ('ufind_msb', a)))), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2187 2188 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), -1), ('ifind_msb', a), '!options->lower_ifind_msb'), 2189 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 2190 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 2191 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2192 (('bcsel', ('ine', ('ifind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ifind_msb_rev', a))), ('ifind_msb_rev', a)), ('ifind_msb', a), '!options->lower_ifind_msb'), 2193 (('bcsel', ('ine', ('ufind_msb_rev', 'a@32'), -1), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 2194 (('bcsel', ('ieq', ('ifind_msb_rev', 'a@32'), -1), ('ifind_msb_rev', a), ('iadd', 31, ('ineg', ('ifind_msb_rev', a)))), ('ifind_msb', a), '!options->lower_ifind_msb'), 2195 (('bcsel', ('ieq', ('ufind_msb_rev', 'a@32'), -1), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2196 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), -1), ('ufind_msb', a), '!options->lower_ufind_msb'), 2197 (('bcsel', ('ieq', 'a@32', 0), -1, ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2198 (('bcsel', ('ine', 'a@32', 0), ('iadd', 31, ('ineg', ('ufind_msb_rev', a))), ('ufind_msb_rev', a)), ('ufind_msb', a), '!options->lower_ufind_msb'), 2199 (('bcsel', ('ieq', 'a@32', 0), ('ufind_msb_rev', a), ('iadd', 31, ('ineg', ('ufind_msb_rev', a)))), ('ufind_msb', a), '!options->lower_ufind_msb'), 2200 2201 # Clear the LSB 2202 (('iand', a, ('inot', ('ishl', 1, ('find_lsb', a)))), ('iand', a, ('inot', ('ineg', a)))), 2203 2204 # This is safe. Both ufind_msb_rev and bitfield_reverse can only have 2205 # 32-bit sources, so the transformation can only generate correct NIR. 2206 (('find_lsb', ('bitfield_reverse', a)), ('ufind_msb_rev', a), 'options->has_find_msb_rev'), 2207 (('ufind_msb_rev', ('bitfield_reverse', a)), ('find_lsb', a), '!options->lower_find_lsb'), 2208 2209 (('ifind_msb', ('f2i32(is_used_once)', a)), ('ufind_msb', ('f2i32', ('fabs', a)))), 2210 (('ifind_msb', ('extract_u8', a, b)), ('ufind_msb', ('extract_u8', a, b))), 2211 (('ifind_msb', ('extract_u16', a, b)), ('ufind_msb', ('extract_u16', a, b))), 2212 (('ifind_msb', ('imax', a, 1)), ('ufind_msb', ('imax', a, 1))), 2213 2214 (('~fmul', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 2215 (('~fmul', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 2216 (('~fmulz', ('bcsel(is_used_once)', c, -1.0, 1.0), b), ('bcsel', c, ('fneg', b), b)), 2217 (('~fmulz', ('bcsel(is_used_once)', c, 1.0, -1.0), b), ('bcsel', c, b, ('fneg', b))), 2218 (('fabs', ('bcsel(is_used_once)', b, ('fneg', a), a)), ('fabs', a)), 2219 (('fabs', ('bcsel(is_used_once)', b, a, ('fneg', a))), ('fabs', a)), 2220 (('~bcsel', ('flt', a, 0.0), ('fneg', a), a), ('fabs', a)), 2221 2222 (('bcsel', a, ('bcsel(is_used_once)', b, c, d), d), ('bcsel', ('iand', a, b), c, d)), 2223 (('bcsel', a, ('bcsel(is_used_once)', b, d, c), d), ('bcsel', ('iand', a, ('inot', b)), c, d)), 2224 (('bcsel', a, b, ('bcsel(is_used_once)', c, b, d)), ('bcsel', ('ior', a, c), b, d)), 2225 (('bcsel', a, b, ('bcsel(is_used_once)', c, d, b)), ('bcsel', ('iand', c, ('inot', a)), d, b)), 2226 2227 # Misc. lowering 2228 (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), 2229 (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), 2230 (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'), 2231 (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'), 2232 2233 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2234 ('bcsel', ('ult', 31, 'bits'), 'insert', 2235 ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')), 2236 'options->lower_bitfield_insert && options->has_bfm && options->has_bfi'), 2237 (('ihadd', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2238 (('uhadd', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2239 (('irhadd', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2240 (('urhadd', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd'), 2241 (('ihadd@64', a, b), ('iadd', ('iand', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2242 (('uhadd@64', a, b), ('iadd', ('iand', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2243 (('irhadd@64', a, b), ('isub', ('ior', a, b), ('ishr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2244 (('urhadd@64', a, b), ('isub', ('ior', a, b), ('ushr', ('ixor', a, b), 1)), 'options->lower_hadd64 || (options->lower_int64_options & nir_lower_iadd64) != 0'), 2245 2246 (('imul_32x16', a, b), ('imul', a, ('extract_i16', b, 0)), 'options->lower_mul_32x16'), 2247 (('umul_32x16', a, b), ('imul', a, ('extract_u16', b, 0)), 'options->lower_mul_32x16'), 2248 2249 (('uadd_sat@64', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 2250 'options->lower_uadd_sat || (options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64)) != 0'), 2251 (('uadd_sat', a, b), ('bcsel', ('ult', ('iadd', a, b), a), -1, ('iadd', a, b)), 'options->lower_uadd_sat'), 2252 (('usub_sat', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), 'options->lower_usub_sat'), 2253 (('usub_sat@64', a, b), ('bcsel', ('ult', a, b), 0, ('isub', a, b)), '(options->lower_int64_options & nir_lower_usub_sat64) != 0'), 2254 2255 # int64_t sum = a + b; 2256 # 2257 # if (a < 0 && b < 0 && a < sum) 2258 # sum = INT64_MIN; 2259 # } else if (a >= 0 && b >= 0 && sum < a) 2260 # sum = INT64_MAX; 2261 # } 2262 # 2263 # A couple optimizations are applied. 2264 # 2265 # 1. a < sum => sum >= 0. This replacement works because it is known that 2266 # a < 0 and b < 0, so sum should also be < 0 unless there was 2267 # underflow. 2268 # 2269 # 2. sum < a => sum < 0. This replacement works because it is known that 2270 # a >= 0 and b >= 0, so sum should also be >= 0 unless there was 2271 # overflow. 2272 # 2273 # 3. Invert the second if-condition and swap the order of parameters for 2274 # the bcsel. !(a >= 0 && b >= 0 && sum < 0) becomes !(a >= 0) || !(b >= 2275 # 0) || !(sum < 0), and that becomes (a < 0) || (b < 0) || (sum >= 0) 2276 # 2277 # On Intel Gen11, this saves ~11 instructions. 2278 (('iadd_sat@64', a, b), ('bcsel', 2279 ('iand', ('iand', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 2280 0x8000000000000000, 2281 ('bcsel', 2282 ('ior', ('ior', ('ilt', a, 0), ('ilt', b, 0)), ('ige', ('iadd', a, b), 0)), 2283 ('iadd', a, b), 2284 0x7fffffffffffffff)), 2285 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 2286 2287 # int64_t sum = a - b; 2288 # 2289 # if (a < 0 && b >= 0 && a < sum) 2290 # sum = INT64_MIN; 2291 # } else if (a >= 0 && b < 0 && a >= sum) 2292 # sum = INT64_MAX; 2293 # } 2294 # 2295 # Optimizations similar to the iadd_sat case are applied here. 2296 (('isub_sat@64', a, b), ('bcsel', 2297 ('iand', ('iand', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 2298 0x8000000000000000, 2299 ('bcsel', 2300 ('ior', ('ior', ('ilt', a, 0), ('ige', b, 0)), ('ige', ('isub', a, b), 0)), 2301 ('isub', a, b), 2302 0x7fffffffffffffff)), 2303 '(options->lower_int64_options & nir_lower_iadd_sat64) != 0'), 2304 2305 # These are done here instead of in the backend because the int64 lowering 2306 # pass will make a mess of the patterns. The first patterns are 2307 # conditioned on nir_lower_minmax64 because it was not clear that it was 2308 # always an improvement on platforms that have real int64 support. No 2309 # shaders in shader-db hit this, so it was hard to say one way or the 2310 # other. 2311 (('ilt', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2312 (('ilt', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ilt', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2313 (('ige', ('imax(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imax', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2314 (('ige', ('imin(is_used_once)', 'a@64', 'b@64'), 0), ('ige', ('imin', ('unpack_64_2x32_split_y', a), ('unpack_64_2x32_split_y', b)), 0), '(options->lower_int64_options & nir_lower_minmax64) != 0'), 2315 (('ilt', 'a@64', 0), ('ilt', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2316 (('ige', 'a@64', 0), ('ige', ('unpack_64_2x32_split_y', a), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2317 2318 (('ine', 'a@64', 0), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2319 (('ieq', 'a@64', 0), ('ieq', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2320 # 0u < uint(a) <=> uint(a) != 0u 2321 (('ult', 0, 'a@64'), ('ine', ('ior', ('unpack_64_2x32_split_x', a), ('unpack_64_2x32_split_y', a)), 0), '(options->lower_int64_options & nir_lower_icmp64) != 0'), 2322 2323 # Alternative lowering that doesn't rely on bfi. 2324 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2325 ('bcsel', ('ult', 31, 'bits'), 2326 'insert', 2327 (('ior', 2328 ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))), 2329 ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))))), 2330 'options->lower_bitfield_insert && (!options->has_bfm || (!options->has_bfi && !options->has_bitfield_select))'), 2331 2332 # Alternative lowering that uses bitfield_select. 2333 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'), 2334 ('bcsel', ('ult', 31, 'bits'), 'insert', 2335 ('bitfield_select', ('bfm', 'bits', 'offset'), ('ishl', 'insert', 'offset'), 'base')), 2336 'options->lower_bitfield_insert && options->has_bfm && options->has_bitfield_select'), 2337 2338 (('ibitfield_extract', 'value', 'offset', 'bits'), 2339 ('bcsel', ('ult', 31, 'bits'), 'value', 2340 ('ibfe', 'value', 'offset', 'bits')), 2341 'options->lower_bitfield_extract && options->has_bfe'), 2342 2343 (('ubitfield_extract', 'value', 'offset', 'bits'), 2344 ('bcsel', ('ult', 31, 'bits'), 'value', 2345 ('ubfe', 'value', 'offset', 'bits')), 2346 'options->lower_bitfield_extract && options->has_bfe'), 2347 2348 # (src0 & src1) | (~src0 & src2). Constant fold if src2 is 0. 2349 (('bitfield_select', a, b, 0), ('iand', a, b)), 2350 (('bitfield_select', a, ('iand', a, b), c), ('bitfield_select', a, b, c)), 2351 2352 # Note that these opcodes are defined to only use the five least significant bits of 'offset' and 'bits' 2353 (('ubfe', 'value', 'offset', ('iand', 31, 'bits')), ('ubfe', 'value', 'offset', 'bits')), 2354 (('ubfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ubfe', 'value', 'offset', 'bits')), 2355 (('ibfe', 'value', 'offset', ('iand', 31, 'bits')), ('ibfe', 'value', 'offset', 'bits')), 2356 (('ibfe', 'value', ('iand', 31, 'offset'), 'bits'), ('ibfe', 'value', 'offset', 'bits')), 2357 (('bfm', 'bits', ('iand', 31, 'offset')), ('bfm', 'bits', 'offset')), 2358 (('bfm', ('iand', 31, 'bits'), 'offset'), ('bfm', 'bits', 'offset')), 2359 2360 # Optimizations for ubitfield_extract(value, offset, umin(bits, 32-(offset&0x1f))) and such 2361 (('ult', a, ('umin', ('iand', a, b), c)), False), 2362 (('ult', 31, ('umin', '#bits(is_ult_32)', a)), False), 2363 (('ubfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2364 ('ubfe', 'value', 'offset', 'width')), 2365 (('ibfe', 'value', 'offset', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset'))))), 2366 ('ibfe', 'value', 'offset', 'width')), 2367 (('bfm', ('umin', 'width', ('iadd', 32, ('ineg', ('iand', 31, 'offset')))), 'offset'), 2368 ('bfm', 'width', 'offset')), 2369 2370 # open-coded BFM 2371 (('iadd@32', ('ishl', 1, a), -1), ('bfm', a, 0), 'options->has_bfm'), 2372 (('ishl', ('bfm', a, 0), b), ('bfm', a, b)), 2373 2374 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 2375 # 2376 # If bits is zero, the result will be zero. 2377 # 2378 # These patterns prevent other patterns from generating invalid results 2379 # when count is zero. 2380 (('ubfe', a, b, 0), 0), 2381 (('ibfe', a, b, 0), 0), 2382 2383 (('ubfe', a, 0, '#b'), ('iand', a, ('ushr', 0xffffffff, ('ineg', b)))), 2384 2385 (('b2i32', ('ine', ('ubfe', a, b, 1), 0)), ('ubfe', a, b, 1)), 2386 (('b2i32', ('ine', ('ibfe', a, b, 1), 0)), ('ubfe', a, b, 1)), # ubfe in the replacement is correct 2387 (('ine', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2388 (('ieq', ('ibfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2389 (('ine', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ine', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2390 (('ieq', ('ubfe(is_used_once)', a, '#b', '#c'), 0), ('ieq', ('iand', a, ('ishl', ('ushr', 0xffffffff, ('ineg', c)), b)), 0)), 2391 2392 (('ibitfield_extract', 'value', 'offset', 'bits'), 2393 ('bcsel', ('ieq', 0, 'bits'), 2394 0, 2395 ('ishr', 2396 ('ishl', 'value', ('isub', ('isub', 32, 'bits'), 'offset')), 2397 ('isub', 32, 'bits'))), 2398 'options->lower_bitfield_extract && !options->has_bfe'), 2399 2400 (('ubitfield_extract', 'value', 'offset', 'bits'), 2401 ('iand', 2402 ('ushr', 'value', 'offset'), 2403 ('bcsel', ('ieq', 'bits', 32), 2404 0xffffffff, 2405 ('isub', ('ishl', 1, 'bits'), 1))), 2406 'options->lower_bitfield_extract && !options->has_bfe'), 2407 2408 (('ifind_msb', 'value'), 2409 ('ufind_msb', ('bcsel', ('ilt', 'value', 0), ('inot', 'value'), 'value')), 2410 'options->lower_ifind_msb && !options->has_find_msb_rev && !options->has_uclz'), 2411 2412 (('ifind_msb', 'value'), 2413 ('bcsel', ('ige', ('ifind_msb_rev', 'value'), 0), 2414 ('isub', 31, ('ifind_msb_rev', 'value')), 2415 ('ifind_msb_rev', 'value')), 2416 'options->lower_ifind_msb && options->has_find_msb_rev'), 2417 2418 # uclz of an absolute value source almost always does the right thing. 2419 # There are a couple problem values: 2420 # 2421 # * 0x80000000. Since abs(0x80000000) == 0x80000000, uclz returns 0. 2422 # However, findMSB(int(0x80000000)) == 30. 2423 # 2424 # * 0xffffffff. Since abs(0xffffffff) == 1, uclz returns 31. Section 8.8 2425 # (Integer Functions) of the GLSL 4.50 spec says: 2426 # 2427 # For a value of zero or negative one, -1 will be returned. 2428 # 2429 # * Negative powers of two. uclz(abs(-(1<<x))) returns x, but 2430 # findMSB(-(1<<x)) should return x-1. 2431 # 2432 # For all negative number cases, including 0x80000000 and 0xffffffff, the 2433 # correct value is obtained from uclz if instead of negating the (already 2434 # negative) value the logical-not is used. A conditional logical-not can 2435 # be achieved by (x ^ (x >> 31)). 2436 (('ifind_msb', 'value'), 2437 ('isub', 31, ('uclz', ('ixor', 'value', ('ishr', 'value', 31)))), 2438 'options->lower_ifind_msb && options->has_uclz'), 2439 2440 (('ufind_msb', 'value@32'), 2441 ('bcsel', ('ige', ('ufind_msb_rev', 'value'), 0), 2442 ('isub', 31, ('ufind_msb_rev', 'value')), 2443 ('ufind_msb_rev', 'value')), 2444 'options->lower_ufind_msb && options->has_find_msb_rev'), 2445 2446 (('ufind_msb', 'value@32'), 2447 ('isub', 31, ('uclz', 'value')), 2448 'options->lower_ufind_msb && options->has_uclz'), 2449 2450 (('uclz', a), ('umin', 32, ('ufind_msb_rev', a)), '!options->has_uclz && options->has_find_msb_rev'), 2451 2452 (('find_lsb', 'value@64'), 2453 ('ufind_msb', ('iand', 'value', ('ineg', 'value'))), 2454 'options->lower_find_lsb'), 2455 2456 (('find_lsb', 'value'), 2457 ('ufind_msb', ('u2u32', ('iand', 'value', ('ineg', 'value')))), 2458 'options->lower_find_lsb'), 2459 2460 (('extract_i8', a, 'b@32'), 2461 ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24), 2462 'options->lower_extract_byte'), 2463 2464 (('extract_u8', a, 'b@32'), 2465 ('iand', ('ushr', a, ('imul', b, 8)), 0xff), 2466 'options->lower_extract_byte'), 2467 2468 (('extract_i16', a, 'b@32'), 2469 ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16), 2470 'options->lower_extract_word'), 2471 2472 (('extract_u16', a, 'b@32'), 2473 ('iand', ('ushr', a, ('imul', b, 16)), 0xffff), 2474 'options->lower_extract_word'), 2475 2476 (('pack_unorm_2x16', 'v'), 2477 ('pack_uvec2_to_uint', 2478 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))), 2479 'options->lower_pack_unorm_2x16'), 2480 2481 (('pack_unorm_4x8', 'v'), 2482 ('pack_uvec4_to_uint', 2483 ('f2u32', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 2484 'options->lower_pack_unorm_4x8 && !options->has_pack_32_4x8'), 2485 2486 (('pack_unorm_4x8', 'v'), 2487 ('pack_32_4x8', 2488 ('f2u8', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))), 2489 'options->lower_pack_unorm_4x8 && options->has_pack_32_4x8'), 2490 2491 (('pack_snorm_2x16', 'v'), 2492 ('pack_uvec2_to_uint', 2493 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))), 2494 'options->lower_pack_snorm_2x16'), 2495 2496 (('pack_snorm_4x8', 'v'), 2497 ('pack_uvec4_to_uint', 2498 ('f2i32', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 2499 'options->lower_pack_snorm_4x8 && !options->has_pack_32_4x8'), 2500 2501 (('pack_snorm_4x8', 'v'), 2502 ('pack_32_4x8', 2503 ('f2i8', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))), 2504 'options->lower_pack_snorm_4x8 && options->has_pack_32_4x8'), 2505 2506 (('unpack_unorm_2x16', 'v'), 2507 ('fdiv', ('u2f32', ('vec2', ('extract_u16', 'v', 0), 2508 ('extract_u16', 'v', 1))), 2509 65535.0), 2510 'options->lower_unpack_unorm_2x16'), 2511 2512 (('unpack_unorm_4x8', 'v'), 2513 ('fdiv', ('u2f32', ('vec4', ('extract_u8', 'v', 0), 2514 ('extract_u8', 'v', 1), 2515 ('extract_u8', 'v', 2), 2516 ('extract_u8', 'v', 3))), 2517 255.0), 2518 'options->lower_unpack_unorm_4x8'), 2519 2520 (('unpack_snorm_2x16', 'v'), 2521 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0), 2522 ('extract_i16', 'v', 1))), 2523 32767.0))), 2524 'options->lower_unpack_snorm_2x16'), 2525 2526 (('unpack_snorm_4x8', 'v'), 2527 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0), 2528 ('extract_i8', 'v', 1), 2529 ('extract_i8', 'v', 2), 2530 ('extract_i8', 'v', 3))), 2531 127.0))), 2532 'options->lower_unpack_snorm_4x8'), 2533 2534 (('pack_half_2x16_split', 'a@32', 'b@32'), 2535 ('ior', ('ishl', ('u2u32', ('f2f16', b)), 16), ('u2u32', ('f2f16', a))), 2536 'options->lower_pack_split'), 2537 2538 (('unpack_half_2x16_split_x', 'a@32'), 2539 ('f2f32', ('u2u16', a)), 2540 'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2541 2542 (('unpack_half_2x16_split_x', 'a@32'), 2543 ('f2f32', ('fmul', 1.0, ('u2u16', a))), 2544 'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2545 2546 (('unpack_half_2x16_split_y', 'a@32'), 2547 ('f2f32', ('u2u16', ('ushr', a, 16))), 2548 'options->lower_pack_split && !nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2549 2550 (('unpack_half_2x16_split_y', 'a@32'), 2551 ('f2f32', ('fmul', 1.0, ('u2u16', ('ushr', a, 16)))), 2552 'options->lower_pack_split && nir_is_denorm_flush_to_zero(info->float_controls_execution_mode, 16)'), 2553 2554 (('isign', a), ('imin', ('imax', a, -1), 1), 'options->lower_isign'), 2555 (('imin', ('imax', a, -1), 1), ('isign', a), '!options->lower_isign'), 2556 (('imax', ('imin', a, 1), -1), ('isign', a), '!options->lower_isign'), 2557 # float(0 < NaN) - float(NaN < 0) = float(False) - float(False) = 0 - 0 = 0 2558 # Mark the new comparisons precise to prevent them being changed to 'a != 2559 # 0' or 'a == 0'. 2560 (('fsign', a), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_fsign'), 2561 (('fsign', 'a@64'), ('fsub', ('b2f', ('!flt', 0.0, a)), ('b2f', ('!flt', a, 0.0))), 'options->lower_doubles_options & nir_lower_dsign'), 2562 2563 # Address/offset calculations: 2564 # Drivers supporting imul24 should use a pass like nir_lower_amul(), this 2565 # rule converts everyone else to imul: 2566 (('amul', a, b), ('imul', a, b), '!options->has_imul24 && !options->has_amul'), 2567 2568 # udiv_aligned_4 assumes the source is a multiple of 4 specifically to enable 2569 # this identity. Usually this transform would require masking. 2570 (('amul', ('udiv_aligned_4', a), 4), a), 2571 (('imul', ('udiv_aligned_4', a), 4), a), 2572 2573 (('umul24', a, b), 2574 ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), 2575 '!options->has_umul24'), 2576 (('umad24', a, b, c), 2577 ('iadd', ('imul', ('iand', a, 0xffffff), ('iand', b, 0xffffff)), c), 2578 '!options->has_umad24'), 2579 2580 # Relaxed 24bit ops 2581 (('imul24_relaxed', a, b), ('imul24', a, b), 'options->has_imul24'), 2582 (('imul24_relaxed', a, b), ('imul', a, b), '!options->has_imul24'), 2583 (('umad24_relaxed', a, b, c), ('umad24', a, b, c), 'options->has_umad24'), 2584 (('umad24_relaxed', a, b, c), ('iadd', ('umul24_relaxed', a, b), c), '!options->has_umad24'), 2585 (('umul24_relaxed', a, b), ('umul24', a, b), 'options->has_umul24'), 2586 (('umul24_relaxed', a, b), ('imul', a, b), '!options->has_umul24'), 2587 2588 (('imad24_ir3', a, b, 0), ('imul24', a, b)), 2589 (('imad24_ir3', a, 0, c), (c)), 2590 (('imad24_ir3', a, 1, c), ('iadd', a, c)), 2591 2592 # if first two srcs are const, crack apart the imad so constant folding 2593 # can clean up the imul: 2594 # TODO ffma should probably get a similar rule: 2595 (('imad24_ir3', '#a', '#b', c), ('iadd', ('imul', a, b), c)), 2596 2597 # These will turn 24b address/offset calc back into 32b shifts, but 2598 # it should be safe to get back some of the bits of precision that we 2599 # already decided were no necessary: 2600 (('imul24', a, '#b@32(is_pos_power_of_two)'), ('ishl', a, ('find_lsb', b)), '!options->lower_bitops'), 2601 (('imul24', a, '#b@32(is_neg_power_of_two)'), ('ineg', ('ishl', a, ('find_lsb', ('iabs', b)))), '!options->lower_bitops'), 2602 (('imul24', a, 0), (0)), 2603]) 2604 2605for bit_size in [8, 16, 32, 64]: 2606 cond = '!options->lower_uadd_sat' 2607 if bit_size == 64: 2608 cond += ' && !(options->lower_int64_options & (nir_lower_iadd64 | nir_lower_uadd_sat64))' 2609 add = 'iadd@' + str(bit_size) 2610 2611 optimizations += [ 2612 (('bcsel', ('ult', ('iadd', a, b), a), -1, (add, a, b)), ('uadd_sat', a, b), cond), 2613 (('bcsel', ('uge', ('iadd', a, b), a), (add, a, b), -1), ('uadd_sat', a, b), cond), 2614 (('bcsel', ('ieq', ('uadd_carry', a, b), 0), (add, a, b), -1), ('uadd_sat', a, b), cond), 2615 (('bcsel', ('ine', ('uadd_carry', a, b), 0), -1, (add, a, b)), ('uadd_sat', a, b), cond), 2616 ] 2617 2618for bit_size in [8, 16, 32, 64]: 2619 cond = '!options->lower_usub_sat' 2620 if bit_size == 64: 2621 cond += ' && !(options->lower_int64_options & nir_lower_usub_sat64)' 2622 add = 'iadd@' + str(bit_size) 2623 2624 optimizations += [ 2625 (('bcsel', ('ult', a, b), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond), 2626 (('bcsel', ('uge', a, b), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond), 2627 (('bcsel', ('ieq', ('usub_borrow', a, b), 0), (add, a, ('ineg', b)), 0), ('usub_sat', a, b), cond), 2628 (('bcsel', ('ine', ('usub_borrow', a, b), 0), 0, (add, a, ('ineg', b))), ('usub_sat', a, b), cond), 2629 ] 2630 2631# bit_size dependent lowerings 2632for bit_size in [8, 16, 32, 64]: 2633 # convenience constants 2634 intmax = (1 << (bit_size - 1)) - 1 2635 intmin = 1 << (bit_size - 1) 2636 2637 optimizations += [ 2638 (('iadd_sat@' + str(bit_size), a, b), 2639 ('bcsel', ('ige', b, 1), ('bcsel', ('ilt', ('iadd', a, b), a), intmax, ('iadd', a, b)), 2640 ('bcsel', ('ilt', a, ('iadd', a, b)), intmin, ('iadd', a, b))), 'options->lower_iadd_sat'), 2641 (('isub_sat@' + str(bit_size), a, b), 2642 ('bcsel', ('ilt', b, 0), ('bcsel', ('ilt', ('isub', a, b), a), intmax, ('isub', a, b)), 2643 ('bcsel', ('ilt', a, ('isub', a, b)), intmin, ('isub', a, b))), 'options->lower_iadd_sat'), 2644 ] 2645 2646invert = OrderedDict([('feq', 'fneu'), ('fneu', 'feq')]) 2647 2648for left, right in itertools.combinations_with_replacement(invert.keys(), 2): 2649 optimizations.append((('inot', ('ior(is_used_once)', (left + '(is_used_once)', a, b), 2650 (right + '(is_used_once)', c, d))), 2651 ('iand', (invert[left], a, b), (invert[right], c, d)))) 2652 optimizations.append((('inot', ('iand(is_used_once)', (left + '(is_used_once)', a, b), 2653 (right + '(is_used_once)', c, d))), 2654 ('ior', (invert[left], a, b), (invert[right], c, d)))) 2655 2656# Optimize x2yN(b2x(x)) -> b2y 2657for x, y in itertools.product(['f', 'u', 'i'], ['f', 'u', 'i']): 2658 if x != 'f' and y != 'f' and x != y: 2659 continue 2660 2661 b2x = 'b2f' if x == 'f' else 'b2i' 2662 b2y = 'b2f' if y == 'f' else 'b2i' 2663 x2yN = '{}2{}'.format(x, y) 2664 optimizations.append(((x2yN, (b2x, a)), (b2y, a))) 2665 2666# Optimize away x2xN(a@N) 2667for t in ['int', 'uint', 'float', 'bool']: 2668 for N in type_sizes(t): 2669 x2xN = '{0}2{0}{1}'.format(t[0], N) 2670 aN = 'a@{0}'.format(N) 2671 optimizations.append(((x2xN, aN), a)) 2672 2673# Optimize x2xN(y2yM(a@P)) -> y2yN(a) for integers 2674# In particular, we can optimize away everything except upcast of downcast and 2675# upcasts where the type differs from the other cast 2676for N, M in itertools.product(type_sizes('uint'), type_sizes('uint')): 2677 if N < M: 2678 # The outer cast is a down-cast. It doesn't matter what the size of the 2679 # argument of the inner cast is because we'll never been in the upcast 2680 # of downcast case. Regardless of types, we'll always end up with y2yN 2681 # in the end. 2682 for x, y in itertools.product(['i', 'u'], ['i', 'u']): 2683 x2xN = '{0}2{0}{1}'.format(x, N) 2684 y2yM = '{0}2{0}{1}'.format(y, M) 2685 y2yN = '{0}2{0}{1}'.format(y, N) 2686 optimizations.append(((x2xN, (y2yM, a)), (y2yN, a))) 2687 elif N > M: 2688 # If the outer cast is an up-cast, we have to be more careful about the 2689 # size of the argument of the inner cast and with types. In this case, 2690 # the type is always the type of type up-cast which is given by the 2691 # outer cast. 2692 for P in type_sizes('uint'): 2693 # We can't optimize away up-cast of down-cast. 2694 if M < P: 2695 continue 2696 2697 # Because we're doing down-cast of down-cast, the types always have 2698 # to match between the two casts 2699 for x in ['i', 'u']: 2700 x2xN = '{0}2{0}{1}'.format(x, N) 2701 x2xM = '{0}2{0}{1}'.format(x, M) 2702 aP = 'a@{0}'.format(P) 2703 optimizations.append(((x2xN, (x2xM, aP)), (x2xN, a))) 2704 else: 2705 # The N == M case is handled by other optimizations 2706 pass 2707 2708# Downcast operations should be able to see through pack 2709for t in ['i', 'u']: 2710 for N in [8, 16, 32]: 2711 x2xN = '{0}2{0}{1}'.format(t, N) 2712 optimizations += [ 2713 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2714 ((x2xN, ('pack_64_2x32_split', a, b)), (x2xN, a)), 2715 ] 2716 2717# Optimize comparisons with up-casts 2718for t in ['int', 'uint', 'float']: 2719 for N, M in itertools.product(type_sizes(t), repeat=2): 2720 if N == 1 or N >= M: 2721 continue 2722 2723 cond = 'true' 2724 if N == 8: 2725 cond = 'options->support_8bit_alu' 2726 elif N == 16: 2727 cond = 'options->support_16bit_alu' 2728 x2xM = '{0}2{0}{1}'.format(t[0], M) 2729 x2xN = '{0}2{0}{1}'.format(t[0], N) 2730 aN = 'a@' + str(N) 2731 bN = 'b@' + str(N) 2732 xeq = 'feq' if t == 'float' else 'ieq' 2733 xne = 'fneu' if t == 'float' else 'ine' 2734 xge = '{0}ge'.format(t[0]) 2735 xlt = '{0}lt'.format(t[0]) 2736 2737 # Up-casts are lossless so for correctly signed comparisons of 2738 # up-casted values we can do the comparison at the largest of the two 2739 # original sizes and drop one or both of the casts. (We have 2740 # optimizations to drop the no-op casts which this may generate.) 2741 for P in type_sizes(t): 2742 if P == 1 or P > N: 2743 continue 2744 2745 bP = 'b@' + str(P) 2746 optimizations += [ 2747 ((xeq, (x2xM, aN), (x2xM, bP)), (xeq, a, (x2xN, b)), cond), 2748 ((xne, (x2xM, aN), (x2xM, bP)), (xne, a, (x2xN, b)), cond), 2749 ((xge, (x2xM, aN), (x2xM, bP)), (xge, a, (x2xN, b)), cond), 2750 ((xlt, (x2xM, aN), (x2xM, bP)), (xlt, a, (x2xN, b)), cond), 2751 ((xge, (x2xM, bP), (x2xM, aN)), (xge, (x2xN, b), a), cond), 2752 ((xlt, (x2xM, bP), (x2xM, aN)), (xlt, (x2xN, b), a), cond), 2753 ] 2754 2755 # The next bit doesn't work on floats because the range checks would 2756 # get way too complicated. 2757 if t in ['int', 'uint']: 2758 if t == 'int': 2759 xN_min = -(1 << (N - 1)) 2760 xN_max = (1 << (N - 1)) - 1 2761 elif t == 'uint': 2762 xN_min = 0 2763 xN_max = (1 << N) - 1 2764 else: 2765 assert False 2766 2767 # If we're up-casting and comparing to a constant, we can unfold 2768 # the comparison into a comparison with the shrunk down constant 2769 # and a check that the constant fits in the smaller bit size. 2770 optimizations += [ 2771 ((xeq, (x2xM, aN), '#b'), 2772 ('iand', (xeq, a, (x2xN, b)), (xeq, (x2xM, (x2xN, b)), b)), cond), 2773 ((xne, (x2xM, aN), '#b'), 2774 ('ior', (xne, a, (x2xN, b)), (xne, (x2xM, (x2xN, b)), b)), cond), 2775 ((xlt, (x2xM, aN), '#b'), 2776 ('iand', (xlt, xN_min, b), 2777 ('ior', (xlt, xN_max, b), (xlt, a, (x2xN, b)))), cond), 2778 ((xlt, '#a', (x2xM, bN)), 2779 ('iand', (xlt, a, xN_max), 2780 ('ior', (xlt, a, xN_min), (xlt, (x2xN, a), b))), cond), 2781 ((xge, (x2xM, aN), '#b'), 2782 ('iand', (xge, xN_max, b), 2783 ('ior', (xge, xN_min, b), (xge, a, (x2xN, b)))), cond), 2784 ((xge, '#a', (x2xM, bN)), 2785 ('iand', (xge, a, xN_min), 2786 ('ior', (xge, a, xN_max), (xge, (x2xN, a), b))), cond), 2787 ] 2788 2789# Convert masking followed by signed downcast to just unsigned downcast 2790optimizations += [ 2791 (('i2i32', ('iand', 'a@64', 0xffffffff)), ('u2u32', a)), 2792 (('i2i16', ('iand', 'a@32', 0xffff)), ('u2u16', a)), 2793 (('i2i16', ('iand', 'a@64', 0xffff)), ('u2u16', a)), 2794 (('i2i8', ('iand', 'a@16', 0xff)), ('u2u8', a)), 2795 (('i2i8', ('iand', 'a@32', 0xff)), ('u2u8', a)), 2796 (('i2i8', ('iand', 'a@64', 0xff)), ('u2u8', a)), 2797] 2798 2799# Some operations such as iadd have the property that the bottom N bits of the 2800# output only depends on the bottom N bits of each of the inputs so we can 2801# remove casts 2802for N in [16, 32]: 2803 for M in [8, 16]: 2804 if M >= N: 2805 continue 2806 2807 aN = 'a@' + str(N) 2808 u2uM = 'u2u{0}'.format(M) 2809 i2iM = 'i2i{0}'.format(M) 2810 2811 for x in ['u', 'i']: 2812 x2xN = '{0}2{0}{1}'.format(x, N) 2813 extract_xM = 'extract_{0}{1}'.format(x, M) 2814 2815 x2xN_M_bits = '{0}(only_lower_{1}_bits_used)'.format(x2xN, M) 2816 extract_xM_M_bits = \ 2817 '{0}(only_lower_{1}_bits_used)'.format(extract_xM, M) 2818 optimizations += [ 2819 ((x2xN_M_bits, (u2uM, aN)), a), 2820 ((extract_xM_M_bits, aN, 0), a), 2821 ] 2822 2823 bcsel_M_bits = 'bcsel(only_lower_{0}_bits_used)'.format(M) 2824 optimizations += [ 2825 ((bcsel_M_bits, c, (x2xN, (u2uM, aN)), b), ('bcsel', c, a, b)), 2826 ((bcsel_M_bits, c, (x2xN, (i2iM, aN)), b), ('bcsel', c, a, b)), 2827 ((bcsel_M_bits, c, (extract_xM, aN, 0), b), ('bcsel', c, a, b)), 2828 ] 2829 2830 for op in ['iadd', 'imul', 'iand', 'ior', 'ixor']: 2831 op_M_bits = '{0}(only_lower_{1}_bits_used)'.format(op, M) 2832 optimizations += [ 2833 ((op_M_bits, (x2xN, (u2uM, aN)), b), (op, a, b)), 2834 ((op_M_bits, (x2xN, (i2iM, aN)), b), (op, a, b)), 2835 ((op_M_bits, (extract_xM, aN, 0), b), (op, a, b)), 2836 ] 2837 2838def fexp2i(exp, bits): 2839 # Generate an expression which constructs value 2.0^exp or 0.0. 2840 # 2841 # We assume that exp is already in a valid range: 2842 # 2843 # * [-15, 15] for 16-bit float 2844 # * [-127, 127] for 32-bit float 2845 # * [-1023, 1023] for 16-bit float 2846 # 2847 # If exp is the lowest value in the valid range, a value of 0.0 is 2848 # constructed. Otherwise, the value 2.0^exp is constructed. 2849 if bits == 16: 2850 return ('i2i16', ('ishl', ('iadd', exp, 15), 10)) 2851 elif bits == 32: 2852 return ('ishl', ('iadd', exp, 127), 23) 2853 elif bits == 64: 2854 return ('pack_64_2x32_split', 0, ('ishl', ('iadd', exp, 1023), 20)) 2855 else: 2856 assert False 2857 2858def ldexp(f, exp, bits): 2859 # The maximum possible range for a normal exponent is [-126, 127] and, 2860 # throwing in denormals, you get a maximum range of [-149, 127]. This 2861 # means that we can potentially have a swing of +-276. If you start with 2862 # FLT_MAX, you actually have to do ldexp(FLT_MAX, -278) to get it to flush 2863 # all the way to zero. The GLSL spec only requires that we handle a subset 2864 # of this range. From version 4.60 of the spec: 2865 # 2866 # "If exp is greater than +128 (single-precision) or +1024 2867 # (double-precision), the value returned is undefined. If exp is less 2868 # than -126 (single-precision) or -1022 (double-precision), the value 2869 # returned may be flushed to zero. Additionally, splitting the value 2870 # into a significand and exponent using frexp() and then reconstructing 2871 # a floating-point value using ldexp() should yield the original input 2872 # for zero and all finite non-denormalized values." 2873 # 2874 # The SPIR-V spec has similar language. 2875 # 2876 # In order to handle the maximum value +128 using the fexp2i() helper 2877 # above, we have to split the exponent in half and do two multiply 2878 # operations. 2879 # 2880 # First, we clamp exp to a reasonable range. Specifically, we clamp to 2881 # twice the full range that is valid for the fexp2i() function above. If 2882 # exp/2 is the bottom value of that range, the fexp2i() expression will 2883 # yield 0.0f which, when multiplied by f, will flush it to zero which is 2884 # allowed by the GLSL and SPIR-V specs for low exponent values. If the 2885 # value is clamped from above, then it must have been above the supported 2886 # range of the GLSL built-in and therefore any return value is acceptable. 2887 if bits == 16: 2888 exp = ('imin', ('imax', exp, -30), 30) 2889 elif bits == 32: 2890 exp = ('imin', ('imax', exp, -254), 254) 2891 elif bits == 64: 2892 exp = ('imin', ('imax', exp, -2046), 2046) 2893 else: 2894 assert False 2895 2896 # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2. 2897 # (We use ishr which isn't the same for -1, but the -1 case still works 2898 # since we use exp-exp/2 as the second exponent.) While the spec 2899 # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't 2900 # work with denormals and doesn't allow for the full swing in exponents 2901 # that you can get with normalized values. Instead, we create two powers 2902 # of two and multiply by them each in turn. That way the effective range 2903 # of our exponent is doubled. 2904 pow2_1 = fexp2i(('ishr', exp, 1), bits) 2905 pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)), bits) 2906 return ('fmul', ('fmul', f, pow2_1), pow2_2) 2907 2908optimizations += [ 2909 (('ldexp@16', 'x', 'exp'), ldexp('x', 'exp', 16), 'options->lower_ldexp'), 2910 (('ldexp@32', 'x', 'exp'), ldexp('x', 'exp', 32), 'options->lower_ldexp'), 2911 (('ldexp@64', 'x', 'exp'), ldexp('x', 'exp', 64), 'options->lower_ldexp'), 2912] 2913 2914# XCOM 2 (OpenGL) open-codes bitfieldReverse() 2915def bitfield_reverse_xcom2(u): 2916 step1 = ('iadd', ('ishl', u, 16), ('ushr', u, 16)) 2917 step2 = ('iadd', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2918 step3 = ('iadd', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2919 step4 = ('iadd', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2920 step5 = ('iadd(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2921 2922 return step5 2923 2924# Unreal Engine 4 demo applications open-codes bitfieldReverse() 2925def bitfield_reverse_ue4(u): 2926 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2927 step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8)) 2928 step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4)) 2929 step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2)) 2930 step5 = ('ior(many-comm-expr)', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1)) 2931 2932 return step5 2933 2934# Cyberpunk 2077 open-codes bitfieldReverse() 2935def bitfield_reverse_cp2077(u): 2936 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16)) 2937 step2 = ('ior', ('iand', ('ishl', step1, 1), 0xaaaaaaaa), ('iand', ('ushr', step1, 1), 0x55555555)) 2938 step3 = ('ior', ('iand', ('ishl', step2, 2), 0xcccccccc), ('iand', ('ushr', step2, 2), 0x33333333)) 2939 step4 = ('ior', ('iand', ('ishl', step3, 4), 0xf0f0f0f0), ('iand', ('ushr', step3, 4), 0x0f0f0f0f)) 2940 step5 = ('ior(many-comm-expr)', ('iand', ('ishl', step4, 8), 0xff00ff00), ('iand', ('ushr', step4, 8), 0x00ff00ff)) 2941 2942 return step5 2943 2944optimizations += [(bitfield_reverse_xcom2('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2945optimizations += [(bitfield_reverse_ue4('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2946optimizations += [(bitfield_reverse_cp2077('x@32'), ('bitfield_reverse', 'x'), '!options->lower_bitfield_reverse')] 2947 2948# VKD3D-Proton DXBC f32 to f16 conversion implements a float conversion using PackHalf2x16. 2949# Because the spec does not specify a rounding mode or behaviour regarding infinity, 2950# it emits a sequence to ensure D3D-like behaviour for infinity. 2951# When we know the current backend already behaves like we need, we can eliminate the extra sequence. 2952# 2953# Input is f32, output is u32 that has the f16 packed into its low bits. 2954def vkd3d_proton_packed_f2f16_rtz_lo(a, abs_a): 2955 packed_half = ('pack_half_2x16_rtz_split', a, 0) 2956 packed_half_minus1 = ('iadd', packed_half, 0xffffffff) 2957 f32_was_not_inf = ('ine', abs_a, 0x7f800000) 2958 f16_is_now_inf = ('ieq', ('iand', packed_half, 0x7fff), 0x7c00) 2959 return ('bcsel', ('iand', f32_was_not_inf, f16_is_now_inf), packed_half_minus1, packed_half) 2960 2961optimizations += [ 2962 (vkd3d_proton_packed_f2f16_rtz_lo('x', ('fabs', 'x')), ('pack_half_2x16_rtz_split', 'x', 0)), 2963 (vkd3d_proton_packed_f2f16_rtz_lo('x(is_not_negative)', 'x'), ('pack_half_2x16_rtz_split', 'x', 0)), 2964 (vkd3d_proton_packed_f2f16_rtz_lo(('fneg', 'x'), ('fabs', 'x')), ('pack_half_2x16_rtz_split', ('fneg', 'x'), 0)), 2965] 2966 2967def vkd3d_proton_msad(): 2968 pattern = None 2969 for i in range(4): 2970 ref = ('extract_u8', 'a@32', i) 2971 src = ('extract_u8', 'b@32', i) 2972 sad = ('iabs', ('iadd', ref, ('ineg', src))) 2973 msad = ('bcsel', ('ieq', ref, 0), 0, sad) 2974 if pattern == None: 2975 pattern = msad 2976 else: 2977 pattern = ('iadd', pattern, msad) 2978 pattern = (pattern[0] + '(many-comm-expr)', *pattern[1:]) 2979 return pattern 2980 2981optimizations += [ 2982 (vkd3d_proton_msad(), ('msad_4x8', a, b, 0), 'options->has_msad'), 2983 (('iadd', ('msad_4x8', a, b, 0), c), ('msad_4x8', a, b, c)), 2984] 2985 2986 2987# "all_equal(eq(a, b), vec(~0))" is the same as "all_equal(a, b)" 2988# "any_nequal(neq(a, b), vec(0))" is the same as "any_nequal(a, b)" 2989for ncomp in [2, 3, 4, 8, 16]: 2990 optimizations += [ 2991 (('ball_iequal' + str(ncomp), ('ieq', a, b), ~0), ('ball_iequal' + str(ncomp), a, b)), 2992 (('ball_iequal' + str(ncomp), ('feq', a, b), ~0), ('ball_fequal' + str(ncomp), a, b)), 2993 (('bany_inequal' + str(ncomp), ('ine', a, b), 0), ('bany_inequal' + str(ncomp), a, b)), 2994 (('bany_inequal' + str(ncomp), ('fneu', a, b), 0), ('bany_fnequal' + str(ncomp), a, b)), 2995 ] 2996 2997# For any float comparison operation, "cmp", if you have "a == a && a cmp b" 2998# then the "a == a" is redundant because it's equivalent to "a is not NaN" 2999# and, if a is a NaN then the second comparison will fail anyway. 3000for op in ['flt', 'fge', 'feq']: 3001 optimizations += [ 3002 (('iand', ('feq', a, a), (op, a, b)), ('!' + op, a, b)), 3003 (('iand', ('feq', a, a), (op, b, a)), ('!' + op, b, a)), 3004 ] 3005 3006# Add optimizations to handle the case where the result of a ternary is 3007# compared to a constant. This way we can take things like 3008# 3009# (a ? 0 : 1) > 0 3010# 3011# and turn it into 3012# 3013# a ? (0 > 0) : (1 > 0) 3014# 3015# which constant folding will eat for lunch. The resulting ternary will 3016# further get cleaned up by the boolean reductions above and we will be 3017# left with just the original variable "a". 3018for op in ['feq', 'fneu', 'ieq', 'ine']: 3019 optimizations += [ 3020 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 3021 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 3022 ] 3023 3024for op in ['flt', 'fge', 'ilt', 'ige', 'ult', 'uge']: 3025 optimizations += [ 3026 ((op, ('bcsel', 'a', '#b', '#c'), '#d'), 3027 ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))), 3028 ((op, '#d', ('bcsel', a, '#b', '#c')), 3029 ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))), 3030 ] 3031 3032 3033# For example, this converts things like 3034# 3035# 1 + mix(0, a - 1, condition) 3036# 3037# into 3038# 3039# mix(1, (a-1)+1, condition) 3040# 3041# Other optimizations will rearrange the constants. 3042for op in ['fadd', 'fmul', 'fmulz', 'iadd', 'imul']: 3043 optimizations += [ 3044 ((op, ('bcsel(is_used_once)', a, '#b', c), '#d'), ('bcsel', a, (op, b, d), (op, c, d))) 3045 ] 3046 3047# Some optimizations for ir3-specific instructions. 3048optimizations += [ 3049 # 'al * bl': If either 'al' or 'bl' is zero, return zero. 3050 (('umul_low', '#a(is_lower_half_zero)', 'b'), (0)), 3051 # '(al * bh) << 16 + c': If either 'al' or 'bh' is zero, return 'c'. 3052 (('imadsh_mix16', '#a@32(is_lower_half_zero)', 'b@32', 'c@32'), ('c')), 3053 (('imadsh_mix16', 'a@32', '#b@32(is_upper_half_zero)', 'c@32'), ('c')), 3054] 3055 3056# These kinds of sequences can occur after nir_opt_peephole_select. 3057# 3058# NOTE: fadd is not handled here because that gets in the way of ffma 3059# generation in the i965 driver. Instead, fadd and ffma are handled in 3060# late_optimizations. 3061 3062for op in ['flrp']: 3063 optimizations += [ 3064 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3065 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3066 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3067 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3068 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, e, c, d)), (op, ('bcsel', a, b, e), c, d)), 3069 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', e, c, d)), (op, ('bcsel', a, b, e), c, d)), 3070 ] 3071 3072for op in ['fmulz', 'fmul', 'iadd', 'imul', 'iand', 'ior', 'ixor', 'fmin', 'fmax', 'imin', 'imax', 'umin', 'umax']: 3073 optimizations += [ 3074 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 3075 (('bcsel', a, (op + '(is_used_once)', b, 'c(is_not_const)'), (op, b, d)), (op, b, ('bcsel', a, c, d))), 3076 (('bcsel', a, (op, b, 'c(is_not_const)'), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 3077 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, 'd(is_not_const)')), (op, b, ('bcsel', a, c, d))), 3078 ] 3079 3080for op in ['fpow']: 3081 optimizations += [ 3082 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 3083 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 3084 (('bcsel', a, (op + '(is_used_once)', b, c), (op, d, c)), (op, ('bcsel', a, b, d), c)), 3085 (('bcsel', a, (op, b, c), (op + '(is_used_once)', d, c)), (op, ('bcsel', a, b, d), c)), 3086 ] 3087 3088for op in ['frcp', 'frsq', 'fsqrt', 'fexp2', 'flog2', 'fsign', 'fsin', 'fcos', 'fsin_amd', 'fcos_amd', 'fsin_mdg', 'fcos_mdg', 'fsin_agx', 'fneg', 'fabs', 'fsign']: 3089 optimizations += [ 3090 (('bcsel', c, (op + '(is_used_once)', a), (op + '(is_used_once)', b)), (op, ('bcsel', c, a, b))), 3091 ] 3092 3093for op in ['ineg', 'iabs', 'inot', 'isign']: 3094 optimizations += [ 3095 ((op, ('bcsel', c, '#a', '#b')), ('bcsel', c, (op, a), (op, b))), 3096 ] 3097 3098optimizations.extend([ 3099 (('fisnormal', 'a@16'), ('ult', 0xfff, ('iadd', ('ishl', a, 1), 0x800)), 'options->lower_fisnormal'), 3100 (('fisnormal', 'a@32'), ('ult', 0x1ffffff, ('iadd', ('ishl', a, 1), 0x1000000)), 'options->lower_fisnormal'), 3101 (('fisnormal', 'a@64'), ('ult', 0x3fffffffffffff, ('iadd', ('ishl', a, 1), 0x20000000000000)), 'options->lower_fisnormal') 3102 ]) 3103 3104 3105""" 3106 if (fabs(val) < SMALLEST_NORMALIZED_FLOAT16) 3107 return (val & SIGN_BIT) /* +0.0 or -0.0 as appropriate */; 3108 else 3109 return f2f32(f2f16(val)); 3110""" 3111optimizations.extend([ 3112 (('fquantize2f16', 'a@32'), 3113 ('bcsel', ('!flt', ('!fabs', a), math.ldexp(1.0, -14)), 3114 ('iand', a, 1 << 31), 3115 ('!f2f32', ('!f2f16_rtne', a))), 3116 'options->lower_fquantize2f16') 3117 ]) 3118 3119for s in range(0, 31): 3120 mask = 0xffffffff << s 3121 3122 # bfi is ((mask & ...) | (~mask & ...)). Since the two sources of the ior 3123 # will never both have the same bits set, replacing the ior with an iadd 3124 # is safe (i.e., a carry out of a bit can never be generated). The iadd is 3125 # more likely to participate in other optimization patterns (e.g., iadd of 3126 # constant reassociation) 3127 optimizations.extend([ 3128 (('bfi', mask, a, '#b'), ('iadd', ('ishl', a, s), ('iand', b, ~mask)), 3129 'options->avoid_ternary_with_two_constants'), 3130 ]) 3131 3132# NaN propagation: Binary opcodes. If any operand is NaN, replace it with NaN. 3133# (unary opcodes with NaN are evaluated by nir_opt_constant_folding, not here) 3134for op in ['fadd', 'fdiv', 'fmod', 'fmul', 'fpow', 'frem', 'fsub']: 3135 optimizations += [((op, '#a(is_nan)', b), NAN)] 3136 optimizations += [((op, a, '#b(is_nan)'), NAN)] # some opcodes are not commutative 3137 3138# NaN propagation: Trinary opcodes. If any operand is NaN, replace it with NaN. 3139for op in ['ffma', 'flrp']: 3140 optimizations += [((op, '#a(is_nan)', b, c), NAN)] 3141 optimizations += [((op, a, '#b(is_nan)', c), NAN)] # some opcodes are not commutative 3142 optimizations += [((op, a, b, '#c(is_nan)'), NAN)] 3143 3144# NaN propagation: FP min/max. Pick the non-NaN operand. 3145for op in ['fmin', 'fmax']: 3146 optimizations += [((op, '#a(is_nan)', b), b)] # commutative 3147 3148# NaN propagation: ldexp is NaN if the first operand is NaN. 3149optimizations += [(('ldexp', '#a(is_nan)', b), NAN)] 3150 3151# NaN propagation: Dot opcodes. If any component is NaN, replace it with NaN. 3152for op in ['fdot2', 'fdot3', 'fdot4', 'fdot5', 'fdot8', 'fdot16']: 3153 optimizations += [((op, '#a(is_any_comp_nan)', b), NAN)] # commutative 3154 3155# NaN propagation: FP comparison opcodes except !=. Replace it with false. 3156for op in ['feq', 'fge', 'flt']: 3157 optimizations += [((op, '#a(is_nan)', b), False)] 3158 optimizations += [((op, a, '#b(is_nan)'), False)] # some opcodes are not commutative 3159 3160# NaN propagation: FP comparison opcodes using !=. Replace it with true. 3161# Operator != is the only opcode where a comparison with NaN returns true. 3162for op in ['fneu']: 3163 optimizations += [((op, '#a(is_nan)', b), True)] # commutative 3164 3165# NaN propagation: FP comparison opcodes except != returning FP 0 or 1. 3166for op in ['seq', 'sge', 'slt']: 3167 optimizations += [((op, '#a(is_nan)', b), 0.0)] 3168 optimizations += [((op, a, '#b(is_nan)'), 0.0)] # some opcodes are not commutative 3169 3170# NaN propagation: FP comparison opcodes using != returning FP 0 or 1. 3171# Operator != is the only opcode where a comparison with NaN returns true. 3172optimizations += [(('sne', '#a(is_nan)', b), 1.0)] # commutative 3173 3174# This section contains optimizations to propagate downsizing conversions of 3175# constructed vectors into vectors of downsized components. Whether this is 3176# useful depends on the SIMD semantics of the backend. On a true SIMD machine, 3177# this reduces the register pressure of the vector itself and often enables the 3178# conversions to be eliminated via other algebraic rules or constant folding. 3179# In the worst case on a SIMD architecture, the propagated conversions may be 3180# revectorized via nir_opt_vectorize so instruction count is minimally 3181# impacted. 3182# 3183# On a machine with SIMD-within-a-register only, this actually 3184# counterintuitively hurts instruction count. These machines are the same that 3185# require vectorize_vec2_16bit, so we predicate the optimizations on that flag 3186# not being set. 3187# 3188# Finally for scalar architectures, there should be no difference in generated 3189# code since it all ends up scalarized at the end, but it might minimally help 3190# compile-times. 3191 3192for i in range(2, 4 + 1): 3193 for T in ('f', 'u', 'i'): 3194 vec_inst = ('vec' + str(i),) 3195 3196 indices = ['a', 'b', 'c', 'd'] 3197 suffix_in = tuple((indices[j] + '@32') for j in range(i)) 3198 3199 to_16 = '{}2{}16'.format(T, T) 3200 to_mp = '{}2{}mp'.format(T, T) 3201 3202 out_16 = tuple((to_16, indices[j]) for j in range(i)) 3203 out_mp = tuple((to_mp, indices[j]) for j in range(i)) 3204 3205 optimizations += [ 3206 ((to_16, vec_inst + suffix_in), vec_inst + out_16, '!options->vectorize_vec2_16bit'), 3207 ] 3208 # u2ump doesn't exist, because it's equal to i2imp 3209 if T in ['f', 'i']: 3210 optimizations += [ 3211 ((to_mp, vec_inst + suffix_in), vec_inst + out_mp, '!options->vectorize_vec2_16bit') 3212 ] 3213 3214# This section contains "late" optimizations that should be run before 3215# creating ffmas and calling regular optimizations for the final time. 3216# Optimizations should go here if they help code generation and conflict 3217# with the regular optimizations. 3218before_ffma_optimizations = [ 3219 # Propagate constants down multiplication chains 3220 (('~fmul(is_used_once)', ('fmul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fmul', ('fmul', a, c), b)), 3221 (('imul(is_used_once)', ('imul(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('imul', ('imul', a, c), b)), 3222 (('~fadd(is_used_once)', ('fadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('fadd', ('fadd', a, c), b)), 3223 (('iadd(is_used_once)', ('iadd(is_used_once)', 'a(is_not_const)', '#b'), 'c(is_not_const)'), ('iadd', ('iadd', a, c), b)), 3224 3225 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), 3226 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), 3227 (('~fadd', ('fneg', a), a), 0.0), 3228 (('iadd', ('ineg', a), a), 0), 3229 (('iadd', ('ineg', a), ('iadd', a, b)), b), 3230 (('iadd', a, ('iadd', ('ineg', a), b)), b), 3231 (('~fadd', ('fneg', a), ('fadd', a, b)), b), 3232 (('~fadd', a, ('fadd', ('fneg', a), b)), b), 3233 3234 (('~flrp', ('fadd(is_used_once)', a, -1.0), ('fadd(is_used_once)', a, 1.0), d), ('fadd', ('flrp', -1.0, 1.0, d), a)), 3235 (('~flrp', ('fadd(is_used_once)', a, 1.0), ('fadd(is_used_once)', a, -1.0), d), ('fadd', ('flrp', 1.0, -1.0, d), a)), 3236 (('~flrp', ('fadd(is_used_once)', a, '#b'), ('fadd(is_used_once)', a, '#c'), d), ('fadd', ('fmul', d, ('fadd', c, ('fneg', b))), ('fadd', a, b))), 3237] 3238 3239# This section contains "late" optimizations that should be run after the 3240# regular optimizations have finished. Optimizations should go here if 3241# they help code generation but do not necessarily produce code that is 3242# more easily optimizable. 3243late_optimizations = [ 3244 # The rearrangements are fine w.r.t. NaN. However, they produce incorrect 3245 # results if one operand is +Inf and the other is -Inf. 3246 # 3247 # 1. Inf + -Inf = NaN 3248 # 2. ∀x: x + NaN = NaN and x - NaN = NaN 3249 # 3. ∀x: x != NaN = true 3250 # 4. ∀x, ∀ cmp ∈ {<, >, ≤, ≥, =}: x cmp NaN = false 3251 # 3252 # a=Inf, b=-Inf a=-Inf, b=Inf a=NaN b=NaN 3253 # (a+b) < 0 false false false false 3254 # a < -b false false false false 3255 # -(a+b) < 0 false false false false 3256 # -a < b false false false false 3257 # (a+b) >= 0 false false false false 3258 # a >= -b true true false false 3259 # -(a+b) >= 0 false false false false 3260 # -a >= b true true false false 3261 # (a+b) == 0 false false false false 3262 # a == -b true true false false 3263 # (a+b) != 0 true true true true 3264 # a != -b false false true true 3265 (('flt', ('fadd(is_used_once)', a, b), 0.0), ('flt', a, ('fneg', b))), 3266 (('flt', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('flt', ('fneg', a), b)), 3267 (('flt', 0.0, ('fadd(is_used_once)', a, b) ), ('flt', ('fneg', a), b)), 3268 (('flt', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('flt', a, ('fneg', b))), 3269 (('~fge', ('fadd(is_used_once)', a, b), 0.0), ('fge', a, ('fneg', b))), 3270 (('~fge', ('fneg(is_used_once)', ('fadd(is_used_once)', a, b)), 0.0), ('fge', ('fneg', a), b)), 3271 (('~fge', 0.0, ('fadd(is_used_once)', a, b) ), ('fge', ('fneg', a), b)), 3272 (('~fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', a, b))), ('fge', a, ('fneg', b))), 3273 (('~feq', ('fadd(is_used_once)', a, b), 0.0), ('feq', a, ('fneg', b))), 3274 (('~fneu', ('fadd(is_used_once)', a, b), 0.0), ('fneu', a, ('fneg', b))), 3275 3276 # If either source must be finite, then the original (a+b) cannot produce 3277 # NaN due to Inf-Inf. The patterns and the replacements produce the same 3278 # result if b is NaN. Therefore, the replacements are exact. 3279 (('fge', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fge', a, ('fneg', b))), 3280 (('fge', ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b)), 0.0), ('fge', ('fneg', a), b)), 3281 (('fge', 0.0, ('fadd(is_used_once)', 'a(is_finite)', b) ), ('fge', ('fneg', a), b)), 3282 (('fge', 0.0, ('fneg(is_used_once)', ('fadd(is_used_once)', 'a(is_finite)', b))), ('fge', a, ('fneg', b))), 3283 (('feq', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('feq', a, ('fneg', b))), 3284 (('fneu', ('fadd(is_used_once)', 'a(is_finite)', b), 0.0), ('fneu', a, ('fneg', b))), 3285 3286 # This is how SpvOpFOrdNotEqual might be implemented. Replace it with 3287 # SpvOpLessOrGreater. 3288 *add_fabs_fneg((('iand', ('fneu', 'ma', 'mb'), ('iand', ('feq', a, a), ('feq', b, b))), ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma'))), {'ma' : a, 'mb' : b}), 3289 (('iand', ('fneu', a, 0.0), ('feq', a, a)), ('!flt', 0.0, ('fabs', a))), 3290 3291 # This is how SpvOpFUnordEqual might be implemented. Replace it with 3292 # !SpvOpLessOrGreater. 3293 *add_fabs_fneg((('ior', ('feq', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('ior', ('!flt', 'ma', 'mb'), ('!flt', 'mb', 'ma')))), {'ma' : a, 'mb' : b}), 3294 (('ior', ('feq', a, 0.0), ('fneu', a, a)), ('inot', ('!flt', 0.0, ('fabs', a)))), 3295 3296 *add_fabs_fneg((('ior', ('flt', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('fge', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False), 3297 *add_fabs_fneg((('ior', ('fge', 'ma', 'mb'), ('ior', ('fneu', a, a), ('fneu', b, b))), ('inot', ('flt', 'ma', 'mb'))), {'ma' : a, 'mb' : b}, False), 3298 *add_fabs_fneg((('ior', ('flt', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('fge', 'ma', b))), {'ma' : a}), 3299 *add_fabs_fneg((('ior', ('fge', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('inot', ('flt', 'ma', b))), {'ma' : a}), 3300 *add_fabs_fneg((('ior', ('flt', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('fge', a, 'mb'))), {'mb' : b}), 3301 *add_fabs_fneg((('ior', ('fge', 'a(is_a_number)', 'mb'), ('fneu', b, b)), ('inot', ('flt', a, 'mb'))), {'mb' : b}), 3302 *add_fabs_fneg((('iand', ('fneu', 'ma', 'b(is_a_number)'), ('feq', a, a)), ('fneo', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}), 3303 *add_fabs_fneg((('ior', ('feq', 'ma', 'b(is_a_number)'), ('fneu', a, a)), ('fequ', 'ma', b), 'options->has_fneo_fcmpu'), {'ma' : a}), 3304 3305 (('ior', ('flt', a, b), ('flt', b, a)), ('fneo', a, b), 'options->has_fneo_fcmpu'), 3306 (('flt', 0.0, ('fabs', a)), ('fneo', 0.0, a), 'options->has_fneo_fcmpu'), 3307 3308 3309 # These don't interfere with the previous optimizations which include this 3310 # in the search expression, because nir_algebraic_impl visits instructions 3311 # in reverse order. 3312 (('ior', ('fneu', 'a@16', a), ('fneu', 'b@16', b)), ('funord', a, b), 'options->has_ford_funord'), 3313 (('iand', ('feq', 'a@16', a), ('feq', 'b@16', b)), ('ford', a, b), 'options->has_ford_funord'), 3314 (('ior', ('fneu', 'a@32', a), ('fneu', 'b@32', b)), ('funord', a, b), 'options->has_ford_funord'), 3315 (('iand', ('feq', 'a@32', a), ('feq', 'b@32', b)), ('ford', a, b), 'options->has_ford_funord'), 3316 (('ior', ('fneu', 'a@64', a), ('fneu', 'b@64', b)), ('funord', a, b), 'options->has_ford_funord'), 3317 (('iand', ('feq', 'a@64', a), ('feq', 'b@64', b)), ('ford', a, b), 'options->has_ford_funord'), 3318 3319 (('inot', ('ford(is_used_once)', a, b)), ('funord', a, b)), 3320 (('inot', ('funord(is_used_once)', a, b)), ('ford', a, b)), 3321 (('inot', ('feq(is_used_once)', a, b)), ('fneu', a, b)), 3322 (('inot', ('fneu(is_used_once)', a, b)), ('feq', a, b)), 3323 (('inot', ('fequ(is_used_once)', a, b)), ('fneo', a, b)), 3324 (('inot', ('fneo(is_used_once)', a, b)), ('fequ', a, b)), 3325 (('inot', ('flt(is_used_once)', a, b)), ('fgeu', a, b), 'options->has_fneo_fcmpu'), 3326 (('inot', ('fgeu(is_used_once)', a, b)), ('flt', a, b)), 3327 (('inot', ('fge(is_used_once)', a, b)), ('fltu', a, b), 'options->has_fneo_fcmpu'), 3328 (('inot', ('fltu(is_used_once)', a, b)), ('fge', a, b)), 3329 3330 # nir_lower_to_source_mods will collapse this, but its existence during the 3331 # optimization loop can prevent other optimizations. 3332 (('fneg', ('fneg', a)), a), 3333 3334 # combine imul and iadd to imad 3335 (('iadd@32', ('imul(is_only_used_by_iadd)', a, b), c), ('imad', a, b, c), 'options->has_imad32'), 3336 3337 # Drivers do not actually implement udiv_aligned_4, it is just used to 3338 # optimize scratch lowering. 3339 (('udiv_aligned_4', a), ('ushr', a, 2)), 3340] 3341 3342# re-combine inexact mul+add to ffma. Do this before fsub so that a * b - c 3343# gets combined to fma(a, b, -c). 3344for sz, mulz in itertools.product([16, 32, 64], [False, True]): 3345 # fmulz/ffmaz only for fp32 3346 if mulz and sz != 32: 3347 continue 3348 3349 # Fuse the correct fmul. Only consider fmuls where the only users are fadd 3350 # (or fneg/fabs which are assumed to be propagated away), as a heuristic to 3351 # avoid fusing in cases where it's harmful. 3352 fmul = ('fmulz' if mulz else 'fmul') + '(is_only_used_by_fadd)' 3353 ffma = 'ffmaz' if mulz else 'ffma' 3354 3355 fadd = '~fadd@{}'.format(sz) 3356 option = 'options->fuse_ffma{}'.format(sz) 3357 3358 late_optimizations.extend([ 3359 ((fadd, (fmul, a, b), c), (ffma, a, b, c), option), 3360 3361 ((fadd, ('fneg(is_only_used_by_fadd)', (fmul, a, b)), c), 3362 (ffma, ('fneg', a), b, c), option), 3363 3364 ((fadd, ('fabs(is_only_used_by_fadd)', (fmul, a, b)), c), 3365 (ffma, ('fabs', a), ('fabs', b), c), option), 3366 3367 ((fadd, ('fneg(is_only_used_by_fadd)', ('fabs', (fmul, a, b))), c), 3368 (ffma, ('fneg', ('fabs', a)), ('fabs', b), c), option), 3369 ]) 3370 3371late_optimizations.extend([ 3372 # Subtractions get lowered during optimization, so we need to recombine them 3373 (('fadd@8', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3374 (('fadd@16', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3375 (('fadd@32', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub'), 3376 (('fadd@64', a, ('fneg', 'b')), ('fsub', 'a', 'b'), 'options->has_fsub && !(options->lower_doubles_options & nir_lower_dsub)'), 3377 3378 (('fneg', a), ('fmul', a, -1.0), 'options->lower_fneg'), 3379 (('iadd', a, ('ineg', 'b')), ('isub', 'a', 'b'), 'options->has_isub || options->lower_ineg'), 3380 (('ineg', a), ('isub', 0, a), 'options->lower_ineg'), 3381 (('iabs', a), ('imax', a, ('ineg', a)), 'options->lower_iabs'), 3382]) 3383 3384for s in [8, 16, 32, 64]: 3385 cond = 'options->has_iadd3' 3386 if s == 64: 3387 cond += ' && !(options->lower_int64_options & nir_lower_iadd3_64)' 3388 3389 iadd = "iadd@{}".format(s) 3390 3391 # On Intel GPUs, the constant field for an ADD3 instruction must be either 3392 # int16_t or uint16_t. 3393 late_optimizations.extend([ 3394 ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond), 3395 ((iadd, ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)'), 'c(is_not_const)'), ('iadd3', a, b, c), cond), 3396 ((iadd, ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)'), '#c(is_16_bits)'), ('iadd3', a, b, c), cond), 3397 ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3398 ((iadd, ('ineg', ('iadd(is_used_once)', '#a(is_16_bits)', 'b(is_not_const)')), 'c(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3399 ((iadd, ('ineg', ('iadd(is_used_once)', 'a(is_not_const)', 'b(is_not_const)')), '#c(is_16_bits)'), ('iadd3', ('ineg', a), ('ineg', b), c), cond), 3400 3401 ((iadd, ('ishl', a, 1), 'b(is_not_const)'), ('iadd3', a, a, b), cond), 3402 ((iadd, ('ishl', a, 1), '#b(is_16_bits)' ), ('iadd3', a, a, b), cond), 3403 ((iadd, ('ineg', ('ishl', a, 1)), 'b(is_not_const)'), ('iadd3', ('ineg', a), ('ineg', a), b), cond), 3404 ((iadd, ('ineg', ('ishl', a, 1)), '#b(is_16_bits)' ), ('iadd3', ('ineg', a), ('ineg', a), b), cond), 3405 3406 # Use special checks to ensure (b+b) or -(b+b) fit in 16 bits. 3407 (('ishl@{}'.format(s), ('iadd', a, '#b(is_2x_16_bits)'), 1), ('iadd3', a, a, ('iadd', b, b)), cond), 3408 (('ishl@{}'.format(s), ('ineg', ('iadd', a, '#b(is_neg2x_16_bits)')), 1), ('iadd3', ('ineg', a), ('ineg', a), ('ineg', ('iadd', b, b))), cond), 3409 ]) 3410 3411late_optimizations.extend([ 3412 # fneg_lo / fneg_hi 3413 (('vec2(is_only_used_as_float)', ('fneg@16', a), b), ('fmul', ('vec2', a, b), ('vec2', -1.0, 1.0)), 'options->vectorize_vec2_16bit'), 3414 (('vec2(is_only_used_as_float)', a, ('fneg@16', b)), ('fmul', ('vec2', a, b), ('vec2', 1.0, -1.0)), 'options->vectorize_vec2_16bit'), 3415 3416 # These are duplicated from the main optimizations table. The late 3417 # patterns that rearrange expressions like x - .5 < 0 to x < .5 can create 3418 # new patterns like these. The patterns that compare with zero are removed 3419 # because they are unlikely to be created in by anything in 3420 # late_optimizations. 3421 (('flt', '#b(is_gt_0_and_lt_1)', ('fsat(is_used_once)', a)), ('flt', b, a)), 3422 (('fge', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fge', a, b)), 3423 (('feq', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('feq', a, b)), 3424 (('fneu', ('fsat(is_used_once)', a), '#b(is_gt_0_and_lt_1)'), ('fneu', a, b)), 3425 3426 (('fge', ('fsat(is_used_once)', a), 1.0), ('fge', a, 1.0)), 3427 3428 (('~fge', ('fmin(is_used_once)', ('fadd(is_used_once)', a, b), ('fadd', c, d)), 0.0), ('iand', ('fge', a, ('fneg', b)), ('fge', c, ('fneg', d)))), 3429 3430 (('flt', ('fneg', a), ('fneg', b)), ('flt', b, a)), 3431 (('fge', ('fneg', a), ('fneg', b)), ('fge', b, a)), 3432 (('feq', ('fneg', a), ('fneg', b)), ('feq', b, a)), 3433 (('fneu', ('fneg', a), ('fneg', b)), ('fneu', b, a)), 3434 (('flt', ('fneg', a), -1.0), ('flt', 1.0, a)), 3435 (('flt', -1.0, ('fneg', a)), ('flt', a, 1.0)), 3436 (('fge', ('fneg', a), -1.0), ('fge', 1.0, a)), 3437 (('fge', -1.0, ('fneg', a)), ('fge', a, 1.0)), 3438 (('fneu', ('fneg', a), -1.0), ('fneu', 1.0, a)), 3439 (('feq', -1.0, ('fneg', a)), ('feq', a, 1.0)), 3440 3441 (('ior', a, a), a), 3442 (('iand', a, a), a), 3443 3444 (('~fadd', ('fneg(is_used_once)', ('fsat(is_used_once)', 'a(is_not_fmul)')), 1.0), ('fsat', ('fadd', 1.0, ('fneg', a)))), 3445 3446 (('fdot2', a, b), ('fdot2_replicated', a, b), 'options->fdot_replicates'), 3447 (('fdot3', a, b), ('fdot3_replicated', a, b), 'options->fdot_replicates'), 3448 (('fdot4', a, b), ('fdot4_replicated', a, b), 'options->fdot_replicates'), 3449 (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'), 3450 3451 (('~flrp', ('fadd(is_used_once)', a, b), ('fadd(is_used_once)', a, c), d), ('fadd', ('flrp', b, c, d), a)), 3452 3453 # Approximate handling of fround_even for DX9 addressing from gallium nine on 3454 # DX9-class hardware with no proper fround support. This is in 3455 # late_optimizations so that the is_integral() opts in the main pass get a 3456 # chance to eliminate the fround_even first. 3457 (('fround_even', a), ('bcsel', 3458 ('feq', ('ffract', a), 0.5), 3459 ('fadd', ('ffloor', ('fadd', a, 0.5)), 1.0), 3460 ('ffloor', ('fadd', a, 0.5))), 'options->lower_fround_even'), 3461 3462 # A similar operation could apply to any ffma(#a, b, #(-a/2)), but this 3463 # particular operation is common for expanding values stored in a texture 3464 # from [0,1] to [-1,1]. 3465 (('~ffma@32', a, 2.0, -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3466 (('~ffma@32', a, -2.0, -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3467 (('~ffma@32', a, -2.0, 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3468 (('~ffma@32', a, 2.0, 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3469 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), -1.0), ('flrp', -1.0, 1.0, a ), '!options->lower_flrp32'), 3470 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), -1.0), ('flrp', -1.0, 1.0, ('fneg', a)), '!options->lower_flrp32'), 3471 (('~fadd@32', ('fmul(is_used_once)', -2.0, a), 1.0), ('flrp', 1.0, -1.0, a ), '!options->lower_flrp32'), 3472 (('~fadd@32', ('fmul(is_used_once)', 2.0, a), 1.0), ('flrp', 1.0, -1.0, ('fneg', a)), '!options->lower_flrp32'), 3473 3474 # flrp(a, b, a) 3475 # a*(1-a) + b*a 3476 # a + -a*a + a*b (1) 3477 # a + a*(b - a) 3478 # Option 1: ffma(a, (b-a), a) 3479 # 3480 # Alternately, after (1): 3481 # a*(1+b) + -a*a 3482 # a*((1+b) + -a) 3483 # 3484 # Let b=1 3485 # 3486 # Option 2: ffma(a, 2, -(a*a)) 3487 # Option 3: ffma(a, 2, (-a)*a) 3488 # Option 4: ffma(a, -a, (2*a) 3489 # Option 5: a * (2 - a) 3490 # 3491 # There are a lot of other possible combinations. 3492 (('~ffma@32', ('fadd', b, ('fneg', a)), a, a), ('flrp', a, b, a), '!options->lower_flrp32'), 3493 (('~ffma@32', a, 2.0, ('fneg', ('fmul', a, a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3494 (('~ffma@32', a, 2.0, ('fmul', ('fneg', a), a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3495 (('~ffma@32', a, ('fneg', a), ('fmul', 2.0, a)), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3496 (('~fmul@32', a, ('fadd', 2.0, ('fneg', a))), ('flrp', a, 1.0, a), '!options->lower_flrp32'), 3497 3498 # we do these late so that we don't get in the way of creating ffmas 3499 (('fmin', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmin', a, b))), 3500 (('fmax', ('fadd(is_used_once)', '#c', a), ('fadd(is_used_once)', '#c', b)), ('fadd', c, ('fmax', a, b))), 3501 3502 # Putting this in 'optimizations' interferes with the bcsel(a, op(b, c), 3503 # op(b, d)) => op(b, bcsel(a, c, d)) transformations. I do not know why. 3504 (('bcsel', ('feq', ('fsqrt', 'a(is_not_negative)'), 0.0), intBitsToFloat(0x7f7fffff), ('frsq', a)), 3505 ('fmin', ('frsq', a), intBitsToFloat(0x7f7fffff))), 3506 3507 # Things that look like DPH in the source shader may get expanded to 3508 # something that looks like dot(v1.xyz, v2.xyz) + v1.w by the time it gets 3509 # to NIR. After FFMA is generated, this can look like: 3510 # 3511 # fadd(ffma(v1.z, v2.z, ffma(v1.y, v2.y, fmul(v1.x, v2.x))), v1.w) 3512 # 3513 # Reassociate the last addition into the first multiplication. 3514 # 3515 # Some shaders do not use 'invariant' in vertex and (possibly) geometry 3516 # shader stages on some outputs that are intended to be invariant. For 3517 # various reasons, this optimization may not be fully applied in all 3518 # shaders used for different rendering passes of the same geometry. This 3519 # can result in Z-fighting artifacts (at best). For now, disable this 3520 # optimization in these stages. See bugzilla #111490. In tessellation 3521 # stages applications seem to use 'precise' when necessary, so allow the 3522 # optimization in those stages. 3523 (('~fadd', ('ffma(is_used_once)', a, b, ('ffma(is_used_once)', c, d, ('ffma', e, 'f', ('fmul(is_used_once)', 'g(is_not_const_and_not_fsign)', 'h(is_not_const_and_not_fsign)')))), 'i(is_not_const)'), 3524 ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', ('ffma', 'g', 'h', 'i')))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3525 (('~fadd', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3526 ('ffma', a, b, ('ffma', c, d, ('ffma', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3527 (('~fadd', ('ffma(is_used_once)', a, b, ('fmul(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3528 ('ffma', a, b, ('ffma', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3529 (('~fadd', ('fneg', ('ffma(is_used_once)', a, b, ('ffma', c, d, ('fmul(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3530 ('ffma', ('fneg', a), b, ('ffma', ('fneg', c), d, ('ffma', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3531 3532 (('~fadd', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)'))), 'g(is_not_const)'), 3533 ('ffmaz', a, b, ('ffmaz', c, d, ('ffmaz', e, 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3534 (('~fadd', ('ffmaz(is_used_once)', a, b, ('fmulz(is_used_once)', 'c(is_not_const_and_not_fsign)', 'd(is_not_const_and_not_fsign)') ), 'e(is_not_const)'), 3535 ('ffmaz', a, b, ('ffmaz', c, d, e)), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3536 (('~fadd', ('fneg', ('ffmaz(is_used_once)', a, b, ('ffmaz', c, d, ('fmulz(is_used_once)', 'e(is_not_const_and_not_fsign)', 'f(is_not_const_and_not_fsign)')))), 'g(is_not_const)'), 3537 ('ffmaz', ('fneg', a), b, ('ffmaz', ('fneg', c), d, ('ffmaz', ('fneg', e), 'f', 'g'))), '(info->stage != MESA_SHADER_VERTEX && info->stage != MESA_SHADER_GEOMETRY) && !options->intel_vec4'), 3538 3539 # Section 8.8 (Integer Functions) of the GLSL 4.60 spec says: 3540 # 3541 # If bits is zero, the result will be zero. 3542 # 3543 # These prevent the next two lowerings generating incorrect results when 3544 # count is zero. 3545 (('ubfe', a, b, 0), 0), 3546 (('ibfe', a, b, 0), 0), 3547 3548 # On Intel GPUs, BFE is a 3-source instruction. Like all 3-source 3549 # instructions on Intel GPUs, it cannot have an immediate values as 3550 # sources. There are also limitations on source register strides. As a 3551 # result, it is very easy for 3-source instruction combined with either 3552 # loads of immediate values or copies from weird register strides to be 3553 # more expensive than the primitive instructions it represents. 3554 (('ubfe', a, '#b', '#c'), ('iand', ('ushr', 0xffffffff, ('ineg', c)), ('ushr', a, b)), 'options->avoid_ternary_with_two_constants'), 3555 3556 # b is the lowest order bit to be extracted and c is the number of bits to 3557 # extract. The inner shift removes the bits above b + c by shifting left 3558 # 32 - (b + c). ishl only sees the low 5 bits of the shift count, which is 3559 # -(b + c). The outer shift moves the bit that was at b to bit zero. 3560 # After the first shift, that bit is now at b + (32 - (b + c)) or 32 - c. 3561 # This means that it must be shifted right by 32 - c or -c bits. 3562 (('ibfe', a, '#b', '#c'), ('ishr', ('ishl', a, ('ineg', ('iadd', b, c))), ('ineg', c)), 'options->avoid_ternary_with_two_constants'), 3563 3564 # Clean up no-op shifts that may result from the bfe lowerings. 3565 (('ishl', a, 0), a), 3566 (('ishl', a, -32), a), 3567 (('ishr', a, 0), a), 3568 (('ishr', a, -32), a), 3569 (('ushr', a, 0), a), 3570 3571 (('extract_i8', ('extract_i8', a, b), 0), ('extract_i8', a, b)), 3572 (('extract_i8', ('extract_u8', a, b), 0), ('extract_i8', a, b)), 3573 (('extract_u8', ('extract_i8', a, b), 0), ('extract_u8', a, b)), 3574 (('extract_u8', ('extract_u8', a, b), 0), ('extract_u8', a, b)), 3575 3576 # open coded bit test 3577 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'), 3578 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), 0), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'), 3579 (('ine', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitz', a, ('find_lsb', b)), 'options->has_bit_test'), 3580 (('ieq', ('iand', a, '#b(is_pos_power_of_two)'), b), ('bitnz', a, ('find_lsb', b)), 'options->has_bit_test'), 3581 (('ine', ('iand', a, ('ishl', 1, b)), 0), ('bitnz', a, b), 'options->has_bit_test'), 3582 (('ieq', ('iand', a, ('ishl', 1, b)), 0), ('bitz', a, b), 'options->has_bit_test'), 3583 (('ine', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitz', a, b), 'options->has_bit_test'), 3584 (('ieq', ('iand', a, ('ishl', 1, b)), ('ishl', 1, b)), ('bitnz', a, b), 'options->has_bit_test'), 3585 (('bitz', ('ushr', a, b), 0), ('bitz', a, b)), 3586 (('bitz', ('ishr', a, b), 0), ('bitz', a, b)), 3587 (('bitnz', ('ushr', a, b), 0), ('bitnz', a, b)), 3588 (('bitnz', ('ishr', a, b), 0), ('bitnz', a, b)), 3589 (('ine', ('ubfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'), 3590 (('ieq', ('ubfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'), 3591 (('ine', ('ubfe', a, b, 1), 1), ('bitz', a, b), 'options->has_bit_test'), 3592 (('ieq', ('ubfe', a, b, 1), 1), ('bitnz', a, b), 'options->has_bit_test'), 3593 (('ine', ('ibfe', a, b, 1), 0), ('bitnz', a, b), 'options->has_bit_test'), 3594 (('ieq', ('ibfe', a, b, 1), 0), ('bitz', a, b), 'options->has_bit_test'), 3595 (('ine', ('ibfe', a, b, 1), -1), ('bitz', a, b), 'options->has_bit_test'), 3596 (('ieq', ('ibfe', a, b, 1), -1), ('bitnz', a, b), 'options->has_bit_test'), 3597 (('inot', ('bitnz', a, b)), ('bitz', a, b)), 3598 (('inot', ('bitz', a, b)), ('bitnz', a, b)), 3599 (('bitnz', ('inot', a), b), ('bitz', a, b)), 3600 (('bitz', ('inot', a), b), ('bitnz', a, b)), 3601]) 3602 3603# A few more extract cases we'd rather leave late 3604for N in [16, 32]: 3605 aN = 'a@{0}'.format(N) 3606 u2uM = 'u2u{0}'.format(M) 3607 i2iM = 'i2i{0}'.format(M) 3608 3609 for x in ['u', 'i']: 3610 x2xN = '{0}2{0}{1}'.format(x, N) 3611 extract_x8 = 'extract_{0}8'.format(x) 3612 extract_x16 = 'extract_{0}16'.format(x) 3613 3614 late_optimizations.extend([ 3615 ((x2xN, ('u2u8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 3616 ((x2xN, ('i2i8', aN)), (extract_x8, a, 0), '!options->lower_extract_byte'), 3617 ]) 3618 3619 if N > 16: 3620 late_optimizations.extend([ 3621 ((x2xN, ('u2u16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 3622 ((x2xN, ('i2i16', aN)), (extract_x16, a, 0), '!options->lower_extract_word'), 3623 ]) 3624 3625# Byte insertion 3626late_optimizations.extend([(('ishl', ('extract_u8', 'a@32', 0), 8 * i), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 3627late_optimizations.extend([(('iand', ('ishl', 'a@32', 8 * i), 0xff << (8 * i)), ('insert_u8', a, i), '!options->lower_insert_byte') for i in range(1, 4)]) 3628late_optimizations.append((('ishl', 'a@32', 24), ('insert_u8', a, 3), '!options->lower_insert_byte')) 3629 3630late_optimizations += [ 3631 # Word insertion 3632 (('ishl', 'a@32', 16), ('insert_u16', a, 1), '!options->lower_insert_word'), 3633 3634 # Extract and then insert 3635 (('insert_u8', ('extract_u8', 'a', 0), b), ('insert_u8', a, b)), 3636 (('insert_u16', ('extract_u16', 'a', 0), b), ('insert_u16', a, b)), 3637] 3638 3639# Float sizes 3640for s in [16, 32, 64]: 3641 late_optimizations.extend([ 3642 (('~fadd@{}'.format(s), 1.0, ('fmul(is_used_once)', c , ('fadd', b, -1.0 ))), ('fadd', ('fadd', 1.0, ('fneg', c)), ('fmul', b, c)), 'options->lower_flrp{}'.format(s)), 3643 (('bcsel', a, 0, ('b2f{}'.format(s), ('inot', 'b@bool'))), ('b2f{}'.format(s), ('inot', ('ior', a, b)))), 3644 ]) 3645 3646for op in ['fadd']: 3647 late_optimizations += [ 3648 (('bcsel', a, (op + '(is_used_once)', b, c), (op, b, d)), (op, b, ('bcsel', a, c, d))), 3649 (('bcsel', a, (op, b, c), (op + '(is_used_once)', b, d)), (op, b, ('bcsel', a, c, d))), 3650 ] 3651 3652for op in ['ffma', 'ffmaz']: 3653 late_optimizations += [ 3654 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3655 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, c, e)), (op, b, c, ('bcsel', a, d, e))), 3656 3657 (('bcsel', a, (op + '(is_used_once)', b, c, d), (op, b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3658 (('bcsel', a, (op, b, c, d), (op + '(is_used_once)', b, e, d)), (op, b, ('bcsel', a, c, e), d)), 3659 ] 3660 3661# mediump: If an opcode is surrounded by conversions, remove the conversions. 3662# The rationale is that type conversions + the low precision opcode are more 3663# expensive that the same arithmetic opcode at higher precision. 3664# 3665# This must be done in late optimizations, because we need normal optimizations to 3666# first eliminate temporary up-conversions such as in op1(f2fmp(f2f32(op2()))). 3667# 3668# Unary opcodes 3669for op in ['fabs', 'fceil', 'fcos', 'fexp2', 'ffloor', 'ffract', 'flog2', 'fneg', 3670 'frcp', 'fround_even', 'frsq', 'fsat', 'fsign', 'fsin', 'fsqrt']: 3671 late_optimizations += [(('~f2f32', (op, ('f2fmp', a))), (op, a))] 3672 3673# Binary opcodes 3674for op in ['fadd', 'fdiv', 'fmax', 'fmin', 'fmod', 'fmul', 'fpow', 'frem']: 3675 late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b))), (op, a, b))] 3676 3677# Ternary opcodes 3678for op in ['ffma', 'flrp']: 3679 late_optimizations += [(('~f2f32', (op, ('f2fmp', a), ('f2fmp', b), ('f2fmp', c))), (op, a, b, c))] 3680 3681# Comparison opcodes 3682for op in ['feq', 'fge', 'flt', 'fneu']: 3683 late_optimizations += [(('~' + op, ('f2fmp', a), ('f2fmp', b)), (op, a, b))] 3684 3685# Do this last, so that the f2fmp patterns above have effect. 3686late_optimizations += [ 3687 # Convert *2*mp instructions to concrete *2*16 instructions. At this point 3688 # any conversions that could have been removed will have been removed in 3689 # nir_opt_algebraic so any remaining ones are required. 3690 (('f2fmp', a), ('f2f16', a), "!options->preserve_mediump"), 3691 (('f2imp', a), ('f2i16', a), "!options->preserve_mediump"), 3692 (('f2ump', a), ('f2u16', a), "!options->preserve_mediump"), 3693 (('i2imp', a), ('i2i16', a), "!options->preserve_mediump"), 3694 (('i2fmp', a), ('i2f16', a), "!options->preserve_mediump"), 3695 (('i2imp', a), ('u2u16', a), "!options->preserve_mediump"), 3696 (('u2fmp', a), ('u2f16', a), "!options->preserve_mediump"), 3697 (('fisfinite', a), ('flt', ('fabs', a), float("inf"))), 3698 3699 (('f2f16', a), ('f2f16_rtz', a), "options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(info->float_controls_execution_mode, 16)"), 3700 3701 (('fcsel', ('slt', 0, a), b, c), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 3702 (('fcsel', ('slt', a, 0), b, c), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3703 (('fcsel', ('sge', a, 0), b, c), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 3704 (('fcsel', ('sge', 0, a), b, c), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3705 3706 (('bcsel', ('ilt', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"), 3707 (('bcsel', ('ilt', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"), 3708 (('bcsel', ('ige', 'a@32', 0), 'b@32', 'c@32'), ('i32csel_ge', a, b, c), "options->has_fused_comp_and_csel && !options->no_integers"), 3709 (('bcsel', ('ige', 0, 'a@32'), 'b@32', 'c@32'), ('i32csel_gt', a, c, b), "options->has_fused_comp_and_csel && !options->no_integers"), 3710 3711 (('bcsel', ('flt', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_gt', a, b, c), "options->has_fused_comp_and_csel"), 3712 (('bcsel', ('flt', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_gt', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3713 (('bcsel', ('fge', 'a@32', 0), 'b@32', 'c@32'), ('fcsel_ge', a, b, c), "options->has_fused_comp_and_csel"), 3714 (('bcsel', ('fge', 0, 'a@32'), 'b@32', 'c@32'), ('fcsel_ge', ('fneg', a), b, c), "options->has_fused_comp_and_csel"), 3715] 3716 3717for s in [16, 32, 64]: 3718 late_optimizations.extend([ 3719 (('bcsel@{}'.format(s), ('ieq', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, b, c), "options->has_icsel_eqz{} && !options->no_integers".format(s)), 3720 (('bcsel@{}'.format(s), ('ine', 0, 'a@{}'.format(s)), 'b@{}'.format(s), 'c@{}'.format(s)), ('icsel_eqz', a, c, b), "options->has_icsel_eqz{} && !options->no_integers".format(s)), 3721 ]) 3722 3723distribute_src_mods = [ 3724 # Try to remove some spurious negations rather than pushing them down. 3725 (('fmul', ('fneg', a), ('fneg', b)), ('fmul', a, b)), 3726 (('ffma', ('fneg', a), ('fneg', b), c), ('ffma', a, b, c)), 3727 (('fdot2_replicated', ('fneg', a), ('fneg', b)), ('fdot2_replicated', a, b)), 3728 (('fdot3_replicated', ('fneg', a), ('fneg', b)), ('fdot3_replicated', a, b)), 3729 (('fdot4_replicated', ('fneg', a), ('fneg', b)), ('fdot4_replicated', a, b)), 3730 (('fneg', ('fneg', a)), a), 3731 3732 (('fneg', ('fmul(is_used_once)', a, b)), ('fmul', ('fneg', a), b)), 3733 (('fabs', ('fmul(is_used_once)', a, b)), ('fmul', ('fabs', a), ('fabs', b))), 3734 3735 (('fneg', ('ffma(is_used_once)', a, b, c)), ('ffma', ('fneg', a), b, ('fneg', c))), 3736 (('fneg', ('flrp(is_used_once)', a, b, c)), ('flrp', ('fneg', a), ('fneg', b), c)), 3737 (('fneg', ('~fadd(is_used_once)', a, b)), ('fadd', ('fneg', a), ('fneg', b))), 3738 3739 # Note that fmin <-> fmax. I don't think there is a way to distribute 3740 # fabs() into fmin or fmax. 3741 (('fneg', ('fmin(is_used_once)', a, b)), ('fmax', ('fneg', a), ('fneg', b))), 3742 (('fneg', ('fmax(is_used_once)', a, b)), ('fmin', ('fneg', a), ('fneg', b))), 3743 3744 (('fneg', ('fdot2_replicated(is_used_once)', a, b)), ('fdot2_replicated', ('fneg', a), b)), 3745 (('fneg', ('fdot3_replicated(is_used_once)', a, b)), ('fdot3_replicated', ('fneg', a), b)), 3746 (('fneg', ('fdot4_replicated(is_used_once)', a, b)), ('fdot4_replicated', ('fneg', a), b)), 3747 3748 # fdph works mostly like fdot, but to get the correct result, the negation 3749 # must be applied to the second source. 3750 (('fneg', ('fdph_replicated(is_used_once)', a, b)), ('fdph_replicated', a, ('fneg', b))), 3751 3752 (('fneg', ('fsign(is_used_once)', a)), ('fsign', ('fneg', a))), 3753 (('fabs', ('fsign(is_used_once)', a)), ('fsign', ('fabs', a))), 3754] 3755 3756before_lower_int64_optimizations = [ 3757 # The i2i64(a) implies that 'a' has at most 32-bits of data. 3758 (('ishl', ('i2i64', a), b), 3759 # Effective shift count of zero, just return 'a'. 3760 ('bcsel', ('ieq', ('iand', b, 63), 0), ('i2i64', a), 3761 ('bcsel', ('ilt', ('iand', b, 63), 32), 3762 # Shifting less than 32 bits, so both 32-bit halves will have 3763 # some data. These (and the else case) shift counts are of 32-bit 3764 # values, so the shift counts are implicitly moduolo 32. 3765 ('pack_64_2x32_split', ('ishl', ('i2i32', a), b), ('ishr', ('i2i32', a), ('iadd', ('ineg', b), 32) )), 3766 # Shifting 32 bits or more, so lower 32 bits must be zero. 3767 ('pack_64_2x32_split', 0 , ('ishl', ('i2i32', a), ('iabs', ('iadd', ('ineg', b), 32)))))), 3768 '(options->lower_int64_options & nir_lower_shift64) != 0'), 3769 3770 (('ishl', ('u2u64', a), b), 3771 ('bcsel', ('ieq', ('iand', b, 63), 0), ('u2u64', a), 3772 ('bcsel', ('ilt', ('iand', b, 63), 32), 3773 ('pack_64_2x32_split', ('ishl', ('u2u32', a), b), ('ushr', ('u2u32', a), ('iadd', ('ineg', b), 32) )), 3774 ('pack_64_2x32_split', 0 , ('ishl', ('u2u32', a), ('iabs', ('iadd', ('ineg', b), 32)))))), 3775 '(options->lower_int64_options & nir_lower_shift64) != 0'), 3776 3777 # If ineg64 is lowered, then the negation is not free. Try to eliminate 3778 # some of the negations. 3779 (('iadd@64', ('ineg', a), ('ineg(is_used_once)', b)), ('isub', ('ineg', a), b), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3780 (('iadd@64', a, ('ineg', b)), ('isub', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3781 (('isub@64', a, ('ineg', b)), ('iadd', a, b), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3782 (('isub@64', ('ineg', a), ('ineg', b)), ('isub', b, a), '(options->lower_int64_options & nir_lower_ineg64) != 0'), 3783 3784 (('imul@64', ('ineg', a), ('ineg', b)), ('imul', a, b)), 3785 (('idiv@64', ('ineg', a), ('ineg', b)), ('idiv', a, b)), 3786 3787 # If the hardware can do int64, the shift is the same cost as the add. It 3788 # should be fine to do this transformation unconditionally. 3789 (('iadd', ('i2i64', a), ('i2i64', a)), ('ishl', ('i2i64', a), 1)), 3790 (('iadd', ('u2u64', a), ('u2u64', a)), ('ishl', ('u2u64', a), 1)), 3791] 3792 3793parser = argparse.ArgumentParser() 3794parser.add_argument('--out', required=True) 3795args = parser.parse_args() 3796 3797with open(args.out, "w", encoding='utf-8') as f: 3798 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render()) 3799 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_ffma", 3800 before_ffma_optimizations).render()) 3801 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_before_lower_int64", 3802 before_lower_int64_optimizations).render()) 3803 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late", 3804 late_optimizations).render()) 3805 f.write(nir_algebraic.AlgebraicPass("nir_opt_algebraic_distribute_src_mods", 3806 distribute_src_mods).render()) 3807